From 3dc59262f76241b342316bbac8b5ffe138995b2d Mon Sep 17 00:00:00 2001
From: Shrirang Bagul <shrirang.bagul@canonical.com>
Date: Thu, 24 Nov 2016 13:33:43 +0800
Subject: iio: st_sensors: match sensors using ACPI handle

Add support to match st sensors using information passed from ACPI DST
tables.

Signed-off-by: Shrirang Bagul <shrirang.bagul@canonical.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 include/linux/iio/common/st_sensors_i2c.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iio/common/st_sensors_i2c.h b/include/linux/iio/common/st_sensors_i2c.h
index 1796af093368..254de3c7dde8 100644
--- a/include/linux/iio/common/st_sensors_i2c.h
+++ b/include/linux/iio/common/st_sensors_i2c.h
@@ -28,4 +28,13 @@ static inline void st_sensors_of_i2c_probe(struct i2c_client *client,
 }
 #endif
 
+#ifdef CONFIG_ACPI
+int st_sensors_match_acpi_device(struct device *dev);
+#else
+static inline int st_sensors_match_acpi_device(struct device *dev)
+{
+	return -ENODEV;
+}
+#endif
+
 #endif /* ST_SENSORS_I2C_H */
-- 
cgit v1.2.3


From a96cd0f901eecd9589477cc2cd46bdb4f1f3e49a Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Mon, 28 Nov 2016 14:41:16 -0800
Subject: iio: accel: hid-sensor-accel-3d: Add timestamp

Added timestamp channel. With this change, each sample has a timestamp.
This timestamp can be from the sensor hub when present or local kernel
timestamp. HID sensors can send timestamp with input data using usage id
HID_USAGE_SENSOR_TIME_TIMESTAMP. This timestamp value is converted to
nano seconds before pushing this sample to the iio core.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 drivers/iio/accel/hid-sensor-accel-3d.c            | 32 ++++++++++++++-----
 .../iio/common/hid-sensors/hid-sensor-attributes.c | 36 +++++++++++++++++++---
 include/linux/hid-sensor-hub.h                     |  4 +++
 include/linux/hid-sensor-ids.h                     |  1 +
 4 files changed, 60 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/accel/hid-sensor-accel-3d.c b/drivers/iio/accel/hid-sensor-accel-3d.c
index ab1e238d5c75..39de79ce8778 100644
--- a/drivers/iio/accel/hid-sensor-accel-3d.c
+++ b/drivers/iio/accel/hid-sensor-accel-3d.c
@@ -42,11 +42,13 @@ struct accel_3d_state {
 	struct hid_sensor_hub_callbacks callbacks;
 	struct hid_sensor_common common_attributes;
 	struct hid_sensor_hub_attribute_info accel[ACCEL_3D_CHANNEL_MAX];
-	u32 accel_val[ACCEL_3D_CHANNEL_MAX];
+	/* Reserve for 3 channels + padding + timestamp */
+	u32 accel_val[ACCEL_3D_CHANNEL_MAX + 3];
 	int scale_pre_decml;
 	int scale_post_decml;
 	int scale_precision;
 	int value_offset;
+	int64_t timestamp;
 };
 
 static const u32 accel_3d_addresses[ACCEL_3D_CHANNEL_MAX] = {
@@ -87,7 +89,8 @@ static const struct iio_chan_spec accel_3d_channels[] = {
 		BIT(IIO_CHAN_INFO_SAMP_FREQ) |
 		BIT(IIO_CHAN_INFO_HYSTERESIS),
 		.scan_index = CHANNEL_SCAN_INDEX_Z,
-	}
+	},
+	IIO_CHAN_SOFT_TIMESTAMP(3)
 };
 
 /* Adjust channel real bits based on report descriptor */
@@ -192,11 +195,11 @@ static const struct iio_info accel_3d_info = {
 };
 
 /* Function to push data to buffer */
-static void hid_sensor_push_data(struct iio_dev *indio_dev, const void *data,
-	int len)
+static void hid_sensor_push_data(struct iio_dev *indio_dev, void *data,
+				 int len, int64_t timestamp)
 {
 	dev_dbg(&indio_dev->dev, "hid_sensor_push_data\n");
-	iio_push_to_buffers(indio_dev, data);
+	iio_push_to_buffers_with_timestamp(indio_dev, data, timestamp);
 }
 
 /* Callback handler to send event after all samples are received and captured */
@@ -208,10 +211,17 @@ static int accel_3d_proc_event(struct hid_sensor_hub_device *hsdev,
 	struct accel_3d_state *accel_state = iio_priv(indio_dev);
 
 	dev_dbg(&indio_dev->dev, "accel_3d_proc_event\n");
-	if (atomic_read(&accel_state->common_attributes.data_ready))
+	if (atomic_read(&accel_state->common_attributes.data_ready)) {
+		if (!accel_state->timestamp)
+			accel_state->timestamp = iio_get_time_ns(indio_dev);
+
 		hid_sensor_push_data(indio_dev,
-				accel_state->accel_val,
-				sizeof(accel_state->accel_val));
+				     accel_state->accel_val,
+				     sizeof(accel_state->accel_val),
+				     accel_state->timestamp);
+
+		accel_state->timestamp = 0;
+	}
 
 	return 0;
 }
@@ -236,6 +246,12 @@ static int accel_3d_capture_sample(struct hid_sensor_hub_device *hsdev,
 						*(u32 *)raw_data;
 		ret = 0;
 	break;
+	case HID_USAGE_SENSOR_TIME_TIMESTAMP:
+		accel_state->timestamp =
+			hid_sensor_convert_timestamp(
+					&accel_state->common_attributes,
+					*(int64_t *)raw_data);
+	break;
 	default:
 		break;
 	}
diff --git a/drivers/iio/common/hid-sensors/hid-sensor-attributes.c b/drivers/iio/common/hid-sensors/hid-sensor-attributes.c
index 7ef94a90ecf7..7afdac42ed42 100644
--- a/drivers/iio/common/hid-sensors/hid-sensor-attributes.c
+++ b/drivers/iio/common/hid-sensors/hid-sensor-attributes.c
@@ -58,6 +58,10 @@ static struct {
 
 	{HID_USAGE_SENSOR_PRESSURE, 0, 100, 0},
 	{HID_USAGE_SENSOR_PRESSURE, HID_USAGE_SENSOR_UNITS_PASCAL, 0, 1000000},
+
+	{HID_USAGE_SENSOR_TIME_TIMESTAMP, 0, 1000000000, 0},
+	{HID_USAGE_SENSOR_TIME_TIMESTAMP, HID_USAGE_SENSOR_UNITS_MILLISECOND,
+		1000000, 0},
 };
 
 static int pow_10(unsigned power)
@@ -346,6 +350,13 @@ int hid_sensor_format_scale(u32 usage_id,
 }
 EXPORT_SYMBOL(hid_sensor_format_scale);
 
+int64_t hid_sensor_convert_timestamp(struct hid_sensor_common *st,
+				     int64_t raw_value)
+{
+	return st->timestamp_ns_scale * raw_value;
+}
+EXPORT_SYMBOL(hid_sensor_convert_timestamp);
+
 static
 int hid_sensor_get_reporting_interval(struct hid_sensor_hub_device *hsdev,
 					u32 usage_id,
@@ -367,6 +378,7 @@ int hid_sensor_parse_common_attributes(struct hid_sensor_hub_device *hsdev,
 					struct hid_sensor_common *st)
 {
 
+	struct hid_sensor_hub_attribute_info timestamp;
 
 	hid_sensor_get_reporting_interval(hsdev, usage_id, st);
 
@@ -385,11 +397,25 @@ int hid_sensor_parse_common_attributes(struct hid_sensor_hub_device *hsdev,
 			HID_USAGE_SENSOR_PROP_SENSITIVITY_ABS,
 			 &st->sensitivity);
 
-	hid_dbg(hsdev->hdev, "common attributes: %x:%x, %x:%x, %x:%x %x:%x\n",
-			st->poll.index, st->poll.report_id,
-			st->report_state.index, st->report_state.report_id,
-			st->power_state.index, st->power_state.report_id,
-			st->sensitivity.index, st->sensitivity.report_id);
+	sensor_hub_input_get_attribute_info(hsdev,
+					    HID_INPUT_REPORT, usage_id,
+					    HID_USAGE_SENSOR_TIME_TIMESTAMP,
+					    &timestamp);
+	if (timestamp.index >= 0 && timestamp.report_id) {
+		int val0, val1;
+
+		hid_sensor_format_scale(HID_USAGE_SENSOR_TIME_TIMESTAMP,
+					&timestamp, &val0, &val1);
+		st->timestamp_ns_scale = val0;
+	} else
+		st->timestamp_ns_scale = 1000000000;
+
+	hid_dbg(hsdev->hdev, "common attributes: %x:%x, %x:%x, %x:%x %x:%x %x:%x\n",
+		st->poll.index, st->poll.report_id,
+		st->report_state.index, st->report_state.report_id,
+		st->power_state.index, st->power_state.report_id,
+		st->sensitivity.index, st->sensitivity.report_id,
+		timestamp.index, timestamp.report_id);
 
 	return 0;
 }
diff --git a/include/linux/hid-sensor-hub.h b/include/linux/hid-sensor-hub.h
index dd85f3503410..7ef111d3ecc5 100644
--- a/include/linux/hid-sensor-hub.h
+++ b/include/linux/hid-sensor-hub.h
@@ -232,6 +232,7 @@ struct hid_sensor_common {
 	atomic_t data_ready;
 	atomic_t user_requested_state;
 	struct iio_trigger *trigger;
+	int timestamp_ns_scale;
 	struct hid_sensor_hub_attribute_info poll;
 	struct hid_sensor_hub_attribute_info report_state;
 	struct hid_sensor_hub_attribute_info power_state;
@@ -271,4 +272,7 @@ int hid_sensor_format_scale(u32 usage_id,
 
 s32 hid_sensor_read_poll_value(struct hid_sensor_common *st);
 
+int64_t hid_sensor_convert_timestamp(struct hid_sensor_common *st,
+				     int64_t raw_value);
+
 #endif
diff --git a/include/linux/hid-sensor-ids.h b/include/linux/hid-sensor-ids.h
index f2ee90aed0c2..9a3a9db1b39d 100644
--- a/include/linux/hid-sensor-ids.h
+++ b/include/linux/hid-sensor-ids.h
@@ -95,6 +95,7 @@
 #define HID_USAGE_SENSOR_TIME_HOUR				0x200525
 #define HID_USAGE_SENSOR_TIME_MINUTE				0x200526
 #define HID_USAGE_SENSOR_TIME_SECOND				0x200527
+#define HID_USAGE_SENSOR_TIME_TIMESTAMP				0x200529
 
 /* Units */
 #define HID_USAGE_SENSOR_UNITS_NOT_SPECIFIED			0x00
-- 
cgit v1.2.3


From 409c69be433b819c924a8d1c457a835bc6d51700 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk@kernel.org>
Date: Sat, 10 Dec 2016 11:51:11 +0200
Subject: ASoC: samsung: Remove tests of member address

The driver was checking for non-NULL address of struct's members:
 - s3c_audio_pdata->type (union),
 - s3c_audio_pdata->type.i2s (embedded struct).

This is pointless as these will be always non-NULL.  The 's3c_audio_pdata'
is always initialized in static memory so it will be zeroed.
Additionally the 'type' member was an union with only one member.

It is safe to reorganize the structures to get rid of useless union and
checks for addresses to fix the coccinelle warning:
	>> sound/soc/samsung/i2s.c:1270:2-4: ERROR: test of a variable/field address

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Reviewed-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 arch/arm/mach-s3c64xx/dev-audio.c      |  4 +---
 include/linux/platform_data/asoc-s3c.h |  6 ++----
 sound/soc/samsung/i2s.c                | 10 ++--------
 3 files changed, 5 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-s3c64xx/dev-audio.c b/arch/arm/mach-s3c64xx/dev-audio.c
index b57783371d52..247dcc0b691e 100644
--- a/arch/arm/mach-s3c64xx/dev-audio.c
+++ b/arch/arm/mach-s3c64xx/dev-audio.c
@@ -106,9 +106,7 @@ static struct s3c_audio_pdata i2sv4_pdata = {
 	.dma_playback = DMACH_HSI_I2SV40_TX,
 	.dma_capture = DMACH_HSI_I2SV40_RX,
 	.type = {
-		.i2s = {
-			.quirks = QUIRK_PRI_6CHAN,
-		},
+		.quirks = QUIRK_PRI_6CHAN,
 	},
 };
 
diff --git a/include/linux/platform_data/asoc-s3c.h b/include/linux/platform_data/asoc-s3c.h
index 15bf56ee8af7..90641a5daaf0 100644
--- a/include/linux/platform_data/asoc-s3c.h
+++ b/include/linux/platform_data/asoc-s3c.h
@@ -18,7 +18,7 @@
 
 extern void s3c64xx_ac97_setup_gpio(int);
 
-struct samsung_i2s {
+struct samsung_i2s_type {
 /* If the Primary DAI has 5.1 Channels */
 #define QUIRK_PRI_6CHAN		(1 << 0)
 /* If the I2S block has a Stereo Overlay Channel */
@@ -47,7 +47,5 @@ struct s3c_audio_pdata {
 	void *dma_capture;
 	void *dma_play_sec;
 	void *dma_capture_mic;
-	union {
-		struct samsung_i2s i2s;
-	} type;
+	struct samsung_i2s_type type;
 };
diff --git a/sound/soc/samsung/i2s.c b/sound/soc/samsung/i2s.c
index e00974bc5616..d55326289a4a 100644
--- a/sound/soc/samsung/i2s.c
+++ b/sound/soc/samsung/i2s.c
@@ -1218,7 +1218,6 @@ static int samsung_i2s_probe(struct platform_device *pdev)
 {
 	struct i2s_dai *pri_dai, *sec_dai = NULL;
 	struct s3c_audio_pdata *i2s_pdata = pdev->dev.platform_data;
-	struct samsung_i2s *i2s_cfg = NULL;
 	struct resource *res;
 	u32 regs_base, quirks = 0, idma_addr = 0;
 	struct device_node *np = pdev->dev.of_node;
@@ -1267,13 +1266,8 @@ static int samsung_i2s_probe(struct platform_device *pdev)
 		pri_dai->dma_capture.filter_data = i2s_pdata->dma_capture;
 		pri_dai->filter = i2s_pdata->dma_filter;
 
-		if (&i2s_pdata->type)
-			i2s_cfg = &i2s_pdata->type.i2s;
-
-		if (i2s_cfg) {
-			quirks = i2s_cfg->quirks;
-			idma_addr = i2s_cfg->idma_addr;
-		}
+		quirks = i2s_pdata->type.quirks;
+		idma_addr = i2s_pdata->type.idma_addr;
 	} else {
 		quirks = i2s_dai_data->quirks;
 		if (of_property_read_u32(np, "samsung,idma-addr",
-- 
cgit v1.2.3


From e8314d7d53c8b050aac2828a5de5f28a997b468b Mon Sep 17 00:00:00 2001
From: Peter Rosin <peda@axentia.se>
Date: Tue, 6 Dec 2016 20:22:36 +0100
Subject: misc: atmel-ssc: register as sound DAI if #sound-dai-cells is present

The SSC is currently not usable with the ASoC simple-audio-card, as
every SSC audio user has to build a platform driver that may do as
little as calling atmel_ssc_set_audio/atmel_ssc_put_audio (which
allocates the SSC and registers a DAI with the ASoC subsystem).

So, have that happen automatically, if the #sound-dai-cells property
is present in devicetree, which it has to be anyway for simple audio
card to work.

Signed-off-by: Peter Rosin <peda@axentia.se>
Acked-by: Rob Herring <robh@kernel.org>
Acked-by: Nicolas Ferre <nicolas.ferre@atmel.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 .../devicetree/bindings/misc/atmel-ssc.txt         |  2 +
 drivers/misc/atmel-ssc.c                           | 50 ++++++++++++++++++++++
 include/linux/atmel-ssc.h                          |  1 +
 3 files changed, 53 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/misc/atmel-ssc.txt b/Documentation/devicetree/bindings/misc/atmel-ssc.txt
index efc98ea1f23d..f8629bb73945 100644
--- a/Documentation/devicetree/bindings/misc/atmel-ssc.txt
+++ b/Documentation/devicetree/bindings/misc/atmel-ssc.txt
@@ -24,6 +24,8 @@ Optional properties:
        this parameter to choose where the clock from.
      - By default the clock is from TK pin, if the clock from RK pin, this
        property is needed.
+  - #sound-dai-cells: Should contain <0>.
+     - This property makes the SSC into an automatically registered DAI.
 
 Examples:
 - PDC transfer:
diff --git a/drivers/misc/atmel-ssc.c b/drivers/misc/atmel-ssc.c
index 0516ecda54d3..b2a0340f277e 100644
--- a/drivers/misc/atmel-ssc.c
+++ b/drivers/misc/atmel-ssc.c
@@ -20,6 +20,8 @@
 
 #include <linux/of.h>
 
+#include "../../sound/soc/atmel/atmel_ssc_dai.h"
+
 /* Serialize access to ssc_list and user count */
 static DEFINE_SPINLOCK(user_lock);
 static LIST_HEAD(ssc_list);
@@ -145,6 +147,49 @@ static inline const struct atmel_ssc_platform_data * __init
 		platform_get_device_id(pdev)->driver_data;
 }
 
+#ifdef CONFIG_SND_ATMEL_SOC_SSC
+static int ssc_sound_dai_probe(struct ssc_device *ssc)
+{
+	struct device_node *np = ssc->pdev->dev.of_node;
+	int ret;
+	int id;
+
+	ssc->sound_dai = false;
+
+	if (!of_property_read_bool(np, "#sound-dai-cells"))
+		return 0;
+
+	id = of_alias_get_id(np, "ssc");
+	if (id < 0)
+		return id;
+
+	ret = atmel_ssc_set_audio(id);
+	ssc->sound_dai = !ret;
+
+	return ret;
+}
+
+static void ssc_sound_dai_remove(struct ssc_device *ssc)
+{
+	if (!ssc->sound_dai)
+		return;
+
+	atmel_ssc_put_audio(of_alias_get_id(ssc->pdev->dev.of_node, "ssc"));
+}
+#else
+static inline int ssc_sound_dai_probe(struct ssc_device *ssc)
+{
+	if (of_property_read_bool(ssc->pdev->dev.of_node, "#sound-dai-cells"))
+		return -ENOTSUPP;
+
+	return 0;
+}
+
+static inline void ssc_sound_dai_remove(struct ssc_device *ssc)
+{
+}
+#endif
+
 static int ssc_probe(struct platform_device *pdev)
 {
 	struct resource *regs;
@@ -204,6 +249,9 @@ static int ssc_probe(struct platform_device *pdev)
 	dev_info(&pdev->dev, "Atmel SSC device at 0x%p (irq %d)\n",
 			ssc->regs, ssc->irq);
 
+	if (ssc_sound_dai_probe(ssc))
+		dev_err(&pdev->dev, "failed to auto-setup ssc for audio\n");
+
 	return 0;
 }
 
@@ -211,6 +259,8 @@ static int ssc_remove(struct platform_device *pdev)
 {
 	struct ssc_device *ssc = platform_get_drvdata(pdev);
 
+	ssc_sound_dai_remove(ssc);
+
 	spin_lock(&user_lock);
 	list_del(&ssc->list);
 	spin_unlock(&user_lock);
diff --git a/include/linux/atmel-ssc.h b/include/linux/atmel-ssc.h
index 7c0f6549898b..fdb545101ede 100644
--- a/include/linux/atmel-ssc.h
+++ b/include/linux/atmel-ssc.h
@@ -20,6 +20,7 @@ struct ssc_device {
 	int			user;
 	int			irq;
 	bool			clk_from_rk_pin;
+	bool			sound_dai;
 };
 
 struct ssc_device * __must_check ssc_request(unsigned int ssc_num);
-- 
cgit v1.2.3


From fd50d71f94fb1c8614098949db068cd4c8dbb91d Mon Sep 17 00:00:00 2001
From: Corentin LABBE <clabbe.montjoie@gmail.com>
Date: Tue, 13 Dec 2016 15:51:13 +0100
Subject: hwrng: core - Move hwrng miscdev minor number to
 include/linux/miscdevice.h

This patch move the define for hwrng's miscdev minor number to
include/linux/miscdevice.h.
It's better that all minor number are in the same place.
Rename it to HWRNG_MINOR (from RNG_MISCDEV_MINOR) in he process since
no other miscdev define have MISCDEV in their name.

Signed-off-by: Corentin Labbe <clabbe.montjoie@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/char/hw_random/core.c | 3 +--
 include/linux/miscdevice.h    | 1 +
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index f81fd72b6692..c5f131d90473 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -26,7 +26,6 @@
 
 #define RNG_MODULE_NAME		"hw_random"
 #define PFX			RNG_MODULE_NAME ": "
-#define RNG_MISCDEV_MINOR	183 /* official */
 
 static struct hwrng *current_rng;
 static struct task_struct *hwrng_fill;
@@ -285,7 +284,7 @@ static const struct file_operations rng_chrdev_ops = {
 static const struct attribute_group *rng_dev_groups[];
 
 static struct miscdevice rng_miscdev = {
-	.minor		= RNG_MISCDEV_MINOR,
+	.minor		= HWRNG_MINOR,
 	.name		= RNG_MODULE_NAME,
 	.nodename	= "hwrng",
 	.fops		= &rng_chrdev_ops,
diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
index ed30d5d713e3..5d81f739aa0a 100644
--- a/include/linux/miscdevice.h
+++ b/include/linux/miscdevice.h
@@ -31,6 +31,7 @@
 #define SGI_MMTIMER		153
 #define STORE_QUEUE_MINOR	155	/* unused */
 #define I2O_MINOR		166
+#define HWRNG_MINOR		183
 #define MICROCODE_MINOR		184
 #define IRNET_MINOR		187
 #define VFIO_MINOR		196
-- 
cgit v1.2.3


From a1d82aff5df760d933b6ea3a03805dbc2bd73eb8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 27 Dec 2016 14:49:02 -0500
Subject: kernfs: make kernfs_open_file->mmapped a bitfield

More kernfs_open_file->mutex synchronized flags are planned to be
added.  Convert ->mmapped to a bitfield in preparation.

While at it, make kernfs_fop_mmap() use "true" instead of "1" on
->mmapped.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Acked-by: Zefan Li <lizefan@huawei.com>
---
 fs/kernfs/file.c       | 2 +-
 include/linux/kernfs.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 78219d5644e9..d0b6fb13ead0 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -516,7 +516,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
 		goto out_put;
 
 	rc = 0;
-	of->mmapped = 1;
+	of->mmapped = true;
 	of->vm_ops = vma->vm_ops;
 	vma->vm_ops = &kernfs_vm_ops;
 out_put:
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 7056238fd9f5..afd4e5abc4fb 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -185,7 +185,7 @@ struct kernfs_open_file {
 	char			*prealloc_buf;
 
 	size_t			atomic_write_len;
-	bool			mmapped;
+	bool			mmapped:1;
 	const struct vm_operations_struct *vm_ops;
 };
 
-- 
cgit v1.2.3


From 0e67db2f9fe91937e798e3d7d22c50a8438187e1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 27 Dec 2016 14:49:03 -0500
Subject: kernfs: add kernfs_ops->open/release() callbacks

Add ->open/release() methods to kernfs_ops.  ->open() is called when
the file is opened and ->release() when the file is either released or
severed.  These callbacks can be used, for example, to manage
persistent caching objects over multiple seq_file iterations.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Acked-by: Zefan Li <lizefan@huawei.com>
---
 fs/kernfs/dir.c             |  2 +-
 fs/kernfs/file.c            | 51 +++++++++++++++++++++++++++++++++++++++------
 fs/kernfs/kernfs-internal.h |  2 +-
 include/linux/kernfs.h      | 10 +++++++++
 4 files changed, 57 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index cf4c636ff4da..0b9d23bcceb5 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -468,7 +468,7 @@ static void kernfs_drain(struct kernfs_node *kn)
 		rwsem_release(&kn->dep_map, 1, _RET_IP_);
 	}
 
-	kernfs_unmap_bin_file(kn);
+	kernfs_drain_open_files(kn);
 
 	mutex_lock(&kernfs_mutex);
 }
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index d0b6fb13ead0..20c396291ac1 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -708,7 +708,8 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
 	if (error)
 		goto err_free;
 
-	((struct seq_file *)file->private_data)->private = of;
+	of->seq_file = file->private_data;
+	of->seq_file->private = of;
 
 	/* seq_file clears PWRITE unconditionally, restore it if WRITE */
 	if (file->f_mode & FMODE_WRITE)
@@ -717,13 +718,22 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
 	/* make sure we have open node struct */
 	error = kernfs_get_open_node(kn, of);
 	if (error)
-		goto err_close;
+		goto err_seq_release;
+
+	if (ops->open) {
+		/* nobody has access to @of yet, skip @of->mutex */
+		error = ops->open(of);
+		if (error)
+			goto err_put_node;
+	}
 
 	/* open succeeded, put active references */
 	kernfs_put_active(kn);
 	return 0;
 
-err_close:
+err_put_node:
+	kernfs_put_open_node(kn, of);
+err_seq_release:
 	seq_release(inode, file);
 err_free:
 	kfree(of->prealloc_buf);
@@ -733,11 +743,32 @@ err_out:
 	return error;
 }
 
+/* used from release/drain to ensure that ->release() is called exactly once */
+static void kernfs_release_file(struct kernfs_node *kn,
+				struct kernfs_open_file *of)
+{
+	if (!(kn->flags & KERNFS_HAS_RELEASE))
+		return;
+
+	mutex_lock(&of->mutex);
+	if (!of->released) {
+		/*
+		 * A file is never detached without being released and we
+		 * need to be able to release files which are deactivated
+		 * and being drained.  Don't use kernfs_ops().
+		 */
+		kn->attr.ops->release(of);
+		of->released = true;
+	}
+	mutex_unlock(&of->mutex);
+}
+
 static int kernfs_fop_release(struct inode *inode, struct file *filp)
 {
 	struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
 	struct kernfs_open_file *of = kernfs_of(filp);
 
+	kernfs_release_file(kn, of);
 	kernfs_put_open_node(kn, of);
 	seq_release(inode, filp);
 	kfree(of->prealloc_buf);
@@ -746,12 +777,12 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-void kernfs_unmap_bin_file(struct kernfs_node *kn)
+void kernfs_drain_open_files(struct kernfs_node *kn)
 {
 	struct kernfs_open_node *on;
 	struct kernfs_open_file *of;
 
-	if (!(kn->flags & KERNFS_HAS_MMAP))
+	if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE)))
 		return;
 
 	spin_lock_irq(&kernfs_open_node_lock);
@@ -763,10 +794,16 @@ void kernfs_unmap_bin_file(struct kernfs_node *kn)
 		return;
 
 	mutex_lock(&kernfs_open_file_mutex);
+
 	list_for_each_entry(of, &on->files, list) {
 		struct inode *inode = file_inode(of->file);
-		unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+
+		if (kn->flags & KERNFS_HAS_MMAP)
+			unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+
+		kernfs_release_file(kn, of);
 	}
+
 	mutex_unlock(&kernfs_open_file_mutex);
 
 	kernfs_put_open_node(kn, NULL);
@@ -965,6 +1002,8 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
 		kn->flags |= KERNFS_HAS_SEQ_SHOW;
 	if (ops->mmap)
 		kn->flags |= KERNFS_HAS_MMAP;
+	if (ops->release)
+		kn->flags |= KERNFS_HAS_RELEASE;
 
 	rc = kernfs_add_one(kn);
 	if (rc) {
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index bfd551bbf231..3100987cf8ba 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -104,7 +104,7 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
  */
 extern const struct file_operations kernfs_file_fops;
 
-void kernfs_unmap_bin_file(struct kernfs_node *kn);
+void kernfs_drain_open_files(struct kernfs_node *kn);
 
 /*
  * symlink.c
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index afd4e5abc4fb..a9b11b8d06f2 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -46,6 +46,7 @@ enum kernfs_node_flag {
 	KERNFS_SUICIDAL		= 0x0400,
 	KERNFS_SUICIDED		= 0x0800,
 	KERNFS_EMPTY_DIR	= 0x1000,
+	KERNFS_HAS_RELEASE	= 0x2000,
 };
 
 /* @flags for kernfs_create_root() */
@@ -175,6 +176,7 @@ struct kernfs_open_file {
 	/* published fields */
 	struct kernfs_node	*kn;
 	struct file		*file;
+	struct seq_file		*seq_file;
 	void			*priv;
 
 	/* private fields, do not use outside kernfs proper */
@@ -186,10 +188,18 @@ struct kernfs_open_file {
 
 	size_t			atomic_write_len;
 	bool			mmapped:1;
+	bool			released:1;
 	const struct vm_operations_struct *vm_ops;
 };
 
 struct kernfs_ops {
+	/*
+	 * Optional open/release methods.  Both are called with
+	 * @of->seq_file populated.
+	 */
+	int (*open)(struct kernfs_open_file *of);
+	void (*release)(struct kernfs_open_file *of);
+
 	/*
 	 * Read is handled by either seq_file or raw_read().
 	 *
-- 
cgit v1.2.3


From e90cbebc3fa5caea4c8bfeb0d0157a0cee53efc7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 27 Dec 2016 14:49:03 -0500
Subject: cgroup add cftype->open/release() callbacks

Pipe the newly added kernfs->open/release() callbacks through cftype.
While at it, as cleanup operations now can be performed from
->release() instead of ->seq_stop(), make the latter optional.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Acked-by: Zefan Li <lizefan@huawei.com>
---
 include/linux/cgroup-defs.h |  3 +++
 kernel/cgroup.c             | 24 +++++++++++++++++++++++-
 2 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 861b4677fc5b..8a916dc96e84 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -388,6 +388,9 @@ struct cftype {
 	struct list_head node;		/* anchored at ss->cfts */
 	struct kernfs_ops *kf_ops;
 
+	int (*open)(struct kernfs_open_file *of);
+	void (*release)(struct kernfs_open_file *of);
+
 	/*
 	 * read_u64() is a shortcut for the common case of returning a
 	 * single integer. Use it in place of read()
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2ee9ec3051b2..87167e40c4fa 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3503,6 +3503,23 @@ static int cgroup_events_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
+static int cgroup_file_open(struct kernfs_open_file *of)
+{
+	struct cftype *cft = of->kn->priv;
+
+	if (cft->open)
+		return cft->open(of);
+	return 0;
+}
+
+static void cgroup_file_release(struct kernfs_open_file *of)
+{
+	struct cftype *cft = of->kn->priv;
+
+	if (cft->release)
+		cft->release(of);
+}
+
 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
 				 size_t nbytes, loff_t off)
 {
@@ -3553,7 +3570,8 @@ static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
 
 static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
 {
-	seq_cft(seq)->seq_stop(seq, v);
+	if (seq_cft(seq)->seq_stop)
+		seq_cft(seq)->seq_stop(seq, v);
 }
 
 static int cgroup_seqfile_show(struct seq_file *m, void *arg)
@@ -3575,12 +3593,16 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
 
 static struct kernfs_ops cgroup_kf_single_ops = {
 	.atomic_write_len	= PAGE_SIZE,
+	.open			= cgroup_file_open,
+	.release		= cgroup_file_release,
 	.write			= cgroup_file_write,
 	.seq_show		= cgroup_seqfile_show,
 };
 
 static struct kernfs_ops cgroup_kf_ops = {
 	.atomic_write_len	= PAGE_SIZE,
+	.open			= cgroup_file_open,
+	.release		= cgroup_file_release,
 	.write			= cgroup_file_write,
 	.seq_start		= cgroup_seqfile_start,
 	.seq_next		= cgroup_seqfile_next,
-- 
cgit v1.2.3


From 5f617ebbdf10abd49312a89e3b894b927c7367f5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 27 Dec 2016 14:49:05 -0500
Subject: cgroup: reorder css_set fields

Reorder css_set fields so that they're roughly in the order of how hot
they are.  The rough order is

1. the actual csses
2. reference counter and the default cgroup pointer.
3. task lists and iterations
4. fields used during merge including css_set lookup
5. the rest

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Acked-by: Zefan Li <lizefan@huawei.com>
---
 include/linux/cgroup-defs.h | 54 ++++++++++++++++++++++-----------------------
 kernel/cgroup.c             | 13 ++++++-----
 2 files changed, 35 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 8a916dc96e84..3c02404cfce9 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -148,14 +148,18 @@ struct cgroup_subsys_state {
  * set for a task.
  */
 struct css_set {
-	/* Reference count */
-	atomic_t refcount;
-
 	/*
-	 * List running through all cgroup groups in the same hash
-	 * slot. Protected by css_set_lock
+	 * Set of subsystem states, one for each subsystem. This array is
+	 * immutable after creation apart from the init_css_set during
+	 * subsystem registration (at boot time).
 	 */
-	struct hlist_node hlist;
+	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
+
+	/* reference count */
+	atomic_t refcount;
+
+	/* the default cgroup associated with this css_set */
+	struct cgroup *dfl_cgrp;
 
 	/*
 	 * Lists running through all tasks using this cgroup group.
@@ -167,21 +171,29 @@ struct css_set {
 	struct list_head tasks;
 	struct list_head mg_tasks;
 
+	/* all css_task_iters currently walking this cset */
+	struct list_head task_iters;
+
 	/*
-	 * List of cgrp_cset_links pointing at cgroups referenced from this
-	 * css_set.  Protected by css_set_lock.
+	 * On the default hierarhcy, ->subsys[ssid] may point to a css
+	 * attached to an ancestor instead of the cgroup this css_set is
+	 * associated with.  The following node is anchored at
+	 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
+	 * iterate through all css's attached to a given cgroup.
 	 */
-	struct list_head cgrp_links;
+	struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
 
-	/* the default cgroup associated with this css_set */
-	struct cgroup *dfl_cgrp;
+	/*
+	 * List running through all cgroup groups in the same hash
+	 * slot. Protected by css_set_lock
+	 */
+	struct hlist_node hlist;
 
 	/*
-	 * Set of subsystem states, one for each subsystem. This array is
-	 * immutable after creation apart from the init_css_set during
-	 * subsystem registration (at boot time).
+	 * List of cgrp_cset_links pointing at cgroups referenced from this
+	 * css_set.  Protected by css_set_lock.
 	 */
-	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
+	struct list_head cgrp_links;
 
 	/*
 	 * List of csets participating in the on-going migration either as
@@ -201,18 +213,6 @@ struct css_set {
 	struct cgroup *mg_dst_cgrp;
 	struct css_set *mg_dst_cset;
 
-	/*
-	 * On the default hierarhcy, ->subsys[ssid] may point to a css
-	 * attached to an ancestor instead of the cgroup this css_set is
-	 * associated with.  The following node is anchored at
-	 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
-	 * iterate through all css's attached to a given cgroup.
-	 */
-	struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
-
-	/* all css_task_iters currently walking this cset */
-	struct list_head task_iters;
-
 	/* dead and being drained, ignore for migration */
 	bool dead;
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b9e2d85d59b1..1a815f275849 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -647,12 +647,12 @@ struct cgrp_cset_link {
  */
 struct css_set init_css_set = {
 	.refcount		= ATOMIC_INIT(1),
-	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
 	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
 	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
+	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
+	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
 	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
 	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
-	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
 };
 
 static int css_set_count	= 1;	/* 1 for init_css_set */
@@ -1095,13 +1095,13 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 	}
 
 	atomic_set(&cset->refcount, 1);
-	INIT_LIST_HEAD(&cset->cgrp_links);
 	INIT_LIST_HEAD(&cset->tasks);
 	INIT_LIST_HEAD(&cset->mg_tasks);
-	INIT_LIST_HEAD(&cset->mg_preload_node);
-	INIT_LIST_HEAD(&cset->mg_node);
 	INIT_LIST_HEAD(&cset->task_iters);
 	INIT_HLIST_NODE(&cset->hlist);
+	INIT_LIST_HEAD(&cset->cgrp_links);
+	INIT_LIST_HEAD(&cset->mg_preload_node);
+	INIT_LIST_HEAD(&cset->mg_node);
 
 	/* Copy the set of subsystem state objects generated in
 	 * find_existing_css_set() */
@@ -4384,6 +4384,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 	struct task_struct *task;
 	int ret;
 
+	if (cgroup_on_dfl(to))
+		return -EINVAL;
+
 	if (!cgroup_may_migrate_to(to))
 		return -EBUSY;
 
-- 
cgit v1.2.3


From 7b4632f048415263669676dda20fd5d811c3d3e4 Mon Sep 17 00:00:00 2001
From: Geliang Tang <geliangtang@gmail.com>
Date: Sat, 24 Dec 2016 23:28:35 +0800
Subject: cgroup: fix a comment typo

Fix a comment typo in cgroup.h.

Signed-off-by: Geliang Tang <geliangtang@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c83c23f0577b..f6b43fbb141c 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -266,7 +266,7 @@ void css_task_iter_end(struct css_task_iter *it);
  * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset
  * @leader: the loop cursor
  * @dst_css: the destination css
- * @tset: takset to iterate
+ * @tset: taskset to iterate
  *
  * Iterate threadgroup leaders of @tset.  For single-task migrations, @tset
  * may not contain any.
-- 
cgit v1.2.3


From 3d48b53fb2ae37158e700ffef3f45461ff15c965 Mon Sep 17 00:00:00 2001
From: Matthias Tafelmeier <matthias.tafelmeier@gmx.net>
Date: Thu, 29 Dec 2016 21:37:21 +0100
Subject: net: dev_weight: TX/RX orthogonality

Oftenly, introducing side effects on packet processing on the other half
of the stack by adjusting one of TX/RX via sysctl is not desirable.
There are cases of demand for asymmetric, orthogonal configurability.

This holds true especially for nodes where RPS for RFS usage on top is
configured and therefore use the 'old dev_weight'. This is quite a
common base configuration setup nowadays, even with NICs of superior processing
support (e.g. aRFS).

A good example use case are nodes acting as noSQL data bases with a
large number of tiny requests and rather fewer but large packets as responses.
It's affordable to have large budget and rx dev_weights for the
requests. But as a side effect having this large a number on TX
processed in one run can overwhelm drivers.

This patch therefore introduces an independent configurability via sysctl to
userland.

Signed-off-by: Matthias Tafelmeier <matthias.tafelmeier@gmx.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/sysctl/net.txt | 21 +++++++++++++++++++++
 include/linux/netdevice.h    |  4 ++++
 net/core/dev.c               |  8 ++++++--
 net/core/sysctl_net_core.c   | 31 ++++++++++++++++++++++++++++++-
 net/sched/sch_generic.c      |  2 +-
 5 files changed, 62 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index f0480f7ea740..b80fbd4e5575 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -61,6 +61,27 @@ The maximum number of packets that kernel can handle on a NAPI interrupt,
 it's a Per-CPU variable.
 Default: 64
 
+dev_weight_rx_bias
+--------------
+
+RPS (e.g. RFS, aRFS) processing is competing with the registered NAPI poll function
+of the driver for the per softirq cycle netdev_budget. This parameter influences
+the proportion of the configured netdev_budget that is spent on RPS based packet
+processing during RX softirq cycles. It is further meant for making current
+dev_weight adaptable for asymmetric CPU needs on RX/TX side of the network stack.
+(see dev_weight_tx_bias) It is effective on a per CPU basis. Determination is based
+on dev_weight and is calculated multiplicative (dev_weight * dev_weight_rx_bias).
+Default: 1
+
+dev_weight_tx_bias
+--------------
+
+Scales the maximum number of packets that can be processed during a TX softirq cycle.
+Effective on a per CPU basis. Allows scaling of current dev_weight for asymmetric
+net stack processing needs. Be careful to avoid making TX softirq processing a CPU hog.
+Calculation is based on dev_weight (dev_weight * dev_weight_tx_bias).
+Default: 1
+
 default_qdisc
 --------------
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 994f7423a74b..ecd78b3c9aba 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3795,6 +3795,10 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
 extern int		netdev_max_backlog;
 extern int		netdev_tstamp_prequeue;
 extern int		weight_p;
+extern int		dev_weight_rx_bias;
+extern int		dev_weight_tx_bias;
+extern int		dev_rx_weight;
+extern int		dev_tx_weight;
 
 bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev);
 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
diff --git a/net/core/dev.c b/net/core/dev.c
index 8db5a0b4b520..56818f7eab2b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3427,7 +3427,11 @@ EXPORT_SYMBOL(netdev_max_backlog);
 
 int netdev_tstamp_prequeue __read_mostly = 1;
 int netdev_budget __read_mostly = 300;
-int weight_p __read_mostly = 64;            /* old backlog weight */
+int weight_p __read_mostly = 64;           /* old backlog weight */
+int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
+int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
+int dev_rx_weight __read_mostly = 64;
+int dev_tx_weight __read_mostly = 64;
 
 /* Called with irq disabled */
 static inline void ____napi_schedule(struct softnet_data *sd,
@@ -4833,7 +4837,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
 		net_rps_action_and_irq_enable(sd);
 	}
 
-	napi->weight = weight_p;
+	napi->weight = dev_rx_weight;
 	while (again) {
 		struct sk_buff *skb;
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 2a46e4009f62..eaa72eb0399c 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -222,6 +222,21 @@ static int set_default_qdisc(struct ctl_table *table, int write,
 }
 #endif
 
+static int proc_do_dev_weight(struct ctl_table *table, int write,
+			   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (ret != 0)
+		return ret;
+
+	dev_rx_weight = weight_p * dev_weight_rx_bias;
+	dev_tx_weight = weight_p * dev_weight_tx_bias;
+
+	return ret;
+}
+
 static int proc_do_rss_key(struct ctl_table *table, int write,
 			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -273,7 +288,21 @@ static struct ctl_table net_core_table[] = {
 		.data		= &weight_p,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= proc_do_dev_weight,
+	},
+	{
+		.procname	= "dev_weight_rx_bias",
+		.data		= &dev_weight_rx_bias,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_do_dev_weight,
+	},
+	{
+		.procname	= "dev_weight_tx_bias",
+		.data		= &dev_weight_tx_bias,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_do_dev_weight,
 	},
 	{
 		.procname	= "netdev_max_backlog",
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6eb9c8e88519..b052b27a984e 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -247,7 +247,7 @@ static inline int qdisc_restart(struct Qdisc *q, int *packets)
 
 void __qdisc_run(struct Qdisc *q)
 {
-	int quota = weight_p;
+	int quota = dev_tx_weight;
 	int packets;
 
 	while (qdisc_restart(q, &packets)) {
-- 
cgit v1.2.3


From 58ae74683ae2c07cd717a91799edb50231061938 Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Mon, 19 Dec 2016 12:25:32 +0100
Subject: fscrypt: factor out bio specific functions

That way we can get rid of the direct dependency on CONFIG_BLOCK.

Fixes: d475a507457b ("ubifs: Add skeleton for fscrypto")
Reported-by: Arnd Bergmann <arnd@arndb.de>
Reported-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Gstir <david@sigma-star.at>
Signed-off-by: Richard Weinberger <richard@nod.at>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/crypto/Kconfig           |   1 -
 fs/crypto/Makefile          |   1 +
 fs/crypto/bio.c             | 145 ++++++++++++++++++++++++++++++++++++++++
 fs/crypto/crypto.c          | 157 +++++---------------------------------------
 fs/crypto/fscrypt_private.h |  16 ++++-
 include/linux/fscrypto.h    |  11 ++--
 6 files changed, 184 insertions(+), 147 deletions(-)
 create mode 100644 fs/crypto/bio.c

(limited to 'include/linux')

diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
index f514978f6688..08b46e6e3995 100644
--- a/fs/crypto/Kconfig
+++ b/fs/crypto/Kconfig
@@ -1,6 +1,5 @@
 config FS_ENCRYPTION
 	tristate "FS Encryption (Per-file encryption)"
-	depends on BLOCK
 	select CRYPTO
 	select CRYPTO_AES
 	select CRYPTO_CBC
diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile
index f17684c48739..9f6607f17b53 100644
--- a/fs/crypto/Makefile
+++ b/fs/crypto/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_FS_ENCRYPTION)	+= fscrypto.o
 
 fscrypto-y := crypto.o fname.o policy.o keyinfo.o
+fscrypto-$(CONFIG_BLOCK) += bio.o
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
new file mode 100644
index 000000000000..a409a84f1bca
--- /dev/null
+++ b/fs/crypto/bio.c
@@ -0,0 +1,145 @@
+/*
+ * This contains encryption functions for per-file encryption.
+ *
+ * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2015, Motorola Mobility
+ *
+ * Written by Michael Halcrow, 2014.
+ *
+ * Filename encryption additions
+ *	Uday Savagaonkar, 2014
+ * Encryption policy handling additions
+ *	Ildar Muslukhov, 2014
+ * Add fscrypt_pullback_bio_page()
+ *	Jaegeuk Kim, 2015.
+ *
+ * This has not yet undergone a rigorous security audit.
+ *
+ * The usage of AES-XTS should conform to recommendations in NIST
+ * Special Publication 800-38E and IEEE P1619/D16.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/module.h>
+#include <linux/bio.h>
+#include <linux/namei.h>
+#include "fscrypt_private.h"
+
+/*
+ * Call fscrypt_decrypt_page on every single page, reusing the encryption
+ * context.
+ */
+static void completion_pages(struct work_struct *work)
+{
+	struct fscrypt_ctx *ctx =
+		container_of(work, struct fscrypt_ctx, r.work);
+	struct bio *bio = ctx->r.bio;
+	struct bio_vec *bv;
+	int i;
+
+	bio_for_each_segment_all(bv, bio, i) {
+		struct page *page = bv->bv_page;
+		int ret = fscrypt_decrypt_page(page->mapping->host, page,
+				PAGE_SIZE, 0, page->index);
+
+		if (ret) {
+			WARN_ON_ONCE(1);
+			SetPageError(page);
+		} else {
+			SetPageUptodate(page);
+		}
+		unlock_page(page);
+	}
+	fscrypt_release_ctx(ctx);
+	bio_put(bio);
+}
+
+void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio)
+{
+	INIT_WORK(&ctx->r.work, completion_pages);
+	ctx->r.bio = bio;
+	queue_work(fscrypt_read_workqueue, &ctx->r.work);
+}
+EXPORT_SYMBOL(fscrypt_decrypt_bio_pages);
+
+void fscrypt_pullback_bio_page(struct page **page, bool restore)
+{
+	struct fscrypt_ctx *ctx;
+	struct page *bounce_page;
+
+	/* The bounce data pages are unmapped. */
+	if ((*page)->mapping)
+		return;
+
+	/* The bounce data page is unmapped. */
+	bounce_page = *page;
+	ctx = (struct fscrypt_ctx *)page_private(bounce_page);
+
+	/* restore control page */
+	*page = ctx->w.control_page;
+
+	if (restore)
+		fscrypt_restore_control_page(bounce_page);
+}
+EXPORT_SYMBOL(fscrypt_pullback_bio_page);
+
+int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
+				sector_t pblk, unsigned int len)
+{
+	struct fscrypt_ctx *ctx;
+	struct page *ciphertext_page = NULL;
+	struct bio *bio;
+	int ret, err = 0;
+
+	BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE);
+
+	ctx = fscrypt_get_ctx(inode, GFP_NOFS);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ciphertext_page = fscrypt_alloc_bounce_page(ctx, GFP_NOWAIT);
+	if (IS_ERR(ciphertext_page)) {
+		err = PTR_ERR(ciphertext_page);
+		goto errout;
+	}
+
+	while (len--) {
+		err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk,
+					     ZERO_PAGE(0), ciphertext_page,
+					     PAGE_SIZE, 0, GFP_NOFS);
+		if (err)
+			goto errout;
+
+		bio = bio_alloc(GFP_NOWAIT, 1);
+		if (!bio) {
+			err = -ENOMEM;
+			goto errout;
+		}
+		bio->bi_bdev = inode->i_sb->s_bdev;
+		bio->bi_iter.bi_sector =
+			pblk << (inode->i_sb->s_blocksize_bits - 9);
+		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+		ret = bio_add_page(bio, ciphertext_page,
+					inode->i_sb->s_blocksize, 0);
+		if (ret != inode->i_sb->s_blocksize) {
+			/* should never happen! */
+			WARN_ON(1);
+			bio_put(bio);
+			err = -EIO;
+			goto errout;
+		}
+		err = submit_bio_wait(bio);
+		if ((err == 0) && bio->bi_error)
+			err = -EIO;
+		bio_put(bio);
+		if (err)
+			goto errout;
+		lblk++;
+		pblk++;
+	}
+	err = 0;
+errout:
+	fscrypt_release_ctx(ctx);
+	return err;
+}
+EXPORT_SYMBOL(fscrypt_zeroout_range);
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index ac8e4f6a3773..02a7a9286449 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -24,7 +24,6 @@
 #include <linux/module.h>
 #include <linux/scatterlist.h>
 #include <linux/ratelimit.h>
-#include <linux/bio.h>
 #include <linux/dcache.h>
 #include <linux/namei.h>
 #include "fscrypt_private.h"
@@ -44,7 +43,7 @@ static mempool_t *fscrypt_bounce_page_pool = NULL;
 static LIST_HEAD(fscrypt_free_ctxs);
 static DEFINE_SPINLOCK(fscrypt_ctx_lock);
 
-static struct workqueue_struct *fscrypt_read_workqueue;
+struct workqueue_struct *fscrypt_read_workqueue;
 static DEFINE_MUTEX(fscrypt_init_mutex);
 
 static struct kmem_cache *fscrypt_ctx_cachep;
@@ -141,16 +140,10 @@ static void page_crypt_complete(struct crypto_async_request *req, int res)
 	complete(&ecr->completion);
 }
 
-typedef enum {
-	FS_DECRYPT = 0,
-	FS_ENCRYPT,
-} fscrypt_direction_t;
-
-static int do_page_crypto(const struct inode *inode,
-			fscrypt_direction_t rw, u64 lblk_num,
-			struct page *src_page, struct page *dest_page,
-			unsigned int len, unsigned int offs,
-			gfp_t gfp_flags)
+int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
+			   u64 lblk_num, struct page *src_page,
+			   struct page *dest_page, unsigned int len,
+			   unsigned int offs, gfp_t gfp_flags)
 {
 	struct {
 		__le64 index;
@@ -205,7 +198,8 @@ static int do_page_crypto(const struct inode *inode,
 	return 0;
 }
 
-static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags)
+struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx,
+				       gfp_t gfp_flags)
 {
 	ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags);
 	if (ctx->w.bounce_page == NULL)
@@ -260,9 +254,9 @@ struct page *fscrypt_encrypt_page(const struct inode *inode,
 
 	if (inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES) {
 		/* with inplace-encryption we just encrypt the page */
-		err = do_page_crypto(inode, FS_ENCRYPT, lblk_num,
-					page, ciphertext_page,
-					len, offs, gfp_flags);
+		err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num, page,
+					     ciphertext_page, len, offs,
+					     gfp_flags);
 		if (err)
 			return ERR_PTR(err);
 
@@ -276,14 +270,14 @@ struct page *fscrypt_encrypt_page(const struct inode *inode,
 		return (struct page *)ctx;
 
 	/* The encryption operation will require a bounce page. */
-	ciphertext_page = alloc_bounce_page(ctx, gfp_flags);
+	ciphertext_page = fscrypt_alloc_bounce_page(ctx, gfp_flags);
 	if (IS_ERR(ciphertext_page))
 		goto errout;
 
 	ctx->w.control_page = page;
-	err = do_page_crypto(inode, FS_ENCRYPT, lblk_num,
-					page, ciphertext_page,
-					len, offs, gfp_flags);
+	err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num,
+				     page, ciphertext_page, len, offs,
+				     gfp_flags);
 	if (err) {
 		ciphertext_page = ERR_PTR(err);
 		goto errout;
@@ -320,72 +314,11 @@ int fscrypt_decrypt_page(const struct inode *inode, struct page *page,
 	if (!(inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES))
 		BUG_ON(!PageLocked(page));
 
-	return do_page_crypto(inode, FS_DECRYPT, lblk_num, page, page, len,
-			offs, GFP_NOFS);
+	return fscrypt_do_page_crypto(inode, FS_DECRYPT, lblk_num, page, page,
+				      len, offs, GFP_NOFS);
 }
 EXPORT_SYMBOL(fscrypt_decrypt_page);
 
-int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
-				sector_t pblk, unsigned int len)
-{
-	struct fscrypt_ctx *ctx;
-	struct page *ciphertext_page = NULL;
-	struct bio *bio;
-	int ret, err = 0;
-
-	BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE);
-
-	ctx = fscrypt_get_ctx(inode, GFP_NOFS);
-	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
-
-	ciphertext_page = alloc_bounce_page(ctx, GFP_NOWAIT);
-	if (IS_ERR(ciphertext_page)) {
-		err = PTR_ERR(ciphertext_page);
-		goto errout;
-	}
-
-	while (len--) {
-		err = do_page_crypto(inode, FS_ENCRYPT, lblk,
-					ZERO_PAGE(0), ciphertext_page,
-					PAGE_SIZE, 0, GFP_NOFS);
-		if (err)
-			goto errout;
-
-		bio = bio_alloc(GFP_NOWAIT, 1);
-		if (!bio) {
-			err = -ENOMEM;
-			goto errout;
-		}
-		bio->bi_bdev = inode->i_sb->s_bdev;
-		bio->bi_iter.bi_sector =
-			pblk << (inode->i_sb->s_blocksize_bits - 9);
-		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
-		ret = bio_add_page(bio, ciphertext_page,
-					inode->i_sb->s_blocksize, 0);
-		if (ret != inode->i_sb->s_blocksize) {
-			/* should never happen! */
-			WARN_ON(1);
-			bio_put(bio);
-			err = -EIO;
-			goto errout;
-		}
-		err = submit_bio_wait(bio);
-		if ((err == 0) && bio->bi_error)
-			err = -EIO;
-		bio_put(bio);
-		if (err)
-			goto errout;
-		lblk++;
-		pblk++;
-	}
-	err = 0;
-errout:
-	fscrypt_release_ctx(ctx);
-	return err;
-}
-EXPORT_SYMBOL(fscrypt_zeroout_range);
-
 /*
  * Validate dentries for encrypted directories to make sure we aren't
  * potentially caching stale data after a key has been added or
@@ -442,64 +375,6 @@ const struct dentry_operations fscrypt_d_ops = {
 };
 EXPORT_SYMBOL(fscrypt_d_ops);
 
-/*
- * Call fscrypt_decrypt_page on every single page, reusing the encryption
- * context.
- */
-static void completion_pages(struct work_struct *work)
-{
-	struct fscrypt_ctx *ctx =
-		container_of(work, struct fscrypt_ctx, r.work);
-	struct bio *bio = ctx->r.bio;
-	struct bio_vec *bv;
-	int i;
-
-	bio_for_each_segment_all(bv, bio, i) {
-		struct page *page = bv->bv_page;
-		int ret = fscrypt_decrypt_page(page->mapping->host, page,
-				PAGE_SIZE, 0, page->index);
-
-		if (ret) {
-			WARN_ON_ONCE(1);
-			SetPageError(page);
-		} else {
-			SetPageUptodate(page);
-		}
-		unlock_page(page);
-	}
-	fscrypt_release_ctx(ctx);
-	bio_put(bio);
-}
-
-void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio)
-{
-	INIT_WORK(&ctx->r.work, completion_pages);
-	ctx->r.bio = bio;
-	queue_work(fscrypt_read_workqueue, &ctx->r.work);
-}
-EXPORT_SYMBOL(fscrypt_decrypt_bio_pages);
-
-void fscrypt_pullback_bio_page(struct page **page, bool restore)
-{
-	struct fscrypt_ctx *ctx;
-	struct page *bounce_page;
-
-	/* The bounce data pages are unmapped. */
-	if ((*page)->mapping)
-		return;
-
-	/* The bounce data page is unmapped. */
-	bounce_page = *page;
-	ctx = (struct fscrypt_ctx *)page_private(bounce_page);
-
-	/* restore control page */
-	*page = ctx->w.control_page;
-
-	if (restore)
-		fscrypt_restore_control_page(bounce_page);
-}
-EXPORT_SYMBOL(fscrypt_pullback_bio_page);
-
 void fscrypt_restore_control_page(struct page *page)
 {
 	struct fscrypt_ctx *ctx;
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index aeab032d7d35..7bff7b4c7498 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -71,6 +71,11 @@ struct fscrypt_info {
 	u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE];
 };
 
+typedef enum {
+	FS_DECRYPT = 0,
+	FS_ENCRYPT,
+} fscrypt_direction_t;
+
 #define FS_CTX_REQUIRES_FREE_ENCRYPT_FL		0x00000001
 #define FS_CTX_HAS_BOUNCE_BUFFER_FL		0x00000002
 
@@ -85,7 +90,16 @@ struct fscrypt_completion_result {
 
 
 /* crypto.c */
-int fscrypt_initialize(unsigned int cop_flags);
+extern int fscrypt_initialize(unsigned int cop_flags);
+extern struct workqueue_struct *fscrypt_read_workqueue;
+extern int fscrypt_do_page_crypto(const struct inode *inode,
+				  fscrypt_direction_t rw, u64 lblk_num,
+				  struct page *src_page,
+				  struct page *dest_page,
+				  unsigned int len, unsigned int offs,
+				  gfp_t gfp_flags);
+extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx,
+					      gfp_t gfp_flags);
 
 /* keyinfo.c */
 extern int fscrypt_get_crypt_info(struct inode *);
diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h
index c074b670aa99..2a2815702095 100644
--- a/include/linux/fscrypto.h
+++ b/include/linux/fscrypto.h
@@ -174,11 +174,8 @@ extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *,
 						u64, gfp_t);
 extern int fscrypt_decrypt_page(const struct inode *, struct page *, unsigned int,
 				unsigned int, u64);
-extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *);
-extern void fscrypt_pullback_bio_page(struct page **, bool);
 extern void fscrypt_restore_control_page(struct page *);
-extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t,
-						unsigned int);
+
 /* policy.c */
 extern int fscrypt_ioctl_set_policy(struct file *, const void __user *);
 extern int fscrypt_ioctl_get_policy(struct file *, void __user *);
@@ -201,6 +198,12 @@ extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32,
 			const struct fscrypt_str *, struct fscrypt_str *);
 extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *,
 			struct fscrypt_str *);
+
+/* bio.c */
+extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *);
+extern void fscrypt_pullback_bio_page(struct page **, bool);
+extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t,
+				 unsigned int);
 #endif
 
 /* crypto.c */
-- 
cgit v1.2.3


From e8f1cb507d01205e03f69809af4347ed8ec9db5b Mon Sep 17 00:00:00 2001
From: "Mintz, Yuval" <Yuval.Mintz@cavium.com>
Date: Sun, 1 Jan 2017 13:57:00 +0200
Subject: qed*: Update to dual-license

Since the submission of the qedr driver, there's inconsistency
in the licensing of the various qed/qede files - some are GPLv2
and some are dual-license.
Since qedr requires dual-license and it's dependent on both,
we're updating the licensing of all qed/qede source files.

Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed.h              | 32 ++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_cxt.c          | 32 ++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_cxt.h          | 34 +++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_dcbx.c         | 32 ++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_dcbx.h         | 32 ++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_dev.c          | 32 ++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_dev_api.h      | 34 +++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_hsi.h          | 32 ++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_hw.c           | 32 ++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_hw.h           | 32 ++++++++++++++++---
 .../net/ethernet/qlogic/qed/qed_init_fw_funcs.c    | 32 ++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_init_ops.c     | 32 ++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_init_ops.h     | 34 +++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_int.c          | 32 ++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_int.h          | 34 +++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_iscsi.c        | 32 ++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_iscsi.h        | 32 ++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_l2.c           | 32 ++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_l2.h           | 32 ++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_ll2.c          | 31 +++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_ll2.h          | 31 +++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_main.c         | 32 ++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_mcp.c          | 32 ++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_mcp.h          | 34 +++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_ooo.c          | 32 ++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_ooo.h          | 32 ++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_reg_addr.h     | 32 ++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_roce.c         |  2 +-
 drivers/net/ethernet/qlogic/qed/qed_roce.h         |  2 +-
 drivers/net/ethernet/qlogic/qed/qed_selftest.c     | 32 +++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_sp.h           | 34 +++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c  | 32 ++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_spq.c          | 32 ++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_sriov.c        | 32 ++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_sriov.h        | 32 ++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_vf.c           | 32 ++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_vf.h           | 32 ++++++++++++++++---
 drivers/net/ethernet/qlogic/qede/qede.h            | 37 ++++++++++++++++++----
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c    | 37 ++++++++++++++++++----
 drivers/net/ethernet/qlogic/qede/qede_main.c       | 37 ++++++++++++++++++----
 drivers/net/ethernet/qlogic/qede/qede_roce.c       |  2 +-
 include/linux/qed/common_hsi.h                     | 33 ++++++++++++++++---
 include/linux/qed/eth_common.h                     | 32 ++++++++++++++++---
 include/linux/qed/iscsi_common.h                   | 32 ++++++++++++++++---
 include/linux/qed/qed_chain.h                      | 34 +++++++++++++++++---
 include/linux/qed/qed_eth_if.h                     | 32 ++++++++++++++++---
 include/linux/qed/qed_if.h                         | 35 ++++++++++++++++----
 include/linux/qed/qed_iov_if.h                     | 32 ++++++++++++++++---
 include/linux/qed/qed_iscsi_if.h                   | 32 ++++++++++++++++---
 include/linux/qed/qed_ll2_if.h                     | 31 +++++++++++++++---
 include/linux/qed/qed_roce_if.h                    |  2 +-
 include/linux/qed/qede_roce.h                      |  2 +-
 include/linux/qed/rdma_common.h                    | 32 ++++++++++++++++---
 include/linux/qed/roce_common.h                    | 32 ++++++++++++++++---
 include/linux/qed/storage_common.h                 | 32 ++++++++++++++++---
 include/linux/qed/tcp_common.h                     | 32 ++++++++++++++++---
 56 files changed, 1449 insertions(+), 223 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index 44c184ebe3b0..f6e0ff439b1c 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef _QED_H
diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.c b/drivers/net/ethernet/qlogic/qed/qed_cxt.c
index 0c42c240b5cf..dcb8fc185df7 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_cxt.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.c
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #include <linux/types.h>
diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.h b/drivers/net/ethernet/qlogic/qed/qed_cxt.h
index 2b8bdaa77800..98f4973cac9d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_cxt.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.h
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
- *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * Copyright (c) 2015-2017  QLogic Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef _QED_CXT_H
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
index a4789a93b692..dc0d2c9ad6b5 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #include <linux/types.h>
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.h b/drivers/net/ethernet/qlogic/qed/qed_dcbx.h
index 9ba681643d05..d70300fda020 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dcbx.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.h
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef _QED_DCBX_H
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 3b2250021c5f..33e720143b8d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #include <linux/types.h>
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
index b6711c106597..5d37ba24da40 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
- *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * Copyright (c) 2015-2017  QLogic Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef _QED_DEV_API_H
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index 785ab03683eb..5d31189288e8 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef _QED_HSI_H
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hw.c b/drivers/net/ethernet/qlogic/qed/qed_hw.c
index 6e4fae9b1430..1f606516b6aa 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hw.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_hw.c
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #include <linux/types.h>
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hw.h b/drivers/net/ethernet/qlogic/qed/qed_hw.h
index d01557092868..9277264d2e65 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hw.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hw.h
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef _QED_HW_H
diff --git a/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c b/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c
index 23e455f22adc..d891a6852695 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #include <linux/types.h>
diff --git a/drivers/net/ethernet/qlogic/qed/qed_init_ops.c b/drivers/net/ethernet/qlogic/qed/qed_init_ops.c
index d567ba94c8d1..243b64e0d4dc 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_init_ops.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_init_ops.c
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #include <linux/types.h>
diff --git a/drivers/net/ethernet/qlogic/qed/qed_init_ops.h b/drivers/net/ethernet/qlogic/qed/qed_init_ops.h
index 1e832049983d..555dd086796d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_init_ops.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_init_ops.h
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
- *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * Copyright (c) 2015-2017  QLogic Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef _QED_INIT_OPS_H
diff --git a/drivers/net/ethernet/qlogic/qed/qed_int.c b/drivers/net/ethernet/qlogic/qed/qed_int.c
index c68dbf7092b1..84310b60849b 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_int.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_int.c
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #include <linux/types.h>
diff --git a/drivers/net/ethernet/qlogic/qed/qed_int.h b/drivers/net/ethernet/qlogic/qed/qed_int.h
index 0948be64dc78..0ae0bb4593ef 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_int.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_int.h
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
- *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * Copyright (c) 2015-2017  QLogic Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef _QED_INT_H
diff --git a/drivers/net/ethernet/qlogic/qed/qed_iscsi.c b/drivers/net/ethernet/qlogic/qed/qed_iscsi.c
index 17a70122df05..3a44d6b395fa 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_iscsi.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_iscsi.c
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #include <linux/types.h>
diff --git a/drivers/net/ethernet/qlogic/qed/qed_iscsi.h b/drivers/net/ethernet/qlogic/qed/qed_iscsi.h
index 67c25f3db4d5..20c187f4ed0b 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_iscsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_iscsi.h
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef _QED_ISCSI_H
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 6a3727c4c0c6..fd153d27e3a6 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #include <linux/types.h>
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.h b/drivers/net/ethernet/qlogic/qed/qed_l2.h
index 48c9bfc28140..2f0303761d6b 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.h
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 #ifndef _QED_L2_H
 #define _QED_L2_H
diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
index 8e5cb7605b0f..05e32f4322eb 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@@ -1,10 +1,33 @@
 /* QLogic qed NIC Driver
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * Copyright (c) 2015 QLogic Corporation
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #include <linux/types.h>
diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.h b/drivers/net/ethernet/qlogic/qed/qed_ll2.h
index 6625a3ae5a33..c7f2975590ee 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.h
@@ -1,10 +1,33 @@
 /* QLogic qed NIC Driver
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * Copyright (c) 2015 QLogic Corporation
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef _QED_LL2_H
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index aeb98d8c5626..e1568428569b 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #include <linux/stddef.h>
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index 6dd3ce443484..256674131c47 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #include <linux/types.h>
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
index 407a2c1830fb..363dce0f16b1 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
- *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * Copyright (c) 2015-2017  QLogic Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef _QED_MCP_H
diff --git a/drivers/net/ethernet/qlogic/qed/qed_ooo.c b/drivers/net/ethernet/qlogic/qed/qed_ooo.c
index 155abcb507fd..7d731c6cb892 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ooo.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ooo.c
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #include <linux/types.h>
diff --git a/drivers/net/ethernet/qlogic/qed/qed_ooo.h b/drivers/net/ethernet/qlogic/qed/qed_ooo.h
index 7a0670a9a074..4f138fb5f533 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ooo.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_ooo.h
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef _QED_OOO_H
diff --git a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
index 97544205a8c1..b6722c6ff761 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef REG_ADDR_H
diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.c b/drivers/net/ethernet/qlogic/qed/qed_roce.c
index 2a16547c8966..bd4cad2b343b 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_roce.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_roce.c
@@ -1,5 +1,5 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015-2016  QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.h b/drivers/net/ethernet/qlogic/qed/qed_roce.h
index 279f342af8db..36cf4b2ab7fa 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_roce.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_roce.h
@@ -1,5 +1,5 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015-2016  QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
diff --git a/drivers/net/ethernet/qlogic/qed/qed_selftest.c b/drivers/net/ethernet/qlogic/qed/qed_selftest.c
index 48bfaecaf6dc..1bafc05db2b8 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_selftest.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_selftest.c
@@ -1,3 +1,35 @@
+/* QLogic qed NIC Driver
+ * Copyright (c) 2015-2016  QLogic Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include <linux/crc32.h>
 #include "qed.h"
 #include "qed_dev_api.h"
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp.h b/drivers/net/ethernet/qlogic/qed/qed_sp.h
index 9c897bc68d05..043882959606 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp.h
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
- *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * Copyright (c) 2015-2017  QLogic Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef _QED_SP_H
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index a39ef2e7a9a6..097a72987572 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #include <linux/types.h>
diff --git a/drivers/net/ethernet/qlogic/qed/qed_spq.c b/drivers/net/ethernet/qlogic/qed/qed_spq.c
index f022469bdcf8..645328a9f0cf 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_spq.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_spq.c
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #include <linux/types.h>
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index 85b09dd1787a..469e857f5dcd 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #include <linux/etherdevice.h>
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.h b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
index 509c02b4772e..1738d5bd2e8e 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef _QED_SRIOV_H
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c
index 60b31a8ede73..af0542c0351c 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #include <linux/crc32.h>
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.h b/drivers/net/ethernet/qlogic/qed/qed_vf.h
index 11eb3854e6f2..7da0b165d8bc 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.h
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef _QED_VF_H
diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index c79dc78746fc..b2c6cc997bc0 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -1,11 +1,34 @@
 /* QLogic qede NIC Driver
-* Copyright (c) 2015 QLogic Corporation
-*
-* This software is available under the terms of the GNU General Public License
-* (GPL) Version 2, available from the file COPYING in the main directory of
-* this source tree.
-*/
-
+ * Copyright (c) 2015-2017  QLogic Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
 #ifndef _QEDE_H_
 #define _QEDE_H_
 #include <linux/compiler.h>
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index 1c48f445c93b..8e971b289d4b 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -1,11 +1,34 @@
 /* QLogic qede NIC Driver
-* Copyright (c) 2015 QLogic Corporation
-*
-* This software is available under the terms of the GNU General Public License
-* (GPL) Version 2, available from the file COPYING in the main directory of
-* this source tree.
-*/
-
+ * Copyright (c) 2015-2017  QLogic Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
 #include <linux/version.h>
 #include <linux/types.h>
 #include <linux/netdevice.h>
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index aecdd1c5c0ea..0851fe3a8f25 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1,11 +1,34 @@
 /* QLogic qede NIC Driver
-* Copyright (c) 2015 QLogic Corporation
-*
-* This software is available under the terms of the GNU General Public License
-* (GPL) Version 2, available from the file COPYING in the main directory of
-* this source tree.
-*/
-
+ * Copyright (c) 2015-2017  QLogic Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/version.h>
diff --git a/drivers/net/ethernet/qlogic/qede/qede_roce.c b/drivers/net/ethernet/qlogic/qede/qede_roce.c
index 49272716a7c4..f00657ce7c8f 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_roce.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_roce.c
@@ -1,5 +1,5 @@
 /* QLogic qedr NIC Driver
- * Copyright (c) 2015-2016  QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h
index 734deb094618..c33080baf38c 100644
--- a/include/linux/qed/common_hsi.h
+++ b/include/linux/qed/common_hsi.h
@@ -1,10 +1,35 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2016  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
+
 #ifndef _COMMON_HSI_H
 #define _COMMON_HSI_H
 #include <linux/types.h>
diff --git a/include/linux/qed/eth_common.h b/include/linux/qed/eth_common.h
index 1aa0727c4136..4b402fb0eaad 100644
--- a/include/linux/qed/eth_common.h
+++ b/include/linux/qed/eth_common.h
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef __ETH_COMMON__
diff --git a/include/linux/qed/iscsi_common.h b/include/linux/qed/iscsi_common.h
index 8f64b1223c2f..4c5747babcf6 100644
--- a/include/linux/qed/iscsi_common.h
+++ b/include/linux/qed/iscsi_common.h
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef __ISCSI_COMMON__
diff --git a/include/linux/qed/qed_chain.h b/include/linux/qed/qed_chain.h
index 37dfba101c6c..5cd7a4608c9b 100644
--- a/include/linux/qed/qed_chain.h
+++ b/include/linux/qed/qed_chain.h
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
- *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * Copyright (c) 2015-2017  QLogic Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef _QED_CHAIN_H
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index 7a52f7c58c37..d91651ecfd26 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef _QED_ETH_IF_H
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 4b454f4f5b25..d1576a2bcfc9 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -1,10 +1,33 @@
 /* QLogic qed NIC Driver
- *
- * Copyright (c) 2015 QLogic Corporation
- *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * Copyright (c) 2015-2017  QLogic Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef _QED_IF_H
diff --git a/include/linux/qed/qed_iov_if.h b/include/linux/qed/qed_iov_if.h
index 5a4f8d0899e9..b1f979135f97 100644
--- a/include/linux/qed/qed_iov_if.h
+++ b/include/linux/qed/qed_iov_if.h
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef _QED_IOV_IF_H
diff --git a/include/linux/qed/qed_iscsi_if.h b/include/linux/qed/qed_iscsi_if.h
index d27912480cb3..f70bb81b8b6a 100644
--- a/include/linux/qed/qed_iscsi_if.h
+++ b/include/linux/qed/qed_iscsi_if.h
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef _QED_ISCSI_IF_H
diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h
index fd75c265dba3..4fb4666ea879 100644
--- a/include/linux/qed/qed_ll2_if.h
+++ b/include/linux/qed/qed_ll2_if.h
@@ -1,10 +1,33 @@
 /* QLogic qed NIC Driver
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * Copyright (c) 2015 QLogic Corporation
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef _QED_LL2_IF_H
diff --git a/include/linux/qed/qed_roce_if.h b/include/linux/qed/qed_roce_if.h
index 53047d3fa678..f742d4312c9d 100644
--- a/include/linux/qed/qed_roce_if.h
+++ b/include/linux/qed/qed_roce_if.h
@@ -1,5 +1,5 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015-2016  QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
diff --git a/include/linux/qed/qede_roce.h b/include/linux/qed/qede_roce.h
index f48d64b0e2fb..3b8dd551a98c 100644
--- a/include/linux/qed/qede_roce.h
+++ b/include/linux/qed/qede_roce.h
@@ -1,5 +1,5 @@
 /* QLogic qedr NIC Driver
- * Copyright (c) 2015-2016  QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
diff --git a/include/linux/qed/rdma_common.h b/include/linux/qed/rdma_common.h
index 7663725faa94..f773aa5e746f 100644
--- a/include/linux/qed/rdma_common.h
+++ b/include/linux/qed/rdma_common.h
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef __RDMA_COMMON__
diff --git a/include/linux/qed/roce_common.h b/include/linux/qed/roce_common.h
index 2eeaf3dc6646..bad02df213df 100644
--- a/include/linux/qed/roce_common.h
+++ b/include/linux/qed/roce_common.h
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef __ROCE_COMMON__
diff --git a/include/linux/qed/storage_common.h b/include/linux/qed/storage_common.h
index 3b8e1efd9bc2..03f3e37ab059 100644
--- a/include/linux/qed/storage_common.h
+++ b/include/linux/qed/storage_common.h
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef __STORAGE_COMMON__
diff --git a/include/linux/qed/tcp_common.h b/include/linux/qed/tcp_common.h
index dc3889d1bbe6..46fe7856f1b2 100644
--- a/include/linux/qed/tcp_common.h
+++ b/include/linux/qed/tcp_common.h
@@ -1,9 +1,33 @@
 /* QLogic qed NIC Driver
- * Copyright (c) 2015 QLogic Corporation
+ * Copyright (c) 2015-2017  QLogic Corporation
  *
- * This software is available under the terms of the GNU General Public License
- * (GPL) Version 2, available from the file COPYING in the main directory of
- * this source tree.
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #ifndef __TCP_COMMON__
-- 
cgit v1.2.3


From f29ffdb65ff0eaf95d2a2b80f0dee3fbd5a64772 Mon Sep 17 00:00:00 2001
From: "Mintz, Yuval" <Yuval.Mintz@cavium.com>
Date: Sun, 1 Jan 2017 13:57:07 +0200
Subject: qed*: RSS indirection based on queue-handles

A step toward having qede agnostic to the queue configurations
in firmware/hardware - let the RSS indirections use queue handles
instead of actual queue indices.

Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_l2.c        | 232 +++++++++++++++---------
 drivers/net/ethernet/qlogic/qed/qed_l2.h        |  28 +--
 drivers/net/ethernet/qlogic/qed/qed_sriov.c     |  82 +++++----
 drivers/net/ethernet/qlogic/qed/qed_vf.c        |  12 +-
 drivers/net/ethernet/qlogic/qede/qede.h         |   6 +-
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c |  77 ++++----
 drivers/net/ethernet/qlogic/qede/qede_filter.c  |  92 ++++++++--
 drivers/net/ethernet/qlogic/qede/qede_main.c    | 126 +++++--------
 include/linux/qed/qed_eth_if.h                  |   2 +-
 9 files changed, 392 insertions(+), 265 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 03d31b394df7..a35db6951147 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -98,6 +98,7 @@ _qed_eth_queue_to_cid(struct qed_hwfn *p_hwfn,
 	p_cid->cid = cid;
 	p_cid->vf_qid = vf_qid;
 	p_cid->rel = *p_params;
+	p_cid->p_owner = p_hwfn;
 
 	/* Don't try calculating the absolute indices for VFs */
 	if (IS_VF(p_hwfn->cdev)) {
@@ -272,76 +273,103 @@ static int qed_sp_vport_start(struct qed_hwfn *p_hwfn,
 static int
 qed_sp_vport_update_rss(struct qed_hwfn *p_hwfn,
 			struct vport_update_ramrod_data *p_ramrod,
-			struct qed_rss_params *p_params)
+			struct qed_rss_params *p_rss)
 {
-	struct eth_vport_rss_config *rss = &p_ramrod->rss_config;
-	u16 abs_l2_queue = 0, capabilities = 0;
-	int rc = 0, i;
+	struct eth_vport_rss_config *p_config;
+	u16 capabilities = 0;
+	int i, table_size;
+	int rc = 0;
 
-	if (!p_params) {
+	if (!p_rss) {
 		p_ramrod->common.update_rss_flg = 0;
 		return rc;
 	}
+	p_config = &p_ramrod->rss_config;
 
-	BUILD_BUG_ON(QED_RSS_IND_TABLE_SIZE !=
-		     ETH_RSS_IND_TABLE_ENTRIES_NUM);
+	BUILD_BUG_ON(QED_RSS_IND_TABLE_SIZE != ETH_RSS_IND_TABLE_ENTRIES_NUM);
 
-	rc = qed_fw_rss_eng(p_hwfn, p_params->rss_eng_id, &rss->rss_id);
+	rc = qed_fw_rss_eng(p_hwfn, p_rss->rss_eng_id, &p_config->rss_id);
 	if (rc)
 		return rc;
 
-	p_ramrod->common.update_rss_flg = p_params->update_rss_config;
-	rss->update_rss_capabilities = p_params->update_rss_capabilities;
-	rss->update_rss_ind_table = p_params->update_rss_ind_table;
-	rss->update_rss_key = p_params->update_rss_key;
+	p_ramrod->common.update_rss_flg = p_rss->update_rss_config;
+	p_config->update_rss_capabilities = p_rss->update_rss_capabilities;
+	p_config->update_rss_ind_table = p_rss->update_rss_ind_table;
+	p_config->update_rss_key = p_rss->update_rss_key;
 
-	rss->rss_mode = p_params->rss_enable ?
-			ETH_VPORT_RSS_MODE_REGULAR :
-			ETH_VPORT_RSS_MODE_DISABLED;
+	p_config->rss_mode = p_rss->rss_enable ?
+			     ETH_VPORT_RSS_MODE_REGULAR :
+			     ETH_VPORT_RSS_MODE_DISABLED;
 
 	SET_FIELD(capabilities,
 		  ETH_VPORT_RSS_CONFIG_IPV4_CAPABILITY,
-		  !!(p_params->rss_caps & QED_RSS_IPV4));
+		  !!(p_rss->rss_caps & QED_RSS_IPV4));
 	SET_FIELD(capabilities,
 		  ETH_VPORT_RSS_CONFIG_IPV6_CAPABILITY,
-		  !!(p_params->rss_caps & QED_RSS_IPV6));
+		  !!(p_rss->rss_caps & QED_RSS_IPV6));
 	SET_FIELD(capabilities,
 		  ETH_VPORT_RSS_CONFIG_IPV4_TCP_CAPABILITY,
-		  !!(p_params->rss_caps & QED_RSS_IPV4_TCP));
+		  !!(p_rss->rss_caps & QED_RSS_IPV4_TCP));
 	SET_FIELD(capabilities,
 		  ETH_VPORT_RSS_CONFIG_IPV6_TCP_CAPABILITY,
-		  !!(p_params->rss_caps & QED_RSS_IPV6_TCP));
+		  !!(p_rss->rss_caps & QED_RSS_IPV6_TCP));
 	SET_FIELD(capabilities,
 		  ETH_VPORT_RSS_CONFIG_IPV4_UDP_CAPABILITY,
-		  !!(p_params->rss_caps & QED_RSS_IPV4_UDP));
+		  !!(p_rss->rss_caps & QED_RSS_IPV4_UDP));
 	SET_FIELD(capabilities,
 		  ETH_VPORT_RSS_CONFIG_IPV6_UDP_CAPABILITY,
-		  !!(p_params->rss_caps & QED_RSS_IPV6_UDP));
-	rss->tbl_size = p_params->rss_table_size_log;
+		  !!(p_rss->rss_caps & QED_RSS_IPV6_UDP));
+	p_config->tbl_size = p_rss->rss_table_size_log;
 
-	rss->capabilities = cpu_to_le16(capabilities);
+	p_config->capabilities = cpu_to_le16(capabilities);
 
 	DP_VERBOSE(p_hwfn, NETIF_MSG_IFUP,
 		   "update rss flag %d, rss_mode = %d, update_caps = %d, capabilities = %d, update_ind = %d, update_rss_key = %d\n",
 		   p_ramrod->common.update_rss_flg,
-		   rss->rss_mode, rss->update_rss_capabilities,
-		   capabilities, rss->update_rss_ind_table,
-		   rss->update_rss_key);
+		   p_config->rss_mode,
+		   p_config->update_rss_capabilities,
+		   p_config->capabilities,
+		   p_config->update_rss_ind_table, p_config->update_rss_key);
 
-	for (i = 0; i < QED_RSS_IND_TABLE_SIZE; i++) {
-		rc = qed_fw_l2_queue(p_hwfn,
-				     (u8)p_params->rss_ind_table[i],
-				     &abs_l2_queue);
-		if (rc)
-			return rc;
+	table_size = min_t(int, QED_RSS_IND_TABLE_SIZE,
+			   1 << p_config->tbl_size);
+	for (i = 0; i < table_size; i++) {
+		struct qed_queue_cid *p_queue = p_rss->rss_ind_table[i];
 
-		rss->indirection_table[i] = cpu_to_le16(abs_l2_queue);
-		DP_VERBOSE(p_hwfn, NETIF_MSG_IFUP, "i= %d, queue = %d\n",
-			   i, rss->indirection_table[i]);
+		if (!p_queue)
+			return -EINVAL;
+
+		p_config->indirection_table[i] =
+		    cpu_to_le16(p_queue->abs.queue_id);
+	}
+
+	DP_VERBOSE(p_hwfn, NETIF_MSG_IFUP,
+		   "Configured RSS indirection table [%d entries]:\n",
+		   table_size);
+	for (i = 0; i < QED_RSS_IND_TABLE_SIZE; i += 0x10) {
+		DP_VERBOSE(p_hwfn,
+			   NETIF_MSG_IFUP,
+			   "%04x %04x %04x %04x %04x %04x %04x %04x %04x %04x %04x %04x %04x %04x %04x %04x\n",
+			   le16_to_cpu(p_config->indirection_table[i]),
+			   le16_to_cpu(p_config->indirection_table[i + 1]),
+			   le16_to_cpu(p_config->indirection_table[i + 2]),
+			   le16_to_cpu(p_config->indirection_table[i + 3]),
+			   le16_to_cpu(p_config->indirection_table[i + 4]),
+			   le16_to_cpu(p_config->indirection_table[i + 5]),
+			   le16_to_cpu(p_config->indirection_table[i + 6]),
+			   le16_to_cpu(p_config->indirection_table[i + 7]),
+			   le16_to_cpu(p_config->indirection_table[i + 8]),
+			   le16_to_cpu(p_config->indirection_table[i + 9]),
+			   le16_to_cpu(p_config->indirection_table[i + 10]),
+			   le16_to_cpu(p_config->indirection_table[i + 11]),
+			   le16_to_cpu(p_config->indirection_table[i + 12]),
+			   le16_to_cpu(p_config->indirection_table[i + 13]),
+			   le16_to_cpu(p_config->indirection_table[i + 14]),
+			   le16_to_cpu(p_config->indirection_table[i + 15]));
 	}
 
 	for (i = 0; i < 10; i++)
-		rss->rss_key[i] = cpu_to_le32(p_params->rss_key[i]);
+		p_config->rss_key[i] = cpu_to_le32(p_rss->rss_key[i]);
 
 	return rc;
 }
@@ -1899,18 +1927,84 @@ static int qed_stop_vport(struct qed_dev *cdev, u8 vport_id)
 	return 0;
 }
 
+static int qed_update_vport_rss(struct qed_dev *cdev,
+				struct qed_update_vport_rss_params *input,
+				struct qed_rss_params *rss)
+{
+	int i, fn;
+
+	/* Update configuration with what's correct regardless of CMT */
+	rss->update_rss_config = 1;
+	rss->rss_enable = 1;
+	rss->update_rss_capabilities = 1;
+	rss->update_rss_ind_table = 1;
+	rss->update_rss_key = 1;
+	rss->rss_caps = input->rss_caps;
+	memcpy(rss->rss_key, input->rss_key, QED_RSS_KEY_SIZE * sizeof(u32));
+
+	/* In regular scenario, we'd simply need to take input handlers.
+	 * But in CMT, we'd have to split the handlers according to the
+	 * engine they were configured on. We'd then have to understand
+	 * whether RSS is really required, since 2-queues on CMT doesn't
+	 * require RSS.
+	 */
+	if (cdev->num_hwfns == 1) {
+		memcpy(rss->rss_ind_table,
+		       input->rss_ind_table,
+		       QED_RSS_IND_TABLE_SIZE * sizeof(void *));
+		rss->rss_table_size_log = 7;
+		return 0;
+	}
+
+	/* Start by copying the non-spcific information to the 2nd copy */
+	memcpy(&rss[1], &rss[0], sizeof(struct qed_rss_params));
+
+	/* CMT should be round-robin */
+	for (i = 0; i < QED_RSS_IND_TABLE_SIZE; i++) {
+		struct qed_queue_cid *cid = input->rss_ind_table[i];
+		struct qed_rss_params *t_rss;
+
+		if (cid->p_owner == QED_LEADING_HWFN(cdev))
+			t_rss = &rss[0];
+		else
+			t_rss = &rss[1];
+
+		t_rss->rss_ind_table[i / cdev->num_hwfns] = cid;
+	}
+
+	/* Make sure RSS is actually required */
+	for_each_hwfn(cdev, fn) {
+		for (i = 1; i < QED_RSS_IND_TABLE_SIZE / cdev->num_hwfns; i++) {
+			if (rss[fn].rss_ind_table[i] !=
+			    rss[fn].rss_ind_table[0])
+				break;
+		}
+		if (i == QED_RSS_IND_TABLE_SIZE / cdev->num_hwfns) {
+			DP_VERBOSE(cdev, NETIF_MSG_IFUP,
+				   "CMT - 1 queue per-hwfn; Disabling RSS\n");
+			return -EINVAL;
+		}
+		rss[fn].rss_table_size_log = 6;
+	}
+
+	return 0;
+}
+
 static int qed_update_vport(struct qed_dev *cdev,
 			    struct qed_update_vport_params *params)
 {
 	struct qed_sp_vport_update_params sp_params;
-	struct qed_rss_params sp_rss_params;
-	int rc, i;
+	struct qed_rss_params *rss;
+	int rc = 0, i;
 
 	if (!cdev)
 		return -ENODEV;
 
+	rss = vzalloc(sizeof(*rss) * cdev->num_hwfns);
+	if (!rss)
+		return -ENOMEM;
+
 	memset(&sp_params, 0, sizeof(sp_params));
-	memset(&sp_rss_params, 0, sizeof(sp_rss_params));
 
 	/* Translate protocol params into sp params */
 	sp_params.vport_id = params->vport_id;
@@ -1924,66 +2018,24 @@ static int qed_update_vport(struct qed_dev *cdev,
 	sp_params.update_accept_any_vlan_flg =
 		params->update_accept_any_vlan_flg;
 
-	/* RSS - is a bit tricky, since upper-layer isn't familiar with hwfns.
-	 * We need to re-fix the rss values per engine for CMT.
-	 */
-	if (cdev->num_hwfns > 1 && params->update_rss_flg) {
-		struct qed_update_vport_rss_params *rss = &params->rss_params;
-		int k, max = 0;
-
-		/* Find largest entry, since it's possible RSS needs to
-		 * be disabled [in case only 1 queue per-hwfn]
-		 */
-		for (k = 0; k < QED_RSS_IND_TABLE_SIZE; k++)
-			max = (max > rss->rss_ind_table[k]) ?
-				max : rss->rss_ind_table[k];
-
-		/* Either fix RSS values or disable RSS */
-		if (cdev->num_hwfns < max + 1) {
-			int divisor = (max + cdev->num_hwfns - 1) /
-				cdev->num_hwfns;
-
-			DP_VERBOSE(cdev, (QED_MSG_SPQ | NETIF_MSG_IFUP),
-				   "CMT - fixing RSS values (modulo %02x)\n",
-				   divisor);
-
-			for (k = 0; k < QED_RSS_IND_TABLE_SIZE; k++)
-				rss->rss_ind_table[k] =
-					rss->rss_ind_table[k] % divisor;
-		} else {
-			DP_VERBOSE(cdev, (QED_MSG_SPQ | NETIF_MSG_IFUP),
-				   "CMT - 1 queue per-hwfn; Disabling RSS\n");
+	/* Prepare the RSS configuration */
+	if (params->update_rss_flg)
+		if (qed_update_vport_rss(cdev, &params->rss_params, rss))
 			params->update_rss_flg = 0;
-		}
-	}
-
-	/* Now, update the RSS configuration for actual configuration */
-	if (params->update_rss_flg) {
-		sp_rss_params.update_rss_config = 1;
-		sp_rss_params.rss_enable = 1;
-		sp_rss_params.update_rss_capabilities = 1;
-		sp_rss_params.update_rss_ind_table = 1;
-		sp_rss_params.update_rss_key = 1;
-		sp_rss_params.rss_caps = params->rss_params.rss_caps;
-		sp_rss_params.rss_table_size_log = 7; /* 2^7 = 128 */
-		memcpy(sp_rss_params.rss_ind_table,
-		       params->rss_params.rss_ind_table,
-		       QED_RSS_IND_TABLE_SIZE * sizeof(u16));
-		memcpy(sp_rss_params.rss_key, params->rss_params.rss_key,
-		       QED_RSS_KEY_SIZE * sizeof(u32));
-		sp_params.rss_params = &sp_rss_params;
-	}
 
 	for_each_hwfn(cdev, i) {
 		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
 
+		if (params->update_rss_flg)
+			sp_params.rss_params = &rss[i];
+
 		sp_params.opaque_fid = p_hwfn->hw_info.opaque_fid;
 		rc = qed_sp_vport_update(p_hwfn, &sp_params,
 					 QED_SPQ_MODE_EBLOCK,
 					 NULL);
 		if (rc) {
 			DP_ERR(cdev, "Failed to update VPORT\n");
-			return rc;
+			goto out;
 		}
 
 		DP_VERBOSE(cdev, (QED_MSG_SPQ | NETIF_MSG_IFUP),
@@ -1992,7 +2044,9 @@ static int qed_update_vport(struct qed_dev *cdev,
 			   params->update_vport_active_flg);
 	}
 
-	return 0;
+out:
+	vfree(rss);
+	return rc;
 }
 
 static int qed_start_rxq(struct qed_dev *cdev,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.h b/drivers/net/ethernet/qlogic/qed/qed_l2.h
index 2f0303761d6b..93cb932ef663 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.h
@@ -39,6 +39,20 @@
 #include "qed.h"
 #include "qed_hw.h"
 #include "qed_sp.h"
+struct qed_rss_params {
+	u8 update_rss_config;
+	u8 rss_enable;
+	u8 rss_eng_id;
+	u8 update_rss_capabilities;
+	u8 update_rss_ind_table;
+	u8 update_rss_key;
+	u8 rss_caps;
+	u8 rss_table_size_log;
+
+	/* Indirection table consist of rx queue handles */
+	void *rss_ind_table[QED_RSS_IND_TABLE_SIZE];
+	u32 rss_key[QED_RSS_KEY_SIZE];
+};
 
 struct qed_sge_tpa_params {
 	u8 max_buffers_per_cqe;
@@ -156,18 +170,6 @@ struct qed_sp_vport_start_params {
 int qed_sp_eth_vport_start(struct qed_hwfn *p_hwfn,
 			   struct qed_sp_vport_start_params *p_params);
 
-struct qed_rss_params {
-	u8	update_rss_config;
-	u8	rss_enable;
-	u8	rss_eng_id;
-	u8	update_rss_capabilities;
-	u8	update_rss_ind_table;
-	u8	update_rss_key;
-	u8	rss_caps;
-	u8	rss_table_size_log;
-	u16	rss_ind_table[QED_RSS_IND_TABLE_SIZE];
-	u32	rss_key[QED_RSS_KEY_SIZE];
-};
 
 struct qed_filter_accept_flags {
 	u8	update_rx_mode_config;
@@ -287,6 +289,8 @@ struct qed_queue_cid {
 
 	/* Legacy VFs might have Rx producer located elsewhere */
 	bool b_legacy_vf;
+
+	struct qed_hwfn *p_owner;
 };
 
 void qed_eth_queue_cid_release(struct qed_hwfn *p_hwfn,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index 469e857f5dcd..b22baf5e5daf 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -32,6 +32,7 @@
 
 #include <linux/etherdevice.h>
 #include <linux/crc32.h>
+#include <linux/vmalloc.h>
 #include <linux/qed/qed_iov_if.h>
 #include "qed_cxt.h"
 #include "qed_hsi.h"
@@ -2318,12 +2319,14 @@ qed_iov_vp_update_rss_param(struct qed_hwfn *p_hwfn,
 			    struct qed_vf_info *vf,
 			    struct qed_sp_vport_update_params *p_data,
 			    struct qed_rss_params *p_rss,
-			    struct qed_iov_vf_mbx *p_mbx, u16 *tlvs_mask)
+			    struct qed_iov_vf_mbx *p_mbx,
+			    u16 *tlvs_mask, u16 *tlvs_accepted)
 {
 	struct vfpf_vport_update_rss_tlv *p_rss_tlv;
 	u16 tlv = CHANNEL_TLV_VPORT_UPDATE_RSS;
-	u16 i, q_idx, max_q_idx;
+	bool b_reject = false;
 	u16 table_size;
+	u16 i, q_idx;
 
 	p_rss_tlv = (struct vfpf_vport_update_rss_tlv *)
 		    qed_iov_search_list_tlvs(p_hwfn, p_mbx->req_virt, tlv);
@@ -2347,34 +2350,39 @@ qed_iov_vp_update_rss_param(struct qed_hwfn *p_hwfn,
 	p_rss->rss_eng_id = vf->relative_vf_id + 1;
 	p_rss->rss_caps = p_rss_tlv->rss_caps;
 	p_rss->rss_table_size_log = p_rss_tlv->rss_table_size_log;
-	memcpy(p_rss->rss_ind_table, p_rss_tlv->rss_ind_table,
-	       sizeof(p_rss->rss_ind_table));
 	memcpy(p_rss->rss_key, p_rss_tlv->rss_key, sizeof(p_rss->rss_key));
 
 	table_size = min_t(u16, ARRAY_SIZE(p_rss->rss_ind_table),
 			   (1 << p_rss_tlv->rss_table_size_log));
 
-	max_q_idx = ARRAY_SIZE(vf->vf_queues);
-
 	for (i = 0; i < table_size; i++) {
-		u16 index = vf->vf_queues[0].fw_rx_qid;
+		q_idx = p_rss_tlv->rss_ind_table[i];
+		if (!qed_iov_validate_rxq(p_hwfn, vf, q_idx)) {
+			DP_VERBOSE(p_hwfn,
+				   QED_MSG_IOV,
+				   "VF[%d]: Omitting RSS due to wrong queue %04x\n",
+				   vf->relative_vf_id, q_idx);
+			b_reject = true;
+			goto out;
+		}
 
-		q_idx = p_rss->rss_ind_table[i];
-		if (q_idx >= max_q_idx)
-			DP_NOTICE(p_hwfn,
-				  "rss_ind_table[%d] = %d, rxq is out of range\n",
-				  i, q_idx);
-		else if (!vf->vf_queues[q_idx].p_rx_cid)
-			DP_NOTICE(p_hwfn,
-				  "rss_ind_table[%d] = %d, rxq is not active\n",
-				  i, q_idx);
-		else
-			index = vf->vf_queues[q_idx].fw_rx_qid;
-		p_rss->rss_ind_table[i] = index;
+		if (!vf->vf_queues[q_idx].p_rx_cid) {
+			DP_VERBOSE(p_hwfn,
+				   QED_MSG_IOV,
+				   "VF[%d]: Omitting RSS due to inactive queue %08x\n",
+				   vf->relative_vf_id, q_idx);
+			b_reject = true;
+			goto out;
+		}
+
+		p_rss->rss_ind_table[i] = vf->vf_queues[q_idx].p_rx_cid;
 	}
 
 	p_data->rss_params = p_rss;
+out:
 	*tlvs_mask |= 1 << QED_IOV_VP_UPDATE_RSS;
+	if (!b_reject)
+		*tlvs_accepted |= 1 << QED_IOV_VP_UPDATE_RSS;
 }
 
 static void
@@ -2429,12 +2437,12 @@ static void qed_iov_vf_mbx_vport_update(struct qed_hwfn *p_hwfn,
 					struct qed_ptt *p_ptt,
 					struct qed_vf_info *vf)
 {
+	struct qed_rss_params *p_rss_params = NULL;
 	struct qed_sp_vport_update_params params;
 	struct qed_iov_vf_mbx *mbx = &vf->vf_mbx;
 	struct qed_sge_tpa_params sge_tpa_params;
-	struct qed_rss_params rss_params;
+	u16 tlvs_mask = 0, tlvs_accepted = 0;
 	u8 status = PFVF_STATUS_SUCCESS;
-	u16 tlvs_mask = 0;
 	u16 length;
 	int rc;
 
@@ -2447,6 +2455,11 @@ static void qed_iov_vf_mbx_vport_update(struct qed_hwfn *p_hwfn,
 		status = PFVF_STATUS_FAILURE;
 		goto out;
 	}
+	p_rss_params = vzalloc(sizeof(*p_rss_params));
+	if (p_rss_params == NULL) {
+		status = PFVF_STATUS_FAILURE;
+		goto out;
+	}
 
 	memset(&params, 0, sizeof(params));
 	params.opaque_fid = vf->opaque_fid;
@@ -2461,20 +2474,26 @@ static void qed_iov_vf_mbx_vport_update(struct qed_hwfn *p_hwfn,
 	qed_iov_vp_update_tx_switch(p_hwfn, &params, mbx, &tlvs_mask);
 	qed_iov_vp_update_mcast_bin_param(p_hwfn, &params, mbx, &tlvs_mask);
 	qed_iov_vp_update_accept_flag(p_hwfn, &params, mbx, &tlvs_mask);
-	qed_iov_vp_update_rss_param(p_hwfn, vf, &params, &rss_params,
-				    mbx, &tlvs_mask);
 	qed_iov_vp_update_accept_any_vlan(p_hwfn, &params, mbx, &tlvs_mask);
 	qed_iov_vp_update_sge_tpa_param(p_hwfn, vf, &params,
 					&sge_tpa_params, mbx, &tlvs_mask);
 
-	/* Just log a message if there is no single extended tlv in buffer.
-	 * When all features of vport update ramrod would be requested by VF
-	 * as extended TLVs in buffer then an error can be returned in response
-	 * if there is no extended TLV present in buffer.
+	tlvs_accepted = tlvs_mask;
+
+	/* Some of the extended TLVs need to be validated first; In that case,
+	 * they can update the mask without updating the accepted [so that
+	 * PF could communicate to VF it has rejected request].
 	 */
-	if (!tlvs_mask) {
-		DP_NOTICE(p_hwfn,
-			  "No feature tlvs found for vport update\n");
+	qed_iov_vp_update_rss_param(p_hwfn, vf, &params, p_rss_params,
+				    mbx, &tlvs_mask, &tlvs_accepted);
+
+	if (!tlvs_accepted) {
+		if (tlvs_mask)
+			DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+				   "Upper-layer prevents VF vport configuration\n");
+		else
+			DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+				   "No feature tlvs found for vport update\n");
 		status = PFVF_STATUS_NOT_SUPPORTED;
 		goto out;
 	}
@@ -2485,8 +2504,9 @@ static void qed_iov_vf_mbx_vport_update(struct qed_hwfn *p_hwfn,
 		status = PFVF_STATUS_FAILURE;
 
 out:
+	vfree(p_rss_params);
 	length = qed_iov_prep_vp_update_resp_tlvs(p_hwfn, vf, mbx, status,
-						  tlvs_mask, tlvs_mask);
+						  tlvs_mask, tlvs_accepted);
 	qed_iov_send_response(p_hwfn, p_ptt, vf, length, status);
 }
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c
index af0542c0351c..9667059b15bd 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c
@@ -838,6 +838,7 @@ int qed_vf_pf_vport_update(struct qed_hwfn *p_hwfn,
 	if (p_params->rss_params) {
 		struct qed_rss_params *rss_params = p_params->rss_params;
 		struct vfpf_vport_update_rss_tlv *p_rss_tlv;
+		int i, table_size;
 
 		size = sizeof(struct vfpf_vport_update_rss_tlv);
 		p_rss_tlv = qed_add_tlv(p_hwfn,
@@ -860,8 +861,15 @@ int qed_vf_pf_vport_update(struct qed_hwfn *p_hwfn,
 		p_rss_tlv->rss_enable = rss_params->rss_enable;
 		p_rss_tlv->rss_caps = rss_params->rss_caps;
 		p_rss_tlv->rss_table_size_log = rss_params->rss_table_size_log;
-		memcpy(p_rss_tlv->rss_ind_table, rss_params->rss_ind_table,
-		       sizeof(rss_params->rss_ind_table));
+
+		table_size = min_t(int, T_ETH_INDIRECTION_TABLE_SIZE,
+				   1 << p_rss_tlv->rss_table_size_log);
+		for (i = 0; i < table_size; i++) {
+			struct qed_queue_cid *p_queue;
+
+			p_queue = rss_params->rss_ind_table[i];
+			p_rss_tlv->rss_ind_table[i] = p_queue->rel.queue_id;
+		}
 		memcpy(p_rss_tlv->rss_key, rss_params->rss_key,
 		       sizeof(rss_params->rss_key));
 	}
diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index 1c5aac4b6139..f4e9423cd90a 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -164,6 +164,7 @@ struct qede_dev {
 	u16				num_queues;
 #define QEDE_QUEUE_CNT(edev)	((edev)->num_queues)
 #define QEDE_RSS_COUNT(edev)	((edev)->num_queues - (edev)->fp_num_tx)
+#define QEDE_RX_QUEUE_IDX(edev, i)	(i)
 #define QEDE_TSS_COUNT(edev)	((edev)->num_queues - (edev)->fp_num_rx)
 
 	struct qed_int_info		int_info;
@@ -194,7 +195,10 @@ struct qede_dev {
 #define QEDE_RSS_KEY_INITED	BIT(1)
 #define QEDE_RSS_CAPS_INITED	BIT(2)
 	u32 rss_params_inited; /* bit-field to track initialized rss params */
-	struct qed_update_vport_rss_params	rss_params;
+	u16 rss_ind_table[128];
+	u32 rss_key[10];
+	u8 rss_caps;
+
 	u16			q_num_rx_buffers; /* Must be a power of two */
 	u16			q_num_tx_buffers; /* Must be a power of two */
 
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index 8e971b289d4b..baf264225c12 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -37,6 +37,7 @@
 #include <linux/string.h>
 #include <linux/pci.h>
 #include <linux/capability.h>
+#include <linux/vmalloc.h>
 #include "qede.h"
 
 #define QEDE_RQSTAT_OFFSET(stat_name) \
@@ -931,8 +932,7 @@ static int qede_set_channels(struct net_device *dev,
 	/* Reset the indirection table if rx queue count is updated */
 	if ((edev->req_queues - edev->req_num_tx) != QEDE_RSS_COUNT(edev)) {
 		edev->rss_params_inited &= ~QEDE_RSS_INDIR_INITED;
-		memset(&edev->rss_params.rss_ind_table, 0,
-		       sizeof(edev->rss_params.rss_ind_table));
+		memset(edev->rss_ind_table, 0, sizeof(edev->rss_ind_table));
 	}
 
 	qede_reload(edev, NULL, false);
@@ -978,11 +978,11 @@ static int qede_get_rss_flags(struct qede_dev *edev, struct ethtool_rxnfc *info)
 		info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
 		break;
 	case UDP_V4_FLOW:
-		if (edev->rss_params.rss_caps & QED_RSS_IPV4_UDP)
+		if (edev->rss_caps & QED_RSS_IPV4_UDP)
 			info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
 		break;
 	case UDP_V6_FLOW:
-		if (edev->rss_params.rss_caps & QED_RSS_IPV6_UDP)
+		if (edev->rss_caps & QED_RSS_IPV6_UDP)
 			info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
 		break;
 	case IPV4_FLOW:
@@ -1015,8 +1015,9 @@ static int qede_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info,
 
 static int qede_set_rss_flags(struct qede_dev *edev, struct ethtool_rxnfc *info)
 {
-	struct qed_update_vport_params vport_update_params;
+	struct qed_update_vport_params *vport_update_params;
 	u8 set_caps = 0, clr_caps = 0;
+	int rc = 0;
 
 	DP_VERBOSE(edev, QED_MSG_DEBUG,
 		   "Set rss flags command parameters: flow type = %d, data = %llu\n",
@@ -1091,27 +1092,29 @@ static int qede_set_rss_flags(struct qede_dev *edev, struct ethtool_rxnfc *info)
 	}
 
 	/* No action is needed if there is no change in the rss capability */
-	if (edev->rss_params.rss_caps == ((edev->rss_params.rss_caps &
-					   ~clr_caps) | set_caps))
+	if (edev->rss_caps == ((edev->rss_caps & ~clr_caps) | set_caps))
 		return 0;
 
 	/* Update internal configuration */
-	edev->rss_params.rss_caps = (edev->rss_params.rss_caps & ~clr_caps) |
-				    set_caps;
+	edev->rss_caps = ((edev->rss_caps & ~clr_caps) | set_caps);
 	edev->rss_params_inited |= QEDE_RSS_CAPS_INITED;
 
 	/* Re-configure if possible */
-	if (netif_running(edev->ndev)) {
-		memset(&vport_update_params, 0, sizeof(vport_update_params));
-		vport_update_params.update_rss_flg = 1;
-		vport_update_params.vport_id = 0;
-		memcpy(&vport_update_params.rss_params, &edev->rss_params,
-		       sizeof(vport_update_params.rss_params));
-		return edev->ops->vport_update(edev->cdev,
-					       &vport_update_params);
+	__qede_lock(edev);
+	if (edev->state == QEDE_STATE_OPEN) {
+		vport_update_params = vzalloc(sizeof(*vport_update_params));
+		if (!vport_update_params) {
+			__qede_unlock(edev);
+			return -ENOMEM;
+		}
+		qede_fill_rss_params(edev, &vport_update_params->rss_params,
+				     &vport_update_params->update_rss_flg);
+		rc = edev->ops->vport_update(edev->cdev, vport_update_params);
+		vfree(vport_update_params);
 	}
+	__qede_unlock(edev);
 
-	return 0;
+	return rc;
 }
 
 static int qede_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info)
@@ -1136,7 +1139,7 @@ static u32 qede_get_rxfh_key_size(struct net_device *dev)
 {
 	struct qede_dev *edev = netdev_priv(dev);
 
-	return sizeof(edev->rss_params.rss_key);
+	return sizeof(edev->rss_key);
 }
 
 static int qede_get_rxfh(struct net_device *dev, u32 *indir, u8 *key, u8 *hfunc)
@@ -1151,11 +1154,10 @@ static int qede_get_rxfh(struct net_device *dev, u32 *indir, u8 *key, u8 *hfunc)
 		return 0;
 
 	for (i = 0; i < QED_RSS_IND_TABLE_SIZE; i++)
-		indir[i] = edev->rss_params.rss_ind_table[i];
+		indir[i] = edev->rss_ind_table[i];
 
 	if (key)
-		memcpy(key, edev->rss_params.rss_key,
-		       qede_get_rxfh_key_size(dev));
+		memcpy(key, edev->rss_key, qede_get_rxfh_key_size(dev));
 
 	return 0;
 }
@@ -1163,9 +1165,9 @@ static int qede_get_rxfh(struct net_device *dev, u32 *indir, u8 *key, u8 *hfunc)
 static int qede_set_rxfh(struct net_device *dev, const u32 *indir,
 			 const u8 *key, const u8 hfunc)
 {
-	struct qed_update_vport_params vport_update_params;
+	struct qed_update_vport_params *vport_update_params;
 	struct qede_dev *edev = netdev_priv(dev);
-	int i;
+	int i, rc = 0;
 
 	if (edev->dev_info.common.num_hwfns > 1) {
 		DP_INFO(edev,
@@ -1181,27 +1183,30 @@ static int qede_set_rxfh(struct net_device *dev, const u32 *indir,
 
 	if (indir) {
 		for (i = 0; i < QED_RSS_IND_TABLE_SIZE; i++)
-			edev->rss_params.rss_ind_table[i] = indir[i];
+			edev->rss_ind_table[i] = indir[i];
 		edev->rss_params_inited |= QEDE_RSS_INDIR_INITED;
 	}
 
 	if (key) {
-		memcpy(&edev->rss_params.rss_key, key,
-		       qede_get_rxfh_key_size(dev));
+		memcpy(&edev->rss_key, key, qede_get_rxfh_key_size(dev));
 		edev->rss_params_inited |= QEDE_RSS_KEY_INITED;
 	}
 
-	if (netif_running(edev->ndev)) {
-		memset(&vport_update_params, 0, sizeof(vport_update_params));
-		vport_update_params.update_rss_flg = 1;
-		vport_update_params.vport_id = 0;
-		memcpy(&vport_update_params.rss_params, &edev->rss_params,
-		       sizeof(vport_update_params.rss_params));
-		return edev->ops->vport_update(edev->cdev,
-					       &vport_update_params);
+	__qede_lock(edev);
+	if (edev->state == QEDE_STATE_OPEN) {
+		vport_update_params = vzalloc(sizeof(*vport_update_params));
+		if (!vport_update_params) {
+			__qede_unlock(edev);
+			return -ENOMEM;
+		}
+		qede_fill_rss_params(edev, &vport_update_params->rss_params,
+				     &vport_update_params->update_rss_flg);
+		rc = edev->ops->vport_update(edev->cdev, vport_update_params);
+		vfree(vport_update_params);
 	}
+	__qede_unlock(edev);
 
-	return 0;
+	return rc;
 }
 
 /* This function enables the interrupt generation and the NAPI on the device */
diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c
index 6161e093a127..03e2a81b30c6 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_filter.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c
@@ -33,6 +33,7 @@
 #include <linux/etherdevice.h>
 #include <net/udp_tunnel.h>
 #include <linux/bitops.h>
+#include <linux/vmalloc.h>
 
 #include <linux/qed/qed_if.h>
 #include "qede.h"
@@ -49,6 +50,60 @@ void qede_force_mac(void *dev, u8 *mac, bool forced)
 	ether_addr_copy(edev->primary_mac, mac);
 }
 
+void qede_fill_rss_params(struct qede_dev *edev,
+			  struct qed_update_vport_rss_params *rss, u8 *update)
+{
+	bool need_reset = false;
+	int i;
+
+	if (QEDE_RSS_COUNT(edev) <= 1) {
+		memset(rss, 0, sizeof(*rss));
+		*update = 0;
+		return;
+	}
+
+	/* Need to validate current RSS config uses valid entries */
+	for (i = 0; i < QED_RSS_IND_TABLE_SIZE; i++) {
+		if (edev->rss_ind_table[i] >= QEDE_RSS_COUNT(edev)) {
+			need_reset = true;
+			break;
+		}
+	}
+
+	if (!(edev->rss_params_inited & QEDE_RSS_INDIR_INITED) || need_reset) {
+		for (i = 0; i < QED_RSS_IND_TABLE_SIZE; i++) {
+			u16 indir_val, val;
+
+			val = QEDE_RSS_COUNT(edev);
+			indir_val = ethtool_rxfh_indir_default(i, val);
+			edev->rss_ind_table[i] = indir_val;
+		}
+		edev->rss_params_inited |= QEDE_RSS_INDIR_INITED;
+	}
+
+	/* Now that we have the queue-indirection, prepare the handles */
+	for (i = 0; i < QED_RSS_IND_TABLE_SIZE; i++) {
+		u16 idx = QEDE_RX_QUEUE_IDX(edev, edev->rss_ind_table[i]);
+
+		rss->rss_ind_table[i] = edev->fp_array[idx].rxq->handle;
+	}
+
+	if (!(edev->rss_params_inited & QEDE_RSS_KEY_INITED)) {
+		netdev_rss_key_fill(edev->rss_key, sizeof(edev->rss_key));
+		edev->rss_params_inited |= QEDE_RSS_KEY_INITED;
+	}
+	memcpy(rss->rss_key, edev->rss_key, sizeof(rss->rss_key));
+
+	if (!(edev->rss_params_inited & QEDE_RSS_CAPS_INITED)) {
+		edev->rss_caps = QED_RSS_IPV4 | QED_RSS_IPV6 |
+		    QED_RSS_IPV4_TCP | QED_RSS_IPV6_TCP;
+		edev->rss_params_inited |= QEDE_RSS_CAPS_INITED;
+	}
+	rss->rss_caps = edev->rss_caps;
+
+	*update = 1;
+}
+
 static int qede_set_ucast_rx_mac(struct qede_dev *edev,
 				 enum qed_filter_xcast_params_type opcode,
 				 unsigned char mac[ETH_ALEN])
@@ -79,22 +134,24 @@ static int qede_set_ucast_rx_vlan(struct qede_dev *edev,
 	return edev->ops->filter_config(edev->cdev, &filter_cmd);
 }
 
-static void qede_config_accept_any_vlan(struct qede_dev *edev, bool action)
+static int qede_config_accept_any_vlan(struct qede_dev *edev, bool action)
 {
-	struct qed_update_vport_params params;
+	struct qed_update_vport_params *params;
 	int rc;
 
 	/* Proceed only if action actually needs to be performed */
 	if (edev->accept_any_vlan == action)
-		return;
+		return 0;
 
-	memset(&params, 0, sizeof(params));
+	params = vzalloc(sizeof(*params));
+	if (!params)
+		return -ENOMEM;
 
-	params.vport_id = 0;
-	params.accept_any_vlan = action;
-	params.update_accept_any_vlan_flg = 1;
+	params->vport_id = 0;
+	params->accept_any_vlan = action;
+	params->update_accept_any_vlan_flg = 1;
 
-	rc = edev->ops->vport_update(edev->cdev, &params);
+	rc = edev->ops->vport_update(edev->cdev, params);
 	if (rc) {
 		DP_ERR(edev, "Failed to %s accept-any-vlan\n",
 		       action ? "enable" : "disable");
@@ -103,6 +160,9 @@ static void qede_config_accept_any_vlan(struct qede_dev *edev, bool action)
 			action ? "enabled" : "disabled");
 		edev->accept_any_vlan = action;
 	}
+
+	vfree(params);
+	return 0;
 }
 
 int qede_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
@@ -166,8 +226,13 @@ int qede_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
 			edev->configured_vlans++;
 	} else {
 		/* Out of quota; Activate accept-any-VLAN mode */
-		if (!edev->non_configured_vlans)
-			qede_config_accept_any_vlan(edev, true);
+		if (!edev->non_configured_vlans) {
+			rc = qede_config_accept_any_vlan(edev, true);
+			if (rc) {
+				kfree(vlan);
+				goto out;
+			}
+		}
 
 		edev->non_configured_vlans++;
 	}
@@ -242,9 +307,12 @@ int qede_configure_vlan_filters(struct qede_dev *edev)
 	 */
 
 	if (accept_any_vlan)
-		qede_config_accept_any_vlan(edev, true);
+		rc = qede_config_accept_any_vlan(edev, true);
 	else if (!edev->non_configured_vlans)
-		qede_config_accept_any_vlan(edev, false);
+		rc = qede_config_accept_any_vlan(edev, false);
+
+	if (rc && !real_rc)
+		real_rc = rc;
 
 	return real_rc;
 }
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index be4121c867c3..88d47d6f35ac 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -59,6 +59,7 @@
 #include <linux/random.h>
 #include <net/ip6_checksum.h>
 #include <linux/bitops.h>
+#include <linux/vmalloc.h>
 #include <linux/qed/qede_roce.h>
 #include "qede.h"
 
@@ -177,8 +178,12 @@ static int qede_sriov_configure(struct pci_dev *pdev, int num_vfs_param)
 {
 	struct qede_dev *edev = netdev_priv(pci_get_drvdata(pdev));
 	struct qed_dev_info *qed_info = &edev->dev_info.common;
+	struct qed_update_vport_params *vport_params;
 	int rc;
 
+	vport_params = vzalloc(sizeof(*vport_params));
+	if (!vport_params)
+		return -ENOMEM;
 	DP_VERBOSE(edev, QED_MSG_IOV, "Requested %d VFs\n", num_vfs_param);
 
 	rc = edev->ops->iov->configure(edev->cdev, num_vfs_param);
@@ -186,15 +191,13 @@ static int qede_sriov_configure(struct pci_dev *pdev, int num_vfs_param)
 	/* Enable/Disable Tx switching for PF */
 	if ((rc == num_vfs_param) && netif_running(edev->ndev) &&
 	    qed_info->mf_mode != QED_MF_NPAR && qed_info->tx_switching) {
-		struct qed_update_vport_params params;
-
-		memset(&params, 0, sizeof(params));
-		params.vport_id = 0;
-		params.update_tx_switching_flg = 1;
-		params.tx_switching_flg = num_vfs_param ? 1 : 0;
-		edev->ops->vport_update(edev->cdev, &params);
+		vport_params->vport_id = 0;
+		vport_params->update_tx_switching_flg = 1;
+		vport_params->tx_switching_flg = num_vfs_param ? 1 : 0;
+		edev->ops->vport_update(edev->cdev, vport_params);
 	}
 
+	vfree(vport_params);
 	return rc;
 }
 #endif
@@ -1504,19 +1507,24 @@ static int qede_stop_txq(struct qede_dev *edev,
 
 static int qede_stop_queues(struct qede_dev *edev)
 {
-	struct qed_update_vport_params vport_update_params;
+	struct qed_update_vport_params *vport_update_params;
 	struct qed_dev *cdev = edev->cdev;
 	struct qede_fastpath *fp;
 	int rc, i;
 
 	/* Disable the vport */
-	memset(&vport_update_params, 0, sizeof(vport_update_params));
-	vport_update_params.vport_id = 0;
-	vport_update_params.update_vport_active_flg = 1;
-	vport_update_params.vport_active_flg = 0;
-	vport_update_params.update_rss_flg = 0;
+	vport_update_params = vzalloc(sizeof(*vport_update_params));
+	if (!vport_update_params)
+		return -ENOMEM;
+
+	vport_update_params->vport_id = 0;
+	vport_update_params->update_vport_active_flg = 1;
+	vport_update_params->vport_active_flg = 0;
+	vport_update_params->update_rss_flg = 0;
+
+	rc = edev->ops->vport_update(cdev, vport_update_params);
+	vfree(vport_update_params);
 
-	rc = edev->ops->vport_update(cdev, &vport_update_params);
 	if (rc) {
 		DP_ERR(edev, "Failed to update vport\n");
 		return rc;
@@ -1628,11 +1636,10 @@ static int qede_start_queues(struct qede_dev *edev, bool clear_stats)
 {
 	int vlan_removal_en = 1;
 	struct qed_dev *cdev = edev->cdev;
-	struct qed_update_vport_params vport_update_params;
-	struct qed_queue_start_common_params q_params;
 	struct qed_dev_info *qed_info = &edev->dev_info.common;
+	struct qed_update_vport_params *vport_update_params;
+	struct qed_queue_start_common_params q_params;
 	struct qed_start_vport_params start = {0};
-	bool reset_rss_indir = false;
 	int rc, i;
 
 	if (!edev->num_queues) {
@@ -1641,6 +1648,10 @@ static int qede_start_queues(struct qede_dev *edev, bool clear_stats)
 		return -EINVAL;
 	}
 
+	vport_update_params = vzalloc(sizeof(*vport_update_params));
+	if (!vport_update_params)
+		return -ENOMEM;
+
 	start.gro_enable = !edev->gro_disable;
 	start.mtu = edev->ndev->mtu;
 	start.vport_id = 0;
@@ -1652,7 +1663,7 @@ static int qede_start_queues(struct qede_dev *edev, bool clear_stats)
 
 	if (rc) {
 		DP_ERR(edev, "Start V-PORT failed %d\n", rc);
-		return rc;
+		goto out;
 	}
 
 	DP_VERBOSE(edev, NETIF_MSG_IFUP,
@@ -1688,7 +1699,7 @@ static int qede_start_queues(struct qede_dev *edev, bool clear_stats)
 			if (rc) {
 				DP_ERR(edev, "Start RXQ #%d failed %d\n", i,
 				       rc);
-				return rc;
+				goto out;
 			}
 
 			/* Use the return parameters */
@@ -1704,93 +1715,46 @@ static int qede_start_queues(struct qede_dev *edev, bool clear_stats)
 		if (fp->type & QEDE_FASTPATH_XDP) {
 			rc = qede_start_txq(edev, fp, fp->xdp_tx, i, XDP_PI);
 			if (rc)
-				return rc;
+				goto out;
 
 			fp->rxq->xdp_prog = bpf_prog_add(edev->xdp_prog, 1);
 			if (IS_ERR(fp->rxq->xdp_prog)) {
 				rc = PTR_ERR(fp->rxq->xdp_prog);
 				fp->rxq->xdp_prog = NULL;
-				return rc;
+				goto out;
 			}
 		}
 
 		if (fp->type & QEDE_FASTPATH_TX) {
 			rc = qede_start_txq(edev, fp, fp->txq, i, TX_PI(0));
 			if (rc)
-				return rc;
+				goto out;
 		}
 	}
 
 	/* Prepare and send the vport enable */
-	memset(&vport_update_params, 0, sizeof(vport_update_params));
-	vport_update_params.vport_id = start.vport_id;
-	vport_update_params.update_vport_active_flg = 1;
-	vport_update_params.vport_active_flg = 1;
+	vport_update_params->vport_id = start.vport_id;
+	vport_update_params->update_vport_active_flg = 1;
+	vport_update_params->vport_active_flg = 1;
 
 	if ((qed_info->mf_mode == QED_MF_NPAR || pci_num_vf(edev->pdev)) &&
 	    qed_info->tx_switching) {
-		vport_update_params.update_tx_switching_flg = 1;
-		vport_update_params.tx_switching_flg = 1;
+		vport_update_params->update_tx_switching_flg = 1;
+		vport_update_params->tx_switching_flg = 1;
 	}
 
-	/* Fill struct with RSS params */
-	if (QEDE_RSS_COUNT(edev) > 1) {
-		vport_update_params.update_rss_flg = 1;
-
-		/* Need to validate current RSS config uses valid entries */
-		for (i = 0; i < QED_RSS_IND_TABLE_SIZE; i++) {
-			if (edev->rss_params.rss_ind_table[i] >=
-			    QEDE_RSS_COUNT(edev)) {
-				reset_rss_indir = true;
-				break;
-			}
-		}
-
-		if (!(edev->rss_params_inited & QEDE_RSS_INDIR_INITED) ||
-		    reset_rss_indir) {
-			u16 val;
+	qede_fill_rss_params(edev, &vport_update_params->rss_params,
+			     &vport_update_params->update_rss_flg);
 
-			for (i = 0; i < QED_RSS_IND_TABLE_SIZE; i++) {
-				u16 indir_val;
-
-				val = QEDE_RSS_COUNT(edev);
-				indir_val = ethtool_rxfh_indir_default(i, val);
-				edev->rss_params.rss_ind_table[i] = indir_val;
-			}
-			edev->rss_params_inited |= QEDE_RSS_INDIR_INITED;
-		}
-
-		if (!(edev->rss_params_inited & QEDE_RSS_KEY_INITED)) {
-			netdev_rss_key_fill(edev->rss_params.rss_key,
-					    sizeof(edev->rss_params.rss_key));
-			edev->rss_params_inited |= QEDE_RSS_KEY_INITED;
-		}
-
-		if (!(edev->rss_params_inited & QEDE_RSS_CAPS_INITED)) {
-			edev->rss_params.rss_caps = QED_RSS_IPV4 |
-						    QED_RSS_IPV6 |
-						    QED_RSS_IPV4_TCP |
-						    QED_RSS_IPV6_TCP;
-			edev->rss_params_inited |= QEDE_RSS_CAPS_INITED;
-		}
-
-		memcpy(&vport_update_params.rss_params, &edev->rss_params,
-		       sizeof(vport_update_params.rss_params));
-	} else {
-		memset(&vport_update_params.rss_params, 0,
-		       sizeof(vport_update_params.rss_params));
-	}
-
-	rc = edev->ops->vport_update(cdev, &vport_update_params);
-	if (rc) {
+	rc = edev->ops->vport_update(cdev, vport_update_params);
+	if (rc)
 		DP_ERR(edev, "Update V-PORT failed %d\n", rc);
-		return rc;
-	}
 
-	return 0;
+out:
+	vfree(vport_update_params);
+	return rc;
 }
 
-
 enum qede_unload_mode {
 	QEDE_UNLOAD_NORMAL,
 };
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index d91651ecfd26..3613d63cd5d0 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -77,7 +77,7 @@ struct qed_dev_eth_info {
 };
 
 struct qed_update_vport_rss_params {
-	u16	rss_ind_table[128];
+	void	*rss_ind_table[128];
 	u32	rss_key[10];
 	u8	rss_caps;
 };
-- 
cgit v1.2.3


From f990c82c385b1d9ce6acadb668df313c693cf48f Mon Sep 17 00:00:00 2001
From: "Mintz, Yuval" <Yuval.Mintz@cavium.com>
Date: Sun, 1 Jan 2017 13:57:08 +0200
Subject: qed*: Add support for ndo_set_vf_trust

Trusted VFs would be allowed to receive promiscuous and
multicast promiscuous data.

Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_sriov.c    | 128 +++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_sriov.h    |   9 ++
 drivers/net/ethernet/qlogic/qede/qede_filter.c |  20 ++--
 drivers/net/ethernet/qlogic/qede/qede_main.c   |  11 +++
 include/linux/qed/qed_iov_if.h                 |   2 +
 5 files changed, 162 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index b22baf5e5daf..b1213643bbfd 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -1225,6 +1225,9 @@ static void qed_iov_clean_vf(struct qed_hwfn *p_hwfn, u8 vfid)
 
 	/* Clear the VF mac */
 	memset(vf_info->mac, 0, ETH_ALEN);
+
+	vf_info->rx_accept_mode = 0;
+	vf_info->tx_accept_mode = 0;
 }
 
 static void qed_iov_vf_cleanup(struct qed_hwfn *p_hwfn,
@@ -2433,6 +2436,39 @@ qed_iov_vp_update_sge_tpa_param(struct qed_hwfn *p_hwfn,
 	*tlvs_mask |= 1 << QED_IOV_VP_UPDATE_SGE_TPA;
 }
 
+static int qed_iov_pre_update_vport(struct qed_hwfn *hwfn,
+				    u8 vfid,
+				    struct qed_sp_vport_update_params *params,
+				    u16 *tlvs)
+{
+	u8 mask = QED_ACCEPT_UCAST_UNMATCHED | QED_ACCEPT_MCAST_UNMATCHED;
+	struct qed_filter_accept_flags *flags = &params->accept_flags;
+	struct qed_public_vf_info *vf_info;
+
+	/* Untrusted VFs can't even be trusted to know that fact.
+	 * Simply indicate everything is configured fine, and trace
+	 * configuration 'behind their back'.
+	 */
+	if (!(*tlvs & BIT(QED_IOV_VP_UPDATE_ACCEPT_PARAM)))
+		return 0;
+
+	vf_info = qed_iov_get_public_vf_info(hwfn, vfid, true);
+
+	if (flags->update_rx_mode_config) {
+		vf_info->rx_accept_mode = flags->rx_accept_filter;
+		if (!vf_info->is_trusted_configured)
+			flags->rx_accept_filter &= ~mask;
+	}
+
+	if (flags->update_tx_mode_config) {
+		vf_info->tx_accept_mode = flags->tx_accept_filter;
+		if (!vf_info->is_trusted_configured)
+			flags->tx_accept_filter &= ~mask;
+	}
+
+	return 0;
+}
+
 static void qed_iov_vf_mbx_vport_update(struct qed_hwfn *p_hwfn,
 					struct qed_ptt *p_ptt,
 					struct qed_vf_info *vf)
@@ -2487,6 +2523,13 @@ static void qed_iov_vf_mbx_vport_update(struct qed_hwfn *p_hwfn,
 	qed_iov_vp_update_rss_param(p_hwfn, vf, &params, p_rss_params,
 				    mbx, &tlvs_mask, &tlvs_accepted);
 
+	if (qed_iov_pre_update_vport(p_hwfn, vf->relative_vf_id,
+				     &params, &tlvs_accepted)) {
+		tlvs_accepted = 0;
+		status = PFVF_STATUS_NOT_SUPPORTED;
+		goto out;
+	}
+
 	if (!tlvs_accepted) {
 		if (tlvs_mask)
 			DP_VERBOSE(p_hwfn, QED_MSG_IOV,
@@ -3936,6 +3979,32 @@ static int qed_set_vf_rate(struct qed_dev *cdev,
 	return 0;
 }
 
+static int qed_set_vf_trust(struct qed_dev *cdev, int vfid, bool trust)
+{
+	int i;
+
+	for_each_hwfn(cdev, i) {
+		struct qed_hwfn *hwfn = &cdev->hwfns[i];
+		struct qed_public_vf_info *vf;
+
+		if (!qed_iov_pf_sanity_check(hwfn, vfid)) {
+			DP_NOTICE(hwfn,
+				  "SR-IOV sanity check failed, can't set trust\n");
+			return -EINVAL;
+		}
+
+		vf = qed_iov_get_public_vf_info(hwfn, vfid, true);
+
+		if (vf->is_trusted_request == trust)
+			return 0;
+		vf->is_trusted_request = trust;
+
+		qed_schedule_iov(hwfn, QED_IOV_WQ_TRUST_FLAG);
+	}
+
+	return 0;
+}
+
 static void qed_handle_vf_msg(struct qed_hwfn *hwfn)
 {
 	u64 events[QED_VF_ARRAY_LENGTH];
@@ -4040,6 +4109,61 @@ static void qed_handle_bulletin_post(struct qed_hwfn *hwfn)
 	qed_ptt_release(hwfn, ptt);
 }
 
+static void qed_iov_handle_trust_change(struct qed_hwfn *hwfn)
+{
+	struct qed_sp_vport_update_params params;
+	struct qed_filter_accept_flags *flags;
+	struct qed_public_vf_info *vf_info;
+	struct qed_vf_info *vf;
+	u8 mask;
+	int i;
+
+	mask = QED_ACCEPT_UCAST_UNMATCHED | QED_ACCEPT_MCAST_UNMATCHED;
+	flags = &params.accept_flags;
+
+	qed_for_each_vf(hwfn, i) {
+		/* Need to make sure current requested configuration didn't
+		 * flip so that we'll end up configuring something that's not
+		 * needed.
+		 */
+		vf_info = qed_iov_get_public_vf_info(hwfn, i, true);
+		if (vf_info->is_trusted_configured ==
+		    vf_info->is_trusted_request)
+			continue;
+		vf_info->is_trusted_configured = vf_info->is_trusted_request;
+
+		/* Validate that the VF has a configured vport */
+		vf = qed_iov_get_vf_info(hwfn, i, true);
+		if (!vf->vport_instance)
+			continue;
+
+		memset(&params, 0, sizeof(params));
+		params.opaque_fid = vf->opaque_fid;
+		params.vport_id = vf->vport_id;
+
+		if (vf_info->rx_accept_mode & mask) {
+			flags->update_rx_mode_config = 1;
+			flags->rx_accept_filter = vf_info->rx_accept_mode;
+		}
+
+		if (vf_info->tx_accept_mode & mask) {
+			flags->update_tx_mode_config = 1;
+			flags->tx_accept_filter = vf_info->tx_accept_mode;
+		}
+
+		/* Remove if needed; Otherwise this would set the mask */
+		if (!vf_info->is_trusted_configured) {
+			flags->rx_accept_filter &= ~mask;
+			flags->tx_accept_filter &= ~mask;
+		}
+
+		if (flags->update_rx_mode_config ||
+		    flags->update_tx_mode_config)
+			qed_sp_vport_update(hwfn, &params,
+					    QED_SPQ_MODE_EBLOCK, NULL);
+	}
+}
+
 static void qed_iov_pf_task(struct work_struct *work)
 
 {
@@ -4075,6 +4199,9 @@ static void qed_iov_pf_task(struct work_struct *work)
 	if (test_and_clear_bit(QED_IOV_WQ_BULLETIN_UPDATE_FLAG,
 			       &hwfn->iov_task_flags))
 		qed_handle_bulletin_post(hwfn);
+
+	if (test_and_clear_bit(QED_IOV_WQ_TRUST_FLAG, &hwfn->iov_task_flags))
+		qed_iov_handle_trust_change(hwfn);
 }
 
 void qed_iov_wq_stop(struct qed_dev *cdev, bool schedule_first)
@@ -4137,4 +4264,5 @@ const struct qed_iov_hv_ops qed_iov_ops_pass = {
 	.set_link_state = &qed_set_vf_link_state,
 	.set_spoof = &qed_spoof_configure,
 	.set_rate = &qed_set_vf_rate,
+	.set_trust = &qed_set_vf_trust,
 };
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.h b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
index 1738d5bd2e8e..0a2e3a36d2cf 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
@@ -80,6 +80,14 @@ struct qed_public_vf_info {
 
 	/* Currently configured Tx rate in MB/sec. 0 if unconfigured */
 	int tx_rate;
+
+	/* Trusted VFs can configure promiscuous mode.
+	 * Also store shadow promisc configuration if needed.
+	 */
+	bool is_trusted_configured;
+	bool is_trusted_request;
+	u8 rx_accept_mode;
+	u8 tx_accept_mode;
 };
 
 struct qed_iov_vf_init_params {
@@ -245,6 +253,7 @@ enum qed_iov_wq_flag {
 	QED_IOV_WQ_BULLETIN_UPDATE_FLAG,
 	QED_IOV_WQ_STOP_WQ_FLAG,
 	QED_IOV_WQ_FLR_FLAG,
+	QED_IOV_WQ_TRUST_FLAG,
 };
 
 #ifdef CONFIG_QED_SRIOV
diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c
index 03e2a81b30c6..107c3fda4792 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_filter.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c
@@ -714,11 +714,13 @@ void qede_config_rx_mode(struct net_device *ndev)
 		goto out;
 
 	/* Check for promiscuous */
-	if ((ndev->flags & IFF_PROMISC) ||
-	    (uc_count > edev->dev_info.num_mac_filters - 1)) {
+	if (ndev->flags & IFF_PROMISC)
 		accept_flags = QED_FILTER_RX_MODE_TYPE_PROMISC;
-	} else {
-		/* Add MAC filters according to the unicast secondary macs */
+	else
+		accept_flags = QED_FILTER_RX_MODE_TYPE_REGULAR;
+
+	/* Configure all filters regardless, in case promisc is rejected */
+	if (uc_count < edev->dev_info.num_mac_filters) {
 		int i;
 
 		temp = uc_macs;
@@ -731,12 +733,14 @@ void qede_config_rx_mode(struct net_device *ndev)
 
 			temp += ETH_ALEN;
 		}
-
-		rc = qede_configure_mcast_filtering(ndev, &accept_flags);
-		if (rc)
-			goto out;
+	} else {
+		accept_flags = QED_FILTER_RX_MODE_TYPE_PROMISC;
 	}
 
+	rc = qede_configure_mcast_filtering(ndev, &accept_flags);
+	if (rc)
+		goto out;
+
 	/* take care of VLAN mode */
 	if (ndev->flags & IFF_PROMISC) {
 		qede_config_accept_any_vlan(edev, true);
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 88d47d6f35ac..b58509feecd5 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -475,6 +475,16 @@ static int qede_set_vf_link_state(struct net_device *dev, int vfidx,
 
 	return edev->ops->iov->set_link_state(edev->cdev, vfidx, link_state);
 }
+
+static int qede_set_vf_trust(struct net_device *dev, int vfidx, bool setting)
+{
+	struct qede_dev *edev = netdev_priv(dev);
+
+	if (!edev->ops)
+		return -EINVAL;
+
+	return edev->ops->iov->set_trust(edev->cdev, vfidx, setting);
+}
 #endif
 
 static const struct net_device_ops qede_netdev_ops = {
@@ -488,6 +498,7 @@ static const struct net_device_ops qede_netdev_ops = {
 #ifdef CONFIG_QED_SRIOV
 	.ndo_set_vf_mac = qede_set_vf_mac,
 	.ndo_set_vf_vlan = qede_set_vf_vlan,
+	.ndo_set_vf_trust = qede_set_vf_trust,
 #endif
 	.ndo_vlan_rx_add_vid = qede_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid = qede_vlan_rx_kill_vid,
diff --git a/include/linux/qed/qed_iov_if.h b/include/linux/qed/qed_iov_if.h
index b1f979135f97..ac2e6a3199a3 100644
--- a/include/linux/qed/qed_iov_if.h
+++ b/include/linux/qed/qed_iov_if.h
@@ -53,6 +53,8 @@ struct qed_iov_hv_ops {
 
 	int (*set_rate) (struct qed_dev *cdev, int vfid,
 			 u32 min_rate, u32 max_rate);
+
+	int (*set_trust) (struct qed_dev *cdev, int vfid, bool trust);
 };
 
 #endif
-- 
cgit v1.2.3


From 31b95c9bdc20663a20b3261303c2a5fc34aae133 Mon Sep 17 00:00:00 2001
From: Niklas Cassel <niklas.cassel@axis.com>
Date: Fri, 30 Dec 2016 13:56:46 +0100
Subject: net: stmmac: remove unused duplicate property snps,axi_all

For core revision 3.x Address-Aligned Beats is available in two registers.
The DT property snps,aal was created for AAL in the DMA bus register,
which is a read/write bit.
The DT property snps,axi_all was created for AXI_AAL in the AXI bus mode
register, which is a read only bit that reflects the value of AAL in the
DMA bus register.

Since the value of snps,axi_all is never used in the driver,
and since the property was created for a bit that is read only,
it should be safe to remove the property.

Acked-by: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Signed-off-by: Niklas Cassel <niklas.cassel@axis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/devicetree/bindings/net/stmmac.txt      | 1 -
 drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 1 -
 include/linux/stmmac.h                                | 1 -
 3 files changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/net/stmmac.txt b/Documentation/devicetree/bindings/net/stmmac.txt
index 128da752fec9..c3d2fd480a1b 100644
--- a/Documentation/devicetree/bindings/net/stmmac.txt
+++ b/Documentation/devicetree/bindings/net/stmmac.txt
@@ -65,7 +65,6 @@ Optional properties:
 	- snps,wr_osr_lmt: max write outstanding req. limit
 	- snps,rd_osr_lmt: max read outstanding req. limit
 	- snps,kbbe: do not cross 1KiB boundary.
-	- snps,axi_all: align address
 	- snps,blen: this is a vector of supported burst length.
 	- snps,fb: fixed-burst
 	- snps,mb: mixed-burst
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 082cd48db6a7..60ba8993c650 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -121,7 +121,6 @@ static struct stmmac_axi *stmmac_axi_setup(struct platform_device *pdev)
 	axi->axi_lpi_en = of_property_read_bool(np, "snps,lpi_en");
 	axi->axi_xit_frm = of_property_read_bool(np, "snps,xit_frm");
 	axi->axi_kbbe = of_property_read_bool(np, "snps,axi_kbbe");
-	axi->axi_axi_all = of_property_read_bool(np, "snps,axi_all");
 	axi->axi_fb = of_property_read_bool(np, "snps,axi_fb");
 	axi->axi_mb = of_property_read_bool(np, "snps,axi_mb");
 	axi->axi_rb =  of_property_read_bool(np, "snps,axi_rb");
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 266dab9ad782..889e0e9a3f1c 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -103,7 +103,6 @@ struct stmmac_axi {
 	u32 axi_wr_osr_lmt;
 	u32 axi_rd_osr_lmt;
 	bool axi_kbbe;
-	bool axi_axi_all;
 	u32 axi_blen[AXI_BLEN];
 	bool axi_fb;
 	bool axi_mb;
-- 
cgit v1.2.3


From 7b13558f242747db90e52fdc510441a2ed0bafba Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <maxg@mellanox.com>
Date: Mon, 2 Jan 2017 11:37:38 +0200
Subject: net/mlx5: Fix offset naming for reserved fields in hca_cap_bits

Fix offset for reserved fields.

Fixes: 7486216b3a0b ("{net,IB}/mlx5: mlx5_ifc updates")
Fixes: b4ff3a36d3e4 ("net/mlx5: Use offset based reserved field names in the IFC header file")
Fixes: 7d5e14237a55 ("net/mlx5: Update mlx5_ifc hardware features")
Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/mlx5_ifc.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 57bec544e20a..4792c856531e 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -826,9 +826,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         reserved_at_1a9[0x2];
 	u8         local_ca_ack_delay[0x5];
 	u8         port_module_event[0x1];
-	u8         reserved_at_1b0[0x1];
+	u8         reserved_at_1b1[0x1];
 	u8         ports_check[0x1];
-	u8         reserved_at_1b2[0x1];
+	u8         reserved_at_1b3[0x1];
 	u8         disable_link_up[0x1];
 	u8         beacon_led[0x1];
 	u8         port_type[0x2];
@@ -858,7 +858,7 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 	u8         compact_address_vector[0x1];
 	u8         striding_rq[0x1];
-	u8         reserved_at_201[0x2];
+	u8         reserved_at_202[0x2];
 	u8         ipoib_basic_offloads[0x1];
 	u8         reserved_at_205[0xa];
 	u8         drain_sigerr[0x1];
@@ -1009,10 +1009,10 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         rndv_offload_rc[0x1];
 	u8         rndv_offload_dc[0x1];
 	u8         log_tag_matching_list_sz[0x5];
-	u8         reserved_at_5e8[0x3];
+	u8         reserved_at_5f8[0x3];
 	u8         log_max_xrq[0x5];
 
-	u8         reserved_at_5f0[0x200];
+	u8         reserved_at_600[0x200];
 };
 
 enum mlx5_flow_destination_type {
-- 
cgit v1.2.3


From bcda1aca7712539439d53dd5351c6223e03bf6e1 Mon Sep 17 00:00:00 2001
From: Artemy Kovalyov <artemyko@mellanox.com>
Date: Mon, 2 Jan 2017 11:37:41 +0200
Subject: net/mlx5: Support new MR features

This patch adds the following items to IFC file.

1. MLX5_MKC_ACCESS_MODE_KSM enum value for creating KSM memory keys.
KSM access mode used when indirect MKey associated with fixed memory
size entries.

2. null_mkey field that is used to indicate non-present KLM/KSM
entries, where it causes the device to generate page fault event
when trying to access it.

3. struct mlx5_ifc_cmd_hca_cap_bits capability bits indicating
related value/field is supported:
* fixed_buffer_size - MLX5_MKC_ACCESS_MODE_KSM
* umr_extended_translation_offset - translation_offset_42_16
    in UMR ctrl segment
* null_mkey - null_mkey in QUERY_SPECIAL_CONTEXTS

Signed-off-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/mlx5_ifc.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 4792c856531e..7c760e580268 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -782,11 +782,12 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         log_max_eq[0x4];
 
 	u8         max_indirection[0x8];
-	u8         reserved_at_108[0x1];
+	u8         fixed_buffer_size[0x1];
 	u8         log_max_mrw_sz[0x7];
 	u8         reserved_at_110[0x2];
 	u8         log_max_bsf_list_size[0x6];
-	u8         reserved_at_118[0x2];
+	u8         umr_extended_translation_offset[0x1];
+	u8         null_mkey[0x1];
 	u8         log_max_klm_list_size[0x6];
 
 	u8         reserved_at_120[0xa];
@@ -2569,6 +2570,7 @@ enum {
 	MLX5_MKC_ACCESS_MODE_PA    = 0x0,
 	MLX5_MKC_ACCESS_MODE_MTT   = 0x1,
 	MLX5_MKC_ACCESS_MODE_KLMS  = 0x2,
+	MLX5_MKC_ACCESS_MODE_KSM   = 0x3,
 };
 
 struct mlx5_ifc_mkc_bits {
@@ -3677,6 +3679,10 @@ struct mlx5_ifc_query_special_contexts_out_bits {
 	u8         dump_fill_mkey[0x20];
 
 	u8         resd_lkey[0x20];
+
+	u8         null_mkey[0x20];
+
+	u8         reserved_at_a0[0x60];
 };
 
 struct mlx5_ifc_query_special_contexts_in_bits {
-- 
cgit v1.2.3


From 3161625589c1d7c54e949d462f4d0c327664881a Mon Sep 17 00:00:00 2001
From: Artemy Kovalyov <artemyko@mellanox.com>
Date: Mon, 2 Jan 2017 11:37:42 +0200
Subject: IB/mlx5: Refactor UMR post send format

* Update struct mlx5_wqe_umr_ctrl_seg.
* Currenlty UMR send_flags aim only certain use cases: enabled/disable
  cached MR, modifying XLT for ODP. By making flags independent make UMR
  more flexible allowing arbitrary manipulations.
* Since different UMR formats have different entry sizes UMR request
  should receive exact size of translation table update instead of
  number of entries. Rename field npages to xlt_size in struct mlx5_umr_wr
  and update relevant code accordingly.
* Add support of length64 bit.

Signed-off-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h              |  24 ++--
 drivers/infiniband/hw/mlx5/mr.c                   |  50 +++++----
 drivers/infiniband/hw/mlx5/odp.c                  |   3 +-
 drivers/infiniband/hw/mlx5/qp.c                   | 128 +++++++++-------------
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |   2 +-
 include/linux/mlx5/qp.h                           |  14 ++-
 6 files changed, 103 insertions(+), 118 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 6c6057eb60ea..d79580dcf20f 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -174,13 +174,12 @@ struct mlx5_ib_flow_db {
  * enum ib_send_flags and enum ib_qp_type for low-level driver
  */
 
-#define MLX5_IB_SEND_UMR_UNREG	IB_SEND_RESERVED_START
-#define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 1)
-#define MLX5_IB_SEND_UMR_UPDATE_MTT (IB_SEND_RESERVED_START << 2)
-
-#define MLX5_IB_SEND_UMR_UPDATE_TRANSLATION	(IB_SEND_RESERVED_START << 3)
-#define MLX5_IB_SEND_UMR_UPDATE_PD		(IB_SEND_RESERVED_START << 4)
-#define MLX5_IB_SEND_UMR_UPDATE_ACCESS		IB_SEND_RESERVED_END
+#define MLX5_IB_SEND_UMR_ENABLE_MR	       (IB_SEND_RESERVED_START << 0)
+#define MLX5_IB_SEND_UMR_DISABLE_MR	       (IB_SEND_RESERVED_START << 1)
+#define MLX5_IB_SEND_UMR_FAIL_IF_FREE	       (IB_SEND_RESERVED_START << 2)
+#define MLX5_IB_SEND_UMR_UPDATE_XLT	       (IB_SEND_RESERVED_START << 3)
+#define MLX5_IB_SEND_UMR_UPDATE_TRANSLATION    (IB_SEND_RESERVED_START << 4)
+#define MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS       IB_SEND_RESERVED_END
 
 #define MLX5_IB_QPT_REG_UMR	IB_QPT_RESERVED1
 /*
@@ -190,6 +189,9 @@ struct mlx5_ib_flow_db {
 #define MLX5_IB_QPT_HW_GSI	IB_QPT_RESERVED2
 #define MLX5_IB_WR_UMR		IB_WR_RESERVED1
 
+#define MLX5_IB_UMR_OCTOWORD	       16
+#define MLX5_IB_UMR_XLT_ALIGNMENT      64
+
 /* Private QP creation flags to be passed in ib_qp_init_attr.create_flags.
  *
  * These flags are intended for internal use by the mlx5_ib driver, and they
@@ -414,13 +416,11 @@ enum mlx5_ib_qp_flags {
 
 struct mlx5_umr_wr {
 	struct ib_send_wr		wr;
-	union {
-		u64			virt_addr;
-		u64			offset;
-	} target;
+	u64				virt_addr;
+	u64				offset;
 	struct ib_pd		       *pd;
 	unsigned int			page_shift;
-	unsigned int			npages;
+	unsigned int			xlt_size;
 	u64				length;
 	int				access_flags;
 	u32				mkey;
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 7ab9b67ce43b..be8d38d7ca12 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -774,7 +774,7 @@ static int dma_map_mr_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
 	 * To avoid copying garbage after the pas array, we allocate
 	 * a little more.
 	 */
-	*size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT);
+	*size = ALIGN(sizeof(struct mlx5_mtt) * npages, MLX5_UMR_MTT_ALIGNMENT);
 	*mr_pas = kmalloc(*size + MLX5_UMR_ALIGN - 1, GFP_KERNEL);
 	if (!(*mr_pas))
 		return -ENOMEM;
@@ -782,7 +782,7 @@ static int dma_map_mr_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
 	pas = PTR_ALIGN(*mr_pas, MLX5_UMR_ALIGN);
 	mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT);
 	/* Clear padding after the actual pages. */
-	memset(pas + npages, 0, *size - npages * sizeof(u64));
+	memset(pas + npages, 0, *size - npages * sizeof(struct mlx5_mtt));
 
 	*dma = dma_map_single(ddev, pas, *size, DMA_TO_DEVICE);
 	if (dma_mapping_error(ddev, *dma)) {
@@ -801,7 +801,8 @@ static void prep_umr_wqe_common(struct ib_pd *pd, struct ib_send_wr *wr,
 	struct mlx5_umr_wr *umrwr = umr_wr(wr);
 
 	sg->addr = dma;
-	sg->length = ALIGN(sizeof(u64) * n, 64);
+	sg->length = ALIGN(sizeof(struct mlx5_mtt) * n,
+			   MLX5_IB_UMR_XLT_ALIGNMENT);
 	sg->lkey = dev->umrc.pd->local_dma_lkey;
 
 	wr->next = NULL;
@@ -813,7 +814,7 @@ static void prep_umr_wqe_common(struct ib_pd *pd, struct ib_send_wr *wr,
 
 	wr->opcode = MLX5_IB_WR_UMR;
 
-	umrwr->npages = n;
+	umrwr->xlt_size = sg->length;
 	umrwr->page_shift = page_shift;
 	umrwr->mkey = key;
 }
@@ -827,9 +828,11 @@ static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr,
 
 	prep_umr_wqe_common(pd, wr, sg, dma, n, key, page_shift);
 
-	wr->send_flags = 0;
+	wr->send_flags = MLX5_IB_SEND_UMR_ENABLE_MR |
+			 MLX5_IB_SEND_UMR_UPDATE_TRANSLATION |
+			 MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
 
-	umrwr->target.virt_addr = virt_addr;
+	umrwr->virt_addr = virt_addr;
 	umrwr->length = len;
 	umrwr->access_flags = access_flags;
 	umrwr->pd = pd;
@@ -840,7 +843,8 @@ static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev,
 {
 	struct mlx5_umr_wr *umrwr = umr_wr(wr);
 
-	wr->send_flags = MLX5_IB_SEND_UMR_UNREG | MLX5_IB_SEND_UMR_FAIL_IF_FREE;
+	wr->send_flags = MLX5_IB_SEND_UMR_DISABLE_MR |
+			 MLX5_IB_SEND_UMR_FAIL_IF_FREE;
 	wr->opcode = MLX5_IB_WR_UMR;
 	umrwr->mkey = key;
 }
@@ -993,7 +997,8 @@ int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages,
 	struct mlx5_umr_wr wr;
 	struct ib_sge sg;
 	int err = 0;
-	const int page_index_alignment = MLX5_UMR_MTT_ALIGNMENT / sizeof(u64);
+	const int page_index_alignment = MLX5_UMR_MTT_ALIGNMENT /
+					 sizeof(struct mlx5_mtt);
 	const int page_index_mask = page_index_alignment - 1;
 	size_t pages_mapped = 0;
 	size_t pages_to_map = 0;
@@ -1012,7 +1017,7 @@ int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages,
 	if (start_page_index + pages_to_map > MLX5_MAX_UMR_PAGES)
 		return -EINVAL;
 
-	size = sizeof(u64) * pages_to_map;
+	size = sizeof(struct mlx5_mtt) * pages_to_map;
 	size = min_t(int, PAGE_SIZE, size);
 	/* We allocate with GFP_ATOMIC to avoid recursion into page-reclaim
 	 * code, when we are called from an invalidation. The pas buffer must
@@ -1026,7 +1031,7 @@ int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages,
 		mutex_lock(&mlx5_ib_update_mtt_emergency_buffer_mutex);
 		memset(pas, 0, size);
 	}
-	pages_iter = size / sizeof(u64);
+	pages_iter = size / sizeof(struct mlx5_mtt);
 	dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE);
 	if (dma_mapping_error(ddev, dma)) {
 		mlx5_ib_err(dev, "unable to map DMA during MTT update.\n");
@@ -1049,7 +1054,8 @@ int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages,
 					       MLX5_IB_MTT_PRESENT);
 			/* Clear padding after the pages brought from the
 			 * umem. */
-			memset(pas + npages, 0, size - npages * sizeof(u64));
+			memset(pas + npages, 0, size - npages *
+			       sizeof(struct mlx5_mtt));
 		}
 
 		dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE);
@@ -1057,19 +1063,19 @@ int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages,
 		memset(&wr, 0, sizeof(wr));
 
 		sg.addr = dma;
-		sg.length = ALIGN(npages * sizeof(u64),
+		sg.length = ALIGN(npages * sizeof(struct mlx5_mtt),
 				MLX5_UMR_MTT_ALIGNMENT);
 		sg.lkey = dev->umrc.pd->local_dma_lkey;
 
 		wr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE |
-				MLX5_IB_SEND_UMR_UPDATE_MTT;
+				   MLX5_IB_SEND_UMR_UPDATE_XLT;
 		wr.wr.sg_list = &sg;
 		wr.wr.num_sge = 1;
 		wr.wr.opcode = MLX5_IB_WR_UMR;
-		wr.npages = sg.length / sizeof(u64);
+		wr.xlt_size = sg.length;
 		wr.page_shift = PAGE_SHIFT;
 		wr.mkey = mr->mmkey.key;
-		wr.target.offset = start_page_index;
+		wr.offset = start_page_index * sizeof(struct mlx5_mtt);
 
 		err = mlx5_ib_post_send_wait(dev, &wr);
 	}
@@ -1272,7 +1278,7 @@ static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, u64 virt_addr,
 		if (err)
 			return err;
 
-		umrwr.target.virt_addr = virt_addr;
+		umrwr.virt_addr = virt_addr;
 		umrwr.length = length;
 		umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
 	}
@@ -1280,14 +1286,10 @@ static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, u64 virt_addr,
 	prep_umr_wqe_common(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key,
 			    page_shift);
 
-	if (flags & IB_MR_REREG_PD) {
+	if (flags & IB_MR_REREG_PD || flags & IB_MR_REREG_ACCESS) {
 		umrwr.pd = pd;
-		umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_PD;
-	}
-
-	if (flags & IB_MR_REREG_ACCESS) {
 		umrwr.access_flags = access_flags;
-		umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_ACCESS;
+		umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
 	}
 
 	/* post send request to UMR QP */
@@ -1552,11 +1554,11 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd,
 		mr->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
 		MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
 		err = mlx5_alloc_priv_descs(pd->device, mr,
-					    ndescs, sizeof(u64));
+					    ndescs, sizeof(struct mlx5_mtt));
 		if (err)
 			goto err_free_in;
 
-		mr->desc_size = sizeof(u64);
+		mr->desc_size = sizeof(struct mlx5_mtt);
 		mr->max_descs = ndescs;
 	} else if (mr_type == IB_MR_TYPE_SG_GAPS) {
 		mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS;
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index cacb631a7b0a..67651eca59c5 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -47,7 +47,8 @@ void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
 			      unsigned long end)
 {
 	struct mlx5_ib_mr *mr;
-	const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1;
+	const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
+				    sizeof(struct mlx5_mtt)) - 1;
 	u64 idx = 0, blk_start_idx = 0;
 	int in_block = 0;
 	u64 addr;
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index a1b3125f0a6e..ec2301ac0fde 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -3080,9 +3080,10 @@ static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ib_sge *sg)
 	dseg->addr       = cpu_to_be64(sg->addr);
 }
 
-static __be16 get_klm_octo(int npages)
+static u64 get_xlt_octo(u64 bytes)
 {
-	return cpu_to_be16(ALIGN(npages, 8) / 2);
+	return ALIGN(bytes, MLX5_IB_UMR_XLT_ALIGNMENT) /
+	       MLX5_IB_UMR_OCTOWORD;
 }
 
 static __be64 frwr_mkey_mask(void)
@@ -3127,18 +3128,14 @@ static __be64 sig_mkey_mask(void)
 }
 
 static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr,
-				struct mlx5_ib_mr *mr)
+			    struct mlx5_ib_mr *mr)
 {
-	int ndescs = mr->ndescs;
+	int size = mr->ndescs * mr->desc_size;
 
 	memset(umr, 0, sizeof(*umr));
 
-	if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
-		/* KLMs take twice the size of MTTs */
-		ndescs *= 2;
-
 	umr->flags = MLX5_UMR_CHECK_NOT_FREE;
-	umr->klm_octowords = get_klm_octo(ndescs);
+	umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size));
 	umr->mkey_mask = frwr_mkey_mask();
 }
 
@@ -3149,37 +3146,17 @@ static void set_linv_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr)
 	umr->flags = MLX5_UMR_INLINE;
 }
 
-static __be64 get_umr_reg_mr_mask(int atomic)
+static __be64 get_umr_enable_mr_mask(void)
 {
 	u64 result;
 
-	result = MLX5_MKEY_MASK_LEN		|
-		 MLX5_MKEY_MASK_PAGE_SIZE	|
-		 MLX5_MKEY_MASK_START_ADDR	|
-		 MLX5_MKEY_MASK_PD		|
-		 MLX5_MKEY_MASK_LR		|
-		 MLX5_MKEY_MASK_LW		|
-		 MLX5_MKEY_MASK_KEY		|
-		 MLX5_MKEY_MASK_RR		|
-		 MLX5_MKEY_MASK_RW		|
+	result = MLX5_MKEY_MASK_KEY |
 		 MLX5_MKEY_MASK_FREE;
 
-	if (atomic)
-		result |= MLX5_MKEY_MASK_A;
-
-	return cpu_to_be64(result);
-}
-
-static __be64 get_umr_unreg_mr_mask(void)
-{
-	u64 result;
-
-	result = MLX5_MKEY_MASK_FREE;
-
 	return cpu_to_be64(result);
 }
 
-static __be64 get_umr_update_mtt_mask(void)
+static __be64 get_umr_disable_mr_mask(void)
 {
 	u64 result;
 
@@ -3194,23 +3171,22 @@ static __be64 get_umr_update_translation_mask(void)
 
 	result = MLX5_MKEY_MASK_LEN |
 		 MLX5_MKEY_MASK_PAGE_SIZE |
-		 MLX5_MKEY_MASK_START_ADDR |
-		 MLX5_MKEY_MASK_KEY |
-		 MLX5_MKEY_MASK_FREE;
+		 MLX5_MKEY_MASK_START_ADDR;
 
 	return cpu_to_be64(result);
 }
 
-static __be64 get_umr_update_access_mask(void)
+static __be64 get_umr_update_access_mask(int atomic)
 {
 	u64 result;
 
-	result = MLX5_MKEY_MASK_LW |
+	result = MLX5_MKEY_MASK_LR |
+		 MLX5_MKEY_MASK_LW |
 		 MLX5_MKEY_MASK_RR |
-		 MLX5_MKEY_MASK_RW |
-		 MLX5_MKEY_MASK_A |
-		 MLX5_MKEY_MASK_KEY |
-		 MLX5_MKEY_MASK_FREE;
+		 MLX5_MKEY_MASK_RW;
+
+	if (atomic)
+		result |= MLX5_MKEY_MASK_A;
 
 	return cpu_to_be64(result);
 }
@@ -3219,9 +3195,7 @@ static __be64 get_umr_update_pd_mask(void)
 {
 	u64 result;
 
-	result = MLX5_MKEY_MASK_PD |
-		 MLX5_MKEY_MASK_KEY |
-		 MLX5_MKEY_MASK_FREE;
+	result = MLX5_MKEY_MASK_PD;
 
 	return cpu_to_be64(result);
 }
@@ -3238,24 +3212,24 @@ static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
 	else
 		umr->flags = MLX5_UMR_CHECK_NOT_FREE; /* fail if not free */
 
-	if (!(wr->send_flags & MLX5_IB_SEND_UMR_UNREG)) {
-		umr->klm_octowords = get_klm_octo(umrwr->npages);
-		if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT) {
-			umr->mkey_mask = get_umr_update_mtt_mask();
-			umr->bsf_octowords = get_klm_octo(umrwr->target.offset);
-			umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN;
-		}
-		if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION)
-			umr->mkey_mask |= get_umr_update_translation_mask();
-		if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_ACCESS)
-			umr->mkey_mask |= get_umr_update_access_mask();
-		if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_PD)
-			umr->mkey_mask |= get_umr_update_pd_mask();
-		if (!umr->mkey_mask)
-			umr->mkey_mask = get_umr_reg_mr_mask(atomic);
-	} else {
-		umr->mkey_mask = get_umr_unreg_mr_mask();
+	umr->xlt_octowords = cpu_to_be16(get_xlt_octo(umrwr->xlt_size));
+	if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_XLT) {
+		u64 offset = get_xlt_octo(umrwr->offset);
+
+		umr->xlt_offset = cpu_to_be16(offset & 0xffff);
+		umr->xlt_offset_47_16 = cpu_to_be32(offset >> 16);
+		umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN;
 	}
+	if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION)
+		umr->mkey_mask |= get_umr_update_translation_mask();
+	if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS) {
+		umr->mkey_mask |= get_umr_update_access_mask(atomic);
+		umr->mkey_mask |= get_umr_update_pd_mask();
+	}
+	if (wr->send_flags & MLX5_IB_SEND_UMR_ENABLE_MR)
+		umr->mkey_mask |= get_umr_enable_mr_mask();
+	if (wr->send_flags & MLX5_IB_SEND_UMR_DISABLE_MR)
+		umr->mkey_mask |= get_umr_disable_mr_mask();
 
 	if (!wr->num_sge)
 		umr->flags |= MLX5_UMR_INLINE;
@@ -3303,17 +3277,17 @@ static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *w
 	struct mlx5_umr_wr *umrwr = umr_wr(wr);
 
 	memset(seg, 0, sizeof(*seg));
-	if (wr->send_flags & MLX5_IB_SEND_UMR_UNREG) {
+	if (wr->send_flags & MLX5_IB_SEND_UMR_DISABLE_MR)
 		seg->status = MLX5_MKEY_STATUS_FREE;
-		return;
-	}
 
 	seg->flags = convert_access(umrwr->access_flags);
-	if (!(wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT)) {
-		if (umrwr->pd)
-			seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn);
-		seg->start_addr = cpu_to_be64(umrwr->target.virt_addr);
-	}
+	if (umrwr->pd)
+		seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn);
+	if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION &&
+	    !umrwr->length)
+		seg->flags_pd |= cpu_to_be32(MLX5_MKEY_LEN64);
+
+	seg->start_addr = cpu_to_be64(umrwr->virt_addr);
 	seg->len = cpu_to_be64(umrwr->length);
 	seg->log2_page_size = umrwr->page_shift;
 	seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 |
@@ -3611,7 +3585,7 @@ static int set_sig_data_segment(struct ib_sig_handover_wr *wr,
 }
 
 static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg,
-				 struct ib_sig_handover_wr *wr, u32 nelements,
+				 struct ib_sig_handover_wr *wr, u32 size,
 				 u32 length, u32 pdn)
 {
 	struct ib_mr *sig_mr = wr->sig_mr;
@@ -3626,17 +3600,17 @@ static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg,
 	seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 |
 				    MLX5_MKEY_BSF_EN | pdn);
 	seg->len = cpu_to_be64(length);
-	seg->xlt_oct_size = cpu_to_be32(be16_to_cpu(get_klm_octo(nelements)));
+	seg->xlt_oct_size = cpu_to_be32(get_xlt_octo(size));
 	seg->bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE);
 }
 
 static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
-				u32 nelements)
+				u32 size)
 {
 	memset(umr, 0, sizeof(*umr));
 
 	umr->flags = MLX5_FLAGS_INLINE | MLX5_FLAGS_CHECK_FREE;
-	umr->klm_octowords = get_klm_octo(nelements);
+	umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size));
 	umr->bsf_octowords = cpu_to_be16(MLX5_MKEY_BSF_OCTO_SIZE);
 	umr->mkey_mask = sig_mkey_mask();
 }
@@ -3648,7 +3622,7 @@ static int set_sig_umr_wr(struct ib_send_wr *send_wr, struct mlx5_ib_qp *qp,
 	struct ib_sig_handover_wr *wr = sig_handover_wr(send_wr);
 	struct mlx5_ib_mr *sig_mr = to_mmr(wr->sig_mr);
 	u32 pdn = get_pd(qp)->pdn;
-	u32 klm_oct_size;
+	u32 xlt_size;
 	int region_len, ret;
 
 	if (unlikely(wr->wr.num_sge != 1) ||
@@ -3670,15 +3644,15 @@ static int set_sig_umr_wr(struct ib_send_wr *send_wr, struct mlx5_ib_qp *qp,
 	 * then we use strided block format (3 octowords),
 	 * else we use single KLM (1 octoword)
 	 **/
-	klm_oct_size = wr->prot ? 3 : 1;
+	xlt_size = wr->prot ? 0x30 : sizeof(struct mlx5_klm);
 
-	set_sig_umr_segment(*seg, klm_oct_size);
+	set_sig_umr_segment(*seg, xlt_size);
 	*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
 	*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
 	if (unlikely((*seg == qp->sq.qend)))
 		*seg = mlx5_get_send_wqe(qp, 0);
 
-	set_sig_mkey_segment(*seg, wr, klm_oct_size, region_len, pdn);
+	set_sig_mkey_segment(*seg, wr, xlt_size, region_len, pdn);
 	*seg += sizeof(struct mlx5_mkey_seg);
 	*size += sizeof(struct mlx5_mkey_seg) / 16;
 	if (unlikely((*seg == qp->sq.qend)))
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index cbfa38fc72c0..5ff86f0ecb7b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -396,7 +396,7 @@ static inline void mlx5e_build_umr_wqe(struct mlx5e_rq *rq, struct mlx5e_sq *sq,
 	cseg->imm       = rq->mkey_be;
 
 	ucseg->flags = MLX5_UMR_TRANSLATION_OFFSET_EN;
-	ucseg->klm_octowords =
+	ucseg->xlt_octowords =
 		cpu_to_be16(MLX5_MTT_OCTW(MLX5_MPWRQ_PAGES_PER_WQE));
 	ucseg->bsf_octowords =
 		cpu_to_be16(MLX5_MTT_OCTW(umr_wqe_mtt_offset));
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index 0aacb2a7480d..693811e0cb24 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -292,10 +292,14 @@ struct mlx5_wqe_data_seg {
 struct mlx5_wqe_umr_ctrl_seg {
 	u8		flags;
 	u8		rsvd0[3];
-	__be16		klm_octowords;
-	__be16		bsf_octowords;
+	__be16		xlt_octowords;
+	union {
+		__be16	xlt_offset;
+		__be16	bsf_octowords;
+	};
 	__be64		mkey_mask;
-	u8		rsvd1[32];
+	__be32		xlt_offset_47_16;
+	u8		rsvd1[28];
 };
 
 struct mlx5_seg_set_psv {
@@ -389,6 +393,10 @@ struct mlx5_bsf {
 	struct mlx5_bsf_inl	m_inl;
 };
 
+struct mlx5_mtt {
+	__be64		ptag;
+};
+
 struct mlx5_klm {
 	__be32		bcount;
 	__be32		key;
-- 
cgit v1.2.3


From 7d0cc6edcc7011133c45f62a7796a98b8cb5da0f Mon Sep 17 00:00:00 2001
From: Artemy Kovalyov <artemyko@mellanox.com>
Date: Mon, 2 Jan 2017 11:37:44 +0200
Subject: IB/mlx5: Add MR cache for large UMR regions

In this change we turn mlx5_ib_update_mtt() into generic
mlx5_ib_update_xlt() to perfrom HCA translation table modifiactions
supporting both atomic and process contexts and not limited by number
of modified entries.
Using this function we increase preallocated MRs up to 16GB.

Signed-off-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx5/main.c              |  14 +-
 drivers/infiniband/hw/mlx5/mem.c               |  32 +-
 drivers/infiniband/hw/mlx5/mlx5_ib.h           |  15 +-
 drivers/infiniband/hw/mlx5/mr.c                | 386 ++++++++++---------------
 drivers/infiniband/hw/mlx5/odp.c               |  19 +-
 drivers/net/ethernet/mellanox/mlx5/core/main.c |  20 ++
 include/linux/mlx5/driver.h                    |   2 +-
 7 files changed, 240 insertions(+), 248 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 2ab4e3219c84..b87127206ef2 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1112,11 +1112,18 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 	context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range;
 #endif
 
+	context->upd_xlt_page = __get_free_page(GFP_KERNEL);
+	if (!context->upd_xlt_page) {
+		err = -ENOMEM;
+		goto out_uars;
+	}
+	mutex_init(&context->upd_xlt_page_mutex);
+
 	if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) {
 		err = mlx5_core_alloc_transport_domain(dev->mdev,
 						       &context->tdn);
 		if (err)
-			goto out_uars;
+			goto out_page;
 	}
 
 	INIT_LIST_HEAD(&context->vma_private_list);
@@ -1168,6 +1175,9 @@ out_td:
 	if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
 		mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn);
 
+out_page:
+	free_page(context->upd_xlt_page);
+
 out_uars:
 	for (i--; i >= 0; i--)
 		mlx5_cmd_free_uar(dev->mdev, uars[i].index);
@@ -1195,6 +1205,8 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
 	if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
 		mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn);
 
+	free_page(context->upd_xlt_page);
+
 	for (i = 0; i < uuari->num_uars; i++) {
 		if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index))
 			mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index);
diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
index 6851357c16f4..778d8a18925f 100644
--- a/drivers/infiniband/hw/mlx5/mem.c
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -159,7 +159,7 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
 	unsigned long umem_page_shift = ilog2(umem->page_size);
 	int shift = page_shift - umem_page_shift;
 	int mask = (1 << shift) - 1;
-	int i, k;
+	int i, k, idx;
 	u64 cur = 0;
 	u64 base;
 	int len;
@@ -185,18 +185,36 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
 	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
 		len = sg_dma_len(sg) >> umem_page_shift;
 		base = sg_dma_address(sg);
-		for (k = 0; k < len; k++) {
+
+		/* Skip elements below offset */
+		if (i + len < offset << shift) {
+			i += len;
+			continue;
+		}
+
+		/* Skip pages below offset */
+		if (i < offset << shift) {
+			k = (offset << shift) - i;
+			i = offset << shift;
+		} else {
+			k = 0;
+		}
+
+		for (; k < len; k++) {
 			if (!(i & mask)) {
 				cur = base + (k << umem_page_shift);
 				cur |= access_flags;
+				idx = (i >> shift) - offset;
 
-				pas[i >> shift] = cpu_to_be64(cur);
+				pas[idx] = cpu_to_be64(cur);
 				mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n",
-					    i >> shift, be64_to_cpu(pas[i >> shift]));
-			}  else
-				mlx5_ib_dbg(dev, "=====> 0x%llx\n",
-					    base + (k << umem_page_shift));
+					    i >> shift, be64_to_cpu(pas[idx]));
+			}
 			i++;
+
+			/* Stop after num_pages reached */
+			if (i >> shift >= offset + num_pages)
+				return;
 		}
 	}
 }
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 73bff77ab20c..02d925573945 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -125,6 +125,10 @@ struct mlx5_ib_ucontext {
 	/* Transport Domain number */
 	u32			tdn;
 	struct list_head	vma_private_list;
+
+	unsigned long		upd_xlt_page;
+	/* protect ODP/KSM */
+	struct mutex		upd_xlt_page_mutex;
 };
 
 static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
@@ -192,6 +196,13 @@ struct mlx5_ib_flow_db {
 #define MLX5_IB_UMR_OCTOWORD	       16
 #define MLX5_IB_UMR_XLT_ALIGNMENT      64
 
+#define MLX5_IB_UPD_XLT_ZAP	      BIT(0)
+#define MLX5_IB_UPD_XLT_ENABLE	      BIT(1)
+#define MLX5_IB_UPD_XLT_ATOMIC	      BIT(2)
+#define MLX5_IB_UPD_XLT_ADDR	      BIT(3)
+#define MLX5_IB_UPD_XLT_PD	      BIT(4)
+#define MLX5_IB_UPD_XLT_ACCESS	      BIT(5)
+
 /* Private QP creation flags to be passed in ib_qp_init_attr.create_flags.
  *
  * These flags are intended for internal use by the mlx5_ib driver, and they
@@ -788,8 +799,8 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
 			       struct ib_udata *udata);
 int mlx5_ib_dealloc_mw(struct ib_mw *mw);
-int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index,
-		       int npages, int zap);
+int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
+		       int page_shift, int flags);
 int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 			  u64 length, u64 virt_addr, int access_flags,
 			  struct ib_pd *pd, struct ib_udata *udata);
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 4d40fe0556bd..f4ecc10cbbba 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -46,14 +46,9 @@ enum {
 };
 
 #define MLX5_UMR_ALIGN 2048
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-static __be64 mlx5_ib_update_mtt_emergency_buffer[
-		MLX5_UMR_MTT_MIN_CHUNK_SIZE/sizeof(__be64)]
-	__aligned(MLX5_UMR_ALIGN);
-static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex);
-#endif
 
 static int clean_mr(struct mlx5_ib_mr *mr);
+static int use_umr(struct mlx5_ib_dev *dev, int order);
 
 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 {
@@ -629,7 +624,8 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 		ent->dev = dev;
 
 		if ((dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) &&
-		    (mlx5_core_is_pf(dev->mdev)))
+		    mlx5_core_is_pf(dev->mdev) &&
+		    use_umr(dev, ent->order))
 			limit = dev->mdev->profile->mr_cache[i].limit;
 		else
 			limit = 0;
@@ -757,98 +753,13 @@ static int get_octo_len(u64 addr, u64 len, int page_size)
 	return (npages + 1) / 2;
 }
 
-static int use_umr(int order)
+static int use_umr(struct mlx5_ib_dev *dev, int order)
 {
+	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
+		return order < MAX_MR_CACHE_ENTRIES + 2;
 	return order <= MLX5_MAX_UMR_SHIFT;
 }
 
-static int dma_map_mr_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
-			  int npages, int page_shift, int *size,
-			  __be64 **mr_pas, dma_addr_t *dma)
-{
-	__be64 *pas;
-	struct device *ddev = dev->ib_dev.dma_device;
-
-	/*
-	 * UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes.
-	 * To avoid copying garbage after the pas array, we allocate
-	 * a little more.
-	 */
-	*size = ALIGN(sizeof(struct mlx5_mtt) * npages, MLX5_UMR_MTT_ALIGNMENT);
-	*mr_pas = kmalloc(*size + MLX5_UMR_ALIGN - 1, GFP_KERNEL);
-	if (!(*mr_pas))
-		return -ENOMEM;
-
-	pas = PTR_ALIGN(*mr_pas, MLX5_UMR_ALIGN);
-	mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT);
-	/* Clear padding after the actual pages. */
-	memset(pas + npages, 0, *size - npages * sizeof(struct mlx5_mtt));
-
-	*dma = dma_map_single(ddev, pas, *size, DMA_TO_DEVICE);
-	if (dma_mapping_error(ddev, *dma)) {
-		kfree(*mr_pas);
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-static void prep_umr_wqe_common(struct ib_pd *pd, struct ib_send_wr *wr,
-				struct ib_sge *sg, u64 dma, int n, u32 key,
-				int page_shift)
-{
-	struct mlx5_ib_dev *dev = to_mdev(pd->device);
-	struct mlx5_umr_wr *umrwr = umr_wr(wr);
-
-	sg->addr = dma;
-	sg->length = ALIGN(sizeof(struct mlx5_mtt) * n,
-			   MLX5_IB_UMR_XLT_ALIGNMENT);
-	sg->lkey = dev->umrc.pd->local_dma_lkey;
-
-	wr->next = NULL;
-	wr->sg_list = sg;
-	if (n)
-		wr->num_sge = 1;
-	else
-		wr->num_sge = 0;
-
-	wr->opcode = MLX5_IB_WR_UMR;
-
-	umrwr->xlt_size = sg->length;
-	umrwr->page_shift = page_shift;
-	umrwr->mkey = key;
-}
-
-static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr,
-			     struct ib_sge *sg, u64 dma, int n, u32 key,
-			     int page_shift, u64 virt_addr, u64 len,
-			     int access_flags)
-{
-	struct mlx5_umr_wr *umrwr = umr_wr(wr);
-
-	prep_umr_wqe_common(pd, wr, sg, dma, n, key, page_shift);
-
-	wr->send_flags = MLX5_IB_SEND_UMR_ENABLE_MR |
-			 MLX5_IB_SEND_UMR_UPDATE_TRANSLATION |
-			 MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
-
-	umrwr->virt_addr = virt_addr;
-	umrwr->length = len;
-	umrwr->access_flags = access_flags;
-	umrwr->pd = pd;
-}
-
-static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev,
-			       struct ib_send_wr *wr, u32 key)
-{
-	struct mlx5_umr_wr *umrwr = umr_wr(wr);
-
-	wr->send_flags = MLX5_IB_SEND_UMR_DISABLE_MR |
-			 MLX5_IB_SEND_UMR_FAIL_IF_FREE;
-	wr->opcode = MLX5_IB_WR_UMR;
-	umrwr->mkey = key;
-}
-
 static int mr_umem_get(struct ib_pd *pd, u64 start, u64 length,
 		       int access_flags, struct ib_umem **umem,
 		       int *npages, int *page_shift, int *ncont,
@@ -927,13 +838,7 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
 				  int page_shift, int order, int access_flags)
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
-	struct device *ddev = dev->ib_dev.dma_device;
-	struct mlx5_umr_wr umrwr = {};
 	struct mlx5_ib_mr *mr;
-	struct ib_sge sg;
-	int size;
-	__be64 *mr_pas;
-	dma_addr_t dma;
 	int err = 0;
 	int i;
 
@@ -952,144 +857,174 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
 	if (!mr)
 		return ERR_PTR(-EAGAIN);
 
-	err = dma_map_mr_pas(dev, umem, npages, page_shift, &size, &mr_pas,
-			     &dma);
-	if (err)
-		goto free_mr;
-
-	prep_umr_reg_wqe(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key,
-			 page_shift, virt_addr, len, access_flags);
-
-	err = mlx5_ib_post_send_wait(dev, &umrwr);
-	if (err && err != -EFAULT)
-		goto unmap_dma;
-
+	mr->ibmr.pd = pd;
+	mr->umem = umem;
+	mr->access_flags = access_flags;
+	mr->desc_size = sizeof(struct mlx5_mtt);
 	mr->mmkey.iova = virt_addr;
 	mr->mmkey.size = len;
 	mr->mmkey.pd = to_mpd(pd)->pdn;
 
-	mr->live = 1;
-
-unmap_dma:
-	dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
+	err = mlx5_ib_update_xlt(mr, 0, npages, page_shift,
+				 MLX5_IB_UPD_XLT_ENABLE);
 
-	kfree(mr_pas);
-
-free_mr:
 	if (err) {
 		free_cached_mr(dev, mr);
 		return ERR_PTR(err);
 	}
 
+	mr->live = 1;
+
 	return mr;
 }
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages,
-		       int zap)
+static inline int populate_xlt(struct mlx5_ib_mr *mr, int idx, int npages,
+			       void *xlt, int page_shift, size_t size,
+			       int flags)
 {
 	struct mlx5_ib_dev *dev = mr->dev;
-	struct device *ddev = dev->ib_dev.dma_device;
 	struct ib_umem *umem = mr->umem;
+
+	npages = min_t(size_t, npages, ib_umem_num_pages(umem) - idx);
+
+	if (!(flags & MLX5_IB_UPD_XLT_ZAP)) {
+		__mlx5_ib_populate_pas(dev, umem, page_shift,
+				       idx, npages, xlt,
+				       MLX5_IB_MTT_PRESENT);
+		/* Clear padding after the pages
+		 * brought from the umem.
+		 */
+		memset(xlt + (npages * sizeof(struct mlx5_mtt)), 0,
+		       size - npages * sizeof(struct mlx5_mtt));
+	}
+
+	return npages;
+}
+
+#define MLX5_MAX_UMR_CHUNK ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - \
+			    MLX5_UMR_MTT_ALIGNMENT)
+#define MLX5_SPARE_UMR_CHUNK 0x10000
+
+int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
+		       int page_shift, int flags)
+{
+	struct mlx5_ib_dev *dev = mr->dev;
+	struct device *ddev = dev->ib_dev.dma_device;
+	struct mlx5_ib_ucontext *uctx = NULL;
 	int size;
-	__be64 *pas;
+	void *xlt;
 	dma_addr_t dma;
 	struct mlx5_umr_wr wr;
 	struct ib_sge sg;
 	int err = 0;
-	const int page_index_alignment = MLX5_UMR_MTT_ALIGNMENT /
-					 sizeof(struct mlx5_mtt);
-	const int page_index_mask = page_index_alignment - 1;
+	int desc_size = sizeof(struct mlx5_mtt);
+	const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size;
+	const int page_mask = page_align - 1;
 	size_t pages_mapped = 0;
 	size_t pages_to_map = 0;
 	size_t pages_iter = 0;
-	int use_emergency_buf = 0;
+	gfp_t gfp;
 
 	/* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes,
-	 * so we need to align the offset and length accordingly */
-	if (start_page_index & page_index_mask) {
-		npages += start_page_index & page_index_mask;
-		start_page_index &= ~page_index_mask;
+	 * so we need to align the offset and length accordingly
+	 */
+	if (idx & page_mask) {
+		npages += idx & page_mask;
+		idx &= ~page_mask;
 	}
 
-	pages_to_map = ALIGN(npages, page_index_alignment);
+	gfp = flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC : GFP_KERNEL;
+	gfp |= __GFP_ZERO | __GFP_NOWARN;
 
-	if (start_page_index + pages_to_map > MLX5_MAX_UMR_PAGES)
-		return -EINVAL;
+	pages_to_map = ALIGN(npages, page_align);
+	size = desc_size * pages_to_map;
+	size = min_t(int, size, MLX5_MAX_UMR_CHUNK);
 
-	size = sizeof(struct mlx5_mtt) * pages_to_map;
-	size = min_t(int, PAGE_SIZE, size);
-	/* We allocate with GFP_ATOMIC to avoid recursion into page-reclaim
-	 * code, when we are called from an invalidation. The pas buffer must
-	 * be 2k-aligned for Connect-IB. */
-	pas = (__be64 *)get_zeroed_page(GFP_ATOMIC);
-	if (!pas) {
-		mlx5_ib_warn(dev, "unable to allocate memory during MTT update, falling back to slower chunked mechanism.\n");
-		pas = mlx5_ib_update_mtt_emergency_buffer;
-		size = MLX5_UMR_MTT_MIN_CHUNK_SIZE;
-		use_emergency_buf = 1;
-		mutex_lock(&mlx5_ib_update_mtt_emergency_buffer_mutex);
-		memset(pas, 0, size);
+	xlt = (void *)__get_free_pages(gfp, get_order(size));
+	if (!xlt && size > MLX5_SPARE_UMR_CHUNK) {
+		mlx5_ib_dbg(dev, "Failed to allocate %d bytes of order %d. fallback to spare UMR allocation od %d bytes\n",
+			    size, get_order(size), MLX5_SPARE_UMR_CHUNK);
+
+		size = MLX5_SPARE_UMR_CHUNK;
+		xlt = (void *)__get_free_pages(gfp, get_order(size));
 	}
-	pages_iter = size / sizeof(struct mlx5_mtt);
-	dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE);
+
+	if (!xlt) {
+		uctx = to_mucontext(mr->ibmr.uobject->context);
+		mlx5_ib_warn(dev, "Using XLT emergency buffer\n");
+		size = PAGE_SIZE;
+		xlt = (void *)uctx->upd_xlt_page;
+		mutex_lock(&uctx->upd_xlt_page_mutex);
+		memset(xlt, 0, size);
+	}
+	pages_iter = size / desc_size;
+	dma = dma_map_single(ddev, xlt, size, DMA_TO_DEVICE);
 	if (dma_mapping_error(ddev, dma)) {
-		mlx5_ib_err(dev, "unable to map DMA during MTT update.\n");
+		mlx5_ib_err(dev, "unable to map DMA during XLT update.\n");
 		err = -ENOMEM;
-		goto free_pas;
+		goto free_xlt;
 	}
 
+	sg.addr = dma;
+	sg.lkey = dev->umrc.pd->local_dma_lkey;
+
+	memset(&wr, 0, sizeof(wr));
+	wr.wr.send_flags = MLX5_IB_SEND_UMR_UPDATE_XLT;
+	if (!(flags & MLX5_IB_UPD_XLT_ENABLE))
+		wr.wr.send_flags |= MLX5_IB_SEND_UMR_FAIL_IF_FREE;
+	wr.wr.sg_list = &sg;
+	wr.wr.num_sge = 1;
+	wr.wr.opcode = MLX5_IB_WR_UMR;
+
+	wr.pd = mr->ibmr.pd;
+	wr.mkey = mr->mmkey.key;
+	wr.length = mr->mmkey.size;
+	wr.virt_addr = mr->mmkey.iova;
+	wr.access_flags = mr->access_flags;
+	wr.page_shift = page_shift;
+
 	for (pages_mapped = 0;
 	     pages_mapped < pages_to_map && !err;
-	     pages_mapped += pages_iter, start_page_index += pages_iter) {
+	     pages_mapped += pages_iter, idx += pages_iter) {
 		dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE);
-
-		npages = min_t(size_t,
-			       pages_iter,
-			       ib_umem_num_pages(umem) - start_page_index);
-
-		if (!zap) {
-			__mlx5_ib_populate_pas(dev, umem, PAGE_SHIFT,
-					       start_page_index, npages, pas,
-					       MLX5_IB_MTT_PRESENT);
-			/* Clear padding after the pages brought from the
-			 * umem. */
-			memset(pas + npages, 0, size - npages *
-			       sizeof(struct mlx5_mtt));
-		}
+		npages = populate_xlt(mr, idx, pages_iter, xlt,
+				      page_shift, size, flags);
 
 		dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE);
 
-		memset(&wr, 0, sizeof(wr));
-
-		sg.addr = dma;
-		sg.length = ALIGN(npages * sizeof(struct mlx5_mtt),
-				MLX5_UMR_MTT_ALIGNMENT);
-		sg.lkey = dev->umrc.pd->local_dma_lkey;
+		sg.length = ALIGN(npages * desc_size,
+				  MLX5_UMR_MTT_ALIGNMENT);
+
+		if (pages_mapped + pages_iter >= pages_to_map) {
+			if (flags & MLX5_IB_UPD_XLT_ENABLE)
+				wr.wr.send_flags |=
+					MLX5_IB_SEND_UMR_ENABLE_MR |
+					MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS |
+					MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
+			if (flags & MLX5_IB_UPD_XLT_PD ||
+			    flags & MLX5_IB_UPD_XLT_ACCESS)
+				wr.wr.send_flags |=
+					MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
+			if (flags & MLX5_IB_UPD_XLT_ADDR)
+				wr.wr.send_flags |=
+					MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
+		}
 
-		wr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE |
-				   MLX5_IB_SEND_UMR_UPDATE_XLT;
-		wr.wr.sg_list = &sg;
-		wr.wr.num_sge = 1;
-		wr.wr.opcode = MLX5_IB_WR_UMR;
+		wr.offset = idx * desc_size;
 		wr.xlt_size = sg.length;
-		wr.page_shift = PAGE_SHIFT;
-		wr.mkey = mr->mmkey.key;
-		wr.offset = start_page_index * sizeof(struct mlx5_mtt);
 
 		err = mlx5_ib_post_send_wait(dev, &wr);
 	}
 	dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
 
-free_pas:
-	if (!use_emergency_buf)
-		free_page((unsigned long)pas);
+free_xlt:
+	if (uctx)
+		mutex_unlock(&uctx->upd_xlt_page_mutex);
 	else
-		mutex_unlock(&mlx5_ib_update_mtt_emergency_buffer_mutex);
+		free_pages((unsigned long)xlt, get_order(size));
 
 	return err;
 }
-#endif
 
 /*
  * If ibmr is NULL it will be allocated by reg_create.
@@ -1204,7 +1139,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
         if (err < 0)
 		return ERR_PTR(err);
 
-	if (use_umr(order)) {
+	if (use_umr(dev, order)) {
 		mr = reg_umr(pd, umem, virt_addr, length, ncont, page_shift,
 			     order, access_flags);
 		if (PTR_ERR(mr) == -EAGAIN) {
@@ -1254,39 +1189,25 @@ static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 	if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
 		return 0;
 
-	prep_umr_unreg_wqe(dev, &umrwr.wr, mr->mmkey.key);
+	umrwr.wr.send_flags = MLX5_IB_SEND_UMR_DISABLE_MR |
+			      MLX5_IB_SEND_UMR_FAIL_IF_FREE;
+	umrwr.wr.opcode = MLX5_IB_WR_UMR;
+	umrwr.mkey = mr->mmkey.key;
 
 	return mlx5_ib_post_send_wait(dev, &umrwr);
 }
 
-static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, u64 virt_addr,
-		     u64 length, int npages, int page_shift, int order,
+static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr,
 		     int access_flags, int flags)
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
-	struct device *ddev = dev->ib_dev.dma_device;
 	struct mlx5_umr_wr umrwr = {};
-	struct ib_sge sg;
-	dma_addr_t dma = 0;
-	__be64 *mr_pas = NULL;
-	int size;
 	int err;
 
 	umrwr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE;
 
-	if (flags & IB_MR_REREG_TRANS) {
-		err = dma_map_mr_pas(dev, mr->umem, npages, page_shift, &size,
-				     &mr_pas, &dma);
-		if (err)
-			return err;
-
-		umrwr.virt_addr = virt_addr;
-		umrwr.length = length;
-		umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
-	}
-
-	prep_umr_wqe_common(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key,
-			    page_shift);
+	umrwr.wr.opcode = MLX5_IB_WR_UMR;
+	umrwr.mkey = mr->mmkey.key;
 
 	if (flags & IB_MR_REREG_PD || flags & IB_MR_REREG_ACCESS) {
 		umrwr.pd = pd;
@@ -1294,13 +1215,8 @@ static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, u64 virt_addr,
 		umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
 	}
 
-	/* post send request to UMR QP */
 	err = mlx5_ib_post_send_wait(dev, &umrwr);
 
-	if (flags & IB_MR_REREG_TRANS) {
-		dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
-		kfree(mr_pas);
-	}
 	return err;
 }
 
@@ -1317,6 +1233,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 	u64 addr = (flags & IB_MR_REREG_TRANS) ? virt_addr : mr->umem->address;
 	u64 len = (flags & IB_MR_REREG_TRANS) ? length : mr->umem->length;
 	int page_shift = 0;
+	int upd_flags = 0;
 	int npages = 0;
 	int ncont = 0;
 	int order = 0;
@@ -1325,6 +1242,8 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 	mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
 		    start, virt_addr, length, access_flags);
 
+	atomic_sub(mr->npages, &dev->mdev->priv.reg_pages);
+
 	if (flags != IB_MR_REREG_PD) {
 		/*
 		 * Replace umem. This needs to be done whether or not UMR is
@@ -1335,7 +1254,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 		err = mr_umem_get(pd, addr, len, access_flags, &mr->umem,
 				  &npages, &page_shift, &ncont, &order);
 		if (err < 0) {
-			mr->umem = NULL;
+			clean_mr(mr);
 			return err;
 		}
 	}
@@ -1367,32 +1286,37 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 		/*
 		 * Send a UMR WQE
 		 */
-		err = rereg_umr(pd, mr, addr, len, npages, page_shift,
-				order, access_flags, flags);
+		mr->ibmr.pd = pd;
+		mr->access_flags = access_flags;
+		mr->mmkey.iova = addr;
+		mr->mmkey.size = len;
+		mr->mmkey.pd = to_mpd(pd)->pdn;
+
+		if (flags & IB_MR_REREG_TRANS) {
+			upd_flags = MLX5_IB_UPD_XLT_ADDR;
+			if (flags & IB_MR_REREG_PD)
+				upd_flags |= MLX5_IB_UPD_XLT_PD;
+			if (flags & IB_MR_REREG_ACCESS)
+				upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
+			err = mlx5_ib_update_xlt(mr, 0, npages, page_shift,
+						 upd_flags);
+		} else {
+			err = rereg_umr(pd, mr, access_flags, flags);
+		}
+
 		if (err) {
 			mlx5_ib_warn(dev, "Failed to rereg UMR\n");
+			ib_umem_release(mr->umem);
+			clean_mr(mr);
 			return err;
 		}
 	}
 
-	if (flags & IB_MR_REREG_PD) {
-		ib_mr->pd = pd;
-		mr->mmkey.pd = to_mpd(pd)->pdn;
-	}
+	set_mr_fileds(dev, mr, npages, len, access_flags);
 
-	if (flags & IB_MR_REREG_ACCESS)
-		mr->access_flags = access_flags;
-
-	if (flags & IB_MR_REREG_TRANS) {
-		atomic_sub(mr->npages, &dev->mdev->priv.reg_pages);
-		set_mr_fileds(dev, mr, npages, len, access_flags);
-		mr->mmkey.iova = addr;
-		mr->mmkey.size = len;
-	}
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 	update_odp_mr(mr);
 #endif
-
 	return 0;
 }
 
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 1e73c127feb7..cfd7ee500c47 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -91,16 +91,21 @@ void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
 			u64 umr_offset = idx & umr_block_mask;
 
 			if (in_block && umr_offset == 0) {
-				mlx5_ib_update_mtt(mr, blk_start_idx,
-						   idx - blk_start_idx, 1);
+				mlx5_ib_update_xlt(mr, blk_start_idx,
+						   idx - blk_start_idx,
+						   PAGE_SHIFT,
+						   MLX5_IB_UPD_XLT_ZAP |
+						   MLX5_IB_UPD_XLT_ATOMIC);
 				in_block = 0;
 			}
 		}
 	}
 	if (in_block)
-		mlx5_ib_update_mtt(mr, blk_start_idx, idx - blk_start_idx + 1,
-				   1);
-
+		mlx5_ib_update_xlt(mr, blk_start_idx,
+				   idx - blk_start_idx + 1,
+				   PAGE_SHIFT,
+				   MLX5_IB_UPD_XLT_ZAP |
+				   MLX5_IB_UPD_XLT_ATOMIC);
 	/*
 	 * We are now sure that the device will not access the
 	 * memory. We can safely unmap it, and mark it as dirty if
@@ -257,7 +262,9 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
 			 * this MR, since ib_umem_odp_map_dma_pages already
 			 * checks this.
 			 */
-			ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0);
+			ret = mlx5_ib_update_xlt(mr, start_idx, npages,
+						 PAGE_SHIFT,
+						 MLX5_IB_UPD_XLT_ATOMIC);
 		} else {
 			ret = -EAGAIN;
 		}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 54e5a786f191..1713bd8d44a4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -152,6 +152,26 @@ static struct mlx5_profile profile[] = {
 			.size	= 8,
 			.limit	= 4
 		},
+		.mr_cache[16]	= {
+			.size	= 8,
+			.limit	= 4
+		},
+		.mr_cache[17]	= {
+			.size	= 8,
+			.limit	= 4
+		},
+		.mr_cache[18]	= {
+			.size	= 8,
+			.limit	= 4
+		},
+		.mr_cache[19]	= {
+			.size	= 4,
+			.limit	= 2
+		},
+		.mr_cache[20]	= {
+			.size	= 4,
+			.limit	= 2
+		},
 	},
 };
 
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 0ae55361e674..ec52f3b50bf5 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -959,7 +959,7 @@ enum {
 };
 
 enum {
-	MAX_MR_CACHE_ENTRIES    = 16,
+	MAX_MR_CACHE_ENTRIES    = 21,
 };
 
 enum {
-- 
cgit v1.2.3


From 223cdc72429079aaf72511d2677b5d6584866313 Mon Sep 17 00:00:00 2001
From: Artemy Kovalyov <artemyko@mellanox.com>
Date: Mon, 2 Jan 2017 11:37:45 +0200
Subject: net/mlx5: Update PAGE_FAULT_RESUME layout

Update PAGE_FAULT_RESUME command layout.

Three bit fields describing page fault: rdma, rdma_write, req_res gave 8
possible combinations, while only a few were legal. Now they
are interpreted as three-bit type field, where former legal
combinations turns into corresponding types and unused were added as new
types.

Signed-off-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/qp.c | 10 ++--------
 include/linux/mlx5/mlx5_ifc.h                |  9 ++++-----
 2 files changed, 6 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
index d0a4005fe63a..5378a5f74bdc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
@@ -515,14 +515,8 @@ int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn,
 
 	MLX5_SET(page_fault_resume_in, in, opcode,
 		 MLX5_CMD_OP_PAGE_FAULT_RESUME);
-	MLX5_SET(page_fault_resume_in, in, qpn, qpn);
-
-	if (flags & MLX5_PAGE_FAULT_RESUME_REQUESTOR)
-		MLX5_SET(page_fault_resume_in, in, req_res, 1);
-	if (flags & MLX5_PAGE_FAULT_RESUME_WRITE)
-		MLX5_SET(page_fault_resume_in, in, read_write, 1);
-	if (flags & MLX5_PAGE_FAULT_RESUME_RDMA)
-		MLX5_SET(page_fault_resume_in, in, rdma, 1);
+	MLX5_SET(page_fault_resume_in, in, wq_number, qpn);
+	MLX5_SET(page_fault_resume_in, in, page_fault_type, flags);
 	if (error)
 		MLX5_SET(page_fault_resume_in, in, error, 1);
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 7c760e580268..608dc988b3d6 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -4775,12 +4775,11 @@ struct mlx5_ifc_page_fault_resume_in_bits {
 
 	u8         error[0x1];
 	u8         reserved_at_41[0x4];
-	u8         rdma[0x1];
-	u8         read_write[0x1];
-	u8         req_res[0x1];
-	u8         qpn[0x18];
+	u8         page_fault_type[0x3];
+	u8         wq_number[0x18];
 
-	u8         reserved_at_60[0x20];
+	u8         reserved_at_60[0x8];
+	u8         token[0x18];
 };
 
 struct mlx5_ifc_nop_out_bits {
-- 
cgit v1.2.3


From d9aaed838765e28234cb700c7d1ac975cadf28c9 Mon Sep 17 00:00:00 2001
From: Artemy Kovalyov <artemyko@mellanox.com>
Date: Mon, 2 Jan 2017 11:37:46 +0200
Subject: {net,IB}/mlx5: Refactor page fault handling

* Update page fault event according to last specification.
* Separate code path for page fault EQ, completion EQ and async EQ.
* Move page fault handling work queue from mlx5_ib static variable
  into mlx5_core page fault EQ.
* Allocate memory to store ODP event dynamically as the
  events arrive, since in atomic context - use mempool.
* Make mlx5_ib page fault handler run in process context.

Signed-off-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx5/main.c                  |  14 +-
 drivers/infiniband/hw/mlx5/mlx5_ib.h               |  49 +---
 drivers/infiniband/hw/mlx5/odp.c                   | 300 ++++++++-------------
 drivers/infiniband/hw/mlx5/qp.c                    |  26 --
 drivers/net/ethernet/mellanox/mlx5/core/dev.c      |  33 +++
 drivers/net/ethernet/mellanox/mlx5/core/eq.c       | 290 +++++++++++++++++---
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |  21 +-
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |   2 +
 drivers/net/ethernet/mellanox/mlx5/core/qp.c       | 108 --------
 include/linux/mlx5/device.h                        |   6 +-
 include/linux/mlx5/driver.h                        |  97 ++++++-
 include/linux/mlx5/qp.h                            |  44 ---
 12 files changed, 522 insertions(+), 468 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index b87127206ef2..86c61e73780e 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -3319,6 +3319,9 @@ static struct mlx5_interface mlx5_ib_interface = {
 	.add            = mlx5_ib_add,
 	.remove         = mlx5_ib_remove,
 	.event          = mlx5_ib_event,
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	.pfault		= mlx5_ib_pfault,
+#endif
 	.protocol	= MLX5_INTERFACE_PROTOCOL_IB,
 };
 
@@ -3329,25 +3332,14 @@ static int __init mlx5_ib_init(void)
 	if (deprecated_prof_sel != 2)
 		pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n");
 
-	err = mlx5_ib_odp_init();
-	if (err)
-		return err;
-
 	err = mlx5_register_interface(&mlx5_ib_interface);
-	if (err)
-		goto clean_odp;
-
-	return err;
 
-clean_odp:
-	mlx5_ib_odp_cleanup();
 	return err;
 }
 
 static void __exit mlx5_ib_cleanup(void)
 {
 	mlx5_unregister_interface(&mlx5_ib_interface);
-	mlx5_ib_odp_cleanup();
 }
 
 module_init(mlx5_ib_init);
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 02d925573945..a51c8051aeb2 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -277,29 +277,6 @@ struct mlx5_ib_rwq_ind_table {
 	u32			rqtn;
 };
 
-/*
- * Connect-IB can trigger up to four concurrent pagefaults
- * per-QP.
- */
-enum mlx5_ib_pagefault_context {
-	MLX5_IB_PAGEFAULT_RESPONDER_READ,
-	MLX5_IB_PAGEFAULT_REQUESTOR_READ,
-	MLX5_IB_PAGEFAULT_RESPONDER_WRITE,
-	MLX5_IB_PAGEFAULT_REQUESTOR_WRITE,
-	MLX5_IB_PAGEFAULT_CONTEXTS
-};
-
-static inline enum mlx5_ib_pagefault_context
-	mlx5_ib_get_pagefault_context(struct mlx5_pagefault *pagefault)
-{
-	return pagefault->flags & (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE);
-}
-
-struct mlx5_ib_pfault {
-	struct work_struct	work;
-	struct mlx5_pagefault	mpfault;
-};
-
 struct mlx5_ib_ubuffer {
 	struct ib_umem	       *umem;
 	int			buf_size;
@@ -385,20 +362,6 @@ struct mlx5_ib_qp {
 	/* Store signature errors */
 	bool			signature_en;
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	/*
-	 * A flag that is true for QP's that are in a state that doesn't
-	 * allow page faults, and shouldn't schedule any more faults.
-	 */
-	int                     disable_page_faults;
-	/*
-	 * The disable_page_faults_lock protects a QP's disable_page_faults
-	 * field, allowing for a thread to atomically check whether the QP
-	 * allows page faults, and if so schedule a page fault.
-	 */
-	spinlock_t              disable_page_faults_lock;
-	struct mlx5_ib_pfault	pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS];
-#endif
 	struct list_head	qps_list;
 	struct list_head	cq_recv_list;
 	struct list_head	cq_send_list;
@@ -869,18 +832,13 @@ struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device,
 int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table);
 
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-extern struct workqueue_struct *mlx5_ib_page_fault_wq;
-
 void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev);
-void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp,
-			       struct mlx5_ib_pfault *pfault);
-void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp);
+void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context,
+		    struct mlx5_pagefault *pfault);
 int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
 void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev);
 int __init mlx5_ib_odp_init(void);
 void mlx5_ib_odp_cleanup(void);
-void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp);
-void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp);
 void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
 			      unsigned long end);
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
@@ -889,13 +847,10 @@ static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 	return;
 }
 
-static inline void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp)		{}
 static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; }
 static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev)	{}
 static inline int mlx5_ib_odp_init(void) { return 0; }
 static inline void mlx5_ib_odp_cleanup(void)				{}
-static inline void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) {}
-static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp)  {}
 
 #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index cfd7ee500c47..26f96c79a45a 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -41,8 +41,6 @@
  * a pagefault. */
 #define MMU_NOTIFIER_TIMEOUT 1000
 
-struct workqueue_struct *mlx5_ib_page_fault_wq;
-
 void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
 			      unsigned long end)
 {
@@ -162,38 +160,38 @@ static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
 	return container_of(mmkey, struct mlx5_ib_mr, mmkey);
 }
 
-static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp,
-				      struct mlx5_ib_pfault *pfault,
+static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
+				      struct mlx5_pagefault *pfault,
 				      int error)
 {
-	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
-	u32 qpn = qp->trans_qp.base.mqp.qpn;
+	int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
+		     pfault->wqe.wq_num : pfault->token;
 	int ret = mlx5_core_page_fault_resume(dev->mdev,
-					      qpn,
-					      pfault->mpfault.flags,
+					      pfault->token,
+					      wq_num,
+					      pfault->type,
 					      error);
 	if (ret)
-		pr_err("Failed to resolve the page fault on QP 0x%x\n", qpn);
+		mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x\n",
+			    wq_num);
 }
 
 /*
- * Handle a single data segment in a page-fault WQE.
+ * Handle a single data segment in a page-fault WQE or RDMA region.
  *
- * Returns number of pages retrieved on success. The caller will continue to
+ * Returns number of pages retrieved on success. The caller may continue to
  * the next data segment.
  * Can return the following error codes:
  * -EAGAIN to designate a temporary error. The caller will abort handling the
  *  page fault and resolve it.
  * -EFAULT when there's an error mapping the requested pages. The caller will
- *  abort the page fault handling and possibly move the QP to an error state.
- * On other errors the QP should also be closed with an error.
+ *  abort the page fault handling.
  */
-static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
-					 struct mlx5_ib_pfault *pfault,
+static int pagefault_single_data_segment(struct mlx5_ib_dev *mib_dev,
 					 u32 key, u64 io_virt, size_t bcnt,
+					 u32 *bytes_committed,
 					 u32 *bytes_mapped)
 {
-	struct mlx5_ib_dev *mib_dev = to_mdev(qp->ibqp.pd->device);
 	int srcu_key;
 	unsigned int current_seq;
 	u64 start_idx;
@@ -219,12 +217,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
 			 key);
 		if (bytes_mapped)
 			*bytes_mapped +=
-				(bcnt - pfault->mpfault.bytes_committed);
-		goto srcu_unlock;
-	}
-	if (mr->ibmr.pd != qp->ibqp.pd) {
-		pr_err("Page-fault with different PDs for QP and MR.\n");
-		ret = -EFAULT;
+				(bcnt - *bytes_committed);
 		goto srcu_unlock;
 	}
 
@@ -240,8 +233,8 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
 	 * in all iterations (in iteration 2 and above,
 	 * bytes_committed == 0).
 	 */
-	io_virt += pfault->mpfault.bytes_committed;
-	bcnt -= pfault->mpfault.bytes_committed;
+	io_virt += *bytes_committed;
+	bcnt -= *bytes_committed;
 
 	start_idx = (io_virt - (mr->mmkey.iova & PAGE_MASK)) >> PAGE_SHIFT;
 
@@ -300,7 +293,7 @@ srcu_unlock:
 		}
 	}
 	srcu_read_unlock(&mib_dev->mr_srcu, srcu_key);
-	pfault->mpfault.bytes_committed = 0;
+	*bytes_committed = 0;
 	return ret ? ret : npages;
 }
 
@@ -322,8 +315,9 @@ srcu_unlock:
  * Returns the number of pages loaded if positive, zero for an empty WQE, or a
  * negative error code.
  */
-static int pagefault_data_segments(struct mlx5_ib_qp *qp,
-				   struct mlx5_ib_pfault *pfault, void *wqe,
+static int pagefault_data_segments(struct mlx5_ib_dev *dev,
+				   struct mlx5_pagefault *pfault,
+				   struct mlx5_ib_qp *qp, void *wqe,
 				   void *wqe_end, u32 *bytes_mapped,
 				   u32 *total_wqe_bytes, int receive_queue)
 {
@@ -367,22 +361,23 @@ static int pagefault_data_segments(struct mlx5_ib_qp *qp,
 
 		if (!inline_segment && total_wqe_bytes) {
 			*total_wqe_bytes += bcnt - min_t(size_t, bcnt,
-					pfault->mpfault.bytes_committed);
+					pfault->bytes_committed);
 		}
 
 		/* A zero length data segment designates a length of 2GB. */
 		if (bcnt == 0)
 			bcnt = 1U << 31;
 
-		if (inline_segment || bcnt <= pfault->mpfault.bytes_committed) {
-			pfault->mpfault.bytes_committed -=
+		if (inline_segment || bcnt <= pfault->bytes_committed) {
+			pfault->bytes_committed -=
 				min_t(size_t, bcnt,
-				      pfault->mpfault.bytes_committed);
+				      pfault->bytes_committed);
 			continue;
 		}
 
-		ret = pagefault_single_data_segment(qp, pfault, key, io_virt,
-						    bcnt, bytes_mapped);
+		ret = pagefault_single_data_segment(dev, key, io_virt, bcnt,
+						    &pfault->bytes_committed,
+						    bytes_mapped);
 		if (ret < 0)
 			break;
 		npages += ret;
@@ -396,12 +391,11 @@ static int pagefault_data_segments(struct mlx5_ib_qp *qp,
  * scatter-gather list, and set wqe_end to the end of the WQE.
  */
 static int mlx5_ib_mr_initiator_pfault_handler(
-	struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault,
-	void **wqe, void **wqe_end, int wqe_length)
+	struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
+	struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
 {
-	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
 	struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
-	u16 wqe_index = pfault->mpfault.wqe.wqe_index;
+	u16 wqe_index = pfault->wqe.wqe_index;
 	unsigned ds, opcode;
 #if defined(DEBUG)
 	u32 ctrl_wqe_index, ctrl_qpn;
@@ -502,10 +496,9 @@ invalid_transport_or_opcode:
  * scatter-gather list, and set wqe_end to the end of the WQE.
  */
 static int mlx5_ib_mr_responder_pfault_handler(
-	struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault,
-	void **wqe, void **wqe_end, int wqe_length)
+	struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
+	struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
 {
-	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
 	struct mlx5_ib_wq *wq = &qp->rq;
 	int wqe_size = 1 << wq->wqe_shift;
 
@@ -542,70 +535,83 @@ invalid_transport_or_opcode:
 	return 0;
 }
 
-static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp,
-					  struct mlx5_ib_pfault *pfault)
+static struct mlx5_ib_qp *mlx5_ib_odp_find_qp(struct mlx5_ib_dev *dev,
+					      u32 wq_num)
+{
+	struct mlx5_core_qp *mqp = __mlx5_qp_lookup(dev->mdev, wq_num);
+
+	if (!mqp) {
+		mlx5_ib_err(dev, "QPN 0x%6x not found\n", wq_num);
+		return NULL;
+	}
+
+	return to_mibqp(mqp);
+}
+
+static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
+					  struct mlx5_pagefault *pfault)
 {
-	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
 	int ret;
 	void *wqe, *wqe_end;
 	u32 bytes_mapped, total_wqe_bytes;
 	char *buffer = NULL;
-	int resume_with_error = 0;
-	u16 wqe_index = pfault->mpfault.wqe.wqe_index;
-	int requestor = pfault->mpfault.flags & MLX5_PFAULT_REQUESTOR;
-	u32 qpn = qp->trans_qp.base.mqp.qpn;
+	int resume_with_error = 1;
+	u16 wqe_index = pfault->wqe.wqe_index;
+	int requestor = pfault->type & MLX5_PFAULT_REQUESTOR;
+	struct mlx5_ib_qp *qp;
 
 	buffer = (char *)__get_free_page(GFP_KERNEL);
 	if (!buffer) {
 		mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
-		resume_with_error = 1;
 		goto resolve_page_fault;
 	}
 
+	qp = mlx5_ib_odp_find_qp(dev, pfault->wqe.wq_num);
+	if (!qp)
+		goto resolve_page_fault;
+
 	ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer,
 				    PAGE_SIZE, &qp->trans_qp.base);
 	if (ret < 0) {
-		mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%x, wqe_index=%x, qpn=%x\n",
-			    -ret, wqe_index, qpn);
-		resume_with_error = 1;
+		mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%d, wqe_index=%x, qpn=%x\n",
+			    ret, wqe_index, pfault->token);
 		goto resolve_page_fault;
 	}
 
 	wqe = buffer;
 	if (requestor)
-		ret = mlx5_ib_mr_initiator_pfault_handler(qp, pfault, &wqe,
+		ret = mlx5_ib_mr_initiator_pfault_handler(dev, pfault, qp, &wqe,
 							  &wqe_end, ret);
 	else
-		ret = mlx5_ib_mr_responder_pfault_handler(qp, pfault, &wqe,
+		ret = mlx5_ib_mr_responder_pfault_handler(dev, pfault, qp, &wqe,
 							  &wqe_end, ret);
-	if (ret < 0) {
-		resume_with_error = 1;
+	if (ret < 0)
 		goto resolve_page_fault;
-	}
 
 	if (wqe >= wqe_end) {
 		mlx5_ib_err(dev, "ODP fault on invalid WQE.\n");
-		resume_with_error = 1;
 		goto resolve_page_fault;
 	}
 
-	ret = pagefault_data_segments(qp, pfault, wqe, wqe_end, &bytes_mapped,
-				      &total_wqe_bytes, !requestor);
+	ret = pagefault_data_segments(dev, pfault, qp, wqe, wqe_end,
+				      &bytes_mapped, &total_wqe_bytes,
+				      !requestor);
 	if (ret == -EAGAIN) {
+		resume_with_error = 0;
 		goto resolve_page_fault;
 	} else if (ret < 0 || total_wqe_bytes > bytes_mapped) {
-		mlx5_ib_err(dev, "Error getting user pages for page fault. Error: 0x%x\n",
-			    -ret);
-		resume_with_error = 1;
+		if (ret != -ENOENT)
+			mlx5_ib_err(dev, "Error getting user pages for page fault. Error: %d\n",
+				    ret);
 		goto resolve_page_fault;
 	}
 
+	resume_with_error = 0;
 resolve_page_fault:
-	mlx5_ib_page_fault_resume(qp, pfault, resume_with_error);
-	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, flags: 0x%x\n",
-		    qpn, resume_with_error,
-		    pfault->mpfault.flags);
-
+	mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
+	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
+		    pfault->token, resume_with_error,
+		    pfault->type);
 	free_page((unsigned long)buffer);
 }
 
@@ -615,15 +621,14 @@ static int pages_in_range(u64 address, u32 length)
 		(address & PAGE_MASK)) >> PAGE_SHIFT;
 }
 
-static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_qp *qp,
-					   struct mlx5_ib_pfault *pfault)
+static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
+					   struct mlx5_pagefault *pfault)
 {
-	struct mlx5_pagefault *mpfault = &pfault->mpfault;
 	u64 address;
 	u32 length;
-	u32 prefetch_len = mpfault->bytes_committed;
+	u32 prefetch_len = pfault->bytes_committed;
 	int prefetch_activated = 0;
-	u32 rkey = mpfault->rdma.r_key;
+	u32 rkey = pfault->rdma.r_key;
 	int ret;
 
 	/* The RDMA responder handler handles the page fault in two parts.
@@ -632,38 +637,40 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_qp *qp,
 	 * prefetches more pages. The second operation cannot use the pfault
 	 * context and therefore uses the dummy_pfault context allocated on
 	 * the stack */
-	struct mlx5_ib_pfault dummy_pfault = {};
+	pfault->rdma.rdma_va += pfault->bytes_committed;
+	pfault->rdma.rdma_op_len -= min(pfault->bytes_committed,
+					 pfault->rdma.rdma_op_len);
+	pfault->bytes_committed = 0;
 
-	dummy_pfault.mpfault.bytes_committed = 0;
-
-	mpfault->rdma.rdma_va += mpfault->bytes_committed;
-	mpfault->rdma.rdma_op_len -= min(mpfault->bytes_committed,
-					 mpfault->rdma.rdma_op_len);
-	mpfault->bytes_committed = 0;
-
-	address = mpfault->rdma.rdma_va;
-	length  = mpfault->rdma.rdma_op_len;
+	address = pfault->rdma.rdma_va;
+	length  = pfault->rdma.rdma_op_len;
 
 	/* For some operations, the hardware cannot tell the exact message
 	 * length, and in those cases it reports zero. Use prefetch
 	 * logic. */
 	if (length == 0) {
 		prefetch_activated = 1;
-		length = mpfault->rdma.packet_size;
+		length = pfault->rdma.packet_size;
 		prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
 	}
 
-	ret = pagefault_single_data_segment(qp, pfault, rkey, address, length,
-					    NULL);
+	ret = pagefault_single_data_segment(dev, rkey, address, length,
+					    &pfault->bytes_committed, NULL);
 	if (ret == -EAGAIN) {
 		/* We're racing with an invalidation, don't prefetch */
 		prefetch_activated = 0;
 	} else if (ret < 0 || pages_in_range(address, length) > ret) {
-		mlx5_ib_page_fault_resume(qp, pfault, 1);
+		mlx5_ib_page_fault_resume(dev, pfault, 1);
+		if (ret != -ENOENT)
+			mlx5_ib_warn(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n",
+				     ret, pfault->token, pfault->type);
 		return;
 	}
 
-	mlx5_ib_page_fault_resume(qp, pfault, 0);
+	mlx5_ib_page_fault_resume(dev, pfault, 0);
+	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n",
+		    pfault->token, pfault->type,
+		    prefetch_activated);
 
 	/* At this point, there might be a new pagefault already arriving in
 	 * the eq, switch to the dummy pagefault for the rest of the
@@ -671,112 +678,39 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_qp *qp,
 	 * work-queue is being fenced. */
 
 	if (prefetch_activated) {
-		ret = pagefault_single_data_segment(qp, &dummy_pfault, rkey,
-						    address,
+		u32 bytes_committed = 0;
+
+		ret = pagefault_single_data_segment(dev, rkey, address,
 						    prefetch_len,
-						    NULL);
+						    &bytes_committed, NULL);
 		if (ret < 0) {
-			pr_warn("Prefetch failed (ret = %d, prefetch_activated = %d) for QPN %d, address: 0x%.16llx, length = 0x%.16x\n",
-				ret, prefetch_activated,
-				qp->ibqp.qp_num, address, prefetch_len);
+			mlx5_ib_warn(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
+				     ret, pfault->token, address,
+				     prefetch_len);
 		}
 	}
 }
 
-void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp,
-			       struct mlx5_ib_pfault *pfault)
+void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context,
+		    struct mlx5_pagefault *pfault)
 {
-	u8 event_subtype = pfault->mpfault.event_subtype;
+	struct mlx5_ib_dev *dev = context;
+	u8 event_subtype = pfault->event_subtype;
 
 	switch (event_subtype) {
 	case MLX5_PFAULT_SUBTYPE_WQE:
-		mlx5_ib_mr_wqe_pfault_handler(qp, pfault);
+		mlx5_ib_mr_wqe_pfault_handler(dev, pfault);
 		break;
 	case MLX5_PFAULT_SUBTYPE_RDMA:
-		mlx5_ib_mr_rdma_pfault_handler(qp, pfault);
+		mlx5_ib_mr_rdma_pfault_handler(dev, pfault);
 		break;
 	default:
-		pr_warn("Invalid page fault event subtype: 0x%x\n",
-			event_subtype);
-		mlx5_ib_page_fault_resume(qp, pfault, 1);
-		break;
+		mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n",
+			    event_subtype);
+		mlx5_ib_page_fault_resume(dev, pfault, 1);
 	}
 }
 
-static void mlx5_ib_qp_pfault_action(struct work_struct *work)
-{
-	struct mlx5_ib_pfault *pfault = container_of(work,
-						     struct mlx5_ib_pfault,
-						     work);
-	enum mlx5_ib_pagefault_context context =
-		mlx5_ib_get_pagefault_context(&pfault->mpfault);
-	struct mlx5_ib_qp *qp = container_of(pfault, struct mlx5_ib_qp,
-					     pagefaults[context]);
-	mlx5_ib_mr_pfault_handler(qp, pfault);
-}
-
-void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
-	qp->disable_page_faults = 1;
-	spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);
-
-	/*
-	 * Note that at this point, we are guarenteed that no more
-	 * work queue elements will be posted to the work queue with
-	 * the QP we are closing.
-	 */
-	flush_workqueue(mlx5_ib_page_fault_wq);
-}
-
-void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
-	qp->disable_page_faults = 0;
-	spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);
-}
-
-static void mlx5_ib_pfault_handler(struct mlx5_core_qp *qp,
-				   struct mlx5_pagefault *pfault)
-{
-	/*
-	 * Note that we will only get one fault event per QP per context
-	 * (responder/initiator, read/write), until we resolve the page fault
-	 * with the mlx5_ib_page_fault_resume command. Since this function is
-	 * called from within the work element, there is no risk of missing
-	 * events.
-	 */
-	struct mlx5_ib_qp *mibqp = to_mibqp(qp);
-	enum mlx5_ib_pagefault_context context =
-		mlx5_ib_get_pagefault_context(pfault);
-	struct mlx5_ib_pfault *qp_pfault = &mibqp->pagefaults[context];
-
-	qp_pfault->mpfault = *pfault;
-
-	/* No need to stop interrupts here since we are in an interrupt */
-	spin_lock(&mibqp->disable_page_faults_lock);
-	if (!mibqp->disable_page_faults)
-		queue_work(mlx5_ib_page_fault_wq, &qp_pfault->work);
-	spin_unlock(&mibqp->disable_page_faults_lock);
-}
-
-void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp)
-{
-	int i;
-
-	qp->disable_page_faults = 1;
-	spin_lock_init(&qp->disable_page_faults_lock);
-
-	qp->trans_qp.base.mqp.pfault_handler = mlx5_ib_pfault_handler;
-
-	for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i)
-		INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action);
-}
-
 int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev)
 {
 	int ret;
@@ -793,17 +727,3 @@ void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev)
 	cleanup_srcu_struct(&ibdev->mr_srcu);
 }
 
-int __init mlx5_ib_odp_init(void)
-{
-	mlx5_ib_page_fault_wq = alloc_ordered_workqueue("mlx5_ib_page_faults",
-							WQ_MEM_RECLAIM);
-	if (!mlx5_ib_page_fault_wq)
-		return -ENOMEM;
-
-	return 0;
-}
-
-void mlx5_ib_odp_cleanup(void)
-{
-	destroy_workqueue(mlx5_ib_page_fault_wq);
-}
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index ec2301ac0fde..53f4dd32f956 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -1526,9 +1526,6 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 	       &qp->raw_packet_qp.rq.base :
 	       &qp->trans_qp.base;
 
-	if (init_attr->qp_type != IB_QPT_RAW_PACKET)
-		mlx5_ib_odp_create_qp(qp);
-
 	mutex_init(&qp->mutex);
 	spin_lock_init(&qp->sq.lock);
 	spin_lock_init(&qp->rq.lock);
@@ -1923,7 +1920,6 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
 
 	if (qp->state != IB_QPS_RESET) {
 		if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET) {
-			mlx5_ib_qp_disable_pagefaults(qp);
 			err = mlx5_core_qp_modify(dev->mdev,
 						  MLX5_CMD_OP_2RST_QP, 0,
 						  NULL, &base->mqp);
@@ -2823,16 +2819,6 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 	if (mlx5_st < 0)
 		goto out;
 
-	/* If moving to a reset or error state, we must disable page faults on
-	 * this QP and flush all current page faults. Otherwise a stale page
-	 * fault may attempt to work on this QP after it is reset and moved
-	 * again to RTS, and may cause the driver and the device to get out of
-	 * sync. */
-	if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR &&
-	    (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR) &&
-	    (qp->ibqp.qp_type != IB_QPT_RAW_PACKET))
-		mlx5_ib_qp_disable_pagefaults(qp);
-
 	if (mlx5_cur >= MLX5_QP_NUM_STATE || mlx5_new >= MLX5_QP_NUM_STATE ||
 	    !optab[mlx5_cur][mlx5_new])
 		goto out;
@@ -2864,10 +2850,6 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 	if (err)
 		goto out;
 
-	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT &&
-	    (qp->ibqp.qp_type != IB_QPT_RAW_PACKET))
-		mlx5_ib_qp_enable_pagefaults(qp);
-
 	qp->state = new_state;
 
 	if (attr_mask & IB_QP_ACCESS_FLAGS)
@@ -4533,14 +4515,6 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
 		return mlx5_ib_gsi_query_qp(ibqp, qp_attr, qp_attr_mask,
 					    qp_init_attr);
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	/*
-	 * Wait for any outstanding page faults, in case the user frees memory
-	 * based upon this query's result.
-	 */
-	flush_workqueue(mlx5_ib_page_fault_wq);
-#endif
-
 	mutex_lock(&qp->mutex);
 
 	if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
index a9dbc28f6b97..a62f4b6a21a5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
@@ -71,6 +71,16 @@ void mlx5_add_device(struct mlx5_interface *intf, struct mlx5_priv *priv)
 	if (dev_ctx->context) {
 		spin_lock_irq(&priv->ctx_lock);
 		list_add_tail(&dev_ctx->list, &priv->ctx_list);
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+		if (dev_ctx->intf->pfault) {
+			if (priv->pfault) {
+				mlx5_core_err(dev, "multiple page fault handlers not supported");
+			} else {
+				priv->pfault_ctx = dev_ctx->context;
+				priv->pfault = dev_ctx->intf->pfault;
+			}
+		}
+#endif
 		spin_unlock_irq(&priv->ctx_lock);
 	} else {
 		kfree(dev_ctx);
@@ -97,6 +107,15 @@ void mlx5_remove_device(struct mlx5_interface *intf, struct mlx5_priv *priv)
 	if (!dev_ctx)
 		return;
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	spin_lock_irq(&priv->ctx_lock);
+	if (priv->pfault == dev_ctx->intf->pfault)
+		priv->pfault = NULL;
+	spin_unlock_irq(&priv->ctx_lock);
+
+	synchronize_srcu(&priv->pfault_srcu);
+#endif
+
 	spin_lock_irq(&priv->ctx_lock);
 	list_del(&dev_ctx->list);
 	spin_unlock_irq(&priv->ctx_lock);
@@ -329,6 +348,20 @@ void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event,
 	spin_unlock_irqrestore(&priv->ctx_lock, flags);
 }
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+void mlx5_core_page_fault(struct mlx5_core_dev *dev,
+			  struct mlx5_pagefault *pfault)
+{
+	struct mlx5_priv *priv = &dev->priv;
+	int srcu_idx;
+
+	srcu_idx = srcu_read_lock(&priv->pfault_srcu);
+	if (priv->pfault)
+		priv->pfault(dev, priv->pfault_ctx, pfault);
+	srcu_read_unlock(&priv->pfault_srcu, srcu_idx);
+}
+#endif
+
 void mlx5_dev_list_lock(void)
 {
 	mutex_lock(&mlx5_intf_mutex);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 8ffcc8808e50..4aff8ac68e14 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -54,6 +54,7 @@ enum {
 	MLX5_NUM_SPARE_EQE	= 0x80,
 	MLX5_NUM_ASYNC_EQE	= 0x100,
 	MLX5_NUM_CMD_EQE	= 32,
+	MLX5_NUM_PF_DRAIN	= 64,
 };
 
 enum {
@@ -188,10 +189,193 @@ static void eq_update_ci(struct mlx5_eq *eq, int arm)
 	mb();
 }
 
-static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+static void eqe_pf_action(struct work_struct *work)
+{
+	struct mlx5_pagefault *pfault = container_of(work,
+						     struct mlx5_pagefault,
+						     work);
+	struct mlx5_eq *eq = pfault->eq;
+
+	mlx5_core_page_fault(eq->dev, pfault);
+	mempool_free(pfault, eq->pf_ctx.pool);
+}
+
+static void eq_pf_process(struct mlx5_eq *eq)
+{
+	struct mlx5_core_dev *dev = eq->dev;
+	struct mlx5_eqe_page_fault *pf_eqe;
+	struct mlx5_pagefault *pfault;
+	struct mlx5_eqe *eqe;
+	int set_ci = 0;
+
+	while ((eqe = next_eqe_sw(eq))) {
+		pfault = mempool_alloc(eq->pf_ctx.pool, GFP_ATOMIC);
+		if (!pfault) {
+			schedule_work(&eq->pf_ctx.work);
+			break;
+		}
+
+		dma_rmb();
+		pf_eqe = &eqe->data.page_fault;
+		pfault->event_subtype = eqe->sub_type;
+		pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed);
+
+		mlx5_core_dbg(dev,
+			      "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n",
+			      eqe->sub_type, pfault->bytes_committed);
+
+		switch (eqe->sub_type) {
+		case MLX5_PFAULT_SUBTYPE_RDMA:
+			/* RDMA based event */
+			pfault->type =
+				be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24;
+			pfault->token =
+				be32_to_cpu(pf_eqe->rdma.pftype_token) &
+				MLX5_24BIT_MASK;
+			pfault->rdma.r_key =
+				be32_to_cpu(pf_eqe->rdma.r_key);
+			pfault->rdma.packet_size =
+				be16_to_cpu(pf_eqe->rdma.packet_length);
+			pfault->rdma.rdma_op_len =
+				be32_to_cpu(pf_eqe->rdma.rdma_op_len);
+			pfault->rdma.rdma_va =
+				be64_to_cpu(pf_eqe->rdma.rdma_va);
+			mlx5_core_dbg(dev,
+				      "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n",
+				      pfault->type, pfault->token,
+				      pfault->rdma.r_key);
+			mlx5_core_dbg(dev,
+				      "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n",
+				      pfault->rdma.rdma_op_len,
+				      pfault->rdma.rdma_va);
+			break;
+
+		case MLX5_PFAULT_SUBTYPE_WQE:
+			/* WQE based event */
+			pfault->type =
+				be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24;
+			pfault->token =
+				be32_to_cpu(pf_eqe->wqe.token);
+			pfault->wqe.wq_num =
+				be32_to_cpu(pf_eqe->wqe.pftype_wq) &
+				MLX5_24BIT_MASK;
+			pfault->wqe.wqe_index =
+				be16_to_cpu(pf_eqe->wqe.wqe_index);
+			pfault->wqe.packet_size =
+				be16_to_cpu(pf_eqe->wqe.packet_length);
+			mlx5_core_dbg(dev,
+				      "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n",
+				      pfault->type, pfault->token,
+				      pfault->wqe.wq_num,
+				      pfault->wqe.wqe_index);
+			break;
+
+		default:
+			mlx5_core_warn(dev,
+				       "Unsupported page fault event sub-type: 0x%02hhx\n",
+				       eqe->sub_type);
+			/* Unsupported page faults should still be
+			 * resolved by the page fault handler
+			 */
+		}
+
+		pfault->eq = eq;
+		INIT_WORK(&pfault->work, eqe_pf_action);
+		queue_work(eq->pf_ctx.wq, &pfault->work);
+
+		++eq->cons_index;
+		++set_ci;
+
+		if (unlikely(set_ci >= MLX5_NUM_SPARE_EQE)) {
+			eq_update_ci(eq, 0);
+			set_ci = 0;
+		}
+	}
+
+	eq_update_ci(eq, 1);
+}
+
+static irqreturn_t mlx5_eq_pf_int(int irq, void *eq_ptr)
+{
+	struct mlx5_eq *eq = eq_ptr;
+	unsigned long flags;
+
+	if (spin_trylock_irqsave(&eq->pf_ctx.lock, flags)) {
+		eq_pf_process(eq);
+		spin_unlock_irqrestore(&eq->pf_ctx.lock, flags);
+	} else {
+		schedule_work(&eq->pf_ctx.work);
+	}
+
+	return IRQ_HANDLED;
+}
+
+/* mempool_refill() was proposed but unfortunately wasn't accepted
+ * http://lkml.iu.edu/hypermail/linux/kernel/1512.1/05073.html
+ * Chip workaround.
+ */
+static void mempool_refill(mempool_t *pool)
+{
+	while (pool->curr_nr < pool->min_nr)
+		mempool_free(mempool_alloc(pool, GFP_KERNEL), pool);
+}
+
+static void eq_pf_action(struct work_struct *work)
+{
+	struct mlx5_eq *eq = container_of(work, struct mlx5_eq, pf_ctx.work);
+
+	mempool_refill(eq->pf_ctx.pool);
+
+	spin_lock_irq(&eq->pf_ctx.lock);
+	eq_pf_process(eq);
+	spin_unlock_irq(&eq->pf_ctx.lock);
+}
+
+static int init_pf_ctx(struct mlx5_eq_pagefault *pf_ctx, const char *name)
 {
+	spin_lock_init(&pf_ctx->lock);
+	INIT_WORK(&pf_ctx->work, eq_pf_action);
+
+	pf_ctx->wq = alloc_ordered_workqueue(name,
+					     WQ_MEM_RECLAIM);
+	if (!pf_ctx->wq)
+		return -ENOMEM;
+
+	pf_ctx->pool = mempool_create_kmalloc_pool
+		(MLX5_NUM_PF_DRAIN, sizeof(struct mlx5_pagefault));
+	if (!pf_ctx->pool)
+		goto err_wq;
+
+	return 0;
+err_wq:
+	destroy_workqueue(pf_ctx->wq);
+	return -ENOMEM;
+}
+
+int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 token,
+				u32 wq_num, u8 type, int error)
+{
+	u32 out[MLX5_ST_SZ_DW(page_fault_resume_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)]   = {0};
+
+	MLX5_SET(page_fault_resume_in, in, opcode,
+		 MLX5_CMD_OP_PAGE_FAULT_RESUME);
+	MLX5_SET(page_fault_resume_in, in, error, !!error);
+	MLX5_SET(page_fault_resume_in, in, page_fault_type, type);
+	MLX5_SET(page_fault_resume_in, in, wq_number, wq_num);
+	MLX5_SET(page_fault_resume_in, in, token, token);
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+EXPORT_SYMBOL_GPL(mlx5_core_page_fault_resume);
+#endif
+
+static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr)
+{
+	struct mlx5_eq *eq = eq_ptr;
+	struct mlx5_core_dev *dev = eq->dev;
 	struct mlx5_eqe *eqe;
-	int eqes_found = 0;
 	int set_ci = 0;
 	u32 cqn = -1;
 	u32 rsn;
@@ -276,12 +460,6 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
 			}
 			break;
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-		case MLX5_EVENT_TYPE_PAGE_FAULT:
-			mlx5_eq_pagefault(dev, eqe);
-			break;
-#endif
-
 #ifdef CONFIG_MLX5_CORE_EN
 		case MLX5_EVENT_TYPE_NIC_VPORT_CHANGE:
 			mlx5_eswitch_vport_event(dev->priv.eswitch, eqe);
@@ -299,7 +477,6 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
 		}
 
 		++eq->cons_index;
-		eqes_found = 1;
 		++set_ci;
 
 		/* The HCA will think the queue has overflowed if we
@@ -319,17 +496,6 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
 	if (cqn != -1)
 		tasklet_schedule(&eq->tasklet_ctx.task);
 
-	return eqes_found;
-}
-
-static irqreturn_t mlx5_msix_handler(int irq, void *eq_ptr)
-{
-	struct mlx5_eq *eq = eq_ptr;
-	struct mlx5_core_dev *dev = eq->dev;
-
-	mlx5_eq_int(dev, eq);
-
-	/* MSI-X vectors always belong to us */
 	return IRQ_HANDLED;
 }
 
@@ -345,22 +511,32 @@ static void init_eq_buf(struct mlx5_eq *eq)
 }
 
 int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
-		       int nent, u64 mask, const char *name, struct mlx5_uar *uar)
+		       int nent, u64 mask, const char *name,
+		       struct mlx5_uar *uar, enum mlx5_eq_type type)
 {
 	u32 out[MLX5_ST_SZ_DW(create_eq_out)] = {0};
 	struct mlx5_priv *priv = &dev->priv;
+	irq_handler_t handler;
 	__be64 *pas;
 	void *eqc;
 	int inlen;
 	u32 *in;
 	int err;
 
+	eq->type = type;
 	eq->nent = roundup_pow_of_two(nent + MLX5_NUM_SPARE_EQE);
 	eq->cons_index = 0;
 	err = mlx5_buf_alloc(dev, eq->nent * MLX5_EQE_SIZE, &eq->buf);
 	if (err)
 		return err;
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	if (type == MLX5_EQ_TYPE_PF)
+		handler = mlx5_eq_pf_int;
+	else
+#endif
+		handler = mlx5_eq_int;
+
 	init_eq_buf(eq);
 
 	inlen = MLX5_ST_SZ_BYTES(create_eq_in) +
@@ -396,7 +572,7 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
 	eq->irqn = priv->msix_arr[vecidx].vector;
 	eq->dev = dev;
 	eq->doorbell = uar->map + MLX5_EQ_DOORBEL_OFFSET;
-	err = request_irq(eq->irqn, mlx5_msix_handler, 0,
+	err = request_irq(eq->irqn, handler, 0,
 			  priv->irq_info[vecidx].name, eq);
 	if (err)
 		goto err_eq;
@@ -405,11 +581,20 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
 	if (err)
 		goto err_irq;
 
-	INIT_LIST_HEAD(&eq->tasklet_ctx.list);
-	INIT_LIST_HEAD(&eq->tasklet_ctx.process_list);
-	spin_lock_init(&eq->tasklet_ctx.lock);
-	tasklet_init(&eq->tasklet_ctx.task, mlx5_cq_tasklet_cb,
-		     (unsigned long)&eq->tasklet_ctx);
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	if (type == MLX5_EQ_TYPE_PF) {
+		err = init_pf_ctx(&eq->pf_ctx, name);
+		if (err)
+			goto err_irq;
+	} else
+#endif
+	{
+		INIT_LIST_HEAD(&eq->tasklet_ctx.list);
+		INIT_LIST_HEAD(&eq->tasklet_ctx.process_list);
+		spin_lock_init(&eq->tasklet_ctx.lock);
+		tasklet_init(&eq->tasklet_ctx.task, mlx5_cq_tasklet_cb,
+			     (unsigned long)&eq->tasklet_ctx);
+	}
 
 	/* EQs are created in ARMED state
 	 */
@@ -444,7 +629,16 @@ int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
 		mlx5_core_warn(dev, "failed to destroy a previously created eq: eqn %d\n",
 			       eq->eqn);
 	synchronize_irq(eq->irqn);
-	tasklet_disable(&eq->tasklet_ctx.task);
+
+	if (eq->type == MLX5_EQ_TYPE_COMP) {
+		tasklet_disable(&eq->tasklet_ctx.task);
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	} else if (eq->type == MLX5_EQ_TYPE_PF) {
+		cancel_work_sync(&eq->pf_ctx.work);
+		destroy_workqueue(eq->pf_ctx.wq);
+		mempool_destroy(eq->pf_ctx.pool);
+#endif
+	}
 	mlx5_buf_free(dev, &eq->buf);
 
 	return err;
@@ -479,8 +673,6 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
 	u64 async_event_mask = MLX5_ASYNC_EVENT_MASK;
 	int err;
 
-	if (MLX5_CAP_GEN(dev, pg))
-		async_event_mask |= (1ull << MLX5_EVENT_TYPE_PAGE_FAULT);
 
 	if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH &&
 	    MLX5_CAP_GEN(dev, vport_group_manager) &&
@@ -494,7 +686,8 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
 
 	err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD,
 				 MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD,
-				 "mlx5_cmd_eq", &dev->priv.uuari.uars[0]);
+				 "mlx5_cmd_eq", &dev->priv.uuari.uars[0],
+				 MLX5_EQ_TYPE_ASYNC);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create cmd EQ %d\n", err);
 		return err;
@@ -504,7 +697,8 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
 
 	err = mlx5_create_map_eq(dev, &table->async_eq, MLX5_EQ_VEC_ASYNC,
 				 MLX5_NUM_ASYNC_EQE, async_event_mask,
-				 "mlx5_async_eq", &dev->priv.uuari.uars[0]);
+				 "mlx5_async_eq", &dev->priv.uuari.uars[0],
+				 MLX5_EQ_TYPE_ASYNC);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create async EQ %d\n", err);
 		goto err1;
@@ -514,13 +708,35 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
 				 MLX5_EQ_VEC_PAGES,
 				 /* TODO: sriov max_vf + */ 1,
 				 1 << MLX5_EVENT_TYPE_PAGE_REQUEST, "mlx5_pages_eq",
-				 &dev->priv.uuari.uars[0]);
+				 &dev->priv.uuari.uars[0],
+				 MLX5_EQ_TYPE_ASYNC);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create pages EQ %d\n", err);
 		goto err2;
 	}
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	if (MLX5_CAP_GEN(dev, pg)) {
+		err = mlx5_create_map_eq(dev, &table->pfault_eq,
+					 MLX5_EQ_VEC_PFAULT,
+					 MLX5_NUM_ASYNC_EQE,
+					 1 << MLX5_EVENT_TYPE_PAGE_FAULT,
+					 "mlx5_page_fault_eq",
+					 &dev->priv.uuari.uars[0],
+					 MLX5_EQ_TYPE_PF);
+		if (err) {
+			mlx5_core_warn(dev, "failed to create page fault EQ %d\n",
+				       err);
+			goto err3;
+		}
+	}
+
 	return err;
+err3:
+	mlx5_destroy_unmap_eq(dev, &table->pages_eq);
+#else
+	return err;
+#endif
 
 err2:
 	mlx5_destroy_unmap_eq(dev, &table->async_eq);
@@ -536,6 +752,14 @@ int mlx5_stop_eqs(struct mlx5_core_dev *dev)
 	struct mlx5_eq_table *table = &dev->priv.eq_table;
 	int err;
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	if (MLX5_CAP_GEN(dev, pg)) {
+		err = mlx5_destroy_unmap_eq(dev, &table->pfault_eq);
+		if (err)
+			return err;
+	}
+#endif
+
 	err = mlx5_destroy_unmap_eq(dev, &table->pages_eq);
 	if (err)
 		return err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 1713bd8d44a4..f4115135e30b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -753,7 +753,8 @@ static int alloc_comp_eqs(struct mlx5_core_dev *dev)
 		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d", i);
 		err = mlx5_create_map_eq(dev, eq,
 					 i + MLX5_EQ_VEC_COMP_BASE, nent, 0,
-					 name, &dev->priv.uuari.uars[0]);
+					 name, &dev->priv.uuari.uars[0],
+					 MLX5_EQ_TYPE_COMP);
 		if (err) {
 			kfree(eq);
 			goto clean;
@@ -1295,10 +1296,19 @@ static int init_one(struct pci_dev *pdev,
 	spin_lock_init(&priv->ctx_lock);
 	mutex_init(&dev->pci_status_mutex);
 	mutex_init(&dev->intf_state_mutex);
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	err = init_srcu_struct(&priv->pfault_srcu);
+	if (err) {
+		dev_err(&pdev->dev, "init_srcu_struct failed with error code %d\n",
+			err);
+		goto clean_dev;
+	}
+#endif
 	err = mlx5_pci_init(dev, priv);
 	if (err) {
 		dev_err(&pdev->dev, "mlx5_pci_init failed with error code %d\n", err);
-		goto clean_dev;
+		goto clean_srcu;
 	}
 
 	err = mlx5_health_init(dev);
@@ -1332,7 +1342,11 @@ clean_health:
 	mlx5_health_cleanup(dev);
 close_pci:
 	mlx5_pci_close(dev, priv);
+clean_srcu:
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	cleanup_srcu_struct(&priv->pfault_srcu);
 clean_dev:
+#endif
 	pci_set_drvdata(pdev, NULL);
 	devlink_free(devlink);
 
@@ -1357,6 +1371,9 @@ static void remove_one(struct pci_dev *pdev)
 	mlx5_pagealloc_cleanup(dev);
 	mlx5_health_cleanup(dev);
 	mlx5_pci_close(dev, priv);
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	cleanup_srcu_struct(&priv->pfault_srcu);
+#endif
 	pci_set_drvdata(pdev, NULL);
 	devlink_free(devlink);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index d4a99c9757cb..74241e82de63 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -86,6 +86,8 @@ int mlx5_cmd_init_hca(struct mlx5_core_dev *dev);
 int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev);
 void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event,
 		     unsigned long param);
+void mlx5_core_page_fault(struct mlx5_core_dev *dev,
+			  struct mlx5_pagefault *pfault);
 void mlx5_port_module_event(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe);
 void mlx5_enter_error_state(struct mlx5_core_dev *dev);
 void mlx5_disable_device(struct mlx5_core_dev *dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
index 5378a5f74bdc..cbbcef2884be 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
@@ -143,95 +143,6 @@ void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type)
 	mlx5_core_put_rsc(common);
 }
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-void mlx5_eq_pagefault(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe)
-{
-	struct mlx5_eqe_page_fault *pf_eqe = &eqe->data.page_fault;
-	int qpn = be32_to_cpu(pf_eqe->flags_qpn) & MLX5_QPN_MASK;
-	struct mlx5_core_rsc_common *common = mlx5_get_rsc(dev, qpn);
-	struct mlx5_core_qp *qp =
-		container_of(common, struct mlx5_core_qp, common);
-	struct mlx5_pagefault pfault;
-
-	if (!qp) {
-		mlx5_core_warn(dev, "ODP event for non-existent QP %06x\n",
-			       qpn);
-		return;
-	}
-
-	pfault.event_subtype = eqe->sub_type;
-	pfault.flags = (be32_to_cpu(pf_eqe->flags_qpn) >> MLX5_QPN_BITS) &
-		(MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE | MLX5_PFAULT_RDMA);
-	pfault.bytes_committed = be32_to_cpu(
-		pf_eqe->bytes_committed);
-
-	mlx5_core_dbg(dev,
-		      "PAGE_FAULT: subtype: 0x%02x, flags: 0x%02x,\n",
-		      eqe->sub_type, pfault.flags);
-
-	switch (eqe->sub_type) {
-	case MLX5_PFAULT_SUBTYPE_RDMA:
-		/* RDMA based event */
-		pfault.rdma.r_key =
-			be32_to_cpu(pf_eqe->rdma.r_key);
-		pfault.rdma.packet_size =
-			be16_to_cpu(pf_eqe->rdma.packet_length);
-		pfault.rdma.rdma_op_len =
-			be32_to_cpu(pf_eqe->rdma.rdma_op_len);
-		pfault.rdma.rdma_va =
-			be64_to_cpu(pf_eqe->rdma.rdma_va);
-		mlx5_core_dbg(dev,
-			      "PAGE_FAULT: qpn: 0x%06x, r_key: 0x%08x,\n",
-			      qpn, pfault.rdma.r_key);
-		mlx5_core_dbg(dev,
-			      "PAGE_FAULT: rdma_op_len: 0x%08x,\n",
-			      pfault.rdma.rdma_op_len);
-		mlx5_core_dbg(dev,
-			      "PAGE_FAULT: rdma_va: 0x%016llx,\n",
-			      pfault.rdma.rdma_va);
-		mlx5_core_dbg(dev,
-			      "PAGE_FAULT: bytes_committed: 0x%06x\n",
-			      pfault.bytes_committed);
-		break;
-
-	case MLX5_PFAULT_SUBTYPE_WQE:
-		/* WQE based event */
-		pfault.wqe.wqe_index =
-			be16_to_cpu(pf_eqe->wqe.wqe_index);
-		pfault.wqe.packet_size =
-			be16_to_cpu(pf_eqe->wqe.packet_length);
-		mlx5_core_dbg(dev,
-			      "PAGE_FAULT: qpn: 0x%06x, wqe_index: 0x%04x,\n",
-			      qpn, pfault.wqe.wqe_index);
-		mlx5_core_dbg(dev,
-			      "PAGE_FAULT: bytes_committed: 0x%06x\n",
-			      pfault.bytes_committed);
-		break;
-
-	default:
-		mlx5_core_warn(dev,
-			       "Unsupported page fault event sub-type: 0x%02hhx, QP %06x\n",
-			       eqe->sub_type, qpn);
-		/* Unsupported page faults should still be resolved by the
-		 * page fault handler
-		 */
-	}
-
-	if (qp->pfault_handler) {
-		qp->pfault_handler(qp, &pfault);
-	} else {
-		mlx5_core_err(dev,
-			      "ODP event for QP %08x, without a fault handler in QP\n",
-			      qpn);
-		/* Page fault will remain unresolved. QP will hang until it is
-		 * destroyed
-		 */
-	}
-
-	mlx5_core_put_rsc(common);
-}
-#endif
-
 static int create_qprqsq_common(struct mlx5_core_dev *dev,
 				struct mlx5_core_qp *qp,
 				int rsc_type)
@@ -506,25 +417,6 @@ int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn)
 }
 EXPORT_SYMBOL_GPL(mlx5_core_xrcd_dealloc);
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn,
-				u8 flags, int error)
-{
-	u32 out[MLX5_ST_SZ_DW(page_fault_resume_out)] = {0};
-	u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)]   = {0};
-
-	MLX5_SET(page_fault_resume_in, in, opcode,
-		 MLX5_CMD_OP_PAGE_FAULT_RESUME);
-	MLX5_SET(page_fault_resume_in, in, wq_number, qpn);
-	MLX5_SET(page_fault_resume_in, in, page_fault_type, flags);
-	if (error)
-		MLX5_SET(page_fault_resume_in, in, error, 1);
-
-	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
-}
-EXPORT_SYMBOL_GPL(mlx5_core_page_fault_resume);
-#endif
-
 int mlx5_core_create_rq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen,
 				struct mlx5_core_qp *rq)
 {
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 9f489365b3d3..3ccaeff15a80 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -534,7 +534,9 @@ struct mlx5_eqe_page_fault {
 			__be16  wqe_index;
 			u16	reserved2;
 			__be16  packet_length;
-			u8	reserved3[12];
+			__be32  token;
+			u8	reserved4[8];
+			__be32  pftype_wq;
 		} __packed wqe;
 		struct {
 			__be32  r_key;
@@ -542,9 +544,9 @@ struct mlx5_eqe_page_fault {
 			__be16  packet_length;
 			__be32  rdma_op_len;
 			__be64  rdma_va;
+			__be32  pftype_token;
 		} __packed rdma;
 	} __packed;
-	__be32 flags_qpn;
 } __packed;
 
 struct mlx5_eqe_vport_change {
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index ec52f3b50bf5..b52d07491fe7 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -42,6 +42,7 @@
 #include <linux/vmalloc.h>
 #include <linux/radix-tree.h>
 #include <linux/workqueue.h>
+#include <linux/mempool.h>
 #include <linux/interrupt.h>
 
 #include <linux/mlx5/device.h>
@@ -83,6 +84,7 @@ enum {
 	MLX5_EQ_VEC_PAGES	 = 0,
 	MLX5_EQ_VEC_CMD		 = 1,
 	MLX5_EQ_VEC_ASYNC	 = 2,
+	MLX5_EQ_VEC_PFAULT	 = 3,
 	MLX5_EQ_VEC_COMP_BASE,
 };
 
@@ -178,6 +180,14 @@ enum mlx5_port_status {
 	MLX5_PORT_DOWN      = 2,
 };
 
+enum mlx5_eq_type {
+	MLX5_EQ_TYPE_COMP,
+	MLX5_EQ_TYPE_ASYNC,
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	MLX5_EQ_TYPE_PF,
+#endif
+};
+
 struct mlx5_uuar_info {
 	struct mlx5_uar	       *uars;
 	int			num_uars;
@@ -333,6 +343,14 @@ struct mlx5_eq_tasklet {
 	spinlock_t lock;
 };
 
+struct mlx5_eq_pagefault {
+	struct work_struct       work;
+	/* Pagefaults lock */
+	spinlock_t		 lock;
+	struct workqueue_struct *wq;
+	mempool_t		*pool;
+};
+
 struct mlx5_eq {
 	struct mlx5_core_dev   *dev;
 	__be32 __iomem	       *doorbell;
@@ -346,7 +364,13 @@ struct mlx5_eq {
 	struct list_head	list;
 	int			index;
 	struct mlx5_rsc_debug	*dbg;
-	struct mlx5_eq_tasklet	tasklet_ctx;
+	enum mlx5_eq_type	type;
+	union {
+		struct mlx5_eq_tasklet   tasklet_ctx;
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+		struct mlx5_eq_pagefault pf_ctx;
+#endif
+	};
 };
 
 struct mlx5_core_psv {
@@ -377,6 +401,8 @@ struct mlx5_core_mkey {
 	u32			pd;
 };
 
+#define MLX5_24BIT_MASK		((1 << 24) - 1)
+
 enum mlx5_res_type {
 	MLX5_RES_QP	= MLX5_EVENT_QUEUE_TYPE_QP,
 	MLX5_RES_RQ	= MLX5_EVENT_QUEUE_TYPE_RQ,
@@ -411,6 +437,9 @@ struct mlx5_eq_table {
 	struct mlx5_eq		pages_eq;
 	struct mlx5_eq		async_eq;
 	struct mlx5_eq		cmd_eq;
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	struct mlx5_eq		pfault_eq;
+#endif
 	int			num_comp_vectors;
 	/* protect EQs list
 	 */
@@ -497,6 +526,7 @@ struct mlx5_fc_stats {
 
 struct mlx5_eswitch;
 struct mlx5_lag;
+struct mlx5_pagefault;
 
 struct mlx5_rl_entry {
 	u32                     rate;
@@ -601,6 +631,14 @@ struct mlx5_priv {
 	struct mlx5_rl_table            rl_table;
 
 	struct mlx5_port_module_event_stats  pme_stats;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	void		      (*pfault)(struct mlx5_core_dev *dev,
+					void *context,
+					struct mlx5_pagefault *pfault);
+	void		       *pfault_ctx;
+	struct srcu_struct      pfault_srcu;
+#endif
 };
 
 enum mlx5_device_state {
@@ -619,6 +657,50 @@ enum mlx5_pci_status {
 	MLX5_PCI_STATUS_ENABLED,
 };
 
+enum mlx5_pagefault_type_flags {
+	MLX5_PFAULT_REQUESTOR = 1 << 0,
+	MLX5_PFAULT_WRITE     = 1 << 1,
+	MLX5_PFAULT_RDMA      = 1 << 2,
+};
+
+/* Contains the details of a pagefault. */
+struct mlx5_pagefault {
+	u32			bytes_committed;
+	u32			token;
+	u8			event_subtype;
+	u8			type;
+	union {
+		/* Initiator or send message responder pagefault details. */
+		struct {
+			/* Received packet size, only valid for responders. */
+			u32	packet_size;
+			/*
+			 * Number of resource holding WQE, depends on type.
+			 */
+			u32	wq_num;
+			/*
+			 * WQE index. Refers to either the send queue or
+			 * receive queue, according to event_subtype.
+			 */
+			u16	wqe_index;
+		} wqe;
+		/* RDMA responder pagefault details */
+		struct {
+			u32	r_key;
+			/*
+			 * Received packet size, minimal size page fault
+			 * resolution required for forward progress.
+			 */
+			u32	packet_size;
+			u32	rdma_op_len;
+			u64	rdma_va;
+		} rdma;
+	};
+
+	struct mlx5_eq	       *eq;
+	struct work_struct	work;
+};
+
 struct mlx5_td {
 	struct list_head tirs_list;
 	u32              tdn;
@@ -879,15 +961,13 @@ void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas);
 void mlx5_fill_page_frag_array(struct mlx5_frag_buf *frag_buf, __be64 *pas);
 void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn);
 void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type);
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-void mlx5_eq_pagefault(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe);
-#endif
 void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type);
 struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn);
 void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec);
 void mlx5_cq_event(struct mlx5_core_dev *dev, u32 cqn, int event_type);
 int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
-		       int nent, u64 mask, const char *name, struct mlx5_uar *uar);
+		       int nent, u64 mask, const char *name,
+		       struct mlx5_uar *uar, enum mlx5_eq_type type);
 int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
 int mlx5_start_eqs(struct mlx5_core_dev *dev);
 int mlx5_stop_eqs(struct mlx5_core_dev *dev);
@@ -926,6 +1006,10 @@ int mlx5_query_odp_caps(struct mlx5_core_dev *dev,
 			struct mlx5_odp_caps *odp_caps);
 int mlx5_core_query_ib_ppcnt(struct mlx5_core_dev *dev,
 			     u8 port_num, void *out, size_t sz);
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 token,
+				u32 wq_num, u8 type, int error);
+#endif
 
 int mlx5_init_rl_table(struct mlx5_core_dev *dev);
 void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev);
@@ -974,6 +1058,9 @@ struct mlx5_interface {
 	void			(*detach)(struct mlx5_core_dev *dev, void *context);
 	void			(*event)(struct mlx5_core_dev *dev, void *context,
 					 enum mlx5_dev_event event, unsigned long param);
+	void			(*pfault)(struct mlx5_core_dev *dev,
+					  void *context,
+					  struct mlx5_pagefault *pfault);
 	void *                  (*get_dev)(void *context);
 	int			protocol;
 	struct list_head	list;
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index 693811e0cb24..9ed775f5cb66 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -50,9 +50,6 @@
 #define MLX5_BSF_APPTAG_ESCAPE	0x1
 #define MLX5_BSF_APPREF_ESCAPE	0x2
 
-#define MLX5_QPN_BITS		24
-#define MLX5_QPN_MASK		((1 << MLX5_QPN_BITS) - 1)
-
 enum mlx5_qp_optpar {
 	MLX5_QP_OPTPAR_ALT_ADDR_PATH		= 1 << 0,
 	MLX5_QP_OPTPAR_RRE			= 1 << 1,
@@ -418,46 +415,9 @@ struct mlx5_stride_block_ctrl_seg {
 	__be16		num_entries;
 };
 
-enum mlx5_pagefault_flags {
-	MLX5_PFAULT_REQUESTOR = 1 << 0,
-	MLX5_PFAULT_WRITE     = 1 << 1,
-	MLX5_PFAULT_RDMA      = 1 << 2,
-};
-
-/* Contains the details of a pagefault. */
-struct mlx5_pagefault {
-	u32			bytes_committed;
-	u8			event_subtype;
-	enum mlx5_pagefault_flags flags;
-	union {
-		/* Initiator or send message responder pagefault details. */
-		struct {
-			/* Received packet size, only valid for responders. */
-			u32	packet_size;
-			/*
-			 * WQE index. Refers to either the send queue or
-			 * receive queue, according to event_subtype.
-			 */
-			u16	wqe_index;
-		} wqe;
-		/* RDMA responder pagefault details */
-		struct {
-			u32	r_key;
-			/*
-			 * Received packet size, minimal size page fault
-			 * resolution required for forward progress.
-			 */
-			u32	packet_size;
-			u32	rdma_op_len;
-			u64	rdma_va;
-		} rdma;
-	};
-};
-
 struct mlx5_core_qp {
 	struct mlx5_core_rsc_common	common; /* must be first */
 	void (*event)		(struct mlx5_core_qp *, int);
-	void (*pfault_handler)(struct mlx5_core_qp *, struct mlx5_pagefault *);
 	int			qpn;
 	struct mlx5_rsc_debug	*dbg;
 	int			pid;
@@ -557,10 +517,6 @@ void mlx5_init_qp_table(struct mlx5_core_dev *dev);
 void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev);
 int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp);
 void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp);
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn,
-				u8 context, int error);
-#endif
 int mlx5_core_create_rq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen,
 				struct mlx5_core_qp *rq);
 void mlx5_core_destroy_rq_tracked(struct mlx5_core_dev *dev,
-- 
cgit v1.2.3


From 17d2f88f92ce39b348f125f6b2e6eeb6b0906ac7 Mon Sep 17 00:00:00 2001
From: Artemy Kovalyov <artemyko@mellanox.com>
Date: Mon, 2 Jan 2017 11:37:47 +0200
Subject: IB/mlx5: Add ODP atomics support

Handle ODP atomic operations. When initiator of RDMA atomic
operation use ODP MR to provide source data handle pagefault properly.

Signed-off-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx5/odp.c | 88 +++++++++++++++++++++++-----------------
 include/linux/mlx5/mlx5_ifc.h    |  2 +-
 include/linux/mlx5/qp.h          | 18 ++++++++
 3 files changed, 69 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 26f96c79a45a..971b2885f474 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -144,6 +144,9 @@ void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read))
 		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;
 
+	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
+		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
+
 	return;
 }
 
@@ -386,6 +389,17 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev,
 	return ret < 0 ? ret : npages;
 }
 
+static const u32 mlx5_ib_odp_opcode_cap[] = {
+	[MLX5_OPCODE_SEND]	       = IB_ODP_SUPPORT_SEND,
+	[MLX5_OPCODE_SEND_IMM]	       = IB_ODP_SUPPORT_SEND,
+	[MLX5_OPCODE_SEND_INVAL]       = IB_ODP_SUPPORT_SEND,
+	[MLX5_OPCODE_RDMA_WRITE]       = IB_ODP_SUPPORT_WRITE,
+	[MLX5_OPCODE_RDMA_WRITE_IMM]   = IB_ODP_SUPPORT_WRITE,
+	[MLX5_OPCODE_RDMA_READ]	       = IB_ODP_SUPPORT_READ,
+	[MLX5_OPCODE_ATOMIC_CS]	       = IB_ODP_SUPPORT_ATOMIC,
+	[MLX5_OPCODE_ATOMIC_FA]	       = IB_ODP_SUPPORT_ATOMIC,
+};
+
 /*
  * Parse initiator WQE. Advances the wqe pointer to point at the
  * scatter-gather list, and set wqe_end to the end of the WQE.
@@ -396,6 +410,8 @@ static int mlx5_ib_mr_initiator_pfault_handler(
 {
 	struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
 	u16 wqe_index = pfault->wqe.wqe_index;
+	u32 transport_caps;
+	struct mlx5_base_av *av;
 	unsigned ds, opcode;
 #if defined(DEBUG)
 	u32 ctrl_wqe_index, ctrl_qpn;
@@ -441,53 +457,49 @@ static int mlx5_ib_mr_initiator_pfault_handler(
 
 	opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
 		 MLX5_WQE_CTRL_OPCODE_MASK;
+
 	switch (qp->ibqp.qp_type) {
 	case IB_QPT_RC:
-		switch (opcode) {
-		case MLX5_OPCODE_SEND:
-		case MLX5_OPCODE_SEND_IMM:
-		case MLX5_OPCODE_SEND_INVAL:
-			if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
-			      IB_ODP_SUPPORT_SEND))
-				goto invalid_transport_or_opcode;
-			break;
-		case MLX5_OPCODE_RDMA_WRITE:
-		case MLX5_OPCODE_RDMA_WRITE_IMM:
-			if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
-			      IB_ODP_SUPPORT_WRITE))
-				goto invalid_transport_or_opcode;
-			*wqe += sizeof(struct mlx5_wqe_raddr_seg);
-			break;
-		case MLX5_OPCODE_RDMA_READ:
-			if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
-			      IB_ODP_SUPPORT_READ))
-				goto invalid_transport_or_opcode;
-			*wqe += sizeof(struct mlx5_wqe_raddr_seg);
-			break;
-		default:
-			goto invalid_transport_or_opcode;
-		}
+		transport_caps = dev->odp_caps.per_transport_caps.rc_odp_caps;
 		break;
 	case IB_QPT_UD:
-		switch (opcode) {
-		case MLX5_OPCODE_SEND:
-		case MLX5_OPCODE_SEND_IMM:
-			if (!(dev->odp_caps.per_transport_caps.ud_odp_caps &
-			      IB_ODP_SUPPORT_SEND))
-				goto invalid_transport_or_opcode;
-			*wqe += sizeof(struct mlx5_wqe_datagram_seg);
-			break;
-		default:
-			goto invalid_transport_or_opcode;
-		}
+		transport_caps = dev->odp_caps.per_transport_caps.ud_odp_caps;
 		break;
 	default:
-invalid_transport_or_opcode:
-		mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode or transport. transport: 0x%x opcode: 0x%x.\n",
-			    qp->ibqp.qp_type, opcode);
+		mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport 0x%x\n",
+			    qp->ibqp.qp_type);
+		return -EFAULT;
+	}
+
+	if (unlikely(opcode >= sizeof(mlx5_ib_odp_opcode_cap) /
+	    sizeof(mlx5_ib_odp_opcode_cap[0]) ||
+	    !(transport_caps & mlx5_ib_odp_opcode_cap[opcode]))) {
+		mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode 0x%x\n",
+			    opcode);
 		return -EFAULT;
 	}
 
+	if (qp->ibqp.qp_type != IB_QPT_RC) {
+		av = *wqe;
+		if (av->dqp_dct & be32_to_cpu(MLX5_WQE_AV_EXT))
+			*wqe += sizeof(struct mlx5_av);
+		else
+			*wqe += sizeof(struct mlx5_base_av);
+	}
+
+	switch (opcode) {
+	case MLX5_OPCODE_RDMA_WRITE:
+	case MLX5_OPCODE_RDMA_WRITE_IMM:
+	case MLX5_OPCODE_RDMA_READ:
+		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
+		break;
+	case MLX5_OPCODE_ATOMIC_CS:
+	case MLX5_OPCODE_ATOMIC_FA:
+		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
+		*wqe += sizeof(struct mlx5_wqe_atomic_seg);
+		break;
+	}
+
 	return 0;
 }
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 608dc988b3d6..15f896781966 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -328,7 +328,7 @@ struct mlx5_ifc_odp_per_transport_service_cap_bits {
 	u8         receive[0x1];
 	u8         write[0x1];
 	u8         read[0x1];
-	u8         reserved_at_4[0x1];
+	u8         atomic[0x1];
 	u8         srq_receive[0x1];
 	u8         reserved_at_6[0x1a];
 };
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index 9ed775f5cb66..219c699c17b7 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -212,6 +212,7 @@ struct mlx5_wqe_ctrl_seg {
 #define MLX5_WQE_CTRL_OPCODE_MASK 0xff
 #define MLX5_WQE_CTRL_WQE_INDEX_MASK 0x00ffff00
 #define MLX5_WQE_CTRL_WQE_INDEX_SHIFT 8
+#define MLX5_WQE_AV_EXT 0x80000000
 
 enum {
 	MLX5_ETH_WQE_L3_INNER_CSUM      = 1 << 4,
@@ -242,6 +243,23 @@ struct mlx5_wqe_masked_atomic_seg {
 	__be64			compare_mask;
 };
 
+struct mlx5_base_av {
+	union {
+		struct {
+			__be32	qkey;
+			__be32	reserved;
+		} qkey;
+		__be64	dc_key;
+	} key;
+	__be32	dqp_dct;
+	u8	stat_rate_sl;
+	u8	fl_mlid;
+	union {
+		__be16	rlid;
+		__be16  udp_sport;
+	};
+};
+
 struct mlx5_av {
 	union {
 		struct {
-- 
cgit v1.2.3


From aa8e08d2f523501c40b0e70f1c4ecacb97195931 Mon Sep 17 00:00:00 2001
From: Artemy Kovalyov <artemyko@mellanox.com>
Date: Mon, 2 Jan 2017 11:37:48 +0200
Subject: IB/mlx5: Improve MR check

Add "type" field to mlx5_core MKEY struct.
Check whether page fault happens on MKEY corresponding to MR.

Signed-off-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx5/mr.c  | 5 +++++
 drivers/infiniband/hw/mlx5/odp.c | 9 +++++++--
 include/linux/mlx5/driver.h      | 6 ++++++
 3 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index f4ecc10cbbba..8cf2a67f9fb0 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -129,6 +129,7 @@ static void reg_mr_callback(int status, void *context)
 		return;
 	}
 
+	mr->mmkey.type = MLX5_MKEY_MR;
 	spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags);
 	key = dev->mdev->priv.mkey_key++;
 	spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags);
@@ -728,6 +729,7 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
 		goto err_in;
 
 	kfree(in);
+	mr->mmkey.type = MLX5_MKEY_MR;
 	mr->ibmr.lkey = mr->mmkey.key;
 	mr->ibmr.rkey = mr->mmkey.key;
 	mr->umem = NULL;
@@ -1088,6 +1090,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
 		mlx5_ib_warn(dev, "create mkey failed\n");
 		goto err_2;
 	}
+	mr->mmkey.type = MLX5_MKEY_MR;
 	mr->umem = umem;
 	mr->dev = dev;
 	mr->live = 1;
@@ -1533,6 +1536,7 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd,
 	if (err)
 		goto err_destroy_psv;
 
+	mr->mmkey.type = MLX5_MKEY_MR;
 	mr->ibmr.lkey = mr->mmkey.key;
 	mr->ibmr.rkey = mr->mmkey.key;
 	mr->umem = NULL;
@@ -1613,6 +1617,7 @@ struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
 	if (err)
 		goto free;
 
+	mw->mmkey.type = MLX5_MKEY_MW;
 	mw->ibmw.rkey = mw->mmkey.key;
 
 	resp.response_length = min(offsetof(typeof(resp), response_length) +
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 971b2885f474..e5bc267aca73 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -155,9 +155,14 @@ static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
 {
 	u32 base_key = mlx5_base_mkey(key);
 	struct mlx5_core_mkey *mmkey = __mlx5_mr_lookup(dev->mdev, base_key);
-	struct mlx5_ib_mr *mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
+	struct mlx5_ib_mr *mr;
+
+	if (!mmkey || mmkey->key != key || mmkey->type != MLX5_MKEY_MR)
+		return NULL;
+
+	mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
 
-	if (!mmkey || mmkey->key != key || !mr->live)
+	if (!mr->live)
 		return NULL;
 
 	return container_of(mmkey, struct mlx5_ib_mr, mmkey);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index b52d07491fe7..cfa49bca009c 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -394,11 +394,17 @@ struct mlx5_core_sig_ctx {
 	u32			sigerr_count;
 };
 
+enum {
+	MLX5_MKEY_MR = 1,
+	MLX5_MKEY_MW,
+};
+
 struct mlx5_core_mkey {
 	u64			iova;
 	u64			size;
 	u32			key;
 	u32			pd;
+	u32			type;
 };
 
 #define MLX5_24BIT_MASK		((1 << 24) - 1)
-- 
cgit v1.2.3


From 8e4881aa1d5d2f9c7ebfd0fe5e138f0cc345832c Mon Sep 17 00:00:00 2001
From: Philippe Reynes <tremyfr@gmail.com>
Date: Sun, 1 Jan 2017 19:02:45 +0100
Subject: net: mdio: add mdio45_ethtool_ksettings_get

There is a function in mdio for the old ethtool api gset.
We add a new function mdio45_ethtool_ksettings_get for the
new ethtool api glinksettings.

Signed-off-by: Philippe Reynes <tremyfr@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/mdio.c   | 178 +++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mdio.h |  21 ++++++
 2 files changed, 199 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/mdio.c b/drivers/net/mdio.c
index 3e027ed0b3bb..077364cbf439 100644
--- a/drivers/net/mdio.c
+++ b/drivers/net/mdio.c
@@ -341,6 +341,184 @@ void mdio45_ethtool_gset_npage(const struct mdio_if_info *mdio,
 }
 EXPORT_SYMBOL(mdio45_ethtool_gset_npage);
 
+/**
+ * mdio45_ethtool_ksettings_get_npage - get settings for ETHTOOL_GLINKSETTINGS
+ * @mdio: MDIO interface
+ * @cmd: Ethtool request structure
+ * @npage_adv: Modes currently advertised on next pages
+ * @npage_lpa: Modes advertised by link partner on next pages
+ *
+ * The @cmd parameter is expected to have been cleared before calling
+ * mdio45_ethtool_ksettings_get_npage().
+ *
+ * Since the CSRs for auto-negotiation using next pages are not fully
+ * standardised, this function does not attempt to decode them.  The
+ * caller must pass them in.
+ */
+void mdio45_ethtool_ksettings_get_npage(const struct mdio_if_info *mdio,
+					struct ethtool_link_ksettings *cmd,
+					u32 npage_adv, u32 npage_lpa)
+{
+	int reg;
+	u32 speed, supported = 0, advertising = 0, lp_advertising = 0;
+
+	BUILD_BUG_ON(MDIO_SUPPORTS_C22 != ETH_MDIO_SUPPORTS_C22);
+	BUILD_BUG_ON(MDIO_SUPPORTS_C45 != ETH_MDIO_SUPPORTS_C45);
+
+	cmd->base.phy_address = mdio->prtad;
+	cmd->base.mdio_support =
+		mdio->mode_support & (MDIO_SUPPORTS_C45 | MDIO_SUPPORTS_C22);
+
+	reg = mdio->mdio_read(mdio->dev, mdio->prtad, MDIO_MMD_PMAPMD,
+			      MDIO_CTRL2);
+	switch (reg & MDIO_PMA_CTRL2_TYPE) {
+	case MDIO_PMA_CTRL2_10GBT:
+	case MDIO_PMA_CTRL2_1000BT:
+	case MDIO_PMA_CTRL2_100BTX:
+	case MDIO_PMA_CTRL2_10BT:
+		cmd->base.port = PORT_TP;
+		supported = SUPPORTED_TP;
+		reg = mdio->mdio_read(mdio->dev, mdio->prtad, MDIO_MMD_PMAPMD,
+				      MDIO_SPEED);
+		if (reg & MDIO_SPEED_10G)
+			supported |= SUPPORTED_10000baseT_Full;
+		if (reg & MDIO_PMA_SPEED_1000)
+			supported |= (SUPPORTED_1000baseT_Full |
+					    SUPPORTED_1000baseT_Half);
+		if (reg & MDIO_PMA_SPEED_100)
+			supported |= (SUPPORTED_100baseT_Full |
+					    SUPPORTED_100baseT_Half);
+		if (reg & MDIO_PMA_SPEED_10)
+			supported |= (SUPPORTED_10baseT_Full |
+					    SUPPORTED_10baseT_Half);
+		advertising = ADVERTISED_TP;
+		break;
+
+	case MDIO_PMA_CTRL2_10GBCX4:
+		cmd->base.port = PORT_OTHER;
+		supported = 0;
+		advertising = 0;
+		break;
+
+	case MDIO_PMA_CTRL2_10GBKX4:
+	case MDIO_PMA_CTRL2_10GBKR:
+	case MDIO_PMA_CTRL2_1000BKX:
+		cmd->base.port = PORT_OTHER;
+		supported = SUPPORTED_Backplane;
+		reg = mdio->mdio_read(mdio->dev, mdio->prtad, MDIO_MMD_PMAPMD,
+				      MDIO_PMA_EXTABLE);
+		if (reg & MDIO_PMA_EXTABLE_10GBKX4)
+			supported |= SUPPORTED_10000baseKX4_Full;
+		if (reg & MDIO_PMA_EXTABLE_10GBKR)
+			supported |= SUPPORTED_10000baseKR_Full;
+		if (reg & MDIO_PMA_EXTABLE_1000BKX)
+			supported |= SUPPORTED_1000baseKX_Full;
+		reg = mdio->mdio_read(mdio->dev, mdio->prtad, MDIO_MMD_PMAPMD,
+				      MDIO_PMA_10GBR_FECABLE);
+		if (reg & MDIO_PMA_10GBR_FECABLE_ABLE)
+			supported |= SUPPORTED_10000baseR_FEC;
+		advertising = ADVERTISED_Backplane;
+		break;
+
+	/* All the other defined modes are flavours of optical */
+	default:
+		cmd->base.port = PORT_FIBRE;
+		supported = SUPPORTED_FIBRE;
+		advertising = ADVERTISED_FIBRE;
+		break;
+	}
+
+	if (mdio->mmds & MDIO_DEVS_AN) {
+		supported |= SUPPORTED_Autoneg;
+		reg = mdio->mdio_read(mdio->dev, mdio->prtad, MDIO_MMD_AN,
+				      MDIO_CTRL1);
+		if (reg & MDIO_AN_CTRL1_ENABLE) {
+			cmd->base.autoneg = AUTONEG_ENABLE;
+			advertising |=
+				ADVERTISED_Autoneg |
+				mdio45_get_an(mdio, MDIO_AN_ADVERTISE) |
+				npage_adv;
+		} else {
+			cmd->base.autoneg = AUTONEG_DISABLE;
+		}
+	} else {
+		cmd->base.autoneg = AUTONEG_DISABLE;
+	}
+
+	if (cmd->base.autoneg) {
+		u32 modes = 0;
+		int an_stat = mdio->mdio_read(mdio->dev, mdio->prtad,
+					      MDIO_MMD_AN, MDIO_STAT1);
+
+		/* If AN is complete and successful, report best common
+		 * mode, otherwise report best advertised mode.
+		 */
+		if (an_stat & MDIO_AN_STAT1_COMPLETE) {
+			lp_advertising =
+				mdio45_get_an(mdio, MDIO_AN_LPA) | npage_lpa;
+			if (an_stat & MDIO_AN_STAT1_LPABLE)
+				lp_advertising |= ADVERTISED_Autoneg;
+			modes = advertising & lp_advertising;
+		}
+		if ((modes & ~ADVERTISED_Autoneg) == 0)
+			modes = advertising;
+
+		if (modes & (ADVERTISED_10000baseT_Full |
+			     ADVERTISED_10000baseKX4_Full |
+			     ADVERTISED_10000baseKR_Full)) {
+			speed = SPEED_10000;
+			cmd->base.duplex = DUPLEX_FULL;
+		} else if (modes & (ADVERTISED_1000baseT_Full |
+				    ADVERTISED_1000baseT_Half |
+				    ADVERTISED_1000baseKX_Full)) {
+			speed = SPEED_1000;
+			cmd->base.duplex = !(modes & ADVERTISED_1000baseT_Half);
+		} else if (modes & (ADVERTISED_100baseT_Full |
+				    ADVERTISED_100baseT_Half)) {
+			speed = SPEED_100;
+			cmd->base.duplex = !!(modes & ADVERTISED_100baseT_Full);
+		} else {
+			speed = SPEED_10;
+			cmd->base.duplex = !!(modes & ADVERTISED_10baseT_Full);
+		}
+	} else {
+		/* Report forced settings */
+		reg = mdio->mdio_read(mdio->dev, mdio->prtad, MDIO_MMD_PMAPMD,
+				      MDIO_CTRL1);
+		speed = (((reg & MDIO_PMA_CTRL1_SPEED1000) ? 100 : 1)
+			 * ((reg & MDIO_PMA_CTRL1_SPEED100) ? 100 : 10));
+		cmd->base.duplex = (reg & MDIO_CTRL1_FULLDPLX ||
+				    speed == SPEED_10000);
+	}
+
+	cmd->base.speed = speed;
+
+	ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.supported,
+						supported);
+	ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.advertising,
+						advertising);
+	ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.lp_advertising,
+						lp_advertising);
+
+	/* 10GBASE-T MDI/MDI-X */
+	if (cmd->base.port == PORT_TP && (cmd->base.speed == SPEED_10000)) {
+		switch (mdio->mdio_read(mdio->dev, mdio->prtad, MDIO_MMD_PMAPMD,
+					MDIO_PMA_10GBT_SWAPPOL)) {
+		case MDIO_PMA_10GBT_SWAPPOL_ABNX | MDIO_PMA_10GBT_SWAPPOL_CDNX:
+			cmd->base.eth_tp_mdix = ETH_TP_MDI;
+			break;
+		case 0:
+			cmd->base.eth_tp_mdix = ETH_TP_MDI_X;
+			break;
+		default:
+			/* It's complicated... */
+			cmd->base.eth_tp_mdix = ETH_TP_MDI_INVALID;
+			break;
+		}
+	}
+}
+EXPORT_SYMBOL(mdio45_ethtool_ksettings_get_npage);
+
 /**
  * mdio_mii_ioctl - MII ioctl interface for MDIO (clause 22 or 45) PHYs
  * @mdio: MDIO interface
diff --git a/include/linux/mdio.h b/include/linux/mdio.h
index bf9d1d750693..b6587a4b32e7 100644
--- a/include/linux/mdio.h
+++ b/include/linux/mdio.h
@@ -130,6 +130,10 @@ extern int mdio45_nway_restart(const struct mdio_if_info *mdio);
 extern void mdio45_ethtool_gset_npage(const struct mdio_if_info *mdio,
 				      struct ethtool_cmd *ecmd,
 				      u32 npage_adv, u32 npage_lpa);
+extern void
+mdio45_ethtool_ksettings_get_npage(const struct mdio_if_info *mdio,
+				   struct ethtool_link_ksettings *cmd,
+				   u32 npage_adv, u32 npage_lpa);
 
 /**
  * mdio45_ethtool_gset - get settings for ETHTOOL_GSET
@@ -147,6 +151,23 @@ static inline void mdio45_ethtool_gset(const struct mdio_if_info *mdio,
 	mdio45_ethtool_gset_npage(mdio, ecmd, 0, 0);
 }
 
+/**
+ * mdio45_ethtool_ksettings_get - get settings for ETHTOOL_GLINKSETTINGS
+ * @mdio: MDIO interface
+ * @cmd: Ethtool request structure
+ *
+ * Since the CSRs for auto-negotiation using next pages are not fully
+ * standardised, this function does not attempt to decode them.  Use
+ * mdio45_ethtool_ksettings_get_npage() to specify advertisement bits
+ * from next pages.
+ */
+static inline void
+mdio45_ethtool_ksettings_get(const struct mdio_if_info *mdio,
+			     struct ethtool_link_ksettings *cmd)
+{
+	mdio45_ethtool_ksettings_get_npage(mdio, cmd, 0, 0);
+}
+
 extern int mdio_mii_ioctl(const struct mdio_if_info *mdio,
 			  struct mii_ioctl_data *mii_data, int cmd);
 
-- 
cgit v1.2.3


From 7b73305160f11b633e9801b2c6b83d5b0cb867cc Mon Sep 17 00:00:00 2001
From: Jean Delvare <jdelvare@suse.de>
Date: Fri, 16 Dec 2016 18:45:21 +0100
Subject: module: Drop redundant declaration of struct module

Struct module is already declared at the beginning of the file, no
need to declare it again.

Signed-off-by: Jean Delvare <jdelvare@suse.de>
Fixes: 93c2e105f6bc ("module: Optimize __module_address() using a latched RB-tree")
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Jessica Yu <jeyu@redhat.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Jessica Yu <jeyu@redhat.com>
---
 include/linux/module.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index 7c84273d60b9..ef599379f9a4 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -282,8 +282,6 @@ enum module_state {
 	MODULE_STATE_UNFORMED,	/* Still setting it up. */
 };
 
-struct module;
-
 struct mod_tree_node {
 	struct module *mod;
 	struct latch_tree_node node;
-- 
cgit v1.2.3


From 8afda8b26d01ee26a60ef2f0284a7f01a5ed96f8 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Mon, 28 Nov 2016 15:06:24 +0300
Subject: spi-nor: Add support for Intel SPI serial flash controller

Add support for the SPI serial flash host controller found on many Intel
CPUs including Baytrail and Braswell. The SPI serial flash controller is
used to access BIOS and other platform specific information. By default the
driver exposes a single read-only MTD device but with a module parameter
"writeable=1" the MTD device can be made read-write which makes it possible
to upgrade BIOS directly from Linux.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Acked-by: Cyrille Pitchen <cyrille.pitchen@atmel.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 Documentation/mtd/intel-spi.txt          |  88 ++++
 drivers/mtd/spi-nor/Kconfig              |  20 +
 drivers/mtd/spi-nor/Makefile             |   2 +
 drivers/mtd/spi-nor/intel-spi-platform.c |  57 +++
 drivers/mtd/spi-nor/intel-spi.c          | 777 +++++++++++++++++++++++++++++++
 drivers/mtd/spi-nor/intel-spi.h          |  24 +
 include/linux/platform_data/intel-spi.h  |  31 ++
 7 files changed, 999 insertions(+)
 create mode 100644 Documentation/mtd/intel-spi.txt
 create mode 100644 drivers/mtd/spi-nor/intel-spi-platform.c
 create mode 100644 drivers/mtd/spi-nor/intel-spi.c
 create mode 100644 drivers/mtd/spi-nor/intel-spi.h
 create mode 100644 include/linux/platform_data/intel-spi.h

(limited to 'include/linux')

diff --git a/Documentation/mtd/intel-spi.txt b/Documentation/mtd/intel-spi.txt
new file mode 100644
index 000000000000..bc357729c2cb
--- /dev/null
+++ b/Documentation/mtd/intel-spi.txt
@@ -0,0 +1,88 @@
+Upgrading BIOS using intel-spi
+------------------------------
+
+Many Intel CPUs like Baytrail and Braswell include SPI serial flash host
+controller which is used to hold BIOS and other platform specific data.
+Since contents of the SPI serial flash is crucial for machine to function,
+it is typically protected by different hardware protection mechanisms to
+avoid accidental (or on purpose) overwrite of the content.
+
+Not all manufacturers protect the SPI serial flash, mainly because it
+allows upgrading the BIOS image directly from an OS.
+
+The intel-spi driver makes it possible to read and write the SPI serial
+flash, if certain protection bits are not set and locked. If it finds
+any of them set, the whole MTD device is made read-only to prevent
+partial overwrites. By default the driver exposes SPI serial flash
+contents as read-only but it can be changed from kernel command line,
+passing "intel-spi.writeable=1".
+
+Please keep in mind that overwriting the BIOS image on SPI serial flash
+might render the machine unbootable and requires special equipment like
+Dediprog to revive. You have been warned!
+
+Below are the steps how to upgrade MinnowBoard MAX BIOS directly from
+Linux.
+
+ 1) Download and extract the latest Minnowboard MAX BIOS SPI image
+    [1]. At the time writing this the latest image is v92.
+
+ 2) Install mtd-utils package [2]. We need this in order to erase the SPI
+    serial flash. Distros like Debian and Fedora have this prepackaged with
+    name "mtd-utils".
+
+ 3) Add "intel-spi.writeable=1" to the kernel command line and reboot
+    the board (you can also reload the driver passing "writeable=1" as
+    module parameter to modprobe).
+
+ 4) Once the board is up and running again, find the right MTD partition
+    (it is named as "BIOS"):
+
+    # cat /proc/mtd
+    dev:    size   erasesize  name
+    mtd0: 00800000 00001000 "BIOS"
+
+    So here it will be /dev/mtd0 but it may vary.
+
+ 5) Make backup of the existing image first:
+
+    # dd if=/dev/mtd0ro of=bios.bak
+    16384+0 records in
+    16384+0 records out
+    8388608 bytes (8.4 MB) copied, 10.0269 s, 837 kB/s
+
+ 6) Verify the backup
+
+    # sha1sum /dev/mtd0ro bios.bak
+    fdbb011920572ca6c991377c4b418a0502668b73  /dev/mtd0ro
+    fdbb011920572ca6c991377c4b418a0502668b73  bios.bak
+
+    The SHA1 sums must match. Otherwise do not continue any further!
+
+ 7) Erase the SPI serial flash. After this step, do not reboot the
+    board! Otherwise it will not start anymore.
+
+    # flash_erase /dev/mtd0 0 0
+    Erasing 4 Kibyte @ 7ff000 -- 100 % complete
+
+ 8) Once completed without errors you can write the new BIOS image:
+
+    # dd if=MNW2MAX1.X64.0092.R01.1605221712.bin of=/dev/mtd0
+
+ 9) Verify that the new content of the SPI serial flash matches the new
+    BIOS image:
+
+    # sha1sum /dev/mtd0ro MNW2MAX1.X64.0092.R01.1605221712.bin
+    9b4df9e4be2057fceec3a5529ec3d950836c87a2  /dev/mtd0ro
+    9b4df9e4be2057fceec3a5529ec3d950836c87a2 MNW2MAX1.X64.0092.R01.1605221712.bin
+
+    The SHA1 sums should match.
+
+ 10) Now you can reboot your board and observe the new BIOS starting up
+     properly.
+
+References
+----------
+
+[1] https://firmware.intel.com/sites/default/files/MinnowBoard.MAX_.X64.92.R01.zip
+[2] http://www.linux-mtd.infradead.org/
diff --git a/drivers/mtd/spi-nor/Kconfig b/drivers/mtd/spi-nor/Kconfig
index 4a682ee0f632..02013ffa5231 100644
--- a/drivers/mtd/spi-nor/Kconfig
+++ b/drivers/mtd/spi-nor/Kconfig
@@ -76,4 +76,24 @@ config SPI_NXP_SPIFI
 	  Flash. Enable this option if you have a device with a SPIFI
 	  controller and want to access the Flash as a mtd device.
 
+config SPI_INTEL_SPI
+	tristate
+
+config SPI_INTEL_SPI_PLATFORM
+	tristate "Intel PCH/PCU SPI flash platform driver" if EXPERT
+	depends on X86
+	select SPI_INTEL_SPI
+	help
+	  This enables platform support for the Intel PCH/PCU SPI
+	  controller in master mode. This controller is present in modern
+	  Intel hardware and is used to hold BIOS and other persistent
+	  settings. Using this driver it is possible to upgrade BIOS
+	  directly from Linux.
+
+	  Say N here unless you know what you are doing. Overwriting the
+	  SPI flash may render the system unbootable.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called intel-spi-platform.
+
 endif # MTD_SPI_NOR
diff --git a/drivers/mtd/spi-nor/Makefile b/drivers/mtd/spi-nor/Makefile
index 121695e83542..1796e8cd6e1a 100644
--- a/drivers/mtd/spi-nor/Makefile
+++ b/drivers/mtd/spi-nor/Makefile
@@ -5,3 +5,5 @@ obj-$(CONFIG_SPI_FSL_QUADSPI)	+= fsl-quadspi.o
 obj-$(CONFIG_SPI_HISI_SFC)	+= hisi-sfc.o
 obj-$(CONFIG_MTD_MT81xx_NOR)    += mtk-quadspi.o
 obj-$(CONFIG_SPI_NXP_SPIFI)	+= nxp-spifi.o
+obj-$(CONFIG_SPI_INTEL_SPI)	+= intel-spi.o
+obj-$(CONFIG_SPI_INTEL_SPI_PLATFORM)	+= intel-spi-platform.o
diff --git a/drivers/mtd/spi-nor/intel-spi-platform.c b/drivers/mtd/spi-nor/intel-spi-platform.c
new file mode 100644
index 000000000000..5c943df9398f
--- /dev/null
+++ b/drivers/mtd/spi-nor/intel-spi-platform.c
@@ -0,0 +1,57 @@
+/*
+ * Intel PCH/PCU SPI flash platform driver.
+ *
+ * Copyright (C) 2016, Intel Corporation
+ * Author: Mika Westerberg <mika.westerberg@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/ioport.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+
+#include "intel-spi.h"
+
+static int intel_spi_platform_probe(struct platform_device *pdev)
+{
+	struct intel_spi_boardinfo *info;
+	struct intel_spi *ispi;
+	struct resource *mem;
+
+	info = dev_get_platdata(&pdev->dev);
+	if (!info)
+		return -EINVAL;
+
+	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	ispi = intel_spi_probe(&pdev->dev, mem, info);
+	if (IS_ERR(ispi))
+		return PTR_ERR(ispi);
+
+	platform_set_drvdata(pdev, ispi);
+	return 0;
+}
+
+static int intel_spi_platform_remove(struct platform_device *pdev)
+{
+	struct intel_spi *ispi = platform_get_drvdata(pdev);
+
+	return intel_spi_remove(ispi);
+}
+
+static struct platform_driver intel_spi_platform_driver = {
+	.probe = intel_spi_platform_probe,
+	.remove = intel_spi_platform_remove,
+	.driver = {
+		.name = "intel-spi",
+	},
+};
+
+module_platform_driver(intel_spi_platform_driver);
+
+MODULE_DESCRIPTION("Intel PCH/PCU SPI flash platform driver");
+MODULE_AUTHOR("Mika Westerberg <mika.westerberg@linux.intel.com>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("platform:intel-spi");
diff --git a/drivers/mtd/spi-nor/intel-spi.c b/drivers/mtd/spi-nor/intel-spi.c
new file mode 100644
index 000000000000..a10f6027b386
--- /dev/null
+++ b/drivers/mtd/spi-nor/intel-spi.c
@@ -0,0 +1,777 @@
+/*
+ * Intel PCH/PCU SPI flash driver.
+ *
+ * Copyright (C) 2016, Intel Corporation
+ * Author: Mika Westerberg <mika.westerberg@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/iopoll.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/sizes.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/partitions.h>
+#include <linux/mtd/spi-nor.h>
+#include <linux/platform_data/intel-spi.h>
+
+#include "intel-spi.h"
+
+/* Offsets are from @ispi->base */
+#define BFPREG				0x00
+
+#define HSFSTS_CTL			0x04
+#define HSFSTS_CTL_FSMIE		BIT(31)
+#define HSFSTS_CTL_FDBC_SHIFT		24
+#define HSFSTS_CTL_FDBC_MASK		(0x3f << HSFSTS_CTL_FDBC_SHIFT)
+
+#define HSFSTS_CTL_FCYCLE_SHIFT		17
+#define HSFSTS_CTL_FCYCLE_MASK		(0x0f << HSFSTS_CTL_FCYCLE_SHIFT)
+/* HW sequencer opcodes */
+#define HSFSTS_CTL_FCYCLE_READ		(0x00 << HSFSTS_CTL_FCYCLE_SHIFT)
+#define HSFSTS_CTL_FCYCLE_WRITE		(0x02 << HSFSTS_CTL_FCYCLE_SHIFT)
+#define HSFSTS_CTL_FCYCLE_ERASE		(0x03 << HSFSTS_CTL_FCYCLE_SHIFT)
+#define HSFSTS_CTL_FCYCLE_ERASE_64K	(0x04 << HSFSTS_CTL_FCYCLE_SHIFT)
+#define HSFSTS_CTL_FCYCLE_RDID		(0x06 << HSFSTS_CTL_FCYCLE_SHIFT)
+#define HSFSTS_CTL_FCYCLE_WRSR		(0x07 << HSFSTS_CTL_FCYCLE_SHIFT)
+#define HSFSTS_CTL_FCYCLE_RDSR		(0x08 << HSFSTS_CTL_FCYCLE_SHIFT)
+
+#define HSFSTS_CTL_FGO			BIT(16)
+#define HSFSTS_CTL_FLOCKDN		BIT(15)
+#define HSFSTS_CTL_FDV			BIT(14)
+#define HSFSTS_CTL_SCIP			BIT(5)
+#define HSFSTS_CTL_AEL			BIT(2)
+#define HSFSTS_CTL_FCERR		BIT(1)
+#define HSFSTS_CTL_FDONE		BIT(0)
+
+#define FADDR				0x08
+#define DLOCK				0x0c
+#define FDATA(n)			(0x10 + ((n) * 4))
+
+#define FRACC				0x50
+
+#define FREG(n)				(0x54 + ((n) * 4))
+#define FREG_BASE_MASK			0x3fff
+#define FREG_LIMIT_SHIFT		16
+#define FREG_LIMIT_MASK			(0x03fff << FREG_LIMIT_SHIFT)
+
+/* Offset is from @ispi->pregs */
+#define PR(n)				((n) * 4)
+#define PR_WPE				BIT(31)
+#define PR_LIMIT_SHIFT			16
+#define PR_LIMIT_MASK			(0x3fff << PR_LIMIT_SHIFT)
+#define PR_RPE				BIT(15)
+#define PR_BASE_MASK			0x3fff
+/* Last PR is GPR0 */
+#define PR_NUM				(5 + 1)
+
+/* Offsets are from @ispi->sregs */
+#define SSFSTS_CTL			0x00
+#define SSFSTS_CTL_FSMIE		BIT(23)
+#define SSFSTS_CTL_DS			BIT(22)
+#define SSFSTS_CTL_DBC_SHIFT		16
+#define SSFSTS_CTL_SPOP			BIT(11)
+#define SSFSTS_CTL_ACS			BIT(10)
+#define SSFSTS_CTL_SCGO			BIT(9)
+#define SSFSTS_CTL_COP_SHIFT		12
+#define SSFSTS_CTL_FRS			BIT(7)
+#define SSFSTS_CTL_DOFRS		BIT(6)
+#define SSFSTS_CTL_AEL			BIT(4)
+#define SSFSTS_CTL_FCERR		BIT(3)
+#define SSFSTS_CTL_FDONE		BIT(2)
+#define SSFSTS_CTL_SCIP			BIT(0)
+
+#define PREOP_OPTYPE			0x04
+#define OPMENU0				0x08
+#define OPMENU1				0x0c
+
+/* CPU specifics */
+#define BYT_PR				0x74
+#define BYT_SSFSTS_CTL			0x90
+#define BYT_BCR				0xfc
+#define BYT_BCR_WPD			BIT(0)
+#define BYT_FREG_NUM			5
+
+#define LPT_PR				0x74
+#define LPT_SSFSTS_CTL			0x90
+#define LPT_FREG_NUM			5
+
+#define BXT_PR				0x84
+#define BXT_SSFSTS_CTL			0xa0
+#define BXT_FREG_NUM			12
+
+#define INTEL_SPI_TIMEOUT		5000 /* ms */
+#define INTEL_SPI_FIFO_SZ		64
+
+/**
+ * struct intel_spi - Driver private data
+ * @dev: Device pointer
+ * @info: Pointer to board specific info
+ * @nor: SPI NOR layer structure
+ * @base: Beginning of MMIO space
+ * @pregs: Start of protection registers
+ * @sregs: Start of software sequencer registers
+ * @nregions: Maximum number of regions
+ * @writeable: Is the chip writeable
+ * @swseq: Use SW sequencer in register reads/writes
+ * @erase_64k: 64k erase supported
+ * @opcodes: Opcodes which are supported. This are programmed by BIOS
+ *           before it locks down the controller.
+ * @preopcodes: Preopcodes which are supported.
+ */
+struct intel_spi {
+	struct device *dev;
+	const struct intel_spi_boardinfo *info;
+	struct spi_nor nor;
+	void __iomem *base;
+	void __iomem *pregs;
+	void __iomem *sregs;
+	size_t nregions;
+	bool writeable;
+	bool swseq;
+	bool erase_64k;
+	u8 opcodes[8];
+	u8 preopcodes[2];
+};
+
+static bool writeable;
+module_param(writeable, bool, 0);
+MODULE_PARM_DESC(writeable, "Enable write access to SPI flash chip (default=0)");
+
+static void intel_spi_dump_regs(struct intel_spi *ispi)
+{
+	u32 value;
+	int i;
+
+	dev_dbg(ispi->dev, "BFPREG=0x%08x\n", readl(ispi->base + BFPREG));
+
+	value = readl(ispi->base + HSFSTS_CTL);
+	dev_dbg(ispi->dev, "HSFSTS_CTL=0x%08x\n", value);
+	if (value & HSFSTS_CTL_FLOCKDN)
+		dev_dbg(ispi->dev, "-> Locked\n");
+
+	dev_dbg(ispi->dev, "FADDR=0x%08x\n", readl(ispi->base + FADDR));
+	dev_dbg(ispi->dev, "DLOCK=0x%08x\n", readl(ispi->base + DLOCK));
+
+	for (i = 0; i < 16; i++)
+		dev_dbg(ispi->dev, "FDATA(%d)=0x%08x\n",
+			i, readl(ispi->base + FDATA(i)));
+
+	dev_dbg(ispi->dev, "FRACC=0x%08x\n", readl(ispi->base + FRACC));
+
+	for (i = 0; i < ispi->nregions; i++)
+		dev_dbg(ispi->dev, "FREG(%d)=0x%08x\n", i,
+			readl(ispi->base + FREG(i)));
+	for (i = 0; i < PR_NUM; i++)
+		dev_dbg(ispi->dev, "PR(%d)=0x%08x\n", i,
+			readl(ispi->pregs + PR(i)));
+
+	value = readl(ispi->sregs + SSFSTS_CTL);
+	dev_dbg(ispi->dev, "SSFSTS_CTL=0x%08x\n", value);
+	dev_dbg(ispi->dev, "PREOP_OPTYPE=0x%08x\n",
+		readl(ispi->sregs + PREOP_OPTYPE));
+	dev_dbg(ispi->dev, "OPMENU0=0x%08x\n", readl(ispi->sregs + OPMENU0));
+	dev_dbg(ispi->dev, "OPMENU1=0x%08x\n", readl(ispi->sregs + OPMENU1));
+
+	if (ispi->info->type == INTEL_SPI_BYT)
+		dev_dbg(ispi->dev, "BCR=0x%08x\n", readl(ispi->base + BYT_BCR));
+
+	dev_dbg(ispi->dev, "Protected regions:\n");
+	for (i = 0; i < PR_NUM; i++) {
+		u32 base, limit;
+
+		value = readl(ispi->pregs + PR(i));
+		if (!(value & (PR_WPE | PR_RPE)))
+			continue;
+
+		limit = (value & PR_LIMIT_MASK) >> PR_LIMIT_SHIFT;
+		base = value & PR_BASE_MASK;
+
+		dev_dbg(ispi->dev, " %02d base: 0x%08x limit: 0x%08x [%c%c]\n",
+			 i, base << 12, (limit << 12) | 0xfff,
+			 value & PR_WPE ? 'W' : '.',
+			 value & PR_RPE ? 'R' : '.');
+	}
+
+	dev_dbg(ispi->dev, "Flash regions:\n");
+	for (i = 0; i < ispi->nregions; i++) {
+		u32 region, base, limit;
+
+		region = readl(ispi->base + FREG(i));
+		base = region & FREG_BASE_MASK;
+		limit = (region & FREG_LIMIT_MASK) >> FREG_LIMIT_SHIFT;
+
+		if (base >= limit || (i > 0 && limit == 0))
+			dev_dbg(ispi->dev, " %02d disabled\n", i);
+		else
+			dev_dbg(ispi->dev, " %02d base: 0x%08x limit: 0x%08x\n",
+				 i, base << 12, (limit << 12) | 0xfff);
+	}
+
+	dev_dbg(ispi->dev, "Using %cW sequencer for register access\n",
+		ispi->swseq ? 'S' : 'H');
+}
+
+/* Reads max INTEL_SPI_FIFO_SZ bytes from the device fifo */
+static int intel_spi_read_block(struct intel_spi *ispi, void *buf, size_t size)
+{
+	size_t bytes;
+	int i = 0;
+
+	if (size > INTEL_SPI_FIFO_SZ)
+		return -EINVAL;
+
+	while (size > 0) {
+		bytes = min_t(size_t, size, 4);
+		memcpy_fromio(buf, ispi->base + FDATA(i), bytes);
+		size -= bytes;
+		buf += bytes;
+		i++;
+	}
+
+	return 0;
+}
+
+/* Writes max INTEL_SPI_FIFO_SZ bytes to the device fifo */
+static int intel_spi_write_block(struct intel_spi *ispi, const void *buf,
+				 size_t size)
+{
+	size_t bytes;
+	int i = 0;
+
+	if (size > INTEL_SPI_FIFO_SZ)
+		return -EINVAL;
+
+	while (size > 0) {
+		bytes = min_t(size_t, size, 4);
+		memcpy_toio(ispi->base + FDATA(i), buf, bytes);
+		size -= bytes;
+		buf += bytes;
+		i++;
+	}
+
+	return 0;
+}
+
+static int intel_spi_wait_hw_busy(struct intel_spi *ispi)
+{
+	u32 val;
+
+	return readl_poll_timeout(ispi->base + HSFSTS_CTL, val,
+				  !(val & HSFSTS_CTL_SCIP), 0,
+				  INTEL_SPI_TIMEOUT * 1000);
+}
+
+static int intel_spi_wait_sw_busy(struct intel_spi *ispi)
+{
+	u32 val;
+
+	return readl_poll_timeout(ispi->sregs + SSFSTS_CTL, val,
+				  !(val & SSFSTS_CTL_SCIP), 0,
+				  INTEL_SPI_TIMEOUT * 1000);
+}
+
+static int intel_spi_init(struct intel_spi *ispi)
+{
+	u32 opmenu0, opmenu1, val;
+	int i;
+
+	switch (ispi->info->type) {
+	case INTEL_SPI_BYT:
+		ispi->sregs = ispi->base + BYT_SSFSTS_CTL;
+		ispi->pregs = ispi->base + BYT_PR;
+		ispi->nregions = BYT_FREG_NUM;
+
+		if (writeable) {
+			/* Disable write protection */
+			val = readl(ispi->base + BYT_BCR);
+			if (!(val & BYT_BCR_WPD)) {
+				val |= BYT_BCR_WPD;
+				writel(val, ispi->base + BYT_BCR);
+				val = readl(ispi->base + BYT_BCR);
+			}
+
+			ispi->writeable = !!(val & BYT_BCR_WPD);
+		}
+
+		break;
+
+	case INTEL_SPI_LPT:
+		ispi->sregs = ispi->base + LPT_SSFSTS_CTL;
+		ispi->pregs = ispi->base + LPT_PR;
+		ispi->nregions = LPT_FREG_NUM;
+		break;
+
+	case INTEL_SPI_BXT:
+		ispi->sregs = ispi->base + BXT_SSFSTS_CTL;
+		ispi->pregs = ispi->base + BXT_PR;
+		ispi->nregions = BXT_FREG_NUM;
+		ispi->erase_64k = true;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	/* Disable #SMI generation */
+	val = readl(ispi->base + HSFSTS_CTL);
+	val &= ~HSFSTS_CTL_FSMIE;
+	writel(val, ispi->base + HSFSTS_CTL);
+
+	/*
+	 * BIOS programs allowed opcodes and then locks down the register.
+	 * So read back what opcodes it decided to support. That's the set
+	 * we are going to support as well.
+	 */
+	opmenu0 = readl(ispi->sregs + OPMENU0);
+	opmenu1 = readl(ispi->sregs + OPMENU1);
+
+	/*
+	 * Some controllers can only do basic operations using hardware
+	 * sequencer. All other operations are supposed to be carried out
+	 * using software sequencer. If we find that BIOS has programmed
+	 * opcodes for the software sequencer we use that over the hardware
+	 * sequencer.
+	 */
+	if (opmenu0 && opmenu1) {
+		for (i = 0; i < ARRAY_SIZE(ispi->opcodes) / 2; i++) {
+			ispi->opcodes[i] = opmenu0 >> i * 8;
+			ispi->opcodes[i + 4] = opmenu1 >> i * 8;
+		}
+
+		val = readl(ispi->sregs + PREOP_OPTYPE);
+		ispi->preopcodes[0] = val;
+		ispi->preopcodes[1] = val >> 8;
+
+		/* Disable #SMI generation from SW sequencer */
+		val = readl(ispi->sregs + SSFSTS_CTL);
+		val &= ~SSFSTS_CTL_FSMIE;
+		writel(val, ispi->sregs + SSFSTS_CTL);
+
+		ispi->swseq = true;
+	}
+
+	intel_spi_dump_regs(ispi);
+
+	return 0;
+}
+
+static int intel_spi_opcode_index(struct intel_spi *ispi, u8 opcode)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(ispi->opcodes); i++)
+		if (ispi->opcodes[i] == opcode)
+			return i;
+	return -EINVAL;
+}
+
+static int intel_spi_hw_cycle(struct intel_spi *ispi, u8 opcode, u8 *buf,
+			      int len)
+{
+	u32 val, status;
+	int ret;
+
+	val = readl(ispi->base + HSFSTS_CTL);
+	val &= ~(HSFSTS_CTL_FCYCLE_MASK | HSFSTS_CTL_FDBC_MASK);
+
+	switch (opcode) {
+	case SPINOR_OP_RDID:
+		val |= HSFSTS_CTL_FCYCLE_RDID;
+		break;
+	case SPINOR_OP_WRSR:
+		val |= HSFSTS_CTL_FCYCLE_WRSR;
+		break;
+	case SPINOR_OP_RDSR:
+		val |= HSFSTS_CTL_FCYCLE_RDSR;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	val |= (len - 1) << HSFSTS_CTL_FDBC_SHIFT;
+	val |= HSFSTS_CTL_FCERR | HSFSTS_CTL_FDONE;
+	val |= HSFSTS_CTL_FGO;
+	writel(val, ispi->base + HSFSTS_CTL);
+
+	ret = intel_spi_wait_hw_busy(ispi);
+	if (ret)
+		return ret;
+
+	status = readl(ispi->base + HSFSTS_CTL);
+	if (status & HSFSTS_CTL_FCERR)
+		return -EIO;
+	else if (status & HSFSTS_CTL_AEL)
+		return -EACCES;
+
+	return 0;
+}
+
+static int intel_spi_sw_cycle(struct intel_spi *ispi, u8 opcode, u8 *buf,
+			      int len)
+{
+	u32 val, status;
+	int ret;
+
+	ret = intel_spi_opcode_index(ispi, opcode);
+	if (ret < 0)
+		return ret;
+
+	val = (len << SSFSTS_CTL_DBC_SHIFT) | SSFSTS_CTL_DS;
+	val |= ret << SSFSTS_CTL_COP_SHIFT;
+	val |= SSFSTS_CTL_FCERR | SSFSTS_CTL_FDONE;
+	val |= SSFSTS_CTL_SCGO;
+	writel(val, ispi->sregs + SSFSTS_CTL);
+
+	ret = intel_spi_wait_sw_busy(ispi);
+	if (ret)
+		return ret;
+
+	status = readl(ispi->base + SSFSTS_CTL);
+	if (status & SSFSTS_CTL_FCERR)
+		return -EIO;
+	else if (status & SSFSTS_CTL_AEL)
+		return -EACCES;
+
+	return 0;
+}
+
+static int intel_spi_read_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len)
+{
+	struct intel_spi *ispi = nor->priv;
+	int ret;
+
+	/* Address of the first chip */
+	writel(0, ispi->base + FADDR);
+
+	if (ispi->swseq)
+		ret = intel_spi_sw_cycle(ispi, opcode, buf, len);
+	else
+		ret = intel_spi_hw_cycle(ispi, opcode, buf, len);
+
+	if (ret)
+		return ret;
+
+	return intel_spi_read_block(ispi, buf, len);
+}
+
+static int intel_spi_write_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len)
+{
+	struct intel_spi *ispi = nor->priv;
+	int ret;
+
+	/*
+	 * This is handled with atomic operation and preop code in Intel
+	 * controller so skip it here now.
+	 */
+	if (opcode == SPINOR_OP_WREN)
+		return 0;
+
+	writel(0, ispi->base + FADDR);
+
+	/* Write the value beforehand */
+	ret = intel_spi_write_block(ispi, buf, len);
+	if (ret)
+		return ret;
+
+	if (ispi->swseq)
+		return intel_spi_sw_cycle(ispi, opcode, buf, len);
+	return intel_spi_hw_cycle(ispi, opcode, buf, len);
+}
+
+static ssize_t intel_spi_read(struct spi_nor *nor, loff_t from, size_t len,
+			      u_char *read_buf)
+{
+	struct intel_spi *ispi = nor->priv;
+	size_t block_size, retlen = 0;
+	u32 val, status;
+	ssize_t ret;
+
+	switch (nor->read_opcode) {
+	case SPINOR_OP_READ:
+	case SPINOR_OP_READ_FAST:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	while (len > 0) {
+		block_size = min_t(size_t, len, INTEL_SPI_FIFO_SZ);
+
+		writel(from, ispi->base + FADDR);
+
+		val = readl(ispi->base + HSFSTS_CTL);
+		val &= ~(HSFSTS_CTL_FDBC_MASK | HSFSTS_CTL_FCYCLE_MASK);
+		val |= HSFSTS_CTL_AEL | HSFSTS_CTL_FCERR | HSFSTS_CTL_FDONE;
+		val |= (block_size - 1) << HSFSTS_CTL_FDBC_SHIFT;
+		val |= HSFSTS_CTL_FCYCLE_READ;
+		val |= HSFSTS_CTL_FGO;
+		writel(val, ispi->base + HSFSTS_CTL);
+
+		ret = intel_spi_wait_hw_busy(ispi);
+		if (ret)
+			return ret;
+
+		status = readl(ispi->base + HSFSTS_CTL);
+		if (status & HSFSTS_CTL_FCERR)
+			ret = -EIO;
+		else if (status & HSFSTS_CTL_AEL)
+			ret = -EACCES;
+
+		if (ret < 0) {
+			dev_err(ispi->dev, "read error: %llx: %#x\n", from,
+				status);
+			return ret;
+		}
+
+		ret = intel_spi_read_block(ispi, read_buf, block_size);
+		if (ret)
+			return ret;
+
+		len -= block_size;
+		from += block_size;
+		retlen += block_size;
+		read_buf += block_size;
+	}
+
+	return retlen;
+}
+
+static ssize_t intel_spi_write(struct spi_nor *nor, loff_t to, size_t len,
+			       const u_char *write_buf)
+{
+	struct intel_spi *ispi = nor->priv;
+	size_t block_size, retlen = 0;
+	u32 val, status;
+	ssize_t ret;
+
+	while (len > 0) {
+		block_size = min_t(size_t, len, INTEL_SPI_FIFO_SZ);
+
+		writel(to, ispi->base + FADDR);
+
+		val = readl(ispi->base + HSFSTS_CTL);
+		val &= ~(HSFSTS_CTL_FDBC_MASK | HSFSTS_CTL_FCYCLE_MASK);
+		val |= HSFSTS_CTL_AEL | HSFSTS_CTL_FCERR | HSFSTS_CTL_FDONE;
+		val |= (block_size - 1) << HSFSTS_CTL_FDBC_SHIFT;
+		val |= HSFSTS_CTL_FCYCLE_WRITE;
+
+		/* Write enable */
+		if (ispi->preopcodes[1] == SPINOR_OP_WREN)
+			val |= SSFSTS_CTL_SPOP;
+		val |= SSFSTS_CTL_ACS;
+		writel(val, ispi->base + HSFSTS_CTL);
+
+		ret = intel_spi_write_block(ispi, write_buf, block_size);
+		if (ret) {
+			dev_err(ispi->dev, "failed to write block\n");
+			return ret;
+		}
+
+		/* Start the write now */
+		val = readl(ispi->base + HSFSTS_CTL);
+		writel(val | HSFSTS_CTL_FGO, ispi->base + HSFSTS_CTL);
+
+		ret = intel_spi_wait_hw_busy(ispi);
+		if (ret) {
+			dev_err(ispi->dev, "timeout\n");
+			return ret;
+		}
+
+		status = readl(ispi->base + HSFSTS_CTL);
+		if (status & HSFSTS_CTL_FCERR)
+			ret = -EIO;
+		else if (status & HSFSTS_CTL_AEL)
+			ret = -EACCES;
+
+		if (ret < 0) {
+			dev_err(ispi->dev, "write error: %llx: %#x\n", to,
+				status);
+			return ret;
+		}
+
+		len -= block_size;
+		to += block_size;
+		retlen += block_size;
+		write_buf += block_size;
+	}
+
+	return retlen;
+}
+
+static int intel_spi_erase(struct spi_nor *nor, loff_t offs)
+{
+	size_t erase_size, len = nor->mtd.erasesize;
+	struct intel_spi *ispi = nor->priv;
+	u32 val, status, cmd;
+	int ret;
+
+	/* If the hardware can do 64k erase use that when possible */
+	if (len >= SZ_64K && ispi->erase_64k) {
+		cmd = HSFSTS_CTL_FCYCLE_ERASE_64K;
+		erase_size = SZ_64K;
+	} else {
+		cmd = HSFSTS_CTL_FCYCLE_ERASE;
+		erase_size = SZ_4K;
+	}
+
+	while (len > 0) {
+		writel(offs, ispi->base + FADDR);
+
+		val = readl(ispi->base + HSFSTS_CTL);
+		val &= ~(HSFSTS_CTL_FDBC_MASK | HSFSTS_CTL_FCYCLE_MASK);
+		val |= HSFSTS_CTL_AEL | HSFSTS_CTL_FCERR | HSFSTS_CTL_FDONE;
+		val |= cmd;
+		val |= HSFSTS_CTL_FGO;
+		writel(val, ispi->base + HSFSTS_CTL);
+
+		ret = intel_spi_wait_hw_busy(ispi);
+		if (ret)
+			return ret;
+
+		status = readl(ispi->base + HSFSTS_CTL);
+		if (status & HSFSTS_CTL_FCERR)
+			return -EIO;
+		else if (status & HSFSTS_CTL_AEL)
+			return -EACCES;
+
+		offs += erase_size;
+		len -= erase_size;
+	}
+
+	return 0;
+}
+
+static bool intel_spi_is_protected(const struct intel_spi *ispi,
+				   unsigned int base, unsigned int limit)
+{
+	int i;
+
+	for (i = 0; i < PR_NUM; i++) {
+		u32 pr_base, pr_limit, pr_value;
+
+		pr_value = readl(ispi->pregs + PR(i));
+		if (!(pr_value & (PR_WPE | PR_RPE)))
+			continue;
+
+		pr_limit = (pr_value & PR_LIMIT_MASK) >> PR_LIMIT_SHIFT;
+		pr_base = pr_value & PR_BASE_MASK;
+
+		if (pr_base >= base && pr_limit <= limit)
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * There will be a single partition holding all enabled flash regions. We
+ * call this "BIOS".
+ */
+static void intel_spi_fill_partition(struct intel_spi *ispi,
+				     struct mtd_partition *part)
+{
+	u64 end;
+	int i;
+
+	memset(part, 0, sizeof(*part));
+
+	/* Start from the mandatory descriptor region */
+	part->size = 4096;
+	part->name = "BIOS";
+
+	/*
+	 * Now try to find where this partition ends based on the flash
+	 * region registers.
+	 */
+	for (i = 1; i < ispi->nregions; i++) {
+		u32 region, base, limit;
+
+		region = readl(ispi->base + FREG(i));
+		base = region & FREG_BASE_MASK;
+		limit = (region & FREG_LIMIT_MASK) >> FREG_LIMIT_SHIFT;
+
+		if (base >= limit || limit == 0)
+			continue;
+
+		/*
+		 * If any of the regions have protection bits set, make the
+		 * whole partition read-only to be on the safe side.
+		 */
+		if (intel_spi_is_protected(ispi, base, limit))
+			ispi->writeable = 0;
+
+		end = (limit << 12) + 4096;
+		if (end > part->size)
+			part->size = end;
+	}
+}
+
+struct intel_spi *intel_spi_probe(struct device *dev,
+	struct resource *mem, const struct intel_spi_boardinfo *info)
+{
+	struct mtd_partition part;
+	struct intel_spi *ispi;
+	int ret;
+
+	if (!info || !mem)
+		return ERR_PTR(-EINVAL);
+
+	ispi = devm_kzalloc(dev, sizeof(*ispi), GFP_KERNEL);
+	if (!ispi)
+		return ERR_PTR(-ENOMEM);
+
+	ispi->base = devm_ioremap_resource(dev, mem);
+	if (IS_ERR(ispi->base))
+		return ispi->base;
+
+	ispi->dev = dev;
+	ispi->info = info;
+	ispi->writeable = info->writeable;
+
+	ret = intel_spi_init(ispi);
+	if (ret)
+		return ERR_PTR(ret);
+
+	ispi->nor.dev = ispi->dev;
+	ispi->nor.priv = ispi;
+	ispi->nor.read_reg = intel_spi_read_reg;
+	ispi->nor.write_reg = intel_spi_write_reg;
+	ispi->nor.read = intel_spi_read;
+	ispi->nor.write = intel_spi_write;
+	ispi->nor.erase = intel_spi_erase;
+
+	ret = spi_nor_scan(&ispi->nor, NULL, SPI_NOR_NORMAL);
+	if (ret) {
+		dev_info(dev, "failed to locate the chip\n");
+		return ERR_PTR(ret);
+	}
+
+	intel_spi_fill_partition(ispi, &part);
+
+	/* Prevent writes if not explicitly enabled */
+	if (!ispi->writeable || !writeable)
+		ispi->nor.mtd.flags &= ~MTD_WRITEABLE;
+
+	ret = mtd_device_parse_register(&ispi->nor.mtd, NULL, NULL, &part, 1);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return ispi;
+}
+EXPORT_SYMBOL_GPL(intel_spi_probe);
+
+int intel_spi_remove(struct intel_spi *ispi)
+{
+	return mtd_device_unregister(&ispi->nor.mtd);
+}
+EXPORT_SYMBOL_GPL(intel_spi_remove);
+
+MODULE_DESCRIPTION("Intel PCH/PCU SPI flash core driver");
+MODULE_AUTHOR("Mika Westerberg <mika.westerberg@linux.intel.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/mtd/spi-nor/intel-spi.h b/drivers/mtd/spi-nor/intel-spi.h
new file mode 100644
index 000000000000..5ab7dc250050
--- /dev/null
+++ b/drivers/mtd/spi-nor/intel-spi.h
@@ -0,0 +1,24 @@
+/*
+ * Intel PCH/PCU SPI flash driver.
+ *
+ * Copyright (C) 2016, Intel Corporation
+ * Author: Mika Westerberg <mika.westerberg@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef INTEL_SPI_H
+#define INTEL_SPI_H
+
+#include <linux/platform_data/intel-spi.h>
+
+struct intel_spi;
+struct resource;
+
+struct intel_spi *intel_spi_probe(struct device *dev,
+	struct resource *mem, const struct intel_spi_boardinfo *info);
+int intel_spi_remove(struct intel_spi *ispi);
+
+#endif /* INTEL_SPI_H */
diff --git a/include/linux/platform_data/intel-spi.h b/include/linux/platform_data/intel-spi.h
new file mode 100644
index 000000000000..942b0c3f8f08
--- /dev/null
+++ b/include/linux/platform_data/intel-spi.h
@@ -0,0 +1,31 @@
+/*
+ * Intel PCH/PCU SPI flash driver.
+ *
+ * Copyright (C) 2016, Intel Corporation
+ * Author: Mika Westerberg <mika.westerberg@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef INTEL_SPI_PDATA_H
+#define INTEL_SPI_PDATA_H
+
+enum intel_spi_type {
+	INTEL_SPI_BYT = 1,
+	INTEL_SPI_LPT,
+	INTEL_SPI_BXT,
+};
+
+/**
+ * struct intel_spi_boardinfo - Board specific data for Intel SPI driver
+ * @type: Type which this controller is compatible with
+ * @writeable: The chip is writeable
+ */
+struct intel_spi_boardinfo {
+	enum intel_spi_type type;
+	bool writeable;
+};
+
+#endif /* INTEL_SPI_PDATA_H */
-- 
cgit v1.2.3


From ff00d7a32a1b88b772981a13fc198e0d29300666 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Mon, 28 Nov 2016 15:06:25 +0300
Subject: mfd: lpc_ich: Add support for SPI serial flash host controller

Many Intel CPUs including Haswell, Broadwell and Baytrail have SPI serial
flash host controller as part of the LPC device. This will populate an MFD
cell suitable for the SPI host controller driver if we know that the LPC
device has one.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Acked-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/lpc_ich.c       | 92 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/lpc_ich.h |  3 ++
 2 files changed, 95 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/lpc_ich.c b/drivers/mfd/lpc_ich.c
index 1ef7575547e6..985135828cfc 100644
--- a/drivers/mfd/lpc_ich.c
+++ b/drivers/mfd/lpc_ich.c
@@ -83,6 +83,15 @@
 #define ACPIBASE_GCS_OFF	0x3410
 #define ACPIBASE_GCS_END	0x3414
 
+#define SPIBASE_BYT		0x54
+#define SPIBASE_BYT_SZ		512
+#define SPIBASE_BYT_EN		BIT(1)
+
+#define SPIBASE_LPT		0x3800
+#define SPIBASE_LPT_SZ		512
+#define BCR			0xdc
+#define BCR_WPD			BIT(0)
+
 #define GPIOBASE_ICH0		0x58
 #define GPIOCTRL_ICH0		0x5C
 #define GPIOBASE_ICH6		0x48
@@ -133,6 +142,12 @@ static struct resource gpio_ich_res[] = {
 	},
 };
 
+static struct resource intel_spi_res[] = {
+	{
+		.flags = IORESOURCE_MEM,
+	}
+};
+
 static struct mfd_cell lpc_ich_wdt_cell = {
 	.name = "iTCO_wdt",
 	.num_resources = ARRAY_SIZE(wdt_ich_res),
@@ -147,6 +162,14 @@ static struct mfd_cell lpc_ich_gpio_cell = {
 	.ignore_resource_conflicts = true,
 };
 
+
+static struct mfd_cell lpc_ich_spi_cell = {
+	.name = "intel-spi",
+	.num_resources = ARRAY_SIZE(intel_spi_res),
+	.resources = intel_spi_res,
+	.ignore_resource_conflicts = true,
+};
+
 /* chipset related info */
 enum lpc_chipsets {
 	LPC_ICH = 0,	/* ICH */
@@ -494,10 +517,12 @@ static struct lpc_ich_info lpc_chipset_info[] = {
 		.name = "Lynx Point",
 		.iTCO_version = 2,
 		.gpio_version = ICH_V5_GPIO,
+		.spi_type = INTEL_SPI_LPT,
 	},
 	[LPC_LPT_LP] = {
 		.name = "Lynx Point_LP",
 		.iTCO_version = 2,
+		.spi_type = INTEL_SPI_LPT,
 	},
 	[LPC_WBG] = {
 		.name = "Wellsburg",
@@ -511,6 +536,7 @@ static struct lpc_ich_info lpc_chipset_info[] = {
 	[LPC_BAYTRAIL] = {
 		.name = "Bay Trail SoC",
 		.iTCO_version = 3,
+		.spi_type = INTEL_SPI_BYT,
 	},
 	[LPC_COLETO] = {
 		.name = "Coleto Creek",
@@ -519,10 +545,12 @@ static struct lpc_ich_info lpc_chipset_info[] = {
 	[LPC_WPT_LP] = {
 		.name = "Wildcat Point_LP",
 		.iTCO_version = 2,
+		.spi_type = INTEL_SPI_LPT,
 	},
 	[LPC_BRASWELL] = {
 		.name = "Braswell SoC",
 		.iTCO_version = 3,
+		.spi_type = INTEL_SPI_BYT,
 	},
 	[LPC_LEWISBURG] = {
 		.name = "Lewisburg",
@@ -1056,6 +1084,64 @@ wdt_done:
 	return ret;
 }
 
+static int lpc_ich_init_spi(struct pci_dev *dev)
+{
+	struct lpc_ich_priv *priv = pci_get_drvdata(dev);
+	struct resource *res = &intel_spi_res[0];
+	struct intel_spi_boardinfo *info;
+	u32 spi_base, rcba, bcr;
+
+	info = devm_kzalloc(&dev->dev, sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+
+	info->type = lpc_chipset_info[priv->chipset].spi_type;
+
+	switch (info->type) {
+	case INTEL_SPI_BYT:
+		pci_read_config_dword(dev, SPIBASE_BYT, &spi_base);
+		if (spi_base & SPIBASE_BYT_EN) {
+			res->start = spi_base & ~(SPIBASE_BYT_SZ - 1);
+			res->end = res->start + SPIBASE_BYT_SZ - 1;
+		}
+		break;
+
+	case INTEL_SPI_LPT:
+		pci_read_config_dword(dev, RCBABASE, &rcba);
+		if (rcba & 1) {
+			spi_base = round_down(rcba, SPIBASE_LPT_SZ);
+			res->start = spi_base + SPIBASE_LPT;
+			res->end = res->start + SPIBASE_LPT_SZ - 1;
+
+			/*
+			 * Try to make the flash chip writeable now by
+			 * setting BCR_WPD. It it fails we tell the driver
+			 * that it can only read the chip.
+			 */
+			pci_read_config_dword(dev, BCR, &bcr);
+			if (!(bcr & BCR_WPD)) {
+				bcr |= BCR_WPD;
+				pci_write_config_dword(dev, BCR, bcr);
+				pci_read_config_dword(dev, BCR, &bcr);
+			}
+			info->writeable = !!(bcr & BCR_WPD);
+		}
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	if (!res->start)
+		return -ENODEV;
+
+	lpc_ich_spi_cell.platform_data = info;
+	lpc_ich_spi_cell.pdata_size = sizeof(*info);
+
+	return mfd_add_devices(&dev->dev, PLATFORM_DEVID_NONE,
+			       &lpc_ich_spi_cell, 1, NULL, 0, NULL);
+}
+
 static int lpc_ich_probe(struct pci_dev *dev,
 				const struct pci_device_id *id)
 {
@@ -1099,6 +1185,12 @@ static int lpc_ich_probe(struct pci_dev *dev,
 			cell_added = true;
 	}
 
+	if (lpc_chipset_info[priv->chipset].spi_type) {
+		ret = lpc_ich_init_spi(dev);
+		if (!ret)
+			cell_added = true;
+	}
+
 	/*
 	 * We only care if at least one or none of the cells registered
 	 * successfully.
diff --git a/include/linux/mfd/lpc_ich.h b/include/linux/mfd/lpc_ich.h
index 2b300b44f994..fba8fcb54f8c 100644
--- a/include/linux/mfd/lpc_ich.h
+++ b/include/linux/mfd/lpc_ich.h
@@ -20,6 +20,8 @@
 #ifndef LPC_ICH_H
 #define LPC_ICH_H
 
+#include <linux/platform_data/intel-spi.h>
+
 /* GPIO resources */
 #define ICH_RES_GPIO	0
 #define ICH_RES_GPE0	1
@@ -40,6 +42,7 @@ struct lpc_ich_info {
 	char name[32];
 	unsigned int iTCO_version;
 	unsigned int gpio_version;
+	enum intel_spi_type spi_type;
 	u8 use_gpio;
 };
 
-- 
cgit v1.2.3


From b31ef8285b19ec5563274c574fcfe7a5993125ce Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <mawilcox@microsoft.com>
Date: Wed, 21 Dec 2016 09:47:03 -0800
Subject: thermal core: convert ID allocation to IDA

The thermal core does not use the ability to look up pointers by ID, so
convert it from using an IDR to the more space-efficient IDA.

Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
---
 drivers/thermal/thermal_core.c | 75 +++++++++++++++---------------------------
 include/linux/thermal.h        |  4 +--
 2 files changed, 28 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 641faab6e24b..f2a0cd119e22 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -36,9 +36,8 @@ MODULE_AUTHOR("Zhang Rui");
 MODULE_DESCRIPTION("Generic thermal management sysfs support");
 MODULE_LICENSE("GPL v2");
 
-static DEFINE_IDR(thermal_tz_idr);
-static DEFINE_IDR(thermal_cdev_idr);
-static DEFINE_MUTEX(thermal_idr_lock);
+static DEFINE_IDA(thermal_tz_ida);
+static DEFINE_IDA(thermal_cdev_ida);
 
 static LIST_HEAD(thermal_tz_list);
 static LIST_HEAD(thermal_cdev_list);
@@ -589,29 +588,6 @@ void thermal_zone_device_unbind_exception(struct thermal_zone_device *tz,
  * - thermal zone devices lifecycle: registration, unregistration,
  *				     binding, and unbinding.
  */
-static int get_idr(struct idr *idr, struct mutex *lock, int *id)
-{
-	int ret;
-
-	if (lock)
-		mutex_lock(lock);
-	ret = idr_alloc(idr, NULL, 0, 0, GFP_KERNEL);
-	if (lock)
-		mutex_unlock(lock);
-	if (unlikely(ret < 0))
-		return ret;
-	*id = ret;
-	return 0;
-}
-
-static void release_idr(struct idr *idr, struct mutex *lock, int id)
-{
-	if (lock)
-		mutex_lock(lock);
-	idr_remove(idr, id);
-	if (lock)
-		mutex_unlock(lock);
-}
 
 /**
  * thermal_zone_bind_cooling_device() - bind a cooling device to a thermal zone
@@ -685,15 +661,16 @@ int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
 	dev->target = THERMAL_NO_TARGET;
 	dev->weight = weight;
 
-	result = get_idr(&tz->idr, &tz->lock, &dev->id);
-	if (result)
+	result = ida_simple_get(&tz->ida, 0, 0, GFP_KERNEL);
+	if (result < 0)
 		goto free_mem;
 
+	dev->id = result;
 	sprintf(dev->name, "cdev%d", dev->id);
 	result =
 	    sysfs_create_link(&tz->device.kobj, &cdev->device.kobj, dev->name);
 	if (result)
-		goto release_idr;
+		goto release_ida;
 
 	sprintf(dev->attr_name, "cdev%d_trip_point", dev->id);
 	sysfs_attr_init(&dev->attr.attr);
@@ -737,8 +714,8 @@ remove_trip_file:
 	device_remove_file(&tz->device, &dev->attr);
 remove_symbol_link:
 	sysfs_remove_link(&tz->device.kobj, dev->name);
-release_idr:
-	release_idr(&tz->idr, &tz->lock, dev->id);
+release_ida:
+	ida_simple_remove(&tz->ida, dev->id);
 free_mem:
 	kfree(dev);
 	return result;
@@ -785,7 +762,7 @@ unbind:
 	device_remove_file(&tz->device, &pos->weight_attr);
 	device_remove_file(&tz->device, &pos->attr);
 	sysfs_remove_link(&tz->device.kobj, pos->name);
-	release_idr(&tz->idr, &tz->lock, pos->id);
+	ida_simple_remove(&tz->ida, pos->id);
 	kfree(pos);
 	return 0;
 }
@@ -920,12 +897,13 @@ __thermal_cooling_device_register(struct device_node *np,
 	if (!cdev)
 		return ERR_PTR(-ENOMEM);
 
-	result = get_idr(&thermal_cdev_idr, &thermal_idr_lock, &cdev->id);
-	if (result) {
+	result = ida_simple_get(&thermal_cdev_ida, 0, 0, GFP_KERNEL);
+	if (result < 0) {
 		kfree(cdev);
 		return ERR_PTR(result);
 	}
 
+	cdev->id = result;
 	strlcpy(cdev->type, type ? : "", sizeof(cdev->type));
 	mutex_init(&cdev->lock);
 	INIT_LIST_HEAD(&cdev->thermal_instances);
@@ -938,7 +916,7 @@ __thermal_cooling_device_register(struct device_node *np,
 	dev_set_name(&cdev->device, "cooling_device%d", cdev->id);
 	result = device_register(&cdev->device);
 	if (result) {
-		release_idr(&thermal_cdev_idr, &thermal_idr_lock, cdev->id);
+		ida_simple_remove(&thermal_cdev_ida, cdev->id);
 		kfree(cdev);
 		return ERR_PTR(result);
 	}
@@ -1065,7 +1043,7 @@ void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev)
 
 	mutex_unlock(&thermal_list_lock);
 
-	release_idr(&thermal_cdev_idr, &thermal_idr_lock, cdev->id);
+	ida_simple_remove(&thermal_cdev_ida, cdev->id);
 	device_unregister(&cdev->device);
 }
 EXPORT_SYMBOL_GPL(thermal_cooling_device_unregister);
@@ -1167,14 +1145,15 @@ thermal_zone_device_register(const char *type, int trips, int mask,
 		return ERR_PTR(-ENOMEM);
 
 	INIT_LIST_HEAD(&tz->thermal_instances);
-	idr_init(&tz->idr);
+	ida_init(&tz->ida);
 	mutex_init(&tz->lock);
-	result = get_idr(&thermal_tz_idr, &thermal_idr_lock, &tz->id);
-	if (result) {
+	result = ida_simple_get(&thermal_tz_ida, 0, 0, GFP_KERNEL);
+	if (result < 0) {
 		kfree(tz);
 		return ERR_PTR(result);
 	}
 
+	tz->id = result;
 	strlcpy(tz->type, type, sizeof(tz->type));
 	tz->ops = ops;
 	tz->tzp = tzp;
@@ -1196,7 +1175,7 @@ thermal_zone_device_register(const char *type, int trips, int mask,
 	dev_set_name(&tz->device, "thermal_zone%d", tz->id);
 	result = device_register(&tz->device);
 	if (result) {
-		release_idr(&thermal_tz_idr, &thermal_idr_lock, tz->id);
+		ida_simple_remove(&thermal_tz_ida, tz->id);
 		kfree(tz);
 		return ERR_PTR(result);
 	}
@@ -1250,7 +1229,7 @@ thermal_zone_device_register(const char *type, int trips, int mask,
 	return tz;
 
 unregister:
-	release_idr(&thermal_tz_idr, &thermal_idr_lock, tz->id);
+	ida_simple_remove(&thermal_tz_ida, tz->id);
 	device_unregister(&tz->device);
 	return ERR_PTR(result);
 }
@@ -1312,8 +1291,8 @@ void thermal_zone_device_unregister(struct thermal_zone_device *tz)
 	thermal_set_governor(tz, NULL);
 
 	thermal_remove_hwmon_sysfs(tz);
-	release_idr(&thermal_tz_idr, &thermal_idr_lock, tz->id);
-	idr_destroy(&tz->idr);
+	ida_simple_remove(&thermal_tz_ida, tz->id);
+	ida_destroy(&tz->ida);
 	mutex_destroy(&tz->lock);
 	device_unregister(&tz->device);
 	kfree(tz->device.groups);
@@ -1514,9 +1493,8 @@ unregister_class:
 unregister_governors:
 	thermal_unregister_governors();
 error:
-	idr_destroy(&thermal_tz_idr);
-	idr_destroy(&thermal_cdev_idr);
-	mutex_destroy(&thermal_idr_lock);
+	ida_destroy(&thermal_tz_ida);
+	ida_destroy(&thermal_cdev_ida);
 	mutex_destroy(&thermal_list_lock);
 	mutex_destroy(&thermal_governor_lock);
 	return result;
@@ -1529,9 +1507,8 @@ static void __exit thermal_exit(void)
 	genetlink_exit();
 	class_unregister(&thermal_class);
 	thermal_unregister_governors();
-	idr_destroy(&thermal_tz_idr);
-	idr_destroy(&thermal_cdev_idr);
-	mutex_destroy(&thermal_idr_lock);
+	ida_destroy(&thermal_tz_ida);
+	ida_destroy(&thermal_cdev_ida);
 	mutex_destroy(&thermal_list_lock);
 	mutex_destroy(&thermal_governor_lock);
 }
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index e275e98bdceb..dab11f97e1c6 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -194,7 +194,7 @@ struct thermal_attr {
  * @governor:	pointer to the governor for this thermal zone
  * @governor_data:	private pointer for governor data
  * @thermal_instances:	list of &struct thermal_instance of this thermal zone
- * @idr:	&struct idr to generate unique id for this zone's cooling
+ * @ida:	&struct ida to generate unique id for this zone's cooling
  *		devices
  * @lock:	lock to protect thermal_instances list
  * @node:	node in thermal_tz_list (in thermal_core.c)
@@ -227,7 +227,7 @@ struct thermal_zone_device {
 	struct thermal_governor *governor;
 	void *governor_data;
 	struct list_head thermal_instances;
-	struct idr idr;
+	struct ida ida;
 	struct mutex lock;
 	struct list_head node;
 	struct delayed_work poll_queue;
-- 
cgit v1.2.3


From fe2858c8c6d167df33a839591ebe63ea05a69d06 Mon Sep 17 00:00:00 2001
From: Thierry Reding <thierry.reding@gmail.com>
Date: Wed, 4 Jan 2017 09:39:52 +0100
Subject: pwm: Remove pwm_can_sleep()

The last user of this function has been removed, so it is no longer
needed.

Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 drivers/pwm/core.c  | 12 ------------
 include/linux/pwm.h |  7 -------
 2 files changed, 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index 172ef8245811..78e114a11c4f 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -960,18 +960,6 @@ void devm_pwm_put(struct device *dev, struct pwm_device *pwm)
 }
 EXPORT_SYMBOL_GPL(devm_pwm_put);
 
-/**
-  * pwm_can_sleep() - report whether PWM access will sleep
-  * @pwm: PWM device
-  *
-  * Returns: True if accessing the PWM can sleep, false otherwise.
-  */
-bool pwm_can_sleep(struct pwm_device *pwm)
-{
-	return true;
-}
-EXPORT_SYMBOL_GPL(pwm_can_sleep);
-
 #ifdef CONFIG_DEBUG_FS
 static void pwm_dbg_show(struct pwm_chip *chip, struct seq_file *s)
 {
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 2c6c5114c089..e15fd3ce6502 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -451,8 +451,6 @@ struct pwm_device *devm_pwm_get(struct device *dev, const char *con_id);
 struct pwm_device *devm_of_pwm_get(struct device *dev, struct device_node *np,
 				   const char *con_id);
 void devm_pwm_put(struct device *dev, struct pwm_device *pwm);
-
-bool pwm_can_sleep(struct pwm_device *pwm);
 #else
 static inline struct pwm_device *pwm_request(int pwm_id, const char *label)
 {
@@ -566,11 +564,6 @@ static inline struct pwm_device *devm_of_pwm_get(struct device *dev,
 static inline void devm_pwm_put(struct device *dev, struct pwm_device *pwm)
 {
 }
-
-static inline bool pwm_can_sleep(struct pwm_device *pwm)
-{
-	return false;
-}
 #endif
 
 static inline void pwm_apply_args(struct pwm_device *pwm)
-- 
cgit v1.2.3


From 8c0216f377406c7613b67bd18755889026284192 Mon Sep 17 00:00:00 2001
From: Thierry Reding <thierry.reding@gmail.com>
Date: Wed, 4 Jan 2017 09:40:54 +0100
Subject: pwm: Remove .can_sleep from struct pwm_chip

All PWM devices have been marked as "might sleep" since v4.5, there is
no longer a need to differentiate on a per-chip basis.

Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 drivers/pwm/pwm-atmel-hlcdc.c | 1 -
 drivers/pwm/pwm-atmel.c       | 1 -
 drivers/pwm/pwm-bcm-kona.c    | 1 -
 drivers/pwm/pwm-berlin.c      | 1 -
 drivers/pwm/pwm-brcmstb.c     | 1 -
 drivers/pwm/pwm-fsl-ftm.c     | 1 -
 drivers/pwm/pwm-imx.c         | 1 -
 drivers/pwm/pwm-lp3943.c      | 1 -
 drivers/pwm/pwm-mxs.c         | 2 +-
 drivers/pwm/pwm-pca9685.c     | 1 -
 drivers/pwm/pwm-sti.c         | 1 -
 drivers/pwm/pwm-sun4i.c       | 1 -
 drivers/pwm/pwm-twl-led.c     | 1 -
 drivers/pwm/pwm-twl.c         | 1 -
 drivers/staging/greybus/pwm.c | 1 -
 include/linux/pwm.h           | 3 ---
 16 files changed, 1 insertion(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pwm/pwm-atmel-hlcdc.c b/drivers/pwm/pwm-atmel-hlcdc.c
index 14fc011faa32..999187277ea5 100644
--- a/drivers/pwm/pwm-atmel-hlcdc.c
+++ b/drivers/pwm/pwm-atmel-hlcdc.c
@@ -270,7 +270,6 @@ static int atmel_hlcdc_pwm_probe(struct platform_device *pdev)
 	chip->chip.npwm = 1;
 	chip->chip.of_xlate = of_pwm_xlate_with_flags;
 	chip->chip.of_pwm_n_cells = 3;
-	chip->chip.can_sleep = 1;
 
 	ret = pwmchip_add_with_polarity(&chip->chip, PWM_POLARITY_INVERSED);
 	if (ret) {
diff --git a/drivers/pwm/pwm-atmel.c b/drivers/pwm/pwm-atmel.c
index e6b8b1b7e6ba..67a7023be5c2 100644
--- a/drivers/pwm/pwm-atmel.c
+++ b/drivers/pwm/pwm-atmel.c
@@ -385,7 +385,6 @@ static int atmel_pwm_probe(struct platform_device *pdev)
 
 	atmel_pwm->chip.base = -1;
 	atmel_pwm->chip.npwm = 4;
-	atmel_pwm->chip.can_sleep = true;
 	atmel_pwm->config = data->config;
 	atmel_pwm->updated_pwms = 0;
 	mutex_init(&atmel_pwm->isr_lock);
diff --git a/drivers/pwm/pwm-bcm-kona.c b/drivers/pwm/pwm-bcm-kona.c
index c63418322023..09a95aeb3a70 100644
--- a/drivers/pwm/pwm-bcm-kona.c
+++ b/drivers/pwm/pwm-bcm-kona.c
@@ -276,7 +276,6 @@ static int kona_pwmc_probe(struct platform_device *pdev)
 	kp->chip.npwm = 6;
 	kp->chip.of_xlate = of_pwm_xlate_with_flags;
 	kp->chip.of_pwm_n_cells = 3;
-	kp->chip.can_sleep = true;
 
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	kp->base = devm_ioremap_resource(&pdev->dev, res);
diff --git a/drivers/pwm/pwm-berlin.c b/drivers/pwm/pwm-berlin.c
index 01339c152ab0..771859aca4be 100644
--- a/drivers/pwm/pwm-berlin.c
+++ b/drivers/pwm/pwm-berlin.c
@@ -206,7 +206,6 @@ static int berlin_pwm_probe(struct platform_device *pdev)
 	pwm->chip.ops = &berlin_pwm_ops;
 	pwm->chip.base = -1;
 	pwm->chip.npwm = 4;
-	pwm->chip.can_sleep = true;
 	pwm->chip.of_xlate = of_pwm_xlate_with_flags;
 	pwm->chip.of_pwm_n_cells = 3;
 
diff --git a/drivers/pwm/pwm-brcmstb.c b/drivers/pwm/pwm-brcmstb.c
index 5d5adee16886..8063cffa1c96 100644
--- a/drivers/pwm/pwm-brcmstb.c
+++ b/drivers/pwm/pwm-brcmstb.c
@@ -270,7 +270,6 @@ static int brcmstb_pwm_probe(struct platform_device *pdev)
 	p->chip.ops = &brcmstb_pwm_ops;
 	p->chip.base = -1;
 	p->chip.npwm = 2;
-	p->chip.can_sleep = true;
 
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	p->base = devm_ioremap_resource(&pdev->dev, res);
diff --git a/drivers/pwm/pwm-fsl-ftm.c b/drivers/pwm/pwm-fsl-ftm.c
index fad968eb75f6..557b4ea16796 100644
--- a/drivers/pwm/pwm-fsl-ftm.c
+++ b/drivers/pwm/pwm-fsl-ftm.c
@@ -446,7 +446,6 @@ static int fsl_pwm_probe(struct platform_device *pdev)
 	fpc->chip.of_pwm_n_cells = 3;
 	fpc->chip.base = -1;
 	fpc->chip.npwm = 8;
-	fpc->chip.can_sleep = true;
 
 	ret = pwmchip_add(&fpc->chip);
 	if (ret < 0) {
diff --git a/drivers/pwm/pwm-imx.c b/drivers/pwm/pwm-imx.c
index d600fd5cd4ba..1223187ad354 100644
--- a/drivers/pwm/pwm-imx.c
+++ b/drivers/pwm/pwm-imx.c
@@ -304,7 +304,6 @@ static int imx_pwm_probe(struct platform_device *pdev)
 	imx->chip.dev = &pdev->dev;
 	imx->chip.base = -1;
 	imx->chip.npwm = 1;
-	imx->chip.can_sleep = true;
 
 	r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	imx->mmio_base = devm_ioremap_resource(&pdev->dev, r);
diff --git a/drivers/pwm/pwm-lp3943.c b/drivers/pwm/pwm-lp3943.c
index 872ea76a4f19..52584e9962ed 100644
--- a/drivers/pwm/pwm-lp3943.c
+++ b/drivers/pwm/pwm-lp3943.c
@@ -278,7 +278,6 @@ static int lp3943_pwm_probe(struct platform_device *pdev)
 	lp3943_pwm->chip.dev = &pdev->dev;
 	lp3943_pwm->chip.ops = &lp3943_pwm_ops;
 	lp3943_pwm->chip.npwm = LP3943_NUM_PWMS;
-	lp3943_pwm->chip.can_sleep = true;
 
 	platform_set_drvdata(pdev, lp3943_pwm);
 
diff --git a/drivers/pwm/pwm-mxs.c b/drivers/pwm/pwm-mxs.c
index 9a596324ebef..a6017ad9926c 100644
--- a/drivers/pwm/pwm-mxs.c
+++ b/drivers/pwm/pwm-mxs.c
@@ -151,7 +151,7 @@ static int mxs_pwm_probe(struct platform_device *pdev)
 	mxs->chip.dev = &pdev->dev;
 	mxs->chip.ops = &mxs_pwm_ops;
 	mxs->chip.base = -1;
-	mxs->chip.can_sleep = true;
+
 	ret = of_property_read_u32(np, "fsl,pwm-number", &mxs->chip.npwm);
 	if (ret < 0) {
 		dev_err(&pdev->dev, "failed to get pwm number: %d\n", ret);
diff --git a/drivers/pwm/pwm-pca9685.c b/drivers/pwm/pwm-pca9685.c
index 117fccf7934a..c8282a2650be 100644
--- a/drivers/pwm/pwm-pca9685.c
+++ b/drivers/pwm/pwm-pca9685.c
@@ -343,7 +343,6 @@ static int pca9685_pwm_probe(struct i2c_client *client,
 
 	pca->chip.dev = &client->dev;
 	pca->chip.base = -1;
-	pca->chip.can_sleep = true;
 
 	return pwmchip_add(&pca->chip);
 }
diff --git a/drivers/pwm/pwm-sti.c b/drivers/pwm/pwm-sti.c
index dd82dc840af9..2b7c31c9d1ab 100644
--- a/drivers/pwm/pwm-sti.c
+++ b/drivers/pwm/pwm-sti.c
@@ -635,7 +635,6 @@ skip_cpt:
 	pc->chip.ops = &sti_pwm_ops;
 	pc->chip.base = -1;
 	pc->chip.npwm = pc->cdata->pwm_num_devs;
-	pc->chip.can_sleep = true;
 
 	ret = pwmchip_add(&pc->chip);
 	if (ret < 0) {
diff --git a/drivers/pwm/pwm-sun4i.c b/drivers/pwm/pwm-sun4i.c
index b0803f6c64d9..1284ffa05921 100644
--- a/drivers/pwm/pwm-sun4i.c
+++ b/drivers/pwm/pwm-sun4i.c
@@ -340,7 +340,6 @@ static int sun4i_pwm_probe(struct platform_device *pdev)
 	pwm->chip.ops = &sun4i_pwm_ops;
 	pwm->chip.base = -1;
 	pwm->chip.npwm = pwm->data->npwm;
-	pwm->chip.can_sleep = true;
 	pwm->chip.of_xlate = of_pwm_xlate_with_flags;
 	pwm->chip.of_pwm_n_cells = 3;
 
diff --git a/drivers/pwm/pwm-twl-led.c b/drivers/pwm/pwm-twl-led.c
index b964470025c5..21eff991d0e3 100644
--- a/drivers/pwm/pwm-twl-led.c
+++ b/drivers/pwm/pwm-twl-led.c
@@ -303,7 +303,6 @@ static int twl_pwmled_probe(struct platform_device *pdev)
 
 	twl->chip.dev = &pdev->dev;
 	twl->chip.base = -1;
-	twl->chip.can_sleep = true;
 
 	mutex_init(&twl->mutex);
 
diff --git a/drivers/pwm/pwm-twl.c b/drivers/pwm/pwm-twl.c
index 7a993b056638..9de617b76680 100644
--- a/drivers/pwm/pwm-twl.c
+++ b/drivers/pwm/pwm-twl.c
@@ -323,7 +323,6 @@ static int twl_pwm_probe(struct platform_device *pdev)
 	twl->chip.dev = &pdev->dev;
 	twl->chip.base = -1;
 	twl->chip.npwm = 2;
-	twl->chip.can_sleep = true;
 
 	mutex_init(&twl->mutex);
 
diff --git a/drivers/staging/greybus/pwm.c b/drivers/staging/greybus/pwm.c
index c4bf3298ba07..f0404bc37123 100644
--- a/drivers/staging/greybus/pwm.c
+++ b/drivers/staging/greybus/pwm.c
@@ -284,7 +284,6 @@ static int gb_pwm_probe(struct gbphy_device *gbphy_dev,
 	pwm->ops = &gb_pwm_ops;
 	pwm->base = -1;			/* Allocate base dynamically */
 	pwm->npwm = pwmc->pwm_max + 1;
-	pwm->can_sleep = true;		/* FIXME */
 
 	ret = pwmchip_add(pwm);
 	if (ret) {
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index e15fd3ce6502..eae215ef1b2c 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -287,8 +287,6 @@ struct pwm_ops {
  * @pwms: array of PWM devices allocated by the framework
  * @of_xlate: request a PWM device given a device tree PWM specifier
  * @of_pwm_n_cells: number of cells expected in the device tree PWM specifier
- * @can_sleep: must be true if the .config(), .enable() or .disable()
- *             operations may sleep
  */
 struct pwm_chip {
 	struct device *dev;
@@ -302,7 +300,6 @@ struct pwm_chip {
 	struct pwm_device * (*of_xlate)(struct pwm_chip *pc,
 					const struct of_phandle_args *args);
 	unsigned int of_pwm_n_cells;
-	bool can_sleep;
 };
 
 /**
-- 
cgit v1.2.3


From 1ff8cebf49ed9e9ca2ae44b5c4176aef9c21af9c Mon Sep 17 00:00:00 2001
From: yuan linyu <Linyu.Yuan@alcatel-sbell.com.cn>
Date: Tue, 3 Jan 2017 20:42:17 +0800
Subject: scm: remove use CMSG{_COMPAT}_ALIGN(sizeof(struct {compat_}cmsghdr))

sizeof(struct cmsghdr) and sizeof(struct compat_cmsghdr) already aligned.
remove use CMSG_ALIGN(sizeof(struct cmsghdr)) and
CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) keep code consistent.

Signed-off-by: yuan linyu <Linyu.Yuan@alcatel-sbell.com.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/socket.h |  6 +++---
 net/compat.c           | 14 ++++++--------
 net/core/scm.c         |  2 +-
 net/ipv4/ip_sockglue.c |  2 +-
 net/rxrpc/sendmsg.c    |  2 +-
 5 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index b5cc5a6d7011..c06438023d79 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -92,9 +92,9 @@ struct cmsghdr {
 
 #define CMSG_ALIGN(len) ( ((len)+sizeof(long)-1) & ~(sizeof(long)-1) )
 
-#define CMSG_DATA(cmsg)	((void *)((char *)(cmsg) + CMSG_ALIGN(sizeof(struct cmsghdr))))
-#define CMSG_SPACE(len) (CMSG_ALIGN(sizeof(struct cmsghdr)) + CMSG_ALIGN(len))
-#define CMSG_LEN(len) (CMSG_ALIGN(sizeof(struct cmsghdr)) + (len))
+#define CMSG_DATA(cmsg)	((void *)((char *)(cmsg) + sizeof(struct cmsghdr)))
+#define CMSG_SPACE(len) (sizeof(struct cmsghdr) + CMSG_ALIGN(len))
+#define CMSG_LEN(len) (sizeof(struct cmsghdr) + (len))
 
 #define __CMSG_FIRSTHDR(ctl,len) ((len) >= sizeof(struct cmsghdr) ? \
 				  (struct cmsghdr *)(ctl) : \
diff --git a/net/compat.c b/net/compat.c
index 96c544b05b15..4e27dd1cd3a6 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -90,11 +90,11 @@ int get_compat_msghdr(struct msghdr *kmsg,
 #define CMSG_COMPAT_ALIGN(len)	ALIGN((len), sizeof(s32))
 
 #define CMSG_COMPAT_DATA(cmsg)				\
-	((void __user *)((char __user *)(cmsg) + CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr))))
+	((void __user *)((char __user *)(cmsg) + sizeof(struct compat_cmsghdr)))
 #define CMSG_COMPAT_SPACE(len)				\
-	(CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + CMSG_COMPAT_ALIGN(len))
+	(sizeof(struct compat_cmsghdr) + CMSG_COMPAT_ALIGN(len))
 #define CMSG_COMPAT_LEN(len)				\
-	(CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + (len))
+	(sizeof(struct compat_cmsghdr) + (len))
 
 #define CMSG_COMPAT_FIRSTHDR(msg)			\
 	(((msg)->msg_controllen) >= sizeof(struct compat_cmsghdr) ?	\
@@ -141,8 +141,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
 		if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg))
 			return -EINVAL;
 
-		tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) +
-		       CMSG_ALIGN(sizeof(struct cmsghdr)));
+		tmp = ((ucmlen - sizeof(*ucmsg)) + sizeof(struct cmsghdr));
 		tmp = CMSG_ALIGN(tmp);
 		kcmlen += tmp;
 		ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen);
@@ -168,8 +167,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
 			goto Efault;
 		if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg))
 			goto Einval;
-		tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) +
-		       CMSG_ALIGN(sizeof(struct cmsghdr)));
+		tmp = ((ucmlen - sizeof(*ucmsg)) + sizeof(struct cmsghdr));
 		if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp))
 			goto Einval;
 		kcmsg->cmsg_len = tmp;
@@ -178,7 +176,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
 		    __get_user(kcmsg->cmsg_type, &ucmsg->cmsg_type) ||
 		    copy_from_user(CMSG_DATA(kcmsg),
 				   CMSG_COMPAT_DATA(ucmsg),
-				   (ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg)))))
+				   (ucmlen - sizeof(*ucmsg))))
 			goto Efault;
 
 		/* Advance. */
diff --git a/net/core/scm.c b/net/core/scm.c
index d8820438ba37..b6d83686e149 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -71,7 +71,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
 	struct file **fpp;
 	int i, num;
 
-	num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int);
+	num = (cmsg->cmsg_len - sizeof(struct cmsghdr))/sizeof(int);
 
 	if (num <= 0)
 		return 0;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index cf9b43de4690..1caa38d8a9c9 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -272,7 +272,7 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
 			continue;
 		switch (cmsg->cmsg_type) {
 		case IP_RETOPTS:
-			err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
+			err = cmsg->cmsg_len - sizeof(struct cmsghdr);
 
 			/* Our caller is responsible for freeing ipc->opt */
 			err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg),
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index b214a4d4a641..0a6ef217aa8a 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -376,7 +376,7 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg,
 		if (!CMSG_OK(msg, cmsg))
 			return -EINVAL;
 
-		len = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
+		len = cmsg->cmsg_len - sizeof(struct cmsghdr);
 		_debug("CMSG %d, %d, %d",
 		       cmsg->cmsg_level, cmsg->cmsg_type, len);
 
-- 
cgit v1.2.3


From 63971c5682bf89e15083733dcd7b4610e789e843 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 2 Jan 2017 13:44:29 +0200
Subject: spi: pxa2xx: fix indentation of the comments in header

Just for sake of readability fix the indentation of the comments in
pxa2xx_ssp.h header file.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/pxa2xx_ssp.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pxa2xx_ssp.h b/include/linux/pxa2xx_ssp.h
index 2d6f0c39ed68..a0522328d7aa 100644
--- a/include/linux/pxa2xx_ssp.h
+++ b/include/linux/pxa2xx_ssp.h
@@ -90,9 +90,9 @@
 #define SSSR_RFL_MASK	(0xf << 12)	/* Receive FIFO Level mask */
 
 #define SSCR1_TFT	(0x000003c0)	/* Transmit FIFO Threshold (mask) */
-#define SSCR1_TxTresh(x) (((x) - 1) << 6) /* level [1..16] */
+#define SSCR1_TxTresh(x) (((x) - 1) << 6)	/* level [1..16] */
 #define SSCR1_RFT	(0x00003c00)	/* Receive FIFO Threshold (mask) */
-#define SSCR1_RxTresh(x) (((x) - 1) << 10) /* level [1..16] */
+#define SSCR1_RxTresh(x) (((x) - 1) << 10)	/* level [1..16] */
 
 #define RX_THRESH_CE4100_DFLT	2
 #define TX_THRESH_CE4100_DFLT	2
@@ -106,9 +106,9 @@
 #define CE4100_SSCR1_RxTresh(x) (((x) - 1) << 10)	/* level [1..4] */
 
 /* QUARK_X1000 SSCR0 bit definition */
-#define QUARK_X1000_SSCR0_DSS	(0x1F)		/* Data Size Select (mask) */
-#define QUARK_X1000_SSCR0_DataSize(x)  ((x) - 1)	/* Data Size Select [4..32] */
-#define QUARK_X1000_SSCR0_FRF	(0x3 << 5)	/* FRame Format (mask) */
+#define QUARK_X1000_SSCR0_DSS		(0x1F << 0)	/* Data Size Select (mask) */
+#define QUARK_X1000_SSCR0_DataSize(x)	((x) - 1)	/* Data Size Select [4..32] */
+#define QUARK_X1000_SSCR0_FRF		(0x3 << 5)	/* FRame Format (mask) */
 #define QUARK_X1000_SSCR0_Motorola	(0x0 << 5)	/* Motorola's Serial Peripheral Interface (SPI) */
 
 #define RX_THRESH_QUARK_X1000_DFLT	1
@@ -121,8 +121,8 @@
 #define QUARK_X1000_SSCR1_TxTresh(x) (((x) - 1) << 6)	/* level [1..32] */
 #define QUARK_X1000_SSCR1_RFT	(0x1F << 11)	/* Receive FIFO Threshold (mask) */
 #define QUARK_X1000_SSCR1_RxTresh(x) (((x) - 1) << 11)	/* level [1..32] */
-#define QUARK_X1000_SSCR1_STRF       (1 << 17)		/* Select FIFO or EFWR */
-#define QUARK_X1000_SSCR1_EFWR	(1 << 16)		/* Enable FIFO Write/Read */
+#define QUARK_X1000_SSCR1_STRF	(1 << 17)	/* Select FIFO or EFWR */
+#define QUARK_X1000_SSCR1_EFWR	(1 << 16)	/* Enable FIFO Write/Read */
 
 /* extra bits in PXA255, PXA26x and PXA27x SSP ports */
 #define SSCR0_TISSP		(1 << 4)	/* TI Sync Serial Protocol */
-- 
cgit v1.2.3


From eac53b3664f592713655f5de59dc44bdd0cfc0bd Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Wed, 21 Dec 2016 15:36:47 +0100
Subject: power: supply: axp288_charger: Drop platform_data dependency

When the axp288_charger driver was originally merged, it was merged with
a dependency on some other driver providing platform data for it.

However the battery-data-framework which should provide that data never
got merged, so the axp288_charger as merged upstream has never worked,
its probe method simply always returns -ENODEV.

This commit removes the dependency on the platform_data instead reading
back the charging current and charging voltage that the firmware has set
and using those values as the maximum values the user may set.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/power/supply/axp288_charger.c | 88 ++++++++++++-----------------------
 include/linux/mfd/axp20x.h            |  7 ---
 2 files changed, 30 insertions(+), 65 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/power/supply/axp288_charger.c b/drivers/power/supply/axp288_charger.c
index f1e3b0ec3b82..76f7d292a2ea 100644
--- a/drivers/power/supply/axp288_charger.c
+++ b/drivers/power/supply/axp288_charger.c
@@ -143,7 +143,6 @@ enum {
 
 struct axp288_chrg_info {
 	struct platform_device *pdev;
-	struct axp20x_chrg_pdata *pdata;
 	struct regmap *regmap;
 	struct regmap_irq_chip_data *regmap_irqc;
 	int irq[CHRG_INTR_END];
@@ -769,60 +768,42 @@ static int charger_init_hw_regs(struct axp288_chrg_info *info)
 		return ret;
 	}
 
-	/* Init charging current and voltage */
-	info->max_cc = info->pdata->max_cc;
-	info->max_cv = info->pdata->max_cv;
-
 	/* Read current charge voltage and current limit */
 	ret = regmap_read(info->regmap, AXP20X_CHRG_CTRL1, &val);
 	if (ret < 0) {
-		/* Assume default if cannot read */
-		info->cc = info->pdata->def_cc;
-		info->cv = info->pdata->def_cv;
-	} else {
-		/* Determine charge voltage */
-		cv = (val & CHRG_CCCV_CV_MASK) >> CHRG_CCCV_CV_BIT_POS;
-		switch (cv) {
-		case CHRG_CCCV_CV_4100MV:
-			info->cv = CV_4100MV;
-			break;
-		case CHRG_CCCV_CV_4150MV:
-			info->cv = CV_4150MV;
-			break;
-		case CHRG_CCCV_CV_4200MV:
-			info->cv = CV_4200MV;
-			break;
-		case CHRG_CCCV_CV_4350MV:
-			info->cv = CV_4350MV;
-			break;
-		default:
-			info->cv = INT_MAX;
-			break;
-		}
-
-		/* Determine charge current limit */
-		cc = (ret & CHRG_CCCV_CC_MASK) >> CHRG_CCCV_CC_BIT_POS;
-		cc = (cc * CHRG_CCCV_CC_LSB_RES) + CHRG_CCCV_CC_OFFSET;
-		info->cc = cc;
+		dev_err(&info->pdev->dev, "register(%x) read error(%d)\n",
+			AXP20X_CHRG_CTRL1, ret);
+		return ret;
+	}
 
-		/* Program default charging voltage and current */
-		cc = min(info->pdata->def_cc, info->max_cc);
-		cv = min(info->pdata->def_cv, info->max_cv);
+	/* Determine charge voltage */
+	cv = (val & CHRG_CCCV_CV_MASK) >> CHRG_CCCV_CV_BIT_POS;
+	switch (cv) {
+	case CHRG_CCCV_CV_4100MV:
+		info->cv = CV_4100MV;
+		break;
+	case CHRG_CCCV_CV_4150MV:
+		info->cv = CV_4150MV;
+		break;
+	case CHRG_CCCV_CV_4200MV:
+		info->cv = CV_4200MV;
+		break;
+	case CHRG_CCCV_CV_4350MV:
+		info->cv = CV_4350MV;
+		break;
+	}
 
-		ret = axp288_charger_set_cc(info, cc);
-		if (ret < 0) {
-			dev_err(&info->pdev->dev,
-					"error(%d) in setting CC\n", ret);
-			return ret;
-		}
+	/* Determine charge current limit */
+	cc = (ret & CHRG_CCCV_CC_MASK) >> CHRG_CCCV_CC_BIT_POS;
+	cc = (cc * CHRG_CCCV_CC_LSB_RES) + CHRG_CCCV_CC_OFFSET;
+	info->cc = cc;
 
-		ret = axp288_charger_set_cv(info, cv);
-		if (ret < 0) {
-			dev_err(&info->pdev->dev,
-					"error(%d) in setting CV\n", ret);
-			return ret;
-		}
-	}
+	/*
+	 * Do not allow the user to configure higher settings then those
+	 * set by the firmware
+	 */
+	info->max_cv = info->cv;
+	info->max_cc = info->cc;
 
 	return 0;
 }
@@ -841,15 +822,6 @@ static int axp288_charger_probe(struct platform_device *pdev)
 	info->pdev = pdev;
 	info->regmap = axp20x->regmap;
 	info->regmap_irqc = axp20x->regmap_irqc;
-	info->pdata = pdev->dev.platform_data;
-
-	if (!info->pdata) {
-		/* Try ACPI provided pdata via device properties */
-		if (!device_property_present(&pdev->dev,
-						"axp288_charger_data\n"))
-			dev_err(&pdev->dev, "failed to get platform data\n");
-		return -ENODEV;
-	}
 
 	info->cable.edev = extcon_get_extcon_dev(AXP288_EXTCON_DEV_NAME);
 	if (info->cable.edev == NULL) {
diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h
index a4860bc9b73d..e460fc42d63e 100644
--- a/include/linux/mfd/axp20x.h
+++ b/include/linux/mfd/axp20x.h
@@ -554,13 +554,6 @@ struct axp20x_fg_pdata {
 	int thermistor_curve[MAX_THERM_CURVE_SIZE][2];
 };
 
-struct axp20x_chrg_pdata {
-	int max_cc;
-	int max_cv;
-	int def_cc;
-	int def_cv;
-};
-
 struct axp288_extcon_pdata {
 	/* GPIO pin control to switch D+/D- lines b/w PMIC and SOC */
 	struct gpio_desc *gpio_mux_cntl;
-- 
cgit v1.2.3


From 888f97435a856c2c5c6ca0b3337b68d595b5639e Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Wed, 14 Dec 2016 17:38:53 +0100
Subject: power: supply: axp288_fuel_gauge: Drop platform_data dependency

When the axp288_faul_gauge driver was originally merged, it was
merged with a dependency on some other driver providing platform
data for it.

However the battery-data-framework which should provide that data
never got merged, resulting in x86 tablets / laptops with an axp288
having no working battery monitor, as before this commit the driver
would simply return -ENODEV if there is no platform data.

This commit removes the dependency on the platform_data instead
checking that the firmware has initialized the fuel-gauge and
reading the info back from the pmic.

What is missing from the read-back info is the table to map raw adc
values to temperature, so this commit drops the temperature and
temperature limits properties. The min voltage, charge design and
model name info is also missing. Note that none of these are really
important for userspace to have.

All other functionality is preserved and actually made available
by this commit.

BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=88471
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/power/supply/axp288_fuel_gauge.c | 405 +++----------------------------
 include/linux/mfd/axp20x.h               |  22 --
 2 files changed, 33 insertions(+), 394 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/power/supply/axp288_fuel_gauge.c b/drivers/power/supply/axp288_fuel_gauge.c
index 539eb41504bb..326eb08beaa2 100644
--- a/drivers/power/supply/axp288_fuel_gauge.c
+++ b/drivers/power/supply/axp288_fuel_gauge.c
@@ -49,11 +49,6 @@
 #define CHRG_CCCV_CV_4350MV			0x3     /* 4.35V */
 #define CHRG_CCCV_CHG_EN			(1 << 7)
 
-#define CV_4100						4100    /* 4100mV */
-#define CV_4150						4150    /* 4150mV */
-#define CV_4200						4200    /* 4200mV */
-#define CV_4350						4350    /* 4350mV */
-
 #define TEMP_IRQ_CFG_QWBTU			(1 << 0)
 #define TEMP_IRQ_CFG_WBTU			(1 << 1)
 #define TEMP_IRQ_CFG_QWBTO			(1 << 2)
@@ -104,9 +99,7 @@
 
 /* 1.1mV per LSB expressed in uV */
 #define VOLTAGE_FROM_ADC(a)			((a * 11) / 10)
-/* properties converted to tenths of degrees, uV, uA, uW */
-#define PROP_TEMP(a)		((a) * 10)
-#define UNPROP_TEMP(a)		((a) / 10)
+/* properties converted to uV, uA */
 #define PROP_VOLT(a)		((a) * 1000)
 #define PROP_CURR(a)		((a) * 1000)
 
@@ -122,13 +115,13 @@ enum {
 
 struct axp288_fg_info {
 	struct platform_device *pdev;
-	struct axp20x_fg_pdata *pdata;
 	struct regmap *regmap;
 	struct regmap_irq_chip_data *regmap_irqc;
 	int irq[AXP288_FG_INTR_NUM];
 	struct power_supply *bat;
 	struct mutex lock;
 	int status;
+	int max_volt;
 	struct delayed_work status_monitor;
 	struct dentry *debug_file;
 };
@@ -138,22 +131,14 @@ static enum power_supply_property fuel_gauge_props[] = {
 	POWER_SUPPLY_PROP_PRESENT,
 	POWER_SUPPLY_PROP_HEALTH,
 	POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN,
-	POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN,
 	POWER_SUPPLY_PROP_VOLTAGE_NOW,
 	POWER_SUPPLY_PROP_VOLTAGE_OCV,
 	POWER_SUPPLY_PROP_CURRENT_NOW,
 	POWER_SUPPLY_PROP_CAPACITY,
 	POWER_SUPPLY_PROP_CAPACITY_ALERT_MIN,
-	POWER_SUPPLY_PROP_TEMP,
-	POWER_SUPPLY_PROP_TEMP_MAX,
-	POWER_SUPPLY_PROP_TEMP_MIN,
-	POWER_SUPPLY_PROP_TEMP_ALERT_MIN,
-	POWER_SUPPLY_PROP_TEMP_ALERT_MAX,
 	POWER_SUPPLY_PROP_TECHNOLOGY,
 	POWER_SUPPLY_PROP_CHARGE_FULL,
 	POWER_SUPPLY_PROP_CHARGE_NOW,
-	POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN,
-	POWER_SUPPLY_PROP_MODEL_NAME,
 };
 
 static int fuel_gauge_reg_readb(struct axp288_fg_info *info, int reg)
@@ -417,102 +402,6 @@ current_read_fail:
 	return ret;
 }
 
-static int temp_to_adc(struct axp288_fg_info *info, int tval)
-{
-	int rntc = 0, i, ret, adc_val;
-	int rmin, rmax, tmin, tmax;
-	int tcsz = info->pdata->tcsz;
-
-	/* get the Rntc resitance value for this temp */
-	if (tval > info->pdata->thermistor_curve[0][1]) {
-		rntc = info->pdata->thermistor_curve[0][0];
-	} else if (tval <= info->pdata->thermistor_curve[tcsz-1][1]) {
-		rntc = info->pdata->thermistor_curve[tcsz-1][0];
-	} else {
-		for (i = 1; i < tcsz; i++) {
-			if (tval > info->pdata->thermistor_curve[i][1]) {
-				rmin = info->pdata->thermistor_curve[i-1][0];
-				rmax = info->pdata->thermistor_curve[i][0];
-				tmin = info->pdata->thermistor_curve[i-1][1];
-				tmax = info->pdata->thermistor_curve[i][1];
-				rntc = rmin + ((rmax - rmin) *
-					(tval - tmin) / (tmax - tmin));
-				break;
-			}
-		}
-	}
-
-	/* we need the current to calculate the proper adc voltage */
-	ret = fuel_gauge_reg_readb(info, AXP20X_ADC_RATE);
-	if (ret < 0) {
-		dev_err(&info->pdev->dev, "%s:read err:%d\n", __func__, ret);
-		ret = 0x30;
-	}
-
-	/*
-	 * temperature is proportional to NTS thermistor resistance
-	 * ADC_RATE[5-4] determines current, 00=20uA,01=40uA,10=60uA,11=80uA
-	 * [12-bit ADC VAL] = R_NTC(Ω) * current / 800
-	 */
-	adc_val = rntc * (20 + (20 * ((ret >> 4) & 0x3))) / 800;
-
-	return adc_val;
-}
-
-static int adc_to_temp(struct axp288_fg_info *info, int adc_val)
-{
-	int ret, r, i, tval = 0;
-	int rmin, rmax, tmin, tmax;
-	int tcsz = info->pdata->tcsz;
-
-	ret = fuel_gauge_reg_readb(info, AXP20X_ADC_RATE);
-	if (ret < 0) {
-		dev_err(&info->pdev->dev, "%s:read err:%d\n", __func__, ret);
-		ret = 0x30;
-	}
-
-	/*
-	 * temperature is proportional to NTS thermistor resistance
-	 * ADC_RATE[5-4] determines current, 00=20uA,01=40uA,10=60uA,11=80uA
-	 * R_NTC(Ω) = [12-bit ADC VAL] * 800 / current
-	 */
-	r = adc_val * 800 / (20 + (20 * ((ret >> 4) & 0x3)));
-
-	if (r < info->pdata->thermistor_curve[0][0]) {
-		tval = info->pdata->thermistor_curve[0][1];
-	} else if (r >= info->pdata->thermistor_curve[tcsz-1][0]) {
-		tval = info->pdata->thermistor_curve[tcsz-1][1];
-	} else {
-		for (i = 1; i < tcsz; i++) {
-			if (r < info->pdata->thermistor_curve[i][0]) {
-				rmin = info->pdata->thermistor_curve[i-1][0];
-				rmax = info->pdata->thermistor_curve[i][0];
-				tmin = info->pdata->thermistor_curve[i-1][1];
-				tmax = info->pdata->thermistor_curve[i][1];
-				tval = tmin + ((tmax - tmin) *
-					(r - rmin) / (rmax - rmin));
-				break;
-			}
-		}
-	}
-
-	return tval;
-}
-
-static int fuel_gauge_get_btemp(struct axp288_fg_info *info, int *btemp)
-{
-	int ret, raw_val = 0;
-
-	ret = pmic_read_adc_val("axp288-batt-temp", &raw_val, info);
-	if (ret < 0)
-		goto temp_read_fail;
-
-	*btemp = adc_to_temp(info, raw_val);
-
-temp_read_fail:
-	return ret;
-}
-
 static int fuel_gauge_get_vocv(struct axp288_fg_info *info, int *vocv)
 {
 	int ret, value;
@@ -535,25 +424,14 @@ vocv_read_fail:
 
 static int fuel_gauge_battery_health(struct axp288_fg_info *info)
 {
-	int temp, vocv;
-	int ret, health = POWER_SUPPLY_HEALTH_UNKNOWN;
-
-	ret = fuel_gauge_get_btemp(info, &temp);
-	if (ret < 0)
-		goto health_read_fail;
+	int ret, vocv, health = POWER_SUPPLY_HEALTH_UNKNOWN;
 
 	ret = fuel_gauge_get_vocv(info, &vocv);
 	if (ret < 0)
 		goto health_read_fail;
 
-	if (vocv > info->pdata->max_volt)
+	if (vocv > info->max_volt)
 		health = POWER_SUPPLY_HEALTH_OVERVOLTAGE;
-	else if (temp > info->pdata->max_temp)
-		health = POWER_SUPPLY_HEALTH_OVERHEAT;
-	else if (temp < info->pdata->min_temp)
-		health = POWER_SUPPLY_HEALTH_COLD;
-	else if (vocv < info->pdata->min_volt)
-		health = POWER_SUPPLY_HEALTH_DEAD;
 	else
 		health = POWER_SUPPLY_HEALTH_GOOD;
 
@@ -561,28 +439,6 @@ health_read_fail:
 	return health;
 }
 
-static int fuel_gauge_set_high_btemp_alert(struct axp288_fg_info *info)
-{
-	int ret, adc_val;
-
-	/* program temperature threshold as 1/16 ADC value */
-	adc_val = temp_to_adc(info, info->pdata->max_temp);
-	ret = fuel_gauge_reg_writeb(info, AXP20X_V_HTF_DISCHRG, adc_val >> 4);
-
-	return ret;
-}
-
-static int fuel_gauge_set_low_btemp_alert(struct axp288_fg_info *info)
-{
-	int ret, adc_val;
-
-	/* program temperature threshold as 1/16 ADC value */
-	adc_val = temp_to_adc(info, info->pdata->min_temp);
-	ret = fuel_gauge_reg_writeb(info, AXP20X_V_LTF_DISCHRG, adc_val >> 4);
-
-	return ret;
-}
-
 static int fuel_gauge_get_property(struct power_supply *ps,
 		enum power_supply_property prop,
 		union power_supply_propval *val)
@@ -643,20 +499,6 @@ static int fuel_gauge_get_property(struct power_supply *ps,
 			goto fuel_gauge_read_err;
 		val->intval = (ret & 0x0f);
 		break;
-	case POWER_SUPPLY_PROP_TEMP:
-		ret = fuel_gauge_get_btemp(info, &value);
-		if (ret < 0)
-			goto fuel_gauge_read_err;
-		val->intval = PROP_TEMP(value);
-		break;
-	case POWER_SUPPLY_PROP_TEMP_MAX:
-	case POWER_SUPPLY_PROP_TEMP_ALERT_MAX:
-		val->intval = PROP_TEMP(info->pdata->max_temp);
-		break;
-	case POWER_SUPPLY_PROP_TEMP_MIN:
-	case POWER_SUPPLY_PROP_TEMP_ALERT_MIN:
-		val->intval = PROP_TEMP(info->pdata->min_temp);
-		break;
 	case POWER_SUPPLY_PROP_TECHNOLOGY:
 		val->intval = POWER_SUPPLY_TECHNOLOGY_LION;
 		break;
@@ -684,17 +526,8 @@ static int fuel_gauge_get_property(struct power_supply *ps,
 		value |= (ret & FG_DES_CAP0_VAL_MASK);
 		val->intval = value * FG_DES_CAP_RES_LSB;
 		break;
-	case POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN:
-		val->intval = PROP_CURR(info->pdata->design_cap);
-		break;
 	case POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN:
-		val->intval = PROP_VOLT(info->pdata->max_volt);
-		break;
-	case POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN:
-		val->intval = PROP_VOLT(info->pdata->min_volt);
-		break;
-	case POWER_SUPPLY_PROP_MODEL_NAME:
-		val->strval = info->pdata->battid;
+		val->intval = PROP_VOLT(info->max_volt);
 		break;
 	default:
 		mutex_unlock(&info->lock);
@@ -718,35 +551,6 @@ static int fuel_gauge_set_property(struct power_supply *ps,
 
 	mutex_lock(&info->lock);
 	switch (prop) {
-	case POWER_SUPPLY_PROP_STATUS:
-		info->status = val->intval;
-		break;
-	case POWER_SUPPLY_PROP_TEMP_MIN:
-	case POWER_SUPPLY_PROP_TEMP_ALERT_MIN:
-		if ((val->intval < PD_DEF_MIN_TEMP) ||
-			(val->intval > PD_DEF_MAX_TEMP)) {
-			ret = -EINVAL;
-			break;
-		}
-		info->pdata->min_temp = UNPROP_TEMP(val->intval);
-		ret = fuel_gauge_set_low_btemp_alert(info);
-		if (ret < 0)
-			dev_err(&info->pdev->dev,
-				"temp alert min set fail:%d\n", ret);
-		break;
-	case POWER_SUPPLY_PROP_TEMP_MAX:
-	case POWER_SUPPLY_PROP_TEMP_ALERT_MAX:
-		if ((val->intval < PD_DEF_MIN_TEMP) ||
-			(val->intval > PD_DEF_MAX_TEMP)) {
-			ret = -EINVAL;
-			break;
-		}
-		info->pdata->max_temp = UNPROP_TEMP(val->intval);
-		ret = fuel_gauge_set_high_btemp_alert(info);
-		if (ret < 0)
-			dev_err(&info->pdev->dev,
-				"temp alert max set fail:%d\n", ret);
-		break;
 	case POWER_SUPPLY_PROP_CAPACITY_ALERT_MIN:
 		if ((val->intval < 0) || (val->intval > 15)) {
 			ret = -EINVAL;
@@ -774,11 +578,6 @@ static int fuel_gauge_property_is_writeable(struct power_supply *psy,
 	int ret;
 
 	switch (psp) {
-	case POWER_SUPPLY_PROP_STATUS:
-	case POWER_SUPPLY_PROP_TEMP_MIN:
-	case POWER_SUPPLY_PROP_TEMP_ALERT_MIN:
-	case POWER_SUPPLY_PROP_TEMP_MAX:
-	case POWER_SUPPLY_PROP_TEMP_ALERT_MAX:
 	case POWER_SUPPLY_PROP_CAPACITY_ALERT_MIN:
 		ret = 1;
 		break;
@@ -863,158 +662,6 @@ static const struct power_supply_desc fuel_gauge_desc = {
 	.external_power_changed	= fuel_gauge_external_power_changed,
 };
 
-static int fuel_gauge_set_lowbatt_thresholds(struct axp288_fg_info *info)
-{
-	int ret;
-	u8 reg_val;
-
-	ret = fuel_gauge_reg_readb(info, AXP20X_FG_RES);
-	if (ret < 0) {
-		dev_err(&info->pdev->dev, "%s:read err:%d\n", __func__, ret);
-		return ret;
-	}
-	ret = (ret & FG_REP_CAP_VAL_MASK);
-
-	if (ret > FG_LOW_CAP_WARN_THR)
-		reg_val = FG_LOW_CAP_WARN_THR;
-	else if (ret > FG_LOW_CAP_CRIT_THR)
-		reg_val = FG_LOW_CAP_CRIT_THR;
-	else
-		reg_val = FG_LOW_CAP_SHDN_THR;
-
-	reg_val |= FG_LOW_CAP_THR1_VAL;
-	ret = fuel_gauge_reg_writeb(info, AXP288_FG_LOW_CAP_REG, reg_val);
-	if (ret < 0)
-		dev_err(&info->pdev->dev, "%s:write err:%d\n", __func__, ret);
-
-	return ret;
-}
-
-static int fuel_gauge_program_vbatt_full(struct axp288_fg_info *info)
-{
-	int ret;
-	u8 val;
-
-	ret = fuel_gauge_reg_readb(info, AXP20X_CHRG_CTRL1);
-	if (ret < 0)
-		goto fg_prog_ocv_fail;
-	else
-		val = (ret & ~CHRG_CCCV_CV_MASK);
-
-	switch (info->pdata->max_volt) {
-	case CV_4100:
-		val |= (CHRG_CCCV_CV_4100MV << CHRG_CCCV_CV_BIT_POS);
-		break;
-	case CV_4150:
-		val |= (CHRG_CCCV_CV_4150MV << CHRG_CCCV_CV_BIT_POS);
-		break;
-	case CV_4200:
-		val |= (CHRG_CCCV_CV_4200MV << CHRG_CCCV_CV_BIT_POS);
-		break;
-	case CV_4350:
-		val |= (CHRG_CCCV_CV_4350MV << CHRG_CCCV_CV_BIT_POS);
-		break;
-	default:
-		val |= (CHRG_CCCV_CV_4200MV << CHRG_CCCV_CV_BIT_POS);
-		break;
-	}
-
-	ret = fuel_gauge_reg_writeb(info, AXP20X_CHRG_CTRL1, val);
-fg_prog_ocv_fail:
-	return ret;
-}
-
-static int fuel_gauge_program_design_cap(struct axp288_fg_info *info)
-{
-	int ret;
-
-	ret = fuel_gauge_reg_writeb(info,
-		AXP288_FG_DES_CAP1_REG, info->pdata->cap1);
-	if (ret < 0)
-		goto fg_prog_descap_fail;
-
-	ret = fuel_gauge_reg_writeb(info,
-		AXP288_FG_DES_CAP0_REG, info->pdata->cap0);
-
-fg_prog_descap_fail:
-	return ret;
-}
-
-static int fuel_gauge_program_ocv_curve(struct axp288_fg_info *info)
-{
-	int ret = 0, i;
-
-	for (i = 0; i < OCV_CURVE_SIZE; i++) {
-		ret = fuel_gauge_reg_writeb(info,
-			AXP288_FG_OCV_CURVE_REG + i, info->pdata->ocv_curve[i]);
-		if (ret < 0)
-			goto fg_prog_ocv_fail;
-	}
-
-fg_prog_ocv_fail:
-	return ret;
-}
-
-static int fuel_gauge_program_rdc_vals(struct axp288_fg_info *info)
-{
-	int ret;
-
-	ret = fuel_gauge_reg_writeb(info,
-		AXP288_FG_RDC1_REG, info->pdata->rdc1);
-	if (ret < 0)
-		goto fg_prog_ocv_fail;
-
-	ret = fuel_gauge_reg_writeb(info,
-		AXP288_FG_RDC0_REG, info->pdata->rdc0);
-
-fg_prog_ocv_fail:
-	return ret;
-}
-
-static void fuel_gauge_init_config_regs(struct axp288_fg_info *info)
-{
-	int ret;
-
-	/*
-	 * check if the config data is already
-	 * programmed and if so just return.
-	 */
-
-	ret = fuel_gauge_reg_readb(info, AXP288_FG_DES_CAP1_REG);
-	if (ret < 0) {
-		dev_warn(&info->pdev->dev, "CAP1 reg read err!!\n");
-	} else if (!(ret & FG_DES_CAP1_VALID)) {
-		dev_info(&info->pdev->dev, "FG data needs to be initialized\n");
-	} else {
-		dev_info(&info->pdev->dev, "FG data is already initialized\n");
-		return;
-	}
-
-	ret = fuel_gauge_program_vbatt_full(info);
-	if (ret < 0)
-		dev_err(&info->pdev->dev, "set vbatt full fail:%d\n", ret);
-
-	ret = fuel_gauge_program_design_cap(info);
-	if (ret < 0)
-		dev_err(&info->pdev->dev, "set design cap fail:%d\n", ret);
-
-	ret = fuel_gauge_program_rdc_vals(info);
-	if (ret < 0)
-		dev_err(&info->pdev->dev, "set rdc fail:%d\n", ret);
-
-	ret = fuel_gauge_program_ocv_curve(info);
-	if (ret < 0)
-		dev_err(&info->pdev->dev, "set ocv curve fail:%d\n", ret);
-
-	ret = fuel_gauge_set_lowbatt_thresholds(info);
-	if (ret < 0)
-		dev_err(&info->pdev->dev, "lowbatt thr set fail:%d\n", ret);
-
-	ret = fuel_gauge_reg_writeb(info, AXP20X_CC_CTRL, 0xef);
-	if (ret < 0)
-		dev_err(&info->pdev->dev, "gauge cntl set fail:%d\n", ret);
-}
-
 static void fuel_gauge_init_irq(struct axp288_fg_info *info)
 {
 	int ret, i, pirq;
@@ -1054,17 +701,8 @@ intr_failed:
 
 static void fuel_gauge_init_hw_regs(struct axp288_fg_info *info)
 {
-	int ret;
 	unsigned int val;
 
-	ret = fuel_gauge_set_high_btemp_alert(info);
-	if (ret < 0)
-		dev_err(&info->pdev->dev, "high batt temp set fail:%d\n", ret);
-
-	ret = fuel_gauge_set_low_btemp_alert(info);
-	if (ret < 0)
-		dev_err(&info->pdev->dev, "low batt temp set fail:%d\n", ret);
-
 	/* enable interrupts */
 	val = fuel_gauge_reg_readb(info, AXP20X_IRQ3_EN);
 	val |= TEMP_IRQ_CFG_MASK;
@@ -1090,15 +728,39 @@ static int axp288_fuel_gauge_probe(struct platform_device *pdev)
 	info->regmap = axp20x->regmap;
 	info->regmap_irqc = axp20x->regmap_irqc;
 	info->status = POWER_SUPPLY_STATUS_UNKNOWN;
-	info->pdata = pdev->dev.platform_data;
-	if (!info->pdata)
-		return -ENODEV;
 
 	platform_set_drvdata(pdev, info);
 
 	mutex_init(&info->lock);
 	INIT_DELAYED_WORK(&info->status_monitor, fuel_gauge_status_monitor);
 
+	ret = fuel_gauge_reg_readb(info, AXP288_FG_DES_CAP1_REG);
+	if (ret < 0)
+		return ret;
+
+	if (!(ret & FG_DES_CAP1_VALID)) {
+		dev_err(&pdev->dev, "axp288 not configured by firmware\n");
+		return -ENODEV;
+	}
+
+	ret = fuel_gauge_reg_readb(info, AXP20X_CHRG_CTRL1);
+	if (ret < 0)
+		return ret;
+	switch ((ret & CHRG_CCCV_CV_MASK) >> CHRG_CCCV_CV_BIT_POS) {
+	case CHRG_CCCV_CV_4100MV:
+		info->max_volt = 4100;
+		break;
+	case CHRG_CCCV_CV_4150MV:
+		info->max_volt = 4150;
+		break;
+	case CHRG_CCCV_CV_4200MV:
+		info->max_volt = 4200;
+		break;
+	case CHRG_CCCV_CV_4350MV:
+		info->max_volt = 4350;
+		break;
+	}
+
 	psy_cfg.drv_data = info;
 	info->bat = power_supply_register(&pdev->dev, &fuel_gauge_desc, &psy_cfg);
 	if (IS_ERR(info->bat)) {
@@ -1108,12 +770,11 @@ static int axp288_fuel_gauge_probe(struct platform_device *pdev)
 	}
 
 	fuel_gauge_create_debugfs(info);
-	fuel_gauge_init_config_regs(info);
 	fuel_gauge_init_irq(info);
 	fuel_gauge_init_hw_regs(info);
 	schedule_delayed_work(&info->status_monitor, STATUS_MON_DELAY_JIFFIES);
 
-	return ret;
+	return 0;
 }
 
 static const struct platform_device_id axp288_fg_id_table[] = {
diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h
index e460fc42d63e..812806d6319b 100644
--- a/include/linux/mfd/axp20x.h
+++ b/include/linux/mfd/axp20x.h
@@ -532,28 +532,6 @@ struct axp20x_dev {
 	const struct regmap_irq_chip	*regmap_irq_chip;
 };
 
-#define BATTID_LEN				64
-#define OCV_CURVE_SIZE			32
-#define MAX_THERM_CURVE_SIZE	25
-#define PD_DEF_MIN_TEMP			0
-#define PD_DEF_MAX_TEMP			55
-
-struct axp20x_fg_pdata {
-	char battid[BATTID_LEN + 1];
-	int design_cap;
-	int min_volt;
-	int max_volt;
-	int max_temp;
-	int min_temp;
-	int cap1;
-	int cap0;
-	int rdc1;
-	int rdc0;
-	int ocv_curve[OCV_CURVE_SIZE];
-	int tcsz;
-	int thermistor_curve[MAX_THERM_CURVE_SIZE][2];
-};
-
 struct axp288_extcon_pdata {
 	/* GPIO pin control to switch D+/D- lines b/w PMIC and SOC */
 	struct gpio_desc *gpio_mux_cntl;
-- 
cgit v1.2.3


From 5952758101fb55844957a8d4fe88402d9827cfb4 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Wed, 4 Jan 2017 19:56:24 +0100
Subject: dsa: mv88e6xxx: Optimise atu_get

Lookup in the ATU can be performed starting from a given MAC
address. This is faster than starting with the first possible MAC
address and iterating all entries.

Entries are returned in numeric order. So if the MAC address returned
is bigger than what we are searching for, we know it is not in the
ATU.

Using the benchmark provided by Volodymyr Bendiuga
<volodymyr.bendiuga@gmail.com>,

https://www.spinics.net/lists/netdev/msg411550.html

on an Marvell Armada 370 RD, the test to add a number of static fdb
entries went from 1.616531 seconds to 0.312052 seconds.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/chip.c |  5 ++--
 include/linux/etherdevice.h      | 60 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index b5f0e1ec864d..676b0e2ad221 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -2023,7 +2023,8 @@ static int mv88e6xxx_atu_get(struct mv88e6xxx_chip *chip, int fid,
 	struct mv88e6xxx_atu_entry next;
 	int err;
 
-	eth_broadcast_addr(next.mac);
+	memcpy(next.mac, addr, ETH_ALEN);
+	eth_addr_dec(next.mac);
 
 	err = _mv88e6xxx_atu_mac_write(chip, next.mac);
 	if (err)
@@ -2041,7 +2042,7 @@ static int mv88e6xxx_atu_get(struct mv88e6xxx_chip *chip, int fid,
 			*entry = next;
 			return 0;
 		}
-	} while (!is_broadcast_ether_addr(next.mac));
+	} while (ether_addr_greater(addr, next.mac));
 
 	memset(entry, 0, sizeof(*entry));
 	entry->fid = fid;
diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 6fec9e81bd70..42add77ae47d 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -396,6 +396,66 @@ static inline bool ether_addr_equal_masked(const u8 *addr1, const u8 *addr2,
 	return true;
 }
 
+/**
+ * ether_addr_to_u64 - Convert an Ethernet address into a u64 value.
+ * @addr: Pointer to a six-byte array containing the Ethernet address
+ *
+ * Return a u64 value of the address
+ */
+static inline u64 ether_addr_to_u64(const u8 *addr)
+{
+	u64 u = 0;
+	int i;
+
+	for (i = 0; i < ETH_ALEN; i++)
+		u = u << 8 | addr[i];
+
+	return u;
+}
+
+/**
+ * u64_to_ether_addr - Convert a u64 to an Ethernet address.
+ * @u: u64 to convert to an Ethernet MAC address
+ * @addr: Pointer to a six-byte array to contain the Ethernet address
+ */
+static inline void u64_to_ether_addr(u64 u, u8 *addr)
+{
+	int i;
+
+	for (i = ETH_ALEN - 1; i >= 0; i--) {
+		addr[i] = u & 0xff;
+		u = u >> 8;
+	}
+}
+
+/**
+ * eth_addr_dec - Decrement the given MAC address
+ *
+ * @addr: Pointer to a six-byte array containing Ethernet address to decrement
+ */
+static inline void eth_addr_dec(u8 *addr)
+{
+	u64 u = ether_addr_to_u64(addr);
+
+	u--;
+	u64_to_ether_addr(u, addr);
+}
+
+/**
+ * ether_addr_greater - Compare two Ethernet addresses
+ * @addr1: Pointer to a six-byte array containing the Ethernet address
+ * @addr2: Pointer other six-byte array containing the Ethernet address
+ *
+ * Compare two Ethernet addresses, returns true addr1 is greater than addr2
+ */
+static inline bool ether_addr_greater(const u8 *addr1, const u8 *addr2)
+{
+	u64 u1 = ether_addr_to_u64(addr1);
+	u64 u2 = ether_addr_to_u64(addr2);
+
+	return u1 > u2;
+}
+
 /**
  * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
  * @dev: Pointer to a device structure
-- 
cgit v1.2.3


From 3db5e3e707ebb9ab0ce3a2dcd924ed2ea525d770 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 5 Jan 2017 13:37:28 +0100
Subject: wireless: move IEEE80211_NUM_ACS to ieee80211.h

This constant isn't really specific to mac80211, so move it
"up" a level to ieee80211.h

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 2 ++
 include/net/mac80211.h    | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index fe849329511a..87d1937e4671 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -185,6 +185,8 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2)
 
 /* number of user priorities 802.11 uses */
 #define IEEE80211_NUM_UPS		8
+/* number of ACs */
+#define IEEE80211_NUM_ACS		4
 
 #define IEEE80211_QOS_CTL_LEN		2
 /* 1d tag mask */
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 5345d358a510..5f5cb194cd78 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -147,7 +147,6 @@ enum ieee80211_ac_numbers {
 	IEEE80211_AC_BE		= 2,
 	IEEE80211_AC_BK		= 3,
 };
-#define IEEE80211_NUM_ACS	4
 
 /**
  * struct ieee80211_tx_queue_params - transmit queue configuration
-- 
cgit v1.2.3


From 0e377f3b9ae936aefe5aaca4c2e2546d57b63df7 Mon Sep 17 00:00:00 2001
From: Song Hongyan <hongyan.song@intel.com>
Date: Thu, 5 Jan 2017 18:24:04 +0800
Subject: iio: Add gravity sensor support

Gravity sensor is a soft sensor, which derives value from
standard accelerometer device by filtering out the acceleration
which is not caused by gravity.

Gravity sensor provides a three dimensional vector indicating
the direction and magnitude of gravity. Typically, this sensor
is used to determine the device's relative orientation in space.
The units and the coordinate system is the same as the one used by
the acceleration sensor.
When a device is at rest, the output of the gravity sensor should
be identical to that of the accelerometer.

More information can be found in:
http://www.usb.org/developers/hidpage/HUTRR59_-_Usages_for_Wearables.pdf

Gravity sensor and accelerometer have similar channels and
share channel usage ids. So the most of the code for accel_3d
can be reused.

Signed-off-by: Song Hongyan <hongyan.song@intel.com>
Acked-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 drivers/iio/accel/hid-sensor-accel-3d.c | 74 +++++++++++++++++++++++++++------
 include/linux/hid-sensor-ids.h          |  3 ++
 2 files changed, 64 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/accel/hid-sensor-accel-3d.c b/drivers/iio/accel/hid-sensor-accel-3d.c
index 39de79ce8778..ca5759c0c318 100644
--- a/drivers/iio/accel/hid-sensor-accel-3d.c
+++ b/drivers/iio/accel/hid-sensor-accel-3d.c
@@ -93,6 +93,41 @@ static const struct iio_chan_spec accel_3d_channels[] = {
 	IIO_CHAN_SOFT_TIMESTAMP(3)
 };
 
+/* Channel definitions */
+static const struct iio_chan_spec gravity_channels[] = {
+	{
+		.type = IIO_GRAVITY,
+		.modified = 1,
+		.channel2 = IIO_MOD_X,
+		.info_mask_separate = BIT(IIO_CHAN_INFO_RAW),
+		.info_mask_shared_by_type = BIT(IIO_CHAN_INFO_OFFSET) |
+		BIT(IIO_CHAN_INFO_SCALE) |
+		BIT(IIO_CHAN_INFO_SAMP_FREQ) |
+		BIT(IIO_CHAN_INFO_HYSTERESIS),
+		.scan_index = CHANNEL_SCAN_INDEX_X,
+	}, {
+		.type = IIO_GRAVITY,
+		.modified = 1,
+		.channel2 = IIO_MOD_Y,
+		.info_mask_separate = BIT(IIO_CHAN_INFO_RAW),
+		.info_mask_shared_by_type = BIT(IIO_CHAN_INFO_OFFSET) |
+		BIT(IIO_CHAN_INFO_SCALE) |
+		BIT(IIO_CHAN_INFO_SAMP_FREQ) |
+		BIT(IIO_CHAN_INFO_HYSTERESIS),
+		.scan_index = CHANNEL_SCAN_INDEX_Y,
+	}, {
+		.type = IIO_GRAVITY,
+		.modified = 1,
+		.channel2 = IIO_MOD_Z,
+		.info_mask_separate = BIT(IIO_CHAN_INFO_RAW),
+		.info_mask_shared_by_type = BIT(IIO_CHAN_INFO_OFFSET) |
+		BIT(IIO_CHAN_INFO_SCALE) |
+		BIT(IIO_CHAN_INFO_SAMP_FREQ) |
+		BIT(IIO_CHAN_INFO_HYSTERESIS),
+		.scan_index = CHANNEL_SCAN_INDEX_Z,
+	}
+};
+
 /* Adjust channel real bits based on report descriptor */
 static void accel_3d_adjust_channel_bit_mask(struct iio_chan_spec *channels,
 						int channel, int size)
@@ -114,6 +149,8 @@ static int accel_3d_read_raw(struct iio_dev *indio_dev,
 	int report_id = -1;
 	u32 address;
 	int ret_type;
+	struct hid_sensor_hub_device *hsdev =
+					accel_state->common_attributes.hsdev;
 
 	*val = 0;
 	*val2 = 0;
@@ -125,8 +162,7 @@ static int accel_3d_read_raw(struct iio_dev *indio_dev,
 		if (report_id >= 0)
 			*val = sensor_hub_input_attr_get_raw_value(
 					accel_state->common_attributes.hsdev,
-					HID_USAGE_SENSOR_ACCEL_3D, address,
-					report_id,
+					hsdev->usage, address, report_id,
 					SENSOR_HUB_SYNC);
 		else {
 			*val = 0;
@@ -288,7 +324,7 @@ static int accel_3d_parse_report(struct platform_device *pdev,
 			st->accel[2].index, st->accel[2].report_id);
 
 	st->scale_precision = hid_sensor_format_scale(
-				HID_USAGE_SENSOR_ACCEL_3D,
+				hsdev->usage,
 				&st->accel[CHANNEL_SCAN_INDEX_X],
 				&st->scale_pre_decml, &st->scale_post_decml);
 
@@ -311,9 +347,12 @@ static int accel_3d_parse_report(struct platform_device *pdev,
 static int hid_accel_3d_probe(struct platform_device *pdev)
 {
 	int ret = 0;
-	static const char *name = "accel_3d";
+	static const char *name;
 	struct iio_dev *indio_dev;
 	struct accel_3d_state *accel_state;
+	const struct iio_chan_spec *channel_spec;
+	int channel_size;
+
 	struct hid_sensor_hub_device *hsdev = pdev->dev.platform_data;
 
 	indio_dev = devm_iio_device_alloc(&pdev->dev,
@@ -327,24 +366,30 @@ static int hid_accel_3d_probe(struct platform_device *pdev)
 	accel_state->common_attributes.hsdev = hsdev;
 	accel_state->common_attributes.pdev = pdev;
 
-	ret = hid_sensor_parse_common_attributes(hsdev,
-					HID_USAGE_SENSOR_ACCEL_3D,
+	if (hsdev->usage == HID_USAGE_SENSOR_ACCEL_3D) {
+		name = "accel_3d";
+		channel_spec = accel_3d_channels;
+		channel_size = sizeof(accel_3d_channels);
+	} else {
+		name = "gravity";
+		channel_spec = gravity_channels;
+		channel_size = sizeof(gravity_channels);
+	}
+	ret = hid_sensor_parse_common_attributes(hsdev, hsdev->usage,
 					&accel_state->common_attributes);
 	if (ret) {
 		dev_err(&pdev->dev, "failed to setup common attributes\n");
 		return ret;
 	}
+	indio_dev->channels = kmemdup(channel_spec, channel_size, GFP_KERNEL);
 
-	indio_dev->channels = kmemdup(accel_3d_channels,
-				      sizeof(accel_3d_channels), GFP_KERNEL);
 	if (!indio_dev->channels) {
 		dev_err(&pdev->dev, "failed to duplicate channels\n");
 		return -ENOMEM;
 	}
-
 	ret = accel_3d_parse_report(pdev, hsdev,
-				    (struct iio_chan_spec *)indio_dev->channels,
-				    HID_USAGE_SENSOR_ACCEL_3D, accel_state);
+				(struct iio_chan_spec *)indio_dev->channels,
+				hsdev->usage, accel_state);
 	if (ret) {
 		dev_err(&pdev->dev, "failed to setup attributes\n");
 		goto error_free_dev_mem;
@@ -379,7 +424,7 @@ static int hid_accel_3d_probe(struct platform_device *pdev)
 	accel_state->callbacks.send_event = accel_3d_proc_event;
 	accel_state->callbacks.capture_sample = accel_3d_capture_sample;
 	accel_state->callbacks.pdev = pdev;
-	ret = sensor_hub_register_callback(hsdev, HID_USAGE_SENSOR_ACCEL_3D,
+	ret = sensor_hub_register_callback(hsdev, hsdev->usage,
 					&accel_state->callbacks);
 	if (ret < 0) {
 		dev_err(&pdev->dev, "callback reg failed\n");
@@ -406,7 +451,7 @@ static int hid_accel_3d_remove(struct platform_device *pdev)
 	struct iio_dev *indio_dev = platform_get_drvdata(pdev);
 	struct accel_3d_state *accel_state = iio_priv(indio_dev);
 
-	sensor_hub_remove_callback(hsdev, HID_USAGE_SENSOR_ACCEL_3D);
+	sensor_hub_remove_callback(hsdev, hsdev->usage);
 	iio_device_unregister(indio_dev);
 	hid_sensor_remove_trigger(&accel_state->common_attributes);
 	iio_triggered_buffer_cleanup(indio_dev);
@@ -420,6 +465,9 @@ static const struct platform_device_id hid_accel_3d_ids[] = {
 		/* Format: HID-SENSOR-usage_id_in_hex_lowercase */
 		.name = "HID-SENSOR-200073",
 	},
+	{	/* gravity sensor */
+		.name = "HID-SENSOR-20007b",
+	},
 	{ /* sentinel */ }
 };
 MODULE_DEVICE_TABLE(platform, hid_accel_3d_ids);
diff --git a/include/linux/hid-sensor-ids.h b/include/linux/hid-sensor-ids.h
index 9a3a9db1b39d..30c7dc45e45f 100644
--- a/include/linux/hid-sensor-ids.h
+++ b/include/linux/hid-sensor-ids.h
@@ -52,6 +52,9 @@
 #define HID_USAGE_SENSOR_ANGL_VELOCITY_Y_AXIS			0x200458
 #define HID_USAGE_SENSOR_ANGL_VELOCITY_Z_AXIS			0x200459
 
+/* Gravity vector */
+#define HID_USAGE_SENSOR_GRAVITY_VECTOR				0x20007B
+
 /* ORIENTATION: Compass 3D: (200083) */
 #define HID_USAGE_SENSOR_COMPASS_3D				0x200083
 #define HID_USAGE_SENSOR_DATA_ORIENTATION			0x200470
-- 
cgit v1.2.3


From 210af2a5f12403a8968d6014742886cc7e9823b4 Mon Sep 17 00:00:00 2001
From: Corey Minyard <cminyard@mvista.com>
Date: Thu, 5 Jan 2017 10:52:10 -0600
Subject: ipmi: make ipmi_usr_hndl const

It's only function pointers.

Signed-off-by: Corey Minyard <cminyard@mvista.com>
---
 drivers/char/ipmi/ipmi_devintf.c    | 2 +-
 drivers/char/ipmi/ipmi_msghandler.c | 4 ++--
 drivers/char/ipmi/ipmi_watchdog.c   | 2 +-
 include/linux/ipmi.h                | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/ipmi/ipmi_devintf.c b/drivers/char/ipmi/ipmi_devintf.c
index a21407de46ae..f45119c5337d 100644
--- a/drivers/char/ipmi/ipmi_devintf.c
+++ b/drivers/char/ipmi/ipmi_devintf.c
@@ -108,7 +108,7 @@ static int ipmi_fasync(int fd, struct file *file, int on)
 	return (result);
 }
 
-static struct ipmi_user_hndl ipmi_hndlrs =
+static const struct ipmi_user_hndl ipmi_hndlrs =
 {
 	.ipmi_recv_hndl	= file_receive_handler,
 };
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index 92e53acf2cd2..9f699951b75a 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -102,7 +102,7 @@ struct ipmi_user {
 	struct kref refcount;
 
 	/* The upper layer that handles receive messages. */
-	struct ipmi_user_hndl *handler;
+	const struct ipmi_user_hndl *handler;
 	void             *handler_data;
 
 	/* The interface this user is bound to. */
@@ -919,7 +919,7 @@ static int intf_err_seq(ipmi_smi_t   intf,
 
 
 int ipmi_create_user(unsigned int          if_num,
-		     struct ipmi_user_hndl *handler,
+		     const struct ipmi_user_hndl *handler,
 		     void                  *handler_data,
 		     ipmi_user_t           *user)
 {
diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c
index 4035495f3a86..30b9e83bf1bf 100644
--- a/drivers/char/ipmi/ipmi_watchdog.c
+++ b/drivers/char/ipmi/ipmi_watchdog.c
@@ -985,7 +985,7 @@ static void ipmi_wdog_pretimeout_handler(void *handler_data)
 	pretimeout_since_last_heartbeat = 1;
 }
 
-static struct ipmi_user_hndl ipmi_hndlrs = {
+static const struct ipmi_user_hndl ipmi_hndlrs = {
 	.ipmi_recv_hndl           = ipmi_wdog_msg_handler,
 	.ipmi_watchdog_pretimeout = ipmi_wdog_pretimeout_handler
 };
diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h
index 78c5d5ae3857..f1045b2c6a00 100644
--- a/include/linux/ipmi.h
+++ b/include/linux/ipmi.h
@@ -100,7 +100,7 @@ struct ipmi_user_hndl {
 
 /* Create a new user of the IPMI layer on the given interface number. */
 int ipmi_create_user(unsigned int          if_num,
-		     struct ipmi_user_hndl *handler,
+		     const struct ipmi_user_hndl *handler,
 		     void                  *handler_data,
 		     ipmi_user_t           *user);
 
-- 
cgit v1.2.3


From 4e552c8cb5bc9137e67e035bab8df6dddbca7384 Mon Sep 17 00:00:00 2001
From: Andi Shyti <andi.shyti@samsung.com>
Date: Thu, 5 Jan 2017 11:34:12 +0900
Subject: leds: add LED_ON brightness as boolean value

Some devices do not handle the led brightness or simply don't
care about it. Conceptually said devices want to just switch on
or off the led. It is useless in this case to have a 255 range
of brightness, while just having an LED_ON and LED_OFF improves
the boolean meaning of the led status.

Signed-off-by: Andi Shyti <andi.shyti@samsung.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Jacek Anaszewski <jacek.anaszewski@gmail.com>
---
 include/linux/leds.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/leds.h b/include/linux/leds.h
index 569cb531094c..bb50d0151e75 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -27,6 +27,7 @@ struct device;
 
 enum led_brightness {
 	LED_OFF		= 0,
+	LED_ON		= 1,
 	LED_HALF	= 127,
 	LED_FULL	= 255,
 };
-- 
cgit v1.2.3


From eca90a3b3226fcecb4b3bbf4b0b6ff72422674bc Mon Sep 17 00:00:00 2001
From: Alexander Alemayhu <alexander@alemayhu.com>
Date: Thu, 5 Jan 2017 22:11:50 +0100
Subject: EDAC: Fix typos in enum mem_type comments

s/labed/labeled/
s/differenciate/differentiate/

Signed-off-by: Alexander Alemayhu <alexander@alemayhu.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/20170105211150.24003-1-alexander@alemayhu.com
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 include/linux/edac.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/edac.h b/include/linux/edac.h
index 07c52c0af62d..5b6adf964248 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -190,8 +190,8 @@ static inline char *mc_event_error_type(const unsigned int err_type)
  *			part of the memory details to the memory controller.
  * @MEM_RMBS:		Rambus DRAM, used on a few Pentium III/IV controllers.
  * @MEM_DDR2:		DDR2 RAM, as described at JEDEC JESD79-2F.
- *			Those memories are labed as "PC2-" instead of "PC" to
- *			differenciate from DDR.
+ *			Those memories are labeled as "PC2-" instead of "PC" to
+ *			differentiate from DDR.
  * @MEM_FB_DDR2:	Fully-Buffered DDR2, as described at JEDEC Std No. 205
  *			and JESD206.
  *			Those memories are accessed per DIMM slot, and not by
-- 
cgit v1.2.3


From 69d707d034b6078f0b5998f80e5883c8243b205c Mon Sep 17 00:00:00 2001
From: "Karicheri, Muralidharan" <m-karicheri2@ti.com>
Date: Fri, 6 Jan 2017 15:37:39 -0500
Subject: net: netcp: extract eflag from desc for rx_hook handling

Extract the eflag bits from the received desc and pass it down
the rx_hook chain to be available for netcp modules. Also the
psdata and epib data has to be inspected by the netcp modules.
So the desc can be freed only after returning from the rx_hook.
So move knav_pool_desc_put() after the rx_hook processing.

Signed-off-by: Murali Karicheri <m-karicheri2@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/netcp.h      |  1 +
 drivers/net/ethernet/ti/netcp_core.c | 20 +++++++++++++++++---
 include/linux/soc/ti/knav_dma.h      |  2 ++
 3 files changed, 20 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/ti/netcp.h b/drivers/net/ethernet/ti/netcp.h
index 0f58c584ae09..a92abd62ec60 100644
--- a/drivers/net/ethernet/ti/netcp.h
+++ b/drivers/net/ethernet/ti/netcp.h
@@ -115,6 +115,7 @@ struct netcp_packet {
 	struct sk_buff		*skb;
 	__le32			*epib;
 	u32			*psdata;
+	u32			eflags;
 	unsigned int		psdata_len;
 	struct netcp_intf	*netcp;
 	struct netcp_tx_pipe	*tx_pipe;
diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c
index c243335ed649..a136c56e0aff 100644
--- a/drivers/net/ethernet/ti/netcp_core.c
+++ b/drivers/net/ethernet/ti/netcp_core.c
@@ -122,6 +122,13 @@ static void get_pkt_info(dma_addr_t *buff, u32 *buff_len, dma_addr_t *ndesc,
 	*ndesc = le32_to_cpu(desc->next_desc);
 }
 
+static void get_desc_info(u32 *desc_info, u32 *pkt_info,
+			  struct knav_dma_desc *desc)
+{
+	*desc_info = le32_to_cpu(desc->desc_info);
+	*pkt_info = le32_to_cpu(desc->packet_info);
+}
+
 static u32 get_sw_data(int index, struct knav_dma_desc *desc)
 {
 	/* No Endian conversion needed as this data is untouched by hw */
@@ -653,6 +660,7 @@ static int netcp_process_one_rx_packet(struct netcp_intf *netcp)
 	struct netcp_packet p_info;
 	struct sk_buff *skb;
 	void *org_buf_ptr;
+	u32 tmp;
 
 	dma_desc = knav_queue_pop(netcp->rx_queue, &dma_sz);
 	if (!dma_desc)
@@ -724,9 +732,6 @@ static int netcp_process_one_rx_packet(struct netcp_intf *netcp)
 		knav_pool_desc_put(netcp->rx_pool, ndesc);
 	}
 
-	/* Free the primary descriptor */
-	knav_pool_desc_put(netcp->rx_pool, desc);
-
 	/* check for packet len and warn */
 	if (unlikely(pkt_sz != accum_sz))
 		dev_dbg(netcp->ndev_dev, "mismatch in packet size(%d) & sum of fragments(%d)\n",
@@ -739,6 +744,11 @@ static int netcp_process_one_rx_packet(struct netcp_intf *netcp)
 	p_info.skb = skb;
 	skb->dev = netcp->ndev;
 	p_info.rxtstamp_complete = false;
+	get_desc_info(&tmp, &p_info.eflags, desc);
+	p_info.epib = desc->epib;
+	p_info.psdata = (u32 __force *)desc->psdata;
+	p_info.eflags = ((p_info.eflags >> KNAV_DMA_DESC_EFLAGS_SHIFT) &
+			 KNAV_DMA_DESC_EFLAGS_MASK);
 	list_for_each_entry(rx_hook, &netcp->rxhook_list_head, list) {
 		int ret;
 
@@ -748,10 +758,14 @@ static int netcp_process_one_rx_packet(struct netcp_intf *netcp)
 			dev_err(netcp->ndev_dev, "RX hook %d failed: %d\n",
 				rx_hook->order, ret);
 			netcp->ndev->stats.rx_errors++;
+			/* Free the primary descriptor */
+			knav_pool_desc_put(netcp->rx_pool, desc);
 			dev_kfree_skb(skb);
 			return 0;
 		}
 	}
+	/* Free the primary descriptor */
+	knav_pool_desc_put(netcp->rx_pool, desc);
 
 	netcp->ndev->stats.rx_packets++;
 	netcp->ndev->stats.rx_bytes += skb->len;
diff --git a/include/linux/soc/ti/knav_dma.h b/include/linux/soc/ti/knav_dma.h
index 35cb9264e0d5..2b7882666ef6 100644
--- a/include/linux/soc/ti/knav_dma.h
+++ b/include/linux/soc/ti/knav_dma.h
@@ -41,6 +41,8 @@
 #define KNAV_DMA_DESC_RETQ_SHIFT		0
 #define KNAV_DMA_DESC_RETQ_MASK			MASK(14)
 #define KNAV_DMA_DESC_BUF_LEN_MASK		MASK(22)
+#define KNAV_DMA_DESC_EFLAGS_MASK		MASK(4)
+#define KNAV_DMA_DESC_EFLAGS_SHIFT		20
 
 #define KNAV_DMA_NUM_EPIB_WORDS			4
 #define KNAV_DMA_NUM_PS_WORDS			16
-- 
cgit v1.2.3


From f099d616dd689d27b08dec95d0b6f1f302cf8ec4 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Thu, 5 Jan 2017 12:01:30 -0800
Subject: fscrypt: remove unused 'mode' member of fscrypt_ctx

Nothing reads or writes fscrypt_ctx.mode, and it doesn't belong there
because a fscrypt_ctx is not tied to a specific encryption mode.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 include/linux/fscrypto.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h
index 2a2815702095..8635ea46ef6e 100644
--- a/include/linux/fscrypto.h
+++ b/include/linux/fscrypto.h
@@ -35,7 +35,6 @@ struct fscrypt_ctx {
 		struct list_head free_list;	/* Free list */
 	};
 	u8 flags;				/* Flags */
-	u8 mode;				/* Encryption mode for tfm */
 };
 
 /**
-- 
cgit v1.2.3


From a5d431eff2e0bb22156897435aa277ddc96074f7 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Thu, 5 Jan 2017 13:51:18 -0800
Subject: fscrypt: make fscrypt_operations.key_prefix a string

There was an unnecessary amount of complexity around requesting the
filesystem-specific key prefix.  It was unclear why; perhaps it was
envisioned that different instances of the same filesystem type could
use different key prefixes, or that key prefixes could be binary.
However, neither of those things were implemented or really make sense
at all.  So simplify the code by making key_prefix a const char *.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Richard Weinberger <richard@nod.at>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/crypto/keyinfo.c      | 31 +++++++++++--------------------
 fs/ext4/ext4.h           | 11 -----------
 fs/ext4/super.c          | 13 +------------
 fs/f2fs/f2fs.h           |  9 ---------
 fs/f2fs/super.c          | 14 +-------------
 fs/ubifs/crypto.c        | 11 +----------
 include/linux/fscrypto.h |  2 +-
 7 files changed, 15 insertions(+), 76 deletions(-)

(limited to 'include/linux')

diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 80f145c8d550..eeb6fd67ea17 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -77,26 +77,22 @@ out:
 
 static int validate_user_key(struct fscrypt_info *crypt_info,
 			struct fscrypt_context *ctx, u8 *raw_key,
-			u8 *prefix, int prefix_size)
+			const char *prefix)
 {
-	u8 *full_key_descriptor;
+	char *description;
 	struct key *keyring_key;
 	struct fscrypt_key *master_key;
 	const struct user_key_payload *ukp;
-	int full_key_len = prefix_size + (FS_KEY_DESCRIPTOR_SIZE * 2) + 1;
 	int res;
 
-	full_key_descriptor = kmalloc(full_key_len, GFP_NOFS);
-	if (!full_key_descriptor)
+	description = kasprintf(GFP_NOFS, "%s%*phN", prefix,
+				FS_KEY_DESCRIPTOR_SIZE,
+				ctx->master_key_descriptor);
+	if (!description)
 		return -ENOMEM;
 
-	memcpy(full_key_descriptor, prefix, prefix_size);
-	sprintf(full_key_descriptor + prefix_size,
-			"%*phN", FS_KEY_DESCRIPTOR_SIZE,
-			ctx->master_key_descriptor);
-	full_key_descriptor[full_key_len - 1] = '\0';
-	keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL);
-	kfree(full_key_descriptor);
+	keyring_key = request_key(&key_type_logon, description, NULL);
+	kfree(description);
 	if (IS_ERR(keyring_key))
 		return PTR_ERR(keyring_key);
 
@@ -251,15 +247,10 @@ retry:
 	if (!raw_key)
 		goto out;
 
-	res = validate_user_key(crypt_info, &ctx, raw_key,
-			FS_KEY_DESC_PREFIX, FS_KEY_DESC_PREFIX_SIZE);
+	res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX);
 	if (res && inode->i_sb->s_cop->key_prefix) {
-		u8 *prefix = NULL;
-		int prefix_size, res2;
-
-		prefix_size = inode->i_sb->s_cop->key_prefix(inode, &prefix);
-		res2 = validate_user_key(crypt_info, &ctx, raw_key,
-							prefix, prefix_size);
+		int res2 = validate_user_key(crypt_info, &ctx, raw_key,
+					     inode->i_sb->s_cop->key_prefix);
 		if (res2) {
 			if (res2 == -ENOKEY)
 				res = -ENOKEY;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2163c1e69f2a..6bcb9622fdf9 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1343,11 +1343,6 @@ struct ext4_super_block {
 /* Number of quota types we support */
 #define EXT4_MAXQUOTAS 3
 
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-#define EXT4_KEY_DESC_PREFIX "ext4:"
-#define EXT4_KEY_DESC_PREFIX_SIZE 5
-#endif
-
 /*
  * fourth extended-fs super-block data in memory
  */
@@ -1517,12 +1512,6 @@ struct ext4_sb_info {
 
 	/* Barrier between changing inodes' journal flags and writepages ops. */
 	struct percpu_rw_semaphore s_journal_flag_rwsem;
-
-	/* Encryption support */
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-	u8 key_prefix[EXT4_KEY_DESC_PREFIX_SIZE];
-	u8 key_prefix_size;
-#endif
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 66845a08a87a..9d15a6293124 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1100,12 +1100,6 @@ static int ext4_get_context(struct inode *inode, void *ctx, size_t len)
 				 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len);
 }
 
-static int ext4_key_prefix(struct inode *inode, u8 **key)
-{
-	*key = EXT4_SB(inode->i_sb)->key_prefix;
-	return EXT4_SB(inode->i_sb)->key_prefix_size;
-}
-
 static int ext4_prepare_context(struct inode *inode)
 {
 	return ext4_convert_inline_data(inode);
@@ -1180,8 +1174,8 @@ static unsigned ext4_max_namelen(struct inode *inode)
 }
 
 static struct fscrypt_operations ext4_cryptops = {
+	.key_prefix		= "ext4:",
 	.get_context		= ext4_get_context,
-	.key_prefix		= ext4_key_prefix,
 	.prepare_context	= ext4_prepare_context,
 	.set_context		= ext4_set_context,
 	.dummy_context		= ext4_dummy_context,
@@ -4218,11 +4212,6 @@ no_journal:
 	ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
 
 	kfree(orig_data);
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-	memcpy(sbi->key_prefix, EXT4_KEY_DESC_PREFIX,
-				EXT4_KEY_DESC_PREFIX_SIZE);
-	sbi->key_prefix_size = EXT4_KEY_DESC_PREFIX_SIZE;
-#endif
 	return 0;
 
 cantfind_ext4:
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 2da8c3aa0ce5..93d38d854a41 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -760,10 +760,6 @@ enum {
 	MAX_TIME,
 };
 
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
-#define F2FS_KEY_DESC_PREFIX "f2fs:"
-#define F2FS_KEY_DESC_PREFIX_SIZE 5
-#endif
 struct f2fs_sb_info {
 	struct super_block *sb;			/* pointer to VFS super block */
 	struct proc_dir_entry *s_proc;		/* proc entry */
@@ -771,11 +767,6 @@ struct f2fs_sb_info {
 	int valid_super_block;			/* valid super block no */
 	unsigned long s_flag;				/* flags for sbi */
 
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
-	u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE];
-	u8 key_prefix_size;
-#endif
-
 #ifdef CONFIG_BLK_DEV_ZONED
 	unsigned int blocks_per_blkz;		/* F2FS blocks per zone */
 	unsigned int log_blocks_per_blkz;	/* log2 F2FS blocks per zone */
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 702638e21c76..739192d95e71 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1156,12 +1156,6 @@ static int f2fs_get_context(struct inode *inode, void *ctx, size_t len)
 				ctx, len, NULL);
 }
 
-static int f2fs_key_prefix(struct inode *inode, u8 **key)
-{
-	*key = F2FS_I_SB(inode)->key_prefix;
-	return F2FS_I_SB(inode)->key_prefix_size;
-}
-
 static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len,
 							void *fs_data)
 {
@@ -1177,8 +1171,8 @@ static unsigned f2fs_max_namelen(struct inode *inode)
 }
 
 static struct fscrypt_operations f2fs_cryptops = {
+	.key_prefix	= "f2fs:",
 	.get_context	= f2fs_get_context,
-	.key_prefix	= f2fs_key_prefix,
 	.set_context	= f2fs_set_context,
 	.is_encrypted	= f2fs_encrypted_inode,
 	.empty_dir	= f2fs_empty_dir,
@@ -1518,12 +1512,6 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
 	mutex_init(&sbi->wio_mutex[NODE]);
 	mutex_init(&sbi->wio_mutex[DATA]);
 	spin_lock_init(&sbi->cp_lock);
-
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
-	memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX,
-				F2FS_KEY_DESC_PREFIX_SIZE);
-	sbi->key_prefix_size = F2FS_KEY_DESC_PREFIX_SIZE;
-#endif
 }
 
 static int init_percpu_info(struct f2fs_sb_info *sbi)
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index 3402720f2b28..6335abcf98df 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -26,15 +26,6 @@ static unsigned int ubifs_crypt_max_namelen(struct inode *inode)
 		return UBIFS_MAX_NLEN;
 }
 
-static int ubifs_key_prefix(struct inode *inode, u8 **key)
-{
-	static char prefix[] = "ubifs:";
-
-	*key = prefix;
-
-	return sizeof(prefix) - 1;
-}
-
 int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn,
 		  unsigned int in_len, unsigned int *out_len, int block)
 {
@@ -88,10 +79,10 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn,
 
 struct fscrypt_operations ubifs_crypt_operations = {
 	.flags			= FS_CFLG_OWN_PAGES,
+	.key_prefix		= "ubifs:",
 	.get_context		= ubifs_crypt_get_context,
 	.set_context		= ubifs_crypt_set_context,
 	.is_encrypted		= __ubifs_crypt_is_encrypted,
 	.empty_dir		= ubifs_crypt_empty_dir,
 	.max_namelen		= ubifs_crypt_max_namelen,
-	.key_prefix		= ubifs_key_prefix,
 };
diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h
index 8635ea46ef6e..715f17b3c6d7 100644
--- a/include/linux/fscrypto.h
+++ b/include/linux/fscrypto.h
@@ -85,8 +85,8 @@ struct fscrypt_name {
  */
 struct fscrypt_operations {
 	unsigned int flags;
+	const char *key_prefix;
 	int (*get_context)(struct inode *, void *, size_t);
-	int (*key_prefix)(struct inode *, u8 **);
 	int (*prepare_context)(struct inode *);
 	int (*set_context)(struct inode *, const void *, size_t, void *);
 	int (*dummy_context)(struct inode *);
-- 
cgit v1.2.3


From 2f5ff26478adaff5ed9b7ad4079d6a710b5f27e7 Mon Sep 17 00:00:00 2001
From: Eli Cohen <eli@mellanox.com>
Date: Tue, 3 Jan 2017 23:55:21 +0200
Subject: mlx5: Fix naming convention with respect to UARs

This establishes a solid naming conventions for UARs. A UAR (User Access
Region) can have size identical to a system page or can be fixed 4KB
depending on a value queried by firmware. Each UAR always has 4 blue
flame register which are used to post doorbell to send queue. In
addition, a UAR has section used for posting doorbells to CQs or EQs. In
this patch we change names to reflect this conventions.

Signed-off-by: Eli Cohen <eli@mellanox.com>
Reviewed-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/cq.c                |   6 +-
 drivers/infiniband/hw/mlx5/main.c              |  80 +++++------
 drivers/infiniband/hw/mlx5/mlx5_ib.h           |   6 +-
 drivers/infiniband/hw/mlx5/qp.c                | 176 ++++++++++++-------------
 drivers/net/ethernet/mellanox/mlx5/core/eq.c   |   8 +-
 drivers/net/ethernet/mellanox/mlx5/core/main.c |   8 +-
 drivers/net/ethernet/mellanox/mlx5/core/uar.c  |  90 ++++++-------
 include/linux/mlx5/device.h                    |   9 +-
 include/linux/mlx5/driver.h                    |  14 +-
 include/uapi/rdma/mlx5-abi.h                   |  12 +-
 10 files changed, 206 insertions(+), 203 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index b3ef47c3ab73..bb7e91c55003 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -689,7 +689,7 @@ int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
 {
 	struct mlx5_core_dev *mdev = to_mdev(ibcq->device)->mdev;
 	struct mlx5_ib_cq *cq = to_mcq(ibcq);
-	void __iomem *uar_page = mdev->priv.uuari.uars[0].map;
+	void __iomem *uar_page = mdev->priv.bfregi.uars[0].map;
 	unsigned long irq_flags;
 	int ret = 0;
 
@@ -790,7 +790,7 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
 	MLX5_SET(cqc, cqc, log_page_size,
 		 page_shift - MLX5_ADAPTER_PAGE_SHIFT);
 
-	*index = to_mucontext(context)->uuari.uars[0].index;
+	*index = to_mucontext(context)->bfregi.uars[0].index;
 
 	if (ucmd.cqe_comp_en == 1) {
 		if (unlikely((*cqe_size != 64) ||
@@ -886,7 +886,7 @@ static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
 	MLX5_SET(cqc, cqc, log_page_size,
 		 cq->buf.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
 
-	*index = dev->mdev->priv.uuari.uars[0].index;
+	*index = dev->mdev->priv.bfregi.uars[0].index;
 
 	return 0;
 
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 852b5b7b4897..d5cf82b387d3 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -999,12 +999,12 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 	struct mlx5_ib_alloc_ucontext_req_v2 req = {};
 	struct mlx5_ib_alloc_ucontext_resp resp = {};
 	struct mlx5_ib_ucontext *context;
-	struct mlx5_uuar_info *uuari;
+	struct mlx5_bfreg_info *bfregi;
 	struct mlx5_uar *uars;
-	int gross_uuars;
+	int gross_bfregs;
 	int num_uars;
 	int ver;
-	int uuarn;
+	int bfregn;
 	int err;
 	int i;
 	size_t reqlen;
@@ -1032,10 +1032,10 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 	if (req.flags)
 		return ERR_PTR(-EINVAL);
 
-	if (req.total_num_uuars > MLX5_MAX_UUARS)
+	if (req.total_num_bfregs > MLX5_MAX_BFREGS)
 		return ERR_PTR(-ENOMEM);
 
-	if (req.total_num_uuars == 0)
+	if (req.total_num_bfregs == 0)
 		return ERR_PTR(-EINVAL);
 
 	if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2)
@@ -1046,13 +1046,13 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 				 reqlen - sizeof(req)))
 		return ERR_PTR(-EOPNOTSUPP);
 
-	req.total_num_uuars = ALIGN(req.total_num_uuars,
-				    MLX5_NON_FP_BF_REGS_PER_PAGE);
-	if (req.num_low_latency_uuars > req.total_num_uuars - 1)
+	req.total_num_bfregs = ALIGN(req.total_num_bfregs,
+				    MLX5_NON_FP_BFREGS_PER_UAR);
+	if (req.num_low_latency_bfregs > req.total_num_bfregs - 1)
 		return ERR_PTR(-EINVAL);
 
-	num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE;
-	gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE;
+	num_uars = req.total_num_bfregs / MLX5_NON_FP_BFREGS_PER_UAR;
+	gross_bfregs = num_uars * MLX5_BFREGS_PER_UAR;
 	resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
 	if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf))
 		resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size);
@@ -1072,32 +1072,33 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 	if (!context)
 		return ERR_PTR(-ENOMEM);
 
-	uuari = &context->uuari;
-	mutex_init(&uuari->lock);
+	bfregi = &context->bfregi;
+	mutex_init(&bfregi->lock);
 	uars = kcalloc(num_uars, sizeof(*uars), GFP_KERNEL);
 	if (!uars) {
 		err = -ENOMEM;
 		goto out_ctx;
 	}
 
-	uuari->bitmap = kcalloc(BITS_TO_LONGS(gross_uuars),
-				sizeof(*uuari->bitmap),
+	bfregi->bitmap = kcalloc(BITS_TO_LONGS(gross_bfregs),
+				sizeof(*bfregi->bitmap),
 				GFP_KERNEL);
-	if (!uuari->bitmap) {
+	if (!bfregi->bitmap) {
 		err = -ENOMEM;
 		goto out_uar_ctx;
 	}
 	/*
-	 * clear all fast path uuars
+	 * clear all fast path bfregs
 	 */
-	for (i = 0; i < gross_uuars; i++) {
-		uuarn = i & 3;
-		if (uuarn == 2 || uuarn == 3)
-			set_bit(i, uuari->bitmap);
+	for (i = 0; i < gross_bfregs; i++) {
+		bfregn = i & 3;
+		if (bfregn == 2 || bfregn == 3)
+			set_bit(i, bfregi->bitmap);
 	}
 
-	uuari->count = kcalloc(gross_uuars, sizeof(*uuari->count), GFP_KERNEL);
-	if (!uuari->count) {
+	bfregi->count = kcalloc(gross_bfregs,
+				sizeof(*bfregi->count), GFP_KERNEL);
+	if (!bfregi->count) {
 		err = -ENOMEM;
 		goto out_bitmap;
 	}
@@ -1130,7 +1131,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 	INIT_LIST_HEAD(&context->db_page_list);
 	mutex_init(&context->db_page_mutex);
 
-	resp.tot_uuars = req.total_num_uuars;
+	resp.tot_bfregs = req.total_num_bfregs;
 	resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports);
 
 	if (field_avail(typeof(resp), cqe_version, udata->outlen))
@@ -1163,10 +1164,10 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 	if (err)
 		goto out_td;
 
-	uuari->ver = ver;
-	uuari->num_low_latency_uuars = req.num_low_latency_uuars;
-	uuari->uars = uars;
-	uuari->num_uars = num_uars;
+	bfregi->ver = ver;
+	bfregi->num_low_latency_bfregs = req.num_low_latency_bfregs;
+	bfregi->uars = uars;
+	bfregi->num_uars = num_uars;
 	context->cqe_version = resp.cqe_version;
 
 	return &context->ibucontext;
@@ -1182,10 +1183,10 @@ out_uars:
 	for (i--; i >= 0; i--)
 		mlx5_cmd_free_uar(dev->mdev, uars[i].index);
 out_count:
-	kfree(uuari->count);
+	kfree(bfregi->count);
 
 out_bitmap:
-	kfree(uuari->bitmap);
+	kfree(bfregi->bitmap);
 
 out_uar_ctx:
 	kfree(uars);
@@ -1199,7 +1200,7 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
 {
 	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
 	struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
-	struct mlx5_uuar_info *uuari = &context->uuari;
+	struct mlx5_bfreg_info *bfregi = &context->bfregi;
 	int i;
 
 	if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
@@ -1207,14 +1208,15 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
 
 	free_page(context->upd_xlt_page);
 
-	for (i = 0; i < uuari->num_uars; i++) {
-		if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index))
-			mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index);
+	for (i = 0; i < bfregi->num_uars; i++) {
+		if (mlx5_cmd_free_uar(dev->mdev, bfregi->uars[i].index))
+			mlx5_ib_warn(dev, "Failed to free UAR 0x%x\n",
+				     bfregi->uars[i].index);
 	}
 
-	kfree(uuari->count);
-	kfree(uuari->bitmap);
-	kfree(uuari->uars);
+	kfree(bfregi->count);
+	kfree(bfregi->bitmap);
+	kfree(bfregi->uars);
 	kfree(context);
 
 	return 0;
@@ -1377,7 +1379,7 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
 		    struct vm_area_struct *vma,
 		    struct mlx5_ib_ucontext *context)
 {
-	struct mlx5_uuar_info *uuari = &context->uuari;
+	struct mlx5_bfreg_info *bfregi = &context->bfregi;
 	int err;
 	unsigned long idx;
 	phys_addr_t pfn, pa;
@@ -1408,10 +1410,10 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
 		return -EINVAL;
 
 	idx = get_index(vma->vm_pgoff);
-	if (idx >= uuari->num_uars)
+	if (idx >= bfregi->num_uars)
 		return -EINVAL;
 
-	pfn = uar_index2pfn(dev, uuari->uars[idx].index);
+	pfn = uar_index2pfn(dev, bfregi->uars[idx].index);
 	mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn);
 
 	vma->vm_page_prot = prot;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index a51c8051aeb2..d4d1329df94a 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -100,7 +100,7 @@ enum mlx5_ib_mad_ifc_flags {
 };
 
 enum {
-	MLX5_CROSS_CHANNEL_UUAR         = 0,
+	MLX5_CROSS_CHANNEL_BFREG         = 0,
 };
 
 enum {
@@ -120,7 +120,7 @@ struct mlx5_ib_ucontext {
 	/* protect doorbell record alloc/free
 	 */
 	struct mutex		db_page_mutex;
-	struct mlx5_uuar_info	uuari;
+	struct mlx5_bfreg_info	bfregi;
 	u8			cqe_version;
 	/* Transport Domain number */
 	u32			tdn;
@@ -355,7 +355,7 @@ struct mlx5_ib_qp {
 	/* only for user space QPs. For kernel
 	 * we have it from the bf object
 	 */
-	int			uuarn;
+	int			bfregn;
 
 	int			create_type;
 
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 42d021cdc6c5..fbea9bd63c8e 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -475,12 +475,12 @@ static int qp_has_rq(struct ib_qp_init_attr *attr)
 	return 1;
 }
 
-static int first_med_uuar(void)
+static int first_med_bfreg(void)
 {
 	return 1;
 }
 
-static int next_uuar(int n)
+static int next_bfreg(int n)
 {
 	n++;
 
@@ -490,45 +490,45 @@ static int next_uuar(int n)
 	return n;
 }
 
-static int num_med_uuar(struct mlx5_uuar_info *uuari)
+static int num_med_bfreg(struct mlx5_bfreg_info *bfregi)
 {
 	int n;
 
-	n = uuari->num_uars * MLX5_NON_FP_BF_REGS_PER_PAGE -
-		uuari->num_low_latency_uuars - 1;
+	n = bfregi->num_uars * MLX5_NON_FP_BFREGS_PER_UAR -
+		bfregi->num_low_latency_bfregs - 1;
 
 	return n >= 0 ? n : 0;
 }
 
-static int max_uuari(struct mlx5_uuar_info *uuari)
+static int max_bfregi(struct mlx5_bfreg_info *bfregi)
 {
-	return uuari->num_uars * 4;
+	return bfregi->num_uars * 4;
 }
 
-static int first_hi_uuar(struct mlx5_uuar_info *uuari)
+static int first_hi_bfreg(struct mlx5_bfreg_info *bfregi)
 {
 	int med;
 	int i;
 	int t;
 
-	med = num_med_uuar(uuari);
-	for (t = 0, i = first_med_uuar();; i = next_uuar(i)) {
+	med = num_med_bfreg(bfregi);
+	for (t = 0, i = first_med_bfreg();; i = next_bfreg(i)) {
 		t++;
 		if (t == med)
-			return next_uuar(i);
+			return next_bfreg(i);
 	}
 
 	return 0;
 }
 
-static int alloc_high_class_uuar(struct mlx5_uuar_info *uuari)
+static int alloc_high_class_bfreg(struct mlx5_bfreg_info *bfregi)
 {
 	int i;
 
-	for (i = first_hi_uuar(uuari); i < max_uuari(uuari); i = next_uuar(i)) {
-		if (!test_bit(i, uuari->bitmap)) {
-			set_bit(i, uuari->bitmap);
-			uuari->count[i]++;
+	for (i = first_hi_bfreg(bfregi); i < max_bfregi(bfregi); i = next_bfreg(i)) {
+		if (!test_bit(i, bfregi->bitmap)) {
+			set_bit(i, bfregi->bitmap);
+			bfregi->count[i]++;
 			return i;
 		}
 	}
@@ -536,87 +536,87 @@ static int alloc_high_class_uuar(struct mlx5_uuar_info *uuari)
 	return -ENOMEM;
 }
 
-static int alloc_med_class_uuar(struct mlx5_uuar_info *uuari)
+static int alloc_med_class_bfreg(struct mlx5_bfreg_info *bfregi)
 {
-	int minidx = first_med_uuar();
+	int minidx = first_med_bfreg();
 	int i;
 
-	for (i = first_med_uuar(); i < first_hi_uuar(uuari); i = next_uuar(i)) {
-		if (uuari->count[i] < uuari->count[minidx])
+	for (i = first_med_bfreg(); i < first_hi_bfreg(bfregi); i = next_bfreg(i)) {
+		if (bfregi->count[i] < bfregi->count[minidx])
 			minidx = i;
 	}
 
-	uuari->count[minidx]++;
+	bfregi->count[minidx]++;
 	return minidx;
 }
 
-static int alloc_uuar(struct mlx5_uuar_info *uuari,
-		      enum mlx5_ib_latency_class lat)
+static int alloc_bfreg(struct mlx5_bfreg_info *bfregi,
+		       enum mlx5_ib_latency_class lat)
 {
-	int uuarn = -EINVAL;
+	int bfregn = -EINVAL;
 
-	mutex_lock(&uuari->lock);
+	mutex_lock(&bfregi->lock);
 	switch (lat) {
 	case MLX5_IB_LATENCY_CLASS_LOW:
-		uuarn = 0;
-		uuari->count[uuarn]++;
+		bfregn = 0;
+		bfregi->count[bfregn]++;
 		break;
 
 	case MLX5_IB_LATENCY_CLASS_MEDIUM:
-		if (uuari->ver < 2)
-			uuarn = -ENOMEM;
+		if (bfregi->ver < 2)
+			bfregn = -ENOMEM;
 		else
-			uuarn = alloc_med_class_uuar(uuari);
+			bfregn = alloc_med_class_bfreg(bfregi);
 		break;
 
 	case MLX5_IB_LATENCY_CLASS_HIGH:
-		if (uuari->ver < 2)
-			uuarn = -ENOMEM;
+		if (bfregi->ver < 2)
+			bfregn = -ENOMEM;
 		else
-			uuarn = alloc_high_class_uuar(uuari);
+			bfregn = alloc_high_class_bfreg(bfregi);
 		break;
 
 	case MLX5_IB_LATENCY_CLASS_FAST_PATH:
-		uuarn = 2;
+		bfregn = 2;
 		break;
 	}
-	mutex_unlock(&uuari->lock);
+	mutex_unlock(&bfregi->lock);
 
-	return uuarn;
+	return bfregn;
 }
 
-static void free_med_class_uuar(struct mlx5_uuar_info *uuari, int uuarn)
+static void free_med_class_bfreg(struct mlx5_bfreg_info *bfregi, int bfregn)
 {
-	clear_bit(uuarn, uuari->bitmap);
-	--uuari->count[uuarn];
+	clear_bit(bfregn, bfregi->bitmap);
+	--bfregi->count[bfregn];
 }
 
-static void free_high_class_uuar(struct mlx5_uuar_info *uuari, int uuarn)
+static void free_high_class_bfreg(struct mlx5_bfreg_info *bfregi, int bfregn)
 {
-	clear_bit(uuarn, uuari->bitmap);
-	--uuari->count[uuarn];
+	clear_bit(bfregn, bfregi->bitmap);
+	--bfregi->count[bfregn];
 }
 
-static void free_uuar(struct mlx5_uuar_info *uuari, int uuarn)
+static void free_bfreg(struct mlx5_bfreg_info *bfregi, int bfregn)
 {
-	int nuuars = uuari->num_uars * MLX5_BF_REGS_PER_PAGE;
-	int high_uuar = nuuars - uuari->num_low_latency_uuars;
+	int nbfregs = bfregi->num_uars * MLX5_BFREGS_PER_UAR;
+	int high_bfreg = nbfregs - bfregi->num_low_latency_bfregs;
 
-	mutex_lock(&uuari->lock);
-	if (uuarn == 0) {
-		--uuari->count[uuarn];
+	mutex_lock(&bfregi->lock);
+	if (bfregn == 0) {
+		--bfregi->count[bfregn];
 		goto out;
 	}
 
-	if (uuarn < high_uuar) {
-		free_med_class_uuar(uuari, uuarn);
+	if (bfregn < high_bfreg) {
+		free_med_class_bfreg(bfregi, bfregn);
 		goto out;
 	}
 
-	free_high_class_uuar(uuari, uuarn);
+	free_high_class_bfreg(bfregi, bfregn);
 
 out:
-	mutex_unlock(&uuari->lock);
+	mutex_unlock(&bfregi->lock);
 }
 
 static enum mlx5_qp_state to_mlx5_state(enum ib_qp_state state)
@@ -657,9 +657,9 @@ static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq,
 static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq,
 			       struct mlx5_ib_cq *recv_cq);
 
-static int uuarn_to_uar_index(struct mlx5_uuar_info *uuari, int uuarn)
+static int bfregn_to_uar_index(struct mlx5_bfreg_info *bfregi, int bfregn)
 {
-	return uuari->uars[uuarn / MLX5_BF_REGS_PER_PAGE].index;
+	return bfregi->uars[bfregn / MLX5_BFREGS_PER_UAR].index;
 }
 
 static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev,
@@ -776,7 +776,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 	int uar_index;
 	int npages;
 	u32 offset = 0;
-	int uuarn;
+	int bfregn;
 	int ncont = 0;
 	__be64 *pas;
 	void *qpc;
@@ -794,27 +794,27 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 	 */
 	if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL)
 		/* In CROSS_CHANNEL CQ and QP must use the same UAR */
-		uuarn = MLX5_CROSS_CHANNEL_UUAR;
+		bfregn = MLX5_CROSS_CHANNEL_BFREG;
 	else {
-		uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_HIGH);
-		if (uuarn < 0) {
-			mlx5_ib_dbg(dev, "failed to allocate low latency UUAR\n");
+		bfregn = alloc_bfreg(&context->bfregi, MLX5_IB_LATENCY_CLASS_HIGH);
+		if (bfregn < 0) {
+			mlx5_ib_dbg(dev, "failed to allocate low latency BFREG\n");
 			mlx5_ib_dbg(dev, "reverting to medium latency\n");
-			uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_MEDIUM);
-			if (uuarn < 0) {
-				mlx5_ib_dbg(dev, "failed to allocate medium latency UUAR\n");
+			bfregn = alloc_bfreg(&context->bfregi, MLX5_IB_LATENCY_CLASS_MEDIUM);
+			if (bfregn < 0) {
+				mlx5_ib_dbg(dev, "failed to allocate medium latency BFREG\n");
 				mlx5_ib_dbg(dev, "reverting to high latency\n");
-				uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW);
-				if (uuarn < 0) {
-					mlx5_ib_warn(dev, "uuar allocation failed\n");
-					return uuarn;
+				bfregn = alloc_bfreg(&context->bfregi, MLX5_IB_LATENCY_CLASS_LOW);
+				if (bfregn < 0) {
+					mlx5_ib_warn(dev, "bfreg allocation failed\n");
+					return bfregn;
 				}
 			}
 		}
 	}
 
-	uar_index = uuarn_to_uar_index(&context->uuari, uuarn);
-	mlx5_ib_dbg(dev, "uuarn 0x%x, uar_index 0x%x\n", uuarn, uar_index);
+	uar_index = bfregn_to_uar_index(&context->bfregi, bfregn);
+	mlx5_ib_dbg(dev, "bfregn 0x%x, uar_index 0x%x\n", bfregn, uar_index);
 
 	qp->rq.offset = 0;
 	qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB);
@@ -822,7 +822,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 
 	err = set_user_buf_size(dev, qp, &ucmd, base, attr);
 	if (err)
-		goto err_uuar;
+		goto err_bfreg;
 
 	if (ucmd.buf_addr && ubuffer->buf_size) {
 		ubuffer->buf_addr = ucmd.buf_addr;
@@ -831,7 +831,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 				       &ubuffer->umem, &npages, &page_shift,
 				       &ncont, &offset);
 		if (err)
-			goto err_uuar;
+			goto err_bfreg;
 	} else {
 		ubuffer->umem = NULL;
 	}
@@ -854,8 +854,8 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 	MLX5_SET(qpc, qpc, page_offset, offset);
 
 	MLX5_SET(qpc, qpc, uar_page, uar_index);
-	resp->uuar_index = uuarn;
-	qp->uuarn = uuarn;
+	resp->bfreg_index = bfregn;
+	qp->bfregn = bfregn;
 
 	err = mlx5_ib_db_map_user(context, ucmd.db_addr, &qp->db);
 	if (err) {
@@ -882,8 +882,8 @@ err_umem:
 	if (ubuffer->umem)
 		ib_umem_release(ubuffer->umem);
 
-err_uuar:
-	free_uuar(&context->uuari, uuarn);
+err_bfreg:
+	free_bfreg(&context->bfregi, bfregn);
 	return err;
 }
 
@@ -896,7 +896,7 @@ static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp,
 	mlx5_ib_db_unmap_user(context, &qp->db);
 	if (base->ubuffer.umem)
 		ib_umem_release(base->ubuffer.umem);
-	free_uuar(&context->uuari, qp->uuarn);
+	free_bfreg(&context->bfregi, qp->bfregn);
 }
 
 static int create_kernel_qp(struct mlx5_ib_dev *dev,
@@ -906,13 +906,13 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev,
 			    struct mlx5_ib_qp_base *base)
 {
 	enum mlx5_ib_latency_class lc = MLX5_IB_LATENCY_CLASS_LOW;
-	struct mlx5_uuar_info *uuari;
+	struct mlx5_bfreg_info *bfregi;
 	int uar_index;
 	void *qpc;
-	int uuarn;
+	int bfregn;
 	int err;
 
-	uuari = &dev->mdev->priv.uuari;
+	bfregi = &dev->mdev->priv.bfregi;
 	if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN |
 					IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK |
 					IB_QP_CREATE_IPOIB_UD_LSO |
@@ -922,19 +922,19 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev,
 	if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR)
 		lc = MLX5_IB_LATENCY_CLASS_FAST_PATH;
 
-	uuarn = alloc_uuar(uuari, lc);
-	if (uuarn < 0) {
+	bfregn = alloc_bfreg(bfregi, lc);
+	if (bfregn < 0) {
 		mlx5_ib_dbg(dev, "\n");
 		return -ENOMEM;
 	}
 
-	qp->bf = &uuari->bfs[uuarn];
+	qp->bf = &bfregi->bfs[bfregn];
 	uar_index = qp->bf->uar->index;
 
 	err = calc_sq_size(dev, init_attr, qp);
 	if (err < 0) {
 		mlx5_ib_dbg(dev, "err %d\n", err);
-		goto err_uuar;
+		goto err_bfreg;
 	}
 
 	qp->rq.offset = 0;
@@ -944,7 +944,7 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev,
 	err = mlx5_buf_alloc(dev->mdev, base->ubuffer.buf_size, &qp->buf);
 	if (err) {
 		mlx5_ib_dbg(dev, "err %d\n", err);
-		goto err_uuar;
+		goto err_bfreg;
 	}
 
 	qp->sq.qend = mlx5_get_send_wqe(qp, qp->sq.wqe_cnt);
@@ -1007,8 +1007,8 @@ err_free:
 err_buf:
 	mlx5_buf_free(dev->mdev, &qp->buf);
 
-err_uuar:
-	free_uuar(&dev->mdev->priv.uuari, uuarn);
+err_bfreg:
+	free_bfreg(&dev->mdev->priv.bfregi, bfregn);
 	return err;
 }
 
@@ -1021,7 +1021,7 @@ static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
 	kfree(qp->rq.wrid);
 	mlx5_db_free(dev->mdev, &qp->db);
 	mlx5_buf_free(dev->mdev, &qp->buf);
-	free_uuar(&dev->mdev->priv.uuari, qp->bf->uuarn);
+	free_bfreg(&dev->mdev->priv.bfregi, qp->bf->bfregn);
 }
 
 static u32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr)
@@ -1353,7 +1353,7 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
 	if (init_attr->create_flags || init_attr->send_cq)
 		return -EINVAL;
 
-	min_resp_len = offsetof(typeof(resp), uuar_index) + sizeof(resp.uuar_index);
+	min_resp_len = offsetof(typeof(resp), bfreg_index) + sizeof(resp.bfreg_index);
 	if (udata->outlen < min_resp_len)
 		return -EINVAL;
 
@@ -4132,7 +4132,7 @@ out:
 			__acquire(&bf->lock);
 
 		/* TBD enable WC */
-		if (0 && nreq == 1 && bf->uuarn && inl && size > 1 && size <= bf->buf_size / 16) {
+		if (0 && nreq == 1 && bf->bfregn && inl && size > 1 && size <= bf->buf_size / 16) {
 			mlx5_bf_copy(bf->reg + bf->offset, (u64 *)ctrl, ALIGN(size * 16, 64), qp);
 			/* wc_wmb(); */
 		} else {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 4aff8ac68e14..11a8d638bcd0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -686,7 +686,7 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
 
 	err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD,
 				 MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD,
-				 "mlx5_cmd_eq", &dev->priv.uuari.uars[0],
+				 "mlx5_cmd_eq", &dev->priv.bfregi.uars[0],
 				 MLX5_EQ_TYPE_ASYNC);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create cmd EQ %d\n", err);
@@ -697,7 +697,7 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
 
 	err = mlx5_create_map_eq(dev, &table->async_eq, MLX5_EQ_VEC_ASYNC,
 				 MLX5_NUM_ASYNC_EQE, async_event_mask,
-				 "mlx5_async_eq", &dev->priv.uuari.uars[0],
+				 "mlx5_async_eq", &dev->priv.bfregi.uars[0],
 				 MLX5_EQ_TYPE_ASYNC);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create async EQ %d\n", err);
@@ -708,7 +708,7 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
 				 MLX5_EQ_VEC_PAGES,
 				 /* TODO: sriov max_vf + */ 1,
 				 1 << MLX5_EVENT_TYPE_PAGE_REQUEST, "mlx5_pages_eq",
-				 &dev->priv.uuari.uars[0],
+				 &dev->priv.bfregi.uars[0],
 				 MLX5_EQ_TYPE_ASYNC);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create pages EQ %d\n", err);
@@ -722,7 +722,7 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
 					 MLX5_NUM_ASYNC_EQE,
 					 1 << MLX5_EVENT_TYPE_PAGE_FAULT,
 					 "mlx5_page_fault_eq",
-					 &dev->priv.uuari.uars[0],
+					 &dev->priv.bfregi.uars[0],
 					 MLX5_EQ_TYPE_PF);
 		if (err) {
 			mlx5_core_warn(dev, "failed to create page fault EQ %d\n",
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index f4115135e30b..634e96a02516 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -753,7 +753,7 @@ static int alloc_comp_eqs(struct mlx5_core_dev *dev)
 		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d", i);
 		err = mlx5_create_map_eq(dev, eq,
 					 i + MLX5_EQ_VEC_COMP_BASE, nent, 0,
-					 name, &dev->priv.uuari.uars[0],
+					 name, &dev->priv.bfregi.uars[0],
 					 MLX5_EQ_TYPE_COMP);
 		if (err) {
 			kfree(eq);
@@ -1094,7 +1094,7 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 		goto err_cleanup_once;
 	}
 
-	err = mlx5_alloc_uuars(dev, &priv->uuari);
+	err = mlx5_alloc_bfregs(dev, &priv->bfregi);
 	if (err) {
 		dev_err(&pdev->dev, "Failed allocating uar, aborting\n");
 		goto err_disable_msix;
@@ -1170,7 +1170,7 @@ err_stop_eqs:
 	mlx5_stop_eqs(dev);
 
 err_free_uar:
-	mlx5_free_uuars(dev, &priv->uuari);
+	mlx5_free_bfregs(dev, &priv->bfregi);
 
 err_disable_msix:
 	mlx5_disable_msix(dev);
@@ -1230,7 +1230,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 	mlx5_irq_clear_affinity_hints(dev);
 	free_comp_eqs(dev);
 	mlx5_stop_eqs(dev);
-	mlx5_free_uuars(dev, &priv->uuari);
+	mlx5_free_bfregs(dev, &priv->bfregi);
 	mlx5_disable_msix(dev);
 	if (cleanup)
 		mlx5_cleanup_once(dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/uar.c b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
index ab0b896621a0..ce7fcebb81a3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/uar.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
@@ -39,7 +39,7 @@
 
 enum {
 	NUM_DRIVER_UARS		= 4,
-	NUM_LOW_LAT_UUARS	= 4,
+	NUM_LOW_LAT_BFREGS	= 4,
 };
 
 int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn)
@@ -67,116 +67,116 @@ int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn)
 }
 EXPORT_SYMBOL(mlx5_cmd_free_uar);
 
-static int need_uuar_lock(int uuarn)
+static int need_bfreg_lock(int bfregn)
 {
-	int tot_uuars = NUM_DRIVER_UARS * MLX5_BF_REGS_PER_PAGE;
+	int tot_bfregs = NUM_DRIVER_UARS * MLX5_BFREGS_PER_UAR;
 
-	if (uuarn == 0 || tot_uuars - NUM_LOW_LAT_UUARS)
+	if (bfregn == 0 || tot_bfregs - NUM_LOW_LAT_BFREGS)
 		return 0;
 
 	return 1;
 }
 
-int mlx5_alloc_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari)
+int mlx5_alloc_bfregs(struct mlx5_core_dev *dev, struct mlx5_bfreg_info *bfregi)
 {
-	int tot_uuars = NUM_DRIVER_UARS * MLX5_BF_REGS_PER_PAGE;
+	int tot_bfregs = NUM_DRIVER_UARS * MLX5_BFREGS_PER_UAR;
 	struct mlx5_bf *bf;
 	phys_addr_t addr;
 	int err;
 	int i;
 
-	uuari->num_uars = NUM_DRIVER_UARS;
-	uuari->num_low_latency_uuars = NUM_LOW_LAT_UUARS;
+	bfregi->num_uars = NUM_DRIVER_UARS;
+	bfregi->num_low_latency_bfregs = NUM_LOW_LAT_BFREGS;
 
-	mutex_init(&uuari->lock);
-	uuari->uars = kcalloc(uuari->num_uars, sizeof(*uuari->uars), GFP_KERNEL);
-	if (!uuari->uars)
+	mutex_init(&bfregi->lock);
+	bfregi->uars = kcalloc(bfregi->num_uars, sizeof(*bfregi->uars), GFP_KERNEL);
+	if (!bfregi->uars)
 		return -ENOMEM;
 
-	uuari->bfs = kcalloc(tot_uuars, sizeof(*uuari->bfs), GFP_KERNEL);
-	if (!uuari->bfs) {
+	bfregi->bfs = kcalloc(tot_bfregs, sizeof(*bfregi->bfs), GFP_KERNEL);
+	if (!bfregi->bfs) {
 		err = -ENOMEM;
 		goto out_uars;
 	}
 
-	uuari->bitmap = kcalloc(BITS_TO_LONGS(tot_uuars), sizeof(*uuari->bitmap),
+	bfregi->bitmap = kcalloc(BITS_TO_LONGS(tot_bfregs), sizeof(*bfregi->bitmap),
 				GFP_KERNEL);
-	if (!uuari->bitmap) {
+	if (!bfregi->bitmap) {
 		err = -ENOMEM;
 		goto out_bfs;
 	}
 
-	uuari->count = kcalloc(tot_uuars, sizeof(*uuari->count), GFP_KERNEL);
-	if (!uuari->count) {
+	bfregi->count = kcalloc(tot_bfregs, sizeof(*bfregi->count), GFP_KERNEL);
+	if (!bfregi->count) {
 		err = -ENOMEM;
 		goto out_bitmap;
 	}
 
-	for (i = 0; i < uuari->num_uars; i++) {
-		err = mlx5_cmd_alloc_uar(dev, &uuari->uars[i].index);
+	for (i = 0; i < bfregi->num_uars; i++) {
+		err = mlx5_cmd_alloc_uar(dev, &bfregi->uars[i].index);
 		if (err)
 			goto out_count;
 
-		addr = dev->iseg_base + ((phys_addr_t)(uuari->uars[i].index) << PAGE_SHIFT);
-		uuari->uars[i].map = ioremap(addr, PAGE_SIZE);
-		if (!uuari->uars[i].map) {
-			mlx5_cmd_free_uar(dev, uuari->uars[i].index);
+		addr = dev->iseg_base + ((phys_addr_t)(bfregi->uars[i].index) << PAGE_SHIFT);
+		bfregi->uars[i].map = ioremap(addr, PAGE_SIZE);
+		if (!bfregi->uars[i].map) {
+			mlx5_cmd_free_uar(dev, bfregi->uars[i].index);
 			err = -ENOMEM;
 			goto out_count;
 		}
 		mlx5_core_dbg(dev, "allocated uar index 0x%x, mmaped at %p\n",
-			      uuari->uars[i].index, uuari->uars[i].map);
+			      bfregi->uars[i].index, bfregi->uars[i].map);
 	}
 
-	for (i = 0; i < tot_uuars; i++) {
-		bf = &uuari->bfs[i];
+	for (i = 0; i < tot_bfregs; i++) {
+		bf = &bfregi->bfs[i];
 
 		bf->buf_size = (1 << MLX5_CAP_GEN(dev, log_bf_reg_size)) / 2;
-		bf->uar = &uuari->uars[i / MLX5_BF_REGS_PER_PAGE];
-		bf->regreg = uuari->uars[i / MLX5_BF_REGS_PER_PAGE].map;
+		bf->uar = &bfregi->uars[i / MLX5_BFREGS_PER_UAR];
+		bf->regreg = bfregi->uars[i / MLX5_BFREGS_PER_UAR].map;
 		bf->reg = NULL; /* Add WC support */
-		bf->offset = (i % MLX5_BF_REGS_PER_PAGE) *
+		bf->offset = (i % MLX5_BFREGS_PER_UAR) *
 			     (1 << MLX5_CAP_GEN(dev, log_bf_reg_size)) +
 			     MLX5_BF_OFFSET;
-		bf->need_lock = need_uuar_lock(i);
+		bf->need_lock = need_bfreg_lock(i);
 		spin_lock_init(&bf->lock);
 		spin_lock_init(&bf->lock32);
-		bf->uuarn = i;
+		bf->bfregn = i;
 	}
 
 	return 0;
 
 out_count:
 	for (i--; i >= 0; i--) {
-		iounmap(uuari->uars[i].map);
-		mlx5_cmd_free_uar(dev, uuari->uars[i].index);
+		iounmap(bfregi->uars[i].map);
+		mlx5_cmd_free_uar(dev, bfregi->uars[i].index);
 	}
-	kfree(uuari->count);
+	kfree(bfregi->count);
 
 out_bitmap:
-	kfree(uuari->bitmap);
+	kfree(bfregi->bitmap);
 
 out_bfs:
-	kfree(uuari->bfs);
+	kfree(bfregi->bfs);
 
 out_uars:
-	kfree(uuari->uars);
+	kfree(bfregi->uars);
 	return err;
 }
 
-int mlx5_free_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari)
+int mlx5_free_bfregs(struct mlx5_core_dev *dev, struct mlx5_bfreg_info *bfregi)
 {
-	int i = uuari->num_uars;
+	int i = bfregi->num_uars;
 
 	for (i--; i >= 0; i--) {
-		iounmap(uuari->uars[i].map);
-		mlx5_cmd_free_uar(dev, uuari->uars[i].index);
+		iounmap(bfregi->uars[i].map);
+		mlx5_cmd_free_uar(dev, bfregi->uars[i].index);
 	}
 
-	kfree(uuari->count);
-	kfree(uuari->bitmap);
-	kfree(uuari->bfs);
-	kfree(uuari->uars);
+	kfree(bfregi->count);
+	kfree(bfregi->bitmap);
+	kfree(bfregi->bfs);
+	kfree(bfregi->uars);
 
 	return 0;
 }
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 3ccaeff15a80..aa851c51ab59 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -212,10 +212,11 @@ enum {
 };
 
 enum {
-	MLX5_BF_REGS_PER_PAGE		= 4,
-	MLX5_MAX_UAR_PAGES		= 1 << 8,
-	MLX5_NON_FP_BF_REGS_PER_PAGE	= 2,
-	MLX5_MAX_UUARS	= MLX5_MAX_UAR_PAGES * MLX5_NON_FP_BF_REGS_PER_PAGE,
+	MLX5_BFREGS_PER_UAR		= 4,
+	MLX5_MAX_UARS			= 1 << 8,
+	MLX5_NON_FP_BFREGS_PER_UAR	= 2,
+	MLX5_MAX_BFREGS			= MLX5_MAX_UARS *
+					  MLX5_NON_FP_BFREGS_PER_UAR,
 };
 
 enum {
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index cfa49bca009c..3d07e25b3bf1 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -188,16 +188,16 @@ enum mlx5_eq_type {
 #endif
 };
 
-struct mlx5_uuar_info {
+struct mlx5_bfreg_info {
 	struct mlx5_uar	       *uars;
 	int			num_uars;
-	int			num_low_latency_uuars;
+	int			num_low_latency_bfregs;
 	unsigned long	       *bitmap;
 	unsigned int	       *count;
 	struct mlx5_bf	       *bfs;
 
 	/*
-	 * protect uuar allocation data structs
+	 * protect bfreg allocation data structs
 	 */
 	struct mutex		lock;
 	u32			ver;
@@ -217,7 +217,7 @@ struct mlx5_bf {
 	/* serialize 64 bit writes when done as two 32 bit accesses
 	 */
 	spinlock_t		lock32;
-	int			uuarn;
+	int			bfregn;
 };
 
 struct mlx5_cmd_first {
@@ -579,7 +579,7 @@ struct mlx5_priv {
 	struct mlx5_eq_table	eq_table;
 	struct msix_entry	*msix_arr;
 	struct mlx5_irq_info	*irq_info;
-	struct mlx5_uuar_info	uuari;
+	struct mlx5_bfreg_info	bfregi;
 	MLX5_DECLARE_DOORBELL_LOCK(cq_uar_lock);
 
 	/* pages stuff */
@@ -903,8 +903,8 @@ void mlx5_cmd_mbox_status(void *out, u8 *status, u32 *syndrome);
 int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type);
 int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn);
 int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn);
-int mlx5_alloc_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari);
-int mlx5_free_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari);
+int mlx5_alloc_bfregs(struct mlx5_core_dev *dev, struct mlx5_bfreg_info *bfregi);
+int mlx5_free_bfregs(struct mlx5_core_dev *dev, struct mlx5_bfreg_info *bfregi);
 int mlx5_alloc_map_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar,
 		       bool map_wc);
 void mlx5_unmap_free_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar);
diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h
index fae6cdaeb56d..86a8f30060f3 100644
--- a/include/uapi/rdma/mlx5-abi.h
+++ b/include/uapi/rdma/mlx5-abi.h
@@ -61,13 +61,13 @@ enum {
  */
 
 struct mlx5_ib_alloc_ucontext_req {
-	__u32	total_num_uuars;
-	__u32	num_low_latency_uuars;
+	__u32	total_num_bfregs;
+	__u32	num_low_latency_bfregs;
 };
 
 struct mlx5_ib_alloc_ucontext_req_v2 {
-	__u32	total_num_uuars;
-	__u32	num_low_latency_uuars;
+	__u32	total_num_bfregs;
+	__u32	num_low_latency_bfregs;
 	__u32	flags;
 	__u32	comp_mask;
 	__u8	max_cqe_version;
@@ -88,7 +88,7 @@ enum mlx5_user_cmds_supp_uhw {
 struct mlx5_ib_alloc_ucontext_resp {
 	__u32	qp_tab_size;
 	__u32	bf_reg_size;
-	__u32	tot_uuars;
+	__u32	tot_bfregs;
 	__u32	cache_line_size;
 	__u16	max_sq_desc_sz;
 	__u16	max_rq_desc_sz;
@@ -241,7 +241,7 @@ struct mlx5_ib_create_qp_rss {
 };
 
 struct mlx5_ib_create_qp_resp {
-	__u32	uuar_index;
+	__u32	bfreg_index;
 };
 
 struct mlx5_ib_alloc_mw {
-- 
cgit v1.2.3


From a6d51b68611e98f05042ada662aed5dbe3279c1e Mon Sep 17 00:00:00 2001
From: Eli Cohen <eli@mellanox.com>
Date: Tue, 3 Jan 2017 23:55:23 +0200
Subject: net/mlx5: Introduce blue flame register allocator

Here is an implementation of an allocator that allocates blue flame
registers. A blue flame register is used for generating send doorbells.
A blue flame register can be used to generate either a regular doorbell
or a blue flame doorbell where the data to be sent is written to the
device's I/O memory hence saving the need to read the data from memory.
For blue flame kind of doorbells to succeed, the blue flame register
need to be mapped as write combining. The user can specify what kind of
send doorbells she wishes to use. If she requested write combining
mapping but that failed, the allocator will fall back to non write
combining mapping and will indicate that to the user.
Subsequent patches in this series will make use of this allocator.

Signed-off-by: Eli Cohen <eli@mellanox.com>
Reviewed-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/uar.c | 235 ++++++++++++++++++++++++++
 include/linux/mlx5/device.h                   |   2 +
 include/linux/mlx5/driver.h                   |  37 ++++
 include/linux/mlx5/mlx5_ifc.h                 |   7 +-
 4 files changed, 279 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/uar.c b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
index ce7fcebb81a3..6a081a8787a7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/uar.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
@@ -231,3 +231,238 @@ void mlx5_unmap_free_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar)
 	mlx5_cmd_free_uar(mdev, uar->index);
 }
 EXPORT_SYMBOL(mlx5_unmap_free_uar);
+
+static int uars_per_sys_page(struct mlx5_core_dev *mdev)
+{
+	if (MLX5_CAP_GEN(mdev, uar_4k))
+		return MLX5_CAP_GEN(mdev, num_of_uars_per_page);
+
+	return 1;
+}
+
+static u64 uar2pfn(struct mlx5_core_dev *mdev, u32 index)
+{
+	u32 system_page_index;
+
+	if (MLX5_CAP_GEN(mdev, uar_4k))
+		system_page_index = index >> (PAGE_SHIFT - MLX5_ADAPTER_PAGE_SHIFT);
+	else
+		system_page_index = index;
+
+	return (pci_resource_start(mdev->pdev, 0) >> PAGE_SHIFT) + system_page_index;
+}
+
+static void up_rel_func(struct kref *kref)
+{
+	struct mlx5_uars_page *up = container_of(kref, struct mlx5_uars_page, ref_count);
+
+	list_del(&up->list);
+	if (mlx5_cmd_free_uar(up->mdev, up->index))
+		mlx5_core_warn(up->mdev, "failed to free uar index %d\n", up->index);
+	kfree(up->reg_bitmap);
+	kfree(up->fp_bitmap);
+	kfree(up);
+}
+
+static struct mlx5_uars_page *alloc_uars_page(struct mlx5_core_dev *mdev,
+					      bool map_wc)
+{
+	struct mlx5_uars_page *up;
+	int err = -ENOMEM;
+	phys_addr_t pfn;
+	int bfregs;
+	int i;
+
+	bfregs = uars_per_sys_page(mdev) * MLX5_BFREGS_PER_UAR;
+	up = kzalloc(sizeof(*up), GFP_KERNEL);
+	if (!up)
+		return ERR_PTR(err);
+
+	up->mdev = mdev;
+	up->reg_bitmap = kcalloc(BITS_TO_LONGS(bfregs), sizeof(unsigned long), GFP_KERNEL);
+	if (!up->reg_bitmap)
+		goto error1;
+
+	up->fp_bitmap = kcalloc(BITS_TO_LONGS(bfregs), sizeof(unsigned long), GFP_KERNEL);
+	if (!up->fp_bitmap)
+		goto error1;
+
+	for (i = 0; i < bfregs; i++)
+		if ((i % MLX5_BFREGS_PER_UAR) < MLX5_NON_FP_BFREGS_PER_UAR)
+			set_bit(i, up->reg_bitmap);
+		else
+			set_bit(i, up->fp_bitmap);
+
+	up->bfregs = bfregs;
+	up->fp_avail = bfregs * MLX5_FP_BFREGS_PER_UAR / MLX5_BFREGS_PER_UAR;
+	up->reg_avail = bfregs * MLX5_NON_FP_BFREGS_PER_UAR / MLX5_BFREGS_PER_UAR;
+
+	err = mlx5_cmd_alloc_uar(mdev, &up->index);
+	if (err) {
+		mlx5_core_warn(mdev, "mlx5_cmd_alloc_uar() failed, %d\n", err);
+		goto error1;
+	}
+
+	pfn = uar2pfn(mdev, up->index);
+	if (map_wc) {
+		up->map = ioremap_wc(pfn << PAGE_SHIFT, PAGE_SIZE);
+		if (!up->map) {
+			err = -EAGAIN;
+			goto error2;
+		}
+	} else {
+		up->map = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
+		if (!up->map) {
+			err = -ENOMEM;
+			goto error2;
+		}
+	}
+	kref_init(&up->ref_count);
+	mlx5_core_dbg(mdev, "allocated UAR page: index %d, total bfregs %d\n",
+		      up->index, up->bfregs);
+	return up;
+
+error2:
+	if (mlx5_cmd_free_uar(mdev, up->index))
+		mlx5_core_warn(mdev, "failed to free uar index %d\n", up->index);
+error1:
+	kfree(up->fp_bitmap);
+	kfree(up->reg_bitmap);
+	kfree(up);
+	return ERR_PTR(err);
+}
+
+static unsigned long map_offset(struct mlx5_core_dev *mdev, int dbi)
+{
+	/* return the offset in bytes from the start of the page to the
+	 * blue flame area of the UAR
+	 */
+	return dbi / MLX5_BFREGS_PER_UAR * MLX5_ADAPTER_PAGE_SIZE +
+	       (dbi % MLX5_BFREGS_PER_UAR) *
+	       (1 << MLX5_CAP_GEN(mdev, log_bf_reg_size)) + MLX5_BF_OFFSET;
+}
+
+static int alloc_bfreg(struct mlx5_core_dev *mdev, struct mlx5_sq_bfreg *bfreg,
+		       bool map_wc, bool fast_path)
+{
+	struct mlx5_bfreg_data *bfregs;
+	struct mlx5_uars_page *up;
+	struct list_head *head;
+	unsigned long *bitmap;
+	unsigned int *avail;
+	struct mutex *lock;  /* pointer to right mutex */
+	int dbi;
+
+	bfregs = &mdev->priv.bfregs;
+	if (map_wc) {
+		head = &bfregs->wc_head.list;
+		lock = &bfregs->wc_head.lock;
+	} else {
+		head = &bfregs->reg_head.list;
+		lock = &bfregs->reg_head.lock;
+	}
+	mutex_lock(lock);
+	if (list_empty(head)) {
+		up = alloc_uars_page(mdev, map_wc);
+		if (IS_ERR(up)) {
+			mutex_unlock(lock);
+			return PTR_ERR(up);
+		}
+		list_add(&up->list, head);
+	} else {
+		up = list_entry(head->next, struct mlx5_uars_page, list);
+		kref_get(&up->ref_count);
+	}
+	if (fast_path) {
+		bitmap = up->fp_bitmap;
+		avail = &up->fp_avail;
+	} else {
+		bitmap = up->reg_bitmap;
+		avail = &up->reg_avail;
+	}
+	dbi = find_first_bit(bitmap, up->bfregs);
+	clear_bit(dbi, bitmap);
+	(*avail)--;
+	if (!(*avail))
+		list_del(&up->list);
+
+	bfreg->map = up->map + map_offset(mdev, dbi);
+	bfreg->up = up;
+	bfreg->wc = map_wc;
+	bfreg->index = up->index + dbi / MLX5_BFREGS_PER_UAR;
+	mutex_unlock(lock);
+
+	return 0;
+}
+
+int mlx5_alloc_bfreg(struct mlx5_core_dev *mdev, struct mlx5_sq_bfreg *bfreg,
+		     bool map_wc, bool fast_path)
+{
+	int err;
+
+	err = alloc_bfreg(mdev, bfreg, map_wc, fast_path);
+	if (!err)
+		return 0;
+
+	if (err == -EAGAIN && map_wc)
+		return alloc_bfreg(mdev, bfreg, false, fast_path);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_alloc_bfreg);
+
+static unsigned int addr_to_dbi_in_syspage(struct mlx5_core_dev *dev,
+					   struct mlx5_uars_page *up,
+					   struct mlx5_sq_bfreg *bfreg)
+{
+	unsigned int uar_idx;
+	unsigned int bfreg_idx;
+	unsigned int bf_reg_size;
+
+	bf_reg_size = 1 << MLX5_CAP_GEN(dev, log_bf_reg_size);
+
+	uar_idx = (bfreg->map - up->map) >> MLX5_ADAPTER_PAGE_SHIFT;
+	bfreg_idx = (((uintptr_t)bfreg->map % MLX5_ADAPTER_PAGE_SIZE) - MLX5_BF_OFFSET) / bf_reg_size;
+
+	return uar_idx * MLX5_BFREGS_PER_UAR + bfreg_idx;
+}
+
+void mlx5_free_bfreg(struct mlx5_core_dev *mdev, struct mlx5_sq_bfreg *bfreg)
+{
+	struct mlx5_bfreg_data *bfregs;
+	struct mlx5_uars_page *up;
+	struct mutex *lock; /* pointer to right mutex */
+	unsigned int dbi;
+	bool fp;
+	unsigned int *avail;
+	unsigned long *bitmap;
+	struct list_head *head;
+
+	bfregs = &mdev->priv.bfregs;
+	if (bfreg->wc) {
+		head = &bfregs->wc_head.list;
+		lock = &bfregs->wc_head.lock;
+	} else {
+		head = &bfregs->reg_head.list;
+		lock = &bfregs->reg_head.lock;
+	}
+	up = bfreg->up;
+	dbi = addr_to_dbi_in_syspage(mdev, up, bfreg);
+	fp = (dbi % MLX5_BFREGS_PER_UAR) >= MLX5_NON_FP_BFREGS_PER_UAR;
+	if (fp) {
+		avail = &up->fp_avail;
+		bitmap = up->fp_bitmap;
+	} else {
+		avail = &up->reg_avail;
+		bitmap = up->reg_bitmap;
+	}
+	mutex_lock(lock);
+	(*avail)++;
+	set_bit(dbi, bitmap);
+	if (*avail == 1)
+		list_add_tail(&up->list, head);
+
+	kref_put(&up->ref_count, up_rel_func);
+	mutex_unlock(lock);
+}
+EXPORT_SYMBOL(mlx5_free_bfreg);
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index aa851c51ab59..db1b9287012f 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -215,6 +215,8 @@ enum {
 	MLX5_BFREGS_PER_UAR		= 4,
 	MLX5_MAX_UARS			= 1 << 8,
 	MLX5_NON_FP_BFREGS_PER_UAR	= 2,
+	MLX5_FP_BFREGS_PER_UAR		= MLX5_BFREGS_PER_UAR -
+					  MLX5_NON_FP_BFREGS_PER_UAR,
 	MLX5_MAX_BFREGS			= MLX5_MAX_UARS *
 					  MLX5_NON_FP_BFREGS_PER_UAR,
 };
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 3d07e25b3bf1..969aa1fe17e2 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -452,6 +452,39 @@ struct mlx5_eq_table {
 	spinlock_t		lock;
 };
 
+struct mlx5_uars_page {
+	void __iomem	       *map;
+	bool			wc;
+	u32			index;
+	struct list_head	list;
+	unsigned int		bfregs;
+	unsigned long	       *reg_bitmap; /* for non fast path bf regs */
+	unsigned long	       *fp_bitmap;
+	unsigned int		reg_avail;
+	unsigned int		fp_avail;
+	struct kref		ref_count;
+	struct mlx5_core_dev   *mdev;
+};
+
+struct mlx5_bfreg_head {
+	/* protect blue flame registers allocations */
+	struct mutex		lock;
+	struct list_head	list;
+};
+
+struct mlx5_bfreg_data {
+	struct mlx5_bfreg_head	reg_head;
+	struct mlx5_bfreg_head	wc_head;
+};
+
+struct mlx5_sq_bfreg {
+	void __iomem	       *map;
+	struct mlx5_uars_page  *up;
+	bool			wc;
+	u32			index;
+	unsigned int		offset;
+};
+
 struct mlx5_uar {
 	u32			index;
 	struct list_head	bf_list;
@@ -645,6 +678,7 @@ struct mlx5_priv {
 	void		       *pfault_ctx;
 	struct srcu_struct      pfault_srcu;
 #endif
+	struct mlx5_bfreg_data		bfregs;
 };
 
 enum mlx5_device_state {
@@ -1022,6 +1056,9 @@ void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev);
 int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u32 rate, u16 *index);
 void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, u32 rate);
 bool mlx5_rl_is_in_range(struct mlx5_core_dev *dev, u32 rate);
+int mlx5_alloc_bfreg(struct mlx5_core_dev *mdev, struct mlx5_sq_bfreg *bfreg,
+		     bool map_wc, bool fast_path);
+void mlx5_free_bfreg(struct mlx5_core_dev *mdev, struct mlx5_sq_bfreg *bfreg);
 
 static inline int fw_initializing(struct mlx5_core_dev *dev)
 {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 15f896781966..1223feff0ea4 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -905,7 +905,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         uc[0x1];
 	u8         rc[0x1];
 
-	u8         reserved_at_240[0xa];
+	u8         uar_4k[0x1];
+	u8         reserved_at_241[0x9];
 	u8         uar_sz[0x6];
 	u8         reserved_at_250[0x8];
 	u8         log_pg_sz[0x8];
@@ -997,7 +998,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         device_frequency_mhz[0x20];
 	u8         device_frequency_khz[0x20];
 
-	u8         reserved_at_500[0x80];
+	u8         reserved_at_500[0x20];
+	u8	   num_of_uars_per_page[0x20];
+	u8         reserved_at_540[0x40];
 
 	u8         reserved_at_580[0x3f];
 	u8         cqe_compression[0x1];
-- 
cgit v1.2.3


From 9f09eaeae28a0c67b8fc2b5acb411a5d4fd8cc80 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Fri, 6 Jan 2017 17:39:06 -0800
Subject: net: ipmr: Remove nowait arg to ipmr_get_route

ipmr_get_route has 1 caller and the nowait arg is 0. Remove the arg and
simplify ipmr_get_route accordingly.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mroute.h | 2 +-
 net/ipv4/ipmr.c        | 7 +------
 net/ipv4/route.c       | 2 +-
 3 files changed, 3 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index e5fb81376e92..f019b62f27b5 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -120,5 +120,5 @@ struct mfc_cache {
 struct rtmsg;
 int ipmr_get_route(struct net *net, struct sk_buff *skb,
 		   __be32 saddr, __be32 daddr,
-		   struct rtmsg *rtm, int nowait, u32 portid);
+		   struct rtmsg *rtm, u32 portid);
 #endif
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index b35dda57586b..824c4fdf21eb 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2136,7 +2136,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 
 int ipmr_get_route(struct net *net, struct sk_buff *skb,
 		   __be32 saddr, __be32 daddr,
-		   struct rtmsg *rtm, int nowait, u32 portid)
+		   struct rtmsg *rtm, u32 portid)
 {
 	struct mfc_cache *cache;
 	struct mr_table *mrt;
@@ -2160,11 +2160,6 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
 		struct net_device *dev;
 		int vif = -1;
 
-		if (nowait) {
-			rcu_read_unlock();
-			return -EAGAIN;
-		}
-
 		dev = skb->dev;
 		read_lock(&mrt_lock);
 		if (dev)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 7b52ac20145b..0965f8c59e7e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2541,7 +2541,7 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
 			int err = ipmr_get_route(net, skb,
 						 fl4->saddr, fl4->daddr,
-						 r, 0, portid);
+						 r, portid);
 
 			if (err <= 0) {
 				if (err == 0)
-- 
cgit v1.2.3


From bc1f44709cf27fb2a5766cadafe7e2ad5e9cb221 Mon Sep 17 00:00:00 2001
From: stephen hemminger <stephen@networkplumber.org>
Date: Fri, 6 Jan 2017 19:12:52 -0800
Subject: net: make ndo_get_stats64 a void function

The network device operation for reading statistics is only called
in one place, and it ignores the return value. Having a structure
return value is potentially confusing because some future driver could
incorrectly assume that the return value was used.

Fix all drivers with ndo_get_stats64 to have a void function.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c                      | 10 ++++------
 drivers/net/dummy.c                                  |  5 ++---
 drivers/net/ethernet/alacritech/slicoss.c            |  6 ++----
 drivers/net/ethernet/amazon/ena/ena_netdev.c         | 10 ++++------
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c             |  6 ++----
 drivers/net/ethernet/apm/xgene/xgene_enet_main.c     |  4 +---
 drivers/net/ethernet/atheros/alx/main.c              |  6 ++----
 drivers/net/ethernet/broadcom/b44.c                  |  5 ++---
 drivers/net/ethernet/broadcom/bnx2.c                 |  5 ++---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c            |  6 ++----
 drivers/net/ethernet/broadcom/tg3.c                  |  8 +++-----
 drivers/net/ethernet/brocade/bna/bnad.c              |  6 ++----
 drivers/net/ethernet/calxeda/xgmac.c                 |  5 ++---
 drivers/net/ethernet/cavium/thunder/nicvf_main.c     |  5 ++---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c      |  7 +++----
 drivers/net/ethernet/cisco/enic/enic_main.c          |  8 +++-----
 drivers/net/ethernet/ec_bhf.c                        |  4 +---
 drivers/net/ethernet/emulex/benet/be_main.c          |  5 ++---
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.c       |  6 ++----
 drivers/net/ethernet/hisilicon/hns/hns_enet.c        |  6 ++----
 drivers/net/ethernet/ibm/ehea/ehea_main.c            |  5 ++---
 drivers/net/ethernet/intel/e1000e/e1000.h            |  4 ++--
 drivers/net/ethernet/intel/e1000e/netdev.c           |  5 ++---
 drivers/net/ethernet/intel/fm10k/fm10k_netdev.c      |  6 ++----
 drivers/net/ethernet/intel/i40e/i40e.h               |  5 ++---
 drivers/net/ethernet/intel/i40e/i40e_main.c          | 18 ++++++------------
 drivers/net/ethernet/intel/igb/igb_main.c            | 10 ++++------
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c        |  7 ++++---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c    |  6 ++----
 drivers/net/ethernet/marvell/mvneta.c                |  4 +---
 drivers/net/ethernet/marvell/mvpp2.c                 |  4 +---
 drivers/net/ethernet/marvell/sky2.c                  |  6 ++----
 drivers/net/ethernet/mediatek/mtk_eth_soc.c          |  6 ++----
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c       |  4 +---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c    |  3 +--
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c     |  3 +--
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c       |  4 +---
 drivers/net/ethernet/mellanox/mlxsw/switchx2.c       |  3 +--
 drivers/net/ethernet/myricom/myri10ge/myri10ge.c     |  9 ++++-----
 drivers/net/ethernet/neterion/vxge/vxge-main.c       |  4 +---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c  |  6 ++----
 drivers/net/ethernet/nvidia/forcedeth.c              |  4 +---
 drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c | 10 ++++------
 drivers/net/ethernet/qlogic/qede/qede_main.c         |  7 ++-----
 drivers/net/ethernet/qualcomm/emac/emac.c            |  6 ++----
 drivers/net/ethernet/realtek/8139too.c               |  9 +++------
 drivers/net/ethernet/realtek/r8169.c                 |  4 +---
 drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c      |  8 ++------
 drivers/net/ethernet/sfc/efx.c                       |  6 ++----
 drivers/net/ethernet/sfc/falcon/efx.c                |  6 ++----
 drivers/net/ethernet/sun/niu.c                       |  6 ++----
 drivers/net/ethernet/synopsys/dwc_eth_qos.c          |  4 +---
 drivers/net/ethernet/tile/tilepro.c                  |  4 ++--
 drivers/net/ethernet/via/via-rhine.c                 |  8 +++-----
 drivers/net/fjes/fjes_main.c                         |  7 ++-----
 drivers/net/hyperv/netvsc_drv.c                      |  6 ++----
 drivers/net/ifb.c                                    |  6 ++----
 drivers/net/ipvlan/ipvlan_main.c                     |  5 ++---
 drivers/net/loopback.c                               |  5 ++---
 drivers/net/macsec.c                                 |  8 +++-----
 drivers/net/macvlan.c                                |  5 ++---
 drivers/net/nlmon.c                                  |  4 +---
 drivers/net/ppp/ppp_generic.c                        |  4 +---
 drivers/net/slip/slip.c                              |  3 +--
 drivers/net/team/team.c                              |  3 +--
 drivers/net/tun.c                                    |  3 +--
 drivers/net/veth.c                                   |  6 ++----
 drivers/net/virtio_net.c                             |  6 ++----
 drivers/net/vmxnet3/vmxnet3_ethtool.c                |  4 +---
 drivers/net/vmxnet3/vmxnet3_int.h                    |  4 ++--
 drivers/net/vrf.c                                    |  5 ++---
 drivers/net/xen-netfront.c                           |  6 ++----
 drivers/staging/netlogic/xlr_net.c                   | 10 +---------
 include/linux/netdevice.h                            |  8 ++++----
 include/net/ip_tunnels.h                             |  4 ++--
 net/8021q/vlan_dev.c                                 |  5 ++---
 net/bridge/br_device.c                               |  6 ++----
 net/ipv4/ip_tunnel_core.c                            |  6 ++----
 net/l2tp/l2tp_eth.c                                  |  6 ++----
 net/mac80211/iface.c                                 |  4 +---
 net/openvswitch/vport-internal_dev.c                 |  4 +---
 net/sched/sch_teql.c                                 |  5 ++---
 82 files changed, 166 insertions(+), 309 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 8029dd4912b6..36919221b3f0 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -211,8 +211,8 @@ static int lacp_fast;
 
 static int bond_init(struct net_device *bond_dev);
 static void bond_uninit(struct net_device *bond_dev);
-static struct rtnl_link_stats64 *bond_get_stats(struct net_device *bond_dev,
-						struct rtnl_link_stats64 *stats);
+static void bond_get_stats(struct net_device *bond_dev,
+			   struct rtnl_link_stats64 *stats);
 static void bond_slave_arr_handler(struct work_struct *work);
 static bool bond_time_in_interval(struct bonding *bond, unsigned long last_act,
 				  int mod);
@@ -3337,8 +3337,8 @@ static void bond_fold_stats(struct rtnl_link_stats64 *_res,
 	}
 }
 
-static struct rtnl_link_stats64 *bond_get_stats(struct net_device *bond_dev,
-						struct rtnl_link_stats64 *stats)
+static void bond_get_stats(struct net_device *bond_dev,
+			   struct rtnl_link_stats64 *stats)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct rtnl_link_stats64 temp;
@@ -3362,8 +3362,6 @@ static struct rtnl_link_stats64 *bond_get_stats(struct net_device *bond_dev,
 
 	memcpy(&bond->bond_stats, stats, sizeof(*stats));
 	spin_unlock(&bond->stats_lock);
-
-	return stats;
 }
 
 static int bond_do_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd)
diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c
index 6421835f11b7..1f2de4e8207c 100644
--- a/drivers/net/dummy.c
+++ b/drivers/net/dummy.c
@@ -54,8 +54,8 @@ struct pcpu_dstats {
 	struct u64_stats_sync	syncp;
 };
 
-static struct rtnl_link_stats64 *dummy_get_stats64(struct net_device *dev,
-						   struct rtnl_link_stats64 *stats)
+static void dummy_get_stats64(struct net_device *dev,
+			      struct rtnl_link_stats64 *stats)
 {
 	int i;
 
@@ -73,7 +73,6 @@ static struct rtnl_link_stats64 *dummy_get_stats64(struct net_device *dev,
 		stats->tx_bytes += tbytes;
 		stats->tx_packets += tpackets;
 	}
-	return stats;
 }
 
 static netdev_tx_t dummy_xmit(struct sk_buff *skb, struct net_device *dev)
diff --git a/drivers/net/ethernet/alacritech/slicoss.c b/drivers/net/ethernet/alacritech/slicoss.c
index b21d8aa8d653..15a8096c60df 100644
--- a/drivers/net/ethernet/alacritech/slicoss.c
+++ b/drivers/net/ethernet/alacritech/slicoss.c
@@ -1471,8 +1471,8 @@ drop_skb:
 	return NETDEV_TX_OK;
 }
 
-static struct rtnl_link_stats64 *slic_get_stats(struct net_device *dev,
-						struct rtnl_link_stats64 *lst)
+static void slic_get_stats(struct net_device *dev,
+			   struct rtnl_link_stats64 *lst)
 {
 	struct slic_device *sdev = netdev_priv(dev);
 	struct slic_stats *stats = &sdev->stats;
@@ -1489,8 +1489,6 @@ static struct rtnl_link_stats64 *slic_get_stats(struct net_device *dev,
 	SLIC_GET_STATS_COUNTER(lst->rx_crc_errors, stats, rx_crc);
 	SLIC_GET_STATS_COUNTER(lst->rx_fifo_errors, stats, rx_oflow802);
 	SLIC_GET_STATS_COUNTER(lst->tx_carrier_errors, stats, tx_carrier);
-
-	return lst;
 }
 
 static int slic_get_sset_count(struct net_device *dev, int sset)
diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index cc8b13ebfa75..aca95b397393 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -2165,19 +2165,19 @@ err:
 	ena_com_delete_debug_area(adapter->ena_dev);
 }
 
-static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev,
-						 struct rtnl_link_stats64 *stats)
+static void ena_get_stats64(struct net_device *netdev,
+			    struct rtnl_link_stats64 *stats)
 {
 	struct ena_adapter *adapter = netdev_priv(netdev);
 	struct ena_admin_basic_stats ena_stats;
 	int rc;
 
 	if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags))
-		return NULL;
+		return;
 
 	rc = ena_com_get_dev_basic_stats(adapter->ena_dev, &ena_stats);
 	if (rc)
-		return NULL;
+		return;
 
 	stats->tx_bytes = ((u64)ena_stats.tx_bytes_high << 32) |
 		ena_stats.tx_bytes_low;
@@ -2204,8 +2204,6 @@ static struct rtnl_link_stats64 *ena_get_stats64(struct net_device *netdev,
 
 	stats->rx_errors = 0;
 	stats->tx_errors = 0;
-
-	return stats;
 }
 
 static const struct net_device_ops ena_netdev_ops = {
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 155190db682d..130de11fa553 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -1759,8 +1759,8 @@ static void xgbe_tx_timeout(struct net_device *netdev)
 	schedule_work(&pdata->restart_work);
 }
 
-static struct rtnl_link_stats64 *xgbe_get_stats64(struct net_device *netdev,
-						  struct rtnl_link_stats64 *s)
+static void xgbe_get_stats64(struct net_device *netdev,
+			     struct rtnl_link_stats64 *s)
 {
 	struct xgbe_prv_data *pdata = netdev_priv(netdev);
 	struct xgbe_mmc_stats *pstats = &pdata->mmc_stats;
@@ -1786,8 +1786,6 @@ static struct rtnl_link_stats64 *xgbe_get_stats64(struct net_device *netdev,
 	s->tx_dropped = netdev->stats.tx_dropped;
 
 	DBGPR("<--%s\n", __func__);
-
-	return s;
 }
 
 static int xgbe_vlan_rx_add_vid(struct net_device *netdev, __be16 proto,
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
index 523b8eff6d7b..45f2204e6695 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
@@ -1453,7 +1453,7 @@ err:
 	return ret;
 }
 
-static struct rtnl_link_stats64 *xgene_enet_get_stats64(
+static void xgene_enet_get_stats64(
 			struct net_device *ndev,
 			struct rtnl_link_stats64 *storage)
 {
@@ -1484,8 +1484,6 @@ static struct rtnl_link_stats64 *xgene_enet_get_stats64(
 		}
 	}
 	memcpy(storage, stats, sizeof(struct rtnl_link_stats64));
-
-	return storage;
 }
 
 static int xgene_enet_set_mac_address(struct net_device *ndev, void *addr)
diff --git a/drivers/net/ethernet/atheros/alx/main.c b/drivers/net/ethernet/atheros/alx/main.c
index c8f525574d68..c66195d00ed4 100644
--- a/drivers/net/ethernet/atheros/alx/main.c
+++ b/drivers/net/ethernet/atheros/alx/main.c
@@ -1643,8 +1643,8 @@ static void alx_poll_controller(struct net_device *netdev)
 }
 #endif
 
-static struct rtnl_link_stats64 *alx_get_stats64(struct net_device *dev,
-					struct rtnl_link_stats64 *net_stats)
+static void alx_get_stats64(struct net_device *dev,
+			    struct rtnl_link_stats64 *net_stats)
 {
 	struct alx_priv *alx = netdev_priv(dev);
 	struct alx_hw_stats *hw_stats = &alx->hw.stats;
@@ -1688,8 +1688,6 @@ static struct rtnl_link_stats64 *alx_get_stats64(struct net_device *dev,
 	net_stats->rx_packets = hw_stats->rx_ok + net_stats->rx_errors;
 
 	spin_unlock(&alx->stats_lock);
-
-	return net_stats;
 }
 
 static const struct net_device_ops alx_netdev_ops = {
diff --git a/drivers/net/ethernet/broadcom/b44.c b/drivers/net/ethernet/broadcom/b44.c
index 48707ed76ffc..7aef70f7d8ef 100644
--- a/drivers/net/ethernet/broadcom/b44.c
+++ b/drivers/net/ethernet/broadcom/b44.c
@@ -1674,8 +1674,8 @@ static int b44_close(struct net_device *dev)
 	return 0;
 }
 
-static struct rtnl_link_stats64 *b44_get_stats64(struct net_device *dev,
-					struct rtnl_link_stats64 *nstat)
+static void b44_get_stats64(struct net_device *dev,
+			    struct rtnl_link_stats64 *nstat)
 {
 	struct b44 *bp = netdev_priv(dev);
 	struct b44_hw_stats *hwstat = &bp->hw_stats;
@@ -1718,7 +1718,6 @@ static struct rtnl_link_stats64 *b44_get_stats64(struct net_device *dev,
 #endif
 	} while (u64_stats_fetch_retry_irq(&hwstat->syncp, start));
 
-	return nstat;
 }
 
 static int __b44_load_mcast(struct b44 *bp, struct net_device *dev)
diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c
index d5d1026be4b7..de1d07c08495 100644
--- a/drivers/net/ethernet/broadcom/bnx2.c
+++ b/drivers/net/ethernet/broadcom/bnx2.c
@@ -6821,13 +6821,13 @@ bnx2_save_stats(struct bnx2 *bp)
 	(unsigned long) (bp->stats_blk->ctr +			\
 			 bp->temp_stats_blk->ctr)
 
-static struct rtnl_link_stats64 *
+static void
 bnx2_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *net_stats)
 {
 	struct bnx2 *bp = netdev_priv(dev);
 
 	if (bp->stats_blk == NULL)
-		return net_stats;
+		return;
 
 	net_stats->rx_packets =
 		GET_64BIT_NET_STATS(stat_IfHCInUcastPkts) +
@@ -6891,7 +6891,6 @@ bnx2_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *net_stats)
 		GET_32BIT_NET_STATS(stat_IfInMBUFDiscards) +
 		GET_32BIT_NET_STATS(stat_FwRxDrop);
 
-	return net_stats;
 }
 
 /* All ethtool functions called with rtnl_lock */
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 98e948489700..e5f458396e1a 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -5879,7 +5879,7 @@ static int bnxt_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	return -EOPNOTSUPP;
 }
 
-static struct rtnl_link_stats64 *
+static void
 bnxt_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	u32 i;
@@ -5888,7 +5888,7 @@ bnxt_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	memset(stats, 0, sizeof(struct rtnl_link_stats64));
 
 	if (!bp->bnapi)
-		return stats;
+		return;
 
 	/* TODO check if we need to synchronize with bnxt_close path */
 	for (i = 0; i < bp->cp_nr_rings; i++) {
@@ -5935,8 +5935,6 @@ bnxt_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 		stats->tx_fifo_errors = le64_to_cpu(tx->tx_fifo_underruns);
 		stats->tx_errors = le64_to_cpu(tx->tx_err);
 	}
-
-	return stats;
 }
 
 static bool bnxt_mc_list_updated(struct bnxt *bp, u32 *rx_mask)
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 185e9e047aa9..800328f562fa 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -14142,8 +14142,8 @@ static const struct ethtool_ops tg3_ethtool_ops = {
 	.set_link_ksettings	= tg3_set_link_ksettings,
 };
 
-static struct rtnl_link_stats64 *tg3_get_stats64(struct net_device *dev,
-						struct rtnl_link_stats64 *stats)
+static void tg3_get_stats64(struct net_device *dev,
+			    struct rtnl_link_stats64 *stats)
 {
 	struct tg3 *tp = netdev_priv(dev);
 
@@ -14151,13 +14151,11 @@ static struct rtnl_link_stats64 *tg3_get_stats64(struct net_device *dev,
 	if (!tp->hw_stats) {
 		*stats = tp->net_stats_prev;
 		spin_unlock_bh(&tp->lock);
-		return stats;
+		return;
 	}
 
 	tg3_get_nstats(tp, stats);
 	spin_unlock_bh(&tp->lock);
-
-	return stats;
 }
 
 static void tg3_set_rx_mode(struct net_device *dev)
diff --git a/drivers/net/ethernet/brocade/bna/bnad.c b/drivers/net/ethernet/brocade/bna/bnad.c
index 112030828c4b..73a94113db1f 100644
--- a/drivers/net/ethernet/brocade/bna/bnad.c
+++ b/drivers/net/ethernet/brocade/bna/bnad.c
@@ -3111,7 +3111,7 @@ bnad_start_xmit(struct sk_buff *skb, struct net_device *netdev)
  * Used spin_lock to synchronize reading of stats structures, which
  * is written by BNA under the same lock.
  */
-static struct rtnl_link_stats64 *
+static void
 bnad_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *stats)
 {
 	struct bnad *bnad = netdev_priv(netdev);
@@ -3123,8 +3123,6 @@ bnad_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *stats)
 	bnad_netdev_hwstats_fill(bnad, stats);
 
 	spin_unlock_irqrestore(&bnad->bna_lock, flags);
-
-	return stats;
 }
 
 static void
@@ -3427,7 +3425,7 @@ static const struct net_device_ops bnad_netdev_ops = {
 	.ndo_open		= bnad_open,
 	.ndo_stop		= bnad_stop,
 	.ndo_start_xmit		= bnad_start_xmit,
-	.ndo_get_stats64		= bnad_get_stats64,
+	.ndo_get_stats64	= bnad_get_stats64,
 	.ndo_set_rx_mode	= bnad_set_rx_mode,
 	.ndo_validate_addr      = eth_validate_addr,
 	.ndo_set_mac_address    = bnad_set_mac_address,
diff --git a/drivers/net/ethernet/calxeda/xgmac.c b/drivers/net/ethernet/calxeda/xgmac.c
index ce7de6f72512..b0540658afad 100644
--- a/drivers/net/ethernet/calxeda/xgmac.c
+++ b/drivers/net/ethernet/calxeda/xgmac.c
@@ -1446,9 +1446,9 @@ static void xgmac_poll_controller(struct net_device *dev)
 }
 #endif
 
-static struct rtnl_link_stats64 *
+static void
 xgmac_get_stats64(struct net_device *dev,
-		       struct rtnl_link_stats64 *storage)
+		  struct rtnl_link_stats64 *storage)
 {
 	struct xgmac_priv *priv = netdev_priv(dev);
 	void __iomem *base = priv->base;
@@ -1476,7 +1476,6 @@ xgmac_get_stats64(struct net_device *dev,
 
 	writel(0, base + XGMAC_MMC_CTRL);
 	spin_unlock_bh(&priv->stats_lock);
-	return storage;
 }
 
 static int xgmac_set_mac_address(struct net_device *dev, void *p)
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index 2006f58b14b1..273eafdb1c57 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -1461,8 +1461,8 @@ void nicvf_update_stats(struct nicvf *nic)
 		nicvf_update_sq_stats(nic, qidx);
 }
 
-static struct rtnl_link_stats64 *nicvf_get_stats64(struct net_device *netdev,
-					    struct rtnl_link_stats64 *stats)
+static void nicvf_get_stats64(struct net_device *netdev,
+			      struct rtnl_link_stats64 *stats)
 {
 	struct nicvf *nic = netdev_priv(netdev);
 	struct nicvf_hw_stats *hw_stats = &nic->hw_stats;
@@ -1478,7 +1478,6 @@ static struct rtnl_link_stats64 *nicvf_get_stats64(struct net_device *netdev,
 	stats->tx_packets = hw_stats->tx_frames;
 	stats->tx_dropped = hw_stats->tx_drops;
 
-	return stats;
 }
 
 static void nicvf_tx_timeout(struct net_device *dev)
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 629b11879ceb..3349e1f376c3 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -2375,8 +2375,8 @@ int cxgb4_remove_server_filter(const struct net_device *dev, unsigned int stid,
 }
 EXPORT_SYMBOL(cxgb4_remove_server_filter);
 
-static struct rtnl_link_stats64 *cxgb_get_stats(struct net_device *dev,
-						struct rtnl_link_stats64 *ns)
+static void cxgb_get_stats(struct net_device *dev,
+			   struct rtnl_link_stats64 *ns)
 {
 	struct port_stats stats;
 	struct port_info *p = netdev_priv(dev);
@@ -2389,7 +2389,7 @@ static struct rtnl_link_stats64 *cxgb_get_stats(struct net_device *dev,
 	spin_lock(&adapter->stats_lock);
 	if (!netif_device_present(dev)) {
 		spin_unlock(&adapter->stats_lock);
-		return ns;
+		return;
 	}
 	t4_get_port_stats_offset(adapter, p->tx_chan, &stats,
 				 &p->stats_base);
@@ -2423,7 +2423,6 @@ static struct rtnl_link_stats64 *cxgb_get_stats(struct net_device *dev,
 	ns->tx_errors = stats.tx_error_frames;
 	ns->rx_errors = stats.rx_symbol_err + stats.rx_fcs_err +
 		ns->rx_length_errors + stats.rx_len_err + ns->rx_fifo_errors;
-	return ns;
 }
 
 static int cxgb_ioctl(struct net_device *dev, struct ifreq *req, int cmd)
diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c
index cdd7a1a59aa7..c5842c525eed 100644
--- a/drivers/net/ethernet/cisco/enic/enic_main.c
+++ b/drivers/net/ethernet/cisco/enic/enic_main.c
@@ -680,8 +680,8 @@ static netdev_tx_t enic_hard_start_xmit(struct sk_buff *skb,
 }
 
 /* dev_base_lock rwlock held, nominally process context */
-static struct rtnl_link_stats64 *enic_get_stats(struct net_device *netdev,
-						struct rtnl_link_stats64 *net_stats)
+static void enic_get_stats(struct net_device *netdev,
+			   struct rtnl_link_stats64 *net_stats)
 {
 	struct enic *enic = netdev_priv(netdev);
 	struct vnic_stats *stats;
@@ -693,7 +693,7 @@ static struct rtnl_link_stats64 *enic_get_stats(struct net_device *netdev,
 	 * recorded stats.
 	 */
 	if (err == -ENOMEM)
-		return net_stats;
+		return;
 
 	net_stats->tx_packets = stats->tx.tx_frames_ok;
 	net_stats->tx_bytes = stats->tx.tx_bytes_ok;
@@ -707,8 +707,6 @@ static struct rtnl_link_stats64 *enic_get_stats(struct net_device *netdev,
 	net_stats->rx_over_errors = enic->rq_truncated_pkts;
 	net_stats->rx_crc_errors = enic->rq_bad_fcs;
 	net_stats->rx_dropped = stats->rx.rx_no_bufs + stats->rx.rx_drop;
-
-	return net_stats;
 }
 
 static int enic_mc_sync(struct net_device *netdev, const u8 *mc_addr)
diff --git a/drivers/net/ethernet/ec_bhf.c b/drivers/net/ethernet/ec_bhf.c
index 7bf78a0d322c..278f139f2a22 100644
--- a/drivers/net/ethernet/ec_bhf.c
+++ b/drivers/net/ethernet/ec_bhf.c
@@ -457,7 +457,7 @@ static int ec_bhf_stop(struct net_device *net_dev)
 	return 0;
 }
 
-static struct rtnl_link_stats64 *
+static void
 ec_bhf_get_stats(struct net_device *net_dev,
 		 struct rtnl_link_stats64 *stats)
 {
@@ -472,8 +472,6 @@ ec_bhf_get_stats(struct net_device *net_dev,
 
 	stats->tx_bytes = priv->stat_tx_bytes;
 	stats->rx_bytes = priv->stat_rx_bytes;
-
-	return stats;
 }
 
 static const struct net_device_ops ec_bhf_netdev_ops = {
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index 225e9a4877d7..0a679d2eaeee 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -639,8 +639,8 @@ void be_parse_stats(struct be_adapter *adapter)
 	}
 }
 
-static struct rtnl_link_stats64 *be_get_stats64(struct net_device *netdev,
-						struct rtnl_link_stats64 *stats)
+static void be_get_stats64(struct net_device *netdev,
+			   struct rtnl_link_stats64 *stats)
 {
 	struct be_adapter *adapter = netdev_priv(netdev);
 	struct be_drv_stats *drvs = &adapter->drv_stats;
@@ -704,7 +704,6 @@ static struct rtnl_link_stats64 *be_get_stats64(struct net_device *netdev,
 	stats->rx_fifo_errors = drvs->rxpp_fifo_overflow_drop +
 				drvs->rx_input_fifo_overflow_drop +
 				drvs->rx_drops_no_pbuf;
-	return stats;
 }
 
 void be_link_status_update(struct be_adapter *adapter, u8 link_status)
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
index c9b7ad65e563..b7cbc26a0911 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
@@ -313,8 +313,8 @@ static void dpaa_tx_timeout(struct net_device *net_dev)
 /* Calculates the statistics for the given device by adding the statistics
  * collected by each CPU.
  */
-static struct rtnl_link_stats64 *dpaa_get_stats64(struct net_device *net_dev,
-						  struct rtnl_link_stats64 *s)
+static void dpaa_get_stats64(struct net_device *net_dev,
+			     struct rtnl_link_stats64 *s)
 {
 	int numstats = sizeof(struct rtnl_link_stats64) / sizeof(u64);
 	struct dpaa_priv *priv = netdev_priv(net_dev);
@@ -332,8 +332,6 @@ static struct rtnl_link_stats64 *dpaa_get_stats64(struct net_device *net_dev,
 		for (j = 0; j < numstats; j++)
 			netstats[j] += cpustats[j];
 	}
-
-	return s;
 }
 
 static struct mac_device *dpaa_mac_dev_get(struct platform_device *pdev)
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
index 672b64606321..b7cb61385ad8 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
@@ -1625,8 +1625,8 @@ void hns_nic_set_rx_mode(struct net_device *ndev)
 		netdev_err(ndev, "sync uc address fail\n");
 }
 
-struct rtnl_link_stats64 *hns_nic_get_stats64(struct net_device *ndev,
-					      struct rtnl_link_stats64 *stats)
+static void hns_nic_get_stats64(struct net_device *ndev,
+				struct rtnl_link_stats64 *stats)
 {
 	int idx = 0;
 	u64 tx_bytes = 0;
@@ -1668,8 +1668,6 @@ struct rtnl_link_stats64 *hns_nic_get_stats64(struct net_device *ndev,
 	stats->tx_window_errors = ndev->stats.tx_window_errors;
 	stats->rx_compressed = ndev->stats.rx_compressed;
 	stats->tx_compressed = ndev->stats.tx_compressed;
-
-	return stats;
 }
 
 static u16
diff --git a/drivers/net/ethernet/ibm/ehea/ehea_main.c b/drivers/net/ethernet/ibm/ehea/ehea_main.c
index 702446a93697..1e53d7a82675 100644
--- a/drivers/net/ethernet/ibm/ehea/ehea_main.c
+++ b/drivers/net/ethernet/ibm/ehea/ehea_main.c
@@ -328,8 +328,8 @@ out:
 	spin_unlock_irqrestore(&ehea_bcmc_regs.lock, flags);
 }
 
-static struct rtnl_link_stats64 *ehea_get_stats64(struct net_device *dev,
-					struct rtnl_link_stats64 *stats)
+static void ehea_get_stats64(struct net_device *dev,
+			     struct rtnl_link_stats64 *stats)
 {
 	struct ehea_port *port = netdev_priv(dev);
 	u64 rx_packets = 0, tx_packets = 0, rx_bytes = 0, tx_bytes = 0;
@@ -352,7 +352,6 @@ static struct rtnl_link_stats64 *ehea_get_stats64(struct net_device *dev,
 
 	stats->multicast = port->stats.multicast;
 	stats->rx_errors = port->stats.rx_errors;
-	return stats;
 }
 
 static void ehea_update_stats(struct work_struct *work)
diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h
index 879cca47b021..a29b12e80855 100644
--- a/drivers/net/ethernet/intel/e1000e/e1000.h
+++ b/drivers/net/ethernet/intel/e1000e/e1000.h
@@ -493,8 +493,8 @@ int e1000e_setup_rx_resources(struct e1000_ring *ring);
 int e1000e_setup_tx_resources(struct e1000_ring *ring);
 void e1000e_free_rx_resources(struct e1000_ring *ring);
 void e1000e_free_tx_resources(struct e1000_ring *ring);
-struct rtnl_link_stats64 *e1000e_get_stats64(struct net_device *netdev,
-					     struct rtnl_link_stats64 *stats);
+void e1000e_get_stats64(struct net_device *netdev,
+			struct rtnl_link_stats64 *stats);
 void e1000e_set_interrupt_capability(struct e1000_adapter *adapter);
 void e1000e_reset_interrupt_capability(struct e1000_adapter *adapter);
 void e1000e_get_hw_control(struct e1000_adapter *adapter);
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index af3960853a32..723025b317cc 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -5920,8 +5920,8 @@ static void e1000_reset_task(struct work_struct *work)
  *
  * Returns the address of the device statistics structure.
  **/
-struct rtnl_link_stats64 *e1000e_get_stats64(struct net_device *netdev,
-					     struct rtnl_link_stats64 *stats)
+void e1000e_get_stats64(struct net_device *netdev,
+			struct rtnl_link_stats64 *stats)
 {
 	struct e1000_adapter *adapter = netdev_priv(netdev);
 
@@ -5958,7 +5958,6 @@ struct rtnl_link_stats64 *e1000e_get_stats64(struct net_device *netdev,
 	/* Tx Dropped needs to be maintained elsewhere */
 
 	spin_unlock(&adapter->stats64_lock);
-	return stats;
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
index bc5ef6eb3dd6..01db688cf539 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
@@ -1118,8 +1118,8 @@ void fm10k_reset_rx_state(struct fm10k_intfc *interface)
  * Returns 64bit statistics, for use in the ndo_get_stats64 callback. This
  * function replaces fm10k_get_stats for kernels which support it.
  */
-static struct rtnl_link_stats64 *fm10k_get_stats64(struct net_device *netdev,
-						   struct rtnl_link_stats64 *stats)
+static void fm10k_get_stats64(struct net_device *netdev,
+			      struct rtnl_link_stats64 *stats)
 {
 	struct fm10k_intfc *interface = netdev_priv(netdev);
 	struct fm10k_ring *ring;
@@ -1164,8 +1164,6 @@ static struct rtnl_link_stats64 *fm10k_get_stats64(struct net_device *netdev,
 
 	/* following stats updated by fm10k_service_task() */
 	stats->rx_missed_errors	= netdev->stats.rx_missed_errors;
-
-	return stats;
 }
 
 int fm10k_setup_tc(struct net_device *dev, u8 tc)
diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index ba8d30984bee..342007df4663 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -834,9 +834,8 @@ static inline void i40e_irq_dynamic_enable(struct i40e_vsi *vsi, int vector)
 void i40e_irq_dynamic_disable_icr0(struct i40e_pf *pf);
 void i40e_irq_dynamic_enable_icr0(struct i40e_pf *pf, bool clearpba);
 #ifdef I40E_FCOE
-struct rtnl_link_stats64 *i40e_get_netdev_stats_struct(
-					     struct net_device *netdev,
-					     struct rtnl_link_stats64 *storage);
+void i40e_get_netdev_stats_struct(struct net_device *netdev,
+				  struct rtnl_link_stats64 *storage);
 int i40e_set_mac(struct net_device *netdev, void *p);
 void i40e_set_rx_mode(struct net_device *netdev);
 #endif
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index ad4cf639430e..b2f76d24000d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -409,15 +409,11 @@ struct rtnl_link_stats64 *i40e_get_vsi_stats_struct(struct i40e_vsi *vsi)
  * Returns the address of the device statistics structure.
  * The statistics are actually updated from the service task.
  **/
-#ifdef I40E_FCOE
-struct rtnl_link_stats64 *i40e_get_netdev_stats_struct(
-					     struct net_device *netdev,
-					     struct rtnl_link_stats64 *stats)
-#else
-static struct rtnl_link_stats64 *i40e_get_netdev_stats_struct(
-					     struct net_device *netdev,
-					     struct rtnl_link_stats64 *stats)
+#ifndef I40E_FCOE
+static
 #endif
+void i40e_get_netdev_stats_struct(struct net_device *netdev,
+				  struct rtnl_link_stats64 *stats)
 {
 	struct i40e_netdev_priv *np = netdev_priv(netdev);
 	struct i40e_ring *tx_ring, *rx_ring;
@@ -426,10 +422,10 @@ static struct rtnl_link_stats64 *i40e_get_netdev_stats_struct(
 	int i;
 
 	if (test_bit(__I40E_DOWN, &vsi->state))
-		return stats;
+		return;
 
 	if (!vsi->tx_rings)
-		return stats;
+		return;
 
 	rcu_read_lock();
 	for (i = 0; i < vsi->num_queue_pairs; i++) {
@@ -469,8 +465,6 @@ static struct rtnl_link_stats64 *i40e_get_netdev_stats_struct(
 	stats->rx_dropped	= vsi_stats->rx_dropped;
 	stats->rx_crc_errors	= vsi_stats->rx_crc_errors;
 	stats->rx_length_errors	= vsi_stats->rx_length_errors;
-
-	return stats;
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 594604e09f8d..7546109d4980 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -137,8 +137,8 @@ static void igb_update_phy_info(unsigned long);
 static void igb_watchdog(unsigned long);
 static void igb_watchdog_task(struct work_struct *);
 static netdev_tx_t igb_xmit_frame(struct sk_buff *skb, struct net_device *);
-static struct rtnl_link_stats64 *igb_get_stats64(struct net_device *dev,
-					  struct rtnl_link_stats64 *stats);
+static void igb_get_stats64(struct net_device *dev,
+			    struct rtnl_link_stats64 *stats);
 static int igb_change_mtu(struct net_device *, int);
 static int igb_set_mac(struct net_device *, void *);
 static void igb_set_uta(struct igb_adapter *adapter, bool set);
@@ -5404,8 +5404,8 @@ static void igb_reset_task(struct work_struct *work)
  *  @netdev: network interface device structure
  *  @stats: rtnl_link_stats64 pointer
  **/
-static struct rtnl_link_stats64 *igb_get_stats64(struct net_device *netdev,
-						struct rtnl_link_stats64 *stats)
+static void igb_get_stats64(struct net_device *netdev,
+			    struct rtnl_link_stats64 *stats)
 {
 	struct igb_adapter *adapter = netdev_priv(netdev);
 
@@ -5413,8 +5413,6 @@ static struct rtnl_link_stats64 *igb_get_stats64(struct net_device *netdev,
 	igb_update_stats(adapter, &adapter->stats64);
 	memcpy(stats, &adapter->stats64, sizeof(*stats));
 	spin_unlock(&adapter->stats64_lock);
-
-	return stats;
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 0c6eca570791..ffe7d940d9ff 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8173,8 +8173,9 @@ static void ixgbe_netpoll(struct net_device *netdev)
 }
 
 #endif
-static struct rtnl_link_stats64 *ixgbe_get_stats64(struct net_device *netdev,
-						   struct rtnl_link_stats64 *stats)
+
+static void ixgbe_get_stats64(struct net_device *netdev,
+			      struct rtnl_link_stats64 *stats)
 {
 	struct ixgbe_adapter *adapter = netdev_priv(netdev);
 	int i;
@@ -8212,13 +8213,13 @@ static struct rtnl_link_stats64 *ixgbe_get_stats64(struct net_device *netdev,
 		}
 	}
 	rcu_read_unlock();
+
 	/* following stats updated by ixgbe_watchdog_task() */
 	stats->multicast	= netdev->stats.multicast;
 	stats->rx_errors	= netdev->stats.rx_errors;
 	stats->rx_length_errors	= netdev->stats.rx_length_errors;
 	stats->rx_crc_errors	= netdev->stats.rx_crc_errors;
 	stats->rx_missed_errors	= netdev->stats.rx_missed_errors;
-	return stats;
 }
 
 #ifdef CONFIG_IXGBE_DCB
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 1a28349114f8..b06863560c7d 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -3896,8 +3896,8 @@ static void ixgbevf_shutdown(struct pci_dev *pdev)
 	ixgbevf_suspend(pdev, PMSG_SUSPEND);
 }
 
-static struct rtnl_link_stats64 *ixgbevf_get_stats(struct net_device *netdev,
-						struct rtnl_link_stats64 *stats)
+static void ixgbevf_get_stats(struct net_device *netdev,
+			      struct rtnl_link_stats64 *stats)
 {
 	struct ixgbevf_adapter *adapter = netdev_priv(netdev);
 	unsigned int start;
@@ -3930,8 +3930,6 @@ static struct rtnl_link_stats64 *ixgbevf_get_stats(struct net_device *netdev,
 		stats->tx_bytes += bytes;
 		stats->tx_packets += packets;
 	}
-
-	return stats;
 }
 
 #define IXGBEVF_MAX_MAC_HDR_LEN		127
diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index e05e22705cf7..3607d8febbcf 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -652,7 +652,7 @@ static void mvneta_mib_counters_clear(struct mvneta_port *pp)
 }
 
 /* Get System Network Statistics */
-static struct rtnl_link_stats64 *
+static void
 mvneta_get_stats64(struct net_device *dev,
 		   struct rtnl_link_stats64 *stats)
 {
@@ -686,8 +686,6 @@ mvneta_get_stats64(struct net_device *dev,
 	stats->rx_dropped	= dev->stats.rx_dropped;
 
 	stats->tx_dropped	= dev->stats.tx_dropped;
-
-	return stats;
 }
 
 /* Rx descriptors helper methods */
diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
index 4fe430ceb194..69db40e1a4e1 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -5739,7 +5739,7 @@ error:
 	return err;
 }
 
-static struct rtnl_link_stats64 *
+static void
 mvpp2_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	struct mvpp2_port *port = netdev_priv(dev);
@@ -5771,8 +5771,6 @@ mvpp2_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	stats->rx_errors	= dev->stats.rx_errors;
 	stats->rx_dropped	= dev->stats.rx_dropped;
 	stats->tx_dropped	= dev->stats.tx_dropped;
-
-	return stats;
 }
 
 static int mvpp2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c
index b60ad0e56a9f..18d6336fa162 100644
--- a/drivers/net/ethernet/marvell/sky2.c
+++ b/drivers/net/ethernet/marvell/sky2.c
@@ -3888,8 +3888,8 @@ static void sky2_set_multicast(struct net_device *dev)
 	gma_write16(hw, port, GM_RX_CTRL, reg);
 }
 
-static struct rtnl_link_stats64 *sky2_get_stats(struct net_device *dev,
-						struct rtnl_link_stats64 *stats)
+static void sky2_get_stats(struct net_device *dev,
+			   struct rtnl_link_stats64 *stats)
 {
 	struct sky2_port *sky2 = netdev_priv(dev);
 	struct sky2_hw *hw = sky2->hw;
@@ -3929,8 +3929,6 @@ static struct rtnl_link_stats64 *sky2_get_stats(struct net_device *dev,
 	stats->rx_dropped = dev->stats.rx_dropped;
 	stats->rx_fifo_errors = dev->stats.rx_fifo_errors;
 	stats->tx_fifo_errors = dev->stats.tx_fifo_errors;
-
-	return stats;
 }
 
 /* Can have one global because blinking is controlled by
diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index 3dd87889e67e..25ae0c5bce3a 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -462,8 +462,8 @@ static void mtk_stats_update(struct mtk_eth *eth)
 	}
 }
 
-static struct rtnl_link_stats64 *mtk_get_stats64(struct net_device *dev,
-					struct rtnl_link_stats64 *storage)
+static void mtk_get_stats64(struct net_device *dev,
+			    struct rtnl_link_stats64 *storage)
 {
 	struct mtk_mac *mac = netdev_priv(dev);
 	struct mtk_hw_stats *hw_stats = mac->hw_stats;
@@ -494,8 +494,6 @@ static struct rtnl_link_stats64 *mtk_get_stats64(struct net_device *dev,
 	storage->tx_errors = dev->stats.tx_errors;
 	storage->rx_dropped = dev->stats.rx_dropped;
 	storage->tx_dropped = dev->stats.tx_dropped;
-
-	return storage;
 }
 
 static inline int mtk_max_frag_size(int mtu)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index edbe200ac2fa..06ef23f040a4 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -1321,7 +1321,7 @@ static void mlx4_en_tx_timeout(struct net_device *dev)
 }
 
 
-static struct rtnl_link_stats64 *
+static void
 mlx4_en_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
@@ -1330,8 +1330,6 @@ mlx4_en_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	mlx4_en_fold_software_stats(dev);
 	netdev_stats_to_stats64(stats, &dev->stats);
 	spin_unlock_bh(&priv->stats_lock);
-
-	return stats;
 }
 
 static void mlx4_en_set_default_moderation(struct mlx4_en_priv *priv)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 88dd731bb8cb..60e5670452a1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2686,7 +2686,7 @@ mqprio:
 	return mlx5e_setup_tc(dev, tc->tc);
 }
 
-static struct rtnl_link_stats64 *
+static void
 mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
@@ -2729,7 +2729,6 @@ mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	stats->multicast =
 		VPORT_COUNTER_GET(vstats, received_eth_multicast.packets);
 
-	return stats;
 }
 
 static void mlx5e_set_rx_mode(struct net_device *dev)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 850378893b25..2c864574a9d5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -374,13 +374,12 @@ int mlx5e_get_offload_stats(int attr_id, const struct net_device *dev,
 	return -EINVAL;
 }
 
-static struct rtnl_link_stats64 *
+static void
 mlx5e_rep_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
 
 	memcpy(stats, &priv->stats.vf_vport, sizeof(*stats));
-	return stats;
 }
 
 static const struct switchdev_ops mlx5e_rep_switchdev_ops = {
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index d768c7b6c6d6..46c53a042e6b 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -947,15 +947,13 @@ out:
 /* Return the stats from a cache that is updated periodically,
  * as this function might get called in an atomic context.
  */
-static struct rtnl_link_stats64 *
+static void
 mlxsw_sp_port_get_stats64(struct net_device *dev,
 			  struct rtnl_link_stats64 *stats)
 {
 	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
 
 	memcpy(stats, mlxsw_sp_port->hw_stats.cache, sizeof(*stats));
-
-	return stats;
 }
 
 int mlxsw_sp_port_vlan_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid_begin,
diff --git a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
index 150ccf5192a9..696d40612d28 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
@@ -381,7 +381,7 @@ static int mlxsw_sx_port_change_mtu(struct net_device *dev, int mtu)
 	return 0;
 }
 
-static struct rtnl_link_stats64 *
+static void
 mlxsw_sx_port_get_stats64(struct net_device *dev,
 			  struct rtnl_link_stats64 *stats)
 {
@@ -410,7 +410,6 @@ mlxsw_sx_port_get_stats64(struct net_device *dev,
 		tx_dropped	+= p->tx_dropped;
 	}
 	stats->tx_dropped	= tx_dropped;
-	return stats;
 }
 
 static int mlxsw_sx_port_get_phys_port_name(struct net_device *dev, char *name,
diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
index e506ca876d0d..db297cfce6f4 100644
--- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
+++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
@@ -378,8 +378,8 @@ static inline void put_be32(__be32 val, __be32 __iomem * p)
 	__raw_writel((__force __u32) val, (__force void __iomem *)p);
 }
 
-static struct rtnl_link_stats64 *myri10ge_get_stats(struct net_device *dev,
-						    struct rtnl_link_stats64 *stats);
+static void myri10ge_get_stats(struct net_device *dev,
+			       struct rtnl_link_stats64 *stats);
 
 static void set_fw_name(struct myri10ge_priv *mgp, char *name, bool allocated)
 {
@@ -3119,8 +3119,8 @@ drop:
 	return NETDEV_TX_OK;
 }
 
-static struct rtnl_link_stats64 *myri10ge_get_stats(struct net_device *dev,
-						    struct rtnl_link_stats64 *stats)
+static void myri10ge_get_stats(struct net_device *dev,
+			       struct rtnl_link_stats64 *stats)
 {
 	const struct myri10ge_priv *mgp = netdev_priv(dev);
 	const struct myri10ge_slice_netstats *slice_stats;
@@ -3135,7 +3135,6 @@ static struct rtnl_link_stats64 *myri10ge_get_stats(struct net_device *dev,
 		stats->rx_dropped += slice_stats->rx_dropped;
 		stats->tx_dropped += slice_stats->tx_dropped;
 	}
-	return stats;
 }
 
 static void myri10ge_set_multicast_list(struct net_device *dev)
diff --git a/drivers/net/ethernet/neterion/vxge/vxge-main.c b/drivers/net/ethernet/neterion/vxge/vxge-main.c
index e07b936f64ec..f364502229db 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-main.c
+++ b/drivers/net/ethernet/neterion/vxge/vxge-main.c
@@ -3111,7 +3111,7 @@ static int vxge_change_mtu(struct net_device *dev, int new_mtu)
  * @stats: pointer to struct rtnl_link_stats64
  *
  */
-static struct rtnl_link_stats64 *
+static void
 vxge_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *net_stats)
 {
 	struct vxgedev *vdev = netdev_priv(dev);
@@ -3150,8 +3150,6 @@ vxge_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *net_stats)
 		net_stats->tx_bytes += bytes;
 		net_stats->tx_errors += txstats->tx_errors;
 	}
-
-	return net_stats;
 }
 
 static enum vxge_hw_status vxge_timestamp_config(struct __vxge_hw_device *devh)
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index e8d448109e03..67afd95ffb93 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -2638,8 +2638,8 @@ static int nfp_net_change_mtu(struct net_device *netdev, int new_mtu)
 	return nfp_net_ring_reconfig(nn, &nn->xdp_prog, &rx, NULL);
 }
 
-static struct rtnl_link_stats64 *nfp_net_stat64(struct net_device *netdev,
-						struct rtnl_link_stats64 *stats)
+static void nfp_net_stat64(struct net_device *netdev,
+			   struct rtnl_link_stats64 *stats)
 {
 	struct nfp_net *nn = netdev_priv(netdev);
 	int r;
@@ -2669,8 +2669,6 @@ static struct rtnl_link_stats64 *nfp_net_stat64(struct net_device *netdev,
 		stats->tx_bytes += data[1];
 		stats->tx_errors += data[2];
 	}
-
-	return stats;
 }
 
 static bool nfp_net_ebpf_capable(struct nfp_net *nn)
diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
index 3913f07279d2..dfc2c8149d22 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -1733,7 +1733,7 @@ static void nv_update_stats(struct net_device *dev)
  * Called with read_lock(&dev_base_lock) held for read -
  * only synchronized against unregister_netdevice.
  */
-static struct rtnl_link_stats64*
+static void
 nv_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *storage)
 	__acquires(&netdev_priv(dev)->hwstats_lock)
 	__releases(&netdev_priv(dev)->hwstats_lock)
@@ -1793,8 +1793,6 @@ nv_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *storage)
 
 		spin_unlock_bh(&np->hwstats_lock);
 	}
-
-	return storage;
 }
 
 /*
diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
index 561fb94c7267..86fb9d3df700 100644
--- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
+++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
@@ -90,8 +90,8 @@ static irqreturn_t netxen_msix_intr(int irq, void *data);
 
 static void netxen_free_ip_list(struct netxen_adapter *, bool);
 static void netxen_restore_indev_addr(struct net_device *dev, unsigned long);
-static struct rtnl_link_stats64 *netxen_nic_get_stats(struct net_device *dev,
-						      struct rtnl_link_stats64 *stats);
+static void netxen_nic_get_stats(struct net_device *dev,
+				 struct rtnl_link_stats64 *stats);
 static int netxen_nic_set_mac(struct net_device *netdev, void *p);
 
 /*  PCI Device ID Table  */
@@ -2302,8 +2302,8 @@ request_reset:
 	clear_bit(__NX_RESETTING, &adapter->state);
 }
 
-static struct rtnl_link_stats64 *netxen_nic_get_stats(struct net_device *netdev,
-						      struct rtnl_link_stats64 *stats)
+static void netxen_nic_get_stats(struct net_device *netdev,
+				 struct rtnl_link_stats64 *stats)
 {
 	struct netxen_adapter *adapter = netdev_priv(netdev);
 
@@ -2313,8 +2313,6 @@ static struct rtnl_link_stats64 *netxen_nic_get_stats(struct net_device *netdev,
 	stats->tx_bytes = adapter->stats.txbytes;
 	stats->rx_dropped = adapter->stats.rxdropped;
 	stats->tx_dropped = adapter->stats.txdropped;
-
-	return stats;
 }
 
 static irqreturn_t netxen_intr(int irq, void *data)
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index b58509feecd5..40a76a1d5973 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -398,9 +398,8 @@ void qede_fill_by_demand_stats(struct qede_dev *edev)
 	edev->stats.tx_mac_ctrl_frames = stats.tx_mac_ctrl_frames;
 }
 
-static
-struct rtnl_link_stats64 *qede_get_stats64(struct net_device *dev,
-					   struct rtnl_link_stats64 *stats)
+static void qede_get_stats64(struct net_device *dev,
+			     struct rtnl_link_stats64 *stats)
 {
 	struct qede_dev *edev = netdev_priv(dev);
 
@@ -430,8 +429,6 @@ struct rtnl_link_stats64 *qede_get_stats64(struct net_device *dev,
 	stats->collisions = edev->stats.tx_total_collisions;
 	stats->rx_crc_errors = edev->stats.rx_crc_errors;
 	stats->rx_frame_errors = edev->stats.rx_align_errors;
-
-	return stats;
 }
 
 #ifdef CONFIG_QED_SRIOV
diff --git a/drivers/net/ethernet/qualcomm/emac/emac.c b/drivers/net/ethernet/qualcomm/emac/emac.c
index 422289c232bc..40ebe010b06f 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac.c
@@ -312,8 +312,8 @@ static int emac_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd)
 }
 
 /* Provide network statistics info for the interface */
-static struct rtnl_link_stats64 *emac_get_stats64(struct net_device *netdev,
-						  struct rtnl_link_stats64 *net_stats)
+static void emac_get_stats64(struct net_device *netdev,
+			     struct rtnl_link_stats64 *net_stats)
 {
 	struct emac_adapter *adpt = netdev_priv(netdev);
 	unsigned int addr = REG_MAC_RX_STATUS_BIN;
@@ -377,8 +377,6 @@ static struct rtnl_link_stats64 *emac_get_stats64(struct net_device *netdev,
 	net_stats->tx_window_errors = stats->tx_late_col;
 
 	spin_unlock(&stats->lock);
-
-	return net_stats;
 }
 
 static const struct net_device_ops emac_netdev_ops = {
diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c
index 9bc047ac883b..5ad59c6d29a4 100644
--- a/drivers/net/ethernet/realtek/8139too.c
+++ b/drivers/net/ethernet/realtek/8139too.c
@@ -653,9 +653,8 @@ static int rtl8139_poll(struct napi_struct *napi, int budget);
 static irqreturn_t rtl8139_interrupt (int irq, void *dev_instance);
 static int rtl8139_close (struct net_device *dev);
 static int netdev_ioctl (struct net_device *dev, struct ifreq *rq, int cmd);
-static struct rtnl_link_stats64 *rtl8139_get_stats64(struct net_device *dev,
-						    struct rtnl_link_stats64
-						    *stats);
+static void rtl8139_get_stats64(struct net_device *dev,
+				struct rtnl_link_stats64 *stats);
 static void rtl8139_set_rx_mode (struct net_device *dev);
 static void __set_rx_mode (struct net_device *dev);
 static void rtl8139_hw_start (struct net_device *dev);
@@ -2516,7 +2515,7 @@ static int netdev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 }
 
 
-static struct rtnl_link_stats64 *
+static void
 rtl8139_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	struct rtl8139_private *tp = netdev_priv(dev);
@@ -2544,8 +2543,6 @@ rtl8139_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 		stats->tx_packets = tp->tx_stats.packets;
 		stats->tx_bytes = tp->tx_stats.bytes;
 	} while (u64_stats_fetch_retry_irq(&tp->tx_stats.syncp, start));
-
-	return stats;
 }
 
 /* Set or clear the multicast filter for this adaptor.
diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 44389c90056a..858f4554de11 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -7755,7 +7755,7 @@ err_pm_runtime_put:
 	goto out;
 }
 
-static struct rtnl_link_stats64 *
+static void
 rtl8169_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	struct rtl8169_private *tp = netdev_priv(dev);
@@ -7809,8 +7809,6 @@ rtl8169_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 		le16_to_cpu(tp->tc_offset.tx_aborted);
 
 	pm_runtime_put_noidle(&pdev->dev);
-
-	return stats;
 }
 
 static void rtl8169_net_suspend(struct net_device *dev)
diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c
index cddcff5a00a7..07074d9bc45d 100644
--- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c
+++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c
@@ -1706,11 +1706,9 @@ static inline u64 sxgbe_get_stat64(void __iomem *ioaddr, int reg_lo, int reg_hi)
  *  This function is a driver entry point whenever ifconfig command gets
  *  executed to see device statistics. Statistics are number of
  *  bytes sent or received, errors occurred etc.
- *  Return value:
- *  This function returns various statistical information of device.
  */
-static struct rtnl_link_stats64 *sxgbe_get_stats64(struct net_device *dev,
-						   struct rtnl_link_stats64 *stats)
+static void sxgbe_get_stats64(struct net_device *dev,
+			      struct rtnl_link_stats64 *stats)
 {
 	struct sxgbe_priv_data *priv = netdev_priv(dev);
 	void __iomem *ioaddr = priv->ioaddr;
@@ -1761,8 +1759,6 @@ static struct rtnl_link_stats64 *sxgbe_get_stats64(struct net_device *dev,
 						 SXGBE_MMC_TXUFLWHI_GBCNT_REG);
 	writel(0, ioaddr + SXGBE_MMC_CTL_REG);
 	spin_unlock(&priv->stats_lock);
-
-	return stats;
 }
 
 /*  sxgbe_set_features - entry point to set offload features of the device.
diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index bbbed2e84de8..ebeecb8fed45 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -2219,16 +2219,14 @@ int efx_net_stop(struct net_device *net_dev)
 }
 
 /* Context: process, dev_base_lock or RTNL held, non-blocking. */
-static struct rtnl_link_stats64 *efx_net_stats(struct net_device *net_dev,
-					       struct rtnl_link_stats64 *stats)
+static void efx_net_stats(struct net_device *net_dev,
+			  struct rtnl_link_stats64 *stats)
 {
 	struct efx_nic *efx = netdev_priv(net_dev);
 
 	spin_lock_bh(&efx->stats_lock);
 	efx->type->update_stats(efx, NULL, stats);
 	spin_unlock_bh(&efx->stats_lock);
-
-	return stats;
 }
 
 /* Context: netif_tx_lock held, BHs disabled. */
diff --git a/drivers/net/ethernet/sfc/falcon/efx.c b/drivers/net/ethernet/sfc/falcon/efx.c
index ec3ac0e45cc9..8cfbe01e1ddf 100644
--- a/drivers/net/ethernet/sfc/falcon/efx.c
+++ b/drivers/net/ethernet/sfc/falcon/efx.c
@@ -2158,16 +2158,14 @@ int ef4_net_stop(struct net_device *net_dev)
 }
 
 /* Context: process, dev_base_lock or RTNL held, non-blocking. */
-static struct rtnl_link_stats64 *ef4_net_stats(struct net_device *net_dev,
-					       struct rtnl_link_stats64 *stats)
+static void ef4_net_stats(struct net_device *net_dev,
+			  struct rtnl_link_stats64 *stats)
 {
 	struct ef4_nic *efx = netdev_priv(net_dev);
 
 	spin_lock_bh(&efx->stats_lock);
 	efx->type->update_stats(efx, NULL, stats);
 	spin_unlock_bh(&efx->stats_lock);
-
-	return stats;
 }
 
 /* Context: netif_tx_lock held, BHs disabled. */
diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c
index f90d1af6d390..e557a3290a25 100644
--- a/drivers/net/ethernet/sun/niu.c
+++ b/drivers/net/ethernet/sun/niu.c
@@ -6294,8 +6294,8 @@ no_rings:
 	stats->tx_errors = errors;
 }
 
-static struct rtnl_link_stats64 *niu_get_stats(struct net_device *dev,
-					       struct rtnl_link_stats64 *stats)
+static void niu_get_stats(struct net_device *dev,
+			  struct rtnl_link_stats64 *stats)
 {
 	struct niu *np = netdev_priv(dev);
 
@@ -6303,8 +6303,6 @@ static struct rtnl_link_stats64 *niu_get_stats(struct net_device *dev,
 		niu_get_rx_stats(np, stats);
 		niu_get_tx_stats(np, stats);
 	}
-
-	return stats;
 }
 
 static void niu_load_hash_xmac(struct niu *np, u16 *hash)
diff --git a/drivers/net/ethernet/synopsys/dwc_eth_qos.c b/drivers/net/ethernet/synopsys/dwc_eth_qos.c
index 09f5a67da35e..467dcc53f5e1 100644
--- a/drivers/net/ethernet/synopsys/dwc_eth_qos.c
+++ b/drivers/net/ethernet/synopsys/dwc_eth_qos.c
@@ -2490,7 +2490,7 @@ static void dwceqos_read_mmc_counters(struct net_local *lp, u32 rx_mask,
 			dwceqos_read(lp, DWC_MMC_RXPACKETCOUNT_GB);
 }
 
-static struct rtnl_link_stats64*
+static void
 dwceqos_get_stats64(struct net_device *ndev, struct rtnl_link_stats64 *s)
 {
 	unsigned long flags;
@@ -2522,8 +2522,6 @@ dwceqos_get_stats64(struct net_device *ndev, struct rtnl_link_stats64 *s)
 	else
 		s->tx_errors = hwstats->txunderflowerror +
 			hwstats->txcarriererror;
-
-	return s;
 }
 
 static void
diff --git a/drivers/net/ethernet/tile/tilepro.c b/drivers/net/ethernet/tile/tilepro.c
index 0a3b7dafa3ba..30cfea62a356 100644
--- a/drivers/net/ethernet/tile/tilepro.c
+++ b/drivers/net/ethernet/tile/tilepro.c
@@ -2047,8 +2047,8 @@ static int tile_net_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
  *
  * Returns the address of the device statistics structure.
  */
-static struct rtnl_link_stats64 *tile_net_get_stats64(struct net_device *dev,
-		struct rtnl_link_stats64 *stats)
+static void tile_net_get_stats64(struct net_device *dev,
+				 struct rtnl_link_stats64 *stats)
 {
 	struct tile_net_priv *priv = netdev_priv(dev);
 	u64 rx_packets = 0, tx_packets = 0;
diff --git a/drivers/net/ethernet/via/via-rhine.c b/drivers/net/ethernet/via/via-rhine.c
index 0a6c4e804eed..453a1fad560c 100644
--- a/drivers/net/ethernet/via/via-rhine.c
+++ b/drivers/net/ethernet/via/via-rhine.c
@@ -513,8 +513,8 @@ static irqreturn_t rhine_interrupt(int irq, void *dev_instance);
 static void rhine_tx(struct net_device *dev);
 static int rhine_rx(struct net_device *dev, int limit);
 static void rhine_set_rx_mode(struct net_device *dev);
-static struct rtnl_link_stats64 *rhine_get_stats64(struct net_device *dev,
-	       struct rtnl_link_stats64 *stats);
+static void rhine_get_stats64(struct net_device *dev,
+			      struct rtnl_link_stats64 *stats);
 static int netdev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
 static const struct ethtool_ops netdev_ethtool_ops;
 static int  rhine_close(struct net_device *dev);
@@ -2221,7 +2221,7 @@ out_unlock:
 	mutex_unlock(&rp->task_lock);
 }
 
-static struct rtnl_link_stats64 *
+static void
 rhine_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	struct rhine_private *rp = netdev_priv(dev);
@@ -2244,8 +2244,6 @@ rhine_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 		stats->tx_packets = rp->tx_stats.packets;
 		stats->tx_bytes = rp->tx_stats.bytes;
 	} while (u64_stats_fetch_retry_irq(&rp->tx_stats.syncp, start));
-
-	return stats;
 }
 
 static void rhine_set_rx_mode(struct net_device *dev)
diff --git a/drivers/net/fjes/fjes_main.c b/drivers/net/fjes/fjes_main.c
index b77e4ecf3cf2..5028001429c7 100644
--- a/drivers/net/fjes/fjes_main.c
+++ b/drivers/net/fjes/fjes_main.c
@@ -57,8 +57,7 @@ static void fjes_raise_intr_rxdata_task(struct work_struct *);
 static void fjes_tx_stall_task(struct work_struct *);
 static void fjes_force_close_task(struct work_struct *);
 static irqreturn_t fjes_intr(int, void*);
-static struct rtnl_link_stats64 *
-fjes_get_stats64(struct net_device *, struct rtnl_link_stats64 *);
+static void fjes_get_stats64(struct net_device *, struct rtnl_link_stats64 *);
 static int fjes_change_mtu(struct net_device *, int);
 static int fjes_vlan_rx_add_vid(struct net_device *, __be16 proto, u16);
 static int fjes_vlan_rx_kill_vid(struct net_device *, __be16 proto, u16);
@@ -782,14 +781,12 @@ static void fjes_tx_retry(struct net_device *netdev)
 	netif_tx_wake_queue(queue);
 }
 
-static struct rtnl_link_stats64 *
+static void
 fjes_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *stats)
 {
 	struct fjes_adapter *adapter = netdev_priv(netdev);
 
 	memcpy(stats, &adapter->stats64, sizeof(struct rtnl_link_stats64));
-
-	return stats;
 }
 
 static int fjes_change_mtu(struct net_device *netdev, int new_mtu)
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index c9414c054852..05374fce7da4 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -908,8 +908,8 @@ out:
 	return ret;
 }
 
-static struct rtnl_link_stats64 *netvsc_get_stats64(struct net_device *net,
-						    struct rtnl_link_stats64 *t)
+static void netvsc_get_stats64(struct net_device *net,
+			       struct rtnl_link_stats64 *t)
 {
 	struct net_device_context *ndev_ctx = netdev_priv(net);
 	int cpu;
@@ -947,8 +947,6 @@ static struct rtnl_link_stats64 *netvsc_get_stats64(struct net_device *net,
 
 	t->rx_dropped	= net->stats.rx_dropped;
 	t->rx_errors	= net->stats.rx_errors;
-
-	return t;
 }
 
 static int netvsc_set_mac_addr(struct net_device *ndev, void *p)
diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 66c0eeafcb5d..082534e187fc 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -129,8 +129,8 @@ resched:
 
 }
 
-static struct rtnl_link_stats64 *ifb_stats64(struct net_device *dev,
-					     struct rtnl_link_stats64 *stats)
+static void ifb_stats64(struct net_device *dev,
+			struct rtnl_link_stats64 *stats)
 {
 	struct ifb_dev_private *dp = netdev_priv(dev);
 	struct ifb_q_private *txp = dp->tx_private;
@@ -157,8 +157,6 @@ static struct rtnl_link_stats64 *ifb_stats64(struct net_device *dev,
 	}
 	stats->rx_dropped = dev->stats.rx_dropped;
 	stats->tx_dropped = dev->stats.tx_dropped;
-
-	return stats;
 }
 
 static int ifb_dev_init(struct net_device *dev)
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index ce7ca6a5aa8a..1cdb8c5ec403 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -303,8 +303,8 @@ static void ipvlan_set_multicast_mac_filter(struct net_device *dev)
 	dev_mc_sync(ipvlan->phy_dev, dev);
 }
 
-static struct rtnl_link_stats64 *ipvlan_get_stats64(struct net_device *dev,
-						    struct rtnl_link_stats64 *s)
+static void ipvlan_get_stats64(struct net_device *dev,
+			       struct rtnl_link_stats64 *s)
 {
 	struct ipvl_dev *ipvlan = netdev_priv(dev);
 
@@ -341,7 +341,6 @@ static struct rtnl_link_stats64 *ipvlan_get_stats64(struct net_device *dev,
 		s->rx_dropped = rx_errs;
 		s->tx_dropped = tx_drps;
 	}
-	return s;
 }
 
 static int ipvlan_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 1e05b7c2d157..30a493936e63 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -97,8 +97,8 @@ static netdev_tx_t loopback_xmit(struct sk_buff *skb,
 	return NETDEV_TX_OK;
 }
 
-static struct rtnl_link_stats64 *loopback_get_stats64(struct net_device *dev,
-						      struct rtnl_link_stats64 *stats)
+static void loopback_get_stats64(struct net_device *dev,
+				 struct rtnl_link_stats64 *stats)
 {
 	u64 bytes = 0;
 	u64 packets = 0;
@@ -122,7 +122,6 @@ static struct rtnl_link_stats64 *loopback_get_stats64(struct net_device *dev,
 	stats->tx_packets = packets;
 	stats->rx_bytes   = bytes;
 	stats->tx_bytes   = bytes;
-	return stats;
 }
 
 static u32 always_on(struct net_device *dev)
diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index f83cf6696820..778a77303c49 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -2888,13 +2888,13 @@ static int macsec_change_mtu(struct net_device *dev, int new_mtu)
 	return 0;
 }
 
-static struct rtnl_link_stats64 *macsec_get_stats64(struct net_device *dev,
-						    struct rtnl_link_stats64 *s)
+static void macsec_get_stats64(struct net_device *dev,
+			       struct rtnl_link_stats64 *s)
 {
 	int cpu;
 
 	if (!dev->tstats)
-		return s;
+		return;
 
 	for_each_possible_cpu(cpu) {
 		struct pcpu_sw_netstats *stats;
@@ -2918,8 +2918,6 @@ static struct rtnl_link_stats64 *macsec_get_stats64(struct net_device *dev,
 
 	s->rx_dropped = dev->stats.rx_dropped;
 	s->tx_dropped = dev->stats.tx_dropped;
-
-	return s;
 }
 
 static int macsec_get_iflink(const struct net_device *dev)
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 20b3fdf282c5..440ab3d8adf7 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -855,8 +855,8 @@ static void macvlan_uninit(struct net_device *dev)
 		macvlan_port_destroy(port->dev);
 }
 
-static struct rtnl_link_stats64 *macvlan_dev_get_stats64(struct net_device *dev,
-							 struct rtnl_link_stats64 *stats)
+static void macvlan_dev_get_stats64(struct net_device *dev,
+				    struct rtnl_link_stats64 *stats)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
 
@@ -893,7 +893,6 @@ static struct rtnl_link_stats64 *macvlan_dev_get_stats64(struct net_device *dev,
 		stats->rx_dropped	= rx_errors;
 		stats->tx_dropped	= tx_dropped;
 	}
-	return stats;
 }
 
 static int macvlan_vlan_rx_add_vid(struct net_device *dev,
diff --git a/drivers/net/nlmon.c b/drivers/net/nlmon.c
index 2de7faee9b19..b91603835d26 100644
--- a/drivers/net/nlmon.c
+++ b/drivers/net/nlmon.c
@@ -58,7 +58,7 @@ static int nlmon_close(struct net_device *dev)
 	return netlink_remove_tap(&nlmon->nt);
 }
 
-static struct rtnl_link_stats64 *
+static void
 nlmon_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	int i;
@@ -86,8 +86,6 @@ nlmon_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 
 	stats->rx_bytes = bytes;
 	stats->tx_bytes = 0;
-
-	return stats;
 }
 
 static u32 always_on(struct net_device *dev)
diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
index 3d3b1f4339ef..a411b43a69eb 100644
--- a/drivers/net/ppp/ppp_generic.c
+++ b/drivers/net/ppp/ppp_generic.c
@@ -1297,7 +1297,7 @@ ppp_net_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	return err;
 }
 
-static struct rtnl_link_stats64*
+static void
 ppp_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats64)
 {
 	struct ppp *ppp = netdev_priv(dev);
@@ -1317,8 +1317,6 @@ ppp_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats64)
 	stats64->rx_dropped       = dev->stats.rx_dropped;
 	stats64->tx_dropped       = dev->stats.tx_dropped;
 	stats64->rx_length_errors = dev->stats.rx_length_errors;
-
-	return stats64;
 }
 
 static int ppp_dev_init(struct net_device *dev)
diff --git a/drivers/net/slip/slip.c b/drivers/net/slip/slip.c
index 9841f3dc0682..08db4d687533 100644
--- a/drivers/net/slip/slip.c
+++ b/drivers/net/slip/slip.c
@@ -566,7 +566,7 @@ static int sl_change_mtu(struct net_device *dev, int new_mtu)
 
 /* Netdevice get statistics request */
 
-static struct rtnl_link_stats64 *
+static void
 sl_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	struct net_device_stats *devstats = &dev->stats;
@@ -597,7 +597,6 @@ sl_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 		stats->collisions     += comp->sls_o_misses;
 	}
 #endif
-	return stats;
 }
 
 /* Netdevice register callback */
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index bdc58567d10e..a3711769544b 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1798,7 +1798,7 @@ unwind:
 	return err;
 }
 
-static struct rtnl_link_stats64 *
+static void
 team_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	struct team *team = netdev_priv(dev);
@@ -1835,7 +1835,6 @@ team_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	stats->rx_dropped	= rx_dropped;
 	stats->tx_dropped	= tx_dropped;
 	stats->rx_nohandler	= rx_nohandler;
-	return stats;
 }
 
 static int team_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index cd8e02c94be0..8c1d3bd6b4d0 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -953,7 +953,7 @@ static void tun_set_headroom(struct net_device *dev, int new_hr)
 	tun->align = new_hr;
 }
 
-static struct rtnl_link_stats64 *
+static void
 tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	u32 rx_dropped = 0, tx_dropped = 0, rx_frame_errors = 0;
@@ -987,7 +987,6 @@ tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	stats->rx_dropped  = rx_dropped;
 	stats->rx_frame_errors = rx_frame_errors;
 	stats->tx_dropped = tx_dropped;
-	return stats;
 }
 
 static const struct net_device_ops tun_netdev_ops = {
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 0520952aa096..8c39d6d690e5 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -158,8 +158,8 @@ static u64 veth_stats_one(struct pcpu_vstats *result, struct net_device *dev)
 	return atomic64_read(&priv->dropped);
 }
 
-static struct rtnl_link_stats64 *veth_get_stats64(struct net_device *dev,
-						  struct rtnl_link_stats64 *tot)
+static void veth_get_stats64(struct net_device *dev,
+			     struct rtnl_link_stats64 *tot)
 {
 	struct veth_priv *priv = netdev_priv(dev);
 	struct net_device *peer;
@@ -177,8 +177,6 @@ static struct rtnl_link_stats64 *veth_get_stats64(struct net_device *dev,
 		tot->rx_packets = one.packets;
 	}
 	rcu_read_unlock();
-
-	return tot;
 }
 
 /* fake multicast ability */
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 2cea022e6e6e..37db91d1a0a3 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1272,8 +1272,8 @@ out:
 	return ret;
 }
 
-static struct rtnl_link_stats64 *virtnet_stats(struct net_device *dev,
-					       struct rtnl_link_stats64 *tot)
+static void virtnet_stats(struct net_device *dev,
+			  struct rtnl_link_stats64 *tot)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 	int cpu;
@@ -1306,8 +1306,6 @@ static struct rtnl_link_stats64 *virtnet_stats(struct net_device *dev,
 	tot->rx_dropped = dev->stats.rx_dropped;
 	tot->rx_length_errors = dev->stats.rx_length_errors;
 	tot->rx_frame_errors = dev->stats.rx_frame_errors;
-
-	return tot;
 }
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
diff --git a/drivers/net/vmxnet3/vmxnet3_ethtool.c b/drivers/net/vmxnet3/vmxnet3_ethtool.c
index aabc6ef366b4..f88ffafebfbf 100644
--- a/drivers/net/vmxnet3/vmxnet3_ethtool.c
+++ b/drivers/net/vmxnet3/vmxnet3_ethtool.c
@@ -113,7 +113,7 @@ vmxnet3_global_stats[] = {
 };
 
 
-struct rtnl_link_stats64 *
+void
 vmxnet3_get_stats64(struct net_device *netdev,
 		   struct rtnl_link_stats64 *stats)
 {
@@ -160,8 +160,6 @@ vmxnet3_get_stats64(struct net_device *netdev,
 		stats->rx_dropped += drvRxStats->drop_total;
 		stats->multicast +=  devRxStats->mcastPktsRxOK;
 	}
-
-	return stats;
 }
 
 static int
diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h
index 59e077be8829..ba1c9f93592b 100644
--- a/drivers/net/vmxnet3/vmxnet3_int.h
+++ b/drivers/net/vmxnet3/vmxnet3_int.h
@@ -465,8 +465,8 @@ vmxnet3_create_queues(struct vmxnet3_adapter *adapter,
 
 void vmxnet3_set_ethtool_ops(struct net_device *netdev);
 
-struct rtnl_link_stats64 *
-vmxnet3_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats);
+void vmxnet3_get_stats64(struct net_device *dev,
+			 struct rtnl_link_stats64 *stats);
 
 extern char vmxnet3_driver_name[];
 #endif
diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 23dfb0eac098..895e3e258543 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -77,8 +77,8 @@ static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb)
 	kfree_skb(skb);
 }
 
-static struct rtnl_link_stats64 *vrf_get_stats64(struct net_device *dev,
-						 struct rtnl_link_stats64 *stats)
+static void vrf_get_stats64(struct net_device *dev,
+			    struct rtnl_link_stats64 *stats)
 {
 	int i;
 
@@ -102,7 +102,6 @@ static struct rtnl_link_stats64 *vrf_get_stats64(struct net_device *dev,
 		stats->rx_bytes += rbytes;
 		stats->rx_packets += rpkts;
 	}
-	return stats;
 }
 
 /* Local traffic destined to local address. Reinsert the packet to rx
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index a479cd99911d..40f26b69beb1 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -1073,8 +1073,8 @@ static int xennet_change_mtu(struct net_device *dev, int mtu)
 	return 0;
 }
 
-static struct rtnl_link_stats64 *xennet_get_stats64(struct net_device *dev,
-						    struct rtnl_link_stats64 *tot)
+static void xennet_get_stats64(struct net_device *dev,
+			       struct rtnl_link_stats64 *tot)
 {
 	struct netfront_info *np = netdev_priv(dev);
 	int cpu;
@@ -1105,8 +1105,6 @@ static struct rtnl_link_stats64 *xennet_get_stats64(struct net_device *dev,
 
 	tot->rx_errors  = dev->stats.rx_errors;
 	tot->tx_dropped = dev->stats.tx_dropped;
-
-	return tot;
 }
 
 static void xennet_release_tx_bufs(struct netfront_queue *queue)
diff --git a/drivers/staging/netlogic/xlr_net.c b/drivers/staging/netlogic/xlr_net.c
index fb0928a4fb97..f84069ffa8c6 100644
--- a/drivers/staging/netlogic/xlr_net.c
+++ b/drivers/staging/netlogic/xlr_net.c
@@ -397,14 +397,6 @@ static void xlr_stats(struct net_device *ndev, struct rtnl_link_stats64 *stats)
 			TX_DROP_FRAME_COUNTER);
 }
 
-static struct rtnl_link_stats64 *xlr_get_stats64(struct net_device *ndev,
-						 struct rtnl_link_stats64 *stats
-						 )
-{
-	xlr_stats(ndev, stats);
-	return stats;
-}
-
 static const struct net_device_ops xlr_netdev_ops = {
 	.ndo_open = xlr_net_open,
 	.ndo_stop = xlr_net_stop,
@@ -412,7 +404,7 @@ static const struct net_device_ops xlr_netdev_ops = {
 	.ndo_select_queue = xlr_net_select_queue,
 	.ndo_set_mac_address = xlr_net_set_mac_addr,
 	.ndo_set_rx_mode = xlr_set_rx_mode,
-	.ndo_get_stats64 = xlr_get_stats64,
+	.ndo_get_stats64 = xlr_stats,
 };
 
 /*
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ecd78b3c9aba..b14ad9c139d7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -913,8 +913,8 @@ struct netdev_xdp {
  *	Callback used when the transmitter has not made any progress
  *	for dev->watchdog ticks.
  *
- * struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev,
- *                      struct rtnl_link_stats64 *storage);
+ * void (*ndo_get_stats64)(struct net_device *dev,
+ *                         struct rtnl_link_stats64 *storage);
  * struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);
  *	Called when a user wants to get the network device usage
  *	statistics. Drivers must do one of the following:
@@ -1165,8 +1165,8 @@ struct net_device_ops {
 						   struct neigh_parms *);
 	void			(*ndo_tx_timeout) (struct net_device *dev);
 
-	struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev,
-						     struct rtnl_link_stats64 *storage);
+	void			(*ndo_get_stats64)(struct net_device *dev,
+						   struct rtnl_link_stats64 *storage);
 	bool			(*ndo_has_offload_stats)(const struct net_device *dev, int attr_id);
 	int			(*ndo_get_offload_stats)(int attr_id,
 							 const struct net_device *dev,
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index e893fe43dd13..3d4ca4df1209 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -261,8 +261,8 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd);
 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict);
 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu);
 
-struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
-						struct rtnl_link_stats64 *tot);
+void ip_tunnel_get_stats64(struct net_device *dev,
+			   struct rtnl_link_stats64 *tot);
 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
 				   int link, __be16 flags,
 				   __be32 remote, __be32 local,
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 10da6c588bf8..116455ac3db5 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -671,7 +671,8 @@ static int vlan_ethtool_get_ts_info(struct net_device *dev,
 	return 0;
 }
 
-static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
+static void vlan_dev_get_stats64(struct net_device *dev,
+				 struct rtnl_link_stats64 *stats)
 {
 	struct vlan_pcpu_stats *p;
 	u32 rx_errors = 0, tx_dropped = 0;
@@ -702,8 +703,6 @@ static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, st
 	}
 	stats->rx_errors  = rx_errors;
 	stats->tx_dropped = tx_dropped;
-
-	return stats;
 }
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index ed3b3192fb00..6c46d1b4cdbb 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -153,8 +153,8 @@ static int br_dev_stop(struct net_device *dev)
 	return 0;
 }
 
-static struct rtnl_link_stats64 *br_get_stats64(struct net_device *dev,
-						struct rtnl_link_stats64 *stats)
+static void br_get_stats64(struct net_device *dev,
+			   struct rtnl_link_stats64 *stats)
 {
 	struct net_bridge *br = netdev_priv(dev);
 	struct pcpu_sw_netstats tmp, sum = { 0 };
@@ -178,8 +178,6 @@ static struct rtnl_link_stats64 *br_get_stats64(struct net_device *dev,
 	stats->tx_packets = sum.tx_packets;
 	stats->rx_bytes   = sum.rx_bytes;
 	stats->rx_packets = sum.rx_packets;
-
-	return stats;
 }
 
 static int br_change_mtu(struct net_device *dev, int new_mtu)
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index fed3d29f9eb3..5476110598f7 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -188,8 +188,8 @@ int iptunnel_handle_offloads(struct sk_buff *skb,
 EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
 
 /* Often modified stats are per cpu, other are shared (netdev->stats) */
-struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
-						struct rtnl_link_stats64 *tot)
+void ip_tunnel_get_stats64(struct net_device *dev,
+			   struct rtnl_link_stats64 *tot)
 {
 	int i;
 
@@ -214,8 +214,6 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
 		tot->rx_bytes   += rx_bytes;
 		tot->tx_bytes   += tx_bytes;
 	}
-
-	return tot;
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
 
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index e2c6ae024565..8bf18a5f66e0 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -106,8 +106,8 @@ static int l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 	return NETDEV_TX_OK;
 }
 
-static struct rtnl_link_stats64 *l2tp_eth_get_stats64(struct net_device *dev,
-						      struct rtnl_link_stats64 *stats)
+static void l2tp_eth_get_stats64(struct net_device *dev,
+				 struct rtnl_link_stats64 *stats)
 {
 	struct l2tp_eth *priv = netdev_priv(dev);
 
@@ -117,10 +117,8 @@ static struct rtnl_link_stats64 *l2tp_eth_get_stats64(struct net_device *dev,
 	stats->rx_bytes   = atomic_long_read(&priv->rx_bytes);
 	stats->rx_packets = atomic_long_read(&priv->rx_packets);
 	stats->rx_errors  = atomic_long_read(&priv->rx_errors);
-	return stats;
 }
 
-
 static const struct net_device_ops l2tp_eth_netdev_ops = {
 	.ndo_init		= l2tp_eth_dev_init,
 	.ndo_uninit		= l2tp_eth_dev_uninit,
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 41497b670e2b..77e8a42225f9 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1122,7 +1122,7 @@ static u16 ieee80211_netdev_select_queue(struct net_device *dev,
 	return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb);
 }
 
-static struct rtnl_link_stats64 *
+static void
 ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	int i;
@@ -1147,8 +1147,6 @@ ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 		stats->rx_bytes   += rx_bytes;
 		stats->tx_bytes   += tx_bytes;
 	}
-
-	return stats;
 }
 
 static const struct net_device_ops ieee80211_dataif_ops = {
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index d5d6caecd072..09141a18ee2d 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -97,7 +97,7 @@ static void internal_dev_destructor(struct net_device *dev)
 	free_netdev(dev);
 }
 
-static struct rtnl_link_stats64 *
+static void
 internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
 	int i;
@@ -125,8 +125,6 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
 		stats->tx_bytes         += local_stats.tx_bytes;
 		stats->tx_packets       += local_stats.tx_packets;
 	}
-
-	return stats;
 }
 
 static void internal_set_rx_headroom(struct net_device *dev, int new_hr)
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index b0196366d58d..9fe6b427afed 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -401,8 +401,8 @@ static int teql_master_close(struct net_device *dev)
 	return 0;
 }
 
-static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev,
-						     struct rtnl_link_stats64 *stats)
+static void teql_master_stats64(struct net_device *dev,
+				struct rtnl_link_stats64 *stats)
 {
 	struct teql_master *m = netdev_priv(dev);
 
@@ -410,7 +410,6 @@ static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev,
 	stats->tx_bytes		= m->tx_bytes;
 	stats->tx_errors	= m->tx_errors;
 	stats->tx_dropped	= m->tx_dropped;
-	return stats;
 }
 
 static int teql_master_mtu(struct net_device *dev, int new_mtu)
-- 
cgit v1.2.3


From 8a522bf2d4f788306443d36b26b54f0aedcdfdbe Mon Sep 17 00:00:00 2001
From: Peter Foley <pefoley2@pefoley.com>
Date: Sun, 27 Nov 2016 21:37:20 -0500
Subject: extcon: adc-jack: Fix incompatible pointer type warning

This patch fixes the incompatible warning of extcon-adc-jack.c driver
when calling devm_extcon_dev_allocate().

Signed-off-by: Peter Foley <pefoley2@pefoley.com>
[cw00.choi: Modify the patch title and descritpion]
Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
---
 include/linux/extcon/extcon-adc-jack.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/extcon/extcon-adc-jack.h b/include/linux/extcon/extcon-adc-jack.h
index a0e03b13b449..2aa32075bca1 100644
--- a/include/linux/extcon/extcon-adc-jack.h
+++ b/include/linux/extcon/extcon-adc-jack.h
@@ -59,7 +59,7 @@ struct adc_jack_pdata {
 	const char *name;
 	const char *consumer_channel;
 
-	const enum extcon *cable_names;
+	const unsigned int *cable_names;
 
 	/* The last entry's state should be 0 */
 	struct adc_jack_cond *adc_conditions;
-- 
cgit v1.2.3


From db6228612ce297949621a62e9d2331ee55016778 Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linaro.org>
Date: Wed, 21 Dec 2016 14:10:47 +0800
Subject: extcon: Add documentation for EXTCON_CHG_USB_* and EXTCON_USB_*

Current there is both "EXTCON_USB" and "EXTCON_CHG_USB_SDP" which
both seem to suggest a standard downstream port. But there is no
documentation describing how these relate.

Thus add documentation to describe EXTCON_CHG_USB_SDP should always
appear together with EXTCON_USB, and EXTCON_CHG_USB_ACA would normally
appear with EXTCON_USB_HOST.

Signed-off-by: Baolin Wang <baolin.wang@linaro.org>
Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
---
 include/linux/extcon.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/extcon.h b/include/linux/extcon.h
index b871c0cb1f02..00201230bea5 100644
--- a/include/linux/extcon.h
+++ b/include/linux/extcon.h
@@ -46,7 +46,14 @@
 #define EXTCON_USB		1
 #define EXTCON_USB_HOST		2
 
-/* Charging external connector */
+/*
+ * Charging external connector
+ *
+ * When one SDP charger connector was reported, we should also report
+ * the USB connector, which means EXTCON_CHG_USB_SDP should always
+ * appear together with EXTCON_USB. The same as ACA charger connector,
+ * EXTCON_CHG_USB_ACA would normally appear with EXTCON_USB_HOST.
+ */
 #define EXTCON_CHG_USB_SDP	5	/* Standard Downstream Port */
 #define EXTCON_CHG_USB_DCP	6	/* Dedicated Charging Port */
 #define EXTCON_CHG_USB_CDP	7	/* Charging Downstream Port */
-- 
cgit v1.2.3


From e6cf046543763878614d51e8283abd40d5e5327e Mon Sep 17 00:00:00 2001
From: Chanwoo Choi <cw00.choi@samsung.com>
Date: Mon, 26 Dec 2016 20:37:38 +0900
Subject: extcon: Move defintion of struct extcon_dev to driver/extcon
 directory

This patch moves the 'struct extcon_dev' of extcon subsystem
to driver/extcon/extcon.h header file because the struct extcon_dev have to
be handled by extcon API to guarantee the consistency of strcut extcon_dev.
If external drivers are able to touch the struct extcon_dev directly, it might
cause the critical and unknown problem.

Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
---
 drivers/extcon/devres.c |  2 +-
 drivers/extcon/extcon.c |  3 ++-
 drivers/extcon/extcon.h | 62 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/extcon.h  | 57 +--------------------------------------------
 4 files changed, 66 insertions(+), 58 deletions(-)
 create mode 100644 drivers/extcon/extcon.h

(limited to 'include/linux')

diff --git a/drivers/extcon/devres.c b/drivers/extcon/devres.c
index e686acd1c459..b40eb1805927 100644
--- a/drivers/extcon/devres.c
+++ b/drivers/extcon/devres.c
@@ -14,7 +14,7 @@
  * GNU General Public License for more details.
  */
 
-#include <linux/extcon.h>
+#include "extcon.h"
 
 static int devm_extcon_dev_match(struct device *dev, void *res, void *data)
 {
diff --git a/drivers/extcon/extcon.c b/drivers/extcon/extcon.c
index d0e367959c91..591582b0d2b7 100644
--- a/drivers/extcon/extcon.c
+++ b/drivers/extcon/extcon.c
@@ -30,11 +30,12 @@
 #include <linux/device.h>
 #include <linux/fs.h>
 #include <linux/err.h>
-#include <linux/extcon.h>
 #include <linux/of.h>
 #include <linux/slab.h>
 #include <linux/sysfs.h>
 
+#include "extcon.h"
+
 #define SUPPORTED_CABLE_MAX	32
 #define CABLE_NAME_MAX		30
 
diff --git a/drivers/extcon/extcon.h b/drivers/extcon/extcon.h
new file mode 100644
index 000000000000..993ddccafe11
--- /dev/null
+++ b/drivers/extcon/extcon.h
@@ -0,0 +1,62 @@
+#ifndef __LINUX_EXTCON_INTERNAL_H__
+#define __LINUX_EXTCON_INTERNAL_H__
+
+#include <linux/extcon.h>
+
+/**
+ * struct extcon_dev - An extcon device represents one external connector.
+ * @name:		The name of this extcon device. Parent device name is
+ *			used if NULL.
+ * @supported_cable:	Array of supported cable names ending with EXTCON_NONE.
+ *			If supported_cable is NULL, cable name related APIs
+ *			are disabled.
+ * @mutually_exclusive:	Array of mutually exclusive set of cables that cannot
+ *			be attached simultaneously. The array should be
+ *			ending with NULL or be NULL (no mutually exclusive
+ *			cables). For example, if it is { 0x7, 0x30, 0}, then,
+ *			{0, 1}, {0, 1, 2}, {0, 2}, {1, 2}, or {4, 5} cannot
+ *			be attached simulataneously. {0x7, 0} is equivalent to
+ *			{0x3, 0x6, 0x5, 0}. If it is {0xFFFFFFFF, 0}, there
+ *			can be no simultaneous connections.
+ * @dev:		Device of this extcon.
+ * @state:		Attach/detach state of this extcon. Do not provide at
+ *			register-time.
+ * @nh:			Notifier for the state change events from this extcon
+ * @entry:		To support list of extcon devices so that users can
+ *			search for extcon devices based on the extcon name.
+ * @lock:
+ * @max_supported:	Internal value to store the number of cables.
+ * @extcon_dev_type:	Device_type struct to provide attribute_groups
+ *			customized for each extcon device.
+ * @cables:		Sysfs subdirectories. Each represents one cable.
+ *
+ * In most cases, users only need to provide "User initializing data" of
+ * this struct when registering an extcon. In some exceptional cases,
+ * optional callbacks may be needed. However, the values in "internal data"
+ * are overwritten by register function.
+ */
+struct extcon_dev {
+	/* Optional user initializing data */
+	const char *name;
+	const unsigned int *supported_cable;
+	const u32 *mutually_exclusive;
+
+	/* Internal data. Please do not set. */
+	struct device dev;
+	struct raw_notifier_head *nh;
+	struct list_head entry;
+	int max_supported;
+	spinlock_t lock;	/* could be called by irq handler */
+	u32 state;
+
+	/* /sys/class/extcon/.../cable.n/... */
+	struct device_type extcon_dev_type;
+	struct extcon_cable *cables;
+
+	/* /sys/class/extcon/.../mutually_exclusive/... */
+	struct attribute_group attr_g_muex;
+	struct attribute **attrs_muex;
+	struct device_attribute *d_attrs_muex;
+};
+
+#endif /* __LINUX_EXTCON_INTERNAL_H__ */
diff --git a/include/linux/extcon.h b/include/linux/extcon.h
index 00201230bea5..d57e52443841 100644
--- a/include/linux/extcon.h
+++ b/include/linux/extcon.h
@@ -167,62 +167,7 @@ union extcon_property_value {
 };
 
 struct extcon_cable;
-
-/**
- * struct extcon_dev - An extcon device represents one external connector.
- * @name:		The name of this extcon device. Parent device name is
- *			used if NULL.
- * @supported_cable:	Array of supported cable names ending with EXTCON_NONE.
- *			If supported_cable is NULL, cable name related APIs
- *			are disabled.
- * @mutually_exclusive:	Array of mutually exclusive set of cables that cannot
- *			be attached simultaneously. The array should be
- *			ending with NULL or be NULL (no mutually exclusive
- *			cables). For example, if it is { 0x7, 0x30, 0}, then,
- *			{0, 1}, {0, 1, 2}, {0, 2}, {1, 2}, or {4, 5} cannot
- *			be attached simulataneously. {0x7, 0} is equivalent to
- *			{0x3, 0x6, 0x5, 0}. If it is {0xFFFFFFFF, 0}, there
- *			can be no simultaneous connections.
- * @dev:		Device of this extcon.
- * @state:		Attach/detach state of this extcon. Do not provide at
- *			register-time.
- * @nh:			Notifier for the state change events from this extcon
- * @entry:		To support list of extcon devices so that users can
- *			search for extcon devices based on the extcon name.
- * @lock:
- * @max_supported:	Internal value to store the number of cables.
- * @extcon_dev_type:	Device_type struct to provide attribute_groups
- *			customized for each extcon device.
- * @cables:		Sysfs subdirectories. Each represents one cable.
- *
- * In most cases, users only need to provide "User initializing data" of
- * this struct when registering an extcon. In some exceptional cases,
- * optional callbacks may be needed. However, the values in "internal data"
- * are overwritten by register function.
- */
-struct extcon_dev {
-	/* Optional user initializing data */
-	const char *name;
-	const unsigned int *supported_cable;
-	const u32 *mutually_exclusive;
-
-	/* Internal data. Please do not set. */
-	struct device dev;
-	struct raw_notifier_head *nh;
-	struct list_head entry;
-	int max_supported;
-	spinlock_t lock;	/* could be called by irq handler */
-	u32 state;
-
-	/* /sys/class/extcon/.../cable.n/... */
-	struct device_type extcon_dev_type;
-	struct extcon_cable *cables;
-
-	/* /sys/class/extcon/.../mutually_exclusive/... */
-	struct attribute_group attr_g_muex;
-	struct attribute **attrs_muex;
-	struct device_attribute *d_attrs_muex;
-};
+struct extcon_dev;
 
 #if IS_ENABLED(CONFIG_EXTCON)
 
-- 
cgit v1.2.3


From 62a37443e93bbae74410cb72aa9d7e15a1da0b98 Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linaro.org>
Date: Tue, 3 Jan 2017 13:50:54 +0800
Subject: extcon: Add documentation for EXTCON_CHG_USB_SLOW/FAST

Currently there are no documentation for EXTCON_CHG_USB_SLOW/FAST
charger connector. These names don't mean much and no guide to tell
users how to use it, thus try to add documentation to make them clear.

Suggested-by: NeilBrown <neilb@suse.com>
Signed-off-by: Baolin Wang <baolin.wang@linaro.org>
[cw00.choi: Use the 'connector' expression instead of 'cable']
Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
---
 include/linux/extcon.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/extcon.h b/include/linux/extcon.h
index d57e52443841..242157cad25d 100644
--- a/include/linux/extcon.h
+++ b/include/linux/extcon.h
@@ -53,6 +53,10 @@
  * the USB connector, which means EXTCON_CHG_USB_SDP should always
  * appear together with EXTCON_USB. The same as ACA charger connector,
  * EXTCON_CHG_USB_ACA would normally appear with EXTCON_USB_HOST.
+ *
+ * The EXTCON_CHG_USB_SLOW connector can provide at least 500mA of
+ * current at 5V. The EXTCON_CHG_USB_FAST connector can provide at
+ * least 1A of current at 5V.
  */
 #define EXTCON_CHG_USB_SDP	5	/* Standard Downstream Port */
 #define EXTCON_CHG_USB_DCP	6	/* Dedicated Charging Port */
-- 
cgit v1.2.3


From 3c5f0e076833c407cca372c663d47499ae4dab45 Mon Sep 17 00:00:00 2001
From: Chanwoo Choi <cw00.choi@samsung.com>
Date: Mon, 2 Jan 2017 13:03:03 +0900
Subject: extcon: Add new EXTCON_CHG_USB_PD type for USB Power Delivery

This patch adds the new EXTCON_CHG_USB_PD for USB PD (Power Delivery)[1].
The USB Power Delivery specification specifies that USB cable provides
the increased power more than 7.5W to device with larger power demand.
The EXTCON_CHG_USB_PD has the EXTCON_TYPE_CHG and EXTCON_TYPE_USB type.

[1] https://en.wikipedia.org/wiki/USB#PD

Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
---
 drivers/extcon/extcon.c | 5 +++++
 include/linux/extcon.h  | 1 +
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/extcon/extcon.c b/drivers/extcon/extcon.c
index 591582b0d2b7..768e36769870 100644
--- a/drivers/extcon/extcon.c
+++ b/drivers/extcon/extcon.c
@@ -99,6 +99,11 @@ struct __extcon_info {
 		.id = EXTCON_CHG_WPT,
 		.name = "WPT",
 	},
+	[EXTCON_CHG_USB_PD] = {
+		.type = EXTCON_TYPE_CHG | EXTCON_TYPE_USB,
+		.id = EXTCON_CHG_USB_PD,
+		.name = "PD",
+	},
 
 	/* Jack external connector */
 	[EXTCON_JACK_MICROPHONE] = {
diff --git a/include/linux/extcon.h b/include/linux/extcon.h
index 242157cad25d..7010fb01a81a 100644
--- a/include/linux/extcon.h
+++ b/include/linux/extcon.h
@@ -65,6 +65,7 @@
 #define EXTCON_CHG_USB_FAST	9
 #define EXTCON_CHG_USB_SLOW	10
 #define EXTCON_CHG_WPT		11	/* Wireless Power Transfer */
+#define EXTCON_CHG_USB_PD	12	/* USB Power Delivery */
 
 /* Jack external connector */
 #define EXTCON_JACK_MICROPHONE	20
-- 
cgit v1.2.3


From e7246e122aaa99ebbb8ad7da80f35a20577bd8af Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Sat, 7 Jan 2017 17:06:35 -0500
Subject: net-tc: extract skip classify bit from tc_verd

Packets sent by the IFB device skip subsequent tc classification.
A single bit governs this state. Move it out of tc_verd in
anticipation of removing that __u16 completely.

The new bitfield tc_skip_classify temporarily uses one bit of a
hole, until tc_verd is removed completely in a follow-up patch.

Remove the bit hole comment. It could be 2, 3, 4 or 5 bits long.
With that many options, little value in documenting it.

Introduce a helper function to deduplicate the logic in the two
sites that check this bit.

The field tc_skip_classify is set only in IFB on skbs cloned in
act_mirred, so original packet sources do not have to clear the
bit when reusing packets (notably, pktgen and octeon).

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ifb.c            |  2 +-
 include/linux/skbuff.h       |  5 ++++-
 include/net/sch_generic.h    | 11 +++++++++++
 include/uapi/linux/pkt_cls.h |  6 ------
 net/core/dev.c               | 10 +++-------
 net/sched/act_api.c          | 11 ++++-------
 6 files changed, 23 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 082534e187fc..442c4c4a9606 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -81,7 +81,7 @@ static void ifb_ri_tasklet(unsigned long _txp)
 		u32 from = G_TC_FROM(skb->tc_verd);
 
 		skb->tc_verd = 0;
-		skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
+		skb->tc_skip_classify = 1;
 
 		u64_stats_update_begin(&txp->tsync);
 		txp->tx_packets++;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b53c0cfd417e..570f60ec6cb4 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -589,6 +589,7 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
  *	@pkt_type: Packet class
  *	@fclone: skbuff clone status
  *	@ipvs_property: skbuff is owned by ipvs
+ *	@tc_skip_classify: do not classify packet. set by IFB device
  *	@peeked: this packet has been seen already, so stats have been
  *		done for it, don't do them again
  *	@nf_trace: netfilter packet trace flag
@@ -749,7 +750,9 @@ struct sk_buff {
 #ifdef CONFIG_NET_SWITCHDEV
 	__u8			offload_fwd_mark:1;
 #endif
-	/* 2, 4 or 5 bit hole */
+#ifdef CONFIG_NET_CLS_ACT
+	__u8			tc_skip_classify:1;
+#endif
 
 #ifdef CONFIG_NET_SCHED
 	__u16			tc_index;	/* traffic control index */
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 498f81b229a4..857356f2d74b 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -418,6 +418,17 @@ static inline bool skb_at_tc_ingress(const struct sk_buff *skb)
 #endif
 }
 
+static inline bool skb_skip_tc_classify(struct sk_buff *skb)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	if (skb->tc_skip_classify) {
+		skb->tc_skip_classify = 0;
+		return true;
+	}
+#endif
+	return false;
+}
+
 /* Reset all TX qdiscs greater then index of a device.  */
 static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i)
 {
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index bba23dbb3ab6..1eed5d7509bc 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -22,8 +22,6 @@ bit 6,7: Where this packet was last seen
 1: on the Ingress
 2: on the Egress
 
-bit 8: when set --> Request not to classify on ingress. 
-
  *
  * */
 
@@ -36,10 +34,6 @@ bit 8: when set --> Request not to classify on ingress.
 #define AT_INGRESS	0x1
 #define AT_EGRESS	0x2
 
-#define TC_NCLS          _TC_MAKEMASK1(8)
-#define SET_TC_NCLS(v)   ( TC_NCLS | (v & ~TC_NCLS))
-#define CLR_TC_NCLS(v)   ( v & ~TC_NCLS)
-
 #define S_TC_AT          _TC_MAKE32(12)
 #define M_TC_AT          _TC_MAKEMASK(2,S_TC_AT)
 #define G_TC_AT(x)       _TC_GETVALUE(x,S_TC_AT,M_TC_AT)
diff --git a/net/core/dev.c b/net/core/dev.c
index 56818f7eab2b..e39e35d2e082 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4093,12 +4093,8 @@ another_round:
 			goto out;
 	}
 
-#ifdef CONFIG_NET_CLS_ACT
-	if (skb->tc_verd & TC_NCLS) {
-		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
-		goto ncls;
-	}
-#endif
+	if (skb_skip_tc_classify(skb))
+		goto skip_classify;
 
 	if (pfmemalloc)
 		goto skip_taps;
@@ -4128,8 +4124,8 @@ skip_taps:
 #endif
 #ifdef CONFIG_NET_CLS_ACT
 	skb->tc_verd = 0;
-ncls:
 #endif
+skip_classify:
 	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
 		goto drop;
 
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 2095c83ce773..f04715a57300 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -426,11 +426,9 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions,
 {
 	int ret = -1, i;
 
-	if (skb->tc_verd & TC_NCLS) {
-		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
-		ret = TC_ACT_OK;
-		goto exec_done;
-	}
+	if (skb_skip_tc_classify(skb))
+		return TC_ACT_OK;
+
 	for (i = 0; i < nr_actions; i++) {
 		const struct tc_action *a = actions[i];
 
@@ -439,9 +437,8 @@ repeat:
 		if (ret == TC_ACT_REPEAT)
 			goto repeat;	/* we need a ttl - JHS */
 		if (ret != TC_ACT_PIPE)
-			goto exec_done;
+			break;
 	}
-exec_done:
 	return ret;
 }
 EXPORT_SYMBOL(tcf_action_exec);
-- 
cgit v1.2.3


From a5135bcfba7345031df45e02cd150a45add47cf8 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Sat, 7 Jan 2017 17:06:36 -0500
Subject: net-tc: convert tc_verd to integer bitfields

Extract the remaining two fields from tc_verd and remove the __u16
completely. TC_AT and TC_FROM are converted to equivalent two-bit
integer fields tc_at and tc_from. Where possible, use existing
helper skb_at_tc_ingress when reading tc_at. Introduce helper
skb_reset_tc to clear fields.

Not documenting tc_from and tc_at, because they will be replaced
with single bit fields in follow-on patches.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ifb.c                    |  7 +++----
 drivers/staging/octeon/ethernet-tx.c |  5 ++---
 include/linux/skbuff.h               |  6 ++----
 include/net/sch_generic.h            | 10 +++++++++-
 include/uapi/linux/pkt_cls.h         | 31 -------------------------------
 net/core/dev.c                       | 10 ++++------
 net/core/pktgen.c                    |  4 +---
 net/core/skbuff.c                    |  3 ---
 net/sched/act_ife.c                  |  7 +++----
 net/sched/act_mirred.c               |  9 ++++-----
 net/sched/sch_netem.c                |  2 +-
 11 files changed, 29 insertions(+), 65 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 442c4c4a9606..b73b6b6c066b 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -78,9 +78,9 @@ static void ifb_ri_tasklet(unsigned long _txp)
 	}
 
 	while ((skb = __skb_dequeue(&txp->tq)) != NULL) {
-		u32 from = G_TC_FROM(skb->tc_verd);
+		u32 from = skb->tc_from;
 
-		skb->tc_verd = 0;
+		skb_reset_tc(skb);
 		skb->tc_skip_classify = 1;
 
 		u64_stats_update_begin(&txp->tsync);
@@ -239,7 +239,6 @@ static void ifb_setup(struct net_device *dev)
 static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ifb_dev_private *dp = netdev_priv(dev);
-	u32 from = G_TC_FROM(skb->tc_verd);
 	struct ifb_q_private *txp = dp->tx_private + skb_get_queue_mapping(skb);
 
 	u64_stats_update_begin(&txp->rsync);
@@ -247,7 +246,7 @@ static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 	txp->rx_bytes += skb->len;
 	u64_stats_update_end(&txp->rsync);
 
-	if (!(from & (AT_INGRESS|AT_EGRESS)) || !skb->skb_iif) {
+	if (skb->tc_from == AT_STACK || !skb->skb_iif) {
 		dev_kfree_skb(skb);
 		dev->stats.rx_dropped++;
 		return NETDEV_TX_OK;
diff --git a/drivers/staging/octeon/ethernet-tx.c b/drivers/staging/octeon/ethernet-tx.c
index 6b4c20872323..0b8053205091 100644
--- a/drivers/staging/octeon/ethernet-tx.c
+++ b/drivers/staging/octeon/ethernet-tx.c
@@ -23,6 +23,7 @@
 #endif /* CONFIG_XFRM */
 
 #include <linux/atomic.h>
+#include <net/sch_generic.h>
 
 #include <asm/octeon/octeon.h>
 
@@ -369,9 +370,7 @@ int cvm_oct_xmit(struct sk_buff *skb, struct net_device *dev)
 
 #ifdef CONFIG_NET_SCHED
 	skb->tc_index = 0;
-#ifdef CONFIG_NET_CLS_ACT
-	skb->tc_verd = 0;
-#endif /* CONFIG_NET_CLS_ACT */
+	skb_reset_tc(skb);
 #endif /* CONFIG_NET_SCHED */
 #endif /* REUSE_SKBUFFS_WITHOUT_FREE */
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 570f60ec6cb4..f738d09947b2 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -599,7 +599,6 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
  *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
  *	@skb_iif: ifindex of device we arrived on
  *	@tc_index: Traffic control index
- *	@tc_verd: traffic control verdict
  *	@hash: the packet hash
  *	@queue_mapping: Queue mapping for multiqueue devices
  *	@xmit_more: More SKBs are pending for this queue
@@ -752,13 +751,12 @@ struct sk_buff {
 #endif
 #ifdef CONFIG_NET_CLS_ACT
 	__u8			tc_skip_classify:1;
+	__u8			tc_at:2;
+	__u8			tc_from:2;
 #endif
 
 #ifdef CONFIG_NET_SCHED
 	__u16			tc_index;	/* traffic control index */
-#ifdef CONFIG_NET_CLS_ACT
-	__u16			tc_verd;	/* traffic control verdict */
-#endif
 #endif
 
 	union {
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 857356f2d74b..f80dba516964 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -409,10 +409,18 @@ bool tcf_destroy(struct tcf_proto *tp, bool force);
 void tcf_destroy_chain(struct tcf_proto __rcu **fl);
 int skb_do_redirect(struct sk_buff *);
 
+static inline void skb_reset_tc(struct sk_buff *skb)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	skb->tc_at = 0;
+	skb->tc_from = 0;
+#endif
+}
+
 static inline bool skb_at_tc_ingress(const struct sk_buff *skb)
 {
 #ifdef CONFIG_NET_CLS_ACT
-	return G_TC_AT(skb->tc_verd) & AT_INGRESS;
+	return skb->tc_at & AT_INGRESS;
 #else
 	return false;
 #endif
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 1eed5d7509bc..cee753a7a40c 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -5,40 +5,9 @@
 #include <linux/pkt_sched.h>
 
 #ifdef __KERNEL__
-/* I think i could have done better macros ; for now this is stolen from
- * some arch/mips code - jhs
-*/
-#define _TC_MAKE32(x) ((x))
-
-#define _TC_MAKEMASK1(n) (_TC_MAKE32(1) << _TC_MAKE32(n))
-#define _TC_MAKEMASK(v,n) (_TC_MAKE32((_TC_MAKE32(1)<<(v))-1) << _TC_MAKE32(n))
-#define _TC_MAKEVALUE(v,n) (_TC_MAKE32(v) << _TC_MAKE32(n))
-#define _TC_GETVALUE(v,n,m) ((_TC_MAKE32(v) & _TC_MAKE32(m)) >> _TC_MAKE32(n))
-
-/* verdict bit breakdown 
- *
-bit 6,7: Where this packet was last seen 
-0: Above the transmit example at the socket level
-1: on the Ingress
-2: on the Egress
-
- *
- * */
-
-#define S_TC_FROM          _TC_MAKE32(6)
-#define M_TC_FROM          _TC_MAKEMASK(2,S_TC_FROM)
-#define G_TC_FROM(x)       _TC_GETVALUE(x,S_TC_FROM,M_TC_FROM)
-#define V_TC_FROM(x)       _TC_MAKEVALUE(x,S_TC_FROM)
-#define SET_TC_FROM(v,n)   ((V_TC_FROM(n)) | (v & ~M_TC_FROM))
 #define AT_STACK	0x0
 #define AT_INGRESS	0x1
 #define AT_EGRESS	0x2
-
-#define S_TC_AT          _TC_MAKE32(12)
-#define M_TC_AT          _TC_MAKEMASK(2,S_TC_AT)
-#define G_TC_AT(x)       _TC_GETVALUE(x,S_TC_AT,M_TC_AT)
-#define V_TC_AT(x)       _TC_MAKEVALUE(x,S_TC_AT)
-#define SET_TC_AT(v,n)   ((V_TC_AT(n)) | (v & ~M_TC_AT))
 #endif
 
 /* Action attributes */
diff --git a/net/core/dev.c b/net/core/dev.c
index e39e35d2e082..8b5d6d033473 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3153,7 +3153,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 	if (!cl)
 		return skb;
 
-	/* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
+	/* skb->tc_at and qdisc_skb_cb(skb)->pkt_len were already set
 	 * earlier by the caller.
 	 */
 	qdisc_bstats_cpu_update(cl->q, skb);
@@ -3320,7 +3320,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
 
 	qdisc_pkt_len_init(skb);
 #ifdef CONFIG_NET_CLS_ACT
-	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
+	skb->tc_at = AT_EGRESS;
 # ifdef CONFIG_NET_EGRESS
 	if (static_key_false(&egress_needed)) {
 		skb = sch_handle_egress(skb, &rc, dev);
@@ -3920,7 +3920,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 	}
 
 	qdisc_skb_cb(skb)->pkt_len = skb->len;
-	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
+	skb->tc_at = AT_INGRESS;
 	qdisc_bstats_cpu_update(cl->q, skb);
 
 	switch (tc_classify(skb, cl, &cl_res, false)) {
@@ -4122,9 +4122,7 @@ skip_taps:
 			goto out;
 	}
 #endif
-#ifdef CONFIG_NET_CLS_ACT
-	skb->tc_verd = 0;
-#endif
+	skb_reset_tc(skb);
 skip_classify:
 	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
 		goto drop;
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 8e69ce472236..96947f5d41e4 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3439,9 +3439,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
 			/* skb was 'freed' by stack, so clean few
 			 * bits and reuse it
 			 */
-#ifdef CONFIG_NET_CLS_ACT
-			skb->tc_verd = 0; /* reset reclass/redir ttl */
-#endif
+			skb_reset_tc(skb);
 		} while (--burst > 0);
 		goto out; /* Skips xmit_mode M_START_XMIT */
 	} else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 5a03730fbc1a..adec4bf807d8 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -878,9 +878,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 #endif
 #ifdef CONFIG_NET_SCHED
 	CHECK_SKB_FIELD(tc_index);
-#ifdef CONFIG_NET_CLS_ACT
-	CHECK_SKB_FIELD(tc_verd);
-#endif
 #endif
 
 }
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 80b848d3f096..921fb20eaa7c 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -736,12 +736,11 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
 	u16 metalen = ife_get_sz(skb, ife);
 	int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN;
 	unsigned int skboff = skb->dev->hard_header_len;
-	u32 at = G_TC_AT(skb->tc_verd);
 	int new_len = skb->len + hdrm;
 	bool exceed_mtu = false;
 	int err;
 
-	if (at & AT_EGRESS) {
+	if (!skb_at_tc_ingress(skb)) {
 		if (new_len > skb->dev->mtu)
 			exceed_mtu = true;
 	}
@@ -773,7 +772,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
 		return TC_ACT_SHOT;
 	}
 
-	if (!(at & AT_EGRESS))
+	if (skb_at_tc_ingress(skb))
 		skb_push(skb, skb->dev->hard_header_len);
 
 	iethh = (struct ethhdr *)skb->data;
@@ -816,7 +815,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
 		ether_addr_copy(oethh->h_dest, iethh->h_dest);
 	oethh->h_proto = htons(ife->eth_type);
 
-	if (!(at & AT_EGRESS))
+	if (skb_at_tc_ingress(skb))
 		skb_pull(skb, skb->dev->hard_header_len);
 
 	spin_unlock(&ife->tcf_lock);
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 2d9fa6e0a1b4..8543279bba49 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -170,7 +170,6 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 	int retval, err = 0;
 	int m_eaction;
 	int mac_len;
-	u32 at;
 
 	tcf_lastuse_update(&m->tcf_tm);
 	bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
@@ -191,7 +190,6 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 		goto out;
 	}
 
-	at = G_TC_AT(skb->tc_verd);
 	skb2 = skb_clone(skb, GFP_ATOMIC);
 	if (!skb2)
 		goto out;
@@ -200,8 +198,9 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 	 * and devices expect a mac header on xmit, then mac push/pull is
 	 * needed.
 	 */
-	if (at != tcf_mirred_act_direction(m_eaction) && m_mac_header_xmit) {
-		if (at & AT_EGRESS) {
+	if (skb->tc_at != tcf_mirred_act_direction(m_eaction) &&
+	    m_mac_header_xmit) {
+		if (!skb_at_tc_ingress(skb)) {
 			/* caught at egress, act ingress: pull mac */
 			mac_len = skb_network_header(skb) - skb_mac_header(skb);
 			skb_pull_rcsum(skb2, mac_len);
@@ -213,7 +212,7 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 
 	/* mirror is always swallowed */
 	if (tcf_mirred_is_act_redirect(m_eaction))
-		skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at);
+		skb2->tc_from = skb2->tc_at;
 
 	skb2->skb_iif = skb->dev->ifindex;
 	skb2->dev = dev;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index bcfadfdea8e0..bb5c638b6852 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -626,7 +626,7 @@ deliver:
 			 * If it's at ingress let's pretend the delay is
 			 * from the network (tstamp will be updated).
 			 */
-			if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
+			if (skb->tc_from & AT_INGRESS)
 				skb->tstamp = 0;
 #endif
 
-- 
cgit v1.2.3


From 8dc07fdbf2054f157e8333f940a1ad728916c786 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Sat, 7 Jan 2017 17:06:37 -0500
Subject: net-tc: convert tc_at to tc_at_ingress

Field tc_at is used only within tc actions to distinguish ingress from
egress processing. A single bit is sufficient for this purpose.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h    |  3 ++-
 include/net/sch_generic.h |  3 +--
 net/core/dev.c            |  8 +++-----
 net/sched/act_mirred.c    | 12 ++++++------
 4 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f738d09947b2..fab3f87e9bd1 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -590,6 +590,7 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
  *	@fclone: skbuff clone status
  *	@ipvs_property: skbuff is owned by ipvs
  *	@tc_skip_classify: do not classify packet. set by IFB device
+ *	@tc_at_ingress: used within tc_classify to distinguish in/egress
  *	@peeked: this packet has been seen already, so stats have been
  *		done for it, don't do them again
  *	@nf_trace: netfilter packet trace flag
@@ -751,7 +752,7 @@ struct sk_buff {
 #endif
 #ifdef CONFIG_NET_CLS_ACT
 	__u8			tc_skip_classify:1;
-	__u8			tc_at:2;
+	__u8			tc_at_ingress:1;
 	__u8			tc_from:2;
 #endif
 
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index f80dba516964..4bd6d5387209 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -412,7 +412,6 @@ int skb_do_redirect(struct sk_buff *);
 static inline void skb_reset_tc(struct sk_buff *skb)
 {
 #ifdef CONFIG_NET_CLS_ACT
-	skb->tc_at = 0;
 	skb->tc_from = 0;
 #endif
 }
@@ -420,7 +419,7 @@ static inline void skb_reset_tc(struct sk_buff *skb)
 static inline bool skb_at_tc_ingress(const struct sk_buff *skb)
 {
 #ifdef CONFIG_NET_CLS_ACT
-	return skb->tc_at & AT_INGRESS;
+	return skb->tc_at_ingress;
 #else
 	return false;
 #endif
diff --git a/net/core/dev.c b/net/core/dev.c
index 8b5d6d033473..c143f1391117 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3153,9 +3153,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 	if (!cl)
 		return skb;
 
-	/* skb->tc_at and qdisc_skb_cb(skb)->pkt_len were already set
-	 * earlier by the caller.
-	 */
+	/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
 	qdisc_bstats_cpu_update(cl->q, skb);
 
 	switch (tc_classify(skb, cl, &cl_res, false)) {
@@ -3320,7 +3318,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
 
 	qdisc_pkt_len_init(skb);
 #ifdef CONFIG_NET_CLS_ACT
-	skb->tc_at = AT_EGRESS;
+	skb->tc_at_ingress = 0;
 # ifdef CONFIG_NET_EGRESS
 	if (static_key_false(&egress_needed)) {
 		skb = sch_handle_egress(skb, &rc, dev);
@@ -3920,7 +3918,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 	}
 
 	qdisc_skb_cb(skb)->pkt_len = skb->len;
-	skb->tc_at = AT_INGRESS;
+	skb->tc_at_ingress = 1;
 	qdisc_bstats_cpu_update(cl->q, skb);
 
 	switch (tc_classify(skb, cl, &cl_res, false)) {
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 8543279bba49..e832c62fd705 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -39,15 +39,15 @@ static bool tcf_mirred_is_act_redirect(int action)
 	return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR;
 }
 
-static u32 tcf_mirred_act_direction(int action)
+static bool tcf_mirred_act_wants_ingress(int action)
 {
 	switch (action) {
 	case TCA_EGRESS_REDIR:
 	case TCA_EGRESS_MIRROR:
-		return AT_EGRESS;
+		return false;
 	case TCA_INGRESS_REDIR:
 	case TCA_INGRESS_MIRROR:
-		return AT_INGRESS;
+		return true;
 	default:
 		BUG();
 	}
@@ -198,7 +198,7 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 	 * and devices expect a mac header on xmit, then mac push/pull is
 	 * needed.
 	 */
-	if (skb->tc_at != tcf_mirred_act_direction(m_eaction) &&
+	if (skb_at_tc_ingress(skb) != tcf_mirred_act_wants_ingress(m_eaction) &&
 	    m_mac_header_xmit) {
 		if (!skb_at_tc_ingress(skb)) {
 			/* caught at egress, act ingress: pull mac */
@@ -212,11 +212,11 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 
 	/* mirror is always swallowed */
 	if (tcf_mirred_is_act_redirect(m_eaction))
-		skb2->tc_from = skb2->tc_at;
+		skb2->tc_from = skb_at_tc_ingress(skb) ? AT_INGRESS : AT_EGRESS;
 
 	skb2->skb_iif = skb->dev->ifindex;
 	skb2->dev = dev;
-	if (tcf_mirred_act_direction(m_eaction) & AT_EGRESS)
+	if (!tcf_mirred_act_wants_ingress(m_eaction))
 		err = dev_queue_xmit(skb2);
 	else
 		err = netif_receive_skb(skb2);
-- 
cgit v1.2.3


From bc31c905e946b5c55df5d2938335e78ffb3157ca Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Sat, 7 Jan 2017 17:06:38 -0500
Subject: net-tc: convert tc_from to tc_from_ingress and tc_redirected

The tc_from field fulfills two roles. It encodes whether a packet was
redirected by an act_mirred device and, if so, whether act_mirred was
called on ingress or egress. Split it into separate fields.

The information is needed by the special IFB loop, where packets are
taken out of the normal path by act_mirred, forwarded to IFB, then
reinjected at their original location (ingress or egress) by IFB.

The IFB device cannot use skb->tc_at_ingress, because that may have
been overwritten as the packet travels from act_mirred to ifb_xmit,
when it passes through tc_classify on the IFB egress path. Cache this
value in skb->tc_from_ingress.

That field is valid only if a packet arriving at ifb_xmit came from
act_mirred. Other packets can be crafted to reach ifb_xmit. These
must be dropped. Set tc_redirected on redirection and drop all packets
that do not have this bit set.

Both fields are set only on cloned skbs in tc actions, so original
packet sources do not have to clear the bit when reusing packets
(notably, pktgen and octeon).

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ifb.c            | 13 +++++--------
 include/linux/skbuff.h       |  5 ++++-
 include/net/sch_generic.h    |  2 +-
 include/uapi/linux/pkt_cls.h |  6 ------
 net/sched/act_mirred.c       |  6 ++++--
 net/sched/sch_netem.c        |  2 +-
 6 files changed, 15 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index b73b6b6c066b..312fce7302d3 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -78,9 +78,7 @@ static void ifb_ri_tasklet(unsigned long _txp)
 	}
 
 	while ((skb = __skb_dequeue(&txp->tq)) != NULL) {
-		u32 from = skb->tc_from;
-
-		skb_reset_tc(skb);
+		skb->tc_redirected = 0;
 		skb->tc_skip_classify = 1;
 
 		u64_stats_update_begin(&txp->tsync);
@@ -101,13 +99,12 @@ static void ifb_ri_tasklet(unsigned long _txp)
 		rcu_read_unlock();
 		skb->skb_iif = txp->dev->ifindex;
 
-		if (from & AT_EGRESS) {
+		if (!skb->tc_from_ingress) {
 			dev_queue_xmit(skb);
-		} else if (from & AT_INGRESS) {
+		} else {
 			skb_pull(skb, skb->mac_len);
 			netif_receive_skb(skb);
-		} else
-			BUG();
+		}
 	}
 
 	if (__netif_tx_trylock(txq)) {
@@ -246,7 +243,7 @@ static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 	txp->rx_bytes += skb->len;
 	u64_stats_update_end(&txp->rsync);
 
-	if (skb->tc_from == AT_STACK || !skb->skb_iif) {
+	if (!skb->tc_redirected || !skb->skb_iif) {
 		dev_kfree_skb(skb);
 		dev->stats.rx_dropped++;
 		return NETDEV_TX_OK;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index fab3f87e9bd1..3149a88de548 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -591,6 +591,8 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
  *	@ipvs_property: skbuff is owned by ipvs
  *	@tc_skip_classify: do not classify packet. set by IFB device
  *	@tc_at_ingress: used within tc_classify to distinguish in/egress
+ *	@tc_redirected: packet was redirected by a tc action
+ *	@tc_from_ingress: if tc_redirected, tc_at_ingress at time of redirect
  *	@peeked: this packet has been seen already, so stats have been
  *		done for it, don't do them again
  *	@nf_trace: netfilter packet trace flag
@@ -753,7 +755,8 @@ struct sk_buff {
 #ifdef CONFIG_NET_CLS_ACT
 	__u8			tc_skip_classify:1;
 	__u8			tc_at_ingress:1;
-	__u8			tc_from:2;
+	__u8			tc_redirected:1;
+	__u8			tc_from_ingress:1;
 #endif
 
 #ifdef CONFIG_NET_SCHED
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 4bd6d5387209..e2f426f6d62f 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -412,7 +412,7 @@ int skb_do_redirect(struct sk_buff *);
 static inline void skb_reset_tc(struct sk_buff *skb)
 {
 #ifdef CONFIG_NET_CLS_ACT
-	skb->tc_from = 0;
+	skb->tc_redirected = 0;
 #endif
 }
 
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index cee753a7a40c..a081efbd61a2 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -4,12 +4,6 @@
 #include <linux/types.h>
 #include <linux/pkt_sched.h>
 
-#ifdef __KERNEL__
-#define AT_STACK	0x0
-#define AT_INGRESS	0x1
-#define AT_EGRESS	0x2
-#endif
-
 /* Action attributes */
 enum {
 	TCA_ACT_UNSPEC,
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index e832c62fd705..84682f02b611 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -211,8 +211,10 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 	}
 
 	/* mirror is always swallowed */
-	if (tcf_mirred_is_act_redirect(m_eaction))
-		skb2->tc_from = skb_at_tc_ingress(skb) ? AT_INGRESS : AT_EGRESS;
+	if (tcf_mirred_is_act_redirect(m_eaction)) {
+		skb2->tc_redirected = 1;
+		skb2->tc_from_ingress = skb2->tc_at_ingress;
+	}
 
 	skb2->skb_iif = skb->dev->ifindex;
 	skb2->dev = dev;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index bb5c638b6852..c8bb62a1e744 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -626,7 +626,7 @@ deliver:
 			 * If it's at ingress let's pretend the delay is
 			 * from the network (tstamp will be updated).
 			 */
-			if (skb->tc_from & AT_INGRESS)
+			if (skb->tc_redirected && skb->tc_from_ingress)
 				skb->tstamp = 0;
 #endif
 
-- 
cgit v1.2.3


From b21507e272627c434e8dd74e8d51fd8245281b59 Mon Sep 17 00:00:00 2001
From: Stephen Smalley <sds@tycho.nsa.gov>
Date: Mon, 9 Jan 2017 10:07:31 -0500
Subject: proc,security: move restriction on writing /proc/pid/attr nodes to
 proc

Processes can only alter their own security attributes via
/proc/pid/attr nodes.  This is presently enforced by each individual
security module and is also imposed by the Linux credentials
implementation, which only allows a task to alter its own credentials.
Move the check enforcing this restriction from the individual
security modules to proc_pid_attr_write() before calling the security hook,
and drop the unnecessary task argument to the security hook since it can
only ever be the current task.

Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Acked-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 fs/proc/base.c             | 13 +++++++++----
 include/linux/lsm_hooks.h  |  3 +--
 include/linux/security.h   |  4 ++--
 security/apparmor/lsm.c    |  7 ++-----
 security/security.c        |  4 ++--
 security/selinux/hooks.c   | 13 +------------
 security/smack/smack_lsm.c | 11 +----------
 7 files changed, 18 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8e7e61b28f31..988c5a77e888 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2488,6 +2488,12 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
 	length = -ESRCH;
 	if (!task)
 		goto out_no_task;
+
+	/* A task may only write its own attributes. */
+	length = -EACCES;
+	if (current != task)
+		goto out;
+
 	if (count > PAGE_SIZE)
 		count = PAGE_SIZE;
 
@@ -2503,14 +2509,13 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
 	}
 
 	/* Guard against adverse ptrace interaction */
-	length = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
+	length = mutex_lock_interruptible(&current->signal->cred_guard_mutex);
 	if (length < 0)
 		goto out_free;
 
-	length = security_setprocattr(task,
-				      (char*)file->f_path.dentry->d_name.name,
+	length = security_setprocattr(file->f_path.dentry->d_name.name,
 				      page, count);
-	mutex_unlock(&task->signal->cred_guard_mutex);
+	mutex_unlock(&current->signal->cred_guard_mutex);
 out_free:
 	kfree(page);
 out:
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 558adfa5c8a8..0dde95900196 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1547,8 +1547,7 @@ union security_list_options {
 	void (*d_instantiate)(struct dentry *dentry, struct inode *inode);
 
 	int (*getprocattr)(struct task_struct *p, char *name, char **value);
-	int (*setprocattr)(struct task_struct *p, char *name, void *value,
-				size_t size);
+	int (*setprocattr)(const char *name, void *value, size_t size);
 	int (*ismaclabel)(const char *name);
 	int (*secid_to_secctx)(u32 secid, char **secdata, u32 *seclen);
 	int (*secctx_to_secid)(const char *secdata, u32 seclen, u32 *secid);
diff --git a/include/linux/security.h b/include/linux/security.h
index c2125e9093e8..f4ebac117fa6 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -361,7 +361,7 @@ int security_sem_semop(struct sem_array *sma, struct sembuf *sops,
 			unsigned nsops, int alter);
 void security_d_instantiate(struct dentry *dentry, struct inode *inode);
 int security_getprocattr(struct task_struct *p, char *name, char **value);
-int security_setprocattr(struct task_struct *p, char *name, void *value, size_t size);
+int security_setprocattr(const char *name, void *value, size_t size);
 int security_netlink_send(struct sock *sk, struct sk_buff *skb);
 int security_ismaclabel(const char *name);
 int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen);
@@ -1106,7 +1106,7 @@ static inline int security_getprocattr(struct task_struct *p, char *name, char *
 	return -EINVAL;
 }
 
-static inline int security_setprocattr(struct task_struct *p, char *name, void *value, size_t size)
+static inline int security_setprocattr(char *name, void *value, size_t size)
 {
 	return -EINVAL;
 }
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index 41b8cb115801..8202e5583479 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -495,8 +495,8 @@ static int apparmor_getprocattr(struct task_struct *task, char *name,
 	return error;
 }
 
-static int apparmor_setprocattr(struct task_struct *task, char *name,
-				void *value, size_t size)
+static int apparmor_setprocattr(const char *name, void *value,
+				size_t size)
 {
 	struct common_audit_data sa;
 	struct apparmor_audit_data aad = {0,};
@@ -506,9 +506,6 @@ static int apparmor_setprocattr(struct task_struct *task, char *name,
 
 	if (size == 0)
 		return -EINVAL;
-	/* task can only write its own attributes */
-	if (current != task)
-		return -EACCES;
 
 	/* AppArmor requires that the buffer must be null terminated atm */
 	if (args[size - 1] != '\0') {
diff --git a/security/security.c b/security/security.c
index f825304f04a7..32052f5c76b2 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1170,9 +1170,9 @@ int security_getprocattr(struct task_struct *p, char *name, char **value)
 	return call_int_hook(getprocattr, -EINVAL, p, name, value);
 }
 
-int security_setprocattr(struct task_struct *p, char *name, void *value, size_t size)
+int security_setprocattr(const char *name, void *value, size_t size)
 {
-	return call_int_hook(setprocattr, -EINVAL, p, name, value, size);
+	return call_int_hook(setprocattr, -EINVAL, name, value, size);
 }
 
 int security_netlink_send(struct sock *sk, struct sk_buff *skb)
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 262e108c36d4..bada3cd42b9c 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -5862,8 +5862,7 @@ bad:
 	return error;
 }
 
-static int selinux_setprocattr(struct task_struct *p,
-			       char *name, void *value, size_t size)
+static int selinux_setprocattr(const char *name, void *value, size_t size)
 {
 	struct task_security_struct *tsec;
 	struct cred *new;
@@ -5871,16 +5870,6 @@ static int selinux_setprocattr(struct task_struct *p,
 	int error;
 	char *str = value;
 
-	if (current != p) {
-		/*
-		 * A task may only alter its own credentials.
-		 * SELinux has always enforced this restriction,
-		 * and it is now mandated by the Linux credentials
-		 * infrastructure; see Documentation/security/credentials.txt.
-		 */
-		return -EACCES;
-	}
-
 	/*
 	 * Basic control over ability to set these attributes at all.
 	 */
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 94dc9d406ce3..8da4a6b9ca4d 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -3620,7 +3620,6 @@ static int smack_getprocattr(struct task_struct *p, char *name, char **value)
 
 /**
  * smack_setprocattr - Smack process attribute setting
- * @p: the object task
  * @name: the name of the attribute in /proc/.../attr
  * @value: the value to set
  * @size: the size of the value
@@ -3630,8 +3629,7 @@ static int smack_getprocattr(struct task_struct *p, char *name, char **value)
  *
  * Returns the length of the smack label or an error code
  */
-static int smack_setprocattr(struct task_struct *p, char *name,
-			     void *value, size_t size)
+static int smack_setprocattr(const char *name, void *value, size_t size)
 {
 	struct task_smack *tsp = current_security();
 	struct cred *new;
@@ -3639,13 +3637,6 @@ static int smack_setprocattr(struct task_struct *p, char *name,
 	struct smack_known_list_elem *sklep;
 	int rc;
 
-	/*
-	 * Changing another process' Smack value is too dangerous
-	 * and supports no sane use case.
-	 */
-	if (p != current)
-		return -EPERM;
-
 	if (!smack_privileged(CAP_MAC_ADMIN) && list_empty(&tsp->smk_relabel))
 		return -EPERM;
 
-- 
cgit v1.2.3


From f32815d21d4d8287336fb9cef4d2d9e0866214c2 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Mon, 2 Jan 2017 17:19:40 -0500
Subject: xtables: add xt_match, xt_target and data copy_to_user functions

xt_entry_target, xt_entry_match and their private data may contain
kernel data.

Introduce helper functions xt_match_to_user, xt_target_to_user and
xt_data_to_user that copy only the expected fields. These replace
existing logic that calls copy_to_user on entire structs, then
overwrites select fields.

Private data is defined in xt_match and xt_target. All matches and
targets that maintain kernel data store this at the tail of their
private structure. Extend xt_match and xt_target with .usersize to
limit how many bytes of data are copied. The remainder is cleared.

If compatsize is specified, usersize can only safely be used if all
fields up to usersize use platform-independent types. Otherwise, the
compat_to_user callback must be defined.

This patch does not yet enable the support logic.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h |  9 +++++++
 net/netfilter/x_tables.c           | 54 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 5117e4d2ddfa..be378cf47fcc 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -167,6 +167,7 @@ struct xt_match {
 
 	const char *table;
 	unsigned int matchsize;
+	unsigned int usersize;
 #ifdef CONFIG_COMPAT
 	unsigned int compatsize;
 #endif
@@ -207,6 +208,7 @@ struct xt_target {
 
 	const char *table;
 	unsigned int targetsize;
+	unsigned int usersize;
 #ifdef CONFIG_COMPAT
 	unsigned int compatsize;
 #endif
@@ -287,6 +289,13 @@ int xt_check_match(struct xt_mtchk_param *, unsigned int size, u_int8_t proto,
 int xt_check_target(struct xt_tgchk_param *, unsigned int size, u_int8_t proto,
 		    bool inv_proto);
 
+int xt_match_to_user(const struct xt_entry_match *m,
+		     struct xt_entry_match __user *u);
+int xt_target_to_user(const struct xt_entry_target *t,
+		      struct xt_entry_target __user *u);
+int xt_data_to_user(void __user *dst, const void *src,
+		    int usersize, int size);
+
 void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
 				 struct xt_counters_info *info, bool compat);
 
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 2ff499680cc6..feccf527abdd 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -262,6 +262,60 @@ struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision)
 }
 EXPORT_SYMBOL_GPL(xt_request_find_target);
 
+
+static int xt_obj_to_user(u16 __user *psize, u16 size,
+			  void __user *pname, const char *name,
+			  u8 __user *prev, u8 rev)
+{
+	if (put_user(size, psize))
+		return -EFAULT;
+	if (copy_to_user(pname, name, strlen(name) + 1))
+		return -EFAULT;
+	if (put_user(rev, prev))
+		return -EFAULT;
+
+	return 0;
+}
+
+#define XT_OBJ_TO_USER(U, K, TYPE, C_SIZE)				\
+	xt_obj_to_user(&U->u.TYPE##_size, C_SIZE ? : K->u.TYPE##_size,	\
+		       U->u.user.name, K->u.kernel.TYPE->name,		\
+		       &U->u.user.revision, K->u.kernel.TYPE->revision)
+
+int xt_data_to_user(void __user *dst, const void *src,
+		    int usersize, int size)
+{
+	usersize = usersize ? : size;
+	if (copy_to_user(dst, src, usersize))
+		return -EFAULT;
+	if (usersize != size && clear_user(dst + usersize, size - usersize))
+		return -EFAULT;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xt_data_to_user);
+
+#define XT_DATA_TO_USER(U, K, TYPE, C_SIZE)				\
+	xt_data_to_user(U->data, K->data,				\
+			K->u.kernel.TYPE->usersize,			\
+			C_SIZE ? : K->u.kernel.TYPE->TYPE##size)
+
+int xt_match_to_user(const struct xt_entry_match *m,
+		     struct xt_entry_match __user *u)
+{
+	return XT_OBJ_TO_USER(u, m, match, 0) ||
+	       XT_DATA_TO_USER(u, m, match, 0);
+}
+EXPORT_SYMBOL_GPL(xt_match_to_user);
+
+int xt_target_to_user(const struct xt_entry_target *t,
+		      struct xt_entry_target __user *u)
+{
+	return XT_OBJ_TO_USER(u, t, target, 0) ||
+	       XT_DATA_TO_USER(u, t, target, 0);
+}
+EXPORT_SYMBOL_GPL(xt_target_to_user);
+
 static int match_revfn(u8 af, const char *name, u8 revision, int *bestp)
 {
 	const struct xt_match *m;
-- 
cgit v1.2.3


From 0118717583cda6f4f36092853ad0345e8150b286 Mon Sep 17 00:00:00 2001
From: Eli Cohen <eli@mellanox.com>
Date: Tue, 3 Jan 2017 23:55:24 +0200
Subject: net/mlx5: Add interface to get reference to a UAR

A reference to a UAR is required to generate CQ or EQ doorbells. Since
CQ or EQ doorbells can all be generated using the same UAR area without
any effect on performance, we are just getting a reference to any
available UAR, If one is not available we allocate it but we don't waste
the blue flame registers it can provide and we will use them for
subsequent allocations.
We get a reference to such UAR and put in mlx5_priv so any kernel
consumer can make use of it.

Signed-off-by: Eli Cohen <eli@mellanox.com>
Reviewed-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c   | 14 ++++-------
 drivers/net/ethernet/mellanox/mlx5/core/main.c | 22 ++++++++++++++----
 drivers/net/ethernet/mellanox/mlx5/core/uar.c  | 32 ++++++++++++++++++++++++++
 include/linux/mlx5/driver.h                    |  5 +++-
 4 files changed, 59 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 11a8d638bcd0..5130d65dd41a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -512,7 +512,7 @@ static void init_eq_buf(struct mlx5_eq *eq)
 
 int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
 		       int nent, u64 mask, const char *name,
-		       struct mlx5_uar *uar, enum mlx5_eq_type type)
+		       enum mlx5_eq_type type)
 {
 	u32 out[MLX5_ST_SZ_DW(create_eq_out)] = {0};
 	struct mlx5_priv *priv = &dev->priv;
@@ -556,7 +556,7 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
 
 	eqc = MLX5_ADDR_OF(create_eq_in, in, eq_context_entry);
 	MLX5_SET(eqc, eqc, log_eq_size, ilog2(eq->nent));
-	MLX5_SET(eqc, eqc, uar_page, uar->index);
+	MLX5_SET(eqc, eqc, uar_page, priv->uar->index);
 	MLX5_SET(eqc, eqc, intr, vecidx);
 	MLX5_SET(eqc, eqc, log_page_size,
 		 eq->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
@@ -571,7 +571,7 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
 	eq->eqn = MLX5_GET(create_eq_out, out, eq_number);
 	eq->irqn = priv->msix_arr[vecidx].vector;
 	eq->dev = dev;
-	eq->doorbell = uar->map + MLX5_EQ_DOORBEL_OFFSET;
+	eq->doorbell = priv->uar->map + MLX5_EQ_DOORBEL_OFFSET;
 	err = request_irq(eq->irqn, handler, 0,
 			  priv->irq_info[vecidx].name, eq);
 	if (err)
@@ -686,8 +686,7 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
 
 	err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD,
 				 MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD,
-				 "mlx5_cmd_eq", &dev->priv.bfregi.uars[0],
-				 MLX5_EQ_TYPE_ASYNC);
+				 "mlx5_cmd_eq", MLX5_EQ_TYPE_ASYNC);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create cmd EQ %d\n", err);
 		return err;
@@ -697,8 +696,7 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
 
 	err = mlx5_create_map_eq(dev, &table->async_eq, MLX5_EQ_VEC_ASYNC,
 				 MLX5_NUM_ASYNC_EQE, async_event_mask,
-				 "mlx5_async_eq", &dev->priv.bfregi.uars[0],
-				 MLX5_EQ_TYPE_ASYNC);
+				 "mlx5_async_eq", MLX5_EQ_TYPE_ASYNC);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create async EQ %d\n", err);
 		goto err1;
@@ -708,7 +706,6 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
 				 MLX5_EQ_VEC_PAGES,
 				 /* TODO: sriov max_vf + */ 1,
 				 1 << MLX5_EVENT_TYPE_PAGE_REQUEST, "mlx5_pages_eq",
-				 &dev->priv.bfregi.uars[0],
 				 MLX5_EQ_TYPE_ASYNC);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create pages EQ %d\n", err);
@@ -722,7 +719,6 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
 					 MLX5_NUM_ASYNC_EQE,
 					 1 << MLX5_EVENT_TYPE_PAGE_FAULT,
 					 "mlx5_page_fault_eq",
-					 &dev->priv.bfregi.uars[0],
 					 MLX5_EQ_TYPE_PF);
 		if (err) {
 			mlx5_core_warn(dev, "failed to create page fault EQ %d\n",
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 634e96a02516..2882d0483ed8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -753,8 +753,7 @@ static int alloc_comp_eqs(struct mlx5_core_dev *dev)
 		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d", i);
 		err = mlx5_create_map_eq(dev, eq,
 					 i + MLX5_EQ_VEC_COMP_BASE, nent, 0,
-					 name, &dev->priv.bfregi.uars[0],
-					 MLX5_EQ_TYPE_COMP);
+					 name, MLX5_EQ_TYPE_COMP);
 		if (err) {
 			kfree(eq);
 			goto clean;
@@ -1094,12 +1093,18 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 		goto err_cleanup_once;
 	}
 
-	err = mlx5_alloc_bfregs(dev, &priv->bfregi);
-	if (err) {
+	dev->priv.uar = mlx5_get_uars_page(dev);
+	if (!dev->priv.uar) {
 		dev_err(&pdev->dev, "Failed allocating uar, aborting\n");
 		goto err_disable_msix;
 	}
 
+	err = mlx5_alloc_bfregs(dev, &priv->bfregi);
+	if (err) {
+		dev_err(&pdev->dev, "Failed allocating uuars, aborting\n");
+		goto err_uar_cleanup;
+	}
+
 	err = mlx5_start_eqs(dev);
 	if (err) {
 		dev_err(&pdev->dev, "Failed to start pages and async EQs\n");
@@ -1172,6 +1177,9 @@ err_stop_eqs:
 err_free_uar:
 	mlx5_free_bfregs(dev, &priv->bfregi);
 
+err_uar_cleanup:
+	mlx5_put_uars_page(dev, priv->uar);
+
 err_disable_msix:
 	mlx5_disable_msix(dev);
 
@@ -1231,6 +1239,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 	free_comp_eqs(dev);
 	mlx5_stop_eqs(dev);
 	mlx5_free_bfregs(dev, &priv->bfregi);
+	mlx5_put_uars_page(dev, priv->uar);
 	mlx5_disable_msix(dev);
 	if (cleanup)
 		mlx5_cleanup_once(dev);
@@ -1305,6 +1314,11 @@ static int init_one(struct pci_dev *pdev,
 		goto clean_dev;
 	}
 #endif
+	mutex_init(&priv->bfregs.reg_head.lock);
+	mutex_init(&priv->bfregs.wc_head.lock);
+	INIT_LIST_HEAD(&priv->bfregs.reg_head.list);
+	INIT_LIST_HEAD(&priv->bfregs.wc_head.list);
+
 	err = mlx5_pci_init(dev, priv);
 	if (err) {
 		dev_err(&pdev->dev, "mlx5_pci_init failed with error code %d\n", err);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/uar.c b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
index 6a081a8787a7..fcc0270ea72f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/uar.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
@@ -332,6 +332,38 @@ error1:
 	return ERR_PTR(err);
 }
 
+struct mlx5_uars_page *mlx5_get_uars_page(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_uars_page *ret;
+
+	mutex_lock(&mdev->priv.bfregs.reg_head.lock);
+	if (list_empty(&mdev->priv.bfregs.reg_head.list)) {
+		ret = alloc_uars_page(mdev, false);
+		if (IS_ERR(ret)) {
+			ret = NULL;
+			goto out;
+		}
+		list_add(&ret->list, &mdev->priv.bfregs.reg_head.list);
+	} else {
+		ret = list_first_entry(&mdev->priv.bfregs.reg_head.list,
+				       struct mlx5_uars_page, list);
+		kref_get(&ret->ref_count);
+	}
+out:
+	mutex_unlock(&mdev->priv.bfregs.reg_head.lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(mlx5_get_uars_page);
+
+void mlx5_put_uars_page(struct mlx5_core_dev *mdev, struct mlx5_uars_page *up)
+{
+	mutex_lock(&mdev->priv.bfregs.reg_head.lock);
+	kref_put(&up->ref_count, up_rel_func);
+	mutex_unlock(&mdev->priv.bfregs.reg_head.lock);
+}
+EXPORT_SYMBOL(mlx5_put_uars_page);
+
 static unsigned long map_offset(struct mlx5_core_dev *mdev, int dbi)
 {
 	/* return the offset in bytes from the start of the page to the
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 969aa1fe17e2..9a3a0954855b 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -679,6 +679,7 @@ struct mlx5_priv {
 	struct srcu_struct      pfault_srcu;
 #endif
 	struct mlx5_bfreg_data		bfregs;
+	struct mlx5_uars_page	       *uar;
 };
 
 enum mlx5_device_state {
@@ -1007,7 +1008,7 @@ void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec);
 void mlx5_cq_event(struct mlx5_core_dev *dev, u32 cqn, int event_type);
 int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
 		       int nent, u64 mask, const char *name,
-		       struct mlx5_uar *uar, enum mlx5_eq_type type);
+		       enum mlx5_eq_type type);
 int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
 int mlx5_start_eqs(struct mlx5_core_dev *dev);
 int mlx5_stop_eqs(struct mlx5_core_dev *dev);
@@ -1118,6 +1119,8 @@ int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev);
 int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev);
 bool mlx5_lag_is_active(struct mlx5_core_dev *dev);
 struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev);
+struct mlx5_uars_page *mlx5_get_uars_page(struct mlx5_core_dev *mdev);
+void mlx5_put_uars_page(struct mlx5_core_dev *mdev, struct mlx5_uars_page *up);
 
 struct mlx5_profile {
 	u64	mask;
-- 
cgit v1.2.3


From 5fe9dec0d045437e48f112b8fa705197bd7bc3c0 Mon Sep 17 00:00:00 2001
From: Eli Cohen <eli@mellanox.com>
Date: Tue, 3 Jan 2017 23:55:25 +0200
Subject: IB/mlx5: Use blue flame register allocator in mlx5_ib

Make use of the blue flame registers allocator at mlx5_ib. Since blue
flame was not really supported we remove all the code that is related to
blue flame and we let all consumers to use the same blue flame register.
Once blue flame is supported we will add the code. As part of this patch
we also move the definition of struct mlx5_bf to mlx5_ib.h as it is only
used by mlx5_ib.

Signed-off-by: Eli Cohen <eli@mellanox.com>
Reviewed-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/cq.c                |   8 +-
 drivers/infiniband/hw/mlx5/main.c              |  28 +++++-
 drivers/infiniband/hw/mlx5/mlx5_ib.h           |  11 ++-
 drivers/infiniband/hw/mlx5/qp.c                |  73 +++-------------
 drivers/net/ethernet/mellanox/mlx5/core/en.h   |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/main.c |  16 +---
 drivers/net/ethernet/mellanox/mlx5/core/uar.c  | 114 -------------------------
 include/linux/mlx5/cq.h                        |   3 +-
 include/linux/mlx5/doorbell.h                  |   6 +-
 include/linux/mlx5/driver.h                    |  19 -----
 10 files changed, 59 insertions(+), 221 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index bb7e91c55003..a28ec33b82ed 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -689,7 +689,7 @@ int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
 {
 	struct mlx5_core_dev *mdev = to_mdev(ibcq->device)->mdev;
 	struct mlx5_ib_cq *cq = to_mcq(ibcq);
-	void __iomem *uar_page = mdev->priv.bfregi.uars[0].map;
+	void __iomem *uar_page = mdev->priv.uar->map;
 	unsigned long irq_flags;
 	int ret = 0;
 
@@ -704,9 +704,7 @@ int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
 	mlx5_cq_arm(&cq->mcq,
 		    (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
 		    MLX5_CQ_DB_REQ_NOT_SOL : MLX5_CQ_DB_REQ_NOT,
-		    uar_page,
-		    MLX5_GET_DOORBELL_LOCK(&mdev->priv.cq_uar_lock),
-		    to_mcq(ibcq)->mcq.cons_index);
+		    uar_page, to_mcq(ibcq)->mcq.cons_index);
 
 	return ret;
 }
@@ -886,7 +884,7 @@ static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
 	MLX5_SET(cqc, cqc, log_page_size,
 		 cq->buf.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
 
-	*index = dev->mdev->priv.bfregi.uars[0].index;
+	*index = dev->mdev->priv.uar->index;
 
 	return 0;
 
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index d5cf82b387d3..e9f0830eca1c 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -3074,8 +3074,6 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 	if (mlx5_use_mad_ifc(dev))
 		get_ext_port_caps(dev);
 
-	MLX5_INIT_DOORBELL_LOCK(&dev->uar_lock);
-
 	if (!mlx5_lag_is_active(mdev))
 		name = "mlx5_%d";
 	else
@@ -3251,9 +3249,21 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 	if (err)
 		goto err_odp;
 
+	dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev);
+	if (!dev->mdev->priv.uar)
+		goto err_q_cnt;
+
+	err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false);
+	if (err)
+		goto err_uar_page;
+
+	err = mlx5_alloc_bfreg(dev->mdev, &dev->fp_bfreg, false, true);
+	if (err)
+		goto err_bfreg;
+
 	err = ib_register_device(&dev->ib_dev, NULL);
 	if (err)
-		goto err_q_cnt;
+		goto err_fp_bfreg;
 
 	err = create_umr_res(dev);
 	if (err)
@@ -3276,6 +3286,15 @@ err_umrc:
 err_dev:
 	ib_unregister_device(&dev->ib_dev);
 
+err_fp_bfreg:
+	mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
+
+err_bfreg:
+	mlx5_free_bfreg(dev->mdev, &dev->bfreg);
+
+err_uar_page:
+	mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar);
+
 err_q_cnt:
 	mlx5_ib_dealloc_q_counters(dev);
 
@@ -3307,6 +3326,9 @@ static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
 
 	mlx5_remove_netdev_notifier(dev);
 	ib_unregister_device(&dev->ib_dev);
+	mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
+	mlx5_free_bfreg(dev->mdev, &dev->bfreg);
+	mlx5_put_uars_page(dev->mdev, mdev->priv.uar);
 	mlx5_ib_dealloc_q_counters(dev);
 	destroy_umrc_res(dev);
 	mlx5_ib_odp_remove_one(dev);
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index d4d1329df94a..ae3bc4a1bfed 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -324,6 +324,12 @@ struct mlx5_ib_raw_packet_qp {
 	struct mlx5_ib_rq rq;
 };
 
+struct mlx5_bf {
+	int			buf_size;
+	unsigned long		offset;
+	struct mlx5_sq_bfreg   *bfreg;
+};
+
 struct mlx5_ib_qp {
 	struct ib_qp		ibqp;
 	union {
@@ -349,7 +355,7 @@ struct mlx5_ib_qp {
 	int			wq_sig;
 	int			scat_cqe;
 	int			max_inline_data;
-	struct mlx5_bf	       *bf;
+	struct mlx5_bf	        bf;
 	int			has_rq;
 
 	/* only for user space QPs. For kernel
@@ -591,7 +597,6 @@ struct mlx5_ib_dev {
 	struct ib_device		ib_dev;
 	struct mlx5_core_dev		*mdev;
 	struct mlx5_roce		roce;
-	MLX5_DECLARE_DOORBELL_LOCK(uar_lock);
 	int				num_ports;
 	/* serialize update of capability mask
 	 */
@@ -621,6 +626,8 @@ struct mlx5_ib_dev {
 	struct list_head	qp_list;
 	/* Array with num_ports elements */
 	struct mlx5_ib_port	*port;
+	struct mlx5_sq_bfreg     bfreg;
+	struct mlx5_sq_bfreg     fp_bfreg;
 };
 
 static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 240fbb0c63ba..fce1c6db393b 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -909,14 +909,10 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev,
 			    u32 **in, int *inlen,
 			    struct mlx5_ib_qp_base *base)
 {
-	enum mlx5_ib_latency_class lc = MLX5_IB_LATENCY_CLASS_LOW;
-	struct mlx5_bfreg_info *bfregi;
 	int uar_index;
 	void *qpc;
-	int bfregn;
 	int err;
 
-	bfregi = &dev->mdev->priv.bfregi;
 	if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN |
 					IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK |
 					IB_QP_CREATE_IPOIB_UD_LSO |
@@ -924,21 +920,17 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev,
 		return -EINVAL;
 
 	if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR)
-		lc = MLX5_IB_LATENCY_CLASS_FAST_PATH;
-
-	bfregn = alloc_bfreg(bfregi, lc);
-	if (bfregn < 0) {
-		mlx5_ib_dbg(dev, "\n");
-		return -ENOMEM;
-	}
+		qp->bf.bfreg = &dev->fp_bfreg;
+	else
+		qp->bf.bfreg = &dev->bfreg;
 
-	qp->bf = &bfregi->bfs[bfregn];
-	uar_index = qp->bf->uar->index;
+	qp->bf.buf_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size);
+	uar_index = qp->bf.bfreg->index;
 
 	err = calc_sq_size(dev, init_attr, qp);
 	if (err < 0) {
 		mlx5_ib_dbg(dev, "err %d\n", err);
-		goto err_bfreg;
+		return err;
 	}
 
 	qp->rq.offset = 0;
@@ -948,7 +940,7 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev,
 	err = mlx5_buf_alloc(dev->mdev, base->ubuffer.buf_size, &qp->buf);
 	if (err) {
 		mlx5_ib_dbg(dev, "err %d\n", err);
-		goto err_bfreg;
+		return err;
 	}
 
 	qp->sq.qend = mlx5_get_send_wqe(qp, qp->sq.wqe_cnt);
@@ -1010,9 +1002,6 @@ err_free:
 
 err_buf:
 	mlx5_buf_free(dev->mdev, &qp->buf);
-
-err_bfreg:
-	free_bfreg(&dev->mdev->priv.bfregi, bfregn);
 	return err;
 }
 
@@ -1025,7 +1014,6 @@ static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
 	kfree(qp->rq.wrid);
 	mlx5_db_free(dev->mdev, &qp->db);
 	mlx5_buf_free(dev->mdev, &qp->buf);
-	free_bfreg(&dev->mdev->priv.bfregi, qp->bf->bfregn);
 }
 
 static u32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr)
@@ -3744,24 +3732,6 @@ static void dump_wqe(struct mlx5_ib_qp *qp, int idx, int size_16)
 	}
 }
 
-static void mlx5_bf_copy(u64 __iomem *dst, u64 *src,
-			 unsigned bytecnt, struct mlx5_ib_qp *qp)
-{
-	while (bytecnt > 0) {
-		__iowrite64_copy(dst++, src++, 8);
-		__iowrite64_copy(dst++, src++, 8);
-		__iowrite64_copy(dst++, src++, 8);
-		__iowrite64_copy(dst++, src++, 8);
-		__iowrite64_copy(dst++, src++, 8);
-		__iowrite64_copy(dst++, src++, 8);
-		__iowrite64_copy(dst++, src++, 8);
-		__iowrite64_copy(dst++, src++, 8);
-		bytecnt -= 64;
-		if (unlikely(src == qp->sq.qend))
-			src = mlx5_get_send_wqe(qp, 0);
-	}
-}
-
 static u8 get_fence(u8 fence, struct ib_send_wr *wr)
 {
 	if (unlikely(wr->opcode == IB_WR_LOCAL_INV &&
@@ -3857,7 +3827,7 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 		return mlx5_ib_gsi_post_send(ibqp, wr, bad_wr);
 
 	qp = to_mqp(ibqp);
-	bf = qp->bf;
+	bf = &qp->bf;
 	qend = qp->sq.qend;
 
 	spin_lock_irqsave(&qp->sq.lock, flags);
@@ -4130,28 +4100,13 @@ out:
 		 * we hit doorbell */
 		wmb();
 
-		if (bf->need_lock)
-			spin_lock(&bf->lock);
-		else
-			__acquire(&bf->lock);
-
-		/* TBD enable WC */
-		if (0 && nreq == 1 && bf->bfregn && inl && size > 1 && size <= bf->buf_size / 16) {
-			mlx5_bf_copy(bf->reg + bf->offset, (u64 *)ctrl, ALIGN(size * 16, 64), qp);
-			/* wc_wmb(); */
-		} else {
-			mlx5_write64((__be32 *)ctrl, bf->regreg + bf->offset,
-				     MLX5_GET_DOORBELL_LOCK(&bf->lock32));
-			/* Make sure doorbells don't leak out of SQ spinlock
-			 * and reach the HCA out of order.
-			 */
-			mmiowb();
-		}
+		/* currently we support only regular doorbells */
+		mlx5_write64((__be32 *)ctrl, bf->bfreg->map + bf->offset, NULL);
+		/* Make sure doorbells don't leak out of SQ spinlock
+		 * and reach the HCA out of order.
+		 */
+		mmiowb();
 		bf->offset ^= bf->buf_size;
-		if (bf->need_lock)
-			spin_unlock(&bf->lock);
-		else
-			__release(&bf->lock);
 	}
 
 	spin_unlock_irqrestore(&qp->sq.lock, flags);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 951dbd58594d..3037631570b1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -832,7 +832,7 @@ static inline void mlx5e_cq_arm(struct mlx5e_cq *cq)
 	struct mlx5_core_cq *mcq;
 
 	mcq = &cq->mcq;
-	mlx5_cq_arm(mcq, MLX5_CQ_DB_REQ_NOT, mcq->uar->map, NULL, cq->wq.cc);
+	mlx5_cq_arm(mcq, MLX5_CQ_DB_REQ_NOT, mcq->uar->map, cq->wq.cc);
 }
 
 static inline u32 mlx5e_get_wqe_mtt_offset(struct mlx5e_rq *rq, u16 wqe_ix)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 2882d0483ed8..ff1f14498c22 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -913,8 +913,6 @@ static int mlx5_init_once(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
 		goto out;
 	}
 
-	MLX5_INIT_DOORBELL_LOCK(&priv->cq_uar_lock);
-
 	err = mlx5_init_cq_table(dev);
 	if (err) {
 		dev_err(&pdev->dev, "failed to initialize cq table\n");
@@ -1099,16 +1097,10 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 		goto err_disable_msix;
 	}
 
-	err = mlx5_alloc_bfregs(dev, &priv->bfregi);
-	if (err) {
-		dev_err(&pdev->dev, "Failed allocating uuars, aborting\n");
-		goto err_uar_cleanup;
-	}
-
 	err = mlx5_start_eqs(dev);
 	if (err) {
 		dev_err(&pdev->dev, "Failed to start pages and async EQs\n");
-		goto err_free_uar;
+		goto err_put_uars;
 	}
 
 	err = alloc_comp_eqs(dev);
@@ -1174,10 +1166,7 @@ err_affinity_hints:
 err_stop_eqs:
 	mlx5_stop_eqs(dev);
 
-err_free_uar:
-	mlx5_free_bfregs(dev, &priv->bfregi);
-
-err_uar_cleanup:
+err_put_uars:
 	mlx5_put_uars_page(dev, priv->uar);
 
 err_disable_msix:
@@ -1238,7 +1227,6 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 	mlx5_irq_clear_affinity_hints(dev);
 	free_comp_eqs(dev);
 	mlx5_stop_eqs(dev);
-	mlx5_free_bfregs(dev, &priv->bfregi);
 	mlx5_put_uars_page(dev, priv->uar);
 	mlx5_disable_msix(dev);
 	if (cleanup)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/uar.c b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
index fcc0270ea72f..07b273cccc26 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/uar.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
@@ -67,120 +67,6 @@ int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn)
 }
 EXPORT_SYMBOL(mlx5_cmd_free_uar);
 
-static int need_bfreg_lock(int bfregn)
-{
-	int tot_bfregs = NUM_DRIVER_UARS * MLX5_BFREGS_PER_UAR;
-
-	if (bfregn == 0 || tot_bfregs - NUM_LOW_LAT_BFREGS)
-		return 0;
-
-	return 1;
-}
-
-int mlx5_alloc_bfregs(struct mlx5_core_dev *dev, struct mlx5_bfreg_info *bfregi)
-{
-	int tot_bfregs = NUM_DRIVER_UARS * MLX5_BFREGS_PER_UAR;
-	struct mlx5_bf *bf;
-	phys_addr_t addr;
-	int err;
-	int i;
-
-	bfregi->num_uars = NUM_DRIVER_UARS;
-	bfregi->num_low_latency_bfregs = NUM_LOW_LAT_BFREGS;
-
-	mutex_init(&bfregi->lock);
-	bfregi->uars = kcalloc(bfregi->num_uars, sizeof(*bfregi->uars), GFP_KERNEL);
-	if (!bfregi->uars)
-		return -ENOMEM;
-
-	bfregi->bfs = kcalloc(tot_bfregs, sizeof(*bfregi->bfs), GFP_KERNEL);
-	if (!bfregi->bfs) {
-		err = -ENOMEM;
-		goto out_uars;
-	}
-
-	bfregi->bitmap = kcalloc(BITS_TO_LONGS(tot_bfregs), sizeof(*bfregi->bitmap),
-				GFP_KERNEL);
-	if (!bfregi->bitmap) {
-		err = -ENOMEM;
-		goto out_bfs;
-	}
-
-	bfregi->count = kcalloc(tot_bfregs, sizeof(*bfregi->count), GFP_KERNEL);
-	if (!bfregi->count) {
-		err = -ENOMEM;
-		goto out_bitmap;
-	}
-
-	for (i = 0; i < bfregi->num_uars; i++) {
-		err = mlx5_cmd_alloc_uar(dev, &bfregi->uars[i].index);
-		if (err)
-			goto out_count;
-
-		addr = dev->iseg_base + ((phys_addr_t)(bfregi->uars[i].index) << PAGE_SHIFT);
-		bfregi->uars[i].map = ioremap(addr, PAGE_SIZE);
-		if (!bfregi->uars[i].map) {
-			mlx5_cmd_free_uar(dev, bfregi->uars[i].index);
-			err = -ENOMEM;
-			goto out_count;
-		}
-		mlx5_core_dbg(dev, "allocated uar index 0x%x, mmaped at %p\n",
-			      bfregi->uars[i].index, bfregi->uars[i].map);
-	}
-
-	for (i = 0; i < tot_bfregs; i++) {
-		bf = &bfregi->bfs[i];
-
-		bf->buf_size = (1 << MLX5_CAP_GEN(dev, log_bf_reg_size)) / 2;
-		bf->uar = &bfregi->uars[i / MLX5_BFREGS_PER_UAR];
-		bf->regreg = bfregi->uars[i / MLX5_BFREGS_PER_UAR].map;
-		bf->reg = NULL; /* Add WC support */
-		bf->offset = (i % MLX5_BFREGS_PER_UAR) *
-			     (1 << MLX5_CAP_GEN(dev, log_bf_reg_size)) +
-			     MLX5_BF_OFFSET;
-		bf->need_lock = need_bfreg_lock(i);
-		spin_lock_init(&bf->lock);
-		spin_lock_init(&bf->lock32);
-		bf->bfregn = i;
-	}
-
-	return 0;
-
-out_count:
-	for (i--; i >= 0; i--) {
-		iounmap(bfregi->uars[i].map);
-		mlx5_cmd_free_uar(dev, bfregi->uars[i].index);
-	}
-	kfree(bfregi->count);
-
-out_bitmap:
-	kfree(bfregi->bitmap);
-
-out_bfs:
-	kfree(bfregi->bfs);
-
-out_uars:
-	kfree(bfregi->uars);
-	return err;
-}
-
-int mlx5_free_bfregs(struct mlx5_core_dev *dev, struct mlx5_bfreg_info *bfregi)
-{
-	int i = bfregi->num_uars;
-
-	for (i--; i >= 0; i--) {
-		iounmap(bfregi->uars[i].map);
-		mlx5_cmd_free_uar(dev, bfregi->uars[i].index);
-	}
-
-	kfree(bfregi->count);
-	kfree(bfregi->bitmap);
-	kfree(bfregi->bfs);
-	kfree(bfregi->uars);
-
-	return 0;
-}
-
 int mlx5_alloc_map_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar,
 		       bool map_wc)
 {
diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h
index 7c3c0d3aca37..996863381bc8 100644
--- a/include/linux/mlx5/cq.h
+++ b/include/linux/mlx5/cq.h
@@ -144,7 +144,6 @@ enum {
 
 static inline void mlx5_cq_arm(struct mlx5_core_cq *cq, u32 cmd,
 			       void __iomem *uar_page,
-			       spinlock_t *doorbell_lock,
 			       u32 cons_index)
 {
 	__be32 doorbell[2];
@@ -164,7 +163,7 @@ static inline void mlx5_cq_arm(struct mlx5_core_cq *cq, u32 cmd,
 	doorbell[0] = cpu_to_be32(sn << 28 | cmd | ci);
 	doorbell[1] = cpu_to_be32(cq->cqn);
 
-	mlx5_write64(doorbell, uar_page + MLX5_CQ_DOORBELL, doorbell_lock);
+	mlx5_write64(doorbell, uar_page + MLX5_CQ_DOORBELL, NULL);
 }
 
 int mlx5_init_cq_table(struct mlx5_core_dev *dev);
diff --git a/include/linux/mlx5/doorbell.h b/include/linux/mlx5/doorbell.h
index afc78a3f4462..0787de28f2fc 100644
--- a/include/linux/mlx5/doorbell.h
+++ b/include/linux/mlx5/doorbell.h
@@ -68,10 +68,12 @@ static inline void mlx5_write64(__be32 val[2], void __iomem *dest,
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(doorbell_lock, flags);
+	if (doorbell_lock)
+		spin_lock_irqsave(doorbell_lock, flags);
 	__raw_writel((__force u32) val[0], dest);
 	__raw_writel((__force u32) val[1], dest + 4);
-	spin_unlock_irqrestore(doorbell_lock, flags);
+	if (doorbell_lock)
+		spin_unlock_irqrestore(doorbell_lock, flags);
 }
 
 #endif
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 9a3a0954855b..bb362f506a2e 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -203,23 +203,6 @@ struct mlx5_bfreg_info {
 	u32			ver;
 };
 
-struct mlx5_bf {
-	void __iomem	       *reg;
-	void __iomem	       *regreg;
-	int			buf_size;
-	struct mlx5_uar	       *uar;
-	unsigned long		offset;
-	int			need_lock;
-	/* protect blue flame buffer selection when needed
-	 */
-	spinlock_t		lock;
-
-	/* serialize 64 bit writes when done as two 32 bit accesses
-	 */
-	spinlock_t		lock32;
-	int			bfregn;
-};
-
 struct mlx5_cmd_first {
 	__be32		data[4];
 };
@@ -612,8 +595,6 @@ struct mlx5_priv {
 	struct mlx5_eq_table	eq_table;
 	struct msix_entry	*msix_arr;
 	struct mlx5_irq_info	*irq_info;
-	struct mlx5_bfreg_info	bfregi;
-	MLX5_DECLARE_DOORBELL_LOCK(cq_uar_lock);
 
 	/* pages stuff */
 	struct workqueue_struct *pg_wq;
-- 
cgit v1.2.3


From b037c29a8056b8e896c4e084ba7cc30d6a1f165f Mon Sep 17 00:00:00 2001
From: Eli Cohen <eli@mellanox.com>
Date: Tue, 3 Jan 2017 23:55:26 +0200
Subject: IB/mlx5: Allow future extension of libmlx5 input data

Current check requests that new fields in struct
mlx5_ib_alloc_ucontext_req_v2 that are not known to the driver be zero.
This was introduced so new libraries passing additional information to
the kernel through struct mlx5_ib_alloc_ucontext_req_v2 will be notified
by old kernels that do not support their request by failing the
operation. This schecme is problematic since it requires libmlx5 to issue
the requests with descending input size for struct
mlx5_ib_alloc_ucontext_req_v2.

To avoid this, we require that new features that will obey the following
rules:
If the feature requires one or more fields in the response and the at
least one of the fields can be encoded such that a zero value means the
kernel ignored the request then this field will provide the indication
to the library. If no response is required or if zero is a valid
response, a new field should be added that indicates to the library
whether its request was processed.

Fixes: b368d7cb8ceb ('IB/mlx5: Add hca_core_clock_offset to udata in init_ucontext')
Signed-off-by: Eli Cohen <eli@mellanox.com>
Reviewed-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/cq.c      |   2 +-
 drivers/infiniband/hw/mlx5/main.c    | 201 ++++++++++++++++++++++-------------
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  15 ++-
 drivers/infiniband/hw/mlx5/qp.c      | 133 ++++++++++-------------
 include/linux/mlx5/device.h          |  12 ++-
 include/linux/mlx5/driver.h          |  12 +--
 6 files changed, 209 insertions(+), 166 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index a28ec33b82ed..31803b367104 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -788,7 +788,7 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
 	MLX5_SET(cqc, cqc, log_page_size,
 		 page_shift - MLX5_ADAPTER_PAGE_SHIFT);
 
-	*index = to_mucontext(context)->bfregi.uars[0].index;
+	*index = to_mucontext(context)->bfregi.sys_pages[0];
 
 	if (ucmd.cqe_comp_en == 1) {
 		if (unlikely((*cqe_size != 64) ||
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index e9f0830eca1c..664067239de5 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -992,6 +992,80 @@ out:
 	return err;
 }
 
+static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k,
+			     struct mlx5_ib_alloc_ucontext_req_v2 *req,
+			     u32 *num_sys_pages)
+{
+	int uars_per_sys_page;
+	int bfregs_per_sys_page;
+	int ref_bfregs = req->total_num_bfregs;
+
+	if (req->total_num_bfregs == 0)
+		return -EINVAL;
+
+	BUILD_BUG_ON(MLX5_MAX_BFREGS % MLX5_NON_FP_BFREGS_IN_PAGE);
+	BUILD_BUG_ON(MLX5_MAX_BFREGS < MLX5_NON_FP_BFREGS_IN_PAGE);
+
+	if (req->total_num_bfregs > MLX5_MAX_BFREGS)
+		return -ENOMEM;
+
+	uars_per_sys_page = get_uars_per_sys_page(dev, lib_uar_4k);
+	bfregs_per_sys_page = uars_per_sys_page * MLX5_NON_FP_BFREGS_PER_UAR;
+	req->total_num_bfregs = ALIGN(req->total_num_bfregs, bfregs_per_sys_page);
+	*num_sys_pages = req->total_num_bfregs / bfregs_per_sys_page;
+
+	if (req->num_low_latency_bfregs > req->total_num_bfregs - 1)
+		return -EINVAL;
+
+	mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, alloated %d, using %d sys pages\n",
+		    MLX5_CAP_GEN(dev->mdev, uar_4k) ? "yes" : "no",
+		    lib_uar_4k ? "yes" : "no", ref_bfregs,
+		    req->total_num_bfregs, *num_sys_pages);
+
+	return 0;
+}
+
+static int allocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context)
+{
+	struct mlx5_bfreg_info *bfregi;
+	int err;
+	int i;
+
+	bfregi = &context->bfregi;
+	for (i = 0; i < bfregi->num_sys_pages; i++) {
+		err = mlx5_cmd_alloc_uar(dev->mdev, &bfregi->sys_pages[i]);
+		if (err)
+			goto error;
+
+		mlx5_ib_dbg(dev, "allocated uar %d\n", bfregi->sys_pages[i]);
+	}
+	return 0;
+
+error:
+	for (--i; i >= 0; i--)
+		if (mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]))
+			mlx5_ib_warn(dev, "failed to free uar %d\n", i);
+
+	return err;
+}
+
+static int deallocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context)
+{
+	struct mlx5_bfreg_info *bfregi;
+	int err;
+	int i;
+
+	bfregi = &context->bfregi;
+	for (i = 0; i < bfregi->num_sys_pages; i++) {
+		err = mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]);
+		if (err) {
+			mlx5_ib_warn(dev, "failed to free uar %d\n", i);
+			return err;
+		}
+	}
+	return 0;
+}
+
 static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 						  struct ib_udata *udata)
 {
@@ -1000,16 +1074,12 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 	struct mlx5_ib_alloc_ucontext_resp resp = {};
 	struct mlx5_ib_ucontext *context;
 	struct mlx5_bfreg_info *bfregi;
-	struct mlx5_uar *uars;
-	int gross_bfregs;
-	int num_uars;
 	int ver;
-	int bfregn;
 	int err;
-	int i;
 	size_t reqlen;
 	size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2,
 				     max_cqe_version);
+	bool lib_uar_4k;
 
 	if (!dev->ib_active)
 		return ERR_PTR(-EAGAIN);
@@ -1032,27 +1102,14 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 	if (req.flags)
 		return ERR_PTR(-EINVAL);
 
-	if (req.total_num_bfregs > MLX5_MAX_BFREGS)
-		return ERR_PTR(-ENOMEM);
-
-	if (req.total_num_bfregs == 0)
-		return ERR_PTR(-EINVAL);
-
 	if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2)
 		return ERR_PTR(-EOPNOTSUPP);
 
-	if (reqlen > sizeof(req) &&
-	    !ib_is_udata_cleared(udata, sizeof(req),
-				 reqlen - sizeof(req)))
-		return ERR_PTR(-EOPNOTSUPP);
-
 	req.total_num_bfregs = ALIGN(req.total_num_bfregs,
 				    MLX5_NON_FP_BFREGS_PER_UAR);
 	if (req.num_low_latency_bfregs > req.total_num_bfregs - 1)
 		return ERR_PTR(-EINVAL);
 
-	num_uars = req.total_num_bfregs / MLX5_NON_FP_BFREGS_PER_UAR;
-	gross_bfregs = num_uars * MLX5_BFREGS_PER_UAR;
 	resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
 	if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf))
 		resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size);
@@ -1072,42 +1129,34 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 	if (!context)
 		return ERR_PTR(-ENOMEM);
 
+	lib_uar_4k = false;
 	bfregi = &context->bfregi;
-	mutex_init(&bfregi->lock);
-	uars = kcalloc(num_uars, sizeof(*uars), GFP_KERNEL);
-	if (!uars) {
-		err = -ENOMEM;
+
+	/* updates req->total_num_bfregs */
+	err = calc_total_bfregs(dev, lib_uar_4k, &req, &bfregi->num_sys_pages);
+	if (err)
 		goto out_ctx;
-	}
 
-	bfregi->bitmap = kcalloc(BITS_TO_LONGS(gross_bfregs),
-				sizeof(*bfregi->bitmap),
+	mutex_init(&bfregi->lock);
+	bfregi->lib_uar_4k = lib_uar_4k;
+	bfregi->count = kcalloc(req.total_num_bfregs, sizeof(*bfregi->count),
 				GFP_KERNEL);
-	if (!bfregi->bitmap) {
+	if (!bfregi->count) {
 		err = -ENOMEM;
-		goto out_uar_ctx;
-	}
-	/*
-	 * clear all fast path bfregs
-	 */
-	for (i = 0; i < gross_bfregs; i++) {
-		bfregn = i & 3;
-		if (bfregn == 2 || bfregn == 3)
-			set_bit(i, bfregi->bitmap);
+		goto out_ctx;
 	}
 
-	bfregi->count = kcalloc(gross_bfregs,
-				sizeof(*bfregi->count), GFP_KERNEL);
-	if (!bfregi->count) {
+	bfregi->sys_pages = kcalloc(bfregi->num_sys_pages,
+				    sizeof(*bfregi->sys_pages),
+				    GFP_KERNEL);
+	if (!bfregi->sys_pages) {
 		err = -ENOMEM;
-		goto out_bitmap;
+		goto out_count;
 	}
 
-	for (i = 0; i < num_uars; i++) {
-		err = mlx5_cmd_alloc_uar(dev->mdev, &uars[i].index);
-		if (err)
-			goto out_count;
-	}
+	err = allocate_uars(dev, context);
+	if (err)
+		goto out_sys_pages;
 
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 	context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range;
@@ -1166,9 +1215,8 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 
 	bfregi->ver = ver;
 	bfregi->num_low_latency_bfregs = req.num_low_latency_bfregs;
-	bfregi->uars = uars;
-	bfregi->num_uars = num_uars;
 	context->cqe_version = resp.cqe_version;
+	context->lib_caps = false;
 
 	return &context->ibucontext;
 
@@ -1180,19 +1228,17 @@ out_page:
 	free_page(context->upd_xlt_page);
 
 out_uars:
-	for (i--; i >= 0; i--)
-		mlx5_cmd_free_uar(dev->mdev, uars[i].index);
-out_count:
-	kfree(bfregi->count);
+	deallocate_uars(dev, context);
 
-out_bitmap:
-	kfree(bfregi->bitmap);
+out_sys_pages:
+	kfree(bfregi->sys_pages);
 
-out_uar_ctx:
-	kfree(uars);
+out_count:
+	kfree(bfregi->count);
 
 out_ctx:
 	kfree(context);
+
 	return ERR_PTR(err);
 }
 
@@ -1200,31 +1246,31 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
 {
 	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
 	struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
-	struct mlx5_bfreg_info *bfregi = &context->bfregi;
-	int i;
+	struct mlx5_bfreg_info *bfregi;
 
+	bfregi = &context->bfregi;
 	if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
 		mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn);
 
 	free_page(context->upd_xlt_page);
-
-	for (i = 0; i < bfregi->num_uars; i++) {
-		if (mlx5_cmd_free_uar(dev->mdev, bfregi->uars[i].index))
-			mlx5_ib_warn(dev, "Failed to free UAR 0x%x\n",
-				     bfregi->uars[i].index);
-	}
-
+	deallocate_uars(dev, context);
+	kfree(bfregi->sys_pages);
 	kfree(bfregi->count);
-	kfree(bfregi->bitmap);
-	kfree(bfregi->uars);
 	kfree(context);
 
 	return 0;
 }
 
-static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, int index)
+static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev,
+				 struct mlx5_bfreg_info *bfregi,
+				 int idx)
 {
-	return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + index;
+	int fw_uars_per_page;
+
+	fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_UARS_IN_PAGE : 1;
+
+	return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) +
+			bfregi->sys_pages[idx] / fw_uars_per_page;
 }
 
 static int get_command(unsigned long offset)
@@ -1384,6 +1430,18 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
 	unsigned long idx;
 	phys_addr_t pfn, pa;
 	pgprot_t prot;
+	int uars_per_page;
+
+	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+		return -EINVAL;
+
+	uars_per_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k);
+	idx = get_index(vma->vm_pgoff);
+	if (idx % uars_per_page ||
+	    idx * uars_per_page >= bfregi->num_sys_pages) {
+		mlx5_ib_warn(dev, "invalid uar index %lu\n", idx);
+		return -EINVAL;
+	}
 
 	switch (cmd) {
 	case MLX5_IB_MMAP_WC_PAGE:
@@ -1406,14 +1464,7 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
 		return -EINVAL;
 	}
 
-	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
-		return -EINVAL;
-
-	idx = get_index(vma->vm_pgoff);
-	if (idx >= bfregi->num_uars)
-		return -EINVAL;
-
-	pfn = uar_index2pfn(dev, bfregi->uars[idx].index);
+	pfn = uar_index2pfn(dev, bfregi, idx);
 	mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn);
 
 	vma->vm_page_prot = prot;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index ae3bc4a1bfed..e1a4b93dce6b 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -90,7 +90,6 @@ enum mlx5_ib_latency_class {
 	MLX5_IB_LATENCY_CLASS_LOW,
 	MLX5_IB_LATENCY_CLASS_MEDIUM,
 	MLX5_IB_LATENCY_CLASS_HIGH,
-	MLX5_IB_LATENCY_CLASS_FAST_PATH
 };
 
 enum mlx5_ib_mad_ifc_flags {
@@ -129,6 +128,7 @@ struct mlx5_ib_ucontext {
 	unsigned long		upd_xlt_page;
 	/* protect ODP/KSM */
 	struct mutex		upd_xlt_page_mutex;
+	u64			lib_caps;
 };
 
 static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
@@ -975,4 +975,17 @@ static inline int get_srq_user_index(struct mlx5_ib_ucontext *ucontext,
 
 	return verify_assign_uidx(cqe_version, ucmd->uidx, user_index);
 }
+
+static inline int get_uars_per_sys_page(struct mlx5_ib_dev *dev, bool lib_support)
+{
+	return lib_support && MLX5_CAP_GEN(dev->mdev, uar_4k) ?
+				MLX5_UARS_IN_PAGE : 1;
+}
+
+static inline int get_num_uars(struct mlx5_ib_dev *dev,
+			       struct mlx5_bfreg_info *bfregi)
+{
+	return get_uars_per_sys_page(dev, bfregi->lib_uar_4k) * bfregi->num_sys_pages;
+}
+
 #endif /* MLX5_IB_H */
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index fce1c6db393b..6a83fb32599d 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -480,16 +480,6 @@ static int first_med_bfreg(void)
 	return 1;
 }
 
-static int next_bfreg(int n)
-{
-	n++;
-
-	while (((n % 4) & 2))
-		n++;
-
-	return n;
-}
-
 enum {
 	/* this is the first blue flame register in the array of bfregs assigned
 	 * to a processes. Since we do not use it for blue flame but rather
@@ -499,36 +489,38 @@ enum {
 	NUM_NON_BLUE_FLAME_BFREGS = 1,
 };
 
-static int num_med_bfreg(struct mlx5_bfreg_info *bfregi)
+static int max_bfregs(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi)
+{
+	return get_num_uars(dev, bfregi) * MLX5_NON_FP_BFREGS_PER_UAR;
+}
+
+static int num_med_bfreg(struct mlx5_ib_dev *dev,
+			 struct mlx5_bfreg_info *bfregi)
 {
 	int n;
 
-	n = bfregi->num_uars * MLX5_NON_FP_BFREGS_PER_UAR -
-		bfregi->num_low_latency_bfregs - NUM_NON_BLUE_FLAME_BFREGS;
+	n = max_bfregs(dev, bfregi) - bfregi->num_low_latency_bfregs -
+	    NUM_NON_BLUE_FLAME_BFREGS;
 
 	return n >= 0 ? n : 0;
 }
 
-static int max_bfregi(struct mlx5_bfreg_info *bfregi)
-{
-	return bfregi->num_uars * 4;
-}
-
-static int first_hi_bfreg(struct mlx5_bfreg_info *bfregi)
+static int first_hi_bfreg(struct mlx5_ib_dev *dev,
+			  struct mlx5_bfreg_info *bfregi)
 {
 	int med;
 
-	med = num_med_bfreg(bfregi);
-	return next_bfreg(med);
+	med = num_med_bfreg(dev, bfregi);
+	return ++med;
 }
 
-static int alloc_high_class_bfreg(struct mlx5_bfreg_info *bfregi)
+static int alloc_high_class_bfreg(struct mlx5_ib_dev *dev,
+				  struct mlx5_bfreg_info *bfregi)
 {
 	int i;
 
-	for (i = first_hi_bfreg(bfregi); i < max_bfregi(bfregi); i = next_bfreg(i)) {
-		if (!test_bit(i, bfregi->bitmap)) {
-			set_bit(i, bfregi->bitmap);
+	for (i = first_hi_bfreg(dev, bfregi); i < max_bfregs(dev, bfregi); i++) {
+		if (!bfregi->count[i]) {
 			bfregi->count[i]++;
 			return i;
 		}
@@ -537,12 +529,13 @@ static int alloc_high_class_bfreg(struct mlx5_bfreg_info *bfregi)
 	return -ENOMEM;
 }
 
-static int alloc_med_class_bfreg(struct mlx5_bfreg_info *bfregi)
+static int alloc_med_class_bfreg(struct mlx5_ib_dev *dev,
+				 struct mlx5_bfreg_info *bfregi)
 {
 	int minidx = first_med_bfreg();
 	int i;
 
-	for (i = first_med_bfreg(); i < first_hi_bfreg(bfregi); i = next_bfreg(i)) {
+	for (i = first_med_bfreg(); i < first_hi_bfreg(dev, bfregi); i++) {
 		if (bfregi->count[i] < bfregi->count[minidx])
 			minidx = i;
 		if (!bfregi->count[minidx])
@@ -553,7 +546,8 @@ static int alloc_med_class_bfreg(struct mlx5_bfreg_info *bfregi)
 	return minidx;
 }
 
-static int alloc_bfreg(struct mlx5_bfreg_info *bfregi,
+static int alloc_bfreg(struct mlx5_ib_dev *dev,
+		       struct mlx5_bfreg_info *bfregi,
 		       enum mlx5_ib_latency_class lat)
 {
 	int bfregn = -EINVAL;
@@ -570,18 +564,14 @@ static int alloc_bfreg(struct mlx5_bfreg_info *bfregi,
 		if (bfregi->ver < 2)
 			bfregn = -ENOMEM;
 		else
-			bfregn = alloc_med_class_bfreg(bfregi);
+			bfregn = alloc_med_class_bfreg(dev, bfregi);
 		break;
 
 	case MLX5_IB_LATENCY_CLASS_HIGH:
 		if (bfregi->ver < 2)
 			bfregn = -ENOMEM;
 		else
-			bfregn = alloc_high_class_bfreg(bfregi);
-		break;
-
-	case MLX5_IB_LATENCY_CLASS_FAST_PATH:
-		bfregn = 2;
+			bfregn = alloc_high_class_bfreg(dev, bfregi);
 		break;
 	}
 	mutex_unlock(&bfregi->lock);
@@ -589,37 +579,10 @@ static int alloc_bfreg(struct mlx5_bfreg_info *bfregi,
 	return bfregn;
 }
 
-static void free_med_class_bfreg(struct mlx5_bfreg_info *bfregi, int bfregn)
+static void free_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, int bfregn)
 {
-	clear_bit(bfregn, bfregi->bitmap);
-	--bfregi->count[bfregn];
-}
-
-static void free_high_class_bfreg(struct mlx5_bfreg_info *bfregi, int bfregn)
-{
-	clear_bit(bfregn, bfregi->bitmap);
-	--bfregi->count[bfregn];
-}
-
-static void free_bfreg(struct mlx5_bfreg_info *bfregi, int bfregn)
-{
-	int nbfregs = bfregi->num_uars * MLX5_BFREGS_PER_UAR;
-	int high_bfreg = nbfregs - bfregi->num_low_latency_bfregs;
-
 	mutex_lock(&bfregi->lock);
-	if (bfregn == 0) {
-		--bfregi->count[bfregn];
-		goto out;
-	}
-
-	if (bfregn < high_bfreg) {
-		free_med_class_bfreg(bfregi, bfregn);
-		goto out;
-	}
-
-	free_high_class_bfreg(bfregi, bfregn);
-
-out:
+	bfregi->count[bfregn]--;
 	mutex_unlock(&bfregi->lock);
 }
 
@@ -661,9 +624,20 @@ static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq,
 static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq,
 			       struct mlx5_ib_cq *recv_cq);
 
-static int bfregn_to_uar_index(struct mlx5_bfreg_info *bfregi, int bfregn)
+static int bfregn_to_uar_index(struct mlx5_ib_dev *dev,
+			       struct mlx5_bfreg_info *bfregi, int bfregn)
 {
-	return bfregi->uars[bfregn / MLX5_BFREGS_PER_UAR].index;
+	int bfregs_per_sys_page;
+	int index_of_sys_page;
+	int offset;
+
+	bfregs_per_sys_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k) *
+				MLX5_NON_FP_BFREGS_PER_UAR;
+	index_of_sys_page = bfregn / bfregs_per_sys_page;
+
+	offset = bfregn % bfregs_per_sys_page / MLX5_NON_FP_BFREGS_PER_UAR;
+
+	return bfregi->sys_pages[index_of_sys_page] + offset;
 }
 
 static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev,
@@ -766,6 +740,13 @@ err_umem:
 	return err;
 }
 
+static int adjust_bfregn(struct mlx5_ib_dev *dev,
+			 struct mlx5_bfreg_info *bfregi, int bfregn)
+{
+	return bfregn / MLX5_NON_FP_BFREGS_PER_UAR * MLX5_BFREGS_PER_UAR +
+				bfregn % MLX5_NON_FP_BFREGS_PER_UAR;
+}
+
 static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 			  struct mlx5_ib_qp *qp, struct ib_udata *udata,
 			  struct ib_qp_init_attr *attr,
@@ -800,15 +781,15 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 		/* In CROSS_CHANNEL CQ and QP must use the same UAR */
 		bfregn = MLX5_CROSS_CHANNEL_BFREG;
 	else {
-		bfregn = alloc_bfreg(&context->bfregi, MLX5_IB_LATENCY_CLASS_HIGH);
+		bfregn = alloc_bfreg(dev, &context->bfregi, MLX5_IB_LATENCY_CLASS_HIGH);
 		if (bfregn < 0) {
 			mlx5_ib_dbg(dev, "failed to allocate low latency BFREG\n");
 			mlx5_ib_dbg(dev, "reverting to medium latency\n");
-			bfregn = alloc_bfreg(&context->bfregi, MLX5_IB_LATENCY_CLASS_MEDIUM);
+			bfregn = alloc_bfreg(dev, &context->bfregi, MLX5_IB_LATENCY_CLASS_MEDIUM);
 			if (bfregn < 0) {
 				mlx5_ib_dbg(dev, "failed to allocate medium latency BFREG\n");
 				mlx5_ib_dbg(dev, "reverting to high latency\n");
-				bfregn = alloc_bfreg(&context->bfregi, MLX5_IB_LATENCY_CLASS_LOW);
+				bfregn = alloc_bfreg(dev, &context->bfregi, MLX5_IB_LATENCY_CLASS_LOW);
 				if (bfregn < 0) {
 					mlx5_ib_warn(dev, "bfreg allocation failed\n");
 					return bfregn;
@@ -817,7 +798,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 		}
 	}
 
-	uar_index = bfregn_to_uar_index(&context->bfregi, bfregn);
+	uar_index = bfregn_to_uar_index(dev, &context->bfregi, bfregn);
 	mlx5_ib_dbg(dev, "bfregn 0x%x, uar_index 0x%x\n", bfregn, uar_index);
 
 	qp->rq.offset = 0;
@@ -858,7 +839,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 	MLX5_SET(qpc, qpc, page_offset, offset);
 
 	MLX5_SET(qpc, qpc, uar_page, uar_index);
-	resp->bfreg_index = bfregn;
+	resp->bfreg_index = adjust_bfregn(dev, &context->bfregi, bfregn);
 	qp->bfregn = bfregn;
 
 	err = mlx5_ib_db_map_user(context, ucmd.db_addr, &qp->db);
@@ -887,12 +868,12 @@ err_umem:
 		ib_umem_release(ubuffer->umem);
 
 err_bfreg:
-	free_bfreg(&context->bfregi, bfregn);
+	free_bfreg(dev, &context->bfregi, bfregn);
 	return err;
 }
 
-static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp,
-			    struct mlx5_ib_qp_base *base)
+static void destroy_qp_user(struct mlx5_ib_dev *dev, struct ib_pd *pd,
+			    struct mlx5_ib_qp *qp, struct mlx5_ib_qp_base *base)
 {
 	struct mlx5_ib_ucontext *context;
 
@@ -900,7 +881,7 @@ static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp,
 	mlx5_ib_db_unmap_user(context, &qp->db);
 	if (base->ubuffer.umem)
 		ib_umem_release(base->ubuffer.umem);
-	free_bfreg(&context->bfregi, qp->bfregn);
+	free_bfreg(dev, &context->bfregi, qp->bfregn);
 }
 
 static int create_kernel_qp(struct mlx5_ib_dev *dev,
@@ -1784,7 +1765,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 
 err_create:
 	if (qp->create_type == MLX5_QP_USER)
-		destroy_qp_user(pd, qp, base);
+		destroy_qp_user(dev, pd, qp, base);
 	else if (qp->create_type == MLX5_QP_KERNEL)
 		destroy_qp_kernel(dev, qp);
 
@@ -1962,7 +1943,7 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
 	if (qp->create_type == MLX5_QP_KERNEL)
 		destroy_qp_kernel(dev, qp);
 	else if (qp->create_type == MLX5_QP_USER)
-		destroy_qp_user(&get_pd(qp)->ibpd, qp, base);
+		destroy_qp_user(dev, &get_pd(qp)->ibpd, qp, base);
 }
 
 static const char *ib_qp_type_str(enum ib_qp_type type)
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index db1b9287012f..dd345e8cf6f0 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -211,6 +211,11 @@ enum {
 	MLX5_EN_WR	= (u64)2
 };
 
+enum {
+	MLX5_ADAPTER_PAGE_SHIFT		= 12,
+	MLX5_ADAPTER_PAGE_SIZE		= 1 << MLX5_ADAPTER_PAGE_SHIFT,
+};
+
 enum {
 	MLX5_BFREGS_PER_UAR		= 4,
 	MLX5_MAX_UARS			= 1 << 8,
@@ -219,6 +224,8 @@ enum {
 					  MLX5_NON_FP_BFREGS_PER_UAR,
 	MLX5_MAX_BFREGS			= MLX5_MAX_UARS *
 					  MLX5_NON_FP_BFREGS_PER_UAR,
+	MLX5_UARS_IN_PAGE		= PAGE_SIZE / MLX5_ADAPTER_PAGE_SIZE,
+	MLX5_NON_FP_BFREGS_IN_PAGE	= MLX5_NON_FP_BFREGS_PER_UAR * MLX5_UARS_IN_PAGE,
 };
 
 enum {
@@ -391,11 +398,6 @@ enum {
 	MLX5_MAX_PAGE_SHIFT		= 31
 };
 
-enum {
-	MLX5_ADAPTER_PAGE_SHIFT		= 12,
-	MLX5_ADAPTER_PAGE_SIZE		= 1 << MLX5_ADAPTER_PAGE_SHIFT,
-};
-
 enum {
 	MLX5_CAP_OFF_CMDIF_CSUM		= 46,
 };
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index bb362f506a2e..7e7394fef835 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -189,18 +189,17 @@ enum mlx5_eq_type {
 };
 
 struct mlx5_bfreg_info {
-	struct mlx5_uar	       *uars;
-	int			num_uars;
+	u32		       *sys_pages;
 	int			num_low_latency_bfregs;
-	unsigned long	       *bitmap;
 	unsigned int	       *count;
-	struct mlx5_bf	       *bfs;
 
 	/*
 	 * protect bfreg allocation data structs
 	 */
 	struct mutex		lock;
 	u32			ver;
+	bool			lib_uar_4k;
+	u32			num_sys_pages;
 };
 
 struct mlx5_cmd_first {
@@ -470,13 +469,10 @@ struct mlx5_sq_bfreg {
 
 struct mlx5_uar {
 	u32			index;
-	struct list_head	bf_list;
-	unsigned		free_bf_bmap;
-	void __iomem	       *bf_map;
 	void __iomem	       *map;
+	void __iomem	       *bf_map;
 };
 
-
 struct mlx5_core_health {
 	struct health_buffer __iomem   *health;
 	__be32 __iomem		       *health_counter;
-- 
cgit v1.2.3


From 30aa60b3bd12bd79b5324b7b595bd3446ab24b52 Mon Sep 17 00:00:00 2001
From: Eli Cohen <eli@mellanox.com>
Date: Tue, 3 Jan 2017 23:55:27 +0200
Subject: IB/mlx5: Support 4k UAR for libmlx5

Add fields to structs to convey to kernel an indication whether the
library supports multi UARs per page and return to the library the size
of a UAR based on the queried value.

Signed-off-by: Eli Cohen <eli@mellanox.com>
Reviewed-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/main.c                  | 21 +++++++-
 drivers/net/ethernet/mellanox/mlx5/core/cq.c       |  2 +
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  9 ++--
 .../net/ethernet/mellanox/mlx5/core/en_common.c    | 12 +----
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 21 ++++----
 drivers/net/ethernet/mellanox/mlx5/core/uar.c      | 56 ----------------------
 include/linux/mlx5/cq.h                            |  2 +-
 include/linux/mlx5/driver.h                        | 12 -----
 include/uapi/rdma/mlx5-abi.h                       |  7 +++
 9 files changed, 42 insertions(+), 100 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 664067239de5..a191b9327b0c 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -992,6 +992,12 @@ out:
 	return err;
 }
 
+static void print_lib_caps(struct mlx5_ib_dev *dev, u64 caps)
+{
+	mlx5_ib_dbg(dev, "MLX5_LIB_CAP_4K_UAR = %s\n",
+		    caps & MLX5_LIB_CAP_4K_UAR ? "y" : "n");
+}
+
 static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k,
 			     struct mlx5_ib_alloc_ucontext_req_v2 *req,
 			     u32 *num_sys_pages)
@@ -1122,6 +1128,10 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 	resp.cqe_version = min_t(__u8,
 				 (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version),
 				 req.max_cqe_version);
+	resp.log_uar_size = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
+				MLX5_ADAPTER_PAGE_SHIFT : PAGE_SHIFT;
+	resp.num_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
+					MLX5_CAP_GEN(dev->mdev, num_of_uars_per_page) : 1;
 	resp.response_length = min(offsetof(typeof(resp), response_length) +
 				   sizeof(resp.response_length), udata->outlen);
 
@@ -1129,7 +1139,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 	if (!context)
 		return ERR_PTR(-ENOMEM);
 
-	lib_uar_4k = false;
+	lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR;
 	bfregi = &context->bfregi;
 
 	/* updates req->total_num_bfregs */
@@ -1209,6 +1219,12 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 					sizeof(resp.reserved2);
 	}
 
+	if (field_avail(typeof(resp), log_uar_size, udata->outlen))
+		resp.response_length += sizeof(resp.log_uar_size);
+
+	if (field_avail(typeof(resp), num_uars_per_page, udata->outlen))
+		resp.response_length += sizeof(resp.num_uars_per_page);
+
 	err = ib_copy_to_udata(udata, &resp, resp.response_length);
 	if (err)
 		goto out_td;
@@ -1216,7 +1232,8 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 	bfregi->ver = ver;
 	bfregi->num_low_latency_bfregs = req.num_low_latency_bfregs;
 	context->cqe_version = resp.cqe_version;
-	context->lib_caps = false;
+	context->lib_caps = req.lib_caps;
+	print_lib_caps(dev, context->lib_caps);
 
 	return &context->ibucontext;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
index 32d4af9b594d..336d4738b807 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
@@ -179,6 +179,8 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
 		mlx5_core_dbg(dev, "failed adding CP 0x%x to debug file system\n",
 			      cq->cqn);
 
+	cq->uar = dev->priv.uar;
+
 	return 0;
 
 err_cmd:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 3037631570b1..a473cea10c16 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -465,7 +465,6 @@ struct mlx5e_sq {
 	/* read only */
 	struct mlx5_wq_cyc         wq;
 	u32                        dma_fifo_mask;
-	void __iomem              *uar_map;
 	struct netdev_queue       *txq;
 	u32                        sqn;
 	u16                        bf_buf_size;
@@ -479,7 +478,7 @@ struct mlx5e_sq {
 
 	/* control path */
 	struct mlx5_wq_ctrl        wq_ctrl;
-	struct mlx5_uar            uar;
+	struct mlx5_sq_bfreg	   bfreg;
 	struct mlx5e_channel      *channel;
 	int                        tc;
 	u32                        rate_limit;
@@ -806,7 +805,7 @@ void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params,
 static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq,
 				      struct mlx5_wqe_ctrl_seg *ctrl, int bf_sz)
 {
-	u16 ofst = MLX5_BF_OFFSET + sq->bf_offset;
+	u16 ofst = sq->bf_offset;
 
 	/* ensure wqe is visible to device before updating doorbell record */
 	dma_wmb();
@@ -818,9 +817,9 @@ static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq,
 	 */
 	wmb();
 	if (bf_sz)
-		__iowrite64_copy(sq->uar_map + ofst, ctrl, bf_sz);
+		__iowrite64_copy(sq->bfreg.map + ofst, ctrl, bf_sz);
 	else
-		mlx5_write64((__be32 *)ctrl, sq->uar_map + ofst, NULL);
+		mlx5_write64((__be32 *)ctrl, sq->bfreg.map + ofst, NULL);
 	/* flush the write-combining mapped buffer */
 	wmb();
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
index f175518ff07a..bd898d8deda0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
@@ -89,16 +89,10 @@ int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev)
 	struct mlx5e_resources *res = &mdev->mlx5e_res;
 	int err;
 
-	err = mlx5_alloc_map_uar(mdev, &res->cq_uar, false);
-	if (err) {
-		mlx5_core_err(mdev, "alloc_map uar failed, %d\n", err);
-		return err;
-	}
-
 	err = mlx5_core_alloc_pd(mdev, &res->pdn);
 	if (err) {
 		mlx5_core_err(mdev, "alloc pd failed, %d\n", err);
-		goto err_unmap_free_uar;
+		return err;
 	}
 
 	err = mlx5_core_alloc_transport_domain(mdev, &res->td.tdn);
@@ -121,9 +115,6 @@ err_dealloc_transport_domain:
 	mlx5_core_dealloc_transport_domain(mdev, res->td.tdn);
 err_dealloc_pd:
 	mlx5_core_dealloc_pd(mdev, res->pdn);
-err_unmap_free_uar:
-	mlx5_unmap_free_uar(mdev, &res->cq_uar);
-
 	return err;
 }
 
@@ -134,7 +125,6 @@ void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev)
 	mlx5_core_destroy_mkey(mdev, &res->mkey);
 	mlx5_core_dealloc_transport_domain(mdev, res->td.tdn);
 	mlx5_core_dealloc_pd(mdev, res->pdn);
-	mlx5_unmap_free_uar(mdev, &res->cq_uar);
 }
 
 int mlx5e_refresh_tirs_self_loopback(struct mlx5_core_dev *mdev,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 5ff86f0ecb7b..c32754b1598e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -991,7 +991,7 @@ static int mlx5e_create_sq(struct mlx5e_channel *c,
 	sq->channel   = c;
 	sq->tc        = tc;
 
-	err = mlx5_alloc_map_uar(mdev, &sq->uar, !!MLX5_CAP_GEN(mdev, bf));
+	err = mlx5_alloc_bfreg(mdev, &sq->bfreg, MLX5_CAP_GEN(mdev, bf), false);
 	if (err)
 		return err;
 
@@ -1003,12 +1003,9 @@ static int mlx5e_create_sq(struct mlx5e_channel *c,
 		goto err_unmap_free_uar;
 
 	sq->wq.db       = &sq->wq.db[MLX5_SND_DBR];
-	if (sq->uar.bf_map) {
+	if (sq->bfreg.wc)
 		set_bit(MLX5E_SQ_STATE_BF_ENABLE, &sq->state);
-		sq->uar_map = sq->uar.bf_map;
-	} else {
-		sq->uar_map = sq->uar.map;
-	}
+
 	sq->bf_buf_size = (1 << MLX5_CAP_GEN(mdev, log_bf_reg_size)) / 2;
 	sq->max_inline  = param->max_inline;
 	sq->min_inline_mode =
@@ -1036,7 +1033,7 @@ err_sq_wq_destroy:
 	mlx5_wq_destroy(&sq->wq_ctrl);
 
 err_unmap_free_uar:
-	mlx5_unmap_free_uar(mdev, &sq->uar);
+	mlx5_free_bfreg(mdev, &sq->bfreg);
 
 	return err;
 }
@@ -1048,7 +1045,7 @@ static void mlx5e_destroy_sq(struct mlx5e_sq *sq)
 
 	mlx5e_free_sq_db(sq);
 	mlx5_wq_destroy(&sq->wq_ctrl);
-	mlx5_unmap_free_uar(priv->mdev, &sq->uar);
+	mlx5_free_bfreg(priv->mdev, &sq->bfreg);
 }
 
 static int mlx5e_enable_sq(struct mlx5e_sq *sq, struct mlx5e_sq_param *param)
@@ -1082,7 +1079,7 @@ static int mlx5e_enable_sq(struct mlx5e_sq *sq, struct mlx5e_sq_param *param)
 	MLX5_SET(sqc,  sqc, tis_lst_sz, param->type == MLX5E_SQ_ICO ? 0 : 1);
 
 	MLX5_SET(wq,   wq, wq_type,       MLX5_WQ_TYPE_CYCLIC);
-	MLX5_SET(wq,   wq, uar_page,      sq->uar.index);
+	MLX5_SET(wq,   wq, uar_page,      sq->bfreg.index);
 	MLX5_SET(wq,   wq, log_wq_pg_sz,  sq->wq_ctrl.buf.page_shift -
 					  MLX5_ADAPTER_PAGE_SHIFT);
 	MLX5_SET64(wq, wq, dbr_addr,      sq->wq_ctrl.db.dma);
@@ -1240,7 +1237,6 @@ static int mlx5e_create_cq(struct mlx5e_channel *c,
 	mcq->comp       = mlx5e_completion_event;
 	mcq->event      = mlx5e_cq_error_event;
 	mcq->irqn       = irqn;
-	mcq->uar        = &mdev->mlx5e_res.cq_uar;
 
 	for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) {
 		struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, i);
@@ -1289,7 +1285,7 @@ static int mlx5e_enable_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param)
 
 	MLX5_SET(cqc,   cqc, cq_period_mode, param->cq_period_mode);
 	MLX5_SET(cqc,   cqc, c_eqn,         eqn);
-	MLX5_SET(cqc,   cqc, uar_page,      mcq->uar->index);
+	MLX5_SET(cqc,   cqc, uar_page,      mdev->priv.uar->index);
 	MLX5_SET(cqc,   cqc, log_page_size, cq->wq_ctrl.frag_buf.page_shift -
 					    MLX5_ADAPTER_PAGE_SHIFT);
 	MLX5_SET64(cqc, cqc, dbr_addr,      cq->wq_ctrl.db.dma);
@@ -1701,7 +1697,7 @@ static void mlx5e_build_common_cq_param(struct mlx5e_priv *priv,
 {
 	void *cqc = param->cqc;
 
-	MLX5_SET(cqc, cqc, uar_page, priv->mdev->mlx5e_res.cq_uar.index);
+	MLX5_SET(cqc, cqc, uar_page, priv->mdev->priv.uar->index);
 }
 
 static void mlx5e_build_rx_cq_param(struct mlx5e_priv *priv,
@@ -2320,7 +2316,6 @@ static int mlx5e_create_drop_cq(struct mlx5e_priv *priv,
 	mcq->comp       = mlx5e_completion_event;
 	mcq->event      = mlx5e_cq_error_event;
 	mcq->irqn       = irqn;
-	mcq->uar        = &mdev->mlx5e_res.cq_uar;
 
 	cq->priv = priv;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/uar.c b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
index 07b273cccc26..2e6b0f290ddc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/uar.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
@@ -37,11 +37,6 @@
 #include <linux/mlx5/cmd.h>
 #include "mlx5_core.h"
 
-enum {
-	NUM_DRIVER_UARS		= 4,
-	NUM_LOW_LAT_BFREGS	= 4,
-};
-
 int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn)
 {
 	u32 out[MLX5_ST_SZ_DW(alloc_uar_out)] = {0};
@@ -67,57 +62,6 @@ int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn)
 }
 EXPORT_SYMBOL(mlx5_cmd_free_uar);
 
-int mlx5_alloc_map_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar,
-		       bool map_wc)
-{
-	phys_addr_t pfn;
-	phys_addr_t uar_bar_start;
-	int err;
-
-	err = mlx5_cmd_alloc_uar(mdev, &uar->index);
-	if (err) {
-		mlx5_core_warn(mdev, "mlx5_cmd_alloc_uar() failed, %d\n", err);
-		return err;
-	}
-
-	uar_bar_start = pci_resource_start(mdev->pdev, 0);
-	pfn           = (uar_bar_start >> PAGE_SHIFT) + uar->index;
-
-	if (map_wc) {
-		uar->bf_map = ioremap_wc(pfn << PAGE_SHIFT, PAGE_SIZE);
-		if (!uar->bf_map) {
-			mlx5_core_warn(mdev, "ioremap_wc() failed\n");
-			uar->map = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
-			if (!uar->map)
-				goto err_free_uar;
-		}
-	} else {
-		uar->map = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
-		if (!uar->map)
-			goto err_free_uar;
-	}
-
-	return 0;
-
-err_free_uar:
-	mlx5_core_warn(mdev, "ioremap() failed\n");
-	err = -ENOMEM;
-	mlx5_cmd_free_uar(mdev, uar->index);
-
-	return err;
-}
-EXPORT_SYMBOL(mlx5_alloc_map_uar);
-
-void mlx5_unmap_free_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar)
-{
-	if (uar->map)
-		iounmap(uar->map);
-	else
-		iounmap(uar->bf_map);
-	mlx5_cmd_free_uar(mdev, uar->index);
-}
-EXPORT_SYMBOL(mlx5_unmap_free_uar);
-
 static int uars_per_sys_page(struct mlx5_core_dev *mdev)
 {
 	if (MLX5_CAP_GEN(mdev, uar_4k))
diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h
index 996863381bc8..95898847c7d4 100644
--- a/include/linux/mlx5/cq.h
+++ b/include/linux/mlx5/cq.h
@@ -42,13 +42,13 @@ struct mlx5_core_cq {
 	int			cqe_sz;
 	__be32		       *set_ci_db;
 	__be32		       *arm_db;
+	struct mlx5_uars_page  *uar;
 	atomic_t		refcount;
 	struct completion	free;
 	unsigned		vector;
 	unsigned int		irqn;
 	void (*comp)		(struct mlx5_core_cq *);
 	void (*event)		(struct mlx5_core_cq *, enum mlx5_event);
-	struct mlx5_uar	       *uar;
 	u32			cons_index;
 	unsigned		arm_sn;
 	struct mlx5_rsc_debug	*dbg;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 7e7394fef835..10e632588cd5 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -467,12 +467,6 @@ struct mlx5_sq_bfreg {
 	unsigned int		offset;
 };
 
-struct mlx5_uar {
-	u32			index;
-	void __iomem	       *map;
-	void __iomem	       *bf_map;
-};
-
 struct mlx5_core_health {
 	struct health_buffer __iomem   *health;
 	__be32 __iomem		       *health_counter;
@@ -725,7 +719,6 @@ struct mlx5_td {
 };
 
 struct mlx5e_resources {
-	struct mlx5_uar            cq_uar;
 	u32                        pdn;
 	struct mlx5_td             td;
 	struct mlx5_core_mkey      mkey;
@@ -915,11 +908,6 @@ void mlx5_cmd_mbox_status(void *out, u8 *status, u32 *syndrome);
 int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type);
 int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn);
 int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn);
-int mlx5_alloc_bfregs(struct mlx5_core_dev *dev, struct mlx5_bfreg_info *bfregi);
-int mlx5_free_bfregs(struct mlx5_core_dev *dev, struct mlx5_bfreg_info *bfregi);
-int mlx5_alloc_map_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar,
-		       bool map_wc);
-void mlx5_unmap_free_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar);
 void mlx5_health_cleanup(struct mlx5_core_dev *dev);
 int mlx5_health_init(struct mlx5_core_dev *dev);
 void mlx5_start_health_poll(struct mlx5_core_dev *dev);
diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h
index 86a8f30060f3..85dc966ea70b 100644
--- a/include/uapi/rdma/mlx5-abi.h
+++ b/include/uapi/rdma/mlx5-abi.h
@@ -65,6 +65,10 @@ struct mlx5_ib_alloc_ucontext_req {
 	__u32	num_low_latency_bfregs;
 };
 
+enum mlx5_lib_caps {
+	MLX5_LIB_CAP_4K_UAR	= (u64)1 << 0,
+};
+
 struct mlx5_ib_alloc_ucontext_req_v2 {
 	__u32	total_num_bfregs;
 	__u32	num_low_latency_bfregs;
@@ -74,6 +78,7 @@ struct mlx5_ib_alloc_ucontext_req_v2 {
 	__u8	reserved0;
 	__u16	reserved1;
 	__u32	reserved2;
+	__u64	lib_caps;
 };
 
 enum mlx5_ib_alloc_ucontext_resp_mask {
@@ -103,6 +108,8 @@ struct mlx5_ib_alloc_ucontext_resp {
 	__u8	cmds_supp_uhw;
 	__u16	reserved2;
 	__u64	hca_core_clock_offset;
+	__u32	log_uar_size;
+	__u32	num_uars_per_page;
 };
 
 struct mlx5_ib_alloc_pd_resp {
-- 
cgit v1.2.3


From 2c956a60778cbb6a27e0c7a8a52a91378c90e1d1 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Sun, 8 Jan 2017 13:54:00 +0100
Subject: siphash: add cryptographically secure PRF

SipHash is a 64-bit keyed hash function that is actually a
cryptographically secure PRF, like HMAC. Except SipHash is super fast,
and is meant to be used as a hashtable keyed lookup function, or as a
general PRF for short input use cases, such as sequence numbers or RNG
chaining.

For the first usage:

There are a variety of attacks known as "hashtable poisoning" in which an
attacker forms some data such that the hash of that data will be the
same, and then preceeds to fill up all entries of a hashbucket. This is
a realistic and well-known denial-of-service vector. Currently
hashtables use jhash, which is fast but not secure, and some kind of
rotating key scheme (or none at all, which isn't good). SipHash is meant
as a replacement for jhash in these cases.

There are a modicum of places in the kernel that are vulnerable to
hashtable poisoning attacks, either via userspace vectors or network
vectors, and there's not a reliable mechanism inside the kernel at the
moment to fix it. The first step toward fixing these issues is actually
getting a secure primitive into the kernel for developers to use. Then
we can, bit by bit, port things over to it as deemed appropriate.

While SipHash is extremely fast for a cryptographically secure function,
it is likely a bit slower than the insecure jhash, and so replacements
will be evaluated on a case-by-case basis based on whether or not the
difference in speed is negligible and whether or not the current jhash usage
poses a real security risk.

For the second usage:

A few places in the kernel are using MD5 or SHA1 for creating secure
sequence numbers, syn cookies, port numbers, or fast random numbers.
SipHash is a faster and more fitting, and more secure replacement for MD5
in those situations. Replacing MD5 and SHA1 with SipHash for these uses is
obvious and straight-forward, and so is submitted along with this patch
series. There shouldn't be much of a debate over its efficacy.

Dozens of languages are already using this internally for their hash
tables and PRFs. Some of the BSDs already use this in their kernels.
SipHash is a widely known high-speed solution to a widely known set of
problems, and it's time we catch-up.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/siphash.txt | 100 ++++++++++++++++++++
 MAINTAINERS               |   7 ++
 include/linux/siphash.h   |  85 +++++++++++++++++
 lib/Kconfig.debug         |   6 +-
 lib/Makefile              |   5 +-
 lib/siphash.c             | 232 ++++++++++++++++++++++++++++++++++++++++++++++
 lib/test_siphash.c        | 131 ++++++++++++++++++++++++++
 7 files changed, 561 insertions(+), 5 deletions(-)
 create mode 100644 Documentation/siphash.txt
 create mode 100644 include/linux/siphash.h
 create mode 100644 lib/siphash.c
 create mode 100644 lib/test_siphash.c

(limited to 'include/linux')

diff --git a/Documentation/siphash.txt b/Documentation/siphash.txt
new file mode 100644
index 000000000000..e8e6ddbbaab4
--- /dev/null
+++ b/Documentation/siphash.txt
@@ -0,0 +1,100 @@
+         SipHash - a short input PRF
+-----------------------------------------------
+Written by Jason A. Donenfeld <jason@zx2c4.com>
+
+SipHash is a cryptographically secure PRF -- a keyed hash function -- that
+performs very well for short inputs, hence the name. It was designed by
+cryptographers Daniel J. Bernstein and Jean-Philippe Aumasson. It is intended
+as a replacement for some uses of: `jhash`, `md5_transform`, `sha_transform`,
+and so forth.
+
+SipHash takes a secret key filled with randomly generated numbers and either
+an input buffer or several input integers. It spits out an integer that is
+indistinguishable from random. You may then use that integer as part of secure
+sequence numbers, secure cookies, or mask it off for use in a hash table.
+
+1. Generating a key
+
+Keys should always be generated from a cryptographically secure source of
+random numbers, either using get_random_bytes or get_random_once:
+
+siphash_key_t key;
+get_random_bytes(&key, sizeof(key));
+
+If you're not deriving your key from here, you're doing it wrong.
+
+2. Using the functions
+
+There are two variants of the function, one that takes a list of integers, and
+one that takes a buffer:
+
+u64 siphash(const void *data, size_t len, const siphash_key_t *key);
+
+And:
+
+u64 siphash_1u64(u64, const siphash_key_t *key);
+u64 siphash_2u64(u64, u64, const siphash_key_t *key);
+u64 siphash_3u64(u64, u64, u64, const siphash_key_t *key);
+u64 siphash_4u64(u64, u64, u64, u64, const siphash_key_t *key);
+u64 siphash_1u32(u32, const siphash_key_t *key);
+u64 siphash_2u32(u32, u32, const siphash_key_t *key);
+u64 siphash_3u32(u32, u32, u32, const siphash_key_t *key);
+u64 siphash_4u32(u32, u32, u32, u32, const siphash_key_t *key);
+
+If you pass the generic siphash function something of a constant length, it
+will constant fold at compile-time and automatically choose one of the
+optimized functions.
+
+3. Hashtable key function usage:
+
+struct some_hashtable {
+	DECLARE_HASHTABLE(hashtable, 8);
+	siphash_key_t key;
+};
+
+void init_hashtable(struct some_hashtable *table)
+{
+	get_random_bytes(&table->key, sizeof(table->key));
+}
+
+static inline hlist_head *some_hashtable_bucket(struct some_hashtable *table, struct interesting_input *input)
+{
+	return &table->hashtable[siphash(input, sizeof(*input), &table->key) & (HASH_SIZE(table->hashtable) - 1)];
+}
+
+You may then iterate like usual over the returned hash bucket.
+
+4. Security
+
+SipHash has a very high security margin, with its 128-bit key. So long as the
+key is kept secret, it is impossible for an attacker to guess the outputs of
+the function, even if being able to observe many outputs, since 2^128 outputs
+is significant.
+
+Linux implements the "2-4" variant of SipHash.
+
+5. Struct-passing Pitfalls
+
+Often times the XuY functions will not be large enough, and instead you'll
+want to pass a pre-filled struct to siphash. When doing this, it's important
+to always ensure the struct has no padding holes. The easiest way to do this
+is to simply arrange the members of the struct in descending order of size,
+and to use offsetendof() instead of sizeof() for getting the size. For
+performance reasons, if possible, it's probably a good thing to align the
+struct to the right boundary. Here's an example:
+
+const struct {
+	struct in6_addr saddr;
+	u32 counter;
+	u16 dport;
+} __aligned(SIPHASH_ALIGNMENT) combined = {
+	.saddr = *(struct in6_addr *)saddr,
+	.counter = counter,
+	.dport = dport
+};
+u64 h = siphash(&combined, offsetofend(typeof(combined), dport), &secret);
+
+6. Resources
+
+Read the SipHash paper if you're interested in learning more:
+https://131002.net/siphash/siphash.pdf
diff --git a/MAINTAINERS b/MAINTAINERS
index fcb33a17831a..21edaedcab46 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11304,6 +11304,13 @@ F:	arch/arm/mach-s3c24xx/mach-bast.c
 F:	arch/arm/mach-s3c24xx/bast-ide.c
 F:	arch/arm/mach-s3c24xx/bast-irq.c
 
+SIPHASH PRF ROUTINES
+M:	Jason A. Donenfeld <Jason@zx2c4.com>
+S:	Maintained
+F:	lib/siphash.c
+F:	lib/test_siphash.c
+F:	include/linux/siphash.h
+
 TI DAVINCI MACHINE SUPPORT
 M:	Sekhar Nori <nsekhar@ti.com>
 M:	Kevin Hilman <khilman@kernel.org>
diff --git a/include/linux/siphash.h b/include/linux/siphash.h
new file mode 100644
index 000000000000..feeb29cd113e
--- /dev/null
+++ b/include/linux/siphash.h
@@ -0,0 +1,85 @@
+/* Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.
+ *
+ * SipHash: a fast short-input PRF
+ * https://131002.net/siphash/
+ *
+ * This implementation is specifically for SipHash2-4.
+ */
+
+#ifndef _LINUX_SIPHASH_H
+#define _LINUX_SIPHASH_H
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+
+#define SIPHASH_ALIGNMENT __alignof__(u64)
+typedef struct {
+	u64 key[2];
+} siphash_key_t;
+
+u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key);
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key);
+#endif
+
+u64 siphash_1u64(const u64 a, const siphash_key_t *key);
+u64 siphash_2u64(const u64 a, const u64 b, const siphash_key_t *key);
+u64 siphash_3u64(const u64 a, const u64 b, const u64 c,
+		 const siphash_key_t *key);
+u64 siphash_4u64(const u64 a, const u64 b, const u64 c, const u64 d,
+		 const siphash_key_t *key);
+u64 siphash_1u32(const u32 a, const siphash_key_t *key);
+u64 siphash_3u32(const u32 a, const u32 b, const u32 c,
+		 const siphash_key_t *key);
+
+static inline u64 siphash_2u32(const u32 a, const u32 b,
+			       const siphash_key_t *key)
+{
+	return siphash_1u64((u64)b << 32 | a, key);
+}
+static inline u64 siphash_4u32(const u32 a, const u32 b, const u32 c,
+			       const u32 d, const siphash_key_t *key)
+{
+	return siphash_2u64((u64)b << 32 | a, (u64)d << 32 | c, key);
+}
+
+
+static inline u64 ___siphash_aligned(const __le64 *data, size_t len,
+				     const siphash_key_t *key)
+{
+	if (__builtin_constant_p(len) && len == 4)
+		return siphash_1u32(le32_to_cpup((const __le32 *)data), key);
+	if (__builtin_constant_p(len) && len == 8)
+		return siphash_1u64(le64_to_cpu(data[0]), key);
+	if (__builtin_constant_p(len) && len == 16)
+		return siphash_2u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]),
+				    key);
+	if (__builtin_constant_p(len) && len == 24)
+		return siphash_3u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]),
+				    le64_to_cpu(data[2]), key);
+	if (__builtin_constant_p(len) && len == 32)
+		return siphash_4u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]),
+				    le64_to_cpu(data[2]), le64_to_cpu(data[3]),
+				    key);
+	return __siphash_aligned(data, len, key);
+}
+
+/**
+ * siphash - compute 64-bit siphash PRF value
+ * @data: buffer to hash
+ * @size: size of @data
+ * @key: the siphash key
+ */
+static inline u64 siphash(const void *data, size_t len,
+			  const siphash_key_t *key)
+{
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	if (!IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT))
+		return __siphash_unaligned(data, len, key);
+#endif
+	return ___siphash_aligned(data, len, key);
+}
+
+#endif /* _LINUX_SIPHASH_H */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index b06848a104e6..3d2515a770c3 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1819,9 +1819,9 @@ config TEST_HASH
 	tristate "Perform selftest on hash functions"
 	default n
 	help
-	  Enable this option to test the kernel's integer (<linux/hash,h>)
-	  and string (<linux/stringhash.h>) hash functions on boot
-	  (or module load).
+	  Enable this option to test the kernel's integer (<linux/hash.h>),
+	  string (<linux/stringhash.h>), and siphash (<linux/siphash.h>)
+	  hash functions on boot (or module load).
 
 	  This is intended to help people writing architecture-specific
 	  optimized versions.  If unsure, say N.
diff --git a/lib/Makefile b/lib/Makefile
index bc4073a8cd08..7b3008d58600 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -22,7 +22,8 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 sha1.o chacha20.o md5.o irq_regs.o argv_split.o \
 	 flex_proportions.o ratelimit.o show_mem.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
-	 earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o win_minmax.o
+	 earlycpio.o seq_buf.o siphash.o \
+	 nmi_backtrace.o nodemask.o win_minmax.o
 
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
@@ -44,7 +45,7 @@ obj-$(CONFIG_TEST_HEXDUMP) += test_hexdump.o
 obj-y += kstrtox.o
 obj-$(CONFIG_TEST_BPF) += test_bpf.o
 obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o
-obj-$(CONFIG_TEST_HASH) += test_hash.o
+obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o
 obj-$(CONFIG_TEST_KASAN) += test_kasan.o
 obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o
 obj-$(CONFIG_TEST_LKM) += test_module.o
diff --git a/lib/siphash.c b/lib/siphash.c
new file mode 100644
index 000000000000..c43cf406e71b
--- /dev/null
+++ b/lib/siphash.c
@@ -0,0 +1,232 @@
+/* Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.
+ *
+ * SipHash: a fast short-input PRF
+ * https://131002.net/siphash/
+ *
+ * This implementation is specifically for SipHash2-4.
+ */
+
+#include <linux/siphash.h>
+#include <asm/unaligned.h>
+
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+#include <linux/dcache.h>
+#include <asm/word-at-a-time.h>
+#endif
+
+#define SIPROUND \
+	do { \
+	v0 += v1; v1 = rol64(v1, 13); v1 ^= v0; v0 = rol64(v0, 32); \
+	v2 += v3; v3 = rol64(v3, 16); v3 ^= v2; \
+	v0 += v3; v3 = rol64(v3, 21); v3 ^= v0; \
+	v2 += v1; v1 = rol64(v1, 17); v1 ^= v2; v2 = rol64(v2, 32); \
+	} while (0)
+
+#define PREAMBLE(len) \
+	u64 v0 = 0x736f6d6570736575ULL; \
+	u64 v1 = 0x646f72616e646f6dULL; \
+	u64 v2 = 0x6c7967656e657261ULL; \
+	u64 v3 = 0x7465646279746573ULL; \
+	u64 b = ((u64)(len)) << 56; \
+	v3 ^= key->key[1]; \
+	v2 ^= key->key[0]; \
+	v1 ^= key->key[1]; \
+	v0 ^= key->key[0];
+
+#define POSTAMBLE \
+	v3 ^= b; \
+	SIPROUND; \
+	SIPROUND; \
+	v0 ^= b; \
+	v2 ^= 0xff; \
+	SIPROUND; \
+	SIPROUND; \
+	SIPROUND; \
+	SIPROUND; \
+	return (v0 ^ v1) ^ (v2 ^ v3);
+
+u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key)
+{
+	const u8 *end = data + len - (len % sizeof(u64));
+	const u8 left = len & (sizeof(u64) - 1);
+	u64 m;
+	PREAMBLE(len)
+	for (; data != end; data += sizeof(u64)) {
+		m = le64_to_cpup(data);
+		v3 ^= m;
+		SIPROUND;
+		SIPROUND;
+		v0 ^= m;
+	}
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+	if (left)
+		b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
+						  bytemask_from_count(left)));
+#else
+	switch (left) {
+	case 7: b |= ((u64)end[6]) << 48;
+	case 6: b |= ((u64)end[5]) << 40;
+	case 5: b |= ((u64)end[4]) << 32;
+	case 4: b |= le32_to_cpup(data); break;
+	case 3: b |= ((u64)end[2]) << 16;
+	case 2: b |= le16_to_cpup(data); break;
+	case 1: b |= end[0];
+	}
+#endif
+	POSTAMBLE
+}
+EXPORT_SYMBOL(__siphash_aligned);
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key)
+{
+	const u8 *end = data + len - (len % sizeof(u64));
+	const u8 left = len & (sizeof(u64) - 1);
+	u64 m;
+	PREAMBLE(len)
+	for (; data != end; data += sizeof(u64)) {
+		m = get_unaligned_le64(data);
+		v3 ^= m;
+		SIPROUND;
+		SIPROUND;
+		v0 ^= m;
+	}
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+	if (left)
+		b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
+						  bytemask_from_count(left)));
+#else
+	switch (left) {
+	case 7: b |= ((u64)end[6]) << 48;
+	case 6: b |= ((u64)end[5]) << 40;
+	case 5: b |= ((u64)end[4]) << 32;
+	case 4: b |= get_unaligned_le32(end); break;
+	case 3: b |= ((u64)end[2]) << 16;
+	case 2: b |= get_unaligned_le16(end); break;
+	case 1: b |= end[0];
+	}
+#endif
+	POSTAMBLE
+}
+EXPORT_SYMBOL(__siphash_unaligned);
+#endif
+
+/**
+ * siphash_1u64 - compute 64-bit siphash PRF value of a u64
+ * @first: first u64
+ * @key: the siphash key
+ */
+u64 siphash_1u64(const u64 first, const siphash_key_t *key)
+{
+	PREAMBLE(8)
+	v3 ^= first;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= first;
+	POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_1u64);
+
+/**
+ * siphash_2u64 - compute 64-bit siphash PRF value of 2 u64
+ * @first: first u64
+ * @second: second u64
+ * @key: the siphash key
+ */
+u64 siphash_2u64(const u64 first, const u64 second, const siphash_key_t *key)
+{
+	PREAMBLE(16)
+	v3 ^= first;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= first;
+	v3 ^= second;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= second;
+	POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_2u64);
+
+/**
+ * siphash_3u64 - compute 64-bit siphash PRF value of 3 u64
+ * @first: first u64
+ * @second: second u64
+ * @third: third u64
+ * @key: the siphash key
+ */
+u64 siphash_3u64(const u64 first, const u64 second, const u64 third,
+		 const siphash_key_t *key)
+{
+	PREAMBLE(24)
+	v3 ^= first;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= first;
+	v3 ^= second;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= second;
+	v3 ^= third;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= third;
+	POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_3u64);
+
+/**
+ * siphash_4u64 - compute 64-bit siphash PRF value of 4 u64
+ * @first: first u64
+ * @second: second u64
+ * @third: third u64
+ * @forth: forth u64
+ * @key: the siphash key
+ */
+u64 siphash_4u64(const u64 first, const u64 second, const u64 third,
+		 const u64 forth, const siphash_key_t *key)
+{
+	PREAMBLE(32)
+	v3 ^= first;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= first;
+	v3 ^= second;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= second;
+	v3 ^= third;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= third;
+	v3 ^= forth;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= forth;
+	POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_4u64);
+
+u64 siphash_1u32(const u32 first, const siphash_key_t *key)
+{
+	PREAMBLE(4)
+	b |= first;
+	POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_1u32);
+
+u64 siphash_3u32(const u32 first, const u32 second, const u32 third,
+		 const siphash_key_t *key)
+{
+	u64 combined = (u64)second << 32 | first;
+	PREAMBLE(12)
+	v3 ^= combined;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= combined;
+	b |= third;
+	POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_3u32);
diff --git a/lib/test_siphash.c b/lib/test_siphash.c
new file mode 100644
index 000000000000..d972acfc15e4
--- /dev/null
+++ b/lib/test_siphash.c
@@ -0,0 +1,131 @@
+/* Test cases for siphash.c
+ *
+ * Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.
+ *
+ * SipHash: a fast short-input PRF
+ * https://131002.net/siphash/
+ *
+ * This implementation is specifically for SipHash2-4.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/siphash.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+
+/* Test vectors taken from official reference source available at:
+ *     https://131002.net/siphash/siphash24.c
+ */
+
+static const siphash_key_t test_key_siphash =
+	{{ 0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL }};
+
+static const u64 test_vectors_siphash[64] = {
+	0x726fdb47dd0e0e31ULL, 0x74f839c593dc67fdULL, 0x0d6c8009d9a94f5aULL,
+	0x85676696d7fb7e2dULL, 0xcf2794e0277187b7ULL, 0x18765564cd99a68dULL,
+	0xcbc9466e58fee3ceULL, 0xab0200f58b01d137ULL, 0x93f5f5799a932462ULL,
+	0x9e0082df0ba9e4b0ULL, 0x7a5dbbc594ddb9f3ULL, 0xf4b32f46226bada7ULL,
+	0x751e8fbc860ee5fbULL, 0x14ea5627c0843d90ULL, 0xf723ca908e7af2eeULL,
+	0xa129ca6149be45e5ULL, 0x3f2acc7f57c29bdbULL, 0x699ae9f52cbe4794ULL,
+	0x4bc1b3f0968dd39cULL, 0xbb6dc91da77961bdULL, 0xbed65cf21aa2ee98ULL,
+	0xd0f2cbb02e3b67c7ULL, 0x93536795e3a33e88ULL, 0xa80c038ccd5ccec8ULL,
+	0xb8ad50c6f649af94ULL, 0xbce192de8a85b8eaULL, 0x17d835b85bbb15f3ULL,
+	0x2f2e6163076bcfadULL, 0xde4daaaca71dc9a5ULL, 0xa6a2506687956571ULL,
+	0xad87a3535c49ef28ULL, 0x32d892fad841c342ULL, 0x7127512f72f27cceULL,
+	0xa7f32346f95978e3ULL, 0x12e0b01abb051238ULL, 0x15e034d40fa197aeULL,
+	0x314dffbe0815a3b4ULL, 0x027990f029623981ULL, 0xcadcd4e59ef40c4dULL,
+	0x9abfd8766a33735cULL, 0x0e3ea96b5304a7d0ULL, 0xad0c42d6fc585992ULL,
+	0x187306c89bc215a9ULL, 0xd4a60abcf3792b95ULL, 0xf935451de4f21df2ULL,
+	0xa9538f0419755787ULL, 0xdb9acddff56ca510ULL, 0xd06c98cd5c0975ebULL,
+	0xe612a3cb9ecba951ULL, 0xc766e62cfcadaf96ULL, 0xee64435a9752fe72ULL,
+	0xa192d576b245165aULL, 0x0a8787bf8ecb74b2ULL, 0x81b3e73d20b49b6fULL,
+	0x7fa8220ba3b2eceaULL, 0x245731c13ca42499ULL, 0xb78dbfaf3a8d83bdULL,
+	0xea1ad565322a1a0bULL, 0x60e61c23a3795013ULL, 0x6606d7e446282b93ULL,
+	0x6ca4ecb15c5f91e1ULL, 0x9f626da15c9625f3ULL, 0xe51b38608ef25f57ULL,
+	0x958a324ceb064572ULL
+};
+
+static int __init siphash_test_init(void)
+{
+	u8 in[64] __aligned(SIPHASH_ALIGNMENT);
+	u8 in_unaligned[65] __aligned(SIPHASH_ALIGNMENT);
+	u8 i;
+	int ret = 0;
+
+	for (i = 0; i < 64; ++i) {
+		in[i] = i;
+		in_unaligned[i + 1] = i;
+		if (siphash(in, i, &test_key_siphash) !=
+						test_vectors_siphash[i]) {
+			pr_info("siphash self-test aligned %u: FAIL\n", i + 1);
+			ret = -EINVAL;
+		}
+		if (siphash(in_unaligned + 1, i, &test_key_siphash) !=
+						test_vectors_siphash[i]) {
+			pr_info("siphash self-test unaligned %u: FAIL\n", i + 1);
+			ret = -EINVAL;
+		}
+	}
+	if (siphash_1u64(0x0706050403020100ULL, &test_key_siphash) !=
+						test_vectors_siphash[8]) {
+		pr_info("siphash self-test 1u64: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (siphash_2u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL,
+			 &test_key_siphash) != test_vectors_siphash[16]) {
+		pr_info("siphash self-test 2u64: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (siphash_3u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL,
+			 0x1716151413121110ULL, &test_key_siphash) !=
+						test_vectors_siphash[24]) {
+		pr_info("siphash self-test 3u64: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (siphash_4u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL,
+			 0x1716151413121110ULL, 0x1f1e1d1c1b1a1918ULL,
+			 &test_key_siphash) != test_vectors_siphash[32]) {
+		pr_info("siphash self-test 4u64: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (siphash_1u32(0x03020100U, &test_key_siphash) !=
+						test_vectors_siphash[4]) {
+		pr_info("siphash self-test 1u32: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (siphash_2u32(0x03020100U, 0x07060504U, &test_key_siphash) !=
+						test_vectors_siphash[8]) {
+		pr_info("siphash self-test 2u32: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (siphash_3u32(0x03020100U, 0x07060504U,
+			 0x0b0a0908U, &test_key_siphash) !=
+						test_vectors_siphash[12]) {
+		pr_info("siphash self-test 3u32: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (siphash_4u32(0x03020100U, 0x07060504U,
+			 0x0b0a0908U, 0x0f0e0d0cU, &test_key_siphash) !=
+						test_vectors_siphash[16]) {
+		pr_info("siphash self-test 4u32: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (!ret)
+		pr_info("self-tests: pass\n");
+	return ret;
+}
+
+static void __exit siphash_test_exit(void)
+{
+}
+
+module_init(siphash_test_init);
+module_exit(siphash_test_exit);
+
+MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
+MODULE_LICENSE("Dual BSD/GPL");
-- 
cgit v1.2.3


From 1ae2324f732c9c4e2fa4ebd885fa1001b70d52e1 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Sun, 8 Jan 2017 13:54:01 +0100
Subject: siphash: implement HalfSipHash1-3 for hash tables

HalfSipHash, or hsiphash, is a shortened version of SipHash, which
generates 32-bit outputs using a weaker 64-bit key. It has *much* lower
security margins, and shouldn't be used for anything too sensitive, but
it could be used as a hashtable key function replacement, if the output
is never exposed, and if the security requirement is not too high.

The goal is to make this something that performance-critical jhash users
would be willing to use.

On 64-bit machines, HalfSipHash1-3 is slower than SipHash1-3, so we alias
SipHash1-3 to HalfSipHash1-3 on those systems.

64-bit x86_64:
[    0.509409] test_siphash:     SipHash2-4 cycles: 4049181
[    0.510650] test_siphash:     SipHash1-3 cycles: 2512884
[    0.512205] test_siphash: HalfSipHash1-3 cycles: 3429920
[    0.512904] test_siphash:    JenkinsHash cycles:  978267
So, we map hsiphash() -> SipHash1-3

32-bit x86:
[    0.509868] test_siphash:     SipHash2-4 cycles: 14812892
[    0.513601] test_siphash:     SipHash1-3 cycles:  9510710
[    0.515263] test_siphash: HalfSipHash1-3 cycles:  3856157
[    0.515952] test_siphash:    JenkinsHash cycles:  1148567
So, we map hsiphash() -> HalfSipHash1-3

hsiphash() is roughly 3 times slower than jhash(), but comes with a
considerable security improvement.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/siphash.txt |  75 +++++++++++
 include/linux/siphash.h   |  57 +++++++-
 lib/siphash.c             | 321 +++++++++++++++++++++++++++++++++++++++++++++-
 lib/test_siphash.c        |  98 +++++++++++++-
 4 files changed, 546 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/siphash.txt b/Documentation/siphash.txt
index e8e6ddbbaab4..908d348ff777 100644
--- a/Documentation/siphash.txt
+++ b/Documentation/siphash.txt
@@ -98,3 +98,78 @@ u64 h = siphash(&combined, offsetofend(typeof(combined), dport), &secret);
 
 Read the SipHash paper if you're interested in learning more:
 https://131002.net/siphash/siphash.pdf
+
+
+~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~
+
+HalfSipHash - SipHash's insecure younger cousin
+-----------------------------------------------
+Written by Jason A. Donenfeld <jason@zx2c4.com>
+
+On the off-chance that SipHash is not fast enough for your needs, you might be
+able to justify using HalfSipHash, a terrifying but potentially useful
+possibility. HalfSipHash cuts SipHash's rounds down from "2-4" to "1-3" and,
+even scarier, uses an easily brute-forcable 64-bit key (with a 32-bit output)
+instead of SipHash's 128-bit key. However, this may appeal to some
+high-performance `jhash` users.
+
+Danger!
+
+Do not ever use HalfSipHash except for as a hashtable key function, and only
+then when you can be absolutely certain that the outputs will never be
+transmitted out of the kernel. This is only remotely useful over `jhash` as a
+means of mitigating hashtable flooding denial of service attacks.
+
+1. Generating a key
+
+Keys should always be generated from a cryptographically secure source of
+random numbers, either using get_random_bytes or get_random_once:
+
+hsiphash_key_t key;
+get_random_bytes(&key, sizeof(key));
+
+If you're not deriving your key from here, you're doing it wrong.
+
+2. Using the functions
+
+There are two variants of the function, one that takes a list of integers, and
+one that takes a buffer:
+
+u32 hsiphash(const void *data, size_t len, const hsiphash_key_t *key);
+
+And:
+
+u32 hsiphash_1u32(u32, const hsiphash_key_t *key);
+u32 hsiphash_2u32(u32, u32, const hsiphash_key_t *key);
+u32 hsiphash_3u32(u32, u32, u32, const hsiphash_key_t *key);
+u32 hsiphash_4u32(u32, u32, u32, u32, const hsiphash_key_t *key);
+
+If you pass the generic hsiphash function something of a constant length, it
+will constant fold at compile-time and automatically choose one of the
+optimized functions.
+
+3. Hashtable key function usage:
+
+struct some_hashtable {
+	DECLARE_HASHTABLE(hashtable, 8);
+	hsiphash_key_t key;
+};
+
+void init_hashtable(struct some_hashtable *table)
+{
+	get_random_bytes(&table->key, sizeof(table->key));
+}
+
+static inline hlist_head *some_hashtable_bucket(struct some_hashtable *table, struct interesting_input *input)
+{
+	return &table->hashtable[hsiphash(input, sizeof(*input), &table->key) & (HASH_SIZE(table->hashtable) - 1)];
+}
+
+You may then iterate like usual over the returned hash bucket.
+
+4. Performance
+
+HalfSipHash is roughly 3 times slower than JenkinsHash. For many replacements,
+this will not be a problem, as the hashtable lookup isn't the bottleneck. And
+in general, this is probably a good sacrifice to make for the security and DoS
+resistance of HalfSipHash.
diff --git a/include/linux/siphash.h b/include/linux/siphash.h
index feeb29cd113e..fa7a6b9cedbf 100644
--- a/include/linux/siphash.h
+++ b/include/linux/siphash.h
@@ -5,7 +5,9 @@
  * SipHash: a fast short-input PRF
  * https://131002.net/siphash/
  *
- * This implementation is specifically for SipHash2-4.
+ * This implementation is specifically for SipHash2-4 for a secure PRF
+ * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for
+ * hashtables.
  */
 
 #ifndef _LINUX_SIPHASH_H
@@ -82,4 +84,57 @@ static inline u64 siphash(const void *data, size_t len,
 	return ___siphash_aligned(data, len, key);
 }
 
+#define HSIPHASH_ALIGNMENT __alignof__(unsigned long)
+typedef struct {
+	unsigned long key[2];
+} hsiphash_key_t;
+
+u32 __hsiphash_aligned(const void *data, size_t len,
+		       const hsiphash_key_t *key);
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u32 __hsiphash_unaligned(const void *data, size_t len,
+			 const hsiphash_key_t *key);
+#endif
+
+u32 hsiphash_1u32(const u32 a, const hsiphash_key_t *key);
+u32 hsiphash_2u32(const u32 a, const u32 b, const hsiphash_key_t *key);
+u32 hsiphash_3u32(const u32 a, const u32 b, const u32 c,
+		  const hsiphash_key_t *key);
+u32 hsiphash_4u32(const u32 a, const u32 b, const u32 c, const u32 d,
+		  const hsiphash_key_t *key);
+
+static inline u32 ___hsiphash_aligned(const __le32 *data, size_t len,
+				      const hsiphash_key_t *key)
+{
+	if (__builtin_constant_p(len) && len == 4)
+		return hsiphash_1u32(le32_to_cpu(data[0]), key);
+	if (__builtin_constant_p(len) && len == 8)
+		return hsiphash_2u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]),
+				     key);
+	if (__builtin_constant_p(len) && len == 12)
+		return hsiphash_3u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]),
+				     le32_to_cpu(data[2]), key);
+	if (__builtin_constant_p(len) && len == 16)
+		return hsiphash_4u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]),
+				     le32_to_cpu(data[2]), le32_to_cpu(data[3]),
+				     key);
+	return __hsiphash_aligned(data, len, key);
+}
+
+/**
+ * hsiphash - compute 32-bit hsiphash PRF value
+ * @data: buffer to hash
+ * @size: size of @data
+ * @key: the hsiphash key
+ */
+static inline u32 hsiphash(const void *data, size_t len,
+			   const hsiphash_key_t *key)
+{
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	if (!IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT))
+		return __hsiphash_unaligned(data, len, key);
+#endif
+	return ___hsiphash_aligned(data, len, key);
+}
+
 #endif /* _LINUX_SIPHASH_H */
diff --git a/lib/siphash.c b/lib/siphash.c
index c43cf406e71b..3ae58b4edad6 100644
--- a/lib/siphash.c
+++ b/lib/siphash.c
@@ -5,7 +5,9 @@
  * SipHash: a fast short-input PRF
  * https://131002.net/siphash/
  *
- * This implementation is specifically for SipHash2-4.
+ * This implementation is specifically for SipHash2-4 for a secure PRF
+ * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for
+ * hashtables.
  */
 
 #include <linux/siphash.h>
@@ -230,3 +232,320 @@ u64 siphash_3u32(const u32 first, const u32 second, const u32 third,
 	POSTAMBLE
 }
 EXPORT_SYMBOL(siphash_3u32);
+
+#if BITS_PER_LONG == 64
+/* Note that on 64-bit, we make HalfSipHash1-3 actually be SipHash1-3, for
+ * performance reasons. On 32-bit, below, we actually implement HalfSipHash1-3.
+ */
+
+#define HSIPROUND SIPROUND
+#define HPREAMBLE(len) PREAMBLE(len)
+#define HPOSTAMBLE \
+	v3 ^= b; \
+	HSIPROUND; \
+	v0 ^= b; \
+	v2 ^= 0xff; \
+	HSIPROUND; \
+	HSIPROUND; \
+	HSIPROUND; \
+	return (v0 ^ v1) ^ (v2 ^ v3);
+
+u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key)
+{
+	const u8 *end = data + len - (len % sizeof(u64));
+	const u8 left = len & (sizeof(u64) - 1);
+	u64 m;
+	HPREAMBLE(len)
+	for (; data != end; data += sizeof(u64)) {
+		m = le64_to_cpup(data);
+		v3 ^= m;
+		HSIPROUND;
+		v0 ^= m;
+	}
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+	if (left)
+		b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
+						  bytemask_from_count(left)));
+#else
+	switch (left) {
+	case 7: b |= ((u64)end[6]) << 48;
+	case 6: b |= ((u64)end[5]) << 40;
+	case 5: b |= ((u64)end[4]) << 32;
+	case 4: b |= le32_to_cpup(data); break;
+	case 3: b |= ((u64)end[2]) << 16;
+	case 2: b |= le16_to_cpup(data); break;
+	case 1: b |= end[0];
+	}
+#endif
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(__hsiphash_aligned);
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u32 __hsiphash_unaligned(const void *data, size_t len,
+			 const hsiphash_key_t *key)
+{
+	const u8 *end = data + len - (len % sizeof(u64));
+	const u8 left = len & (sizeof(u64) - 1);
+	u64 m;
+	HPREAMBLE(len)
+	for (; data != end; data += sizeof(u64)) {
+		m = get_unaligned_le64(data);
+		v3 ^= m;
+		HSIPROUND;
+		v0 ^= m;
+	}
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+	if (left)
+		b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
+						  bytemask_from_count(left)));
+#else
+	switch (left) {
+	case 7: b |= ((u64)end[6]) << 48;
+	case 6: b |= ((u64)end[5]) << 40;
+	case 5: b |= ((u64)end[4]) << 32;
+	case 4: b |= get_unaligned_le32(end); break;
+	case 3: b |= ((u64)end[2]) << 16;
+	case 2: b |= get_unaligned_le16(end); break;
+	case 1: b |= end[0];
+	}
+#endif
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(__hsiphash_unaligned);
+#endif
+
+/**
+ * hsiphash_1u32 - compute 64-bit hsiphash PRF value of a u32
+ * @first: first u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key)
+{
+	HPREAMBLE(4)
+	b |= first;
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_1u32);
+
+/**
+ * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32
+ * @first: first u32
+ * @second: second u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key)
+{
+	u64 combined = (u64)second << 32 | first;
+	HPREAMBLE(8)
+	v3 ^= combined;
+	HSIPROUND;
+	v0 ^= combined;
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_2u32);
+
+/**
+ * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32
+ * @first: first u32
+ * @second: second u32
+ * @third: third u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third,
+		  const hsiphash_key_t *key)
+{
+	u64 combined = (u64)second << 32 | first;
+	HPREAMBLE(12)
+	v3 ^= combined;
+	HSIPROUND;
+	v0 ^= combined;
+	b |= third;
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_3u32);
+
+/**
+ * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32
+ * @first: first u32
+ * @second: second u32
+ * @third: third u32
+ * @forth: forth u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third,
+		  const u32 forth, const hsiphash_key_t *key)
+{
+	u64 combined = (u64)second << 32 | first;
+	HPREAMBLE(16)
+	v3 ^= combined;
+	HSIPROUND;
+	v0 ^= combined;
+	combined = (u64)forth << 32 | third;
+	v3 ^= combined;
+	HSIPROUND;
+	v0 ^= combined;
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_4u32);
+#else
+#define HSIPROUND \
+	do { \
+	v0 += v1; v1 = rol32(v1, 5); v1 ^= v0; v0 = rol32(v0, 16); \
+	v2 += v3; v3 = rol32(v3, 8); v3 ^= v2; \
+	v0 += v3; v3 = rol32(v3, 7); v3 ^= v0; \
+	v2 += v1; v1 = rol32(v1, 13); v1 ^= v2; v2 = rol32(v2, 16); \
+	} while (0)
+
+#define HPREAMBLE(len) \
+	u32 v0 = 0; \
+	u32 v1 = 0; \
+	u32 v2 = 0x6c796765U; \
+	u32 v3 = 0x74656462U; \
+	u32 b = ((u32)(len)) << 24; \
+	v3 ^= key->key[1]; \
+	v2 ^= key->key[0]; \
+	v1 ^= key->key[1]; \
+	v0 ^= key->key[0];
+
+#define HPOSTAMBLE \
+	v3 ^= b; \
+	HSIPROUND; \
+	v0 ^= b; \
+	v2 ^= 0xff; \
+	HSIPROUND; \
+	HSIPROUND; \
+	HSIPROUND; \
+	return v1 ^ v3;
+
+u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key)
+{
+	const u8 *end = data + len - (len % sizeof(u32));
+	const u8 left = len & (sizeof(u32) - 1);
+	u32 m;
+	HPREAMBLE(len)
+	for (; data != end; data += sizeof(u32)) {
+		m = le32_to_cpup(data);
+		v3 ^= m;
+		HSIPROUND;
+		v0 ^= m;
+	}
+	switch (left) {
+	case 3: b |= ((u32)end[2]) << 16;
+	case 2: b |= le16_to_cpup(data); break;
+	case 1: b |= end[0];
+	}
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(__hsiphash_aligned);
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u32 __hsiphash_unaligned(const void *data, size_t len,
+			 const hsiphash_key_t *key)
+{
+	const u8 *end = data + len - (len % sizeof(u32));
+	const u8 left = len & (sizeof(u32) - 1);
+	u32 m;
+	HPREAMBLE(len)
+	for (; data != end; data += sizeof(u32)) {
+		m = get_unaligned_le32(data);
+		v3 ^= m;
+		HSIPROUND;
+		v0 ^= m;
+	}
+	switch (left) {
+	case 3: b |= ((u32)end[2]) << 16;
+	case 2: b |= get_unaligned_le16(end); break;
+	case 1: b |= end[0];
+	}
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(__hsiphash_unaligned);
+#endif
+
+/**
+ * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32
+ * @first: first u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key)
+{
+	HPREAMBLE(4)
+	v3 ^= first;
+	HSIPROUND;
+	v0 ^= first;
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_1u32);
+
+/**
+ * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32
+ * @first: first u32
+ * @second: second u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key)
+{
+	HPREAMBLE(8)
+	v3 ^= first;
+	HSIPROUND;
+	v0 ^= first;
+	v3 ^= second;
+	HSIPROUND;
+	v0 ^= second;
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_2u32);
+
+/**
+ * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32
+ * @first: first u32
+ * @second: second u32
+ * @third: third u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third,
+		  const hsiphash_key_t *key)
+{
+	HPREAMBLE(12)
+	v3 ^= first;
+	HSIPROUND;
+	v0 ^= first;
+	v3 ^= second;
+	HSIPROUND;
+	v0 ^= second;
+	v3 ^= third;
+	HSIPROUND;
+	v0 ^= third;
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_3u32);
+
+/**
+ * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32
+ * @first: first u32
+ * @second: second u32
+ * @third: third u32
+ * @forth: forth u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third,
+		  const u32 forth, const hsiphash_key_t *key)
+{
+	HPREAMBLE(16)
+	v3 ^= first;
+	HSIPROUND;
+	v0 ^= first;
+	v3 ^= second;
+	HSIPROUND;
+	v0 ^= second;
+	v3 ^= third;
+	HSIPROUND;
+	v0 ^= third;
+	v3 ^= forth;
+	HSIPROUND;
+	v0 ^= forth;
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_4u32);
+#endif
diff --git a/lib/test_siphash.c b/lib/test_siphash.c
index d972acfc15e4..a6d854d933bf 100644
--- a/lib/test_siphash.c
+++ b/lib/test_siphash.c
@@ -7,7 +7,9 @@
  * SipHash: a fast short-input PRF
  * https://131002.net/siphash/
  *
- * This implementation is specifically for SipHash2-4.
+ * This implementation is specifically for SipHash2-4 for a secure PRF
+ * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for
+ * hashtables.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -18,8 +20,8 @@
 #include <linux/errno.h>
 #include <linux/module.h>
 
-/* Test vectors taken from official reference source available at:
- *     https://131002.net/siphash/siphash24.c
+/* Test vectors taken from reference source available at:
+ *     https://github.com/veorq/SipHash
  */
 
 static const siphash_key_t test_key_siphash =
@@ -50,6 +52,64 @@ static const u64 test_vectors_siphash[64] = {
 	0x958a324ceb064572ULL
 };
 
+#if BITS_PER_LONG == 64
+static const hsiphash_key_t test_key_hsiphash =
+	{{ 0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL }};
+
+static const u32 test_vectors_hsiphash[64] = {
+	0x050fc4dcU, 0x7d57ca93U, 0x4dc7d44dU,
+	0xe7ddf7fbU, 0x88d38328U, 0x49533b67U,
+	0xc59f22a7U, 0x9bb11140U, 0x8d299a8eU,
+	0x6c063de4U, 0x92ff097fU, 0xf94dc352U,
+	0x57b4d9a2U, 0x1229ffa7U, 0xc0f95d34U,
+	0x2a519956U, 0x7d908b66U, 0x63dbd80cU,
+	0xb473e63eU, 0x8d297d1cU, 0xa6cce040U,
+	0x2b45f844U, 0xa320872eU, 0xdae6c123U,
+	0x67349c8cU, 0x705b0979U, 0xca9913a5U,
+	0x4ade3b35U, 0xef6cd00dU, 0x4ab1e1f4U,
+	0x43c5e663U, 0x8c21d1bcU, 0x16a7b60dU,
+	0x7a8ff9bfU, 0x1f2a753eU, 0xbf186b91U,
+	0xada26206U, 0xa3c33057U, 0xae3a36a1U,
+	0x7b108392U, 0x99e41531U, 0x3f1ad944U,
+	0xc8138825U, 0xc28949a6U, 0xfaf8876bU,
+	0x9f042196U, 0x68b1d623U, 0x8b5114fdU,
+	0xdf074c46U, 0x12cc86b3U, 0x0a52098fU,
+	0x9d292f9aU, 0xa2f41f12U, 0x43a71ed0U,
+	0x73f0bce6U, 0x70a7e980U, 0x243c6d75U,
+	0xfdb71513U, 0xa67d8a08U, 0xb7e8f148U,
+	0xf7a644eeU, 0x0f1837f2U, 0x4b6694e0U,
+	0xb7bbb3a8U
+};
+#else
+static const hsiphash_key_t test_key_hsiphash =
+	{{ 0x03020100U, 0x07060504U }};
+
+static const u32 test_vectors_hsiphash[64] = {
+	0x5814c896U, 0xe7e864caU, 0xbc4b0e30U,
+	0x01539939U, 0x7e059ea6U, 0x88e3d89bU,
+	0xa0080b65U, 0x9d38d9d6U, 0x577999b1U,
+	0xc839caedU, 0xe4fa32cfU, 0x959246eeU,
+	0x6b28096cU, 0x66dd9cd6U, 0x16658a7cU,
+	0xd0257b04U, 0x8b31d501U, 0x2b1cd04bU,
+	0x06712339U, 0x522aca67U, 0x911bb605U,
+	0x90a65f0eU, 0xf826ef7bU, 0x62512debU,
+	0x57150ad7U, 0x5d473507U, 0x1ec47442U,
+	0xab64afd3U, 0x0a4100d0U, 0x6d2ce652U,
+	0x2331b6a3U, 0x08d8791aU, 0xbc6dda8dU,
+	0xe0f6c934U, 0xb0652033U, 0x9b9851ccU,
+	0x7c46fb7fU, 0x732ba8cbU, 0xf142997aU,
+	0xfcc9aa1bU, 0x05327eb2U, 0xe110131cU,
+	0xf9e5e7c0U, 0xa7d708a6U, 0x11795ab1U,
+	0x65671619U, 0x9f5fff91U, 0xd89c5267U,
+	0x007783ebU, 0x95766243U, 0xab639262U,
+	0x9c7e1390U, 0xc368dda6U, 0x38ddc455U,
+	0xfa13d379U, 0x979ea4e8U, 0x53ecd77eU,
+	0x2ee80657U, 0x33dbb66aU, 0xae3f0577U,
+	0x88b4c4ccU, 0x3e7f480bU, 0x74c1ebf8U,
+	0x87178304U
+};
+#endif
+
 static int __init siphash_test_init(void)
 {
 	u8 in[64] __aligned(SIPHASH_ALIGNMENT);
@@ -70,6 +130,16 @@ static int __init siphash_test_init(void)
 			pr_info("siphash self-test unaligned %u: FAIL\n", i + 1);
 			ret = -EINVAL;
 		}
+		if (hsiphash(in, i, &test_key_hsiphash) !=
+						test_vectors_hsiphash[i]) {
+			pr_info("hsiphash self-test aligned %u: FAIL\n", i + 1);
+			ret = -EINVAL;
+		}
+		if (hsiphash(in_unaligned + 1, i, &test_key_hsiphash) !=
+						test_vectors_hsiphash[i]) {
+			pr_info("hsiphash self-test unaligned %u: FAIL\n", i + 1);
+			ret = -EINVAL;
+		}
 	}
 	if (siphash_1u64(0x0706050403020100ULL, &test_key_siphash) !=
 						test_vectors_siphash[8]) {
@@ -115,6 +185,28 @@ static int __init siphash_test_init(void)
 		pr_info("siphash self-test 4u32: FAIL\n");
 		ret = -EINVAL;
 	}
+	if (hsiphash_1u32(0x03020100U, &test_key_hsiphash) !=
+						test_vectors_hsiphash[4]) {
+		pr_info("hsiphash self-test 1u32: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (hsiphash_2u32(0x03020100U, 0x07060504U, &test_key_hsiphash) !=
+						test_vectors_hsiphash[8]) {
+		pr_info("hsiphash self-test 2u32: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (hsiphash_3u32(0x03020100U, 0x07060504U,
+			  0x0b0a0908U, &test_key_hsiphash) !=
+						test_vectors_hsiphash[12]) {
+		pr_info("hsiphash self-test 3u32: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (hsiphash_4u32(0x03020100U, 0x07060504U,
+			  0x0b0a0908U, 0x0f0e0d0cU, &test_key_hsiphash) !=
+						test_vectors_hsiphash[16]) {
+		pr_info("hsiphash self-test 4u32: FAIL\n");
+		ret = -EINVAL;
+	}
 	if (!ret)
 		pr_info("self-tests: pass\n");
 	return ret;
-- 
cgit v1.2.3


From b4b7b772e8b018286482d8d1fba7804ceac56a64 Mon Sep 17 00:00:00 2001
From: jpinto <Joao.Pinto@synopsys.com>
Date: Mon, 9 Jan 2017 12:35:08 +0000
Subject: stmmac: adding DT parameter for LPI tx clock gating

This patch adds a new parameter to the stmmac DT: snps,en-tx-lpi-clockgating.
It was ported from synopsys/dwc_eth_qos.c and it is useful if lpi tx clock
gating is needed by stmmac users also.

Signed-off-by: Joao Pinto <jpinto@synopsys.com>
Tested-by: Niklas Cassel <niklas.cassel@axis.com>
Reviewed-by: Lars Persson <larper@axis.com>
Acked-by: Alexandre TORGUE <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/devicetree/bindings/net/stmmac.txt      | 2 ++
 drivers/net/ethernet/stmicro/stmmac/common.h          | 3 ++-
 drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c  | 5 ++++-
 drivers/net/ethernet/stmicro/stmmac/dwmac4.h          | 1 +
 drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c     | 6 +++++-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c     | 3 ++-
 drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 3 +++
 include/linux/stmmac.h                                | 1 +
 8 files changed, 20 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/net/stmmac.txt b/Documentation/devicetree/bindings/net/stmmac.txt
index c3d2fd480a1b..d3bfc2b30fb5 100644
--- a/Documentation/devicetree/bindings/net/stmmac.txt
+++ b/Documentation/devicetree/bindings/net/stmmac.txt
@@ -49,6 +49,8 @@ Optional properties:
 - snps,force_sf_dma_mode	Force DMA to use the Store and Forward
 				mode for both tx and rx. This flag is
 				ignored if force_thresh_dma_mode is set.
+- snps,en-tx-lpi-clockgating	Enable gating of the MAC TX clock during
+				TX low-power mode
 - snps,multicast-filter-bins:	Number of multicast filter hash bins
 				supported by this device instance
 - snps,perfect-filter-entries:	Number of perfect filter entries supported
diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index 6c9629138462..75e2666df940 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -476,7 +476,8 @@ struct stmmac_ops {
 			      unsigned int reg_n);
 	void (*get_umac_addr)(struct mac_device_info *hw, unsigned char *addr,
 			      unsigned int reg_n);
-	void (*set_eee_mode)(struct mac_device_info *hw);
+	void (*set_eee_mode)(struct mac_device_info *hw,
+			     bool en_tx_lpi_clockgating);
 	void (*reset_eee_mode)(struct mac_device_info *hw);
 	void (*set_eee_timer)(struct mac_device_info *hw, int ls, int tw);
 	void (*set_eee_pls)(struct mac_device_info *hw, int link);
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
index be3c91c7f211..a5ffca116edd 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
@@ -343,11 +343,14 @@ static int dwmac1000_irq_status(struct mac_device_info *hw,
 	return ret;
 }
 
-static void dwmac1000_set_eee_mode(struct mac_device_info *hw)
+static void dwmac1000_set_eee_mode(struct mac_device_info *hw,
+				   bool en_tx_lpi_clockgating)
 {
 	void __iomem *ioaddr = hw->pcsr;
 	u32 value;
 
+	/*TODO - en_tx_lpi_clockgating treatment */
+
 	/* Enable the link status receive on RGMII, SGMII ore SMII
 	 * receive path and instruct the transmit to enter in LPI
 	 * state.
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
index 73d1dabcdba3..db45134fddf0 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
@@ -98,6 +98,7 @@ enum power_event {
 #define GMAC4_LPI_TIMER_CTRL	0xd4
 
 /* LPI control and status defines */
+#define GMAC4_LPI_CTRL_STATUS_LPITCSE	BIT(21)	/* LPI Tx Clock Stop Enable */
 #define GMAC4_LPI_CTRL_STATUS_LPITXA	BIT(19)	/* Enable LPI TX Automate */
 #define GMAC4_LPI_CTRL_STATUS_PLS	BIT(17) /* PHY Link Status */
 #define GMAC4_LPI_CTRL_STATUS_LPIEN	BIT(16)	/* LPI Enable */
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
index 02eab798050d..834f40f08208 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
@@ -137,7 +137,8 @@ static void dwmac4_get_umac_addr(struct mac_device_info *hw,
 				   GMAC_ADDR_LOW(reg_n));
 }
 
-static void dwmac4_set_eee_mode(struct mac_device_info *hw)
+static void dwmac4_set_eee_mode(struct mac_device_info *hw,
+				bool en_tx_lpi_clockgating)
 {
 	void __iomem *ioaddr = hw->pcsr;
 	u32 value;
@@ -149,6 +150,9 @@ static void dwmac4_set_eee_mode(struct mac_device_info *hw)
 	value = readl(ioaddr + GMAC4_LPI_CTRL_STATUS);
 	value |= GMAC4_LPI_CTRL_STATUS_LPIEN | GMAC4_LPI_CTRL_STATUS_LPITXA;
 
+	if (en_tx_lpi_clockgating)
+		value |= GMAC4_LPI_CTRL_STATUS_LPITCSE;
+
 	writel(value, ioaddr + GMAC4_LPI_CTRL_STATUS);
 }
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 92ac0064a52e..fa0b4de74c3e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -239,7 +239,8 @@ static void stmmac_enable_eee_mode(struct stmmac_priv *priv)
 	/* Check and enter in LPI mode */
 	if ((priv->dirty_tx == priv->cur_tx) &&
 	    (priv->tx_path_in_lpi_mode == false))
-		priv->hw->mac->set_eee_mode(priv->hw);
+		priv->hw->mac->set_eee_mode(priv->hw,
+					    priv->plat->en_tx_lpi_clockgating);
 }
 
 /**
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 60ba8993c650..78ccb50cfa4f 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -248,6 +248,9 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac)
 	plat->force_sf_dma_mode =
 		of_property_read_bool(np, "snps,force_sf_dma_mode");
 
+	plat->en_tx_lpi_clockgating =
+		of_property_read_bool(np, "snps,en-tx-lpi-clockgating");
+
 	/* Set the maxmtu to a default of JUMBO_LEN in case the
 	 * parameter is not present in the device tree.
 	 */
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 889e0e9a3f1c..e3cd7588623d 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -142,5 +142,6 @@ struct plat_stmmacenet_data {
 	int has_gmac4;
 	bool tso_en;
 	int mac_port_sel_speed;
+	bool en_tx_lpi_clockgating;
 };
 #endif
-- 
cgit v1.2.3


From f573c0b9c4e02691cf87736bd0824fd37ec02e65 Mon Sep 17 00:00:00 2001
From: jpinto <Joao.Pinto@synopsys.com>
Date: Mon, 9 Jan 2017 12:35:09 +0000
Subject: stmmac: move stmmac_clk, pclk, clk_ptp_ref and stmmac_rst to platform
 structure

This patch moves stmmac_clk, pclk, clk_ptp_ref and stmmac_rst to the
plat_stmmacenet_data structure. It also moves these platform variables
initialization to stmmac_platform. This was done for two reasons:

a) If PCI is used, platform related code is being executed in stmmac_main
resulting in warnings that have no sense and conceptually was not right

b) stmmac as a synopsys reference ethernet driver stack will be hosting
more and more drivers to its structure like synopsys/dwc_eth_qos.c.
These drivers have their own DT bindings that are not compatible with
stmmac's. One of the most important are the clock names, and so they need
to be parsed in the glue logic and initialized there, and that is the main
reason why the clocks were passed to the platform structure.

Signed-off-by: Joao Pinto <jpinto@synopsys.com>
Tested-by: Niklas Cassel <niklas.cassel@axis.com>
Reviewed-by: Lars Persson <larper@axis.com>
Acked-by: Alexandre TORGUE <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/stmicro/stmmac/dwmac-socfpga.c    |  2 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac.h       |  5 --
 .../net/ethernet/stmicro/stmmac/stmmac_ethtool.c   |  4 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 82 ++++------------------
 .../net/ethernet/stmicro/stmmac/stmmac_platform.c  | 47 +++++++++++++
 include/linux/stmmac.h                             |  5 ++
 6 files changed, 70 insertions(+), 75 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
index 1f997027ae51..17d4bbaeb65c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
@@ -341,7 +341,7 @@ static int socfpga_dwmac_probe(struct platform_device *pdev)
 	 * mode. Create a copy of the core reset handle so it can be used by
 	 * the driver later.
 	 */
-	dwmac->stmmac_rst = stpriv->stmmac_rst;
+	dwmac->stmmac_rst = stpriv->plat->stmmac_rst;
 
 	ret = socfpga_dwmac_set_phy_mode(dwmac);
 	if (ret)
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index eab04aeeeb95..bf8a83ef96f9 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -106,9 +106,6 @@ struct stmmac_priv {
 	u32 msg_enable;
 	int wolopts;
 	int wol_irq;
-	struct clk *stmmac_clk;
-	struct clk *pclk;
-	struct reset_control *stmmac_rst;
 	int clk_csr;
 	struct timer_list eee_ctrl_timer;
 	int lpi_irq;
@@ -120,8 +117,6 @@ struct stmmac_priv {
 	struct ptp_clock *ptp_clock;
 	struct ptp_clock_info ptp_clock_ops;
 	unsigned int default_addend;
-	struct clk *clk_ptp_ref;
-	unsigned int clk_ptp_rate;
 	u32 adv_ts;
 	int use_riwt;
 	int irq_wake;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
index 699ee1d30426..322e5c6a0d4b 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
@@ -712,7 +712,7 @@ static int stmmac_ethtool_op_set_eee(struct net_device *dev,
 
 static u32 stmmac_usec2riwt(u32 usec, struct stmmac_priv *priv)
 {
-	unsigned long clk = clk_get_rate(priv->stmmac_clk);
+	unsigned long clk = clk_get_rate(priv->plat->stmmac_clk);
 
 	if (!clk)
 		return 0;
@@ -722,7 +722,7 @@ static u32 stmmac_usec2riwt(u32 usec, struct stmmac_priv *priv)
 
 static u32 stmmac_riwt2usec(u32 riwt, struct stmmac_priv *priv)
 {
-	unsigned long clk = clk_get_rate(priv->stmmac_clk);
+	unsigned long clk = clk_get_rate(priv->plat->stmmac_clk);
 
 	if (!clk)
 		return 0;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index fa0b4de74c3e..02808e827c93 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -158,7 +158,7 @@ static void stmmac_clk_csr_set(struct stmmac_priv *priv)
 {
 	u32 clk_rate;
 
-	clk_rate = clk_get_rate(priv->stmmac_clk);
+	clk_rate = clk_get_rate(priv->plat->stmmac_clk);
 
 	/* Platform provided default clk_csr would be assumed valid
 	 * for all other cases except for the below mentioned ones.
@@ -607,7 +607,7 @@ static int stmmac_hwtstamp_ioctl(struct net_device *dev, struct ifreq *ifr)
 
 		/* program Sub Second Increment reg */
 		sec_inc = priv->hw->ptp->config_sub_second_increment(
-			priv->ptpaddr, priv->clk_ptp_rate,
+			priv->ptpaddr, priv->plat->clk_ptp_rate,
 			priv->plat->has_gmac4);
 		temp = div_u64(1000000000ULL, sec_inc);
 
@@ -617,7 +617,7 @@ static int stmmac_hwtstamp_ioctl(struct net_device *dev, struct ifreq *ifr)
 		 * where, freq_div_ratio = 1e9ns/sec_inc
 		 */
 		temp = (u64)(temp << 32);
-		priv->default_addend = div_u64(temp, priv->clk_ptp_rate);
+		priv->default_addend = div_u64(temp, priv->plat->clk_ptp_rate);
 		priv->hw->ptp->config_addend(priv->ptpaddr,
 					     priv->default_addend);
 
@@ -645,18 +645,6 @@ static int stmmac_init_ptp(struct stmmac_priv *priv)
 	if (!(priv->dma_cap.time_stamp || priv->dma_cap.atime_stamp))
 		return -EOPNOTSUPP;
 
-	/* Fall-back to main clock in case of no PTP ref is passed */
-	priv->clk_ptp_ref = devm_clk_get(priv->device, "clk_ptp_ref");
-	if (IS_ERR(priv->clk_ptp_ref)) {
-		priv->clk_ptp_rate = clk_get_rate(priv->stmmac_clk);
-		priv->clk_ptp_ref = NULL;
-		netdev_dbg(priv->dev, "PTP uses main clock\n");
-	} else {
-		clk_prepare_enable(priv->clk_ptp_ref);
-		priv->clk_ptp_rate = clk_get_rate(priv->clk_ptp_ref);
-		netdev_dbg(priv->dev, "PTP rate %d\n", priv->clk_ptp_rate);
-	}
-
 	priv->adv_ts = 0;
 	/* Check if adv_ts can be enabled for dwmac 4.x core */
 	if (priv->plat->has_gmac4 && priv->dma_cap.atime_stamp)
@@ -683,8 +671,8 @@ static int stmmac_init_ptp(struct stmmac_priv *priv)
 
 static void stmmac_release_ptp(struct stmmac_priv *priv)
 {
-	if (priv->clk_ptp_ref)
-		clk_disable_unprepare(priv->clk_ptp_ref);
+	if (priv->plat->clk_ptp_ref)
+		clk_disable_unprepare(priv->plat->clk_ptp_ref);
 	stmmac_ptp_unregister(priv);
 }
 
@@ -3278,44 +3266,8 @@ int stmmac_dvr_probe(struct device *device,
 	if ((phyaddr >= 0) && (phyaddr <= 31))
 		priv->plat->phy_addr = phyaddr;
 
-	priv->stmmac_clk = devm_clk_get(priv->device, STMMAC_RESOURCE_NAME);
-	if (IS_ERR(priv->stmmac_clk)) {
-		netdev_warn(priv->dev, "%s: warning: cannot get CSR clock\n",
-			    __func__);
-		/* If failed to obtain stmmac_clk and specific clk_csr value
-		 * is NOT passed from the platform, probe fail.
-		 */
-		if (!priv->plat->clk_csr) {
-			ret = PTR_ERR(priv->stmmac_clk);
-			goto error_clk_get;
-		} else {
-			priv->stmmac_clk = NULL;
-		}
-	}
-	clk_prepare_enable(priv->stmmac_clk);
-
-	priv->pclk = devm_clk_get(priv->device, "pclk");
-	if (IS_ERR(priv->pclk)) {
-		if (PTR_ERR(priv->pclk) == -EPROBE_DEFER) {
-			ret = -EPROBE_DEFER;
-			goto error_pclk_get;
-		}
-		priv->pclk = NULL;
-	}
-	clk_prepare_enable(priv->pclk);
-
-	priv->stmmac_rst = devm_reset_control_get(priv->device,
-						  STMMAC_RESOURCE_NAME);
-	if (IS_ERR(priv->stmmac_rst)) {
-		if (PTR_ERR(priv->stmmac_rst) == -EPROBE_DEFER) {
-			ret = -EPROBE_DEFER;
-			goto error_hw_init;
-		}
-		dev_info(priv->device, "no reset control found\n");
-		priv->stmmac_rst = NULL;
-	}
-	if (priv->stmmac_rst)
-		reset_control_deassert(priv->stmmac_rst);
+	if (priv->plat->stmmac_rst)
+		reset_control_deassert(priv->plat->stmmac_rst);
 
 	/* Init MAC and get the capabilities */
 	ret = stmmac_hw_init(priv);
@@ -3409,10 +3361,6 @@ error_netdev_register:
 error_mdio_register:
 	netif_napi_del(&priv->napi);
 error_hw_init:
-	clk_disable_unprepare(priv->pclk);
-error_pclk_get:
-	clk_disable_unprepare(priv->stmmac_clk);
-error_clk_get:
 	free_netdev(ndev);
 
 	return ret;
@@ -3438,10 +3386,10 @@ int stmmac_dvr_remove(struct device *dev)
 	stmmac_set_mac(priv->ioaddr, false);
 	netif_carrier_off(ndev);
 	unregister_netdev(ndev);
-	if (priv->stmmac_rst)
-		reset_control_assert(priv->stmmac_rst);
-	clk_disable_unprepare(priv->pclk);
-	clk_disable_unprepare(priv->stmmac_clk);
+	if (priv->plat->stmmac_rst)
+		reset_control_assert(priv->plat->stmmac_rst);
+	clk_disable_unprepare(priv->plat->pclk);
+	clk_disable_unprepare(priv->plat->stmmac_clk);
 	if (priv->hw->pcs != STMMAC_PCS_RGMII &&
 	    priv->hw->pcs != STMMAC_PCS_TBI &&
 	    priv->hw->pcs != STMMAC_PCS_RTBI)
@@ -3490,8 +3438,8 @@ int stmmac_suspend(struct device *dev)
 		stmmac_set_mac(priv->ioaddr, false);
 		pinctrl_pm_select_sleep_state(priv->device);
 		/* Disable clock in case of PWM is off */
-		clk_disable(priv->pclk);
-		clk_disable(priv->stmmac_clk);
+		clk_disable(priv->plat->pclk);
+		clk_disable(priv->plat->stmmac_clk);
 	}
 	spin_unlock_irqrestore(&priv->lock, flags);
 
@@ -3531,8 +3479,8 @@ int stmmac_resume(struct device *dev)
 	} else {
 		pinctrl_pm_select_default_state(priv->device);
 		/* enable the clk prevously disabled */
-		clk_enable(priv->stmmac_clk);
-		clk_enable(priv->pclk);
+		clk_enable(priv->plat->stmmac_clk);
+		clk_enable(priv->plat->pclk);
 		/* reset the phy so that it's ready */
 		if (priv->mii)
 			stmmac_mdio_reset(priv->mii);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 78ccb50cfa4f..b489ae47f195 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -335,7 +335,54 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac)
 
 	plat->axi = stmmac_axi_setup(pdev);
 
+	/* clock setup */
+	plat->stmmac_clk = devm_clk_get(&pdev->dev,
+					STMMAC_RESOURCE_NAME);
+	if (IS_ERR(plat->stmmac_clk)) {
+		dev_warn(&pdev->dev, "Cannot get CSR clock\n");
+		plat->stmmac_clk = NULL;
+	}
+	clk_prepare_enable(plat->stmmac_clk);
+
+	plat->pclk = devm_clk_get(&pdev->dev, "pclk");
+	if (IS_ERR(plat->pclk)) {
+		if (PTR_ERR(plat->pclk) == -EPROBE_DEFER)
+			goto error_pclk_get;
+
+		plat->pclk = NULL;
+	}
+	clk_prepare_enable(plat->pclk);
+
+	/* Fall-back to main clock in case of no PTP ref is passed */
+	plat->clk_ptp_ref = devm_clk_get(&pdev->dev, "clk_ptp_ref");
+	if (IS_ERR(plat->clk_ptp_ref)) {
+		plat->clk_ptp_rate = clk_get_rate(plat->stmmac_clk);
+		plat->clk_ptp_ref = NULL;
+		dev_warn(&pdev->dev, "PTP uses main clock\n");
+	} else {
+		clk_prepare_enable(plat->clk_ptp_ref);
+		plat->clk_ptp_rate = clk_get_rate(plat->clk_ptp_ref);
+		dev_info(&pdev->dev, "No reset control found\n");
+	}
+
+	plat->stmmac_rst = devm_reset_control_get(&pdev->dev,
+						  STMMAC_RESOURCE_NAME);
+	if (IS_ERR(plat->stmmac_rst)) {
+		if (PTR_ERR(plat->stmmac_rst) == -EPROBE_DEFER)
+			goto error_hw_init;
+
+		dev_info(&pdev->dev, "no reset control found\n");
+		plat->stmmac_rst = NULL;
+	}
+
 	return plat;
+
+error_hw_init:
+	clk_disable_unprepare(plat->pclk);
+error_pclk_get:
+	clk_disable_unprepare(plat->stmmac_clk);
+
+	return ERR_PTR(-EPROBE_DEFER);
 }
 
 /**
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index e3cd7588623d..d76033d6726d 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -138,6 +138,11 @@ struct plat_stmmacenet_data {
 	int (*init)(struct platform_device *pdev, void *priv);
 	void (*exit)(struct platform_device *pdev, void *priv);
 	void *bsp_priv;
+	struct clk *stmmac_clk;
+	struct clk *pclk;
+	struct clk *clk_ptp_ref;
+	unsigned int clk_ptp_rate;
+	struct reset_control *stmmac_rst;
 	struct stmmac_axi *axi;
 	int has_gmac4;
 	bool tso_en;
-- 
cgit v1.2.3


From ac7138746e14137a451f8539614cdd349153e0c0 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Mon, 9 Jan 2017 16:55:13 +0100
Subject: smc: establish new socket family

* enable smc module loading and unloading
 * register new socket family
 * basic smc socket creation and deletion
 * use backing TCP socket to run CLC (Connection Layer Control)
   handshake of SMC protocol
 * Setup for infiniband traffic is implemented in follow-on patches.
   For now fallback to TCP socket is always used.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Reviewed-by: Utz Bacher <utz.bacher@de.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS            |   7 +
 include/linux/socket.h |   7 +-
 net/Kconfig            |   1 +
 net/Makefile           |   1 +
 net/core/sock.c        |   6 +-
 net/smc/Kconfig        |  11 +
 net/smc/Makefile       |   2 +
 net/smc/af_smc.c       | 620 +++++++++++++++++++++++++++++++++++++++++++++++++
 net/smc/smc.h          |  37 +++
 9 files changed, 688 insertions(+), 4 deletions(-)
 create mode 100644 net/smc/Kconfig
 create mode 100644 net/smc/Makefile
 create mode 100644 net/smc/af_smc.c
 create mode 100644 net/smc/smc.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index d42a37a351f8..c8df0e1ef833 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10850,6 +10850,13 @@ S:	Maintained
 F:	drivers/staging/media/st-cec/
 F:	Documentation/devicetree/bindings/media/stih-cec.txt
 
+SHARED MEMORY COMMUNICATIONS (SMC) SOCKETS
+M:	Ursula Braun <ubraun@linux.vnet.ibm.com>
+L:	linux-s390@vger.kernel.org
+W:	http://www.ibm.com/developerworks/linux/linux390/
+S:	Supported
+F:	net/smc/
+
 SYNOPSYS DESIGNWARE DMAC DRIVER
 M:	Viresh Kumar <vireshk@kernel.org>
 M:	Andy Shevchenko <andriy.shevchenko@linux.intel.com>
diff --git a/include/linux/socket.h b/include/linux/socket.h
index c06438023d79..082027457825 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -202,8 +202,12 @@ struct ucred {
 #define AF_VSOCK	40	/* vSockets			*/
 #define AF_KCM		41	/* Kernel Connection Multiplexor*/
 #define AF_QIPCRTR	42	/* Qualcomm IPC Router          */
+#define AF_SMC		43	/* smc sockets: reserve number for
+				 * PF_SMC protocol family that
+				 * reuses AF_INET address family
+				 */
 
-#define AF_MAX		43	/* For now.. */
+#define AF_MAX		44	/* For now.. */
 
 /* Protocol families, same as address families. */
 #define PF_UNSPEC	AF_UNSPEC
@@ -251,6 +255,7 @@ struct ucred {
 #define PF_VSOCK	AF_VSOCK
 #define PF_KCM		AF_KCM
 #define PF_QIPCRTR	AF_QIPCRTR
+#define PF_SMC		AF_SMC
 #define PF_MAX		AF_MAX
 
 /* Maximum queue length specifiable by listen.  */
diff --git a/net/Kconfig b/net/Kconfig
index a1005007224c..2e9ee61822e2 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -57,6 +57,7 @@ source "net/packet/Kconfig"
 source "net/unix/Kconfig"
 source "net/xfrm/Kconfig"
 source "net/iucv/Kconfig"
+source "net/smc/Kconfig"
 
 config INET
 	bool "TCP/IP networking"
diff --git a/net/Makefile b/net/Makefile
index 4cafaa2b4667..5d6e0e5ff7f8 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -51,6 +51,7 @@ obj-$(CONFIG_MAC80211)		+= mac80211/
 obj-$(CONFIG_TIPC)		+= tipc/
 obj-$(CONFIG_NETLABEL)		+= netlabel/
 obj-$(CONFIG_IUCV)		+= iucv/
+obj-$(CONFIG_SMC)		+= smc/
 obj-$(CONFIG_RFKILL)		+= rfkill/
 obj-$(CONFIG_NET_9P)		+= 9p/
 obj-$(CONFIG_CAIF)		+= caif/
diff --git a/net/core/sock.c b/net/core/sock.c
index 5018703ee2c2..31f72f3a3559 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -222,7 +222,7 @@ static const char *const af_family_key_strings[AF_MAX+1] = {
   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
   "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_KCM"      ,
-  "sk_lock-AF_MAX"
+  "sk_lock-AF_SMC"   , "sk_lock-AF_MAX"
 };
 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
@@ -239,7 +239,7 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = {
   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
   "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_KCM"       ,
-  "slock-AF_MAX"
+  "slock-AF_SMC"   , "slock-AF_MAX"
 };
 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
@@ -256,7 +256,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = {
   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
   "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_KCM"      ,
-  "clock-AF_MAX"
+  "closck-AF_smc"  , "clock-AF_MAX"
 };
 
 /*
diff --git a/net/smc/Kconfig b/net/smc/Kconfig
new file mode 100644
index 000000000000..bc029803e728
--- /dev/null
+++ b/net/smc/Kconfig
@@ -0,0 +1,11 @@
+config SMC
+	tristate "SMC socket protocol family"
+	depends on INET && INFINIBAND
+	---help---
+	  SMC-R provides a "sockets over RDMA" solution making use of
+	  RDMA over Converged Ethernet (RoCE) technology to upgrade
+	  AF_INET TCP connections transparently.
+	  The Linux implementation of the SMC-R solution is designed as
+	  a separate socket family SMC.
+
+	  Select this option if you want to run SMC socket applications
diff --git a/net/smc/Makefile b/net/smc/Makefile
new file mode 100644
index 000000000000..c285c868dd82
--- /dev/null
+++ b/net/smc/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_SMC)	+= smc.o
+smc-y := af_smc.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
new file mode 100644
index 000000000000..7fd773fc6238
--- /dev/null
+++ b/net/smc/af_smc.c
@@ -0,0 +1,620 @@
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
+ *  applies to SOCK_STREAM sockets only
+ *  offers an alternative communication option for TCP-protocol sockets
+ *  applicable with RoCE-cards only
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ *              based on prototype from Frank Blaschka
+ */
+
+#define KMSG_COMPONENT "smc"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <net/sock.h>
+
+#include "smc.h"
+
+static void smc_set_keepalive(struct sock *sk, int val)
+{
+	struct smc_sock *smc = smc_sk(sk);
+
+	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
+}
+
+static struct proto smc_proto = {
+	.name		= "SMC",
+	.owner		= THIS_MODULE,
+	.keepalive	= smc_set_keepalive,
+	.obj_size	= sizeof(struct smc_sock),
+	.slab_flags	= SLAB_DESTROY_BY_RCU,
+};
+
+static int smc_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+
+	if (!sk)
+		goto out;
+
+	smc = smc_sk(sk);
+	lock_sock(sk);
+
+	sk->sk_state = SMC_CLOSED;
+	if (smc->clcsock) {
+		sock_release(smc->clcsock);
+		smc->clcsock = NULL;
+	}
+
+	/* detach socket */
+	sock_orphan(sk);
+	sock->sk = NULL;
+	release_sock(sk);
+
+	sock_put(sk);
+out:
+	return 0;
+}
+
+static void smc_destruct(struct sock *sk)
+{
+	if (sk->sk_state != SMC_CLOSED)
+		return;
+	if (!sock_flag(sk, SOCK_DEAD))
+		return;
+
+	sk_refcnt_debug_dec(sk);
+}
+
+static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
+{
+	struct smc_sock *smc;
+	struct sock *sk;
+
+	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
+	if (!sk)
+		return NULL;
+
+	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
+	sk->sk_state = SMC_INIT;
+	sk->sk_destruct = smc_destruct;
+	sk->sk_protocol = SMCPROTO_SMC;
+	sk_refcnt_debug_inc(sk);
+
+	smc = smc_sk(sk);
+
+	return sk;
+}
+
+static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
+		    int addr_len)
+{
+	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int rc;
+
+	smc = smc_sk(sk);
+
+	/* replicate tests from inet_bind(), to be safe wrt. future changes */
+	rc = -EINVAL;
+	if (addr_len < sizeof(struct sockaddr_in))
+		goto out;
+
+	rc = -EAFNOSUPPORT;
+	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
+	if ((addr->sin_family != AF_INET) &&
+	    ((addr->sin_family != AF_UNSPEC) ||
+	     (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
+		goto out;
+
+	lock_sock(sk);
+
+	/* Check if socket is already active */
+	rc = -EINVAL;
+	if (sk->sk_state != SMC_INIT)
+		goto out_rel;
+
+	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
+	rc = kernel_bind(smc->clcsock, uaddr, addr_len);
+
+out_rel:
+	release_sock(sk);
+out:
+	return rc;
+}
+
+static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
+				   unsigned long mask)
+{
+	/* options we don't get control via setsockopt for */
+	nsk->sk_type = osk->sk_type;
+	nsk->sk_sndbuf = osk->sk_sndbuf;
+	nsk->sk_rcvbuf = osk->sk_rcvbuf;
+	nsk->sk_sndtimeo = osk->sk_sndtimeo;
+	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
+	nsk->sk_mark = osk->sk_mark;
+	nsk->sk_priority = osk->sk_priority;
+	nsk->sk_rcvlowat = osk->sk_rcvlowat;
+	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
+	nsk->sk_err = osk->sk_err;
+
+	nsk->sk_flags &= ~mask;
+	nsk->sk_flags |= osk->sk_flags & mask;
+}
+
+#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
+			     (1UL << SOCK_KEEPOPEN) | \
+			     (1UL << SOCK_LINGER) | \
+			     (1UL << SOCK_BROADCAST) | \
+			     (1UL << SOCK_TIMESTAMP) | \
+			     (1UL << SOCK_DBG) | \
+			     (1UL << SOCK_RCVTSTAMP) | \
+			     (1UL << SOCK_RCVTSTAMPNS) | \
+			     (1UL << SOCK_LOCALROUTE) | \
+			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
+			     (1UL << SOCK_RXQ_OVFL) | \
+			     (1UL << SOCK_WIFI_STATUS) | \
+			     (1UL << SOCK_NOFCS) | \
+			     (1UL << SOCK_FILTER_LOCKED))
+/* copy only relevant settings and flags of SOL_SOCKET level from smc to
+ * clc socket (since smc is not called for these options from net/core)
+ */
+static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
+{
+	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
+}
+
+#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
+			     (1UL << SOCK_KEEPOPEN) | \
+			     (1UL << SOCK_LINGER) | \
+			     (1UL << SOCK_DBG))
+/* copy only settings and flags relevant for smc from clc to smc socket */
+static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
+{
+	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
+}
+
+static int smc_connect(struct socket *sock, struct sockaddr *addr,
+		       int alen, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int rc = -EINVAL;
+
+	smc = smc_sk(sk);
+
+	/* separate smc parameter checking to be safe */
+	if (alen < sizeof(addr->sa_family))
+		goto out_err;
+	if (addr->sa_family != AF_INET)
+		goto out_err;
+
+	lock_sock(sk);
+	switch (sk->sk_state) {
+	default:
+		goto out;
+	case SMC_ACTIVE:
+		rc = -EISCONN;
+		goto out;
+	case SMC_INIT:
+		rc = 0;
+		break;
+	}
+
+	smc_copy_sock_settings_to_clc(smc);
+	rc = kernel_connect(smc->clcsock, addr, alen, flags);
+	if (rc)
+		goto out;
+
+	sk->sk_state = SMC_ACTIVE;
+
+	/* always use TCP fallback as transport mechanism for now;
+	 * This will change once RDMA transport is implemented
+	 */
+	smc->use_fallback = true;
+
+out:
+	release_sock(sk);
+out_err:
+	return rc;
+}
+
+static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
+{
+	struct sock *sk = &lsmc->sk;
+	struct socket *new_clcsock;
+	struct sock *new_sk;
+	int rc;
+
+	new_sk = smc_sock_alloc(sock_net(sk), NULL);
+	if (!new_sk) {
+		rc = -ENOMEM;
+		lsmc->sk.sk_err = ENOMEM;
+		*new_smc = NULL;
+		goto out;
+	}
+	*new_smc = smc_sk(new_sk);
+
+	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
+	if (rc) {
+		sock_put(new_sk);
+		*new_smc = NULL;
+		goto out;
+	}
+
+	(*new_smc)->clcsock = new_clcsock;
+out:
+	return rc;
+}
+
+static int smc_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int rc;
+
+	smc = smc_sk(sk);
+	lock_sock(sk);
+
+	rc = -EINVAL;
+	if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
+		goto out;
+
+	rc = 0;
+	if (sk->sk_state == SMC_LISTEN) {
+		sk->sk_max_ack_backlog = backlog;
+		goto out;
+	}
+	/* some socket options are handled in core, so we could not apply
+	 * them to the clc socket -- copy smc socket options to clc socket
+	 */
+	smc_copy_sock_settings_to_clc(smc);
+
+	rc = kernel_listen(smc->clcsock, backlog);
+	if (rc)
+		goto out;
+	sk->sk_max_ack_backlog = backlog;
+	sk->sk_ack_backlog = 0;
+	sk->sk_state = SMC_LISTEN;
+
+out:
+	release_sock(sk);
+	return rc;
+}
+
+static int smc_accept(struct socket *sock, struct socket *new_sock,
+		      int flags)
+{
+	struct smc_sock *new_smc;
+	struct sock *sk = sock->sk;
+	struct smc_sock *lsmc;
+	int rc;
+
+	lsmc = smc_sk(sk);
+	lock_sock(sk);
+
+	if (lsmc->sk.sk_state != SMC_LISTEN) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	rc = smc_clcsock_accept(lsmc, &new_smc);
+	if (rc)
+		goto out;
+	sock_graft(&new_smc->sk, new_sock);
+	new_smc->sk.sk_state = SMC_ACTIVE;
+
+	smc_copy_sock_settings_to_smc(new_smc);
+
+	/* always use TCP fallback as transport mechanism for now;
+	 * This will change once RDMA transport is implemented
+	 */
+	new_smc->use_fallback = true;
+
+out:
+	release_sock(sk);
+	return rc;
+}
+
+static int smc_getname(struct socket *sock, struct sockaddr *addr,
+		       int *len, int peer)
+{
+	struct smc_sock *smc;
+
+	if (peer && (sock->sk->sk_state != SMC_ACTIVE))
+		return -ENOTCONN;
+
+	smc = smc_sk(sock->sk);
+
+	return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer);
+}
+
+static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int rc = -EPIPE;
+
+	smc = smc_sk(sk);
+	lock_sock(sk);
+	if (sk->sk_state != SMC_ACTIVE)
+		goto out;
+	if (smc->use_fallback)
+		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
+	else
+		rc = sock_no_sendmsg(sock, msg, len);
+out:
+	release_sock(sk);
+	return rc;
+}
+
+static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+		       int flags)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int rc = -ENOTCONN;
+
+	smc = smc_sk(sk);
+	lock_sock(sk);
+	if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
+		goto out;
+
+	if (smc->use_fallback)
+		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
+	else
+		rc = sock_no_recvmsg(sock, msg, len, flags);
+out:
+	release_sock(sk);
+	return rc;
+}
+
+static unsigned int smc_poll(struct file *file, struct socket *sock,
+			     poll_table *wait)
+{
+	struct sock *sk = sock->sk;
+	unsigned int mask = 0;
+	struct smc_sock *smc;
+
+	smc = smc_sk(sock->sk);
+	if ((sk->sk_state == SMC_INIT) || (sk->sk_state == SMC_LISTEN) ||
+	    smc->use_fallback) {
+		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
+		/* if non-blocking connect finished ... */
+		lock_sock(sk);
+		if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) {
+			sk->sk_state = SMC_ACTIVE;
+			/* always use TCP fallback as transport mechanism;
+			 * This will change once RDMA transport is implemented
+			 */
+			smc->use_fallback = true;
+		}
+		release_sock(sk);
+	} else {
+		mask = sock_no_poll(file, sock, wait);
+	}
+
+	return mask;
+}
+
+static int smc_shutdown(struct socket *sock, int how)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int rc = -EINVAL;
+
+	smc = smc_sk(sk);
+
+	if ((how < SHUT_RD) || (how > SHUT_RDWR))
+		goto out_err;
+
+	lock_sock(sk);
+
+	rc = -ENOTCONN;
+	if (sk->sk_state == SMC_CLOSED)
+		goto out;
+	if (smc->use_fallback) {
+		rc = kernel_sock_shutdown(smc->clcsock, how);
+		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
+		if (sk->sk_shutdown == SHUTDOWN_MASK)
+			sk->sk_state = SMC_CLOSED;
+	} else {
+		rc = sock_no_shutdown(sock, how);
+	}
+
+out:
+	release_sock(sk);
+
+out_err:
+	return rc;
+}
+
+static int smc_setsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+
+	smc = smc_sk(sk);
+
+	/* generic setsockopts reaching us here always apply to the
+	 * CLC socket
+	 */
+	return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
+					     optval, optlen);
+}
+
+static int smc_getsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	struct smc_sock *smc;
+
+	smc = smc_sk(sock->sk);
+	/* socket options apply to the CLC socket */
+	return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
+					     optval, optlen);
+}
+
+static int smc_ioctl(struct socket *sock, unsigned int cmd,
+		     unsigned long arg)
+{
+	struct smc_sock *smc;
+
+	smc = smc_sk(sock->sk);
+	if (smc->use_fallback)
+		return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
+	else
+		return sock_no_ioctl(sock, cmd, arg);
+}
+
+static ssize_t smc_sendpage(struct socket *sock, struct page *page,
+			    int offset, size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int rc = -EPIPE;
+
+	smc = smc_sk(sk);
+	lock_sock(sk);
+	if (sk->sk_state != SMC_ACTIVE)
+		goto out;
+	if (smc->use_fallback)
+		rc = kernel_sendpage(smc->clcsock, page, offset,
+				     size, flags);
+	else
+		rc = sock_no_sendpage(sock, page, offset, size, flags);
+
+out:
+	release_sock(sk);
+	return rc;
+}
+
+static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
+			       struct pipe_inode_info *pipe, size_t len,
+				    unsigned int flags)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int rc = -ENOTCONN;
+
+	smc = smc_sk(sk);
+	lock_sock(sk);
+	if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
+		goto out;
+	if (smc->use_fallback) {
+		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
+						    pipe, len, flags);
+	} else {
+		rc = -EOPNOTSUPP;
+	}
+out:
+	release_sock(sk);
+	return rc;
+}
+
+/* must look like tcp */
+static const struct proto_ops smc_sock_ops = {
+	.family		= PF_SMC,
+	.owner		= THIS_MODULE,
+	.release	= smc_release,
+	.bind		= smc_bind,
+	.connect	= smc_connect,
+	.socketpair	= sock_no_socketpair,
+	.accept		= smc_accept,
+	.getname	= smc_getname,
+	.poll		= smc_poll,
+	.ioctl		= smc_ioctl,
+	.listen		= smc_listen,
+	.shutdown	= smc_shutdown,
+	.setsockopt	= smc_setsockopt,
+	.getsockopt	= smc_getsockopt,
+	.sendmsg	= smc_sendmsg,
+	.recvmsg	= smc_recvmsg,
+	.mmap		= sock_no_mmap,
+	.sendpage	= smc_sendpage,
+	.splice_read	= smc_splice_read,
+};
+
+static int smc_create(struct net *net, struct socket *sock, int protocol,
+		      int kern)
+{
+	struct smc_sock *smc;
+	struct sock *sk;
+	int rc;
+
+	rc = -ESOCKTNOSUPPORT;
+	if (sock->type != SOCK_STREAM)
+		goto out;
+
+	rc = -EPROTONOSUPPORT;
+	if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
+		goto out;
+
+	rc = -ENOBUFS;
+	sock->ops = &smc_sock_ops;
+	sk = smc_sock_alloc(net, sock);
+	if (!sk)
+		goto out;
+
+	/* create internal TCP socket for CLC handshake and fallback */
+	smc = smc_sk(sk);
+	rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
+			      IPPROTO_TCP, &smc->clcsock);
+	if (rc)
+		sk_common_release(sk);
+
+out:
+	return rc;
+}
+
+static const struct net_proto_family smc_sock_family_ops = {
+	.family	= PF_SMC,
+	.owner	= THIS_MODULE,
+	.create	= smc_create,
+};
+
+static int __init smc_init(void)
+{
+	int rc;
+
+	rc = proto_register(&smc_proto, 1);
+	if (rc) {
+		pr_err("%s: proto_register fails with %d\n", __func__, rc);
+		goto out;
+	}
+
+	rc = sock_register(&smc_sock_family_ops);
+	if (rc) {
+		pr_err("%s: sock_register fails with %d\n", __func__, rc);
+		goto out_proto;
+	}
+
+	return 0;
+
+out_proto:
+	proto_unregister(&smc_proto);
+out:
+	return rc;
+}
+
+static void __exit smc_exit(void)
+{
+	sock_unregister(PF_SMC);
+	proto_unregister(&smc_proto);
+}
+
+module_init(smc_init);
+module_exit(smc_exit);
+
+MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
+MODULE_DESCRIPTION("smc socket address family");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_SMC);
diff --git a/net/smc/smc.h b/net/smc/smc.h
new file mode 100644
index 000000000000..fcea7399e2eb
--- /dev/null
+++ b/net/smc/smc.h
@@ -0,0 +1,37 @@
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  Definitions for the SMC module (socket related)
+ *
+ *  Copyright IBM Corp. 2016
+ *
+ *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+#ifndef __SMC_H
+#define __SMC_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+#include <net/sock.h>
+
+#define SMCPROTO_SMC		0	/* SMC protocol */
+
+enum smc_state {		/* possible states of an SMC socket */
+	SMC_ACTIVE	= 1,
+	SMC_INIT	= 2,
+	SMC_CLOSED	= 7,
+	SMC_LISTEN	= 10,
+};
+
+struct smc_sock {				/* smc sock container */
+	struct sock		sk;
+	struct socket		*clcsock;	/* internal tcp socket */
+	bool			use_fallback;	/* fallback to tcp */
+};
+
+static inline struct smc_sock *smc_sk(const struct sock *sk)
+{
+	return (struct smc_sock *)sk;
+}
+
+#endif	/* __SMC_H */
-- 
cgit v1.2.3


From 39f19ebbf57b403695f7b5f9cf322fe1ddb5d7fb Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 9 Jan 2017 10:19:50 -0800
Subject: bpf: rename ARG_PTR_TO_STACK

since ARG_PTR_TO_STACK is no longer just pointer to stack
rename it to ARG_PTR_TO_MEM and adjust comment.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      | 12 ++++++------
 kernel/bpf/helpers.c     |  4 ++--
 kernel/bpf/verifier.c    | 28 ++++++++++++++--------------
 kernel/trace/bpf_trace.c | 20 ++++++++++----------
 net/core/filter.c        | 40 ++++++++++++++++++++--------------------
 5 files changed, 52 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f74ae68086dc..94ea8d2383e6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -69,14 +69,14 @@ enum bpf_arg_type {
 	/* the following constraints used to prototype bpf_memcmp() and other
 	 * functions that access data on eBPF program stack
 	 */
-	ARG_PTR_TO_STACK,	/* any pointer to eBPF program stack */
-	ARG_PTR_TO_RAW_STACK,	/* any pointer to eBPF program stack, area does not
-				 * need to be initialized, helper function must fill
-				 * all bytes or clear them in error case.
+	ARG_PTR_TO_MEM,		/* pointer to valid memory (stack, packet, map value) */
+	ARG_PTR_TO_UNINIT_MEM,	/* pointer to memory does not need to be initialized,
+				 * helper function must fill all bytes or clear
+				 * them in error case.
 				 */
 
-	ARG_CONST_STACK_SIZE,	/* number of bytes accessed from stack */
-	ARG_CONST_STACK_SIZE_OR_ZERO, /* number of bytes accessed from stack or 0 */
+	ARG_CONST_SIZE,		/* number of bytes accessed from memory */
+	ARG_CONST_SIZE_OR_ZERO,	/* number of bytes accessed from memory or 0 */
 
 	ARG_PTR_TO_CTX,		/* pointer to context */
 	ARG_ANYTHING,		/* any (initialized) argument is ok */
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 045cbe673356..3d24e238221e 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -176,6 +176,6 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
 	.func		= bpf_get_current_comm,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_RAW_STACK,
-	.arg2_type	= ARG_CONST_STACK_SIZE,
+	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg2_type	= ARG_CONST_SIZE,
 };
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3d4f7bf32aaf..2efdc9128e3c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1034,8 +1034,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		expected_type = PTR_TO_STACK;
 		if (type != PTR_TO_PACKET && type != expected_type)
 			goto err_type;
-	} else if (arg_type == ARG_CONST_STACK_SIZE ||
-		   arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
+	} else if (arg_type == ARG_CONST_SIZE ||
+		   arg_type == ARG_CONST_SIZE_OR_ZERO) {
 		expected_type = CONST_IMM;
 		/* One exception. Allow UNKNOWN_VALUE registers when the
 		 * boundaries are known and don't cause unsafe memory accesses
@@ -1050,8 +1050,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		expected_type = PTR_TO_CTX;
 		if (type != expected_type)
 			goto err_type;
-	} else if (arg_type == ARG_PTR_TO_STACK ||
-		   arg_type == ARG_PTR_TO_RAW_STACK) {
+	} else if (arg_type == ARG_PTR_TO_MEM ||
+		   arg_type == ARG_PTR_TO_UNINIT_MEM) {
 		expected_type = PTR_TO_STACK;
 		/* One exception here. In case function allows for NULL to be
 		 * passed in as argument, it's a CONST_IMM type. Final test
@@ -1062,7 +1062,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE &&
 			 type != PTR_TO_MAP_VALUE_ADJ && type != expected_type)
 			goto err_type;
-		meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK;
+		meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
 	} else {
 		verbose("unsupported arg_type %d\n", arg_type);
 		return -EFAULT;
@@ -1108,9 +1108,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			err = check_stack_boundary(env, regno,
 						   meta->map_ptr->value_size,
 						   false, NULL);
-	} else if (arg_type == ARG_CONST_STACK_SIZE ||
-		   arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
-		bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO);
+	} else if (arg_type == ARG_CONST_SIZE ||
+		   arg_type == ARG_CONST_SIZE_OR_ZERO) {
+		bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
 
 		/* bpf_xxx(..., buf, len) call will access 'len' bytes
 		 * from stack pointer 'buf'. Check it
@@ -1118,7 +1118,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		 */
 		if (regno == 0) {
 			/* kernel subsystem misconfigured verifier */
-			verbose("ARG_CONST_STACK_SIZE cannot be first argument\n");
+			verbose("ARG_CONST_SIZE cannot be first argument\n");
 			return -EACCES;
 		}
 
@@ -1235,15 +1235,15 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
 {
 	int count = 0;
 
-	if (fn->arg1_type == ARG_PTR_TO_RAW_STACK)
+	if (fn->arg1_type == ARG_PTR_TO_UNINIT_MEM)
 		count++;
-	if (fn->arg2_type == ARG_PTR_TO_RAW_STACK)
+	if (fn->arg2_type == ARG_PTR_TO_UNINIT_MEM)
 		count++;
-	if (fn->arg3_type == ARG_PTR_TO_RAW_STACK)
+	if (fn->arg3_type == ARG_PTR_TO_UNINIT_MEM)
 		count++;
-	if (fn->arg4_type == ARG_PTR_TO_RAW_STACK)
+	if (fn->arg4_type == ARG_PTR_TO_UNINIT_MEM)
 		count++;
-	if (fn->arg5_type == ARG_PTR_TO_RAW_STACK)
+	if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM)
 		count++;
 
 	return count > 1 ? -EINVAL : 0;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index fa77311dadb2..f883c43c96f3 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -76,8 +76,8 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
 	.func		= bpf_probe_read,
 	.gpl_only	= true,
 	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_RAW_STACK,
-	.arg2_type	= ARG_CONST_STACK_SIZE,
+	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg2_type	= ARG_CONST_SIZE,
 	.arg3_type	= ARG_ANYTHING,
 };
 
@@ -109,8 +109,8 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = {
 	.gpl_only	= true,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_ANYTHING,
-	.arg2_type	= ARG_PTR_TO_STACK,
-	.arg3_type	= ARG_CONST_STACK_SIZE,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
 };
 
 static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
@@ -213,8 +213,8 @@ static const struct bpf_func_proto bpf_trace_printk_proto = {
 	.func		= bpf_trace_printk,
 	.gpl_only	= true,
 	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_STACK,
-	.arg2_type	= ARG_CONST_STACK_SIZE,
+	.arg1_type	= ARG_PTR_TO_MEM,
+	.arg2_type	= ARG_CONST_SIZE,
 };
 
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
@@ -329,8 +329,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
 	.arg1_type	= ARG_PTR_TO_CTX,
 	.arg2_type	= ARG_CONST_MAP_PTR,
 	.arg3_type	= ARG_ANYTHING,
-	.arg4_type	= ARG_PTR_TO_STACK,
-	.arg5_type	= ARG_CONST_STACK_SIZE,
+	.arg4_type	= ARG_PTR_TO_MEM,
+	.arg5_type	= ARG_CONST_SIZE,
 };
 
 static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
@@ -492,8 +492,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
 	.arg1_type	= ARG_PTR_TO_CTX,
 	.arg2_type	= ARG_CONST_MAP_PTR,
 	.arg3_type	= ARG_ANYTHING,
-	.arg4_type	= ARG_PTR_TO_STACK,
-	.arg5_type	= ARG_CONST_STACK_SIZE,
+	.arg4_type	= ARG_PTR_TO_MEM,
+	.arg5_type	= ARG_CONST_SIZE,
 };
 
 BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map,
diff --git a/net/core/filter.c b/net/core/filter.c
index 1969b3f118c1..f4d16a905754 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1416,8 +1416,8 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_CTX,
 	.arg2_type	= ARG_ANYTHING,
-	.arg3_type	= ARG_PTR_TO_STACK,
-	.arg4_type	= ARG_CONST_STACK_SIZE,
+	.arg3_type	= ARG_PTR_TO_MEM,
+	.arg4_type	= ARG_CONST_SIZE,
 	.arg5_type	= ARG_ANYTHING,
 };
 
@@ -1447,8 +1447,8 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_CTX,
 	.arg2_type	= ARG_ANYTHING,
-	.arg3_type	= ARG_PTR_TO_RAW_STACK,
-	.arg4_type	= ARG_CONST_STACK_SIZE,
+	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg4_type	= ARG_CONST_SIZE,
 };
 
 BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
@@ -1601,10 +1601,10 @@ static const struct bpf_func_proto bpf_csum_diff_proto = {
 	.gpl_only	= false,
 	.pkt_access	= true,
 	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_STACK,
-	.arg2_type	= ARG_CONST_STACK_SIZE_OR_ZERO,
-	.arg3_type	= ARG_PTR_TO_STACK,
-	.arg4_type	= ARG_CONST_STACK_SIZE_OR_ZERO,
+	.arg1_type	= ARG_PTR_TO_MEM,
+	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg3_type	= ARG_PTR_TO_MEM,
+	.arg4_type	= ARG_CONST_SIZE_OR_ZERO,
 	.arg5_type	= ARG_ANYTHING,
 };
 
@@ -2306,8 +2306,8 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = {
 	.arg1_type	= ARG_PTR_TO_CTX,
 	.arg2_type	= ARG_CONST_MAP_PTR,
 	.arg3_type	= ARG_ANYTHING,
-	.arg4_type	= ARG_PTR_TO_STACK,
-	.arg5_type	= ARG_CONST_STACK_SIZE,
+	.arg4_type	= ARG_PTR_TO_MEM,
+	.arg5_type	= ARG_CONST_SIZE,
 };
 
 static unsigned short bpf_tunnel_key_af(u64 flags)
@@ -2377,8 +2377,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type	= ARG_PTR_TO_RAW_STACK,
-	.arg3_type	= ARG_CONST_STACK_SIZE,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
 	.arg4_type	= ARG_ANYTHING,
 };
 
@@ -2412,8 +2412,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type	= ARG_PTR_TO_RAW_STACK,
-	.arg3_type	= ARG_CONST_STACK_SIZE,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
 };
 
 static struct metadata_dst __percpu *md_dst;
@@ -2483,8 +2483,8 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type	= ARG_PTR_TO_STACK,
-	.arg3_type	= ARG_CONST_STACK_SIZE,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
 	.arg4_type	= ARG_ANYTHING,
 };
 
@@ -2509,8 +2509,8 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type	= ARG_PTR_TO_STACK,
-	.arg3_type	= ARG_CONST_STACK_SIZE,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
 };
 
 static const struct bpf_func_proto *
@@ -2593,8 +2593,8 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = {
 	.arg1_type	= ARG_PTR_TO_CTX,
 	.arg2_type	= ARG_CONST_MAP_PTR,
 	.arg3_type	= ARG_ANYTHING,
-	.arg4_type	= ARG_PTR_TO_STACK,
-	.arg5_type	= ARG_CONST_STACK_SIZE,
+	.arg4_type	= ARG_PTR_TO_MEM,
+	.arg5_type	= ARG_CONST_SIZE,
 };
 
 static const struct bpf_func_proto *
-- 
cgit v1.2.3


From aecb57da7ae9df1f9c2eb11788888b135ddc0203 Mon Sep 17 00:00:00 2001
From: Vesa Jääskeläinen <vesa.jaaskelainen@vaisala.com>
Date: Fri, 23 Dec 2016 13:15:58 +0200
Subject: rtc: tps65910: Add RTC calibration support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Texas Instrument's TPS65910 has support for compensating RTC crystal
inaccuracies. When enabled every hour RTC counter value will be compensated
with two's complement value.

Signed-off-by: Vesa Jääskeläinen <vesa.jaaskelainen@vaisala.com>
Acked-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Alexandre Belloni <alexandre.belloni@free-electrons.com>
---
 drivers/rtc/rtc-tps65910.c   | 143 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/tps65910.h |   1 +
 2 files changed, 144 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/rtc/rtc-tps65910.c b/drivers/rtc/rtc-tps65910.c
index 5a3d53caa485..9dd1db9a12af 100644
--- a/drivers/rtc/rtc-tps65910.c
+++ b/drivers/rtc/rtc-tps65910.c
@@ -21,6 +21,7 @@
 #include <linux/types.h>
 #include <linux/rtc.h>
 #include <linux/bcd.h>
+#include <linux/math64.h>
 #include <linux/platform_device.h>
 #include <linux/interrupt.h>
 #include <linux/mfd/tps65910.h>
@@ -33,6 +34,19 @@ struct tps65910_rtc {
 /* Total number of RTC registers needed to set time*/
 #define NUM_TIME_REGS	(TPS65910_YEARS - TPS65910_SECONDS + 1)
 
+/* Total number of RTC registers needed to set compensation registers */
+#define NUM_COMP_REGS	(TPS65910_RTC_COMP_MSB - TPS65910_RTC_COMP_LSB + 1)
+
+/* Min and max values supported with 'offset' interface (swapped sign) */
+#define MIN_OFFSET	(-277761)
+#define MAX_OFFSET	(277778)
+
+/* Number of ticks per hour */
+#define TICKS_PER_HOUR	(32768 * 3600)
+
+/* Multiplier for ppb conversions */
+#define PPB_MULT	(1000000000LL)
+
 static int tps65910_rtc_alarm_irq_enable(struct device *dev, unsigned enabled)
 {
 	struct tps65910 *tps = dev_get_drvdata(dev->parent);
@@ -187,6 +201,133 @@ static int tps65910_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm)
 	return ret;
 }
 
+static int tps65910_rtc_set_calibration(struct device *dev, int calibration)
+{
+	unsigned char comp_data[NUM_COMP_REGS];
+	struct tps65910 *tps = dev_get_drvdata(dev->parent);
+	s16 value;
+	int ret;
+
+	/*
+	 * TPS65910 uses two's complement 16 bit value for compensation for RTC
+	 * crystal inaccuracies. One time every hour when seconds counter
+	 * increments from 0 to 1 compensation value will be added to internal
+	 * RTC counter value.
+	 *
+	 * Compensation value 0x7FFF is prohibited value.
+	 *
+	 * Valid range for compensation value: [-32768 .. 32766]
+	 */
+	if ((calibration < -32768) || (calibration > 32766)) {
+		dev_err(dev, "RTC calibration value out of range: %d\n",
+			calibration);
+		return -EINVAL;
+	}
+
+	value = (s16)calibration;
+
+	comp_data[0] = (u16)value & 0xFF;
+	comp_data[1] = ((u16)value >> 8) & 0xFF;
+
+	/* Update all the compensation registers in one shot */
+	ret = regmap_bulk_write(tps->regmap, TPS65910_RTC_COMP_LSB,
+		comp_data, NUM_COMP_REGS);
+	if (ret < 0) {
+		dev_err(dev, "rtc_set_calibration error: %d\n", ret);
+		return ret;
+	}
+
+	/* Enable automatic compensation */
+	ret = regmap_update_bits(tps->regmap, TPS65910_RTC_CTRL,
+		TPS65910_RTC_CTRL_AUTO_COMP, TPS65910_RTC_CTRL_AUTO_COMP);
+	if (ret < 0)
+		dev_err(dev, "auto_comp enable failed with error: %d\n", ret);
+
+	return ret;
+}
+
+static int tps65910_rtc_get_calibration(struct device *dev, int *calibration)
+{
+	unsigned char comp_data[NUM_COMP_REGS];
+	struct tps65910 *tps = dev_get_drvdata(dev->parent);
+	unsigned int ctrl;
+	u16 value;
+	int ret;
+
+	ret = regmap_read(tps->regmap, TPS65910_RTC_CTRL, &ctrl);
+	if (ret < 0)
+		return ret;
+
+	/* If automatic compensation is not enabled report back zero */
+	if (!(ctrl & TPS65910_RTC_CTRL_AUTO_COMP)) {
+		*calibration = 0;
+		return 0;
+	}
+
+	ret = regmap_bulk_read(tps->regmap, TPS65910_RTC_COMP_LSB, comp_data,
+		NUM_COMP_REGS);
+	if (ret < 0) {
+		dev_err(dev, "rtc_get_calibration error: %d\n", ret);
+		return ret;
+	}
+
+	value = (u16)comp_data[0] | ((u16)comp_data[1] << 8);
+
+	*calibration = (s16)value;
+
+	return 0;
+}
+
+static int tps65910_read_offset(struct device *dev, long *offset)
+{
+	int calibration;
+	s64 tmp;
+	int ret;
+
+	ret = tps65910_rtc_get_calibration(dev, &calibration);
+	if (ret < 0)
+		return ret;
+
+	/* Convert from RTC calibration register format to ppb format */
+	tmp = calibration * (s64)PPB_MULT;
+	if (tmp < 0)
+		tmp -= TICKS_PER_HOUR / 2LL;
+	else
+		tmp += TICKS_PER_HOUR / 2LL;
+	tmp = div_s64(tmp, TICKS_PER_HOUR);
+
+	/* Offset value operates in negative way, so swap sign */
+	*offset = (long)-tmp;
+
+	return 0;
+}
+
+static int tps65910_set_offset(struct device *dev, long offset)
+{
+	int calibration;
+	s64 tmp;
+	int ret;
+
+	/* Make sure offset value is within supported range */
+	if (offset < MIN_OFFSET || offset > MAX_OFFSET)
+		return -ERANGE;
+
+	/* Convert from ppb format to RTC calibration register format */
+	tmp = offset * (s64)TICKS_PER_HOUR;
+	if (tmp < 0)
+		tmp -= PPB_MULT / 2LL;
+	else
+		tmp += PPB_MULT / 2LL;
+	tmp = div_s64(tmp, PPB_MULT);
+
+	/* Offset value operates in negative way, so swap sign */
+	calibration = (int)-tmp;
+
+	ret = tps65910_rtc_set_calibration(dev, calibration);
+
+	return ret;
+}
+
 static irqreturn_t tps65910_rtc_interrupt(int irq, void *rtc)
 {
 	struct device *dev = rtc;
@@ -219,6 +360,8 @@ static const struct rtc_class_ops tps65910_rtc_ops = {
 	.read_alarm	= tps65910_rtc_read_alarm,
 	.set_alarm	= tps65910_rtc_set_alarm,
 	.alarm_irq_enable = tps65910_rtc_alarm_irq_enable,
+	.read_offset	= tps65910_read_offset,
+	.set_offset	= tps65910_set_offset,
 };
 
 static int tps65910_rtc_probe(struct platform_device *pdev)
diff --git a/include/linux/mfd/tps65910.h b/include/linux/mfd/tps65910.h
index 6483a6fdce59..ffb21e79204d 100644
--- a/include/linux/mfd/tps65910.h
+++ b/include/linux/mfd/tps65910.h
@@ -134,6 +134,7 @@
 
 /* RTC_CTRL_REG bitfields */
 #define TPS65910_RTC_CTRL_STOP_RTC			0x01 /*0=stop, 1=run */
+#define TPS65910_RTC_CTRL_AUTO_COMP			0x04
 #define TPS65910_RTC_CTRL_GET_TIME			0x40
 
 /* RTC_STATUS_REG bitfields */
-- 
cgit v1.2.3


From b8aa8453918ebfd93d78de56c2afd4b735e02e27 Mon Sep 17 00:00:00 2001
From: Mickaël Salaün <mic@digikod.net>
Date: Thu, 22 Dec 2016 00:32:25 +0100
Subject: security: Fix inode_getattr documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace arguments @mnt and @dentry with @path.

Signed-off-by: Mickaël Salaün <mic@digikod.net>
Acked-by: Serge Hallyn <serge@hallyn.com>
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 include/linux/lsm_hooks.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 558adfa5c8a8..9cf50ad2fe20 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -352,8 +352,7 @@
  *	Return 0 if permission is granted.
  * @inode_getattr:
  *	Check permission before obtaining file attributes.
- *	@mnt is the vfsmount where the dentry was looked up
- *	@dentry contains the dentry structure for the file.
+ *	@path contains the path structure for the file.
  *	Return 0 if permission is granted.
  * @inode_setxattr:
  *	Check permission before setting the extended attributes
-- 
cgit v1.2.3


From 989e0aac1a801e9e9580632c9fd448a7aaca596a Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Date: Fri, 30 Dec 2016 15:01:17 +0100
Subject: ata: pass queued command to ->sff_data_xfer method

For Atari Falcon PATA support we need to check the current command
in its ->sff_data_xfer method.  Update core code and all users
accordingly.

There should be no functional changes caused by this patch.

Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ata/libata-sff.c      | 29 +++++++++++++++--------------
 drivers/ata/pata_at91.c       |  6 +++---
 drivers/ata/pata_bf54x.c      |  7 ++++---
 drivers/ata/pata_ep93xx.c     |  4 ++--
 drivers/ata/pata_ixp4xx_cf.c  |  4 ++--
 drivers/ata/pata_legacy.c     | 15 +++++++++------
 drivers/ata/pata_octeon_cf.c  | 12 ++++++------
 drivers/ata/pata_pcmcia.c     |  6 +++---
 drivers/ata/pata_samsung_cf.c |  4 ++--
 drivers/ata/sata_rcar.c       |  4 ++--
 include/linux/libata.h        |  8 ++++----
 11 files changed, 52 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
index 051b6158d1b7..4441b5c5e4fb 100644
--- a/drivers/ata/libata-sff.c
+++ b/drivers/ata/libata-sff.c
@@ -542,7 +542,7 @@ static inline void ata_tf_to_host(struct ata_port *ap,
 
 /**
  *	ata_sff_data_xfer - Transfer data by PIO
- *	@dev: device to target
+ *	@qc: queued command
  *	@buf: data buffer
  *	@buflen: buffer length
  *	@rw: read/write
@@ -555,10 +555,10 @@ static inline void ata_tf_to_host(struct ata_port *ap,
  *	RETURNS:
  *	Bytes consumed.
  */
-unsigned int ata_sff_data_xfer(struct ata_device *dev, unsigned char *buf,
+unsigned int ata_sff_data_xfer(struct ata_queued_cmd *qc, unsigned char *buf,
 			       unsigned int buflen, int rw)
 {
-	struct ata_port *ap = dev->link->ap;
+	struct ata_port *ap = qc->dev->link->ap;
 	void __iomem *data_addr = ap->ioaddr.data_addr;
 	unsigned int words = buflen >> 1;
 
@@ -595,7 +595,7 @@ EXPORT_SYMBOL_GPL(ata_sff_data_xfer);
 
 /**
  *	ata_sff_data_xfer32 - Transfer data by PIO
- *	@dev: device to target
+ *	@qc: queued command
  *	@buf: data buffer
  *	@buflen: buffer length
  *	@rw: read/write
@@ -610,16 +610,17 @@ EXPORT_SYMBOL_GPL(ata_sff_data_xfer);
  *	Bytes consumed.
  */
 
-unsigned int ata_sff_data_xfer32(struct ata_device *dev, unsigned char *buf,
+unsigned int ata_sff_data_xfer32(struct ata_queued_cmd *qc, unsigned char *buf,
 			       unsigned int buflen, int rw)
 {
+	struct ata_device *dev = qc->dev;
 	struct ata_port *ap = dev->link->ap;
 	void __iomem *data_addr = ap->ioaddr.data_addr;
 	unsigned int words = buflen >> 2;
 	int slop = buflen & 3;
 
 	if (!(ap->pflags & ATA_PFLAG_PIO32))
-		return ata_sff_data_xfer(dev, buf, buflen, rw);
+		return ata_sff_data_xfer(qc, buf, buflen, rw);
 
 	/* Transfer multiple of 4 bytes */
 	if (rw == READ)
@@ -658,7 +659,7 @@ EXPORT_SYMBOL_GPL(ata_sff_data_xfer32);
 
 /**
  *	ata_sff_data_xfer_noirq - Transfer data by PIO
- *	@dev: device to target
+ *	@qc: queued command
  *	@buf: data buffer
  *	@buflen: buffer length
  *	@rw: read/write
@@ -672,14 +673,14 @@ EXPORT_SYMBOL_GPL(ata_sff_data_xfer32);
  *	RETURNS:
  *	Bytes consumed.
  */
-unsigned int ata_sff_data_xfer_noirq(struct ata_device *dev, unsigned char *buf,
+unsigned int ata_sff_data_xfer_noirq(struct ata_queued_cmd *qc, unsigned char *buf,
 				     unsigned int buflen, int rw)
 {
 	unsigned long flags;
 	unsigned int consumed;
 
 	local_irq_save(flags);
-	consumed = ata_sff_data_xfer32(dev, buf, buflen, rw);
+	consumed = ata_sff_data_xfer32(qc, buf, buflen, rw);
 	local_irq_restore(flags);
 
 	return consumed;
@@ -723,14 +724,14 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
 		buf = kmap_atomic(page);
 
 		/* do the actual data transfer */
-		ap->ops->sff_data_xfer(qc->dev, buf + offset, qc->sect_size,
+		ap->ops->sff_data_xfer(qc, buf + offset, qc->sect_size,
 				       do_write);
 
 		kunmap_atomic(buf);
 		local_irq_restore(flags);
 	} else {
 		buf = page_address(page);
-		ap->ops->sff_data_xfer(qc->dev, buf + offset, qc->sect_size,
+		ap->ops->sff_data_xfer(qc, buf + offset, qc->sect_size,
 				       do_write);
 	}
 
@@ -791,7 +792,7 @@ static void atapi_send_cdb(struct ata_port *ap, struct ata_queued_cmd *qc)
 	DPRINTK("send cdb\n");
 	WARN_ON_ONCE(qc->dev->cdb_len < 12);
 
-	ap->ops->sff_data_xfer(qc->dev, qc->cdb, qc->dev->cdb_len, 1);
+	ap->ops->sff_data_xfer(qc, qc->cdb, qc->dev->cdb_len, 1);
 	ata_sff_sync(ap);
 	/* FIXME: If the CDB is for DMA do we need to do the transition delay
 	   or is bmdma_start guaranteed to do it ? */
@@ -868,14 +869,14 @@ next_sg:
 		buf = kmap_atomic(page);
 
 		/* do the actual data transfer */
-		consumed = ap->ops->sff_data_xfer(dev,  buf + offset,
+		consumed = ap->ops->sff_data_xfer(qc, buf + offset,
 								count, rw);
 
 		kunmap_atomic(buf);
 		local_irq_restore(flags);
 	} else {
 		buf = page_address(page);
-		consumed = ap->ops->sff_data_xfer(dev,  buf + offset,
+		consumed = ap->ops->sff_data_xfer(qc, buf + offset,
 								count, rw);
 	}
 
diff --git a/drivers/ata/pata_at91.c b/drivers/ata/pata_at91.c
index 1611e0e8d767..fd5b34f0d007 100644
--- a/drivers/ata/pata_at91.c
+++ b/drivers/ata/pata_at91.c
@@ -286,10 +286,10 @@ static void pata_at91_set_piomode(struct ata_port *ap, struct ata_device *adev)
 	set_smc_timing(ap->dev, adev, info, &timing);
 }
 
-static unsigned int pata_at91_data_xfer_noirq(struct ata_device *dev,
+static unsigned int pata_at91_data_xfer_noirq(struct ata_queued_cmd *qc,
 		unsigned char *buf, unsigned int buflen, int rw)
 {
-	struct at91_ide_info *info = dev->link->ap->host->private_data;
+	struct at91_ide_info *info = qc->dev->link->ap->host->private_data;
 	unsigned int consumed;
 	unsigned int mode;
 	unsigned long flags;
@@ -301,7 +301,7 @@ static unsigned int pata_at91_data_xfer_noirq(struct ata_device *dev,
 	regmap_fields_write(fields.mode, info->cs, (mode & ~AT91_SMC_DBW) |
 			    AT91_SMC_DBW_16);
 
-	consumed = ata_sff_data_xfer(dev, buf, buflen, rw);
+	consumed = ata_sff_data_xfer(qc, buf, buflen, rw);
 
 	/* restore 8bit mode after data is written */
 	regmap_fields_write(fields.mode, info->cs, (mode & ~AT91_SMC_DBW) |
diff --git a/drivers/ata/pata_bf54x.c b/drivers/ata/pata_bf54x.c
index ec748d31928d..9c5780a7e1b9 100644
--- a/drivers/ata/pata_bf54x.c
+++ b/drivers/ata/pata_bf54x.c
@@ -1143,7 +1143,7 @@ static unsigned char bfin_bmdma_status(struct ata_port *ap)
 
 /**
  *	bfin_data_xfer - Transfer data by PIO
- *	@adev: device for this I/O
+ *	@qc: queued command
  *	@buf: data buffer
  *	@buflen: buffer length
  *	@write_data: read/write
@@ -1151,10 +1151,11 @@ static unsigned char bfin_bmdma_status(struct ata_port *ap)
  *	Note: Original code is ata_sff_data_xfer().
  */
 
-static unsigned int bfin_data_xfer(struct ata_device *dev, unsigned char *buf,
+static unsigned int bfin_data_xfer(struct ata_queued_cmd *qc,
+				   unsigned char *buf,
 				   unsigned int buflen, int rw)
 {
-	struct ata_port *ap = dev->link->ap;
+	struct ata_port *ap = qc->dev->link->ap;
 	void __iomem *base = (void __iomem *)ap->ioaddr.ctl_addr;
 	unsigned int words = buflen >> 1;
 	unsigned short *buf16 = (u16 *)buf;
diff --git a/drivers/ata/pata_ep93xx.c b/drivers/ata/pata_ep93xx.c
index bd6b089c67a3..bf1b910c5d69 100644
--- a/drivers/ata/pata_ep93xx.c
+++ b/drivers/ata/pata_ep93xx.c
@@ -474,11 +474,11 @@ static void ep93xx_pata_set_devctl(struct ata_port *ap, u8 ctl)
 }
 
 /* Note: original code is ata_sff_data_xfer */
-static unsigned int ep93xx_pata_data_xfer(struct ata_device *adev,
+static unsigned int ep93xx_pata_data_xfer(struct ata_queued_cmd *qc,
 					  unsigned char *buf,
 					  unsigned int buflen, int rw)
 {
-	struct ata_port *ap = adev->link->ap;
+	struct ata_port *ap = qc->dev->link->ap;
 	struct ep93xx_pata_data *drv_data = ap->host->private_data;
 	u16 *data = (u16 *)buf;
 	unsigned int words = buflen >> 1;
diff --git a/drivers/ata/pata_ixp4xx_cf.c b/drivers/ata/pata_ixp4xx_cf.c
index abda44183512..0b0d93065f5a 100644
--- a/drivers/ata/pata_ixp4xx_cf.c
+++ b/drivers/ata/pata_ixp4xx_cf.c
@@ -40,13 +40,13 @@ static int ixp4xx_set_mode(struct ata_link *link, struct ata_device **error)
 	return 0;
 }
 
-static unsigned int ixp4xx_mmio_data_xfer(struct ata_device *dev,
+static unsigned int ixp4xx_mmio_data_xfer(struct ata_queued_cmd *qc,
 				unsigned char *buf, unsigned int buflen, int rw)
 {
 	unsigned int i;
 	unsigned int words = buflen >> 1;
 	u16 *buf16 = (u16 *) buf;
-	struct ata_port *ap = dev->link->ap;
+	struct ata_port *ap = qc->dev->link->ap;
 	void __iomem *mmio = ap->ioaddr.data_addr;
 	struct ixp4xx_pata_data *data = dev_get_platdata(ap->host->dev);
 
diff --git a/drivers/ata/pata_legacy.c b/drivers/ata/pata_legacy.c
index bce2a8ca4678..53828b6c3044 100644
--- a/drivers/ata/pata_legacy.c
+++ b/drivers/ata/pata_legacy.c
@@ -303,11 +303,12 @@ static void pdc20230_set_piomode(struct ata_port *ap, struct ata_device *adev)
 
 }
 
-static unsigned int pdc_data_xfer_vlb(struct ata_device *dev,
+static unsigned int pdc_data_xfer_vlb(struct ata_queued_cmd *qc,
 			unsigned char *buf, unsigned int buflen, int rw)
 {
-	int slop = buflen & 3;
+	struct ata_device *dev = qc->dev;
 	struct ata_port *ap = dev->link->ap;
+	int slop = buflen & 3;
 
 	/* 32bit I/O capable *and* we need to write a whole number of dwords */
 	if (ata_id_has_dword_io(dev->id) && (slop == 0 || slop == 3)
@@ -340,7 +341,7 @@ static unsigned int pdc_data_xfer_vlb(struct ata_device *dev,
 		}
 		local_irq_restore(flags);
 	} else
-		buflen = ata_sff_data_xfer_noirq(dev, buf, buflen, rw);
+		buflen = ata_sff_data_xfer_noirq(qc, buf, buflen, rw);
 
 	return buflen;
 }
@@ -702,9 +703,11 @@ static unsigned int qdi_qc_issue(struct ata_queued_cmd *qc)
 	return ata_sff_qc_issue(qc);
 }
 
-static unsigned int vlb32_data_xfer(struct ata_device *adev, unsigned char *buf,
-					unsigned int buflen, int rw)
+static unsigned int vlb32_data_xfer(struct ata_queued_cmd *qc,
+				    unsigned char *buf,
+				    unsigned int buflen, int rw)
 {
+	struct ata_device *adev = qc->dev;
 	struct ata_port *ap = adev->link->ap;
 	int slop = buflen & 3;
 
@@ -727,7 +730,7 @@ static unsigned int vlb32_data_xfer(struct ata_device *adev, unsigned char *buf,
 		}
 		return (buflen + 3) & ~3;
 	} else
-		return ata_sff_data_xfer(adev, buf, buflen, rw);
+		return ata_sff_data_xfer(qc, buf, buflen, rw);
 }
 
 static int qdi_port(struct platform_device *dev,
diff --git a/drivers/ata/pata_octeon_cf.c b/drivers/ata/pata_octeon_cf.c
index 475a00669427..e94e7ca0e743 100644
--- a/drivers/ata/pata_octeon_cf.c
+++ b/drivers/ata/pata_octeon_cf.c
@@ -293,17 +293,17 @@ static void octeon_cf_set_dmamode(struct ata_port *ap, struct ata_device *dev)
 /**
  * Handle an 8 bit I/O request.
  *
- * @dev:        Device to access
+ * @qc:         Queued command
  * @buffer:     Data buffer
  * @buflen:     Length of the buffer.
  * @rw:         True to write.
  */
-static unsigned int octeon_cf_data_xfer8(struct ata_device *dev,
+static unsigned int octeon_cf_data_xfer8(struct ata_queued_cmd *qc,
 					 unsigned char *buffer,
 					 unsigned int buflen,
 					 int rw)
 {
-	struct ata_port *ap		= dev->link->ap;
+	struct ata_port *ap		= qc->dev->link->ap;
 	void __iomem *data_addr		= ap->ioaddr.data_addr;
 	unsigned long words;
 	int count;
@@ -332,17 +332,17 @@ static unsigned int octeon_cf_data_xfer8(struct ata_device *dev,
 /**
  * Handle a 16 bit I/O request.
  *
- * @dev:        Device to access
+ * @qc:         Queued command
  * @buffer:     Data buffer
  * @buflen:     Length of the buffer.
  * @rw:         True to write.
  */
-static unsigned int octeon_cf_data_xfer16(struct ata_device *dev,
+static unsigned int octeon_cf_data_xfer16(struct ata_queued_cmd *qc,
 					  unsigned char *buffer,
 					  unsigned int buflen,
 					  int rw)
 {
-	struct ata_port *ap		= dev->link->ap;
+	struct ata_port *ap		= qc->dev->link->ap;
 	void __iomem *data_addr		= ap->ioaddr.data_addr;
 	unsigned long words;
 	int count;
diff --git a/drivers/ata/pata_pcmcia.c b/drivers/ata/pata_pcmcia.c
index bcc4b968c049..a541eacc5e95 100644
--- a/drivers/ata/pata_pcmcia.c
+++ b/drivers/ata/pata_pcmcia.c
@@ -90,7 +90,7 @@ static int pcmcia_set_mode_8bit(struct ata_link *link,
 
 /**
  *	ata_data_xfer_8bit	 -	Transfer data by 8bit PIO
- *	@dev: device to target
+ *	@qc: queued command
  *	@buf: data buffer
  *	@buflen: buffer length
  *	@rw: read/write
@@ -101,10 +101,10 @@ static int pcmcia_set_mode_8bit(struct ata_link *link,
  *	Inherited from caller.
  */
 
-static unsigned int ata_data_xfer_8bit(struct ata_device *dev,
+static unsigned int ata_data_xfer_8bit(struct ata_queued_cmd *qc,
 				unsigned char *buf, unsigned int buflen, int rw)
 {
-	struct ata_port *ap = dev->link->ap;
+	struct ata_port *ap = qc->dev->link->ap;
 
 	if (rw == READ)
 		ioread8_rep(ap->ioaddr.data_addr, buf, buflen);
diff --git a/drivers/ata/pata_samsung_cf.c b/drivers/ata/pata_samsung_cf.c
index f6facd686f94..431c7de30ce6 100644
--- a/drivers/ata/pata_samsung_cf.c
+++ b/drivers/ata/pata_samsung_cf.c
@@ -263,10 +263,10 @@ static u8 pata_s3c_check_altstatus(struct ata_port *ap)
 /*
  * pata_s3c_data_xfer - Transfer data by PIO
  */
-static unsigned int pata_s3c_data_xfer(struct ata_device *dev,
+static unsigned int pata_s3c_data_xfer(struct ata_queued_cmd *qc,
 				unsigned char *buf, unsigned int buflen, int rw)
 {
-	struct ata_port *ap = dev->link->ap;
+	struct ata_port *ap = qc->dev->link->ap;
 	struct s3c_ide_info *info = ap->host->private_data;
 	void __iomem *data_addr = ap->ioaddr.data_addr;
 	unsigned int words = buflen >> 1, i;
diff --git a/drivers/ata/sata_rcar.c b/drivers/ata/sata_rcar.c
index f72d601e300a..5d38245a7a73 100644
--- a/drivers/ata/sata_rcar.c
+++ b/drivers/ata/sata_rcar.c
@@ -447,11 +447,11 @@ static void sata_rcar_exec_command(struct ata_port *ap,
 	ata_sff_pause(ap);
 }
 
-static unsigned int sata_rcar_data_xfer(struct ata_device *dev,
+static unsigned int sata_rcar_data_xfer(struct ata_queued_cmd *qc,
 					      unsigned char *buf,
 					      unsigned int buflen, int rw)
 {
-	struct ata_port *ap = dev->link->ap;
+	struct ata_port *ap = qc->dev->link->ap;
 	void __iomem *data_addr = ap->ioaddr.data_addr;
 	unsigned int words = buflen >> 1;
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index c170be548b7f..0e8a8000b45f 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -968,7 +968,7 @@ struct ata_port_operations {
 	void (*sff_tf_read)(struct ata_port *ap, struct ata_taskfile *tf);
 	void (*sff_exec_command)(struct ata_port *ap,
 				 const struct ata_taskfile *tf);
-	unsigned int (*sff_data_xfer)(struct ata_device *dev,
+	unsigned int (*sff_data_xfer)(struct ata_queued_cmd *qc,
 			unsigned char *buf, unsigned int buflen, int rw);
 	void (*sff_irq_on)(struct ata_port *);
 	bool (*sff_irq_check)(struct ata_port *);
@@ -1823,11 +1823,11 @@ extern void ata_sff_tf_load(struct ata_port *ap, const struct ata_taskfile *tf);
 extern void ata_sff_tf_read(struct ata_port *ap, struct ata_taskfile *tf);
 extern void ata_sff_exec_command(struct ata_port *ap,
 				 const struct ata_taskfile *tf);
-extern unsigned int ata_sff_data_xfer(struct ata_device *dev,
+extern unsigned int ata_sff_data_xfer(struct ata_queued_cmd *qc,
 			unsigned char *buf, unsigned int buflen, int rw);
-extern unsigned int ata_sff_data_xfer32(struct ata_device *dev,
+extern unsigned int ata_sff_data_xfer32(struct ata_queued_cmd *qc,
 			unsigned char *buf, unsigned int buflen, int rw);
-extern unsigned int ata_sff_data_xfer_noirq(struct ata_device *dev,
+extern unsigned int ata_sff_data_xfer_noirq(struct ata_queued_cmd *qc,
 			unsigned char *buf, unsigned int buflen, int rw);
 extern void ata_sff_irq_on(struct ata_port *ap);
 extern void ata_sff_irq_clear(struct ata_port *ap);
-- 
cgit v1.2.3


From 39d3e7584a686541a3295ff1624d341e669e1afc Mon Sep 17 00:00:00 2001
From: Parav Pandit <pandit.parav@gmail.com>
Date: Tue, 10 Jan 2017 00:02:13 +0000
Subject: rdmacg: Added rdma cgroup controller

Added rdma cgroup controller that does accounting, limit enforcement
on rdma/IB resources.

Added rdma cgroup header file which defines its APIs to perform
charging/uncharging functionality. It also defined APIs for RDMA/IB
stack for device registration. Devices which are registered will
participate in controller functions of accounting and limit
enforcements. It define rdmacg_device structure to bind IB stack
and RDMA cgroup controller.

RDMA resources are tracked using resource pool. Resource pool is per
device, per cgroup entity which allows setting up accounting limits
on per device basis.

Currently resources are defined by the RDMA cgroup.

Resource pool is created/destroyed dynamically whenever
charging/uncharging occurs respectively and whenever user
configuration is done. Its a tradeoff of memory vs little more code
space that creates resource pool object whenever necessary, instead of
creating them during cgroup creation and device registration time.

Signed-off-by: Parav Pandit <pandit.parav@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup_rdma.h   |  53 ++++
 include/linux/cgroup_subsys.h |   4 +
 init/Kconfig                  |  10 +
 kernel/cgroup/Makefile        |   1 +
 kernel/cgroup/rdma.c          | 617 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 685 insertions(+)
 create mode 100644 include/linux/cgroup_rdma.h
 create mode 100644 kernel/cgroup/rdma.c

(limited to 'include/linux')

diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h
new file mode 100644
index 000000000000..e94290b29e99
--- /dev/null
+++ b/include/linux/cgroup_rdma.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#ifndef _CGROUP_RDMA_H
+#define _CGROUP_RDMA_H
+
+#include <linux/cgroup.h>
+
+enum rdmacg_resource_type {
+	RDMACG_RESOURCE_HCA_HANDLE,
+	RDMACG_RESOURCE_HCA_OBJECT,
+	RDMACG_RESOURCE_MAX,
+};
+
+#ifdef CONFIG_CGROUP_RDMA
+
+struct rdma_cgroup {
+	struct cgroup_subsys_state	css;
+
+	/*
+	 * head to keep track of all resource pools
+	 * that belongs to this cgroup.
+	 */
+	struct list_head		rpools;
+};
+
+struct rdmacg_device {
+	struct list_head	dev_node;
+	struct list_head	rpools;
+	char			*name;
+};
+
+/*
+ * APIs for RDMA/IB stack to publish when a device wants to
+ * participate in resource accounting
+ */
+int rdmacg_register_device(struct rdmacg_device *device);
+void rdmacg_unregister_device(struct rdmacg_device *device);
+
+/* APIs for RDMA/IB stack to charge/uncharge pool specific resources */
+int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
+		      struct rdmacg_device *device,
+		      enum rdmacg_resource_type index);
+void rdmacg_uncharge(struct rdma_cgroup *cg,
+		     struct rdmacg_device *device,
+		     enum rdmacg_resource_type index);
+#endif	/* CONFIG_CGROUP_RDMA */
+#endif	/* _CGROUP_RDMA_H */
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 0df0336acee9..d0e597c44585 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -56,6 +56,10 @@ SUBSYS(hugetlb)
 SUBSYS(pids)
 #endif
 
+#if IS_ENABLED(CONFIG_CGROUP_RDMA)
+SUBSYS(rdma)
+#endif
+
 /*
  * The following subsystems are not supported on the default hierarchy.
  */
diff --git a/init/Kconfig b/init/Kconfig
index 223b734abccd..ef80d46a32b6 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1090,6 +1090,16 @@ config CGROUP_PIDS
 	  since the PIDs limit only affects a process's ability to fork, not to
 	  attach to a cgroup.
 
+config CGROUP_RDMA
+	bool "RDMA controller"
+	help
+	  Provides enforcement of RDMA resources defined by IB stack.
+	  It is fairly easy for consumers to exhaust RDMA resources, which
+	  can result into resource unavailability to other consumers.
+	  RDMA controller is designed to stop this from happening.
+	  Attaching processes with active RDMA resources to the cgroup
+	  hierarchy is allowed even if can cross the hierarchy's limit.
+
 config CGROUP_FREEZER
 	bool "Freezer controller"
 	help
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index 6d42a3211164..387348a40c64 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -2,4 +2,5 @@ obj-y := cgroup.o namespace.o cgroup-v1.o
 
 obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
 obj-$(CONFIG_CGROUP_PIDS) += pids.o
+obj-$(CONFIG_CGROUP_RDMA) += rdma.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
new file mode 100644
index 000000000000..021bee7a9692
--- /dev/null
+++ b/kernel/cgroup/rdma.c
@@ -0,0 +1,617 @@
+/*
+ * RDMA resource limiting controller for cgroups.
+ *
+ * Used to allow a cgroup hierarchy to stop processes from consuming
+ * additional RDMA resources after a certain limit is reached.
+ *
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/cgroup.h>
+#include <linux/parser.h>
+#include <linux/cgroup_rdma.h>
+
+#define RDMACG_MAX_STR "max"
+
+/*
+ * Protects list of resource pools maintained on per cgroup basis
+ * and rdma device list.
+ */
+static DEFINE_MUTEX(rdmacg_mutex);
+static LIST_HEAD(rdmacg_devices);
+
+enum rdmacg_file_type {
+	RDMACG_RESOURCE_TYPE_MAX,
+	RDMACG_RESOURCE_TYPE_STAT,
+};
+
+/*
+ * resource table definition as to be seen by the user.
+ * Need to add entries to it when more resources are
+ * added/defined at IB verb/core layer.
+ */
+static char const *rdmacg_resource_names[] = {
+	[RDMACG_RESOURCE_HCA_HANDLE]	= "hca_handle",
+	[RDMACG_RESOURCE_HCA_OBJECT]	= "hca_object",
+};
+
+/* resource tracker for each resource of rdma cgroup */
+struct rdmacg_resource {
+	int max;
+	int usage;
+};
+
+/*
+ * resource pool object which represents per cgroup, per device
+ * resources. There are multiple instances of this object per cgroup,
+ * therefore it cannot be embedded within rdma_cgroup structure. It
+ * is maintained as list.
+ */
+struct rdmacg_resource_pool {
+	struct rdmacg_device	*device;
+	struct rdmacg_resource	resources[RDMACG_RESOURCE_MAX];
+
+	struct list_head	cg_node;
+	struct list_head	dev_node;
+
+	/* count active user tasks of this pool */
+	u64			usage_sum;
+	/* total number counts which are set to max */
+	int			num_max_cnt;
+};
+
+static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
+{
+	return container_of(css, struct rdma_cgroup, css);
+}
+
+static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
+{
+	return css_rdmacg(cg->css.parent);
+}
+
+static inline struct rdma_cgroup *get_current_rdmacg(void)
+{
+	return css_rdmacg(task_get_css(current, rdma_cgrp_id));
+}
+
+static void set_resource_limit(struct rdmacg_resource_pool *rpool,
+			       int index, int new_max)
+{
+	if (new_max == S32_MAX) {
+		if (rpool->resources[index].max != S32_MAX)
+			rpool->num_max_cnt++;
+	} else {
+		if (rpool->resources[index].max == S32_MAX)
+			rpool->num_max_cnt--;
+	}
+	rpool->resources[index].max = new_max;
+}
+
+static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
+{
+	int i;
+
+	for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
+		set_resource_limit(rpool, i, S32_MAX);
+}
+
+static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
+{
+	lockdep_assert_held(&rdmacg_mutex);
+
+	list_del(&rpool->cg_node);
+	list_del(&rpool->dev_node);
+	kfree(rpool);
+}
+
+static struct rdmacg_resource_pool *
+find_cg_rpool_locked(struct rdma_cgroup *cg,
+		     struct rdmacg_device *device)
+
+{
+	struct rdmacg_resource_pool *pool;
+
+	lockdep_assert_held(&rdmacg_mutex);
+
+	list_for_each_entry(pool, &cg->rpools, cg_node)
+		if (pool->device == device)
+			return pool;
+
+	return NULL;
+}
+
+static struct rdmacg_resource_pool *
+get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
+{
+	struct rdmacg_resource_pool *rpool;
+
+	rpool = find_cg_rpool_locked(cg, device);
+	if (rpool)
+		return rpool;
+
+	rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
+	if (!rpool)
+		return ERR_PTR(-ENOMEM);
+
+	rpool->device = device;
+	set_all_resource_max_limit(rpool);
+
+	INIT_LIST_HEAD(&rpool->cg_node);
+	INIT_LIST_HEAD(&rpool->dev_node);
+	list_add_tail(&rpool->cg_node, &cg->rpools);
+	list_add_tail(&rpool->dev_node, &device->rpools);
+	return rpool;
+}
+
+/**
+ * uncharge_cg_locked - uncharge resource for rdma cgroup
+ * @cg: pointer to cg to uncharge and all parents in hierarchy
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to uncharge in cg (resource pool)
+ *
+ * It also frees the resource pool which was created as part of
+ * charging operation when there are no resources attached to
+ * resource pool.
+ */
+static void
+uncharge_cg_locked(struct rdma_cgroup *cg,
+		   struct rdmacg_device *device,
+		   enum rdmacg_resource_type index)
+{
+	struct rdmacg_resource_pool *rpool;
+
+	rpool = find_cg_rpool_locked(cg, device);
+
+	/*
+	 * rpool cannot be null at this stage. Let kernel operate in case
+	 * if there a bug in IB stack or rdma controller, instead of crashing
+	 * the system.
+	 */
+	if (unlikely(!rpool)) {
+		pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
+		return;
+	}
+
+	rpool->resources[index].usage--;
+
+	/*
+	 * A negative count (or overflow) is invalid,
+	 * it indicates a bug in the rdma controller.
+	 */
+	WARN_ON_ONCE(rpool->resources[index].usage < 0);
+	rpool->usage_sum--;
+	if (rpool->usage_sum == 0 &&
+	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
+		/*
+		 * No user of the rpool and all entries are set to max, so
+		 * safe to delete this rpool.
+		 */
+		free_cg_rpool_locked(rpool);
+	}
+}
+
+/**
+ * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
+ * @device: pointer to rdmacg device
+ * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup
+ *           stop uncharging
+ * @index: index of the resource to uncharge in cg in given resource pool
+ */
+static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
+				     struct rdmacg_device *device,
+				     struct rdma_cgroup *stop_cg,
+				     enum rdmacg_resource_type index)
+{
+	struct rdma_cgroup *p;
+
+	mutex_lock(&rdmacg_mutex);
+
+	for (p = cg; p != stop_cg; p = parent_rdmacg(p))
+		uncharge_cg_locked(p, device, index);
+
+	mutex_unlock(&rdmacg_mutex);
+
+	css_put(&cg->css);
+}
+
+/**
+ * rdmacg_uncharge - hierarchically uncharge rdma resource count
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to uncharge in cgroup in given resource pool
+ */
+void rdmacg_uncharge(struct rdma_cgroup *cg,
+		     struct rdmacg_device *device,
+		     enum rdmacg_resource_type index)
+{
+	if (index >= RDMACG_RESOURCE_MAX)
+		return;
+
+	rdmacg_uncharge_hierarchy(cg, device, NULL, index);
+}
+EXPORT_SYMBOL(rdmacg_uncharge);
+
+/**
+ * rdmacg_try_charge - hierarchically try to charge the rdma resource
+ * @rdmacg: pointer to rdma cgroup which will own this resource
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to charge in cgroup (resource pool)
+ *
+ * This function follows charging resource in hierarchical way.
+ * It will fail if the charge would cause the new value to exceed the
+ * hierarchical limit.
+ * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
+ * Returns pointer to rdmacg for this resource when charging is successful.
+ *
+ * Charger needs to account resources on two criteria.
+ * (a) per cgroup & (b) per device resource usage.
+ * Per cgroup resource usage ensures that tasks of cgroup doesn't cross
+ * the configured limits. Per device provides granular configuration
+ * in multi device usage. It allocates resource pool in the hierarchy
+ * for each parent it come across for first resource. Later on resource
+ * pool will be available. Therefore it will be much faster thereon
+ * to charge/uncharge.
+ */
+int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
+		      struct rdmacg_device *device,
+		      enum rdmacg_resource_type index)
+{
+	struct rdma_cgroup *cg, *p;
+	struct rdmacg_resource_pool *rpool;
+	s64 new;
+	int ret = 0;
+
+	if (index >= RDMACG_RESOURCE_MAX)
+		return -EINVAL;
+
+	/*
+	 * hold on to css, as cgroup can be removed but resource
+	 * accounting happens on css.
+	 */
+	cg = get_current_rdmacg();
+
+	mutex_lock(&rdmacg_mutex);
+	for (p = cg; p; p = parent_rdmacg(p)) {
+		rpool = get_cg_rpool_locked(p, device);
+		if (IS_ERR(rpool)) {
+			ret = PTR_ERR(rpool);
+			goto err;
+		} else {
+			new = rpool->resources[index].usage + 1;
+			if (new > rpool->resources[index].max) {
+				ret = -EAGAIN;
+				goto err;
+			} else {
+				rpool->resources[index].usage = new;
+				rpool->usage_sum++;
+			}
+		}
+	}
+	mutex_unlock(&rdmacg_mutex);
+
+	*rdmacg = cg;
+	return 0;
+
+err:
+	mutex_unlock(&rdmacg_mutex);
+	rdmacg_uncharge_hierarchy(cg, device, p, index);
+	return ret;
+}
+EXPORT_SYMBOL(rdmacg_try_charge);
+
+/**
+ * rdmacg_register_device - register rdmacg device to rdma controller.
+ * @device: pointer to rdmacg device whose resources need to be accounted.
+ *
+ * If IB stack wish a device to participate in rdma cgroup resource
+ * tracking, it must invoke this API to register with rdma cgroup before
+ * any user space application can start using the RDMA resources.
+ * Returns 0 on success or EINVAL when table length given is beyond
+ * supported size.
+ */
+int rdmacg_register_device(struct rdmacg_device *device)
+{
+	INIT_LIST_HEAD(&device->dev_node);
+	INIT_LIST_HEAD(&device->rpools);
+
+	mutex_lock(&rdmacg_mutex);
+	list_add_tail(&device->dev_node, &rdmacg_devices);
+	mutex_unlock(&rdmacg_mutex);
+	return 0;
+}
+EXPORT_SYMBOL(rdmacg_register_device);
+
+/**
+ * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
+ * @device: pointer to rdmacg device which was previously registered with rdma
+ *          controller using rdmacg_register_device().
+ *
+ * IB stack must invoke this after all the resources of the IB device
+ * are destroyed and after ensuring that no more resources will be created
+ * when this API is invoked.
+ */
+void rdmacg_unregister_device(struct rdmacg_device *device)
+{
+	struct rdmacg_resource_pool *rpool, *tmp;
+
+	/*
+	 * Synchronize with any active resource settings,
+	 * usage query happening via configfs.
+	 */
+	mutex_lock(&rdmacg_mutex);
+	list_del_init(&device->dev_node);
+
+	/*
+	 * Now that this device is off the cgroup list, its safe to free
+	 * all the rpool resources.
+	 */
+	list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
+		free_cg_rpool_locked(rpool);
+
+	mutex_unlock(&rdmacg_mutex);
+}
+EXPORT_SYMBOL(rdmacg_unregister_device);
+
+static int parse_resource(char *c, int *intval)
+{
+	substring_t argstr;
+	const char **table = &rdmacg_resource_names[0];
+	char *name, *value = c;
+	size_t len;
+	int ret, i = 0;
+
+	name = strsep(&value, "=");
+	if (!name || !value)
+		return -EINVAL;
+
+	len = strlen(value);
+
+	for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
+		if (strcmp(table[i], name))
+			continue;
+
+		argstr.from = value;
+		argstr.to = value + len;
+
+		ret = match_int(&argstr, intval);
+		if (ret >= 0) {
+			if (*intval < 0)
+				break;
+			return i;
+		}
+		if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
+			*intval = S32_MAX;
+			return i;
+		}
+		break;
+	}
+	return -EINVAL;
+}
+
+static int rdmacg_parse_limits(char *options,
+			       int *new_limits, unsigned long *enables)
+{
+	char *c;
+	int err = -EINVAL;
+
+	/* parse resource options */
+	while ((c = strsep(&options, " ")) != NULL) {
+		int index, intval;
+
+		index = parse_resource(c, &intval);
+		if (index < 0)
+			goto err;
+
+		new_limits[index] = intval;
+		*enables |= BIT(index);
+	}
+	return 0;
+
+err:
+	return err;
+}
+
+static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
+{
+	struct rdmacg_device *device;
+
+	lockdep_assert_held(&rdmacg_mutex);
+
+	list_for_each_entry(device, &rdmacg_devices, dev_node)
+		if (!strcmp(name, device->name))
+			return device;
+
+	return NULL;
+}
+
+static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
+				       char *buf, size_t nbytes, loff_t off)
+{
+	struct rdma_cgroup *cg = css_rdmacg(of_css(of));
+	const char *dev_name;
+	struct rdmacg_resource_pool *rpool;
+	struct rdmacg_device *device;
+	char *options = strstrip(buf);
+	int *new_limits;
+	unsigned long enables = 0;
+	int i = 0, ret = 0;
+
+	/* extract the device name first */
+	dev_name = strsep(&options, " ");
+	if (!dev_name) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
+	if (!new_limits) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	ret = rdmacg_parse_limits(options, new_limits, &enables);
+	if (ret)
+		goto parse_err;
+
+	/* acquire lock to synchronize with hot plug devices */
+	mutex_lock(&rdmacg_mutex);
+
+	device = rdmacg_get_device_locked(dev_name);
+	if (!device) {
+		ret = -ENODEV;
+		goto dev_err;
+	}
+
+	rpool = get_cg_rpool_locked(cg, device);
+	if (IS_ERR(rpool)) {
+		ret = PTR_ERR(rpool);
+		goto dev_err;
+	}
+
+	/* now set the new limits of the rpool */
+	for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
+		set_resource_limit(rpool, i, new_limits[i]);
+
+	if (rpool->usage_sum == 0 &&
+	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
+		/*
+		 * No user of the rpool and all entries are set to max, so
+		 * safe to delete this rpool.
+		 */
+		free_cg_rpool_locked(rpool);
+	}
+
+dev_err:
+	mutex_unlock(&rdmacg_mutex);
+
+parse_err:
+	kfree(new_limits);
+
+err:
+	return ret ?: nbytes;
+}
+
+static void print_rpool_values(struct seq_file *sf,
+			       struct rdmacg_resource_pool *rpool)
+{
+	enum rdmacg_file_type sf_type;
+	int i;
+	u32 value;
+
+	sf_type = seq_cft(sf)->private;
+
+	for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
+		seq_puts(sf, rdmacg_resource_names[i]);
+		seq_putc(sf, '=');
+		if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
+			if (rpool)
+				value = rpool->resources[i].max;
+			else
+				value = S32_MAX;
+		} else {
+			if (rpool)
+				value = rpool->resources[i].usage;
+		}
+
+		if (value == S32_MAX)
+			seq_puts(sf, RDMACG_MAX_STR);
+		else
+			seq_printf(sf, "%d", value);
+		seq_putc(sf, ' ');
+	}
+}
+
+static int rdmacg_resource_read(struct seq_file *sf, void *v)
+{
+	struct rdmacg_device *device;
+	struct rdmacg_resource_pool *rpool;
+	struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
+
+	mutex_lock(&rdmacg_mutex);
+
+	list_for_each_entry(device, &rdmacg_devices, dev_node) {
+		seq_printf(sf, "%s ", device->name);
+
+		rpool = find_cg_rpool_locked(cg, device);
+		print_rpool_values(sf, rpool);
+
+		seq_putc(sf, '\n');
+	}
+
+	mutex_unlock(&rdmacg_mutex);
+	return 0;
+}
+
+static struct cftype rdmacg_files[] = {
+	{
+		.name = "max",
+		.write = rdmacg_resource_set_max,
+		.seq_show = rdmacg_resource_read,
+		.private = RDMACG_RESOURCE_TYPE_MAX,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "current",
+		.seq_show = rdmacg_resource_read,
+		.private = RDMACG_RESOURCE_TYPE_STAT,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{ }	/* terminate */
+};
+
+static struct cgroup_subsys_state *
+rdmacg_css_alloc(struct cgroup_subsys_state *parent)
+{
+	struct rdma_cgroup *cg;
+
+	cg = kzalloc(sizeof(*cg), GFP_KERNEL);
+	if (!cg)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&cg->rpools);
+	return &cg->css;
+}
+
+static void rdmacg_css_free(struct cgroup_subsys_state *css)
+{
+	struct rdma_cgroup *cg = css_rdmacg(css);
+
+	kfree(cg);
+}
+
+/**
+ * rdmacg_css_offline - cgroup css_offline callback
+ * @css: css of interest
+ *
+ * This function is called when @css is about to go away and responsible
+ * for shooting down all rdmacg associated with @css. As part of that it
+ * marks all the resource pool entries to max value, so that when resources are
+ * uncharged, associated resource pool can be freed as well.
+ */
+static void rdmacg_css_offline(struct cgroup_subsys_state *css)
+{
+	struct rdma_cgroup *cg = css_rdmacg(css);
+	struct rdmacg_resource_pool *rpool;
+
+	mutex_lock(&rdmacg_mutex);
+
+	list_for_each_entry(rpool, &cg->rpools, cg_node)
+		set_all_resource_max_limit(rpool);
+
+	mutex_unlock(&rdmacg_mutex);
+}
+
+struct cgroup_subsys rdma_cgrp_subsys = {
+	.css_alloc	= rdmacg_css_alloc,
+	.css_free	= rdmacg_css_free,
+	.css_offline	= rdmacg_css_offline,
+	.legacy_cftypes	= rdmacg_files,
+	.dfl_cftypes	= rdmacg_files,
+};
-- 
cgit v1.2.3


From 78c9981f6132cb600d545007c91e300021b7caf3 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <jic23@kernel.org>
Date: Mon, 2 Jan 2017 19:28:24 +0000
Subject: iio:buffer: Stop exporting iio_update_demux

Nothing outside of indiustrialio-buffer.c should be using this.
Requires a large amount of juggling of functions to avoid a
forward definition.

Signed-off-by: Jonathan Cameron <jic23@kernel.org>
Reviewed-by: Lars-Peter Clausen <lars@metafoo.de>
---
 drivers/iio/industrialio-buffer.c | 260 +++++++++++++++++++-------------------
 include/linux/iio/buffer.h        |   2 -
 2 files changed, 129 insertions(+), 133 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
index b12830b09c7d..9f496eb84e0b 100644
--- a/drivers/iio/industrialio-buffer.c
+++ b/drivers/iio/industrialio-buffer.c
@@ -751,6 +751,135 @@ static int iio_verify_update(struct iio_dev *indio_dev,
 	return 0;
 }
 
+/**
+ * struct iio_demux_table - table describing demux memcpy ops
+ * @from:	index to copy from
+ * @to:		index to copy to
+ * @length:	how many bytes to copy
+ * @l:		list head used for management
+ */
+struct iio_demux_table {
+	unsigned from;
+	unsigned to;
+	unsigned length;
+	struct list_head l;
+};
+
+static void iio_buffer_demux_free(struct iio_buffer *buffer)
+{
+	struct iio_demux_table *p, *q;
+	list_for_each_entry_safe(p, q, &buffer->demux_list, l) {
+		list_del(&p->l);
+		kfree(p);
+	}
+}
+
+static int iio_buffer_add_demux(struct iio_buffer *buffer,
+	struct iio_demux_table **p, unsigned int in_loc, unsigned int out_loc,
+	unsigned int length)
+{
+
+	if (*p && (*p)->from + (*p)->length == in_loc &&
+		(*p)->to + (*p)->length == out_loc) {
+		(*p)->length += length;
+	} else {
+		*p = kmalloc(sizeof(**p), GFP_KERNEL);
+		if (*p == NULL)
+			return -ENOMEM;
+		(*p)->from = in_loc;
+		(*p)->to = out_loc;
+		(*p)->length = length;
+		list_add_tail(&(*p)->l, &buffer->demux_list);
+	}
+
+	return 0;
+}
+
+static int iio_buffer_update_demux(struct iio_dev *indio_dev,
+				   struct iio_buffer *buffer)
+{
+	int ret, in_ind = -1, out_ind, length;
+	unsigned in_loc = 0, out_loc = 0;
+	struct iio_demux_table *p = NULL;
+
+	/* Clear out any old demux */
+	iio_buffer_demux_free(buffer);
+	kfree(buffer->demux_bounce);
+	buffer->demux_bounce = NULL;
+
+	/* First work out which scan mode we will actually have */
+	if (bitmap_equal(indio_dev->active_scan_mask,
+			 buffer->scan_mask,
+			 indio_dev->masklength))
+		return 0;
+
+	/* Now we have the two masks, work from least sig and build up sizes */
+	for_each_set_bit(out_ind,
+			 buffer->scan_mask,
+			 indio_dev->masklength) {
+		in_ind = find_next_bit(indio_dev->active_scan_mask,
+				       indio_dev->masklength,
+				       in_ind + 1);
+		while (in_ind != out_ind) {
+			in_ind = find_next_bit(indio_dev->active_scan_mask,
+					       indio_dev->masklength,
+					       in_ind + 1);
+			length = iio_storage_bytes_for_si(indio_dev, in_ind);
+			/* Make sure we are aligned */
+			in_loc = roundup(in_loc, length) + length;
+		}
+		length = iio_storage_bytes_for_si(indio_dev, in_ind);
+		out_loc = roundup(out_loc, length);
+		in_loc = roundup(in_loc, length);
+		ret = iio_buffer_add_demux(buffer, &p, in_loc, out_loc, length);
+		if (ret)
+			goto error_clear_mux_table;
+		out_loc += length;
+		in_loc += length;
+	}
+	/* Relies on scan_timestamp being last */
+	if (buffer->scan_timestamp) {
+		length = iio_storage_bytes_for_timestamp(indio_dev);
+		out_loc = roundup(out_loc, length);
+		in_loc = roundup(in_loc, length);
+		ret = iio_buffer_add_demux(buffer, &p, in_loc, out_loc, length);
+		if (ret)
+			goto error_clear_mux_table;
+		out_loc += length;
+		in_loc += length;
+	}
+	buffer->demux_bounce = kzalloc(out_loc, GFP_KERNEL);
+	if (buffer->demux_bounce == NULL) {
+		ret = -ENOMEM;
+		goto error_clear_mux_table;
+	}
+	return 0;
+
+error_clear_mux_table:
+	iio_buffer_demux_free(buffer);
+
+	return ret;
+}
+
+static int iio_update_demux(struct iio_dev *indio_dev)
+{
+	struct iio_buffer *buffer;
+	int ret;
+
+	list_for_each_entry(buffer, &indio_dev->buffer_list, buffer_list) {
+		ret = iio_buffer_update_demux(indio_dev, buffer);
+		if (ret < 0)
+			goto error_clear_mux_table;
+	}
+	return 0;
+
+error_clear_mux_table:
+	list_for_each_entry(buffer, &indio_dev->buffer_list, buffer_list)
+		iio_buffer_demux_free(buffer);
+
+	return ret;
+}
+
 static int iio_enable_buffers(struct iio_dev *indio_dev,
 	struct iio_device_config *config)
 {
@@ -1213,20 +1342,6 @@ int iio_scan_mask_query(struct iio_dev *indio_dev,
 };
 EXPORT_SYMBOL_GPL(iio_scan_mask_query);
 
-/**
- * struct iio_demux_table - table describing demux memcpy ops
- * @from:	index to copy from
- * @to:		index to copy to
- * @length:	how many bytes to copy
- * @l:		list head used for management
- */
-struct iio_demux_table {
-	unsigned from;
-	unsigned to;
-	unsigned length;
-	struct list_head l;
-};
-
 static const void *iio_demux(struct iio_buffer *buffer,
 				 const void *datain)
 {
@@ -1258,16 +1373,6 @@ static int iio_push_to_buffer(struct iio_buffer *buffer, const void *data)
 	return 0;
 }
 
-static void iio_buffer_demux_free(struct iio_buffer *buffer)
-{
-	struct iio_demux_table *p, *q;
-	list_for_each_entry_safe(p, q, &buffer->demux_list, l) {
-		list_del(&p->l);
-		kfree(p);
-	}
-}
-
-
 int iio_push_to_buffers(struct iio_dev *indio_dev, const void *data)
 {
 	int ret;
@@ -1283,113 +1388,6 @@ int iio_push_to_buffers(struct iio_dev *indio_dev, const void *data)
 }
 EXPORT_SYMBOL_GPL(iio_push_to_buffers);
 
-static int iio_buffer_add_demux(struct iio_buffer *buffer,
-	struct iio_demux_table **p, unsigned int in_loc, unsigned int out_loc,
-	unsigned int length)
-{
-
-	if (*p && (*p)->from + (*p)->length == in_loc &&
-		(*p)->to + (*p)->length == out_loc) {
-		(*p)->length += length;
-	} else {
-		*p = kmalloc(sizeof(**p), GFP_KERNEL);
-		if (*p == NULL)
-			return -ENOMEM;
-		(*p)->from = in_loc;
-		(*p)->to = out_loc;
-		(*p)->length = length;
-		list_add_tail(&(*p)->l, &buffer->demux_list);
-	}
-
-	return 0;
-}
-
-static int iio_buffer_update_demux(struct iio_dev *indio_dev,
-				   struct iio_buffer *buffer)
-{
-	int ret, in_ind = -1, out_ind, length;
-	unsigned in_loc = 0, out_loc = 0;
-	struct iio_demux_table *p = NULL;
-
-	/* Clear out any old demux */
-	iio_buffer_demux_free(buffer);
-	kfree(buffer->demux_bounce);
-	buffer->demux_bounce = NULL;
-
-	/* First work out which scan mode we will actually have */
-	if (bitmap_equal(indio_dev->active_scan_mask,
-			 buffer->scan_mask,
-			 indio_dev->masklength))
-		return 0;
-
-	/* Now we have the two masks, work from least sig and build up sizes */
-	for_each_set_bit(out_ind,
-			 buffer->scan_mask,
-			 indio_dev->masklength) {
-		in_ind = find_next_bit(indio_dev->active_scan_mask,
-				       indio_dev->masklength,
-				       in_ind + 1);
-		while (in_ind != out_ind) {
-			in_ind = find_next_bit(indio_dev->active_scan_mask,
-					       indio_dev->masklength,
-					       in_ind + 1);
-			length = iio_storage_bytes_for_si(indio_dev, in_ind);
-			/* Make sure we are aligned */
-			in_loc = roundup(in_loc, length) + length;
-		}
-		length = iio_storage_bytes_for_si(indio_dev, in_ind);
-		out_loc = roundup(out_loc, length);
-		in_loc = roundup(in_loc, length);
-		ret = iio_buffer_add_demux(buffer, &p, in_loc, out_loc, length);
-		if (ret)
-			goto error_clear_mux_table;
-		out_loc += length;
-		in_loc += length;
-	}
-	/* Relies on scan_timestamp being last */
-	if (buffer->scan_timestamp) {
-		length = iio_storage_bytes_for_timestamp(indio_dev);
-		out_loc = roundup(out_loc, length);
-		in_loc = roundup(in_loc, length);
-		ret = iio_buffer_add_demux(buffer, &p, in_loc, out_loc, length);
-		if (ret)
-			goto error_clear_mux_table;
-		out_loc += length;
-		in_loc += length;
-	}
-	buffer->demux_bounce = kzalloc(out_loc, GFP_KERNEL);
-	if (buffer->demux_bounce == NULL) {
-		ret = -ENOMEM;
-		goto error_clear_mux_table;
-	}
-	return 0;
-
-error_clear_mux_table:
-	iio_buffer_demux_free(buffer);
-
-	return ret;
-}
-
-int iio_update_demux(struct iio_dev *indio_dev)
-{
-	struct iio_buffer *buffer;
-	int ret;
-
-	list_for_each_entry(buffer, &indio_dev->buffer_list, buffer_list) {
-		ret = iio_buffer_update_demux(indio_dev, buffer);
-		if (ret < 0)
-			goto error_clear_mux_table;
-	}
-	return 0;
-
-error_clear_mux_table:
-	list_for_each_entry(buffer, &indio_dev->buffer_list, buffer_list)
-		iio_buffer_demux_free(buffer);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(iio_update_demux);
-
 /**
  * iio_buffer_release() - Free a buffer's resources
  * @ref: Pointer to the kref embedded in the iio_buffer struct
diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h
index 70a5164f4728..889d0f2f5d7b 100644
--- a/include/linux/iio/buffer.h
+++ b/include/linux/iio/buffer.h
@@ -168,8 +168,6 @@ static inline int iio_push_to_buffers_with_timestamp(struct iio_dev *indio_dev,
 	return iio_push_to_buffers(indio_dev, data);
 }
 
-int iio_update_demux(struct iio_dev *indio_dev);
-
 bool iio_validate_scan_mask_onehot(struct iio_dev *indio_dev,
 	const unsigned long *mask);
 
-- 
cgit v1.2.3


From 263cf5e6577a779cc3311c5290325ef3917de2ea Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <jic23@kernel.org>
Date: Mon, 2 Jan 2017 19:28:25 +0000
Subject: iio:buffer.h Reformat structure comments to be inline.

This should make it easier to see how the structure is split into
public and private parts - reflected in the generated documentation.

Deliberately use /* instead of /** for the private elements to avoid
warnings when kernel-doc script runs.

Signed-off-by: Jonathan Cameron <jic23@kernel.org>
Reviewed-by: Lars-Peter Clausen <lars@metafoo.de>
---
 include/linux/iio/buffer.h | 100 ++++++++++++++++++++++++++++-----------------
 1 file changed, 63 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h
index 889d0f2f5d7b..4a65a7bb40a4 100644
--- a/include/linux/iio/buffer.h
+++ b/include/linux/iio/buffer.h
@@ -74,45 +74,71 @@ struct iio_buffer_access_funcs {
 
 /**
  * struct iio_buffer - general buffer structure
- * @length:		[DEVICE] number of datums in buffer
- * @bytes_per_datum:	[DEVICE] size of individual datum including timestamp
- * @scan_el_attrs:	[DRIVER] control of scan elements if that scan mode
- *			control method is used
- * @scan_mask:		[INTERN] bitmask used in masking scan mode elements
- * @scan_timestamp:	[INTERN] does the scan mode include a timestamp
- * @access:		[DRIVER] buffer access functions associated with the
- *			implementation.
- * @scan_el_dev_attr_list:[INTERN] list of scan element related attributes.
- * @buffer_group:	[INTERN] attributes of the buffer group
- * @scan_el_group:	[DRIVER] attribute group for those attributes not
- *			created from the iio_chan_info array.
- * @pollq:		[INTERN] wait queue to allow for polling on the buffer.
- * @stufftoread:	[INTERN] flag to indicate new data.
- * @attrs:		[INTERN] standard attributes of the buffer
- * @demux_list:		[INTERN] list of operations required to demux the scan.
- * @demux_bounce:	[INTERN] buffer for doing gather from incoming scan.
- * @buffer_list:	[INTERN] entry in the devices list of current buffers.
- * @ref:		[INTERN] reference count of the buffer.
- * @watermark:		[INTERN] number of datums to wait for poll/read.
+ *
+ * Note that the internals of this structure should only be of interest to
+ * those writing new buffer implementations.
  */
 struct iio_buffer {
-	int					length;
-	int					bytes_per_datum;
-	struct attribute_group			*scan_el_attrs;
-	long					*scan_mask;
-	bool					scan_timestamp;
-	const struct iio_buffer_access_funcs	*access;
-	struct list_head			scan_el_dev_attr_list;
-	struct attribute_group			buffer_group;
-	struct attribute_group			scan_el_group;
-	wait_queue_head_t			pollq;
-	bool					stufftoread;
-	const struct attribute			**attrs;
-	struct list_head			demux_list;
-	void					*demux_bounce;
-	struct list_head			buffer_list;
-	struct kref				ref;
-	unsigned int				watermark;
+	/** @length: Number of datums in buffer. */
+	int length;
+
+	/**  @bytes_per_datum: Size of individual datum including timestamp. */
+	int bytes_per_datum;
+
+	/**
+	 * @access: Buffer access functions associated with the
+	 * implementation.
+	 */
+	const struct iio_buffer_access_funcs *access;
+
+	/** @scan_mask: Bitmask used in masking scan mode elements. */
+	long *scan_mask;
+
+	/** @demux_list: List of operations required to demux the scan. */
+	struct list_head demux_list;
+
+	/** @pollq: Wait queue to allow for polling on the buffer. */
+	wait_queue_head_t pollq;
+
+	/** @watermark: Number of datums to wait for poll/read. */
+	unsigned int watermark;
+
+	/* private: */
+	/*
+	 * @scan_el_attrs: Control of scan elements if that scan mode
+	 * control method is used.
+	 */
+	struct attribute_group *scan_el_attrs;
+
+	/* @scan_timestamp: Does the scan mode include a timestamp. */
+	bool scan_timestamp;
+
+	/* @scan_el_dev_attr_list: List of scan element related attributes. */
+	struct list_head scan_el_dev_attr_list;
+
+	/* @buffer_group: Attributes of the buffer group. */
+	struct attribute_group buffer_group;
+
+	/*
+	 * @scan_el_group: Attribute group for those attributes not
+	 * created from the iio_chan_info array.
+	 */
+	struct attribute_group scan_el_group;
+
+	/* @stufftoread: Flag to indicate new data. */
+	bool stufftoread;
+
+	/* @attrs: Standard attributes of the buffer. */
+	const struct attribute **attrs;
+
+	/* @demux_bounce: Buffer for doing gather from incoming scan. */
+	void *demux_bounce;
+
+	/* @buffer_list: Entry in the devices list of current buffers. */
+	struct list_head buffer_list;
+
+	/* @ref: Reference count of the buffer. */
+	struct kref ref;
 };
 
 /**
-- 
cgit v1.2.3


From 9f4667776c138df33c4107fcd8811aa9cb6cdcbe Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <jic23@kernel.org>
Date: Mon, 2 Jan 2017 19:28:26 +0000
Subject: iio:buffer: Introduced a function to assign the buffer specific
 attrs.

This is a necessary step in taking the buffer implementation
opaque.

Signed-off-by: Jonathan Cameron <jic23@kernel.org>
Reviewed-by: Lars-Peter Clausen <lars@metafoo.de>
---
 drivers/iio/accel/bmc150-accel-core.c |  3 ++-
 drivers/iio/industrialio-buffer.c     | 12 ++++++++++++
 include/linux/iio/buffer.h            |  2 ++
 3 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/iio/accel/bmc150-accel-core.c b/drivers/iio/accel/bmc150-accel-core.c
index 59b380dbf27f..6b5d3be283c4 100644
--- a/drivers/iio/accel/bmc150-accel-core.c
+++ b/drivers/iio/accel/bmc150-accel-core.c
@@ -1638,7 +1638,8 @@ int bmc150_accel_core_probe(struct device *dev, struct regmap *regmap, int irq,
 		if (block_supported) {
 			indio_dev->modes |= INDIO_BUFFER_SOFTWARE;
 			indio_dev->info = &bmc150_accel_info_fifo;
-			indio_dev->buffer->attrs = bmc150_accel_fifo_attributes;
+			iio_buffer_set_attrs(indio_dev->buffer,
+					     bmc150_accel_fifo_attributes);
 		}
 	}
 
diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
index 9f496eb84e0b..831537cc9500 100644
--- a/drivers/iio/industrialio-buffer.c
+++ b/drivers/iio/industrialio-buffer.c
@@ -209,6 +209,18 @@ void iio_buffer_init(struct iio_buffer *buffer)
 }
 EXPORT_SYMBOL(iio_buffer_init);
 
+/**
+ * iio_buffer_set_attrs - Set buffer specific attributes
+ * @buffer: The buffer for which we are setting attributes
+ * @attrs: Pointer to a null terminated list of pointers to attributes
+ */
+void iio_buffer_set_attrs(struct iio_buffer *buffer,
+			 const struct attribute **attrs)
+{
+	buffer->attrs = attrs;
+}
+EXPORT_SYMBOL_GPL(iio_buffer_set_attrs);
+
 static ssize_t iio_show_scan_index(struct device *dev,
 				   struct device_attribute *attr,
 				   char *buf)
diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h
index 4a65a7bb40a4..30ea9806db67 100644
--- a/include/linux/iio/buffer.h
+++ b/include/linux/iio/buffer.h
@@ -17,6 +17,8 @@
 
 struct iio_buffer;
 
+void iio_buffer_set_attrs(struct iio_buffer *buffer,
+			 const struct attribute **attrs);
 /**
  * INDIO_BUFFER_FLAG_FIXED_WATERMARK - Watermark level of the buffer can not be
  *   configured. It has a fixed value which will be buffer specific.
-- 
cgit v1.2.3


From c2bf8d5f3262b3942bf923ef3b86d6ebe590821d Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <jic23@kernel.org>
Date: Mon, 2 Jan 2017 19:28:27 +0000
Subject: iio:buffer: Stop exporting iio_scan_mask_query

Nothing uses it outside of core code.

Signed-off-by: Jonathan Cameron <jic23@kernel.org>
Reviewed-by: Lars-Peter Clausen <lars@metafoo.de>
---
 drivers/iio/industrialio-buffer.c | 27 +++++++++++++--------------
 include/linux/iio/buffer.h        |  3 ---
 2 files changed, 13 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
index 831537cc9500..5eb991b24dff 100644
--- a/drivers/iio/industrialio-buffer.c
+++ b/drivers/iio/industrialio-buffer.c
@@ -358,6 +358,19 @@ static int iio_scan_mask_clear(struct iio_buffer *buffer, int bit)
 	return 0;
 }
 
+static int iio_scan_mask_query(struct iio_dev *indio_dev,
+			       struct iio_buffer *buffer, int bit)
+{
+	if (bit > indio_dev->masklength)
+		return -EINVAL;
+
+	if (!buffer->scan_mask)
+		return 0;
+
+	/* Ensure return value is 0 or 1. */
+	return !!test_bit(bit, buffer->scan_mask);
+};
+
 static ssize_t iio_scan_el_store(struct device *dev,
 				 struct device_attribute *attr,
 				 const char *buf,
@@ -1340,20 +1353,6 @@ bool iio_validate_scan_mask_onehot(struct iio_dev *indio_dev,
 }
 EXPORT_SYMBOL_GPL(iio_validate_scan_mask_onehot);
 
-int iio_scan_mask_query(struct iio_dev *indio_dev,
-			struct iio_buffer *buffer, int bit)
-{
-	if (bit > indio_dev->masklength)
-		return -EINVAL;
-
-	if (!buffer->scan_mask)
-		return 0;
-
-	/* Ensure return value is 0 or 1. */
-	return !!test_bit(bit, buffer->scan_mask);
-};
-EXPORT_SYMBOL_GPL(iio_scan_mask_query);
-
 static const void *iio_demux(struct iio_buffer *buffer,
 				 const void *datain)
 {
diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h
index 30ea9806db67..635aa87eb5e9 100644
--- a/include/linux/iio/buffer.h
+++ b/include/linux/iio/buffer.h
@@ -161,9 +161,6 @@ int iio_update_buffers(struct iio_dev *indio_dev,
  **/
 void iio_buffer_init(struct iio_buffer *buffer);
 
-int iio_scan_mask_query(struct iio_dev *indio_dev,
-			struct iio_buffer *buffer, int bit);
-
 /**
  * iio_push_to_buffers() - push to a registered buffer.
  * @indio_dev:		iio_dev structure for device.
-- 
cgit v1.2.3


From 315a19eca0e7cbae1bef7f43b36fdcfc33f248f6 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <jic23@kernel.org>
Date: Mon, 2 Jan 2017 19:28:28 +0000
Subject: iio:buffers: Push some docs down into the .c file.

Ancient legacy of me doing it wrong which it is nice to clear
up whilst we are here.

Signed-off-by: Jonathan Cameron <jic23@kernel.org>
Reviewed-by: Lars-Peter Clausen <lars@metafoo.de>
---
 drivers/iio/industrialio-buffer.c | 5 +++++
 include/linux/iio/buffer.h        | 5 -----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
index 5eb991b24dff..0067e184c9ae 100644
--- a/drivers/iio/industrialio-buffer.c
+++ b/drivers/iio/industrialio-buffer.c
@@ -1384,6 +1384,11 @@ static int iio_push_to_buffer(struct iio_buffer *buffer, const void *data)
 	return 0;
 }
 
+/**
+ * iio_push_to_buffers() - push to a registered buffer.
+ * @indio_dev:		iio_dev structure for device.
+ * @data:		Full scan.
+ */
 int iio_push_to_buffers(struct iio_dev *indio_dev, const void *data)
 {
 	int ret;
diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h
index 635aa87eb5e9..cd77ed14eb7f 100644
--- a/include/linux/iio/buffer.h
+++ b/include/linux/iio/buffer.h
@@ -161,11 +161,6 @@ int iio_update_buffers(struct iio_dev *indio_dev,
  **/
 void iio_buffer_init(struct iio_buffer *buffer);
 
-/**
- * iio_push_to_buffers() - push to a registered buffer.
- * @indio_dev:		iio_dev structure for device.
- * @data:		Full scan.
- */
 int iio_push_to_buffers(struct iio_dev *indio_dev, const void *data);
 
 /*
-- 
cgit v1.2.3


From d4ad4f4b721ad76e28b73e32b8602c011e281893 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <jic23@kernel.org>
Date: Mon, 2 Jan 2017 19:28:29 +0000
Subject: iio:buffer:iio_push_to_buffers_with_timestamp fix kernel-doc.

Wrong start of kernel doc comment /* -> /**

Signed-off-by: Jonathan Cameron <jic23@kernel.org>
Reviewed-by: Lars-Peter Clausen <lars@metafoo.de>
---
 include/linux/iio/buffer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h
index cd77ed14eb7f..8c915c2c18f1 100644
--- a/include/linux/iio/buffer.h
+++ b/include/linux/iio/buffer.h
@@ -163,7 +163,7 @@ void iio_buffer_init(struct iio_buffer *buffer);
 
 int iio_push_to_buffers(struct iio_dev *indio_dev, const void *data);
 
-/*
+/**
  * iio_push_to_buffers_with_timestamp() - push data and timestamp to buffers
  * @indio_dev:		iio_dev structure for device.
  * @data:		sample data
-- 
cgit v1.2.3


From 8abd5ba53962854c3a1c21d04fa6fdba54cc0ee1 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <jic23@kernel.org>
Date: Mon, 2 Jan 2017 19:28:30 +0000
Subject: iio:kfifo_buf header include push down.

As a precursor to splitting buffer.h, lets make sure all drivers
include the relevant headers rather than relying on picking them
up from kfifo_buf.h.

Signed-off-by: Jonathan Cameron <jic23@kernel.org>
Reviewed-by: Lars-Peter Clausen <lars@metafoo.de>
---
 drivers/iio/accel/ssp_accel_sensor.c        | 1 +
 drivers/iio/adc/ina2xx-adc.c                | 2 ++
 drivers/iio/buffer/kfifo_buf.c              | 1 +
 drivers/iio/common/ssp_sensors/ssp_iio.c    | 1 +
 drivers/iio/dummy/iio_simple_dummy_buffer.c | 1 +
 drivers/iio/gyro/ssp_gyro_sensor.c          | 1 +
 drivers/staging/iio/meter/ade7758_ring.c    | 1 +
 include/linux/iio/kfifo_buf.h               | 5 ++---
 8 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/accel/ssp_accel_sensor.c b/drivers/iio/accel/ssp_accel_sensor.c
index 31db00970fa0..6b54008e29c7 100644
--- a/drivers/iio/accel/ssp_accel_sensor.c
+++ b/drivers/iio/accel/ssp_accel_sensor.c
@@ -15,6 +15,7 @@
 
 #include <linux/iio/common/ssp_sensors.h>
 #include <linux/iio/iio.h>
+#include <linux/iio/buffer.h>
 #include <linux/iio/kfifo_buf.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
diff --git a/drivers/iio/adc/ina2xx-adc.c b/drivers/iio/adc/ina2xx-adc.c
index 59b7d76e1ad2..3263231276ca 100644
--- a/drivers/iio/adc/ina2xx-adc.c
+++ b/drivers/iio/adc/ina2xx-adc.c
@@ -22,6 +22,8 @@
 
 #include <linux/delay.h>
 #include <linux/i2c.h>
+#include <linux/iio/iio.h>
+#include <linux/iio/buffer.h>
 #include <linux/iio/kfifo_buf.h>
 #include <linux/iio/sysfs.h>
 #include <linux/kthread.h>
diff --git a/drivers/iio/buffer/kfifo_buf.c b/drivers/iio/buffer/kfifo_buf.c
index c5b999f0c519..c035ed1d5ee1 100644
--- a/drivers/iio/buffer/kfifo_buf.c
+++ b/drivers/iio/buffer/kfifo_buf.c
@@ -5,6 +5,7 @@
 #include <linux/workqueue.h>
 #include <linux/kfifo.h>
 #include <linux/mutex.h>
+#include <linux/iio/buffer.h>
 #include <linux/iio/kfifo_buf.h>
 #include <linux/sched.h>
 #include <linux/poll.h>
diff --git a/drivers/iio/common/ssp_sensors/ssp_iio.c b/drivers/iio/common/ssp_sensors/ssp_iio.c
index a3ae165f8d9f..645f2e3975db 100644
--- a/drivers/iio/common/ssp_sensors/ssp_iio.c
+++ b/drivers/iio/common/ssp_sensors/ssp_iio.c
@@ -14,6 +14,7 @@
  */
 
 #include <linux/iio/common/ssp_sensors.h>
+#include <linux/iio/buffer.h>
 #include <linux/iio/kfifo_buf.h>
 #include <linux/module.h>
 #include <linux/slab.h>
diff --git a/drivers/iio/dummy/iio_simple_dummy_buffer.c b/drivers/iio/dummy/iio_simple_dummy_buffer.c
index b383892a5193..26bddb2464b7 100644
--- a/drivers/iio/dummy/iio_simple_dummy_buffer.c
+++ b/drivers/iio/dummy/iio_simple_dummy_buffer.c
@@ -20,6 +20,7 @@
 
 #include <linux/iio/iio.h>
 #include <linux/iio/trigger_consumer.h>
+#include <linux/iio/buffer.h>
 #include <linux/iio/kfifo_buf.h>
 
 #include "iio_simple_dummy.h"
diff --git a/drivers/iio/gyro/ssp_gyro_sensor.c b/drivers/iio/gyro/ssp_gyro_sensor.c
index 1f25f406c545..33a7f0a94ce5 100644
--- a/drivers/iio/gyro/ssp_gyro_sensor.c
+++ b/drivers/iio/gyro/ssp_gyro_sensor.c
@@ -15,6 +15,7 @@
 
 #include <linux/iio/common/ssp_sensors.h>
 #include <linux/iio/iio.h>
+#include <linux/iio/buffer.h>
 #include <linux/iio/kfifo_buf.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
diff --git a/drivers/staging/iio/meter/ade7758_ring.c b/drivers/staging/iio/meter/ade7758_ring.c
index 57c213dfadcc..6d7444d6e880 100644
--- a/drivers/staging/iio/meter/ade7758_ring.c
+++ b/drivers/staging/iio/meter/ade7758_ring.c
@@ -13,6 +13,7 @@
 #include <asm/unaligned.h>
 
 #include <linux/iio/iio.h>
+#include <linux/iio/buffer.h>
 #include <linux/iio/kfifo_buf.h>
 #include <linux/iio/trigger_consumer.h>
 #include "ade7758.h"
diff --git a/include/linux/iio/kfifo_buf.h b/include/linux/iio/kfifo_buf.h
index 1683bc710d14..027cfa9c3703 100644
--- a/include/linux/iio/kfifo_buf.h
+++ b/include/linux/iio/kfifo_buf.h
@@ -1,9 +1,8 @@
 #ifndef __LINUX_IIO_KFIFO_BUF_H__
 #define __LINUX_IIO_KFIFO_BUF_H__
 
-#include <linux/kfifo.h>
-#include <linux/iio/iio.h>
-#include <linux/iio/buffer.h>
+struct iio_buffer;
+struct device;
 
 struct iio_buffer *iio_kfifo_allocate(void);
 void iio_kfifo_free(struct iio_buffer *r);
-- 
cgit v1.2.3


From 2b827ad54111439b919605da90e122ecda88bad6 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <jic23@kernel.org>
Date: Mon, 2 Jan 2017 19:28:32 +0000
Subject: iio:buffer: Push implementation of iio_device_attach_buffer into .c
 file

This is a precursor to the splitting of buffer.h into parts relevant
to buffer implementation vs those for devices using buffers.
struct buffer is about to become opaque as far as the header is
concerned.

Signed-off-by: Jonathan Cameron <jic23@kernel.org>
Reviewed-by: Lars-Peter Clausen <lars@metafoo.de>
---
 drivers/iio/industrialio-buffer.c | 16 ++++++++++++++++
 include/linux/iio/buffer.h        | 16 ++--------------
 2 files changed, 18 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
index 0067e184c9ae..a04498231f94 100644
--- a/drivers/iio/industrialio-buffer.c
+++ b/drivers/iio/industrialio-buffer.c
@@ -1445,3 +1445,19 @@ void iio_buffer_put(struct iio_buffer *buffer)
 		kref_put(&buffer->ref, iio_buffer_release);
 }
 EXPORT_SYMBOL_GPL(iio_buffer_put);
+
+/**
+ * iio_device_attach_buffer - Attach a buffer to a IIO device
+ * @indio_dev: The device the buffer should be attached to
+ * @buffer: The buffer to attach to the device
+ *
+ * This function attaches a buffer to a IIO device. The buffer stays attached to
+ * the device until the device is freed. The function should only be called at
+ * most once per device.
+ */
+void iio_device_attach_buffer(struct iio_dev *indio_dev,
+			      struct iio_buffer *buffer)
+{
+	indio_dev->buffer = iio_buffer_get(buffer);
+}
+EXPORT_SYMBOL_GPL(iio_device_attach_buffer);
diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h
index 8c915c2c18f1..5cdfe2b7d674 100644
--- a/include/linux/iio/buffer.h
+++ b/include/linux/iio/buffer.h
@@ -194,20 +194,8 @@ bool iio_validate_scan_mask_onehot(struct iio_dev *indio_dev,
 struct iio_buffer *iio_buffer_get(struct iio_buffer *buffer);
 void iio_buffer_put(struct iio_buffer *buffer);
 
-/**
- * iio_device_attach_buffer - Attach a buffer to a IIO device
- * @indio_dev: The device the buffer should be attached to
- * @buffer: The buffer to attach to the device
- *
- * This function attaches a buffer to a IIO device. The buffer stays attached to
- * the device until the device is freed. The function should only be called at
- * most once per device.
- */
-static inline void iio_device_attach_buffer(struct iio_dev *indio_dev,
-	struct iio_buffer *buffer)
-{
-	indio_dev->buffer = iio_buffer_get(buffer);
-}
+void iio_device_attach_buffer(struct iio_dev *indio_dev,
+			      struct iio_buffer *buffer);
 
 #else /* CONFIG_IIO_BUFFER */
 
-- 
cgit v1.2.3


From 33dd94cb972175249258329c4aaffddcc82c2005 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <jic23@kernel.org>
Date: Mon, 2 Jan 2017 19:28:34 +0000
Subject: iio:buffer.h - split into buffer.h and buffer_impl.h

buffer.h supplies everything needed for devices using buffers.
buffer_impl.h supplies access to the internals as needed to write
a buffer implementation.

This was really motivated by the mess that turned up in the
kernel-doc documentation pulled in by the new sphinx docs.
It made it clear that our logical separations in headers were
generally terrible.  The buffer case was easy to sort out without
greatly effecting drivers so here it is.

Signed-off-by: Jonathan Cameron <jic23@kernel.org>
Reviewed-by: Lars-Peter Clausen <lars@metafoo.de>
---
 drivers/iio/buffer/industrialio-buffer-cb.c |   2 +-
 drivers/iio/buffer/kfifo_buf.c              |   1 +
 drivers/iio/industrialio-buffer.c           |   1 +
 drivers/iio/industrialio-core.c             |   1 +
 include/linux/iio/buffer.h                  | 156 +--------------------------
 include/linux/iio/buffer_impl.h             | 162 ++++++++++++++++++++++++++++
 6 files changed, 167 insertions(+), 156 deletions(-)
 create mode 100644 include/linux/iio/buffer_impl.h

(limited to 'include/linux')

diff --git a/drivers/iio/buffer/industrialio-buffer-cb.c b/drivers/iio/buffer/industrialio-buffer-cb.c
index 79fb2f9de759..4847534700e7 100644
--- a/drivers/iio/buffer/industrialio-buffer-cb.c
+++ b/drivers/iio/buffer/industrialio-buffer-cb.c
@@ -11,7 +11,7 @@
 #include <linux/err.h>
 #include <linux/export.h>
 #include <linux/iio/iio.h>
-#include <linux/iio/buffer.h>
+#include <linux/iio/buffer_impl.h>
 #include <linux/iio/consumer.h>
 
 struct iio_cb_buffer {
diff --git a/drivers/iio/buffer/kfifo_buf.c b/drivers/iio/buffer/kfifo_buf.c
index a47118e7db6f..047fe757ab97 100644
--- a/drivers/iio/buffer/kfifo_buf.c
+++ b/drivers/iio/buffer/kfifo_buf.c
@@ -8,6 +8,7 @@
 #include <linux/iio/iio.h>
 #include <linux/iio/buffer.h>
 #include <linux/iio/kfifo_buf.h>
+#include <linux/iio/buffer_impl.h>
 #include <linux/sched.h>
 #include <linux/poll.h>
 
diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
index a04498231f94..4972986f6455 100644
--- a/drivers/iio/industrialio-buffer.c
+++ b/drivers/iio/industrialio-buffer.c
@@ -26,6 +26,7 @@
 #include "iio_core.h"
 #include <linux/iio/sysfs.h>
 #include <linux/iio/buffer.h>
+#include <linux/iio/buffer_impl.h>
 
 static const char * const iio_endian_prefix[] = {
 	[IIO_BE] = "be",
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index c601698e0910..d18ded45bedd 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -32,6 +32,7 @@
 #include <linux/iio/sysfs.h>
 #include <linux/iio/events.h>
 #include <linux/iio/buffer.h>
+#include <linux/iio/buffer_impl.h>
 
 /* IDA to assign each registered device a unique id */
 static DEFINE_IDA(iio_ida);
diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h
index 5cdfe2b7d674..48767c776119 100644
--- a/include/linux/iio/buffer.h
+++ b/include/linux/iio/buffer.h
@@ -11,155 +11,11 @@
 #define _IIO_BUFFER_GENERIC_H_
 #include <linux/sysfs.h>
 #include <linux/iio/iio.h>
-#include <linux/kref.h>
-
-#ifdef CONFIG_IIO_BUFFER
 
 struct iio_buffer;
 
 void iio_buffer_set_attrs(struct iio_buffer *buffer,
 			 const struct attribute **attrs);
-/**
- * INDIO_BUFFER_FLAG_FIXED_WATERMARK - Watermark level of the buffer can not be
- *   configured. It has a fixed value which will be buffer specific.
- */
-#define INDIO_BUFFER_FLAG_FIXED_WATERMARK BIT(0)
-
-/**
- * struct iio_buffer_access_funcs - access functions for buffers.
- * @store_to:		actually store stuff to the buffer
- * @read_first_n:	try to get a specified number of bytes (must exist)
- * @data_available:	indicates how much data is available for reading from
- *			the buffer.
- * @request_update:	if a parameter change has been marked, update underlying
- *			storage.
- * @set_bytes_per_datum:set number of bytes per datum
- * @set_length:		set number of datums in buffer
- * @enable:             called if the buffer is attached to a device and the
- *                      device starts sampling. Calls are balanced with
- *                      @disable.
- * @disable:            called if the buffer is attached to a device and the
- *                      device stops sampling. Calles are balanced with @enable.
- * @release:		called when the last reference to the buffer is dropped,
- *			should free all resources allocated by the buffer.
- * @modes:		Supported operating modes by this buffer type
- * @flags:		A bitmask combination of INDIO_BUFFER_FLAG_*
- *
- * The purpose of this structure is to make the buffer element
- * modular as event for a given driver, different usecases may require
- * different buffer designs (space efficiency vs speed for example).
- *
- * It is worth noting that a given buffer implementation may only support a
- * small proportion of these functions.  The core code 'should' cope fine with
- * any of them not existing.
- **/
-struct iio_buffer_access_funcs {
-	int (*store_to)(struct iio_buffer *buffer, const void *data);
-	int (*read_first_n)(struct iio_buffer *buffer,
-			    size_t n,
-			    char __user *buf);
-	size_t (*data_available)(struct iio_buffer *buffer);
-
-	int (*request_update)(struct iio_buffer *buffer);
-
-	int (*set_bytes_per_datum)(struct iio_buffer *buffer, size_t bpd);
-	int (*set_length)(struct iio_buffer *buffer, int length);
-
-	int (*enable)(struct iio_buffer *buffer, struct iio_dev *indio_dev);
-	int (*disable)(struct iio_buffer *buffer, struct iio_dev *indio_dev);
-
-	void (*release)(struct iio_buffer *buffer);
-
-	unsigned int modes;
-	unsigned int flags;
-};
-
-/**
- * struct iio_buffer - general buffer structure
- *
- * Note that the internals of this structure should only be of interest to
- * those writing new buffer implementations.
- */
-struct iio_buffer {
-	/** @length: Number of datums in buffer. */
-	int length;
-
-	/**  @bytes_per_datum: Size of individual datum including timestamp. */
-	int bytes_per_datum;
-
-	/**
-	 * @access: Buffer access functions associated with the
-	 * implementation.
-	 */
-	const struct iio_buffer_access_funcs *access;
-
-	/** @scan_mask: Bitmask used in masking scan mode elements. */
-	long *scan_mask;
-
-	/** @demux_list: List of operations required to demux the scan. */
-	struct list_head demux_list;
-
-	/** @pollq: Wait queue to allow for polling on the buffer. */
-	wait_queue_head_t pollq;
-
-	/** @watermark: Number of datums to wait for poll/read. */
-	unsigned int watermark;
-
-	/* private: */
-	/*
-	 * @scan_el_attrs: Control of scan elements if that scan mode
-	 * control method is used.
-	 */
-	struct attribute_group *scan_el_attrs;
-
-	/* @scan_timestamp: Does the scan mode include a timestamp. */
-	bool scan_timestamp;
-
-	/* @scan_el_dev_attr_list: List of scan element related attributes. */
-	struct list_head scan_el_dev_attr_list;
-
-	/* @buffer_group: Attributes of the buffer group. */
-	struct attribute_group buffer_group;
-
-	/*
-	 * @scan_el_group: Attribute group for those attributes not
-	 * created from the iio_chan_info array.
-	 */
-	struct attribute_group scan_el_group;
-
-	/* @stufftoread: Flag to indicate new data. */
-	bool stufftoread;
-
-	/* @attrs: Standard attributes of the buffer. */
-	const struct attribute **attrs;
-
-	/* @demux_bounce: Buffer for doing gather from incoming scan. */
-	void *demux_bounce;
-
-	/* @buffer_list: Entry in the devices list of current buffers. */
-	struct list_head buffer_list;
-
-	/* @ref: Reference count of the buffer. */
-	struct kref ref;
-};
-
-/**
- * iio_update_buffers() - add or remove buffer from active list
- * @indio_dev:		device to add buffer to
- * @insert_buffer:	buffer to insert
- * @remove_buffer:	buffer_to_remove
- *
- * Note this will tear down the all buffering and build it up again
- */
-int iio_update_buffers(struct iio_dev *indio_dev,
-		       struct iio_buffer *insert_buffer,
-		       struct iio_buffer *remove_buffer);
-
-/**
- * iio_buffer_init() - Initialize the buffer structure
- * @buffer:		buffer to be initialized
- **/
-void iio_buffer_init(struct iio_buffer *buffer);
 
 int iio_push_to_buffers(struct iio_dev *indio_dev, const void *data);
 
@@ -189,19 +45,9 @@ static inline int iio_push_to_buffers_with_timestamp(struct iio_dev *indio_dev,
 }
 
 bool iio_validate_scan_mask_onehot(struct iio_dev *indio_dev,
-	const unsigned long *mask);
-
-struct iio_buffer *iio_buffer_get(struct iio_buffer *buffer);
-void iio_buffer_put(struct iio_buffer *buffer);
+				   const unsigned long *mask);
 
 void iio_device_attach_buffer(struct iio_dev *indio_dev,
 			      struct iio_buffer *buffer);
 
-#else /* CONFIG_IIO_BUFFER */
-
-static inline void iio_buffer_get(struct iio_buffer *buffer) {}
-static inline void iio_buffer_put(struct iio_buffer *buffer) {}
-
-#endif /* CONFIG_IIO_BUFFER */
-
 #endif /* _IIO_BUFFER_GENERIC_H_ */
diff --git a/include/linux/iio/buffer_impl.h b/include/linux/iio/buffer_impl.h
new file mode 100644
index 000000000000..8daba198fafa
--- /dev/null
+++ b/include/linux/iio/buffer_impl.h
@@ -0,0 +1,162 @@
+#ifndef _IIO_BUFFER_GENERIC_IMPL_H_
+#define _IIO_BUFFER_GENERIC_IMPL_H_
+#include <linux/sysfs.h>
+#include <linux/kref.h>
+
+#ifdef CONFIG_IIO_BUFFER
+
+struct iio_dev;
+struct iio_buffer;
+
+/**
+ * INDIO_BUFFER_FLAG_FIXED_WATERMARK - Watermark level of the buffer can not be
+ *   configured. It has a fixed value which will be buffer specific.
+ */
+#define INDIO_BUFFER_FLAG_FIXED_WATERMARK BIT(0)
+
+/**
+ * struct iio_buffer_access_funcs - access functions for buffers.
+ * @store_to:		actually store stuff to the buffer
+ * @read_first_n:	try to get a specified number of bytes (must exist)
+ * @data_available:	indicates how much data is available for reading from
+ *			the buffer.
+ * @request_update:	if a parameter change has been marked, update underlying
+ *			storage.
+ * @set_bytes_per_datum:set number of bytes per datum
+ * @set_length:		set number of datums in buffer
+ * @enable:             called if the buffer is attached to a device and the
+ *                      device starts sampling. Calls are balanced with
+ *                      @disable.
+ * @disable:            called if the buffer is attached to a device and the
+ *                      device stops sampling. Calles are balanced with @enable.
+ * @release:		called when the last reference to the buffer is dropped,
+ *			should free all resources allocated by the buffer.
+ * @modes:		Supported operating modes by this buffer type
+ * @flags:		A bitmask combination of INDIO_BUFFER_FLAG_*
+ *
+ * The purpose of this structure is to make the buffer element
+ * modular as event for a given driver, different usecases may require
+ * different buffer designs (space efficiency vs speed for example).
+ *
+ * It is worth noting that a given buffer implementation may only support a
+ * small proportion of these functions.  The core code 'should' cope fine with
+ * any of them not existing.
+ **/
+struct iio_buffer_access_funcs {
+	int (*store_to)(struct iio_buffer *buffer, const void *data);
+	int (*read_first_n)(struct iio_buffer *buffer,
+			    size_t n,
+			    char __user *buf);
+	size_t (*data_available)(struct iio_buffer *buffer);
+
+	int (*request_update)(struct iio_buffer *buffer);
+
+	int (*set_bytes_per_datum)(struct iio_buffer *buffer, size_t bpd);
+	int (*set_length)(struct iio_buffer *buffer, int length);
+
+	int (*enable)(struct iio_buffer *buffer, struct iio_dev *indio_dev);
+	int (*disable)(struct iio_buffer *buffer, struct iio_dev *indio_dev);
+
+	void (*release)(struct iio_buffer *buffer);
+
+	unsigned int modes;
+	unsigned int flags;
+};
+
+/**
+ * struct iio_buffer - general buffer structure
+ *
+ * Note that the internals of this structure should only be of interest to
+ * those writing new buffer implementations.
+ */
+struct iio_buffer {
+	/** @length: Number of datums in buffer. */
+	int length;
+
+	/**  @bytes_per_datum: Size of individual datum including timestamp. */
+	int bytes_per_datum;
+
+	/**
+	 * @access: Buffer access functions associated with the
+	 * implementation.
+	 */
+	const struct iio_buffer_access_funcs *access;
+
+	/** @scan_mask: Bitmask used in masking scan mode elements. */
+	long *scan_mask;
+
+	/** @demux_list: List of operations required to demux the scan. */
+	struct list_head demux_list;
+
+	/** @pollq: Wait queue to allow for polling on the buffer. */
+	wait_queue_head_t pollq;
+
+	/** @watermark: Number of datums to wait for poll/read. */
+	unsigned int watermark;
+
+	/* private: */
+	/*
+	 * @scan_el_attrs: Control of scan elements if that scan mode
+	 * control method is used.
+	 */
+	struct attribute_group *scan_el_attrs;
+
+	/* @scan_timestamp: Does the scan mode include a timestamp. */
+	bool scan_timestamp;
+
+	/* @scan_el_dev_attr_list: List of scan element related attributes. */
+	struct list_head scan_el_dev_attr_list;
+
+	/* @buffer_group: Attributes of the buffer group. */
+	struct attribute_group buffer_group;
+
+	/*
+	 * @scan_el_group: Attribute group for those attributes not
+	 * created from the iio_chan_info array.
+	 */
+	struct attribute_group scan_el_group;
+
+	/* @stufftoread: Flag to indicate new data. */
+	bool stufftoread;
+
+	/* @attrs: Standard attributes of the buffer. */
+	const struct attribute **attrs;
+
+	/* @demux_bounce: Buffer for doing gather from incoming scan. */
+	void *demux_bounce;
+
+	/* @buffer_list: Entry in the devices list of current buffers. */
+	struct list_head buffer_list;
+
+	/* @ref: Reference count of the buffer. */
+	struct kref ref;
+};
+
+/**
+ * iio_update_buffers() - add or remove buffer from active list
+ * @indio_dev:		device to add buffer to
+ * @insert_buffer:	buffer to insert
+ * @remove_buffer:	buffer_to_remove
+ *
+ * Note this will tear down the all buffering and build it up again
+ */
+int iio_update_buffers(struct iio_dev *indio_dev,
+		       struct iio_buffer *insert_buffer,
+		       struct iio_buffer *remove_buffer);
+
+/**
+ * iio_buffer_init() - Initialize the buffer structure
+ * @buffer:		buffer to be initialized
+ **/
+void iio_buffer_init(struct iio_buffer *buffer);
+
+struct iio_buffer *iio_buffer_get(struct iio_buffer *buffer);
+void iio_buffer_put(struct iio_buffer *buffer);
+
+#else /* CONFIG_IIO_BUFFER */
+
+static inline void iio_buffer_get(struct iio_buffer *buffer) {}
+static inline void iio_buffer_put(struct iio_buffer *buffer) {}
+
+#endif /* CONFIG_IIO_BUFFER */
+#endif /* _IIO_BUFFER_GENERIC_IMPL_H_ */
-- 
cgit v1.2.3


From ccb61f8a99e6c29df4fb96a65dad4fad740d5be9 Mon Sep 17 00:00:00 2001
From: "K. Y. Srinivasan" <kys@microsoft.com>
Date: Thu, 22 Dec 2016 16:54:00 -0800
Subject: Drivers: hv: vmbus: Fix a rescind handling bug

The host can rescind a channel that has been offered to the
guest and once the channel is rescinded, the host does not
respond to any requests on that channel. Deal with the case where
the guest may be blocked waiting for a response from the host.

Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/channel.c      | 18 ++++++++++++++++++
 drivers/hv/channel_mgmt.c | 25 +++++++++++++++++++++++++
 include/linux/hyperv.h    |  1 +
 3 files changed, 44 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index d5b8d9fd50bb..be34547cdb68 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -157,6 +157,7 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size,
 	}
 
 	init_completion(&open_info->waitevent);
+	open_info->waiting_channel = newchannel;
 
 	open_msg = (struct vmbus_channel_open_channel *)open_info->msg;
 	open_msg->header.msgtype = CHANNELMSG_OPENCHANNEL;
@@ -194,6 +195,11 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size,
 	list_del(&open_info->msglistentry);
 	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
 
+	if (newchannel->rescind) {
+		err = -ENODEV;
+		goto error_free_gpadl;
+	}
+
 	if (open_info->response.open_result.status) {
 		err = -EAGAIN;
 		goto error_free_gpadl;
@@ -405,6 +411,7 @@ int vmbus_establish_gpadl(struct vmbus_channel *channel, void *kbuffer,
 		return ret;
 
 	init_completion(&msginfo->waitevent);
+	msginfo->waiting_channel = channel;
 
 	gpadlmsg = (struct vmbus_channel_gpadl_header *)msginfo->msg;
 	gpadlmsg->header.msgtype = CHANNELMSG_GPADL_HEADER;
@@ -441,6 +448,11 @@ int vmbus_establish_gpadl(struct vmbus_channel *channel, void *kbuffer,
 	}
 	wait_for_completion(&msginfo->waitevent);
 
+	if (channel->rescind) {
+		ret = -ENODEV;
+		goto cleanup;
+	}
+
 	/* At this point, we received the gpadl created msg */
 	*gpadl_handle = gpadlmsg->gpadl;
 
@@ -474,6 +486,7 @@ int vmbus_teardown_gpadl(struct vmbus_channel *channel, u32 gpadl_handle)
 		return -ENOMEM;
 
 	init_completion(&info->waitevent);
+	info->waiting_channel = channel;
 
 	msg = (struct vmbus_channel_gpadl_teardown *)info->msg;
 
@@ -493,6 +506,11 @@ int vmbus_teardown_gpadl(struct vmbus_channel *channel, u32 gpadl_handle)
 
 	wait_for_completion(&info->waitevent);
 
+	if (channel->rescind) {
+		ret = -ENODEV;
+		goto post_msg_err;
+	}
+
 post_msg_err:
 	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
 	list_del(&info->msglistentry);
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index b1e85d24bb4e..0af7e39006c8 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -147,6 +147,29 @@ static const struct {
 	{ HV_RDV_GUID	},
 };
 
+/*
+ * The rescinded channel may be blocked waiting for a response from the host;
+ * take care of that.
+ */
+static void vmbus_rescind_cleanup(struct vmbus_channel *channel)
+{
+	struct vmbus_channel_msginfo *msginfo;
+	unsigned long flags;
+
+
+	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
+
+	list_for_each_entry(msginfo, &vmbus_connection.chn_msg_list,
+				msglistentry) {
+
+		if (msginfo->waiting_channel == channel) {
+			complete(&msginfo->waitevent);
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
+}
+
 static bool is_unsupported_vmbus_devs(const uuid_le *guid)
 {
 	int i;
@@ -825,6 +848,8 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr)
 	channel->rescind = true;
 	spin_unlock_irqrestore(&channel->lock, flags);
 
+	vmbus_rescind_cleanup(channel);
+
 	if (channel->device_obj) {
 		if (channel->chn_rescind_callback) {
 			channel->chn_rescind_callback(channel);
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 42fe43fb0c80..7ea20bd7cdd1 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -627,6 +627,7 @@ struct vmbus_channel_msginfo {
 
 	/* Synchronize the request/response if needed */
 	struct completion  waitevent;
+	struct vmbus_channel *waiting_channel;
 	union {
 		struct vmbus_channel_version_supported version_supported;
 		struct vmbus_channel_open_result open_result;
-- 
cgit v1.2.3


From 874bcd00f520cac297aefade201c4efc07fc8d17 Mon Sep 17 00:00:00 2001
From: Corentin Labbe <clabbe.montjoie@gmail.com>
Date: Thu, 15 Dec 2016 20:36:33 +0100
Subject: apm-emulation: move APM_MINOR_DEV to include/linux/miscdevice.h

This patch move the define for APM_MINOR_DEV to include/linux/miscdevice.h
It is better that all minor number definitions are in the same place.

Signed-off-by: Corentin Labbe <clabbe.montjoie@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/char/apm-emulation.c | 7 -------
 include/linux/miscdevice.h   | 1 +
 2 files changed, 1 insertion(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/apm-emulation.c b/drivers/char/apm-emulation.c
index dd9dfa15e9d1..1dfb9f8de171 100644
--- a/drivers/char/apm-emulation.c
+++ b/drivers/char/apm-emulation.c
@@ -31,13 +31,6 @@
 #include <linux/kthread.h>
 #include <linux/delay.h>
 
-
-/*
- * The apm_bios device is one of the misc char devices.
- * This is its minor number.
- */
-#define APM_MINOR_DEV	134
-
 /*
  * One option can be changed at boot time as follows:
  *	apm=on/off			enable/disable APM
diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
index ed30d5d713e3..0590263c462c 100644
--- a/include/linux/miscdevice.h
+++ b/include/linux/miscdevice.h
@@ -22,6 +22,7 @@
 /*#define ADB_MOUSE_MINOR	10	FIXME OBSOLETE */
 #define WATCHDOG_MINOR		130	/* Watchdog timer     */
 #define TEMP_MINOR		131	/* Temperature Sensor */
+#define APM_MINOR_DEV		134
 #define RTC_MINOR		135
 #define EFI_RTC_MINOR		136	/* EFI Time services */
 #define VHCI_MINOR		137
-- 
cgit v1.2.3


From b0f2d7d546d37697d3f50753904f6f0c549b62bc Mon Sep 17 00:00:00 2001
From: Martyn Welch <martyn@welchs.me.uk>
Date: Sat, 10 Dec 2016 23:10:41 +0000
Subject: VME: Remove node entry from vme_driver

The vme_driver structure currently has a "node" entry. This entry is
never used and it's intended purpose has been lost to the mists of time.

Remove the entry from vme_driver to avoid confusion.

Signed-off-by: Martyn Welch <martyn.welch@collabora.co.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/vme.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/vme.h b/include/linux/vme.h
index 8c589176c2f8..ec5e8bf6118e 100644
--- a/include/linux/vme.h
+++ b/include/linux/vme.h
@@ -108,7 +108,6 @@ struct vme_dev {
 };
 
 struct vme_driver {
-	struct list_head node;
 	const char *name;
 	int (*match)(struct vme_dev *);
 	int (*probe)(struct vme_dev *);
-- 
cgit v1.2.3


From e41746b0dd51f353630112558835aa62c1eb883e Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 14 Dec 2016 15:33:39 +0000
Subject: debugfs: improve formatting of debugfs_real_fops()

Type of debugfs_real_fops() is longer than parameters and
the name, so there is no way to break the declaration nicely.
We have to go over 80 characters.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/debugfs.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index 014cc564d1c4..7574e86ff1eb 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -52,8 +52,7 @@ extern struct srcu_struct debugfs_srcu;
  * Must only be called under the protection established by
  * debugfs_use_file_start().
  */
-static inline const struct file_operations *
-debugfs_real_fops(const struct file *filp)
+static inline const struct file_operations *debugfs_real_fops(const struct file *filp)
 	__must_hold(&debugfs_srcu)
 {
 	/*
-- 
cgit v1.2.3


From a0244a8df15d8caee8831872b6fc97b676cb15bd Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Thu, 15 Dec 2016 19:55:54 +0100
Subject: kref: prefer atomic_inc_not_zero to atomic_add_unless

On most platforms, there exists this ifdef:

 #define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)

This makes this patch functionally useless. However, on PPC, there is
actually an explicit definition of atomic_inc_not_zero with its own
assembly that is slightly more optimized than atomic_add_unless. So,
this patch changes kref to use atomic_inc_not_zero instead, for PPC and
any future platforms that might provide an explicit implementation.

This also puts this usage of kref more in line with a verbatim reading
of the examples in Paul McKenney's paper [1] in the section titled "2.4
Atomic Counting With Check and Release Memory Barrier", which uses
atomic_inc_not_zero.

[1] http://open-std.org/jtc1/sc22/wg21/docs/papers/2007/n2167.pdf

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Thomas Hellstrom <thellstrom@vmware.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/kref.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kref.h b/include/linux/kref.h
index e15828fd71f1..62f0a84ae94e 100644
--- a/include/linux/kref.h
+++ b/include/linux/kref.h
@@ -133,6 +133,6 @@ static inline int kref_put_mutex(struct kref *kref,
  */
 static inline int __must_check kref_get_unless_zero(struct kref *kref)
 {
-	return atomic_add_unless(&kref->refcount, 1, 0);
+	return atomic_inc_not_zero(&kref->refcount);
 }
 #endif /* _KREF_H_ */
-- 
cgit v1.2.3


From bb5b06750f1d9b2d632d942d8a72b35a4dd930b9 Mon Sep 17 00:00:00 2001
From: Keerthy <j-keerthy@ti.com>
Date: Wed, 4 Jan 2017 13:56:28 +0530
Subject: gpio: davinci: Remove redundant members davinci_gpio_controller stuct

davinci_gpio_controller struct has set_data, in_data, clr_data
members that are assigned and never used.

Signed-off-by: Keerthy <j-keerthy@ti.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpio-davinci.c                | 3 ---
 include/linux/platform_data/gpio-davinci.h | 3 ---
 2 files changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-davinci.c b/drivers/gpio/gpio-davinci.c
index 9191056548fe..163f81e3db83 100644
--- a/drivers/gpio/gpio-davinci.c
+++ b/drivers/gpio/gpio-davinci.c
@@ -266,9 +266,6 @@ static int davinci_gpio_probe(struct platform_device *pdev)
 		if (!regs)
 			return -ENXIO;
 		chips[i].regs = regs;
-		chips[i].set_data = &regs->set_data;
-		chips[i].clr_data = &regs->clr_data;
-		chips[i].in_data = &regs->in_data;
 
 		gpiochip_add_data(&chips[i].chip, &chips[i]);
 	}
diff --git a/include/linux/platform_data/gpio-davinci.h b/include/linux/platform_data/gpio-davinci.h
index 6ace3fd32b6a..44ca5303849c 100644
--- a/include/linux/platform_data/gpio-davinci.h
+++ b/include/linux/platform_data/gpio-davinci.h
@@ -33,9 +33,6 @@ struct davinci_gpio_controller {
 	/* Serialize access to GPIO registers */
 	spinlock_t		lock;
 	void __iomem		*regs;
-	void __iomem		*set_data;
-	void __iomem		*clr_data;
-	void __iomem		*in_data;
 	int			gpio_unbanked;
 	unsigned		gpio_irq;
 };
-- 
cgit v1.2.3


From 568c5fe5a54f2654f5a4c599c45b8a62ed9a2013 Mon Sep 17 00:00:00 2001
From: Laura Abbott <labbott@redhat.com>
Date: Tue, 10 Jan 2017 13:35:42 -0800
Subject: mm: Introduce lm_alias

Certain architectures may have the kernel image mapped separately to
alias the linear map. Introduce a macro lm_alias to translate a kernel
image symbol into its linear alias. This is used in part with work to
add CONFIG_DEBUG_VIRTUAL support for arm64.

Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Laura Abbott <labbott@redhat.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 include/linux/mm.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index fe6b4036664a..5dc9c4650522 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -76,6 +76,10 @@ extern int mmap_rnd_compat_bits __read_mostly;
 #define page_to_virt(x)	__va(PFN_PHYS(page_to_pfn(x)))
 #endif
 
+#ifndef lm_alias
+#define lm_alias(x)	__va(__pa_symbol(x))
+#endif
+
 /*
  * To prevent common memory management code establishing
  * a zero page mapping on a read fault.
-- 
cgit v1.2.3


From cbbd86075a7e32b6e67f77d374b9bb53841ab1c7 Mon Sep 17 00:00:00 2001
From: Martin Kaiser <martin@kaiser.cx>
Date: Wed, 11 Jan 2017 17:09:50 +0100
Subject: video: fbdev: imxfb: remove the macros for initializing the DMACR
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The current definitions of DMACR_HM() and DMACR_TM() are correct only
for imx1, they're wrong for imx21.

The macros are meant for legacy board files only, they're not applicable
for boards using device tree.

At the moment, there are no boards using these macros. So it should be
safe to drop them rather than making them work for both imx1 and imx21,
which would require a change in the platform data struct.

Signed-off-by: Martin Kaiser <martin@kaiser.cx>
Acked-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
---
 include/linux/platform_data/video-imxfb.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/video-imxfb.h b/include/linux/platform_data/video-imxfb.h
index 18e908324549..a5c0a71ec914 100644
--- a/include/linux/platform_data/video-imxfb.h
+++ b/include/linux/platform_data/video-imxfb.h
@@ -47,10 +47,6 @@
 #define LSCR1_GRAY2(x)            (((x) & 0xf) << 4)
 #define LSCR1_GRAY1(x)            (((x) & 0xf))
 
-#define DMACR_BURST	(1 << 31)
-#define DMACR_HM(x)	(((x) & 0xf) << 16)
-#define DMACR_TM(x)	((x) & 0xf)
-
 struct imx_fb_videomode {
 	struct fb_videomode mode;
 	u32 pcr;
-- 
cgit v1.2.3


From 15df6d98ec3b40775918fc6ef73d7f1c2d0cf870 Mon Sep 17 00:00:00 2001
From: Michal Suchanek <msuchanek@suse.de>
Date: Tue, 10 Jan 2017 18:48:12 +0100
Subject: power: supply: axp20x_usb_power: fix warning on 64bit

Casting of_device_get_match_data return value to int causes warning on 64bit
architectures.

../drivers/power/supply/axp20x_usb_power.c: In function
'axp20x_usb_power_probe':
../drivers/power/supply/axp20x_usb_power.c:297:21: warning: cast from
pointer to integer of different size [-Wpointer-to-int-cast]

Fixes: 0dcc70ca8644 ("power: supply: axp20x_usb_power: use of_device_id
    data field instead of device_is_compatible")
Signed-off-by: Michal Suchanek <msuchanek@suse.de>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/power/supply/axp20x_usb_power.c | 5 +++--
 include/linux/mfd/axp20x.h              | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/power/supply/axp20x_usb_power.c b/drivers/power/supply/axp20x_usb_power.c
index 1bcb02551e02..632a33fe2d54 100644
--- a/drivers/power/supply/axp20x_usb_power.c
+++ b/drivers/power/supply/axp20x_usb_power.c
@@ -48,7 +48,7 @@ struct axp20x_usb_power {
 	struct device_node *np;
 	struct regmap *regmap;
 	struct power_supply *supply;
-	int axp20x_id;
+	enum axp20x_variants axp20x_id;
 };
 
 static irqreturn_t axp20x_usb_power_irq(int irq, void *devid)
@@ -294,7 +294,8 @@ static int axp20x_usb_power_probe(struct platform_device *pdev)
 	if (!power)
 		return -ENOMEM;
 
-	power->axp20x_id = (int)of_device_get_match_data(&pdev->dev);
+	power->axp20x_id = (enum axp20x_variants)of_device_get_match_data(
+								&pdev->dev);
 
 	power->np = pdev->dev.of_node;
 	power->regmap = axp20x->regmap;
diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h
index 812806d6319b..f848ee86a339 100644
--- a/include/linux/mfd/axp20x.h
+++ b/include/linux/mfd/axp20x.h
@@ -13,7 +13,7 @@
 
 #include <linux/regmap.h>
 
-enum {
+enum axp20x_variants {
 	AXP152_ID = 0,
 	AXP202_ID,
 	AXP209_ID,
-- 
cgit v1.2.3


From 818e3012c2eac4885bf7278c5144a14186d742d2 Mon Sep 17 00:00:00 2001
From: Chris Lapa <chris@lapa.com.au>
Date: Wed, 11 Jan 2017 12:44:38 +1100
Subject: power: supply: bq27xxx: rename BQ27500 allow for deprecation in
 future.

The BQ2750X definition exists only to satisfy backwards compatibility.

Signed-off-by: Chris Lapa <chris@lapa.com.au>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/power/supply/bq27xxx_battery.c     | 8 ++++----
 drivers/power/supply/bq27xxx_battery_i2c.c | 2 +-
 include/linux/power/bq27xxx_battery.h      | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c
index 7272d1e024f9..103ba3b1db57 100644
--- a/drivers/power/supply/bq27xxx_battery.c
+++ b/drivers/power/supply/bq27xxx_battery.c
@@ -145,7 +145,7 @@ static u8 bq27xxx_regs[][BQ27XXX_REG_MAX] = {
 		[BQ27XXX_REG_DCAP] = 0x76,
 		[BQ27XXX_REG_AP] = INVALID_REG_ADDR,
 	},
-	[BQ27500] = {
+	[BQ2750X] = {
 		[BQ27XXX_REG_CTRL] = 0x00,
 		[BQ27XXX_REG_TEMP] = 0x06,
 		[BQ27XXX_REG_INT_TEMP] = 0x28,
@@ -303,7 +303,7 @@ static enum power_supply_property bq27010_battery_props[] = {
 	POWER_SUPPLY_PROP_MANUFACTURER,
 };
 
-static enum power_supply_property bq27500_battery_props[] = {
+static enum power_supply_property bq2750x_battery_props[] = {
 	POWER_SUPPLY_PROP_STATUS,
 	POWER_SUPPLY_PROP_PRESENT,
 	POWER_SUPPLY_PROP_VOLTAGE_NOW,
@@ -421,7 +421,7 @@ static struct {
 } bq27xxx_battery_props[] = {
 	BQ27XXX_PROP(BQ27000, bq27000_battery_props),
 	BQ27XXX_PROP(BQ27010, bq27010_battery_props),
-	BQ27XXX_PROP(BQ27500, bq27500_battery_props),
+	BQ27XXX_PROP(BQ2750X, bq2750x_battery_props),
 	BQ27XXX_PROP(BQ27510, bq27510_battery_props),
 	BQ27XXX_PROP(BQ27530, bq27530_battery_props),
 	BQ27XXX_PROP(BQ27541, bq27541_battery_props),
@@ -675,7 +675,7 @@ static int bq27xxx_battery_read_pwr_avg(struct bq27xxx_device_info *di)
 static bool bq27xxx_battery_overtemp(struct bq27xxx_device_info *di, u16 flags)
 {
 	switch (di->chip) {
-	case BQ27500:
+	case BQ2750X:
 	case BQ27510:
 	case BQ27541:
 	case BQ27545:
diff --git a/drivers/power/supply/bq27xxx_battery_i2c.c b/drivers/power/supply/bq27xxx_battery_i2c.c
index 5c5c3a6f9923..fb1219b779b7 100644
--- a/drivers/power/supply/bq27xxx_battery_i2c.c
+++ b/drivers/power/supply/bq27xxx_battery_i2c.c
@@ -148,7 +148,7 @@ static int bq27xxx_battery_i2c_remove(struct i2c_client *client)
 static const struct i2c_device_id bq27xxx_i2c_id_table[] = {
 	{ "bq27200", BQ27000 },
 	{ "bq27210", BQ27010 },
-	{ "bq27500", BQ27500 },
+	{ "bq27500", BQ2750X },
 	{ "bq27510", BQ27510 },
 	{ "bq27520", BQ27510 },
 	{ "bq27530", BQ27530 },
diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h
index bed9557b69e7..4c904d00564c 100644
--- a/include/linux/power/bq27xxx_battery.h
+++ b/include/linux/power/bq27xxx_battery.h
@@ -4,7 +4,7 @@
 enum bq27xxx_chip {
 	BQ27000 = 1, /* bq27000, bq27200 */
 	BQ27010, /* bq27010, bq27210 */
-	BQ27500, /* bq27500 */
+	BQ2750X, /* bq27500 deprecated alias */
 	BQ27510, /* bq27510, bq27520 */
 	BQ27530, /* bq27530, bq27531 */
 	BQ27541, /* bq27541, bq27542, bq27546, bq27742 */
-- 
cgit v1.2.3


From 6da6e4bdd383d8efdf50da6d0933a16ad3a590f6 Mon Sep 17 00:00:00 2001
From: Chris Lapa <chris@lapa.com.au>
Date: Wed, 11 Jan 2017 12:44:39 +1100
Subject: power: supply: bq27xxx: rename BQ27510 allow for deprecation in
 future.

The BQ2751X definition exists only to satisfy backwards compatibility.

Signed-off-by: Chris Lapa <chris@lapa.com.au>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/power/supply/bq27xxx_battery.c     | 8 ++++----
 drivers/power/supply/bq27xxx_battery_i2c.c | 4 ++--
 include/linux/power/bq27xxx_battery.h      | 2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c
index 103ba3b1db57..90a5373b0e95 100644
--- a/drivers/power/supply/bq27xxx_battery.c
+++ b/drivers/power/supply/bq27xxx_battery.c
@@ -164,7 +164,7 @@ static u8 bq27xxx_regs[][BQ27XXX_REG_MAX] = {
 		[BQ27XXX_REG_DCAP] = 0x3c,
 		[BQ27XXX_REG_AP] = INVALID_REG_ADDR,
 	},
-	[BQ27510] = {
+	[BQ2751X] = {
 		[BQ27XXX_REG_CTRL] = 0x00,
 		[BQ27XXX_REG_TEMP] = 0x06,
 		[BQ27XXX_REG_INT_TEMP] = 0x28,
@@ -321,7 +321,7 @@ static enum power_supply_property bq2750x_battery_props[] = {
 	POWER_SUPPLY_PROP_MANUFACTURER,
 };
 
-static enum power_supply_property bq27510_battery_props[] = {
+static enum power_supply_property bq2751x_battery_props[] = {
 	POWER_SUPPLY_PROP_STATUS,
 	POWER_SUPPLY_PROP_PRESENT,
 	POWER_SUPPLY_PROP_VOLTAGE_NOW,
@@ -422,7 +422,7 @@ static struct {
 	BQ27XXX_PROP(BQ27000, bq27000_battery_props),
 	BQ27XXX_PROP(BQ27010, bq27010_battery_props),
 	BQ27XXX_PROP(BQ2750X, bq2750x_battery_props),
-	BQ27XXX_PROP(BQ27510, bq27510_battery_props),
+	BQ27XXX_PROP(BQ2751X, bq2751x_battery_props),
 	BQ27XXX_PROP(BQ27530, bq27530_battery_props),
 	BQ27XXX_PROP(BQ27541, bq27541_battery_props),
 	BQ27XXX_PROP(BQ27545, bq27545_battery_props),
@@ -676,7 +676,7 @@ static bool bq27xxx_battery_overtemp(struct bq27xxx_device_info *di, u16 flags)
 {
 	switch (di->chip) {
 	case BQ2750X:
-	case BQ27510:
+	case BQ2751X:
 	case BQ27541:
 	case BQ27545:
 		return flags & (BQ27XXX_FLAG_OTC | BQ27XXX_FLAG_OTD);
diff --git a/drivers/power/supply/bq27xxx_battery_i2c.c b/drivers/power/supply/bq27xxx_battery_i2c.c
index fb1219b779b7..5f64e707656a 100644
--- a/drivers/power/supply/bq27xxx_battery_i2c.c
+++ b/drivers/power/supply/bq27xxx_battery_i2c.c
@@ -149,8 +149,8 @@ static const struct i2c_device_id bq27xxx_i2c_id_table[] = {
 	{ "bq27200", BQ27000 },
 	{ "bq27210", BQ27010 },
 	{ "bq27500", BQ2750X },
-	{ "bq27510", BQ27510 },
-	{ "bq27520", BQ27510 },
+	{ "bq27510", BQ2751X },
+	{ "bq27520", BQ2751X },
 	{ "bq27530", BQ27530 },
 	{ "bq27531", BQ27530 },
 	{ "bq27541", BQ27541 },
diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h
index 4c904d00564c..651f265837a4 100644
--- a/include/linux/power/bq27xxx_battery.h
+++ b/include/linux/power/bq27xxx_battery.h
@@ -5,7 +5,7 @@ enum bq27xxx_chip {
 	BQ27000 = 1, /* bq27000, bq27200 */
 	BQ27010, /* bq27010, bq27210 */
 	BQ2750X, /* bq27500 deprecated alias */
-	BQ27510, /* bq27510, bq27520 */
+	BQ2751X, /* bq27510, bq27520 deprecated alias */
 	BQ27530, /* bq27530, bq27531 */
 	BQ27541, /* bq27541, bq27542, bq27546, bq27742 */
 	BQ27545, /* bq27545 */
-- 
cgit v1.2.3


From 32833635b0fe9bf71bbf867d1c3abfb5b006bf29 Mon Sep 17 00:00:00 2001
From: Chris Lapa <chris@lapa.com.au>
Date: Wed, 11 Jan 2017 12:44:40 +1100
Subject: power: supply: bq27xxx: adds specific support for bq27500/1 revision.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit adds the BQ27500 chip definition to specifically match the
bq27500/1 functionality as described in the datasheet.

Signed-off-by: Chris Lapa <chris@lapa.com.au>
Acked-by: Pali Rohár <pali.rohar@gmail.com>
Reviewed-by: Andrew F. Davis <afd@ti.com>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/power/supply/bq27xxx_battery.c     | 42 ++++++++++++++++++++++++++++++
 drivers/power/supply/bq27xxx_battery_i2c.c |  2 ++
 include/linux/power/bq27xxx_battery.h      |  1 +
 3 files changed, 45 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c
index 90a5373b0e95..9c9ffe2318a5 100644
--- a/drivers/power/supply/bq27xxx_battery.c
+++ b/drivers/power/supply/bq27xxx_battery.c
@@ -183,6 +183,25 @@ static u8 bq27xxx_regs[][BQ27XXX_REG_MAX] = {
 		[BQ27XXX_REG_DCAP] = 0x2e,
 		[BQ27XXX_REG_AP] = INVALID_REG_ADDR,
 	},
+	[BQ27500] = {
+		[BQ27XXX_REG_CTRL] = 0x00,
+		[BQ27XXX_REG_TEMP] = 0x06,
+		[BQ27XXX_REG_INT_TEMP] = INVALID_REG_ADDR,
+		[BQ27XXX_REG_VOLT] = 0x08,
+		[BQ27XXX_REG_AI] = 0x14,
+		[BQ27XXX_REG_FLAGS] = 0x0a,
+		[BQ27XXX_REG_TTE] = 0x16,
+		[BQ27XXX_REG_TTF] = 0x18,
+		[BQ27XXX_REG_TTES] = 0x1c,
+		[BQ27XXX_REG_TTECP] = 0x26,
+		[BQ27XXX_REG_NAC] = 0x0c,
+		[BQ27XXX_REG_FCC] = 0x12,
+		[BQ27XXX_REG_CYCT] = 0x2a,
+		[BQ27XXX_REG_AE] = 0x22,
+		[BQ27XXX_REG_SOC] = 0x2c,
+		[BQ27XXX_REG_DCAP] = 0x3c,
+		[BQ27XXX_REG_AP] = 0x24,
+	},
 	[BQ27530] = {
 		[BQ27XXX_REG_CTRL] = 0x00,
 		[BQ27XXX_REG_TEMP] = 0x06,
@@ -339,6 +358,27 @@ static enum power_supply_property bq2751x_battery_props[] = {
 	POWER_SUPPLY_PROP_MANUFACTURER,
 };
 
+static enum power_supply_property bq27500_battery_props[] = {
+	POWER_SUPPLY_PROP_STATUS,
+	POWER_SUPPLY_PROP_PRESENT,
+	POWER_SUPPLY_PROP_VOLTAGE_NOW,
+	POWER_SUPPLY_PROP_CURRENT_NOW,
+	POWER_SUPPLY_PROP_CAPACITY,
+	POWER_SUPPLY_PROP_CAPACITY_LEVEL,
+	POWER_SUPPLY_PROP_TEMP,
+	POWER_SUPPLY_PROP_TIME_TO_EMPTY_NOW,
+	POWER_SUPPLY_PROP_TIME_TO_FULL_NOW,
+	POWER_SUPPLY_PROP_TECHNOLOGY,
+	POWER_SUPPLY_PROP_CHARGE_FULL,
+	POWER_SUPPLY_PROP_CHARGE_NOW,
+	POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN,
+	POWER_SUPPLY_PROP_CYCLE_COUNT,
+	POWER_SUPPLY_PROP_ENERGY_NOW,
+	POWER_SUPPLY_PROP_POWER_AVG,
+	POWER_SUPPLY_PROP_HEALTH,
+	POWER_SUPPLY_PROP_MANUFACTURER,
+};
+
 static enum power_supply_property bq27530_battery_props[] = {
 	POWER_SUPPLY_PROP_STATUS,
 	POWER_SUPPLY_PROP_PRESENT,
@@ -423,6 +463,7 @@ static struct {
 	BQ27XXX_PROP(BQ27010, bq27010_battery_props),
 	BQ27XXX_PROP(BQ2750X, bq2750x_battery_props),
 	BQ27XXX_PROP(BQ2751X, bq2751x_battery_props),
+	BQ27XXX_PROP(BQ27500, bq27500_battery_props),
 	BQ27XXX_PROP(BQ27530, bq27530_battery_props),
 	BQ27XXX_PROP(BQ27541, bq27541_battery_props),
 	BQ27XXX_PROP(BQ27545, bq27545_battery_props),
@@ -677,6 +718,7 @@ static bool bq27xxx_battery_overtemp(struct bq27xxx_device_info *di, u16 flags)
 	switch (di->chip) {
 	case BQ2750X:
 	case BQ2751X:
+	case BQ27500:
 	case BQ27541:
 	case BQ27545:
 		return flags & (BQ27XXX_FLAG_OTC | BQ27XXX_FLAG_OTD);
diff --git a/drivers/power/supply/bq27xxx_battery_i2c.c b/drivers/power/supply/bq27xxx_battery_i2c.c
index 5f64e707656a..ce0f01ec2749 100644
--- a/drivers/power/supply/bq27xxx_battery_i2c.c
+++ b/drivers/power/supply/bq27xxx_battery_i2c.c
@@ -151,6 +151,7 @@ static const struct i2c_device_id bq27xxx_i2c_id_table[] = {
 	{ "bq27500", BQ2750X },
 	{ "bq27510", BQ2751X },
 	{ "bq27520", BQ2751X },
+	{ "bq27500-1", BQ27500 },
 	{ "bq27530", BQ27530 },
 	{ "bq27531", BQ27530 },
 	{ "bq27541", BQ27541 },
@@ -173,6 +174,7 @@ static const struct of_device_id bq27xxx_battery_i2c_of_match_table[] = {
 	{ .compatible = "ti,bq27500" },
 	{ .compatible = "ti,bq27510" },
 	{ .compatible = "ti,bq27520" },
+	{ .compatible = "ti,bq27500-1" },
 	{ .compatible = "ti,bq27530" },
 	{ .compatible = "ti,bq27531" },
 	{ .compatible = "ti,bq27541" },
diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h
index 651f265837a4..76b623d7a9a4 100644
--- a/include/linux/power/bq27xxx_battery.h
+++ b/include/linux/power/bq27xxx_battery.h
@@ -6,6 +6,7 @@ enum bq27xxx_chip {
 	BQ27010, /* bq27010, bq27210 */
 	BQ2750X, /* bq27500 deprecated alias */
 	BQ2751X, /* bq27510, bq27520 deprecated alias */
+	BQ27500, /* bq27500/1 */
 	BQ27530, /* bq27530, bq27531 */
 	BQ27541, /* bq27541, bq27542, bq27546, bq27742 */
 	BQ27545, /* bq27545 */
-- 
cgit v1.2.3


From bd28177f3ec8367fbb3c56cfcf1c1a46e3fe240a Mon Sep 17 00:00:00 2001
From: Chris Lapa <chris@lapa.com.au>
Date: Wed, 11 Jan 2017 12:44:41 +1100
Subject: power: supply: bq27xxx: adds specific support for bq27510-g1
 revision.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit adds the BQ27510G1 chip definition to specifically match the
bq27510-G1 functionality as described in the datasheet.

Signed-off-by: Chris Lapa <chris@lapa.com.au>
Acked-by: Pali Rohár <pali.rohar@gmail.com>
Reviewed-by: Andrew F. Davis <afd@ti.com>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/power/supply/bq27xxx_battery.c     | 43 ++++++++++++++++++++++++++++++
 drivers/power/supply/bq27xxx_battery_i2c.c |  2 ++
 include/linux/power/bq27xxx_battery.h      |  1 +
 3 files changed, 46 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c
index 9c9ffe2318a5..44e8aa0f201d 100644
--- a/drivers/power/supply/bq27xxx_battery.c
+++ b/drivers/power/supply/bq27xxx_battery.c
@@ -22,6 +22,7 @@
  * http://www.ti.com/product/bq27010
  * http://www.ti.com/product/bq27210
  * http://www.ti.com/product/bq27500
+ * http://www.ti.com/product/bq27510-g1
  * http://www.ti.com/product/bq27510-g3
  * http://www.ti.com/product/bq27520-g4
  * http://www.ti.com/product/bq27530-g1
@@ -202,6 +203,25 @@ static u8 bq27xxx_regs[][BQ27XXX_REG_MAX] = {
 		[BQ27XXX_REG_DCAP] = 0x3c,
 		[BQ27XXX_REG_AP] = 0x24,
 	},
+	[BQ27510G1] = {
+		[BQ27XXX_REG_CTRL] = 0x00,
+		[BQ27XXX_REG_TEMP] = 0x06,
+		[BQ27XXX_REG_INT_TEMP] = INVALID_REG_ADDR,
+		[BQ27XXX_REG_VOLT] = 0x08,
+		[BQ27XXX_REG_AI] = 0x14,
+		[BQ27XXX_REG_FLAGS] = 0x0a,
+		[BQ27XXX_REG_TTE] = 0x16,
+		[BQ27XXX_REG_TTF] = 0x18,
+		[BQ27XXX_REG_TTES] = 0x1c,
+		[BQ27XXX_REG_TTECP] = 0x26,
+		[BQ27XXX_REG_NAC] = 0x0c,
+		[BQ27XXX_REG_FCC] = 0x12,
+		[BQ27XXX_REG_CYCT] = 0x2a,
+		[BQ27XXX_REG_AE] = 0x22,
+		[BQ27XXX_REG_SOC] = 0x2c,
+		[BQ27XXX_REG_DCAP] = 0x3c,
+		[BQ27XXX_REG_AP] = 0x24,
+	},
 	[BQ27530] = {
 		[BQ27XXX_REG_CTRL] = 0x00,
 		[BQ27XXX_REG_TEMP] = 0x06,
@@ -379,6 +399,27 @@ static enum power_supply_property bq27500_battery_props[] = {
 	POWER_SUPPLY_PROP_MANUFACTURER,
 };
 
+static enum power_supply_property bq27510g1_battery_props[] = {
+	POWER_SUPPLY_PROP_STATUS,
+	POWER_SUPPLY_PROP_PRESENT,
+	POWER_SUPPLY_PROP_VOLTAGE_NOW,
+	POWER_SUPPLY_PROP_CURRENT_NOW,
+	POWER_SUPPLY_PROP_CAPACITY,
+	POWER_SUPPLY_PROP_CAPACITY_LEVEL,
+	POWER_SUPPLY_PROP_TEMP,
+	POWER_SUPPLY_PROP_TIME_TO_EMPTY_NOW,
+	POWER_SUPPLY_PROP_TIME_TO_FULL_NOW,
+	POWER_SUPPLY_PROP_TECHNOLOGY,
+	POWER_SUPPLY_PROP_CHARGE_FULL,
+	POWER_SUPPLY_PROP_CHARGE_NOW,
+	POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN,
+	POWER_SUPPLY_PROP_CYCLE_COUNT,
+	POWER_SUPPLY_PROP_ENERGY_NOW,
+	POWER_SUPPLY_PROP_POWER_AVG,
+	POWER_SUPPLY_PROP_HEALTH,
+	POWER_SUPPLY_PROP_MANUFACTURER,
+};
+
 static enum power_supply_property bq27530_battery_props[] = {
 	POWER_SUPPLY_PROP_STATUS,
 	POWER_SUPPLY_PROP_PRESENT,
@@ -464,6 +505,7 @@ static struct {
 	BQ27XXX_PROP(BQ2750X, bq2750x_battery_props),
 	BQ27XXX_PROP(BQ2751X, bq2751x_battery_props),
 	BQ27XXX_PROP(BQ27500, bq27500_battery_props),
+	BQ27XXX_PROP(BQ27510G1, bq27510g1_battery_props),
 	BQ27XXX_PROP(BQ27530, bq27530_battery_props),
 	BQ27XXX_PROP(BQ27541, bq27541_battery_props),
 	BQ27XXX_PROP(BQ27545, bq27545_battery_props),
@@ -719,6 +761,7 @@ static bool bq27xxx_battery_overtemp(struct bq27xxx_device_info *di, u16 flags)
 	case BQ2750X:
 	case BQ2751X:
 	case BQ27500:
+	case BQ27510G1:
 	case BQ27541:
 	case BQ27545:
 		return flags & (BQ27XXX_FLAG_OTC | BQ27XXX_FLAG_OTD);
diff --git a/drivers/power/supply/bq27xxx_battery_i2c.c b/drivers/power/supply/bq27xxx_battery_i2c.c
index ce0f01ec2749..11e54c10e86f 100644
--- a/drivers/power/supply/bq27xxx_battery_i2c.c
+++ b/drivers/power/supply/bq27xxx_battery_i2c.c
@@ -152,6 +152,7 @@ static const struct i2c_device_id bq27xxx_i2c_id_table[] = {
 	{ "bq27510", BQ2751X },
 	{ "bq27520", BQ2751X },
 	{ "bq27500-1", BQ27500 },
+	{ "bq27510g1", BQ27510G1 },
 	{ "bq27530", BQ27530 },
 	{ "bq27531", BQ27530 },
 	{ "bq27541", BQ27541 },
@@ -175,6 +176,7 @@ static const struct of_device_id bq27xxx_battery_i2c_of_match_table[] = {
 	{ .compatible = "ti,bq27510" },
 	{ .compatible = "ti,bq27520" },
 	{ .compatible = "ti,bq27500-1" },
+	{ .compatible = "ti,bq27510g1" },
 	{ .compatible = "ti,bq27530" },
 	{ .compatible = "ti,bq27531" },
 	{ .compatible = "ti,bq27541" },
diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h
index 76b623d7a9a4..3af0815426c9 100644
--- a/include/linux/power/bq27xxx_battery.h
+++ b/include/linux/power/bq27xxx_battery.h
@@ -7,6 +7,7 @@ enum bq27xxx_chip {
 	BQ2750X, /* bq27500 deprecated alias */
 	BQ2751X, /* bq27510, bq27520 deprecated alias */
 	BQ27500, /* bq27500/1 */
+	BQ27510G1, /* bq27510G1 */
 	BQ27530, /* bq27530, bq27531 */
 	BQ27541, /* bq27541, bq27542, bq27546, bq27742 */
 	BQ27545, /* bq27545 */
-- 
cgit v1.2.3


From 698a2bf5fc31d85d428a2ae495775b61381a495e Mon Sep 17 00:00:00 2001
From: Chris Lapa <chris@lapa.com.au>
Date: Wed, 11 Jan 2017 12:44:42 +1100
Subject: power: supply: bq27xxx: adds specific support for bq27510-g2
 revision.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit adds the BQ27510G2 chip definition to specifically match the
bq27510-G2 functionality as described in the datasheet.

Signed-off-by: Chris Lapa <chris@lapa.com.au>
Acked-by: Pali Rohár <pali.rohar@gmail.com>
Reviewed-by: Andrew F. Davis <afd@ti.com>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/power/supply/bq27xxx_battery.c     | 43 ++++++++++++++++++++++++++++++
 drivers/power/supply/bq27xxx_battery_i2c.c |  2 ++
 include/linux/power/bq27xxx_battery.h      |  1 +
 3 files changed, 46 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c
index 44e8aa0f201d..d378f5600bbc 100644
--- a/drivers/power/supply/bq27xxx_battery.c
+++ b/drivers/power/supply/bq27xxx_battery.c
@@ -23,6 +23,7 @@
  * http://www.ti.com/product/bq27210
  * http://www.ti.com/product/bq27500
  * http://www.ti.com/product/bq27510-g1
+ * http://www.ti.com/product/bq27510-g2
  * http://www.ti.com/product/bq27510-g3
  * http://www.ti.com/product/bq27520-g4
  * http://www.ti.com/product/bq27530-g1
@@ -222,6 +223,25 @@ static u8 bq27xxx_regs[][BQ27XXX_REG_MAX] = {
 		[BQ27XXX_REG_DCAP] = 0x3c,
 		[BQ27XXX_REG_AP] = 0x24,
 	},
+	[BQ27510G2] = {
+		[BQ27XXX_REG_CTRL] = 0x00,
+		[BQ27XXX_REG_TEMP] = 0x06,
+		[BQ27XXX_REG_INT_TEMP] = INVALID_REG_ADDR,
+		[BQ27XXX_REG_VOLT] = 0x08,
+		[BQ27XXX_REG_AI] = 0x14,
+		[BQ27XXX_REG_FLAGS] = 0x0a,
+		[BQ27XXX_REG_TTE] = 0x16,
+		[BQ27XXX_REG_TTF] = 0x18,
+		[BQ27XXX_REG_TTES] = 0x1c,
+		[BQ27XXX_REG_TTECP] = 0x26,
+		[BQ27XXX_REG_NAC] = 0x0c,
+		[BQ27XXX_REG_FCC] = 0x12,
+		[BQ27XXX_REG_CYCT] = 0x2a,
+		[BQ27XXX_REG_AE] = 0x22,
+		[BQ27XXX_REG_SOC] = 0x2c,
+		[BQ27XXX_REG_DCAP] = 0x3c,
+		[BQ27XXX_REG_AP] = 0x24,
+	},
 	[BQ27530] = {
 		[BQ27XXX_REG_CTRL] = 0x00,
 		[BQ27XXX_REG_TEMP] = 0x06,
@@ -420,6 +440,27 @@ static enum power_supply_property bq27510g1_battery_props[] = {
 	POWER_SUPPLY_PROP_MANUFACTURER,
 };
 
+static enum power_supply_property bq27510g2_battery_props[] = {
+	POWER_SUPPLY_PROP_STATUS,
+	POWER_SUPPLY_PROP_PRESENT,
+	POWER_SUPPLY_PROP_VOLTAGE_NOW,
+	POWER_SUPPLY_PROP_CURRENT_NOW,
+	POWER_SUPPLY_PROP_CAPACITY,
+	POWER_SUPPLY_PROP_CAPACITY_LEVEL,
+	POWER_SUPPLY_PROP_TEMP,
+	POWER_SUPPLY_PROP_TIME_TO_EMPTY_NOW,
+	POWER_SUPPLY_PROP_TIME_TO_FULL_NOW,
+	POWER_SUPPLY_PROP_TECHNOLOGY,
+	POWER_SUPPLY_PROP_CHARGE_FULL,
+	POWER_SUPPLY_PROP_CHARGE_NOW,
+	POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN,
+	POWER_SUPPLY_PROP_CYCLE_COUNT,
+	POWER_SUPPLY_PROP_ENERGY_NOW,
+	POWER_SUPPLY_PROP_POWER_AVG,
+	POWER_SUPPLY_PROP_HEALTH,
+	POWER_SUPPLY_PROP_MANUFACTURER,
+};
+
 static enum power_supply_property bq27530_battery_props[] = {
 	POWER_SUPPLY_PROP_STATUS,
 	POWER_SUPPLY_PROP_PRESENT,
@@ -506,6 +547,7 @@ static struct {
 	BQ27XXX_PROP(BQ2751X, bq2751x_battery_props),
 	BQ27XXX_PROP(BQ27500, bq27500_battery_props),
 	BQ27XXX_PROP(BQ27510G1, bq27510g1_battery_props),
+	BQ27XXX_PROP(BQ27510G2, bq27510g2_battery_props),
 	BQ27XXX_PROP(BQ27530, bq27530_battery_props),
 	BQ27XXX_PROP(BQ27541, bq27541_battery_props),
 	BQ27XXX_PROP(BQ27545, bq27545_battery_props),
@@ -762,6 +804,7 @@ static bool bq27xxx_battery_overtemp(struct bq27xxx_device_info *di, u16 flags)
 	case BQ2751X:
 	case BQ27500:
 	case BQ27510G1:
+	case BQ27510G2:
 	case BQ27541:
 	case BQ27545:
 		return flags & (BQ27XXX_FLAG_OTC | BQ27XXX_FLAG_OTD);
diff --git a/drivers/power/supply/bq27xxx_battery_i2c.c b/drivers/power/supply/bq27xxx_battery_i2c.c
index 11e54c10e86f..1a7919d81ae8 100644
--- a/drivers/power/supply/bq27xxx_battery_i2c.c
+++ b/drivers/power/supply/bq27xxx_battery_i2c.c
@@ -153,6 +153,7 @@ static const struct i2c_device_id bq27xxx_i2c_id_table[] = {
 	{ "bq27520", BQ2751X },
 	{ "bq27500-1", BQ27500 },
 	{ "bq27510g1", BQ27510G1 },
+	{ "bq27510g2", BQ27510G2 },
 	{ "bq27530", BQ27530 },
 	{ "bq27531", BQ27530 },
 	{ "bq27541", BQ27541 },
@@ -177,6 +178,7 @@ static const struct of_device_id bq27xxx_battery_i2c_of_match_table[] = {
 	{ .compatible = "ti,bq27520" },
 	{ .compatible = "ti,bq27500-1" },
 	{ .compatible = "ti,bq27510g1" },
+	{ .compatible = "ti,bq27510g2" },
 	{ .compatible = "ti,bq27530" },
 	{ .compatible = "ti,bq27531" },
 	{ .compatible = "ti,bq27541" },
diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h
index 3af0815426c9..79772ca231b1 100644
--- a/include/linux/power/bq27xxx_battery.h
+++ b/include/linux/power/bq27xxx_battery.h
@@ -8,6 +8,7 @@ enum bq27xxx_chip {
 	BQ2751X, /* bq27510, bq27520 deprecated alias */
 	BQ27500, /* bq27500/1 */
 	BQ27510G1, /* bq27510G1 */
+	BQ27510G2, /* bq27510G2 */
 	BQ27530, /* bq27530, bq27531 */
 	BQ27541, /* bq27541, bq27542, bq27546, bq27742 */
 	BQ27545, /* bq27545 */
-- 
cgit v1.2.3


From 71375aa7d6a7392d4968f6a562b437cb4958f956 Mon Sep 17 00:00:00 2001
From: Chris Lapa <chris@lapa.com.au>
Date: Wed, 11 Jan 2017 12:44:43 +1100
Subject: power: supply: bq27xxx: adds specific support for bq27510-g3
 revision.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit adds the BQ27510G3 chip definition to specifically match the
bq27510-G3 functionality as described in the datasheet.

Signed-off-by: Chris Lapa <chris@lapa.com.au>
Acked-by: Pali Rohár <pali.rohar@gmail.com>
Reviewed-by: Andrew F. Davis <afd@ti.com>
Tested-by: Chris Lapa <chris@lapa.com.au>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/power/supply/bq27xxx_battery.c     | 39 ++++++++++++++++++++++++++++++
 drivers/power/supply/bq27xxx_battery_i2c.c |  2 ++
 include/linux/power/bq27xxx_battery.h      |  1 +
 3 files changed, 42 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c
index d378f5600bbc..562055df978c 100644
--- a/drivers/power/supply/bq27xxx_battery.c
+++ b/drivers/power/supply/bq27xxx_battery.c
@@ -242,6 +242,25 @@ static u8 bq27xxx_regs[][BQ27XXX_REG_MAX] = {
 		[BQ27XXX_REG_DCAP] = 0x3c,
 		[BQ27XXX_REG_AP] = 0x24,
 	},
+	[BQ27510G3] = {
+		[BQ27XXX_REG_CTRL] = 0x00,
+		[BQ27XXX_REG_TEMP] = 0x06,
+		[BQ27XXX_REG_INT_TEMP] = 0x28,
+		[BQ27XXX_REG_VOLT] = 0x08,
+		[BQ27XXX_REG_AI] = 0x14,
+		[BQ27XXX_REG_FLAGS] = 0x0a,
+		[BQ27XXX_REG_TTE] = 0x16,
+		[BQ27XXX_REG_TTF] = INVALID_REG_ADDR,
+		[BQ27XXX_REG_TTES] = 0x1a,
+		[BQ27XXX_REG_TTECP] = INVALID_REG_ADDR,
+		[BQ27XXX_REG_NAC] = 0x0c,
+		[BQ27XXX_REG_FCC] = 0x12,
+		[BQ27XXX_REG_CYCT] = 0x1e,
+		[BQ27XXX_REG_AE] = INVALID_REG_ADDR,
+		[BQ27XXX_REG_SOC] = 0x20,
+		[BQ27XXX_REG_DCAP] = 0x2e,
+		[BQ27XXX_REG_AP] = INVALID_REG_ADDR,
+	},
 	[BQ27530] = {
 		[BQ27XXX_REG_CTRL] = 0x00,
 		[BQ27XXX_REG_TEMP] = 0x06,
@@ -461,6 +480,24 @@ static enum power_supply_property bq27510g2_battery_props[] = {
 	POWER_SUPPLY_PROP_MANUFACTURER,
 };
 
+static enum power_supply_property bq27510g3_battery_props[] = {
+	POWER_SUPPLY_PROP_STATUS,
+	POWER_SUPPLY_PROP_PRESENT,
+	POWER_SUPPLY_PROP_VOLTAGE_NOW,
+	POWER_SUPPLY_PROP_CURRENT_NOW,
+	POWER_SUPPLY_PROP_CAPACITY,
+	POWER_SUPPLY_PROP_CAPACITY_LEVEL,
+	POWER_SUPPLY_PROP_TEMP,
+	POWER_SUPPLY_PROP_TIME_TO_EMPTY_NOW,
+	POWER_SUPPLY_PROP_TECHNOLOGY,
+	POWER_SUPPLY_PROP_CHARGE_FULL,
+	POWER_SUPPLY_PROP_CHARGE_NOW,
+	POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN,
+	POWER_SUPPLY_PROP_CYCLE_COUNT,
+	POWER_SUPPLY_PROP_HEALTH,
+	POWER_SUPPLY_PROP_MANUFACTURER,
+};
+
 static enum power_supply_property bq27530_battery_props[] = {
 	POWER_SUPPLY_PROP_STATUS,
 	POWER_SUPPLY_PROP_PRESENT,
@@ -548,6 +585,7 @@ static struct {
 	BQ27XXX_PROP(BQ27500, bq27500_battery_props),
 	BQ27XXX_PROP(BQ27510G1, bq27510g1_battery_props),
 	BQ27XXX_PROP(BQ27510G2, bq27510g2_battery_props),
+	BQ27XXX_PROP(BQ27510G3, bq27510g3_battery_props),
 	BQ27XXX_PROP(BQ27530, bq27530_battery_props),
 	BQ27XXX_PROP(BQ27541, bq27541_battery_props),
 	BQ27XXX_PROP(BQ27545, bq27545_battery_props),
@@ -805,6 +843,7 @@ static bool bq27xxx_battery_overtemp(struct bq27xxx_device_info *di, u16 flags)
 	case BQ27500:
 	case BQ27510G1:
 	case BQ27510G2:
+	case BQ27510G3:
 	case BQ27541:
 	case BQ27545:
 		return flags & (BQ27XXX_FLAG_OTC | BQ27XXX_FLAG_OTD);
diff --git a/drivers/power/supply/bq27xxx_battery_i2c.c b/drivers/power/supply/bq27xxx_battery_i2c.c
index 1a7919d81ae8..289592ac0cbb 100644
--- a/drivers/power/supply/bq27xxx_battery_i2c.c
+++ b/drivers/power/supply/bq27xxx_battery_i2c.c
@@ -154,6 +154,7 @@ static const struct i2c_device_id bq27xxx_i2c_id_table[] = {
 	{ "bq27500-1", BQ27500 },
 	{ "bq27510g1", BQ27510G1 },
 	{ "bq27510g2", BQ27510G2 },
+	{ "bq27510g3", BQ27510G3 },
 	{ "bq27530", BQ27530 },
 	{ "bq27531", BQ27530 },
 	{ "bq27541", BQ27541 },
@@ -179,6 +180,7 @@ static const struct of_device_id bq27xxx_battery_i2c_of_match_table[] = {
 	{ .compatible = "ti,bq27500-1" },
 	{ .compatible = "ti,bq27510g1" },
 	{ .compatible = "ti,bq27510g2" },
+	{ .compatible = "ti,bq27510g3" },
 	{ .compatible = "ti,bq27530" },
 	{ .compatible = "ti,bq27531" },
 	{ .compatible = "ti,bq27541" },
diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h
index 79772ca231b1..c45ce71a6158 100644
--- a/include/linux/power/bq27xxx_battery.h
+++ b/include/linux/power/bq27xxx_battery.h
@@ -9,6 +9,7 @@ enum bq27xxx_chip {
 	BQ27500, /* bq27500/1 */
 	BQ27510G1, /* bq27510G1 */
 	BQ27510G2, /* bq27510G2 */
+	BQ27510G3, /* bq27510G3 */
 	BQ27530, /* bq27530, bq27531 */
 	BQ27541, /* bq27541, bq27542, bq27546, bq27742 */
 	BQ27545, /* bq27545 */
-- 
cgit v1.2.3


From 68f2a813eb25bd0ef72453aeef9b1ce156157d14 Mon Sep 17 00:00:00 2001
From: Chris Lapa <chris@lapa.com.au>
Date: Wed, 11 Jan 2017 12:44:44 +1100
Subject: power: supply: bq27xxx: adds specific support for bq27520-g1
 revision.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit adds the BQ27520G1 chip definition to specifically match the
bq27520-G1 functionality as described in the datasheet.

Signed-off-by: Chris Lapa <chris@lapa.com.au>
Acked-by: Pali Rohár <pali.rohar@gmail.com>
Reviewed-by: Andrew F. Davis <afd@ti.com>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/power/supply/bq27xxx_battery.c     | 42 ++++++++++++++++++++++++++++++
 drivers/power/supply/bq27xxx_battery_i2c.c |  2 ++
 include/linux/power/bq27xxx_battery.h      |  1 +
 3 files changed, 45 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c
index 562055df978c..c5e1bf0c14b0 100644
--- a/drivers/power/supply/bq27xxx_battery.c
+++ b/drivers/power/supply/bq27xxx_battery.c
@@ -26,6 +26,7 @@
  * http://www.ti.com/product/bq27510-g2
  * http://www.ti.com/product/bq27510-g3
  * http://www.ti.com/product/bq27520-g4
+ * http://www.ti.com/product/bq27520-g1
  * http://www.ti.com/product/bq27530-g1
  * http://www.ti.com/product/bq27531-g1
  * http://www.ti.com/product/bq27541-g1
@@ -261,6 +262,25 @@ static u8 bq27xxx_regs[][BQ27XXX_REG_MAX] = {
 		[BQ27XXX_REG_DCAP] = 0x2e,
 		[BQ27XXX_REG_AP] = INVALID_REG_ADDR,
 	},
+	[BQ27520G1] = {
+		[BQ27XXX_REG_CTRL] = 0x00,
+		[BQ27XXX_REG_TEMP] = 0x06,
+		[BQ27XXX_REG_INT_TEMP] = INVALID_REG_ADDR,
+		[BQ27XXX_REG_VOLT] = 0x08,
+		[BQ27XXX_REG_AI] = 0x14,
+		[BQ27XXX_REG_FLAGS] = 0x0a,
+		[BQ27XXX_REG_TTE] = 0x16,
+		[BQ27XXX_REG_TTF] = 0x18,
+		[BQ27XXX_REG_TTES] = 0x1c,
+		[BQ27XXX_REG_TTECP] = 0x26,
+		[BQ27XXX_REG_NAC] = 0x0c,
+		[BQ27XXX_REG_FCC] = 0x12,
+		[BQ27XXX_REG_CYCT] = INVALID_REG_ADDR,
+		[BQ27XXX_REG_AE] = 0x22,
+		[BQ27XXX_REG_SOC] = 0x2c,
+		[BQ27XXX_REG_DCAP] = 0x3c,
+		[BQ27XXX_REG_AP] = 0x24,
+	},
 	[BQ27530] = {
 		[BQ27XXX_REG_CTRL] = 0x00,
 		[BQ27XXX_REG_TEMP] = 0x06,
@@ -498,6 +518,26 @@ static enum power_supply_property bq27510g3_battery_props[] = {
 	POWER_SUPPLY_PROP_MANUFACTURER,
 };
 
+static enum power_supply_property bq27520g1_battery_props[] = {
+	POWER_SUPPLY_PROP_STATUS,
+	POWER_SUPPLY_PROP_PRESENT,
+	POWER_SUPPLY_PROP_VOLTAGE_NOW,
+	POWER_SUPPLY_PROP_CURRENT_NOW,
+	POWER_SUPPLY_PROP_CAPACITY,
+	POWER_SUPPLY_PROP_CAPACITY_LEVEL,
+	POWER_SUPPLY_PROP_TEMP,
+	POWER_SUPPLY_PROP_TIME_TO_EMPTY_NOW,
+	POWER_SUPPLY_PROP_TIME_TO_FULL_NOW,
+	POWER_SUPPLY_PROP_TECHNOLOGY,
+	POWER_SUPPLY_PROP_CHARGE_FULL,
+	POWER_SUPPLY_PROP_CHARGE_NOW,
+	POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN,
+	POWER_SUPPLY_PROP_ENERGY_NOW,
+	POWER_SUPPLY_PROP_POWER_AVG,
+	POWER_SUPPLY_PROP_HEALTH,
+	POWER_SUPPLY_PROP_MANUFACTURER,
+};
+
 static enum power_supply_property bq27530_battery_props[] = {
 	POWER_SUPPLY_PROP_STATUS,
 	POWER_SUPPLY_PROP_PRESENT,
@@ -586,6 +626,7 @@ static struct {
 	BQ27XXX_PROP(BQ27510G1, bq27510g1_battery_props),
 	BQ27XXX_PROP(BQ27510G2, bq27510g2_battery_props),
 	BQ27XXX_PROP(BQ27510G3, bq27510g3_battery_props),
+	BQ27XXX_PROP(BQ27520G1, bq27520g1_battery_props),
 	BQ27XXX_PROP(BQ27530, bq27530_battery_props),
 	BQ27XXX_PROP(BQ27541, bq27541_battery_props),
 	BQ27XXX_PROP(BQ27545, bq27545_battery_props),
@@ -844,6 +885,7 @@ static bool bq27xxx_battery_overtemp(struct bq27xxx_device_info *di, u16 flags)
 	case BQ27510G1:
 	case BQ27510G2:
 	case BQ27510G3:
+	case BQ27520G1:
 	case BQ27541:
 	case BQ27545:
 		return flags & (BQ27XXX_FLAG_OTC | BQ27XXX_FLAG_OTD);
diff --git a/drivers/power/supply/bq27xxx_battery_i2c.c b/drivers/power/supply/bq27xxx_battery_i2c.c
index 289592ac0cbb..e398edebbe4d 100644
--- a/drivers/power/supply/bq27xxx_battery_i2c.c
+++ b/drivers/power/supply/bq27xxx_battery_i2c.c
@@ -155,6 +155,7 @@ static const struct i2c_device_id bq27xxx_i2c_id_table[] = {
 	{ "bq27510g1", BQ27510G1 },
 	{ "bq27510g2", BQ27510G2 },
 	{ "bq27510g3", BQ27510G3 },
+	{ "bq27520g1", BQ27520G1 },
 	{ "bq27530", BQ27530 },
 	{ "bq27531", BQ27530 },
 	{ "bq27541", BQ27541 },
@@ -181,6 +182,7 @@ static const struct of_device_id bq27xxx_battery_i2c_of_match_table[] = {
 	{ .compatible = "ti,bq27510g1" },
 	{ .compatible = "ti,bq27510g2" },
 	{ .compatible = "ti,bq27510g3" },
+	{ .compatible = "ti,bq27520g1" },
 	{ .compatible = "ti,bq27530" },
 	{ .compatible = "ti,bq27531" },
 	{ .compatible = "ti,bq27541" },
diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h
index c45ce71a6158..eddd96936875 100644
--- a/include/linux/power/bq27xxx_battery.h
+++ b/include/linux/power/bq27xxx_battery.h
@@ -10,6 +10,7 @@ enum bq27xxx_chip {
 	BQ27510G1, /* bq27510G1 */
 	BQ27510G2, /* bq27510G2 */
 	BQ27510G3, /* bq27510G3 */
+	BQ27520G1, /* bq27520G1 */
 	BQ27530, /* bq27530, bq27531 */
 	BQ27541, /* bq27541, bq27542, bq27546, bq27742 */
 	BQ27545, /* bq27545 */
-- 
cgit v1.2.3


From a5deb9a93040a4a221ef8a67c88ecd72cd1f3625 Mon Sep 17 00:00:00 2001
From: Chris Lapa <chris@lapa.com.au>
Date: Wed, 11 Jan 2017 12:44:45 +1100
Subject: power: supply: bq27xxx: adds specific support for bq27520-g2
 revision.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit adds the BQ27520G2 chip definition to specifically match the
bq27520-G2 functionality as described in the datasheet.

Signed-off-by: Chris Lapa <chris@lapa.com.au>
Acked-by: Pali Rohár <pali.rohar@gmail.com>
Reviewed-by: Andrew F. Davis <afd@ti.com>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/power/supply/bq27xxx_battery.c     | 43 ++++++++++++++++++++++++++++++
 drivers/power/supply/bq27xxx_battery_i2c.c |  2 ++
 include/linux/power/bq27xxx_battery.h      |  1 +
 3 files changed, 46 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c
index c5e1bf0c14b0..cd483ec61648 100644
--- a/drivers/power/supply/bq27xxx_battery.c
+++ b/drivers/power/supply/bq27xxx_battery.c
@@ -27,6 +27,7 @@
  * http://www.ti.com/product/bq27510-g3
  * http://www.ti.com/product/bq27520-g4
  * http://www.ti.com/product/bq27520-g1
+ * http://www.ti.com/product/bq27520-g2
  * http://www.ti.com/product/bq27530-g1
  * http://www.ti.com/product/bq27531-g1
  * http://www.ti.com/product/bq27541-g1
@@ -281,6 +282,25 @@ static u8 bq27xxx_regs[][BQ27XXX_REG_MAX] = {
 		[BQ27XXX_REG_DCAP] = 0x3c,
 		[BQ27XXX_REG_AP] = 0x24,
 	},
+	[BQ27520G2] = {
+		[BQ27XXX_REG_CTRL] = 0x00,
+		[BQ27XXX_REG_TEMP] = 0x06,
+		[BQ27XXX_REG_INT_TEMP] = 0x36,
+		[BQ27XXX_REG_VOLT] = 0x08,
+		[BQ27XXX_REG_AI] = 0x14,
+		[BQ27XXX_REG_FLAGS] = 0x0a,
+		[BQ27XXX_REG_TTE] = 0x16,
+		[BQ27XXX_REG_TTF] = 0x18,
+		[BQ27XXX_REG_TTES] = 0x1c,
+		[BQ27XXX_REG_TTECP] = 0x26,
+		[BQ27XXX_REG_NAC] = 0x0c,
+		[BQ27XXX_REG_FCC] = 0x12,
+		[BQ27XXX_REG_CYCT] = 0x2a,
+		[BQ27XXX_REG_AE] = 0x22,
+		[BQ27XXX_REG_SOC] = 0x2c,
+		[BQ27XXX_REG_DCAP] = 0x3c,
+		[BQ27XXX_REG_AP] = 0x24,
+	},
 	[BQ27530] = {
 		[BQ27XXX_REG_CTRL] = 0x00,
 		[BQ27XXX_REG_TEMP] = 0x06,
@@ -538,6 +558,27 @@ static enum power_supply_property bq27520g1_battery_props[] = {
 	POWER_SUPPLY_PROP_MANUFACTURER,
 };
 
+static enum power_supply_property bq27520g2_battery_props[] = {
+	POWER_SUPPLY_PROP_STATUS,
+	POWER_SUPPLY_PROP_PRESENT,
+	POWER_SUPPLY_PROP_VOLTAGE_NOW,
+	POWER_SUPPLY_PROP_CURRENT_NOW,
+	POWER_SUPPLY_PROP_CAPACITY,
+	POWER_SUPPLY_PROP_CAPACITY_LEVEL,
+	POWER_SUPPLY_PROP_TEMP,
+	POWER_SUPPLY_PROP_TIME_TO_EMPTY_NOW,
+	POWER_SUPPLY_PROP_TIME_TO_FULL_NOW,
+	POWER_SUPPLY_PROP_TECHNOLOGY,
+	POWER_SUPPLY_PROP_CHARGE_FULL,
+	POWER_SUPPLY_PROP_CHARGE_NOW,
+	POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN,
+	POWER_SUPPLY_PROP_CYCLE_COUNT,
+	POWER_SUPPLY_PROP_ENERGY_NOW,
+	POWER_SUPPLY_PROP_POWER_AVG,
+	POWER_SUPPLY_PROP_HEALTH,
+	POWER_SUPPLY_PROP_MANUFACTURER,
+};
+
 static enum power_supply_property bq27530_battery_props[] = {
 	POWER_SUPPLY_PROP_STATUS,
 	POWER_SUPPLY_PROP_PRESENT,
@@ -627,6 +668,7 @@ static struct {
 	BQ27XXX_PROP(BQ27510G2, bq27510g2_battery_props),
 	BQ27XXX_PROP(BQ27510G3, bq27510g3_battery_props),
 	BQ27XXX_PROP(BQ27520G1, bq27520g1_battery_props),
+	BQ27XXX_PROP(BQ27520G2, bq27520g2_battery_props),
 	BQ27XXX_PROP(BQ27530, bq27530_battery_props),
 	BQ27XXX_PROP(BQ27541, bq27541_battery_props),
 	BQ27XXX_PROP(BQ27545, bq27545_battery_props),
@@ -886,6 +928,7 @@ static bool bq27xxx_battery_overtemp(struct bq27xxx_device_info *di, u16 flags)
 	case BQ27510G2:
 	case BQ27510G3:
 	case BQ27520G1:
+	case BQ27520G2:
 	case BQ27541:
 	case BQ27545:
 		return flags & (BQ27XXX_FLAG_OTC | BQ27XXX_FLAG_OTD);
diff --git a/drivers/power/supply/bq27xxx_battery_i2c.c b/drivers/power/supply/bq27xxx_battery_i2c.c
index e398edebbe4d..b2898994ab85 100644
--- a/drivers/power/supply/bq27xxx_battery_i2c.c
+++ b/drivers/power/supply/bq27xxx_battery_i2c.c
@@ -156,6 +156,7 @@ static const struct i2c_device_id bq27xxx_i2c_id_table[] = {
 	{ "bq27510g2", BQ27510G2 },
 	{ "bq27510g3", BQ27510G3 },
 	{ "bq27520g1", BQ27520G1 },
+	{ "bq27520g2", BQ27520G2 },
 	{ "bq27530", BQ27530 },
 	{ "bq27531", BQ27530 },
 	{ "bq27541", BQ27541 },
@@ -183,6 +184,7 @@ static const struct of_device_id bq27xxx_battery_i2c_of_match_table[] = {
 	{ .compatible = "ti,bq27510g2" },
 	{ .compatible = "ti,bq27510g3" },
 	{ .compatible = "ti,bq27520g1" },
+	{ .compatible = "ti,bq27520g2" },
 	{ .compatible = "ti,bq27530" },
 	{ .compatible = "ti,bq27531" },
 	{ .compatible = "ti,bq27541" },
diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h
index eddd96936875..a3fc34a4ef72 100644
--- a/include/linux/power/bq27xxx_battery.h
+++ b/include/linux/power/bq27xxx_battery.h
@@ -11,6 +11,7 @@ enum bq27xxx_chip {
 	BQ27510G2, /* bq27510G2 */
 	BQ27510G3, /* bq27510G3 */
 	BQ27520G1, /* bq27520G1 */
+	BQ27520G2, /* bq27520G2 */
 	BQ27530, /* bq27530, bq27531 */
 	BQ27541, /* bq27541, bq27542, bq27546, bq27742 */
 	BQ27545, /* bq27545 */
-- 
cgit v1.2.3


From 825e915ba2e811b91a034ac6290cd172387c5447 Mon Sep 17 00:00:00 2001
From: Chris Lapa <chris@lapa.com.au>
Date: Wed, 11 Jan 2017 12:44:46 +1100
Subject: power: supply: bq27xxx: adds specific support for bq27520-g3
 revision.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit adds the BQ27520G3 chip definition to specifically match the
bq27520-G3 functionality as described in the datasheet.

Signed-off-by: Chris Lapa <chris@lapa.com.au>
Acked-by: Pali Rohár <pali.rohar@gmail.com>
Reviewed-by: Andrew F. Davis <afd@ti.com>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/power/supply/bq27xxx_battery.c     | 42 ++++++++++++++++++++++++++++++
 drivers/power/supply/bq27xxx_battery_i2c.c |  2 ++
 include/linux/power/bq27xxx_battery.h      |  1 +
 3 files changed, 45 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c
index cd483ec61648..1fe48ea4078f 100644
--- a/drivers/power/supply/bq27xxx_battery.c
+++ b/drivers/power/supply/bq27xxx_battery.c
@@ -28,6 +28,7 @@
  * http://www.ti.com/product/bq27520-g4
  * http://www.ti.com/product/bq27520-g1
  * http://www.ti.com/product/bq27520-g2
+ * http://www.ti.com/product/bq27520-g3
  * http://www.ti.com/product/bq27530-g1
  * http://www.ti.com/product/bq27531-g1
  * http://www.ti.com/product/bq27541-g1
@@ -301,6 +302,25 @@ static u8 bq27xxx_regs[][BQ27XXX_REG_MAX] = {
 		[BQ27XXX_REG_DCAP] = 0x3c,
 		[BQ27XXX_REG_AP] = 0x24,
 	},
+	[BQ27520G3] = {
+		[BQ27XXX_REG_CTRL] = 0x00,
+		[BQ27XXX_REG_TEMP] = 0x06,
+		[BQ27XXX_REG_INT_TEMP] = 0x36,
+		[BQ27XXX_REG_VOLT] = 0x08,
+		[BQ27XXX_REG_AI] = 0x14,
+		[BQ27XXX_REG_FLAGS] = 0x0a,
+		[BQ27XXX_REG_TTE] = 0x16,
+		[BQ27XXX_REG_TTF] = INVALID_REG_ADDR,
+		[BQ27XXX_REG_TTES] = 0x1c,
+		[BQ27XXX_REG_TTECP] = 0x26,
+		[BQ27XXX_REG_NAC] = 0x0c,
+		[BQ27XXX_REG_FCC] = 0x12,
+		[BQ27XXX_REG_CYCT] = 0x2a,
+		[BQ27XXX_REG_AE] = 0x22,
+		[BQ27XXX_REG_SOC] = 0x2c,
+		[BQ27XXX_REG_DCAP] = 0x3c,
+		[BQ27XXX_REG_AP] = 0x24,
+	},
 	[BQ27530] = {
 		[BQ27XXX_REG_CTRL] = 0x00,
 		[BQ27XXX_REG_TEMP] = 0x06,
@@ -579,6 +599,26 @@ static enum power_supply_property bq27520g2_battery_props[] = {
 	POWER_SUPPLY_PROP_MANUFACTURER,
 };
 
+static enum power_supply_property bq27520g3_battery_props[] = {
+	POWER_SUPPLY_PROP_STATUS,
+	POWER_SUPPLY_PROP_PRESENT,
+	POWER_SUPPLY_PROP_VOLTAGE_NOW,
+	POWER_SUPPLY_PROP_CURRENT_NOW,
+	POWER_SUPPLY_PROP_CAPACITY,
+	POWER_SUPPLY_PROP_CAPACITY_LEVEL,
+	POWER_SUPPLY_PROP_TEMP,
+	POWER_SUPPLY_PROP_TIME_TO_EMPTY_NOW,
+	POWER_SUPPLY_PROP_TECHNOLOGY,
+	POWER_SUPPLY_PROP_CHARGE_FULL,
+	POWER_SUPPLY_PROP_CHARGE_NOW,
+	POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN,
+	POWER_SUPPLY_PROP_CYCLE_COUNT,
+	POWER_SUPPLY_PROP_ENERGY_NOW,
+	POWER_SUPPLY_PROP_POWER_AVG,
+	POWER_SUPPLY_PROP_HEALTH,
+	POWER_SUPPLY_PROP_MANUFACTURER,
+};
+
 static enum power_supply_property bq27530_battery_props[] = {
 	POWER_SUPPLY_PROP_STATUS,
 	POWER_SUPPLY_PROP_PRESENT,
@@ -669,6 +709,7 @@ static struct {
 	BQ27XXX_PROP(BQ27510G3, bq27510g3_battery_props),
 	BQ27XXX_PROP(BQ27520G1, bq27520g1_battery_props),
 	BQ27XXX_PROP(BQ27520G2, bq27520g2_battery_props),
+	BQ27XXX_PROP(BQ27520G3, bq27520g3_battery_props),
 	BQ27XXX_PROP(BQ27530, bq27530_battery_props),
 	BQ27XXX_PROP(BQ27541, bq27541_battery_props),
 	BQ27XXX_PROP(BQ27545, bq27545_battery_props),
@@ -929,6 +970,7 @@ static bool bq27xxx_battery_overtemp(struct bq27xxx_device_info *di, u16 flags)
 	case BQ27510G3:
 	case BQ27520G1:
 	case BQ27520G2:
+	case BQ27520G3:
 	case BQ27541:
 	case BQ27545:
 		return flags & (BQ27XXX_FLAG_OTC | BQ27XXX_FLAG_OTD);
diff --git a/drivers/power/supply/bq27xxx_battery_i2c.c b/drivers/power/supply/bq27xxx_battery_i2c.c
index b2898994ab85..3712cd902023 100644
--- a/drivers/power/supply/bq27xxx_battery_i2c.c
+++ b/drivers/power/supply/bq27xxx_battery_i2c.c
@@ -157,6 +157,7 @@ static const struct i2c_device_id bq27xxx_i2c_id_table[] = {
 	{ "bq27510g3", BQ27510G3 },
 	{ "bq27520g1", BQ27520G1 },
 	{ "bq27520g2", BQ27520G2 },
+	{ "bq27520g3", BQ27520G3 },
 	{ "bq27530", BQ27530 },
 	{ "bq27531", BQ27530 },
 	{ "bq27541", BQ27541 },
@@ -185,6 +186,7 @@ static const struct of_device_id bq27xxx_battery_i2c_of_match_table[] = {
 	{ .compatible = "ti,bq27510g3" },
 	{ .compatible = "ti,bq27520g1" },
 	{ .compatible = "ti,bq27520g2" },
+	{ .compatible = "ti,bq27520g3" },
 	{ .compatible = "ti,bq27530" },
 	{ .compatible = "ti,bq27531" },
 	{ .compatible = "ti,bq27541" },
diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h
index a3fc34a4ef72..5c12717d0f75 100644
--- a/include/linux/power/bq27xxx_battery.h
+++ b/include/linux/power/bq27xxx_battery.h
@@ -12,6 +12,7 @@ enum bq27xxx_chip {
 	BQ27510G3, /* bq27510G3 */
 	BQ27520G1, /* bq27520G1 */
 	BQ27520G2, /* bq27520G2 */
+	BQ27520G3, /* bq27520G3 */
 	BQ27530, /* bq27530, bq27531 */
 	BQ27541, /* bq27541, bq27542, bq27546, bq27742 */
 	BQ27545, /* bq27545 */
-- 
cgit v1.2.3


From 8835cae5f2abd7f7a3143afe357f416aff5517a4 Mon Sep 17 00:00:00 2001
From: Chris Lapa <chris@lapa.com.au>
Date: Wed, 11 Jan 2017 12:44:47 +1100
Subject: power: supply: bq27xxx: adds specific support for bq27520-g4
 revision.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit adds the BQ27520G4 chip definition to specifically match the
bq27520-G4 functionality as described in the datasheet.

Signed-off-by: Chris Lapa <chris@lapa.com.au>
Acked-by: Pali Rohár <pali.rohar@gmail.com>
Reviewed-by: Andrew F. Davis <afd@ti.com>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/power/supply/bq27xxx_battery.c     | 39 ++++++++++++++++++++++++++++++
 drivers/power/supply/bq27xxx_battery_i2c.c |  2 ++
 include/linux/power/bq27xxx_battery.h      |  1 +
 3 files changed, 42 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c
index 1fe48ea4078f..398801a21b86 100644
--- a/drivers/power/supply/bq27xxx_battery.c
+++ b/drivers/power/supply/bq27xxx_battery.c
@@ -29,6 +29,7 @@
  * http://www.ti.com/product/bq27520-g1
  * http://www.ti.com/product/bq27520-g2
  * http://www.ti.com/product/bq27520-g3
+ * http://www.ti.com/product/bq27520-g4
  * http://www.ti.com/product/bq27530-g1
  * http://www.ti.com/product/bq27531-g1
  * http://www.ti.com/product/bq27541-g1
@@ -321,6 +322,25 @@ static u8 bq27xxx_regs[][BQ27XXX_REG_MAX] = {
 		[BQ27XXX_REG_DCAP] = 0x3c,
 		[BQ27XXX_REG_AP] = 0x24,
 	},
+	[BQ27520G4] = {
+		[BQ27XXX_REG_CTRL] = 0x00,
+		[BQ27XXX_REG_TEMP] = 0x06,
+		[BQ27XXX_REG_INT_TEMP] = 0x28,
+		[BQ27XXX_REG_VOLT] = 0x08,
+		[BQ27XXX_REG_AI] = 0x14,
+		[BQ27XXX_REG_FLAGS] = 0x0a,
+		[BQ27XXX_REG_TTE] = 0x16,
+		[BQ27XXX_REG_TTF] = INVALID_REG_ADDR,
+		[BQ27XXX_REG_TTES] = 0x1c,
+		[BQ27XXX_REG_TTECP] = INVALID_REG_ADDR,
+		[BQ27XXX_REG_NAC] = 0x0c,
+		[BQ27XXX_REG_FCC] = 0x12,
+		[BQ27XXX_REG_CYCT] = 0x1e,
+		[BQ27XXX_REG_AE] = INVALID_REG_ADDR,
+		[BQ27XXX_REG_SOC] = 0x20,
+		[BQ27XXX_REG_DCAP] = INVALID_REG_ADDR,
+		[BQ27XXX_REG_AP] = INVALID_REG_ADDR,
+	},
 	[BQ27530] = {
 		[BQ27XXX_REG_CTRL] = 0x00,
 		[BQ27XXX_REG_TEMP] = 0x06,
@@ -619,6 +639,23 @@ static enum power_supply_property bq27520g3_battery_props[] = {
 	POWER_SUPPLY_PROP_MANUFACTURER,
 };
 
+static enum power_supply_property bq27520g4_battery_props[] = {
+	POWER_SUPPLY_PROP_STATUS,
+	POWER_SUPPLY_PROP_PRESENT,
+	POWER_SUPPLY_PROP_VOLTAGE_NOW,
+	POWER_SUPPLY_PROP_CURRENT_NOW,
+	POWER_SUPPLY_PROP_CAPACITY,
+	POWER_SUPPLY_PROP_CAPACITY_LEVEL,
+	POWER_SUPPLY_PROP_TEMP,
+	POWER_SUPPLY_PROP_TIME_TO_EMPTY_NOW,
+	POWER_SUPPLY_PROP_TECHNOLOGY,
+	POWER_SUPPLY_PROP_CHARGE_FULL,
+	POWER_SUPPLY_PROP_CHARGE_NOW,
+	POWER_SUPPLY_PROP_CYCLE_COUNT,
+	POWER_SUPPLY_PROP_HEALTH,
+	POWER_SUPPLY_PROP_MANUFACTURER,
+};
+
 static enum power_supply_property bq27530_battery_props[] = {
 	POWER_SUPPLY_PROP_STATUS,
 	POWER_SUPPLY_PROP_PRESENT,
@@ -710,6 +747,7 @@ static struct {
 	BQ27XXX_PROP(BQ27520G1, bq27520g1_battery_props),
 	BQ27XXX_PROP(BQ27520G2, bq27520g2_battery_props),
 	BQ27XXX_PROP(BQ27520G3, bq27520g3_battery_props),
+	BQ27XXX_PROP(BQ27520G4, bq27520g4_battery_props),
 	BQ27XXX_PROP(BQ27530, bq27530_battery_props),
 	BQ27XXX_PROP(BQ27541, bq27541_battery_props),
 	BQ27XXX_PROP(BQ27545, bq27545_battery_props),
@@ -971,6 +1009,7 @@ static bool bq27xxx_battery_overtemp(struct bq27xxx_device_info *di, u16 flags)
 	case BQ27520G1:
 	case BQ27520G2:
 	case BQ27520G3:
+	case BQ27520G4:
 	case BQ27541:
 	case BQ27545:
 		return flags & (BQ27XXX_FLAG_OTC | BQ27XXX_FLAG_OTD);
diff --git a/drivers/power/supply/bq27xxx_battery_i2c.c b/drivers/power/supply/bq27xxx_battery_i2c.c
index 3712cd902023..c68fbc3fe50a 100644
--- a/drivers/power/supply/bq27xxx_battery_i2c.c
+++ b/drivers/power/supply/bq27xxx_battery_i2c.c
@@ -158,6 +158,7 @@ static const struct i2c_device_id bq27xxx_i2c_id_table[] = {
 	{ "bq27520g1", BQ27520G1 },
 	{ "bq27520g2", BQ27520G2 },
 	{ "bq27520g3", BQ27520G3 },
+	{ "bq27520g4", BQ27520G4 },
 	{ "bq27530", BQ27530 },
 	{ "bq27531", BQ27530 },
 	{ "bq27541", BQ27541 },
@@ -187,6 +188,7 @@ static const struct of_device_id bq27xxx_battery_i2c_of_match_table[] = {
 	{ .compatible = "ti,bq27520g1" },
 	{ .compatible = "ti,bq27520g2" },
 	{ .compatible = "ti,bq27520g3" },
+	{ .compatible = "ti,bq27520g4" },
 	{ .compatible = "ti,bq27530" },
 	{ .compatible = "ti,bq27531" },
 	{ .compatible = "ti,bq27541" },
diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h
index 5c12717d0f75..b312bcef53da 100644
--- a/include/linux/power/bq27xxx_battery.h
+++ b/include/linux/power/bq27xxx_battery.h
@@ -13,6 +13,7 @@ enum bq27xxx_chip {
 	BQ27520G1, /* bq27520G1 */
 	BQ27520G2, /* bq27520G2 */
 	BQ27520G3, /* bq27520G3 */
+	BQ27520G4, /* bq27520G4 */
 	BQ27530, /* bq27530, bq27531 */
 	BQ27541, /* bq27541, bq27542, bq27546, bq27742 */
 	BQ27545, /* bq27545 */
-- 
cgit v1.2.3


From 729204ef49ec00b788ce23deb9eb922a5769f55d Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Sat, 17 Dec 2016 18:49:09 +0800
Subject: block: relax check on sg gap

If the last bvec of the 1st bio and the 1st bvec of the next
bio are physically contigious, and the latter can be merged
to last segment of the 1st bio, we should think they don't
violate sg gap(or virt boundary) limit.

Both Vitaly and Dexuan reported lots of unmergeable small bios
are observed when running mkfs on Hyper-V virtual storage, and
performance becomes quite low. This patch fixes that performance
issue.

The same issue should exist on NVMe, since it sets virt boundary too.

Reported-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Reported-by: Dexuan Cui <decui@microsoft.com>
Tested-by: Dexuan Cui <decui@microsoft.com>
Cc: Keith Busch <keith.busch@intel.com>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blkdev.h | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 83695641bd5e..b20da8dfa7ec 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1607,6 +1607,25 @@ static inline bool bvec_gap_to_prev(struct request_queue *q,
 	return __bvec_gap_to_prev(q, bprv, offset);
 }
 
+/*
+ * Check if the two bvecs from two bios can be merged to one segment.
+ * If yes, no need to check gap between the two bios since the 1st bio
+ * and the 1st bvec in the 2nd bio can be handled in one segment.
+ */
+static inline bool bios_segs_mergeable(struct request_queue *q,
+		struct bio *prev, struct bio_vec *prev_last_bv,
+		struct bio_vec *next_first_bv)
+{
+	if (!BIOVEC_PHYS_MERGEABLE(prev_last_bv, next_first_bv))
+		return false;
+	if (!BIOVEC_SEG_BOUNDARY(q, prev_last_bv, next_first_bv))
+		return false;
+	if (prev->bi_seg_back_size + next_first_bv->bv_len >
+			queue_max_segment_size(q))
+		return false;
+	return true;
+}
+
 static inline bool bio_will_gap(struct request_queue *q, struct bio *prev,
 			 struct bio *next)
 {
@@ -1616,7 +1635,8 @@ static inline bool bio_will_gap(struct request_queue *q, struct bio *prev,
 		bio_get_last_bvec(prev, &pb);
 		bio_get_first_bvec(next, &nb);
 
-		return __bvec_gap_to_prev(q, &pb, nb.bv_offset);
+		if (!bios_segs_mergeable(q, prev, &pb, &nb))
+			return __bvec_gap_to_prev(q, &pb, nb.bv_offset);
 	}
 
 	return false;
-- 
cgit v1.2.3


From f8a5b12247fe18f7fed801ad262a7ab190e1f848 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 13 Dec 2016 09:24:51 -0700
Subject: blk-mq: make mq_ops a const pointer

We never change it, make that clear.

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
---
 block/blk-mq.c         | 2 +-
 include/linux/blk-mq.h | 2 +-
 include/linux/blkdev.h | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index a8e67a155d04..79e1cb0f7b15 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -639,7 +639,7 @@ struct blk_mq_timeout_data {
 
 void blk_mq_rq_timed_out(struct request *req, bool reserved)
 {
-	struct blk_mq_ops *ops = req->q->mq_ops;
+	const struct blk_mq_ops *ops = req->q->mq_ops;
 	enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
 
 	/*
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 4a2ab5d99ff7..afc81d77e471 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -60,7 +60,7 @@ struct blk_mq_hw_ctx {
 
 struct blk_mq_tag_set {
 	unsigned int		*mq_map;
-	struct blk_mq_ops	*ops;
+	const struct blk_mq_ops	*ops;
 	unsigned int		nr_hw_queues;
 	unsigned int		queue_depth;	/* max hw supported */
 	unsigned int		reserved_tags;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b20da8dfa7ec..2e99d659b0f1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -407,7 +407,7 @@ struct request_queue {
 	dma_drain_needed_fn	*dma_drain_needed;
 	lld_busy_fn		*lld_busy_fn;
 
-	struct blk_mq_ops	*mq_ops;
+	const struct blk_mq_ops	*mq_ops;
 
 	unsigned int		*mq_map;
 
-- 
cgit v1.2.3


From 607904c357c61adf20b8fd18af765e501d61a385 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Mon, 9 Jan 2017 10:26:52 -0500
Subject: locking/spinlocks: Remove the unused spin_lock_bh_nested() API

The spin_lock_bh_nested() API is defined but is not used anywhere
in the kernel. So all spin_lock_bh_nested() and related APIs are
now removed.

Signed-off-by: Waiman Long <longman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1483975612-16447-1-git-send-email-longman@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/spinlock.h         | 8 --------
 include/linux/spinlock_api_smp.h | 2 --
 include/linux/spinlock_api_up.h  | 1 -
 kernel/locking/spinlock.c        | 8 --------
 4 files changed, 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 47dd0cebd204..59248dcc6ef3 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -180,8 +180,6 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 # define raw_spin_lock_nested(lock, subclass) \
 	_raw_spin_lock_nested(lock, subclass)
-# define raw_spin_lock_bh_nested(lock, subclass) \
-	_raw_spin_lock_bh_nested(lock, subclass)
 
 # define raw_spin_lock_nest_lock(lock, nest_lock)			\
 	 do {								\
@@ -197,7 +195,6 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
 # define raw_spin_lock_nested(lock, subclass)		\
 	_raw_spin_lock(((void)(subclass), (lock)))
 # define raw_spin_lock_nest_lock(lock, nest_lock)	_raw_spin_lock(lock)
-# define raw_spin_lock_bh_nested(lock, subclass)	_raw_spin_lock_bh(lock)
 #endif
 
 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
@@ -317,11 +314,6 @@ do {								\
 	raw_spin_lock_nested(spinlock_check(lock), subclass);	\
 } while (0)
 
-#define spin_lock_bh_nested(lock, subclass)			\
-do {								\
-	raw_spin_lock_bh_nested(spinlock_check(lock), subclass);\
-} while (0)
-
 #define spin_lock_nest_lock(lock, nest_lock)				\
 do {									\
 	raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock);	\
diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
index 5344268e6e62..42dfab89e740 100644
--- a/include/linux/spinlock_api_smp.h
+++ b/include/linux/spinlock_api_smp.h
@@ -22,8 +22,6 @@ int in_lock_functions(unsigned long addr);
 void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)		__acquires(lock);
 void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
 								__acquires(lock);
-void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass)
-								__acquires(lock);
 void __lockfunc
 _raw_spin_lock_nest_lock(raw_spinlock_t *lock, struct lockdep_map *map)
 								__acquires(lock);
diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h
index d3afef9d8dbe..d0d188861ad6 100644
--- a/include/linux/spinlock_api_up.h
+++ b/include/linux/spinlock_api_up.h
@@ -57,7 +57,6 @@
 
 #define _raw_spin_lock(lock)			__LOCK(lock)
 #define _raw_spin_lock_nested(lock, subclass)	__LOCK(lock)
-#define _raw_spin_lock_bh_nested(lock, subclass) __LOCK(lock)
 #define _raw_read_lock(lock)			__LOCK(lock)
 #define _raw_write_lock(lock)			__LOCK(lock)
 #define _raw_spin_lock_bh(lock)			__LOCK_BH(lock)
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index db3ccb1dd614..4b082b5cac9e 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -363,14 +363,6 @@ void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
 }
 EXPORT_SYMBOL(_raw_spin_lock_nested);
 
-void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass)
-{
-	__local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
-	spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
-	LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
-}
-EXPORT_SYMBOL(_raw_spin_lock_bh_nested);
-
 unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
 						   int subclass)
 {
-- 
cgit v1.2.3


From 6c0b2e833f1439ebcbd7258b655623c1aef23ffb Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Wed, 11 Jan 2017 16:32:17 +0200
Subject: soc: qcom: smem_state: Fix include for ERR_PTR()

The correct include file for getting errno constants and ERR_PTR() is
linux/err.h, rather than linux/errno.h, so fix the include.

Fixes: e8b123e60084 ("soc: qcom: smem_state: Add stubs for disabled smem_state")
Acked-by: Andy Gross <andy.gross@linaro.org>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Kalle Valo <kvalo@qca.qualcomm.com>
---
 include/linux/soc/qcom/smem_state.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/soc/qcom/smem_state.h b/include/linux/soc/qcom/smem_state.h
index 7b88697929e9..b8478ee7a71f 100644
--- a/include/linux/soc/qcom/smem_state.h
+++ b/include/linux/soc/qcom/smem_state.h
@@ -1,7 +1,7 @@
 #ifndef __QCOM_SMEM_STATE__
 #define __QCOM_SMEM_STATE__
 
-#include <linux/errno.h>
+#include <linux/err.h>
 
 struct device_node;
 struct qcom_smem_state;
-- 
cgit v1.2.3


From 732dbf3a6104a3abfcfcd066dcaf89e5054ce009 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Thu, 22 Dec 2016 08:31:34 +0100
Subject: serial: do not accept sysrq characters via serial port

many embedded boards have a disconnected TTL level serial which can
generate some garbage that can lead to spurious false sysrq detects.

Signed-off-by: John Crispin <john@phrozen.org>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/serial_core.h |  2 +-
 lib/Kconfig.debug           | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 5def8e830fb0..58484fb35cc8 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -450,7 +450,7 @@ extern void uart_handle_cts_change(struct uart_port *uport,
 extern void uart_insert_char(struct uart_port *port, unsigned int status,
 		 unsigned int overrun, unsigned int ch, unsigned int flag);
 
-#ifdef SUPPORT_SYSRQ
+#if defined(SUPPORT_SYSRQ) && defined(CONFIG_MAGIC_SYSRQ_SERIAL)
 static inline int
 uart_handle_sysrq_char(struct uart_port *port, unsigned int ch)
 {
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index b06848a104e6..9fa4e6eb1fe3 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -416,6 +416,16 @@ config MAGIC_SYSRQ_DEFAULT_ENABLE
 	  This may be set to 1 or 0 to enable or disable them all, or
 	  to a bitmask as described in Documentation/sysrq.txt.
 
+config MAGIC_SYSRQ_SERIAL
+	bool "Enable magic SysRq key over serial"
+	depends on MAGIC_SYSRQ
+	default y
+	help
+	  Many embedded boards have a disconnected TTL level serial which can
+	  generate some garbage that can lead to spurious false sysrq detects.
+	  This option allows you to decide whether you want to enable the
+	  magic SysRq key.
+
 config DEBUG_KERNEL
 	bool "Kernel debugging"
 	help
-- 
cgit v1.2.3


From 738b35ccee1bcd7cf4af147edd76e7880533ad9f Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed, 11 Jan 2017 21:13:02 -0800
Subject: net: core: Make netif_wake_subqueue a wrapper

netif_wake_subqueue() is duplicating the same thing that netif_tx_wake_queue()
does, so make it call it directly after looking up the queue from the index.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 14 +++++++++++++-
 net/core/dev.c            | 22 ----------------------
 2 files changed, 13 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ebd9e2c12f44..97ae0ac513ee 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3106,7 +3106,19 @@ static inline bool netif_subqueue_stopped(const struct net_device *dev,
 	return __netif_subqueue_stopped(dev, skb_get_queue_mapping(skb));
 }
 
-void netif_wake_subqueue(struct net_device *dev, u16 queue_index);
+/**
+ *	netif_wake_subqueue - allow sending packets on subqueue
+ *	@dev: network device
+ *	@queue_index: sub queue index
+ *
+ * Resume individual transmit queue of a device with multiple transmit queues.
+ */
+static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
+{
+	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
+
+	netif_tx_wake_queue(txq);
+}
 
 #ifdef CONFIG_XPS
 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
diff --git a/net/core/dev.c b/net/core/dev.c
index e98cc06d2577..ad5959e56116 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2408,28 +2408,6 @@ void netif_schedule_queue(struct netdev_queue *txq)
 }
 EXPORT_SYMBOL(netif_schedule_queue);
 
-/**
- *	netif_wake_subqueue - allow sending packets on subqueue
- *	@dev: network device
- *	@queue_index: sub queue index
- *
- * Resume individual transmit queue of a device with multiple transmit queues.
- */
-void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
-{
-	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
-
-	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
-		struct Qdisc *q;
-
-		rcu_read_lock();
-		q = rcu_dereference(txq->qdisc);
-		__netif_schedule(q);
-		rcu_read_unlock();
-	}
-}
-EXPORT_SYMBOL(netif_wake_subqueue);
-
 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
 {
 	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
-- 
cgit v1.2.3


From 6b8cc1d11ef75c5b9c530b3d0d148f3c2dd25f93 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 12 Jan 2017 11:51:32 +0100
Subject: bpf: pass original insn directly to convert_ctx_access

Currently, when calling convert_ctx_access() callback for the various
program types, we pass in insn->dst_reg, insn->src_reg, insn->off from
the original instruction. This information is needed to rewrite the
instruction that is based on the user ctx structure into a kernel
representation for the ctx. As we'd like to allow access size beyond
just BPF_W, we'd need also insn->code for that in order to decode the
original access size. Given that, lets just pass insn directly to the
convert_ctx_access() callback and work on that to not clutter the
callback with even more arguments we need to pass when everything is
already contained in insn. So lets go through that once, no functional
change.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |   7 ++-
 kernel/bpf/verifier.c    |   3 +-
 kernel/trace/bpf_trace.c |  15 ++---
 net/core/filter.c        | 139 +++++++++++++++++++++++++----------------------
 4 files changed, 87 insertions(+), 77 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 94ea8d2383e6..f8c3560b01db 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -161,9 +161,10 @@ struct bpf_verifier_ops {
 				enum bpf_reg_type *reg_type);
 	int (*gen_prologue)(struct bpf_insn *insn, bool direct_write,
 			    const struct bpf_prog *prog);
-	u32 (*convert_ctx_access)(enum bpf_access_type type, int dst_reg,
-				  int src_reg, int ctx_off,
-				  struct bpf_insn *insn, struct bpf_prog *prog);
+	u32 (*convert_ctx_access)(enum bpf_access_type type,
+				  const struct bpf_insn *src,
+				  struct bpf_insn *dst,
+				  struct bpf_prog *prog);
 };
 
 struct bpf_prog_type_list {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2efdc9128e3c..df7e47244e75 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3177,8 +3177,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX)
 			continue;
 
-		cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg,
-					      insn->off, insn_buf, env->prog);
+		cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog);
 		if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
 			verbose("bpf verifier is misconfigured\n");
 			return -EINVAL;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f883c43c96f3..1860e7f1e5a8 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -572,28 +572,29 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type
 	return true;
 }
 
-static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg,
-				      int src_reg, int ctx_off,
+static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
+				      const struct bpf_insn *si,
 				      struct bpf_insn *insn_buf,
 				      struct bpf_prog *prog)
 {
 	struct bpf_insn *insn = insn_buf;
 
-	switch (ctx_off) {
+	switch (si->off) {
 	case offsetof(struct bpf_perf_event_data, sample_period):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64));
 
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
-						       data), dst_reg, src_reg,
+						       data), si->dst_reg, si->src_reg,
 				      offsetof(struct bpf_perf_event_data_kern, data));
-		*insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg,
+		*insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
 				      offsetof(struct perf_sample_data, period));
 		break;
 	default:
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
-						       regs), dst_reg, src_reg,
+						       regs), si->dst_reg, si->src_reg,
 				      offsetof(struct bpf_perf_event_data_kern, regs));
-		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off);
+		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), si->dst_reg, si->dst_reg,
+				      si->off);
 		break;
 	}
 
diff --git a/net/core/filter.c b/net/core/filter.c
index f4d16a905754..8cfbdefbfb1c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2972,32 +2972,33 @@ void bpf_warn_invalid_xdp_action(u32 act)
 }
 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
 
-static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
-					int src_reg, int ctx_off,
+static u32 sk_filter_convert_ctx_access(enum bpf_access_type type,
+					const struct bpf_insn *si,
 					struct bpf_insn *insn_buf,
 					struct bpf_prog *prog)
 {
 	struct bpf_insn *insn = insn_buf;
+	int off;
 
-	switch (ctx_off) {
+	switch (si->off) {
 	case offsetof(struct __sk_buff, len):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
 
-		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 				      offsetof(struct sk_buff, len));
 		break;
 
 	case offsetof(struct __sk_buff, protocol):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
 
-		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
 				      offsetof(struct sk_buff, protocol));
 		break;
 
 	case offsetof(struct __sk_buff, vlan_proto):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);
 
-		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
 				      offsetof(struct sk_buff, vlan_proto));
 		break;
 
@@ -3005,17 +3006,17 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4);
 
 		if (type == BPF_WRITE)
-			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
+			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
 					      offsetof(struct sk_buff, priority));
 		else
-			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 					      offsetof(struct sk_buff, priority));
 		break;
 
 	case offsetof(struct __sk_buff, ingress_ifindex):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4);
 
-		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 				      offsetof(struct sk_buff, skb_iif));
 		break;
 
@@ -3023,17 +3024,17 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
 
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
-				      dst_reg, src_reg,
+				      si->dst_reg, si->src_reg,
 				      offsetof(struct sk_buff, dev));
-		*insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
-		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
+		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
 				      offsetof(struct net_device, ifindex));
 		break;
 
 	case offsetof(struct __sk_buff, hash):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
 
-		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 				      offsetof(struct sk_buff, hash));
 		break;
 
@@ -3041,63 +3042,74 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
 
 		if (type == BPF_WRITE)
-			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
+			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
 					      offsetof(struct sk_buff, mark));
 		else
-			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 					      offsetof(struct sk_buff, mark));
 		break;
 
 	case offsetof(struct __sk_buff, pkt_type):
-		return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn);
+		return convert_skb_access(SKF_AD_PKTTYPE, si->dst_reg,
+					  si->src_reg, insn);
 
 	case offsetof(struct __sk_buff, queue_mapping):
-		return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn);
+		return convert_skb_access(SKF_AD_QUEUE, si->dst_reg,
+					  si->src_reg, insn);
 
 	case offsetof(struct __sk_buff, vlan_present):
 		return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
-					  dst_reg, src_reg, insn);
+					  si->dst_reg, si->src_reg, insn);
 
 	case offsetof(struct __sk_buff, vlan_tci):
 		return convert_skb_access(SKF_AD_VLAN_TAG,
-					  dst_reg, src_reg, insn);
+					  si->dst_reg, si->src_reg, insn);
 
 	case offsetof(struct __sk_buff, cb[0]) ...
 	     offsetof(struct __sk_buff, cb[4]):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
 
 		prog->cb_access = 1;
-		ctx_off -= offsetof(struct __sk_buff, cb[0]);
-		ctx_off += offsetof(struct sk_buff, cb);
-		ctx_off += offsetof(struct qdisc_skb_cb, data);
+		off  = si->off;
+		off -= offsetof(struct __sk_buff, cb[0]);
+		off += offsetof(struct sk_buff, cb);
+		off += offsetof(struct qdisc_skb_cb, data);
 		if (type == BPF_WRITE)
-			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
+			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg,
+					      si->src_reg, off);
 		else
-			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
+			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg,
+					      si->src_reg, off);
 		break;
 
 	case offsetof(struct __sk_buff, tc_classid):
-		ctx_off -= offsetof(struct __sk_buff, tc_classid);
-		ctx_off += offsetof(struct sk_buff, cb);
-		ctx_off += offsetof(struct qdisc_skb_cb, tc_classid);
+		BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, tc_classid) != 2);
+
+		off  = si->off;
+		off -= offsetof(struct __sk_buff, tc_classid);
+		off += offsetof(struct sk_buff, cb);
+		off += offsetof(struct qdisc_skb_cb, tc_classid);
 		if (type == BPF_WRITE)
-			*insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
+			*insn++ = BPF_STX_MEM(BPF_H, si->dst_reg,
+					      si->src_reg, off);
 		else
-			*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
+			*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg,
+					      si->src_reg, off);
 		break;
 
 	case offsetof(struct __sk_buff, data):
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
-				      dst_reg, src_reg,
+				      si->dst_reg, si->src_reg,
 				      offsetof(struct sk_buff, data));
 		break;
 
 	case offsetof(struct __sk_buff, data_end):
-		ctx_off -= offsetof(struct __sk_buff, data_end);
-		ctx_off += offsetof(struct sk_buff, cb);
-		ctx_off += offsetof(struct bpf_skb_data_end, data_end);
-		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg,
-				      ctx_off);
+		off  = si->off;
+		off -= offsetof(struct __sk_buff, data_end);
+		off += offsetof(struct sk_buff, cb);
+		off += offsetof(struct bpf_skb_data_end, data_end);
+		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
+				      si->src_reg, off);
 		break;
 
 	case offsetof(struct __sk_buff, tc_index):
@@ -3105,110 +3117,107 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);
 
 		if (type == BPF_WRITE)
-			*insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg,
+			*insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg,
 					      offsetof(struct sk_buff, tc_index));
 		else
-			*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+			*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
 					      offsetof(struct sk_buff, tc_index));
-		break;
 #else
 		if (type == BPF_WRITE)
-			*insn++ = BPF_MOV64_REG(dst_reg, dst_reg);
+			*insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg);
 		else
-			*insn++ = BPF_MOV64_IMM(dst_reg, 0);
-		break;
+			*insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
 #endif
+		break;
 	}
 
 	return insn - insn_buf;
 }
 
 static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
-					  int dst_reg, int src_reg,
-					  int ctx_off,
+					  const struct bpf_insn *si,
 					  struct bpf_insn *insn_buf,
 					  struct bpf_prog *prog)
 {
 	struct bpf_insn *insn = insn_buf;
 
-	switch (ctx_off) {
+	switch (si->off) {
 	case offsetof(struct bpf_sock, bound_dev_if):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4);
 
 		if (type == BPF_WRITE)
-			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
+			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
 					offsetof(struct sock, sk_bound_dev_if));
 		else
-			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 				      offsetof(struct sock, sk_bound_dev_if));
 		break;
 
 	case offsetof(struct bpf_sock, family):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);
 
-		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
 				      offsetof(struct sock, sk_family));
 		break;
 
 	case offsetof(struct bpf_sock, type):
-		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 				      offsetof(struct sock, __sk_flags_offset));
-		*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_TYPE_MASK);
-		*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_TYPE_SHIFT);
+		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK);
+		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT);
 		break;
 
 	case offsetof(struct bpf_sock, protocol):
-		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 				      offsetof(struct sock, __sk_flags_offset));
-		*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_PROTO_MASK);
-		*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_PROTO_SHIFT);
+		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
+		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT);
 		break;
 	}
 
 	return insn - insn_buf;
 }
 
-static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg,
-					 int src_reg, int ctx_off,
+static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,
+					 const struct bpf_insn *si,
 					 struct bpf_insn *insn_buf,
 					 struct bpf_prog *prog)
 {
 	struct bpf_insn *insn = insn_buf;
 
-	switch (ctx_off) {
+	switch (si->off) {
 	case offsetof(struct __sk_buff, ifindex):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
 
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
-				      dst_reg, src_reg,
+				      si->dst_reg, si->src_reg,
 				      offsetof(struct sk_buff, dev));
-		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
 				      offsetof(struct net_device, ifindex));
 		break;
 	default:
-		return sk_filter_convert_ctx_access(type, dst_reg, src_reg,
-						    ctx_off, insn_buf, prog);
+		return sk_filter_convert_ctx_access(type, si, insn_buf, prog);
 	}
 
 	return insn - insn_buf;
 }
 
-static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
-				  int src_reg, int ctx_off,
+static u32 xdp_convert_ctx_access(enum bpf_access_type type,
+				  const struct bpf_insn *si,
 				  struct bpf_insn *insn_buf,
 				  struct bpf_prog *prog)
 {
 	struct bpf_insn *insn = insn_buf;
 
-	switch (ctx_off) {
+	switch (si->off) {
 	case offsetof(struct xdp_md, data):
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),
-				      dst_reg, src_reg,
+				      si->dst_reg, si->src_reg,
 				      offsetof(struct xdp_buff, data));
 		break;
 	case offsetof(struct xdp_md, data_end):
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
-				      dst_reg, src_reg,
+				      si->dst_reg, si->src_reg,
 				      offsetof(struct xdp_buff, data_end));
 		break;
 	}
-- 
cgit v1.2.3


From 3a2f5a59a695a73e0cde9a61e0feae5fa730e936 Mon Sep 17 00:00:00 2001
From: Stephen Smalley <sds@tycho.nsa.gov>
Date: Tue, 10 Jan 2017 12:28:32 -0500
Subject: security,selinux,smack: kill security_task_wait hook

As reported by yangshukui, a permission denial from security_task_wait()
can lead to a soft lockup in zap_pid_ns_processes() since it only expects
sys_wait4() to return 0 or -ECHILD. Further, security_task_wait() can
in general lead to zombies; in the absence of some way to automatically
reparent a child process upon a denial, the hook is not useful.  Remove
the security hook and its implementations in SELinux and Smack.  Smack
already removed its check from its hook.

Reported-by: yangshukui <yangshukui@huawei.com>
Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h  |  7 -------
 include/linux/security.h   |  6 ------
 kernel/exit.c              | 19 ++-----------------
 security/security.c        |  6 ------
 security/selinux/hooks.c   |  7 -------
 security/smack/smack_lsm.c | 20 --------------------
 6 files changed, 2 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 0dde95900196..6fe7a5cb0be1 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -666,11 +666,6 @@
  *	@sig contains the signal value.
  *	@secid contains the sid of the process where the signal originated
  *	Return 0 if permission is granted.
- * @task_wait:
- *	Check permission before allowing a process to reap a child process @p
- *	and collect its status information.
- *	@p contains the task_struct for process.
- *	Return 0 if permission is granted.
  * @task_prctl:
  *	Check permission before performing a process control operation on the
  *	current process.
@@ -1507,7 +1502,6 @@ union security_list_options {
 	int (*task_movememory)(struct task_struct *p);
 	int (*task_kill)(struct task_struct *p, struct siginfo *info,
 				int sig, u32 secid);
-	int (*task_wait)(struct task_struct *p);
 	int (*task_prctl)(int option, unsigned long arg2, unsigned long arg3,
 				unsigned long arg4, unsigned long arg5);
 	void (*task_to_inode)(struct task_struct *p, struct inode *inode);
@@ -1767,7 +1761,6 @@ struct security_hook_heads {
 	struct list_head task_getscheduler;
 	struct list_head task_movememory;
 	struct list_head task_kill;
-	struct list_head task_wait;
 	struct list_head task_prctl;
 	struct list_head task_to_inode;
 	struct list_head ipc_permission;
diff --git a/include/linux/security.h b/include/linux/security.h
index f4ebac117fa6..d3868f2ebada 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -332,7 +332,6 @@ int security_task_getscheduler(struct task_struct *p);
 int security_task_movememory(struct task_struct *p);
 int security_task_kill(struct task_struct *p, struct siginfo *info,
 			int sig, u32 secid);
-int security_task_wait(struct task_struct *p);
 int security_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 			unsigned long arg4, unsigned long arg5);
 void security_task_to_inode(struct task_struct *p, struct inode *inode);
@@ -980,11 +979,6 @@ static inline int security_task_kill(struct task_struct *p,
 	return 0;
 }
 
-static inline int security_task_wait(struct task_struct *p)
-{
-	return 0;
-}
-
 static inline int security_task_prctl(int option, unsigned long arg2,
 				      unsigned long arg3,
 				      unsigned long arg4,
diff --git a/kernel/exit.c b/kernel/exit.c
index 8f14b866f9f6..60f245190571 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -14,7 +14,6 @@
 #include <linux/tty.h>
 #include <linux/iocontext.h>
 #include <linux/key.h>
-#include <linux/security.h>
 #include <linux/cpu.h>
 #include <linux/acct.h>
 #include <linux/tsacct_kern.h>
@@ -1360,7 +1359,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
  * Returns nonzero for a final return, when we have unlocked tasklist_lock.
  * Returns zero if the search for a child should continue;
  * then ->notask_error is 0 if @p is an eligible child,
- * or another error from security_task_wait(), or still -ECHILD.
+ * or still -ECHILD.
  */
 static int wait_consider_task(struct wait_opts *wo, int ptrace,
 				struct task_struct *p)
@@ -1380,20 +1379,6 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 	if (!ret)
 		return ret;
 
-	ret = security_task_wait(p);
-	if (unlikely(ret < 0)) {
-		/*
-		 * If we have not yet seen any eligible child,
-		 * then let this error code replace -ECHILD.
-		 * A permission error will give the user a clue
-		 * to look for security policy problems, rather
-		 * than for mysterious wait bugs.
-		 */
-		if (wo->notask_error)
-			wo->notask_error = ret;
-		return 0;
-	}
-
 	if (unlikely(exit_state == EXIT_TRACE)) {
 		/*
 		 * ptrace == 0 means we are the natural parent. In this case
@@ -1486,7 +1471,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
  * Returns nonzero for a final return, when we have unlocked tasklist_lock.
  * Returns zero if the search for a child should continue; then
  * ->notask_error is 0 if there were any eligible children,
- * or another error from security_task_wait(), or still -ECHILD.
+ * or still -ECHILD.
  */
 static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
 {
diff --git a/security/security.c b/security/security.c
index 32052f5c76b2..8c9fee59e60a 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1025,11 +1025,6 @@ int security_task_kill(struct task_struct *p, struct siginfo *info,
 	return call_int_hook(task_kill, 0, p, info, sig, secid);
 }
 
-int security_task_wait(struct task_struct *p)
-{
-	return call_int_hook(task_wait, 0, p);
-}
-
 int security_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 			 unsigned long arg4, unsigned long arg5)
 {
@@ -1769,7 +1764,6 @@ struct security_hook_heads security_hook_heads = {
 	.task_movememory =
 		LIST_HEAD_INIT(security_hook_heads.task_movememory),
 	.task_kill =	LIST_HEAD_INIT(security_hook_heads.task_kill),
-	.task_wait =	LIST_HEAD_INIT(security_hook_heads.task_wait),
 	.task_prctl =	LIST_HEAD_INIT(security_hook_heads.task_prctl),
 	.task_to_inode =
 		LIST_HEAD_INIT(security_hook_heads.task_to_inode),
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 55ad878f1146..a5398fea0966 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3963,12 +3963,6 @@ static int selinux_task_kill(struct task_struct *p, struct siginfo *info,
 	return avc_has_perm(secid, task_sid(p), SECCLASS_PROCESS, perm, NULL);
 }
 
-static int selinux_task_wait(struct task_struct *p)
-{
-	return avc_has_perm(task_sid(p), current_sid(), SECCLASS_PROCESS,
-			    PROCESS__SIGCHLD, NULL);
-}
-
 static void selinux_task_to_inode(struct task_struct *p,
 				  struct inode *inode)
 {
@@ -6211,7 +6205,6 @@ static struct security_hook_list selinux_hooks[] = {
 	LSM_HOOK_INIT(task_getscheduler, selinux_task_getscheduler),
 	LSM_HOOK_INIT(task_movememory, selinux_task_movememory),
 	LSM_HOOK_INIT(task_kill, selinux_task_kill),
-	LSM_HOOK_INIT(task_wait, selinux_task_wait),
 	LSM_HOOK_INIT(task_to_inode, selinux_task_to_inode),
 
 	LSM_HOOK_INIT(ipc_permission, selinux_ipc_permission),
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 8da4a6b9ca4d..2166373ea5a4 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -2271,25 +2271,6 @@ static int smack_task_kill(struct task_struct *p, struct siginfo *info,
 	return rc;
 }
 
-/**
- * smack_task_wait - Smack access check for waiting
- * @p: task to wait for
- *
- * Returns 0
- */
-static int smack_task_wait(struct task_struct *p)
-{
-	/*
-	 * Allow the operation to succeed.
-	 * Zombies are bad.
-	 * In userless environments (e.g. phones) programs
-	 * get marked with SMACK64EXEC and even if the parent
-	 * and child shouldn't be talking the parent still
-	 * may expect to know when the child exits.
-	 */
-	return 0;
-}
-
 /**
  * smack_task_to_inode - copy task smack into the inode blob
  * @p: task to copy from
@@ -4658,7 +4639,6 @@ static struct security_hook_list smack_hooks[] = {
 	LSM_HOOK_INIT(task_getscheduler, smack_task_getscheduler),
 	LSM_HOOK_INIT(task_movememory, smack_task_movememory),
 	LSM_HOOK_INIT(task_kill, smack_task_kill),
-	LSM_HOOK_INIT(task_wait, smack_task_wait),
 	LSM_HOOK_INIT(task_to_inode, smack_task_to_inode),
 
 	LSM_HOOK_INIT(ipc_permission, smack_ipc_permission),
-- 
cgit v1.2.3


From d8c34b949d8c9f61e099e00f22770e400adf2b76 Mon Sep 17 00:00:00 2001
From: Gideon Israel Dsouza <gidisrael@gmail.com>
Date: Sat, 31 Dec 2016 21:26:23 +0530
Subject: crypto: Replaced gcc specific attributes with macros from compiler.h

Continuing from this commit: 52f5684c8e1e
("kernel: use macros from compiler.h instead of __attribute__((...))")

I submitted 4 total patches. They are part of task I've taken up to
increase compiler portability in the kernel. I've cleaned up the
subsystems under /kernel /mm /block and /security, this patch targets
/crypto.

There is <linux/compiler.h> which provides macros for various gcc specific
constructs. Eg: __weak for __attribute__((weak)). I've cleaned all
instances of gcc specific attributes with the right macros for the crypto
subsystem.

I had to make one additional change into compiler-gcc.h for the case when
one wants to use this: __attribute__((aligned) and not specify an alignment
factor. From the gcc docs, this will result in the largest alignment for
that data type on the target machine so I've named the macro
__aligned_largest. Please advise if another name is more appropriate.

Signed-off-by: Gideon Israel Dsouza <gidisrael@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/ablkcipher.c          | 5 +++--
 crypto/acompress.c           | 3 ++-
 crypto/aead.c                | 3 ++-
 crypto/ahash.c               | 3 ++-
 crypto/akcipher.c            | 3 ++-
 crypto/blkcipher.c           | 7 ++++---
 crypto/cts.c                 | 5 +++--
 crypto/kpp.c                 | 3 ++-
 crypto/pcbc.c                | 3 ++-
 crypto/rng.c                 | 3 ++-
 crypto/scompress.c           | 3 ++-
 crypto/shash.c               | 9 +++++----
 crypto/skcipher.c            | 3 ++-
 include/linux/compiler-gcc.h | 1 +
 14 files changed, 34 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/ablkcipher.c b/crypto/ablkcipher.c
index d676fc59521a..d880a4897159 100644
--- a/crypto/ablkcipher.c
+++ b/crypto/ablkcipher.c
@@ -19,6 +19,7 @@
 #include <linux/slab.h>
 #include <linux/seq_file.h>
 #include <linux/cryptouser.h>
+#include <linux/compiler.h>
 #include <net/netlink.h>
 
 #include <crypto/scatterwalk.h>
@@ -394,7 +395,7 @@ static int crypto_ablkcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_ablkcipher_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 static void crypto_ablkcipher_show(struct seq_file *m, struct crypto_alg *alg)
 {
 	struct ablkcipher_alg *ablkcipher = &alg->cra_ablkcipher;
@@ -468,7 +469,7 @@ static int crypto_givcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_givcipher_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 static void crypto_givcipher_show(struct seq_file *m, struct crypto_alg *alg)
 {
 	struct ablkcipher_alg *ablkcipher = &alg->cra_ablkcipher;
diff --git a/crypto/acompress.c b/crypto/acompress.c
index 887783d8e9a9..47d11627cd20 100644
--- a/crypto/acompress.c
+++ b/crypto/acompress.c
@@ -20,6 +20,7 @@
 #include <linux/crypto.h>
 #include <crypto/algapi.h>
 #include <linux/cryptouser.h>
+#include <linux/compiler.h>
 #include <net/netlink.h>
 #include <crypto/internal/acompress.h>
 #include <crypto/internal/scompress.h>
@@ -50,7 +51,7 @@ static int crypto_acomp_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_acomp_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 
 static void crypto_acomp_show(struct seq_file *m, struct crypto_alg *alg)
 {
diff --git a/crypto/aead.c b/crypto/aead.c
index 3f5c5ff004ab..f794b30a9407 100644
--- a/crypto/aead.c
+++ b/crypto/aead.c
@@ -24,6 +24,7 @@
 #include <linux/slab.h>
 #include <linux/seq_file.h>
 #include <linux/cryptouser.h>
+#include <linux/compiler.h>
 #include <net/netlink.h>
 
 #include "internal.h"
@@ -132,7 +133,7 @@ static int crypto_aead_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_aead_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 static void crypto_aead_show(struct seq_file *m, struct crypto_alg *alg)
 {
 	struct aead_alg *aead = container_of(alg, struct aead_alg, base);
diff --git a/crypto/ahash.c b/crypto/ahash.c
index 2ce8bcb9049c..e58c4970c22b 100644
--- a/crypto/ahash.c
+++ b/crypto/ahash.c
@@ -23,6 +23,7 @@
 #include <linux/slab.h>
 #include <linux/seq_file.h>
 #include <linux/cryptouser.h>
+#include <linux/compiler.h>
 #include <net/netlink.h>
 
 #include "internal.h"
@@ -493,7 +494,7 @@ static int crypto_ahash_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_ahash_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 static void crypto_ahash_show(struct seq_file *m, struct crypto_alg *alg)
 {
 	seq_printf(m, "type         : ahash\n");
diff --git a/crypto/akcipher.c b/crypto/akcipher.c
index def301ed1288..cfbdb06d8ca8 100644
--- a/crypto/akcipher.c
+++ b/crypto/akcipher.c
@@ -17,6 +17,7 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/crypto.h>
+#include <linux/compiler.h>
 #include <crypto/algapi.h>
 #include <linux/cryptouser.h>
 #include <net/netlink.h>
@@ -47,7 +48,7 @@ static int crypto_akcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_akcipher_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 
 static void crypto_akcipher_show(struct seq_file *m, struct crypto_alg *alg)
 {
diff --git a/crypto/blkcipher.c b/crypto/blkcipher.c
index a832426820e8..6c43a0a17a55 100644
--- a/crypto/blkcipher.c
+++ b/crypto/blkcipher.c
@@ -1,6 +1,6 @@
 /*
  * Block chaining cipher operations.
- * 
+ *
  * Generic encrypt/decrypt wrapper for ciphers, handles operations across
  * multiple page boundaries by using temporary blocks.  In user context,
  * the kernel is given a chance to schedule us once per page.
@@ -9,7 +9,7 @@
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option) 
+ * Software Foundation; either version 2 of the License, or (at your option)
  * any later version.
  *
  */
@@ -25,6 +25,7 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/cryptouser.h>
+#include <linux/compiler.h>
 #include <net/netlink.h>
 
 #include "internal.h"
@@ -534,7 +535,7 @@ static int crypto_blkcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_blkcipher_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 static void crypto_blkcipher_show(struct seq_file *m, struct crypto_alg *alg)
 {
 	seq_printf(m, "type         : blkcipher\n");
diff --git a/crypto/cts.c b/crypto/cts.c
index 00254d76b21b..a1335d6c35fb 100644
--- a/crypto/cts.c
+++ b/crypto/cts.c
@@ -49,6 +49,7 @@
 #include <linux/scatterlist.h>
 #include <crypto/scatterwalk.h>
 #include <linux/slab.h>
+#include <linux/compiler.h>
 
 struct crypto_cts_ctx {
 	struct crypto_skcipher *child;
@@ -103,7 +104,7 @@ static int cts_cbc_encrypt(struct skcipher_request *req)
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 	struct skcipher_request *subreq = &rctx->subreq;
 	int bsize = crypto_skcipher_blocksize(tfm);
-	u8 d[bsize * 2] __attribute__ ((aligned(__alignof__(u32))));
+	u8 d[bsize * 2] __aligned(__alignof__(u32));
 	struct scatterlist *sg;
 	unsigned int offset;
 	int lastn;
@@ -183,7 +184,7 @@ static int cts_cbc_decrypt(struct skcipher_request *req)
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 	struct skcipher_request *subreq = &rctx->subreq;
 	int bsize = crypto_skcipher_blocksize(tfm);
-	u8 d[bsize * 2] __attribute__ ((aligned(__alignof__(u32))));
+	u8 d[bsize * 2] __aligned(__alignof__(u32));
 	struct scatterlist *sg;
 	unsigned int offset;
 	u8 *space;
diff --git a/crypto/kpp.c b/crypto/kpp.c
index d36ce05eee43..a90edc27af77 100644
--- a/crypto/kpp.c
+++ b/crypto/kpp.c
@@ -19,6 +19,7 @@
 #include <linux/crypto.h>
 #include <crypto/algapi.h>
 #include <linux/cryptouser.h>
+#include <linux/compiler.h>
 #include <net/netlink.h>
 #include <crypto/kpp.h>
 #include <crypto/internal/kpp.h>
@@ -47,7 +48,7 @@ static int crypto_kpp_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_kpp_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 
 static void crypto_kpp_show(struct seq_file *m, struct crypto_alg *alg)
 {
diff --git a/crypto/pcbc.c b/crypto/pcbc.c
index e4538e07f7ca..11d248673ad4 100644
--- a/crypto/pcbc.c
+++ b/crypto/pcbc.c
@@ -20,6 +20,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/compiler.h>
 
 struct crypto_pcbc_ctx {
 	struct crypto_cipher *child;
@@ -146,7 +147,7 @@ static int crypto_pcbc_decrypt_inplace(struct skcipher_request *req,
 	unsigned int nbytes = walk->nbytes;
 	u8 *src = walk->src.virt.addr;
 	u8 *iv = walk->iv;
-	u8 tmpbuf[bsize] __attribute__ ((aligned(__alignof__(u32))));
+	u8 tmpbuf[bsize] __aligned(__alignof__(u32));
 
 	do {
 		memcpy(tmpbuf, src, bsize);
diff --git a/crypto/rng.c b/crypto/rng.c
index b81cffb13bab..f46dac5288b9 100644
--- a/crypto/rng.c
+++ b/crypto/rng.c
@@ -23,6 +23,7 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/cryptouser.h>
+#include <linux/compiler.h>
 #include <net/netlink.h>
 
 #include "internal.h"
@@ -95,7 +96,7 @@ static int crypto_rng_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_rng_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 static void crypto_rng_show(struct seq_file *m, struct crypto_alg *alg)
 {
 	seq_printf(m, "type         : rng\n");
diff --git a/crypto/scompress.c b/crypto/scompress.c
index 35e396d154b7..6b048b36312d 100644
--- a/crypto/scompress.c
+++ b/crypto/scompress.c
@@ -18,6 +18,7 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/crypto.h>
+#include <linux/compiler.h>
 #include <linux/vmalloc.h>
 #include <crypto/algapi.h>
 #include <linux/cryptouser.h>
@@ -57,7 +58,7 @@ static int crypto_scomp_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_scomp_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 
 static void crypto_scomp_show(struct seq_file *m, struct crypto_alg *alg)
 {
diff --git a/crypto/shash.c b/crypto/shash.c
index a051541a4a17..5e31c8d776df 100644
--- a/crypto/shash.c
+++ b/crypto/shash.c
@@ -19,6 +19,7 @@
 #include <linux/seq_file.h>
 #include <linux/cryptouser.h>
 #include <net/netlink.h>
+#include <linux/compiler.h>
 
 #include "internal.h"
 
@@ -67,7 +68,7 @@ EXPORT_SYMBOL_GPL(crypto_shash_setkey);
 static inline unsigned int shash_align_buffer_size(unsigned len,
 						   unsigned long mask)
 {
-	typedef u8 __attribute__ ((aligned)) u8_aligned;
+	typedef u8 __aligned_largest u8_aligned;
 	return len + (mask & ~(__alignof__(u8_aligned) - 1));
 }
 
@@ -80,7 +81,7 @@ static int shash_update_unaligned(struct shash_desc *desc, const u8 *data,
 	unsigned int unaligned_len = alignmask + 1 -
 				     ((unsigned long)data & alignmask);
 	u8 ubuf[shash_align_buffer_size(unaligned_len, alignmask)]
-		__attribute__ ((aligned));
+		__aligned_largest;
 	u8 *buf = PTR_ALIGN(&ubuf[0], alignmask + 1);
 	int err;
 
@@ -116,7 +117,7 @@ static int shash_final_unaligned(struct shash_desc *desc, u8 *out)
 	struct shash_alg *shash = crypto_shash_alg(tfm);
 	unsigned int ds = crypto_shash_digestsize(tfm);
 	u8 ubuf[shash_align_buffer_size(ds, alignmask)]
-		__attribute__ ((aligned));
+		__aligned_largest;
 	u8 *buf = PTR_ALIGN(&ubuf[0], alignmask + 1);
 	int err;
 
@@ -403,7 +404,7 @@ static int crypto_shash_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg)
 {
 	struct shash_alg *salg = __crypto_shash_alg(alg);
diff --git a/crypto/skcipher.c b/crypto/skcipher.c
index 6ee6a1521e0b..014af741fc6a 100644
--- a/crypto/skcipher.c
+++ b/crypto/skcipher.c
@@ -19,6 +19,7 @@
 #include <crypto/scatterwalk.h>
 #include <linux/bug.h>
 #include <linux/cryptouser.h>
+#include <linux/compiler.h>
 #include <linux/list.h>
 #include <linux/module.h>
 #include <linux/rtnetlink.h>
@@ -807,7 +808,7 @@ static void crypto_skcipher_free_instance(struct crypto_instance *inst)
 }
 
 static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg)
 {
 	struct skcipher_alg *skcipher = container_of(alg, struct skcipher_alg,
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 0444b1336268..fddd1a5eb322 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -116,6 +116,7 @@
  */
 #define __pure			__attribute__((pure))
 #define __aligned(x)		__attribute__((aligned(x)))
+#define __aligned_largest	__attribute__((aligned))
 #define __printf(a, b)		__attribute__((format(printf, a, b)))
 #define __scanf(a, b)		__attribute__((format(scanf, a, b)))
 #define __attribute_const__	__attribute__((__const__))
-- 
cgit v1.2.3


From 2cf8e2dfdf88363476f23bc600745250b94dbbed Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.wolfsonmicro.com>
Date: Thu, 12 Jan 2017 11:17:39 +0000
Subject: regmap: Fixup the kernel-doc comments on functions/structures

Most of the kernel-doc comments in regmap don't actually generate
correctly. This patch fixes up a few common issues, corrects some typos
and adds some missing argument descriptions.

The most common issues being using a : after the function name which
causes the short description to not render correctly and not separating
the long and short descriptions of the function. There are quite a few
instances of arguments not being described or given the wrong name as
well.

This patch doesn't fixup functions/structures that are currently missing
descriptions.

Signed-off-by: Charles Keepax <ckeepax@opensource.wolfsonmicro.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regcache.c   |  20 +++----
 drivers/base/regmap/regmap-irq.c |  62 ++++++++++---------
 drivers/base/regmap/regmap.c     | 125 +++++++++++++++++++++------------------
 include/linux/regmap.h           | 115 ++++++++++++++++++++---------------
 4 files changed, 179 insertions(+), 143 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/regmap/regcache.c b/drivers/base/regmap/regcache.c
index 4e582561e1e7..b0a0dcf32fb7 100644
--- a/drivers/base/regmap/regcache.c
+++ b/drivers/base/regmap/regcache.c
@@ -224,7 +224,7 @@ void regcache_exit(struct regmap *map)
 }
 
 /**
- * regcache_read: Fetch the value of a given register from the cache.
+ * regcache_read - Fetch the value of a given register from the cache.
  *
  * @map: map to configure.
  * @reg: The register index.
@@ -255,7 +255,7 @@ int regcache_read(struct regmap *map,
 }
 
 /**
- * regcache_write: Set the value of a given register in the cache.
+ * regcache_write - Set the value of a given register in the cache.
  *
  * @map: map to configure.
  * @reg: The register index.
@@ -328,7 +328,7 @@ static int regcache_default_sync(struct regmap *map, unsigned int min,
 }
 
 /**
- * regcache_sync: Sync the register cache with the hardware.
+ * regcache_sync - Sync the register cache with the hardware.
  *
  * @map: map to configure.
  *
@@ -396,7 +396,7 @@ out:
 EXPORT_SYMBOL_GPL(regcache_sync);
 
 /**
- * regcache_sync_region: Sync part  of the register cache with the hardware.
+ * regcache_sync_region - Sync part  of the register cache with the hardware.
  *
  * @map: map to sync.
  * @min: first register to sync
@@ -452,7 +452,7 @@ out:
 EXPORT_SYMBOL_GPL(regcache_sync_region);
 
 /**
- * regcache_drop_region: Discard part of the register cache
+ * regcache_drop_region - Discard part of the register cache
  *
  * @map: map to operate on
  * @min: first register to discard
@@ -483,10 +483,10 @@ int regcache_drop_region(struct regmap *map, unsigned int min,
 EXPORT_SYMBOL_GPL(regcache_drop_region);
 
 /**
- * regcache_cache_only: Put a register map into cache only mode
+ * regcache_cache_only - Put a register map into cache only mode
  *
  * @map: map to configure
- * @cache_only: flag if changes should be written to the hardware
+ * @enable: flag if changes should be written to the hardware
  *
  * When a register map is marked as cache only writes to the register
  * map API will only update the register cache, they will not cause
@@ -505,7 +505,7 @@ void regcache_cache_only(struct regmap *map, bool enable)
 EXPORT_SYMBOL_GPL(regcache_cache_only);
 
 /**
- * regcache_mark_dirty: Indicate that HW registers were reset to default values
+ * regcache_mark_dirty - Indicate that HW registers were reset to default values
  *
  * @map: map to mark
  *
@@ -527,10 +527,10 @@ void regcache_mark_dirty(struct regmap *map)
 EXPORT_SYMBOL_GPL(regcache_mark_dirty);
 
 /**
- * regcache_cache_bypass: Put a register map into cache bypass mode
+ * regcache_cache_bypass - Put a register map into cache bypass mode
  *
  * @map: map to configure
- * @cache_bypass: flag if changes should not be written to the cache
+ * @enable: flag if changes should not be written to the cache
  *
  * When a register map is marked with the cache bypass option, writes
  * to the register map API will only update the hardware and not the
diff --git a/drivers/base/regmap/regmap-irq.c b/drivers/base/regmap/regmap-irq.c
index ec262476d043..cd54189f2b1d 100644
--- a/drivers/base/regmap/regmap-irq.c
+++ b/drivers/base/regmap/regmap-irq.c
@@ -398,13 +398,14 @@ static const struct irq_domain_ops regmap_domain_ops = {
 };
 
 /**
- * regmap_add_irq_chip(): Use standard regmap IRQ controller handling
+ * regmap_add_irq_chip() - Use standard regmap IRQ controller handling
  *
- * map:       The regmap for the device.
- * irq:       The IRQ the device uses to signal interrupts
- * irq_flags: The IRQF_ flags to use for the primary interrupt.
- * chip:      Configuration for the interrupt controller.
- * data:      Runtime data structure for the controller, allocated on success
+ * @map: The regmap for the device.
+ * @irq: The IRQ the device uses to signal interrupts.
+ * @irq_flags: The IRQF_ flags to use for the primary interrupt.
+ * @irq_base: Allocate at specific IRQ number if irq_base > 0.
+ * @chip: Configuration for the interrupt controller.
+ * @data: Runtime data structure for the controller, allocated on success.
  *
  * Returns 0 on success or an errno on failure.
  *
@@ -659,12 +660,12 @@ err_alloc:
 EXPORT_SYMBOL_GPL(regmap_add_irq_chip);
 
 /**
- * regmap_del_irq_chip(): Stop interrupt handling for a regmap IRQ chip
+ * regmap_del_irq_chip() - Stop interrupt handling for a regmap IRQ chip
  *
  * @irq: Primary IRQ for the device
- * @d:   regmap_irq_chip_data allocated by regmap_add_irq_chip()
+ * @d: &regmap_irq_chip_data allocated by regmap_add_irq_chip()
  *
- * This function also dispose all mapped irq on chip.
+ * This function also disposes of all mapped IRQs on the chip.
  */
 void regmap_del_irq_chip(int irq, struct regmap_irq_chip_data *d)
 {
@@ -723,18 +724,19 @@ static int devm_regmap_irq_chip_match(struct device *dev, void *res, void *data)
 }
 
 /**
- * devm_regmap_add_irq_chip(): Resource manager regmap_add_irq_chip()
+ * devm_regmap_add_irq_chip() - Resource manager regmap_add_irq_chip()
  *
- * @dev:       The device pointer on which irq_chip belongs to.
- * @map:       The regmap for the device.
- * @irq:       The IRQ the device uses to signal interrupts
+ * @dev: The device pointer on which irq_chip belongs to.
+ * @map: The regmap for the device.
+ * @irq: The IRQ the device uses to signal interrupts
  * @irq_flags: The IRQF_ flags to use for the primary interrupt.
- * @chip:      Configuration for the interrupt controller.
- * @data:      Runtime data structure for the controller, allocated on success
+ * @irq_base: Allocate at specific IRQ number if irq_base > 0.
+ * @chip: Configuration for the interrupt controller.
+ * @data: Runtime data structure for the controller, allocated on success
  *
  * Returns 0 on success or an errno on failure.
  *
- * The regmap_irq_chip data automatically be released when the device is
+ * The &regmap_irq_chip_data will be automatically released when the device is
  * unbound.
  */
 int devm_regmap_add_irq_chip(struct device *dev, struct regmap *map, int irq,
@@ -765,11 +767,13 @@ int devm_regmap_add_irq_chip(struct device *dev, struct regmap *map, int irq,
 EXPORT_SYMBOL_GPL(devm_regmap_add_irq_chip);
 
 /**
- * devm_regmap_del_irq_chip(): Resource managed regmap_del_irq_chip()
+ * devm_regmap_del_irq_chip() - Resource managed regmap_del_irq_chip()
  *
  * @dev: Device for which which resource was allocated.
- * @irq: Primary IRQ for the device
- * @d:   regmap_irq_chip_data allocated by regmap_add_irq_chip()
+ * @irq: Primary IRQ for the device.
+ * @data: &regmap_irq_chip_data allocated by regmap_add_irq_chip().
+ *
+ * A resource managed version of regmap_del_irq_chip().
  */
 void devm_regmap_del_irq_chip(struct device *dev, int irq,
 			      struct regmap_irq_chip_data *data)
@@ -786,11 +790,11 @@ void devm_regmap_del_irq_chip(struct device *dev, int irq,
 EXPORT_SYMBOL_GPL(devm_regmap_del_irq_chip);
 
 /**
- * regmap_irq_chip_get_base(): Retrieve interrupt base for a regmap IRQ chip
+ * regmap_irq_chip_get_base() - Retrieve interrupt base for a regmap IRQ chip
  *
- * Useful for drivers to request their own IRQs.
+ * @data: regmap irq controller to operate on.
  *
- * @data: regmap_irq controller to operate on.
+ * Useful for drivers to request their own IRQs.
  */
 int regmap_irq_chip_get_base(struct regmap_irq_chip_data *data)
 {
@@ -800,12 +804,12 @@ int regmap_irq_chip_get_base(struct regmap_irq_chip_data *data)
 EXPORT_SYMBOL_GPL(regmap_irq_chip_get_base);
 
 /**
- * regmap_irq_get_virq(): Map an interrupt on a chip to a virtual IRQ
+ * regmap_irq_get_virq() - Map an interrupt on a chip to a virtual IRQ
  *
- * Useful for drivers to request their own IRQs.
+ * @data: regmap irq controller to operate on.
+ * @irq: index of the interrupt requested in the chip IRQs.
  *
- * @data: regmap_irq controller to operate on.
- * @irq: index of the interrupt requested in the chip IRQs
+ * Useful for drivers to request their own IRQs.
  */
 int regmap_irq_get_virq(struct regmap_irq_chip_data *data, int irq)
 {
@@ -818,14 +822,14 @@ int regmap_irq_get_virq(struct regmap_irq_chip_data *data, int irq)
 EXPORT_SYMBOL_GPL(regmap_irq_get_virq);
 
 /**
- * regmap_irq_get_domain(): Retrieve the irq_domain for the chip
+ * regmap_irq_get_domain() - Retrieve the irq_domain for the chip
+ *
+ * @data: regmap_irq controller to operate on.
  *
  * Useful for drivers to request their own IRQs and for integration
  * with subsystems.  For ease of integration NULL is accepted as a
  * domain, allowing devices to just call this even if no domain is
  * allocated.
- *
- * @data: regmap_irq controller to operate on.
  */
 struct irq_domain *regmap_irq_get_domain(struct regmap_irq_chip_data *data)
 {
diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
index ae63bb0875ea..0e770ba8d361 100644
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -1091,8 +1091,7 @@ static void regmap_field_init(struct regmap_field *rm_field,
 }
 
 /**
- * devm_regmap_field_alloc(): Allocate and initialise a register field
- * in a register map.
+ * devm_regmap_field_alloc() - Allocate and initialise a register field.
  *
  * @dev: Device that will be interacted with
  * @regmap: regmap bank in which this register field is located.
@@ -1118,13 +1117,15 @@ struct regmap_field *devm_regmap_field_alloc(struct device *dev,
 EXPORT_SYMBOL_GPL(devm_regmap_field_alloc);
 
 /**
- * devm_regmap_field_free(): Free register field allocated using
- * devm_regmap_field_alloc. Usally drivers need not call this function,
- * as the memory allocated via devm will be freed as per device-driver
- * life-cyle.
+ * devm_regmap_field_free() - Free a register field allocated using
+ *                            devm_regmap_field_alloc.
  *
  * @dev: Device that will be interacted with
  * @field: regmap field which should be freed.
+ *
+ * Free register field allocated using devm_regmap_field_alloc(). Usually
+ * drivers need not call this function, as the memory allocated via devm
+ * will be freed as per device-driver life-cyle.
  */
 void devm_regmap_field_free(struct device *dev,
 	struct regmap_field *field)
@@ -1134,8 +1135,7 @@ void devm_regmap_field_free(struct device *dev,
 EXPORT_SYMBOL_GPL(devm_regmap_field_free);
 
 /**
- * regmap_field_alloc(): Allocate and initialise a register field
- * in a register map.
+ * regmap_field_alloc() - Allocate and initialise a register field.
  *
  * @regmap: regmap bank in which this register field is located.
  * @reg_field: Register field with in the bank.
@@ -1159,7 +1159,8 @@ struct regmap_field *regmap_field_alloc(struct regmap *regmap,
 EXPORT_SYMBOL_GPL(regmap_field_alloc);
 
 /**
- * regmap_field_free(): Free register field allocated using regmap_field_alloc
+ * regmap_field_free() - Free register field allocated using
+ *                       regmap_field_alloc.
  *
  * @field: regmap field which should be freed.
  */
@@ -1170,7 +1171,7 @@ void regmap_field_free(struct regmap_field *field)
 EXPORT_SYMBOL_GPL(regmap_field_free);
 
 /**
- * regmap_reinit_cache(): Reinitialise the current register cache
+ * regmap_reinit_cache() - Reinitialise the current register cache
  *
  * @map: Register map to operate on.
  * @config: New configuration.  Only the cache data will be used.
@@ -1205,7 +1206,9 @@ int regmap_reinit_cache(struct regmap *map, const struct regmap_config *config)
 EXPORT_SYMBOL_GPL(regmap_reinit_cache);
 
 /**
- * regmap_exit(): Free a previously allocated register map
+ * regmap_exit() - Free a previously allocated register map
+ *
+ * @map: Register map to operate on.
  */
 void regmap_exit(struct regmap *map)
 {
@@ -1245,7 +1248,7 @@ static int dev_get_regmap_match(struct device *dev, void *res, void *data)
 }
 
 /**
- * dev_get_regmap(): Obtain the regmap (if any) for a device
+ * dev_get_regmap() - Obtain the regmap (if any) for a device
  *
  * @dev: Device to retrieve the map for
  * @name: Optional name for the register map, usually NULL.
@@ -1268,7 +1271,7 @@ struct regmap *dev_get_regmap(struct device *dev, const char *name)
 EXPORT_SYMBOL_GPL(dev_get_regmap);
 
 /**
- * regmap_get_device(): Obtain the device from a regmap
+ * regmap_get_device() - Obtain the device from a regmap
  *
  * @map: Register map to operate on.
  *
@@ -1654,7 +1657,7 @@ int _regmap_write(struct regmap *map, unsigned int reg,
 }
 
 /**
- * regmap_write(): Write a value to a single register
+ * regmap_write() - Write a value to a single register
  *
  * @map: Register map to write to
  * @reg: Register to write to
@@ -1681,7 +1684,7 @@ int regmap_write(struct regmap *map, unsigned int reg, unsigned int val)
 EXPORT_SYMBOL_GPL(regmap_write);
 
 /**
- * regmap_write_async(): Write a value to a single register asynchronously
+ * regmap_write_async() - Write a value to a single register asynchronously
  *
  * @map: Register map to write to
  * @reg: Register to write to
@@ -1712,7 +1715,7 @@ int regmap_write_async(struct regmap *map, unsigned int reg, unsigned int val)
 EXPORT_SYMBOL_GPL(regmap_write_async);
 
 /**
- * regmap_raw_write(): Write raw values to one or more registers
+ * regmap_raw_write() - Write raw values to one or more registers
  *
  * @map: Register map to write to
  * @reg: Initial register to write to
@@ -1750,9 +1753,8 @@ int regmap_raw_write(struct regmap *map, unsigned int reg,
 EXPORT_SYMBOL_GPL(regmap_raw_write);
 
 /**
- * regmap_field_update_bits_base():
- *	Perform a read/modify/write cycle on the register field
- *	with change, async, force option
+ * regmap_field_update_bits_base() - Perform a read/modify/write cycle a
+ *                                   register field.
  *
  * @field: Register field to write to
  * @mask: Bitmask to change
@@ -1761,6 +1763,9 @@ EXPORT_SYMBOL_GPL(regmap_raw_write);
  * @async: Boolean indicating asynchronously
  * @force: Boolean indicating use force update
  *
+ * Perform a read/modify/write cycle on the register field with change,
+ * async, force option.
+ *
  * A value of zero will be returned on success, a negative errno will
  * be returned in error cases.
  */
@@ -1777,9 +1782,8 @@ int regmap_field_update_bits_base(struct regmap_field *field,
 EXPORT_SYMBOL_GPL(regmap_field_update_bits_base);
 
 /**
- * regmap_fields_update_bits_base():
- *	Perform a read/modify/write cycle on the register field
- *	with change, async, force option
+ * regmap_fields_update_bits_base() - Perform a read/modify/write cycle a
+ *                                    register field with port ID
  *
  * @field: Register field to write to
  * @id: port ID
@@ -1808,8 +1812,8 @@ int regmap_fields_update_bits_base(struct regmap_field *field,  unsigned int id,
 }
 EXPORT_SYMBOL_GPL(regmap_fields_update_bits_base);
 
-/*
- * regmap_bulk_write(): Write multiple registers to the device
+/**
+ * regmap_bulk_write() - Write multiple registers to the device
  *
  * @map: Register map to write to
  * @reg: First register to be write from
@@ -2174,18 +2178,18 @@ static int _regmap_multi_reg_write(struct regmap *map,
 	return _regmap_raw_multi_reg_write(map, regs, num_regs);
 }
 
-/*
- * regmap_multi_reg_write(): Write multiple registers to the device
- *
- * where the set of register,value pairs are supplied in any order,
- * possibly not all in a single range.
+/**
+ * regmap_multi_reg_write() - Write multiple registers to the device
  *
  * @map: Register map to write to
  * @regs: Array of structures containing register,value to be written
  * @num_regs: Number of registers to write
  *
+ * Write multiple registers to the device where the set of register, value
+ * pairs are supplied in any order, possibly not all in a single range.
+ *
  * The 'normal' block write mode will send ultimately send data on the
- * target bus as R,V1,V2,V3,..,Vn where successively higer registers are
+ * target bus as R,V1,V2,V3,..,Vn where successively higher registers are
  * addressed. However, this alternative block multi write mode will send
  * the data as R1,V1,R2,V2,..,Rn,Vn on the target bus. The target device
  * must of course support the mode.
@@ -2208,16 +2212,17 @@ int regmap_multi_reg_write(struct regmap *map, const struct reg_sequence *regs,
 }
 EXPORT_SYMBOL_GPL(regmap_multi_reg_write);
 
-/*
- * regmap_multi_reg_write_bypassed(): Write multiple registers to the
- *                                    device but not the cache
- *
- * where the set of register are supplied in any order
+/**
+ * regmap_multi_reg_write_bypassed() - Write multiple registers to the
+ *                                     device but not the cache
  *
  * @map: Register map to write to
  * @regs: Array of structures containing register,value to be written
  * @num_regs: Number of registers to write
  *
+ * Write multiple registers to the device but not the cache where the set
+ * of register are supplied in any order.
+ *
  * This function is intended to be used for writing a large block of data
  * atomically to the device in single transfer for those I2C client devices
  * that implement this alternative block write mode.
@@ -2248,8 +2253,8 @@ int regmap_multi_reg_write_bypassed(struct regmap *map,
 EXPORT_SYMBOL_GPL(regmap_multi_reg_write_bypassed);
 
 /**
- * regmap_raw_write_async(): Write raw values to one or more registers
- *                           asynchronously
+ * regmap_raw_write_async() - Write raw values to one or more registers
+ *                            asynchronously
  *
  * @map: Register map to write to
  * @reg: Initial register to write to
@@ -2385,7 +2390,7 @@ static int _regmap_read(struct regmap *map, unsigned int reg,
 }
 
 /**
- * regmap_read(): Read a value from a single register
+ * regmap_read() - Read a value from a single register
  *
  * @map: Register map to read from
  * @reg: Register to be read from
@@ -2412,7 +2417,7 @@ int regmap_read(struct regmap *map, unsigned int reg, unsigned int *val)
 EXPORT_SYMBOL_GPL(regmap_read);
 
 /**
- * regmap_raw_read(): Read raw data from the device
+ * regmap_raw_read() - Read raw data from the device
  *
  * @map: Register map to read from
  * @reg: First register to be read from
@@ -2477,7 +2482,7 @@ int regmap_raw_read(struct regmap *map, unsigned int reg, void *val,
 EXPORT_SYMBOL_GPL(regmap_raw_read);
 
 /**
- * regmap_field_read(): Read a value to a single register field
+ * regmap_field_read() - Read a value to a single register field
  *
  * @field: Register field to read from
  * @val: Pointer to store read value
@@ -2502,7 +2507,7 @@ int regmap_field_read(struct regmap_field *field, unsigned int *val)
 EXPORT_SYMBOL_GPL(regmap_field_read);
 
 /**
- * regmap_fields_read(): Read a value to a single register field with port ID
+ * regmap_fields_read() - Read a value to a single register field with port ID
  *
  * @field: Register field to read from
  * @id: port ID
@@ -2535,7 +2540,7 @@ int regmap_fields_read(struct regmap_field *field, unsigned int id,
 EXPORT_SYMBOL_GPL(regmap_fields_read);
 
 /**
- * regmap_bulk_read(): Read multiple registers from the device
+ * regmap_bulk_read() - Read multiple registers from the device
  *
  * @map: Register map to read from
  * @reg: First register to be read from
@@ -2692,9 +2697,7 @@ static int _regmap_update_bits(struct regmap *map, unsigned int reg,
 }
 
 /**
- * regmap_update_bits_base:
- *	Perform a read/modify/write cycle on the
- *	register map with change, async, force option
+ * regmap_update_bits_base() - Perform a read/modify/write cycle on a register
  *
  * @map: Register map to update
  * @reg: Register to update
@@ -2704,10 +2707,14 @@ static int _regmap_update_bits(struct regmap *map, unsigned int reg,
  * @async: Boolean indicating asynchronously
  * @force: Boolean indicating use force update
  *
- * if async was true,
- * With most buses the read must be done synchronously so this is most
- * useful for devices with a cache which do not need to interact with
- * the hardware to determine the current register value.
+ * Perform a read/modify/write cycle on a register map with change, async, force
+ * options.
+ *
+ * If async is true:
+ *
+ * With most buses the read must be done synchronously so this is most useful
+ * for devices with a cache which do not need to interact with the hardware to
+ * determine the current register value.
  *
  * Returns zero for success, a negative number on error.
  */
@@ -2765,7 +2772,7 @@ static int regmap_async_is_done(struct regmap *map)
 }
 
 /**
- * regmap_async_complete: Ensure all asynchronous I/O has completed.
+ * regmap_async_complete - Ensure all asynchronous I/O has completed.
  *
  * @map: Map to operate on.
  *
@@ -2797,8 +2804,8 @@ int regmap_async_complete(struct regmap *map)
 EXPORT_SYMBOL_GPL(regmap_async_complete);
 
 /**
- * regmap_register_patch: Register and apply register updates to be applied
- *                        on device initialistion
+ * regmap_register_patch - Register and apply register updates to be applied
+ *                         on device initialistion
  *
  * @map: Register map to apply updates to.
  * @regs: Values to update.
@@ -2855,8 +2862,10 @@ int regmap_register_patch(struct regmap *map, const struct reg_sequence *regs,
 }
 EXPORT_SYMBOL_GPL(regmap_register_patch);
 
-/*
- * regmap_get_val_bytes(): Report the size of a register value
+/**
+ * regmap_get_val_bytes() - Report the size of a register value
+ *
+ * @map: Register map to operate on.
  *
  * Report the size of a register value, mainly intended to for use by
  * generic infrastructure built on top of regmap.
@@ -2871,7 +2880,9 @@ int regmap_get_val_bytes(struct regmap *map)
 EXPORT_SYMBOL_GPL(regmap_get_val_bytes);
 
 /**
- * regmap_get_max_register(): Report the max register value
+ * regmap_get_max_register() - Report the max register value
+ *
+ * @map: Register map to operate on.
  *
  * Report the max register value, mainly intended to for use by
  * generic infrastructure built on top of regmap.
@@ -2883,7 +2894,9 @@ int regmap_get_max_register(struct regmap *map)
 EXPORT_SYMBOL_GPL(regmap_get_max_register);
 
 /**
- * regmap_get_reg_stride(): Report the register address stride
+ * regmap_get_reg_stride() - Report the register address stride
+ *
+ * @map: Register map to operate on.
  *
  * Report the register address stride, mainly intended to for use by
  * generic infrastructure built on top of regmap.
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 9adc7b21903d..38a85d50fefd 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -39,12 +39,13 @@ enum regcache_type {
 };
 
 /**
- * Default value for a register.  We use an array of structs rather
- * than a simple array as many modern devices have very sparse
- * register maps.
+ * struct reg_default - Default value for a register.
  *
  * @reg: Register address.
  * @def: Register default value.
+ *
+ * We use an array of structs rather than a simple array as many modern devices
+ * have very sparse register maps.
  */
 struct reg_default {
 	unsigned int reg;
@@ -52,12 +53,14 @@ struct reg_default {
 };
 
 /**
- * Register/value pairs for sequences of writes with an optional delay in
- * microseconds to be applied after each write.
+ * struct reg_sequence - An individual write from a sequence of writes.
  *
  * @reg: Register address.
  * @def: Register value.
  * @delay_us: Delay to be applied after the register write in microseconds
+ *
+ * Register/value pairs for sequences of writes with an optional delay in
+ * microseconds to be applied after each write.
  */
 struct reg_sequence {
 	unsigned int reg;
@@ -97,6 +100,7 @@ struct reg_sequence {
 
 /**
  * regmap_read_poll_timeout - Poll until a condition is met or a timeout occurs
+ *
  * @map: Regmap to read from
  * @addr: Address to poll
  * @val: Unsigned integer variable to read the value into
@@ -145,8 +149,8 @@ enum regmap_endian {
 };
 
 /**
- * A register range, used for access related checks
- * (readable/writeable/volatile/precious checks)
+ * struct regmap_range - A register range, used for access related checks
+ *                       (readable/writeable/volatile/precious checks)
  *
  * @range_min: address of first register
  * @range_max: address of last register
@@ -158,16 +162,18 @@ struct regmap_range {
 
 #define regmap_reg_range(low, high) { .range_min = low, .range_max = high, }
 
-/*
- * A table of ranges including some yes ranges and some no ranges.
- * If a register belongs to a no_range, the corresponding check function
- * will return false. If a register belongs to a yes range, the corresponding
- * check function will return true. "no_ranges" are searched first.
+/**
+ * struct regmap_access_table - A table of register ranges for access checks
  *
  * @yes_ranges : pointer to an array of regmap ranges used as "yes ranges"
  * @n_yes_ranges: size of the above array
  * @no_ranges: pointer to an array of regmap ranges used as "no ranges"
  * @n_no_ranges: size of the above array
+ *
+ * A table of ranges including some yes ranges and some no ranges.
+ * If a register belongs to a no_range, the corresponding check function
+ * will return false. If a register belongs to a yes range, the corresponding
+ * check function will return true. "no_ranges" are searched first.
  */
 struct regmap_access_table {
 	const struct regmap_range *yes_ranges;
@@ -180,7 +186,7 @@ typedef void (*regmap_lock)(void *);
 typedef void (*regmap_unlock)(void *);
 
 /**
- * Configuration for the register map of a device.
+ * struct regmap_config - Configuration for the register map of a device.
  *
  * @name: Optional name of the regmap. Useful when a device has multiple
  *        register regions.
@@ -313,22 +319,24 @@ struct regmap_config {
 };
 
 /**
- * Configuration for indirectly accessed or paged registers.
- * Registers, mapped to this virtual range, are accessed in two steps:
- *     1. page selector register update;
- *     2. access through data window registers.
+ * struct regmap_range_cfg - Configuration for indirectly accessed or paged
+ *                           registers.
  *
  * @name: Descriptive name for diagnostics
  *
  * @range_min: Address of the lowest register address in virtual range.
  * @range_max: Address of the highest register in virtual range.
  *
- * @page_sel_reg: Register with selector field.
- * @page_sel_mask: Bit shift for selector value.
- * @page_sel_shift: Bit mask for selector value.
+ * @selector_reg: Register with selector field.
+ * @selector_mask: Bit shift for selector value.
+ * @selector_shift: Bit mask for selector value.
  *
  * @window_start: Address of first (lowest) register in data window.
  * @window_len: Number of registers in data window.
+ *
+ * Registers, mapped to this virtual range, are accessed in two steps:
+ *     1. page selector register update;
+ *     2. access through data window registers.
  */
 struct regmap_range_cfg {
 	const char *name;
@@ -371,7 +379,8 @@ typedef struct regmap_async *(*regmap_hw_async_alloc)(void);
 typedef void (*regmap_hw_free_context)(void *context);
 
 /**
- * Description of a hardware bus for the register map infrastructure.
+ * struct regmap_bus - Description of a hardware bus for the register map
+ *                     infrastructure.
  *
  * @fast_io: Register IO is fast. Use a spinlock instead of a mutex
  *	     to perform locking. This field is ignored if custom lock/unlock
@@ -384,6 +393,10 @@ typedef void (*regmap_hw_free_context)(void *context);
  *               must serialise with respect to non-async I/O.
  * @reg_write: Write a single register value to the given register address. This
  *             write operation has to complete when returning from the function.
+ * @reg_update_bits: Update bits operation to be used against volatile
+ *                   registers, intended for devices supporting some mechanism
+ *                   for setting clearing bits without having to
+ *                   read/modify/write.
  * @read: Read operation.  Data is returned in the buffer used to transmit
  *         data.
  * @reg_read: Read a single register value from a given register address.
@@ -513,7 +526,7 @@ struct regmap *__devm_regmap_init_ac97(struct snd_ac97 *ac97,
 #endif
 
 /**
- * regmap_init(): Initialise register map
+ * regmap_init() - Initialise register map
  *
  * @dev: Device that will be interacted with
  * @bus: Bus-specific callbacks to use with device
@@ -531,7 +544,7 @@ int regmap_attach_dev(struct device *dev, struct regmap *map,
 		      const struct regmap_config *config);
 
 /**
- * regmap_init_i2c(): Initialise register map
+ * regmap_init_i2c() - Initialise register map
  *
  * @i2c: Device that will be interacted with
  * @config: Configuration for register map
@@ -544,9 +557,9 @@ int regmap_attach_dev(struct device *dev, struct regmap *map,
 				i2c, config)
 
 /**
- * regmap_init_spi(): Initialise register map
+ * regmap_init_spi() - Initialise register map
  *
- * @spi: Device that will be interacted with
+ * @dev: Device that will be interacted with
  * @config: Configuration for register map
  *
  * The return value will be an ERR_PTR() on error or a valid pointer to
@@ -557,8 +570,9 @@ int regmap_attach_dev(struct device *dev, struct regmap *map,
 				dev, config)
 
 /**
- * regmap_init_spmi_base(): Create regmap for the Base register space
- * @sdev:	SPMI device that will be interacted with
+ * regmap_init_spmi_base() - Create regmap for the Base register space
+ *
+ * @dev:	SPMI device that will be interacted with
  * @config:	Configuration for register map
  *
  * The return value will be an ERR_PTR() on error or a valid pointer to
@@ -569,8 +583,9 @@ int regmap_attach_dev(struct device *dev, struct regmap *map,
 				dev, config)
 
 /**
- * regmap_init_spmi_ext(): Create regmap for Ext register space
- * @sdev:	Device that will be interacted with
+ * regmap_init_spmi_ext() - Create regmap for Ext register space
+ *
+ * @dev:	Device that will be interacted with
  * @config:	Configuration for register map
  *
  * The return value will be an ERR_PTR() on error or a valid pointer to
@@ -581,7 +596,7 @@ int regmap_attach_dev(struct device *dev, struct regmap *map,
 				dev, config)
 
 /**
- * regmap_init_mmio_clk(): Initialise register map with register clock
+ * regmap_init_mmio_clk() - Initialise register map with register clock
  *
  * @dev: Device that will be interacted with
  * @clk_id: register clock consumer ID
@@ -596,7 +611,7 @@ int regmap_attach_dev(struct device *dev, struct regmap *map,
 				dev, clk_id, regs, config)
 
 /**
- * regmap_init_mmio(): Initialise register map
+ * regmap_init_mmio() - Initialise register map
  *
  * @dev: Device that will be interacted with
  * @regs: Pointer to memory-mapped IO region
@@ -609,7 +624,7 @@ int regmap_attach_dev(struct device *dev, struct regmap *map,
 	regmap_init_mmio_clk(dev, NULL, regs, config)
 
 /**
- * regmap_init_ac97(): Initialise AC'97 register map
+ * regmap_init_ac97() - Initialise AC'97 register map
  *
  * @ac97: Device that will be interacted with
  * @config: Configuration for register map
@@ -623,7 +638,7 @@ int regmap_attach_dev(struct device *dev, struct regmap *map,
 bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg);
 
 /**
- * devm_regmap_init(): Initialise managed register map
+ * devm_regmap_init() - Initialise managed register map
  *
  * @dev: Device that will be interacted with
  * @bus: Bus-specific callbacks to use with device
@@ -640,7 +655,7 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg);
 				dev, bus, bus_context, config)
 
 /**
- * devm_regmap_init_i2c(): Initialise managed register map
+ * devm_regmap_init_i2c() - Initialise managed register map
  *
  * @i2c: Device that will be interacted with
  * @config: Configuration for register map
@@ -654,9 +669,9 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg);
 				i2c, config)
 
 /**
- * devm_regmap_init_spi(): Initialise register map
+ * devm_regmap_init_spi() - Initialise register map
  *
- * @spi: Device that will be interacted with
+ * @dev: Device that will be interacted with
  * @config: Configuration for register map
  *
  * The return value will be an ERR_PTR() on error or a valid pointer
@@ -668,8 +683,9 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg);
 				dev, config)
 
 /**
- * devm_regmap_init_spmi_base(): Create managed regmap for Base register space
- * @sdev:	SPMI device that will be interacted with
+ * devm_regmap_init_spmi_base() - Create managed regmap for Base register space
+ *
+ * @dev:	SPMI device that will be interacted with
  * @config:	Configuration for register map
  *
  * The return value will be an ERR_PTR() on error or a valid pointer
@@ -681,8 +697,9 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg);
 				dev, config)
 
 /**
- * devm_regmap_init_spmi_ext(): Create managed regmap for Ext register space
- * @sdev:	SPMI device that will be interacted with
+ * devm_regmap_init_spmi_ext() - Create managed regmap for Ext register space
+ *
+ * @dev:	SPMI device that will be interacted with
  * @config:	Configuration for register map
  *
  * The return value will be an ERR_PTR() on error or a valid pointer
@@ -694,7 +711,7 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg);
 				dev, config)
 
 /**
- * devm_regmap_init_mmio_clk(): Initialise managed register map with clock
+ * devm_regmap_init_mmio_clk() - Initialise managed register map with clock
  *
  * @dev: Device that will be interacted with
  * @clk_id: register clock consumer ID
@@ -710,7 +727,7 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg);
 				dev, clk_id, regs, config)
 
 /**
- * devm_regmap_init_mmio(): Initialise managed register map
+ * devm_regmap_init_mmio() - Initialise managed register map
  *
  * @dev: Device that will be interacted with
  * @regs: Pointer to memory-mapped IO region
@@ -724,7 +741,7 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg);
 	devm_regmap_init_mmio_clk(dev, NULL, regs, config)
 
 /**
- * devm_regmap_init_ac97(): Initialise AC'97 register map
+ * devm_regmap_init_ac97() - Initialise AC'97 register map
  *
  * @ac97: Device that will be interacted with
  * @config: Configuration for register map
@@ -799,7 +816,7 @@ bool regmap_reg_in_ranges(unsigned int reg,
 			  unsigned int nranges);
 
 /**
- * Description of an register field
+ * struct reg_field - Description of an register field
  *
  * @reg: Offset of the register within the regmap bank
  * @lsb: lsb of the register field.
@@ -840,7 +857,7 @@ int regmap_fields_update_bits_base(struct regmap_field *field,  unsigned int id,
 				   bool *change, bool async, bool force);
 
 /**
- * Description of an IRQ for the generic regmap irq_chip.
+ * struct regmap_irq - Description of an IRQ for the generic regmap irq_chip.
  *
  * @reg_offset: Offset of the status/mask register within the bank
  * @mask:       Mask used to flag/control the register.
@@ -860,9 +877,7 @@ struct regmap_irq {
 	[_irq] = { .reg_offset = (_off), .mask = (_mask) }
 
 /**
- * Description of a generic regmap irq_chip.  This is not intended to
- * handle every possible interrupt controller, but it should handle a
- * substantial proportion of those that are found in the wild.
+ * struct regmap_irq_chip - Description of a generic regmap irq_chip.
  *
  * @name:        Descriptive name for IRQ controller.
  *
@@ -896,6 +911,10 @@ struct regmap_irq {
  *		     after handling the interrupts in regmap_irq_handler().
  * @irq_drv_data:    Driver specific IRQ data which is passed as parameter when
  *		     driver specific pre/post interrupt handler is called.
+ *
+ * This is not intended to handle every possible interrupt controller, but
+ * it should handle a substantial proportion of those that are found in the
+ * wild.
  */
 struct regmap_irq_chip {
 	const char *name;
-- 
cgit v1.2.3


From 8fb472c09b9df478a062eacc7841448e40fc3c17 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Thu, 12 Jan 2017 15:53:33 +0100
Subject: ipmr: improve hash scalability

Recently we started using ipmr with thousands of entries and easily hit
soft lockups on smaller devices. The reason is that the hash function
uses the high order bits from the src and dst, but those don't change in
many common cases, also the hash table  is only 64 elements so with
thousands it doesn't scale at all.
This patch migrates the hash table to rhashtable, and in particular the
rhl interface which allows for duplicate elements to be chained because
of the MFC_PROXY support (*,G; *,*,oif cases) which allows for multiple
duplicate entries to be added with different interfaces (IMO wrong, but
it's been in for a long time).

And here are some results from tests I've run in a VM:
 mr_table size (default, allocated for all namespaces):
  Before                    After
   49304 bytes               2400 bytes

 Add 65000 routes (the diff is much larger on smaller devices):
  Before                    After
   1m42s                     58s

 Forwarding 256 byte packets with 65000 routes (test done in a VM):
  Before                    After
   3 Mbps / ~1465 pps        122 Mbps / ~59000 pps

As a bonus we no longer see the soft lockups on smaller devices which
showed up even with 2000 entries before.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mroute.h |  57 ++++++++---
 net/ipv4/ipmr.c        | 255 +++++++++++++++++++++++++++----------------------
 2 files changed, 182 insertions(+), 130 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index f019b62f27b5..d7f63339ef0b 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -3,6 +3,7 @@
 
 #include <linux/in.h>
 #include <linux/pim.h>
+#include <linux/rhashtable.h>
 #include <net/sock.h>
 #include <uapi/linux/mroute.h>
 
@@ -60,7 +61,6 @@ struct vif_device {
 #define VIFF_STATIC 0x8000
 
 #define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL)
-#define MFC_LINES 64
 
 struct mr_table {
 	struct list_head	list;
@@ -69,8 +69,9 @@ struct mr_table {
 	struct sock __rcu	*mroute_sk;
 	struct timer_list	ipmr_expire_timer;
 	struct list_head	mfc_unres_queue;
-	struct list_head	mfc_cache_array[MFC_LINES];
 	struct vif_device	vif_table[MAXVIFS];
+	struct rhltable		mfc_hash;
+	struct list_head	mfc_cache_list;
 	int			maxvif;
 	atomic_t		cache_resolve_queue_len;
 	bool			mroute_do_assert;
@@ -85,17 +86,48 @@ enum {
 	MFC_STATIC = BIT(0),
 };
 
+struct mfc_cache_cmp_arg {
+	__be32 mfc_mcastgrp;
+	__be32 mfc_origin;
+};
+
+/**
+ * struct mfc_cache - multicast routing entries
+ * @mnode: rhashtable list
+ * @mfc_mcastgrp: destination multicast group address
+ * @mfc_origin: source address
+ * @cmparg: used for rhashtable comparisons
+ * @mfc_parent: source interface (iif)
+ * @mfc_flags: entry flags
+ * @expires: unresolved entry expire time
+ * @unresolved: unresolved cached skbs
+ * @last_assert: time of last assert
+ * @minvif: minimum VIF id
+ * @maxvif: maximum VIF id
+ * @bytes: bytes that have passed for this entry
+ * @pkt: packets that have passed for this entry
+ * @wrong_if: number of wrong source interface hits
+ * @lastuse: time of last use of the group (traffic or update)
+ * @ttls: OIF TTL threshold array
+ * @list: global entry list
+ * @rcu: used for entry destruction
+ */
 struct mfc_cache {
-	struct list_head list;
-	__be32 mfc_mcastgrp;			/* Group the entry belongs to 	*/
-	__be32 mfc_origin;			/* Source of packet 		*/
-	vifi_t mfc_parent;			/* Source interface		*/
-	int mfc_flags;				/* Flags on line		*/
+	struct rhlist_head mnode;
+	union {
+		struct {
+			__be32 mfc_mcastgrp;
+			__be32 mfc_origin;
+		};
+		struct mfc_cache_cmp_arg cmparg;
+	};
+	vifi_t mfc_parent;
+	int mfc_flags;
 
 	union {
 		struct {
 			unsigned long expires;
-			struct sk_buff_head unresolved;	/* Unresolved buffers		*/
+			struct sk_buff_head unresolved;
 		} unres;
 		struct {
 			unsigned long last_assert;
@@ -105,18 +137,13 @@ struct mfc_cache {
 			unsigned long pkt;
 			unsigned long wrong_if;
 			unsigned long lastuse;
-			unsigned char ttls[MAXVIFS];	/* TTL thresholds		*/
+			unsigned char ttls[MAXVIFS];
 		} res;
 	} mfc_un;
+	struct list_head list;
 	struct rcu_head	rcu;
 };
 
-#ifdef __BIG_ENDIAN
-#define MFC_HASH(a,b)	(((((__force u32)(__be32)a)>>24)^(((__force u32)(__be32)b)>>26))&(MFC_LINES-1))
-#else
-#define MFC_HASH(a,b)	((((__force u32)(__be32)a)^(((__force u32)(__be32)b)>>2))&(MFC_LINES-1))
-#endif
-
 struct rtmsg;
 int ipmr_get_route(struct net *net, struct sk_buff *skb,
 		   __be32 saddr, __be32 daddr,
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 824c4fdf21eb..beacd028848c 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -299,10 +299,29 @@ static void __net_exit ipmr_rules_exit(struct net *net)
 }
 #endif
 
+static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
+				const void *ptr)
+{
+	const struct mfc_cache_cmp_arg *cmparg = arg->key;
+	struct mfc_cache *c = (struct mfc_cache *)ptr;
+
+	return cmparg->mfc_mcastgrp != c->mfc_mcastgrp ||
+	       cmparg->mfc_origin != c->mfc_origin;
+}
+
+static const struct rhashtable_params ipmr_rht_params = {
+	.head_offset = offsetof(struct mfc_cache, mnode),
+	.key_offset = offsetof(struct mfc_cache, cmparg),
+	.key_len = sizeof(struct mfc_cache_cmp_arg),
+	.nelem_hint = 3,
+	.locks_mul = 1,
+	.obj_cmpfn = ipmr_hash_cmp,
+	.automatic_shrinking = true,
+};
+
 static struct mr_table *ipmr_new_table(struct net *net, u32 id)
 {
 	struct mr_table *mrt;
-	unsigned int i;
 
 	/* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */
 	if (id != RT_TABLE_DEFAULT && id >= 1000000000)
@@ -318,10 +337,8 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
 	write_pnet(&mrt->net, net);
 	mrt->id = id;
 
-	/* Forwarding cache */
-	for (i = 0; i < MFC_LINES; i++)
-		INIT_LIST_HEAD(&mrt->mfc_cache_array[i]);
-
+	rhltable_init(&mrt->mfc_hash, &ipmr_rht_params);
+	INIT_LIST_HEAD(&mrt->mfc_cache_list);
 	INIT_LIST_HEAD(&mrt->mfc_unres_queue);
 
 	setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
@@ -338,6 +355,7 @@ static void ipmr_free_table(struct mr_table *mrt)
 {
 	del_timer_sync(&mrt->ipmr_expire_timer);
 	mroute_clean_tables(mrt, true);
+	rhltable_destroy(&mrt->mfc_hash);
 	kfree(mrt);
 }
 
@@ -839,13 +857,17 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
 					 __be32 origin,
 					 __be32 mcastgrp)
 {
-	int line = MFC_HASH(mcastgrp, origin);
+	struct mfc_cache_cmp_arg arg = {
+			.mfc_mcastgrp = mcastgrp,
+			.mfc_origin = origin
+	};
+	struct rhlist_head *tmp, *list;
 	struct mfc_cache *c;
 
-	list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) {
-		if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
-			return c;
-	}
+	list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
+	rhl_for_each_entry_rcu(c, tmp, list, mnode)
+		return c;
+
 	return NULL;
 }
 
@@ -853,13 +875,16 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
 static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt,
 						    int vifi)
 {
-	int line = MFC_HASH(htonl(INADDR_ANY), htonl(INADDR_ANY));
+	struct mfc_cache_cmp_arg arg = {
+			.mfc_mcastgrp = htonl(INADDR_ANY),
+			.mfc_origin = htonl(INADDR_ANY)
+	};
+	struct rhlist_head *tmp, *list;
 	struct mfc_cache *c;
 
-	list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list)
-		if (c->mfc_origin == htonl(INADDR_ANY) &&
-		    c->mfc_mcastgrp == htonl(INADDR_ANY) &&
-		    c->mfc_un.res.ttls[vifi] < 255)
+	list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
+	rhl_for_each_entry_rcu(c, tmp, list, mnode)
+		if (c->mfc_un.res.ttls[vifi] < 255)
 			return c;
 
 	return NULL;
@@ -869,29 +894,51 @@ static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt,
 static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt,
 					     __be32 mcastgrp, int vifi)
 {
-	int line = MFC_HASH(mcastgrp, htonl(INADDR_ANY));
+	struct mfc_cache_cmp_arg arg = {
+			.mfc_mcastgrp = mcastgrp,
+			.mfc_origin = htonl(INADDR_ANY)
+	};
+	struct rhlist_head *tmp, *list;
 	struct mfc_cache *c, *proxy;
 
 	if (mcastgrp == htonl(INADDR_ANY))
 		goto skip;
 
-	list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list)
-		if (c->mfc_origin == htonl(INADDR_ANY) &&
-		    c->mfc_mcastgrp == mcastgrp) {
-			if (c->mfc_un.res.ttls[vifi] < 255)
-				return c;
-
-			/* It's ok if the vifi is part of the static tree */
-			proxy = ipmr_cache_find_any_parent(mrt,
-							   c->mfc_parent);
-			if (proxy && proxy->mfc_un.res.ttls[vifi] < 255)
-				return c;
-		}
+	list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
+	rhl_for_each_entry_rcu(c, tmp, list, mnode) {
+		if (c->mfc_un.res.ttls[vifi] < 255)
+			return c;
+
+		/* It's ok if the vifi is part of the static tree */
+		proxy = ipmr_cache_find_any_parent(mrt, c->mfc_parent);
+		if (proxy && proxy->mfc_un.res.ttls[vifi] < 255)
+			return c;
+	}
 
 skip:
 	return ipmr_cache_find_any_parent(mrt, vifi);
 }
 
+/* Look for a (S,G,iif) entry if parent != -1 */
+static struct mfc_cache *ipmr_cache_find_parent(struct mr_table *mrt,
+						__be32 origin, __be32 mcastgrp,
+						int parent)
+{
+	struct mfc_cache_cmp_arg arg = {
+			.mfc_mcastgrp = mcastgrp,
+			.mfc_origin = origin,
+	};
+	struct rhlist_head *tmp, *list;
+	struct mfc_cache *c;
+
+	list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
+	rhl_for_each_entry_rcu(c, tmp, list, mnode)
+		if (parent == -1 || parent == c->mfc_parent)
+			return c;
+
+	return NULL;
+}
+
 /* Allocate a multicast cache entry */
 static struct mfc_cache *ipmr_cache_alloc(void)
 {
@@ -1028,10 +1075,10 @@ static int ipmr_cache_report(struct mr_table *mrt,
 static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
 				 struct sk_buff *skb)
 {
+	const struct iphdr *iph = ip_hdr(skb);
+	struct mfc_cache *c;
 	bool found = false;
 	int err;
-	struct mfc_cache *c;
-	const struct iphdr *iph = ip_hdr(skb);
 
 	spin_lock_bh(&mfc_unres_lock);
 	list_for_each_entry(c, &mrt->mfc_unres_queue, list) {
@@ -1095,46 +1142,39 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
 
 static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
 {
-	int line;
-	struct mfc_cache *c, *next;
+	struct mfc_cache *c;
 
-	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
+	/* The entries are added/deleted only under RTNL */
+	rcu_read_lock();
+	c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
+				   mfc->mfcc_mcastgrp.s_addr, parent);
+	rcu_read_unlock();
+	if (!c)
+		return -ENOENT;
+	rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
+	list_del_rcu(&c->list);
+	mroute_netlink_event(mrt, c, RTM_DELROUTE);
+	ipmr_cache_free(c);
 
-	list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
-		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
-		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr &&
-		    (parent == -1 || parent == c->mfc_parent)) {
-			list_del_rcu(&c->list);
-			mroute_netlink_event(mrt, c, RTM_DELROUTE);
-			ipmr_cache_free(c);
-			return 0;
-		}
-	}
-	return -ENOENT;
+	return 0;
 }
 
 static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
 			struct mfcctl *mfc, int mrtsock, int parent)
 {
-	bool found = false;
-	int line;
 	struct mfc_cache *uc, *c;
+	bool found;
+	int ret;
 
 	if (mfc->mfcc_parent >= MAXVIFS)
 		return -ENFILE;
 
-	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
-
-	list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
-		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
-		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr &&
-		    (parent == -1 || parent == c->mfc_parent)) {
-			found = true;
-			break;
-		}
-	}
-
-	if (found) {
+	/* The entries are added/deleted only under RTNL */
+	rcu_read_lock();
+	c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
+				   mfc->mfcc_mcastgrp.s_addr, parent);
+	rcu_read_unlock();
+	if (c) {
 		write_lock_bh(&mrt_lock);
 		c->mfc_parent = mfc->mfcc_parent;
 		ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
@@ -1160,8 +1200,14 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
 	if (!mrtsock)
 		c->mfc_flags |= MFC_STATIC;
 
-	list_add_rcu(&c->list, &mrt->mfc_cache_array[line]);
-
+	ret = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->mnode,
+				  ipmr_rht_params);
+	if (ret) {
+		pr_err("ipmr: rhtable insert error %d\n", ret);
+		ipmr_cache_free(c);
+		return ret;
+	}
+	list_add_tail_rcu(&c->list, &mrt->mfc_cache_list);
 	/* Check to see if we resolved a queued list. If so we
 	 * need to send on the frames and tidy up.
 	 */
@@ -1191,9 +1237,9 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
 /* Close the multicast socket, and clear the vif tables etc */
 static void mroute_clean_tables(struct mr_table *mrt, bool all)
 {
-	int i;
+	struct mfc_cache *c, *tmp;
 	LIST_HEAD(list);
-	struct mfc_cache *c, *next;
+	int i;
 
 	/* Shut down all active vif entries */
 	for (i = 0; i < mrt->maxvif; i++) {
@@ -1204,19 +1250,18 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
 	unregister_netdevice_many(&list);
 
 	/* Wipe the cache */
-	for (i = 0; i < MFC_LINES; i++) {
-		list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
-			if (!all && (c->mfc_flags & MFC_STATIC))
-				continue;
-			list_del_rcu(&c->list);
-			mroute_netlink_event(mrt, c, RTM_DELROUTE);
-			ipmr_cache_free(c);
-		}
+	list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
+		if (!all && (c->mfc_flags & MFC_STATIC))
+			continue;
+		rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
+		list_del_rcu(&c->list);
+		mroute_netlink_event(mrt, c, RTM_DELROUTE);
+		ipmr_cache_free(c);
 	}
 
 	if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
 		spin_lock_bh(&mfc_unres_lock);
-		list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
+		list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
 			list_del(&c->list);
 			mroute_netlink_event(mrt, c, RTM_DELROUTE);
 			ipmr_destroy_unres(mrt, c);
@@ -1791,9 +1836,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
 			  struct sk_buff *skb, struct mfc_cache *cache,
 			  int local)
 {
+	int true_vifi = ipmr_find_vif(mrt, skb->dev);
 	int psend = -1;
 	int vif, ct;
-	int true_vifi = ipmr_find_vif(mrt, skb->dev);
 
 	vif = cache->mfc_parent;
 	cache->mfc_un.res.pkt++;
@@ -2293,34 +2338,30 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 	struct mr_table *mrt;
 	struct mfc_cache *mfc;
 	unsigned int t = 0, s_t;
-	unsigned int h = 0, s_h;
 	unsigned int e = 0, s_e;
 
 	s_t = cb->args[0];
-	s_h = cb->args[1];
-	s_e = cb->args[2];
+	s_e = cb->args[1];
 
 	rcu_read_lock();
 	ipmr_for_each_table(mrt, net) {
 		if (t < s_t)
 			goto next_table;
-		if (t > s_t)
-			s_h = 0;
-		for (h = s_h; h < MFC_LINES; h++) {
-			list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) {
-				if (e < s_e)
-					goto next_entry;
-				if (ipmr_fill_mroute(mrt, skb,
-						     NETLINK_CB(cb->skb).portid,
-						     cb->nlh->nlmsg_seq,
-						     mfc, RTM_NEWROUTE,
-						     NLM_F_MULTI) < 0)
-					goto done;
+		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
+			if (e < s_e)
+				goto next_entry;
+			if (ipmr_fill_mroute(mrt, skb,
+					     NETLINK_CB(cb->skb).portid,
+					     cb->nlh->nlmsg_seq,
+					     mfc, RTM_NEWROUTE,
+					     NLM_F_MULTI) < 0)
+				goto done;
 next_entry:
-				e++;
-			}
-			e = s_e = 0;
+			e++;
 		}
+		e = 0;
+		s_e = 0;
+
 		spin_lock_bh(&mfc_unres_lock);
 		list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
 			if (e < s_e)
@@ -2337,16 +2378,15 @@ next_entry2:
 			e++;
 		}
 		spin_unlock_bh(&mfc_unres_lock);
-		e = s_e = 0;
-		s_h = 0;
+		e = 0;
+		s_e = 0;
 next_table:
 		t++;
 	}
 done:
 	rcu_read_unlock();
 
-	cb->args[2] = e;
-	cb->args[1] = h;
+	cb->args[1] = e;
 	cb->args[0] = t;
 
 	return skb->len;
@@ -2590,10 +2630,8 @@ struct ipmr_mfc_iter {
 	struct seq_net_private p;
 	struct mr_table *mrt;
 	struct list_head *cache;
-	int ct;
 };
 
-
 static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
 					  struct ipmr_mfc_iter *it, loff_t pos)
 {
@@ -2601,12 +2639,10 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
 	struct mfc_cache *mfc;
 
 	rcu_read_lock();
-	for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
-		it->cache = &mrt->mfc_cache_array[it->ct];
-		list_for_each_entry_rcu(mfc, it->cache, list)
-			if (pos-- == 0)
-				return mfc;
-	}
+	it->cache = &mrt->mfc_cache_list;
+	list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
+		if (pos-- == 0)
+			return mfc;
 	rcu_read_unlock();
 
 	spin_lock_bh(&mfc_unres_lock);
@@ -2633,17 +2669,16 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
 
 	it->mrt = mrt;
 	it->cache = NULL;
-	it->ct = 0;
 	return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
 		: SEQ_START_TOKEN;
 }
 
 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	struct mfc_cache *mfc = v;
 	struct ipmr_mfc_iter *it = seq->private;
 	struct net *net = seq_file_net(seq);
 	struct mr_table *mrt = it->mrt;
+	struct mfc_cache *mfc = v;
 
 	++*pos;
 
@@ -2656,19 +2691,9 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	if (it->cache == &mrt->mfc_unres_queue)
 		goto end_of_list;
 
-	BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]);
-
-	while (++it->ct < MFC_LINES) {
-		it->cache = &mrt->mfc_cache_array[it->ct];
-		if (list_empty(it->cache))
-			continue;
-		return list_first_entry(it->cache, struct mfc_cache, list);
-	}
-
 	/* exhausted cache_array, show unresolved */
 	rcu_read_unlock();
 	it->cache = &mrt->mfc_unres_queue;
-	it->ct = 0;
 
 	spin_lock_bh(&mfc_unres_lock);
 	if (!list_empty(it->cache))
@@ -2688,7 +2713,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
 
 	if (it->cache == &mrt->mfc_unres_queue)
 		spin_unlock_bh(&mfc_unres_lock);
-	else if (it->cache == &mrt->mfc_cache_array[it->ct])
+	else if (it->cache == &mrt->mfc_cache_list)
 		rcu_read_unlock();
 }
 
-- 
cgit v1.2.3


From 950b0d91dc108f54bccca5a2f75bb46f2df63d29 Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Wed, 11 Jan 2017 14:13:34 -0800
Subject: pinctrl: core: Fix regression caused by delayed work for hogs

Commit df61b366af26 ("pinctrl: core: Use delayed work for hogs") caused a
regression at least with sh-pfc that is also a GPIO controller as
noted by Geert Uytterhoeven <geert@linux-m68k.org>.

As the original pinctrl_register() has issues calling pin controller
driver functions early before the controller has finished registering,
we can't just revert commit df61b366af26. That would break the drivers
using GENERIC_PINCTRL_GROUPS or GENERIC_PINMUX_FUNCTIONS.

So let's fix the issue with the following steps as a single patch:

1. Revert the late_init parts of commit df61b366af26.

   The late_init clearly won't work and we have to just give up
   on fixing pinctrl_register() for GENERIC_PINCTRL_GROUPS and
   GENERIC_PINMUX_FUNCTIONS.

2. Split pinctrl_register() into two parts

   By splitting pinctrl_register() into pinctrl_init_controller()
   and pinctrl_create_and_start() we have better control over when
   it's safe to call pinctrl_create().

3. Introduce a new pinctrl_register_and_init() function

   As suggested by Linus Walleij <linus.walleij@linaro.org>, we
   can just introduce a new function for the controllers that need
   pinctrl_create() called later.

4. Convert the four known problem cases to use new function

   Let's convert pinctrl-imx, pinctrl-single, sh-pfc and ti-iodelay
   to use the new function to fix the issues. The rest of the drivers
   can be converted later. Let's also update Documentation/pinctrl.txt
   accordingly because of the known issues with pinctrl_register().

Fixes: df61b366af26 ("pinctrl: core: Use delayed work for hogs")
Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Gary Bisson <gary.bisson@boundarydevices.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>
Tested-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 Documentation/pinctrl.txt               |   4 +-
 drivers/pinctrl/core.c                  | 201 ++++++++++++++++++++++----------
 drivers/pinctrl/core.h                  |   2 -
 drivers/pinctrl/freescale/pinctrl-imx.c |   8 +-
 drivers/pinctrl/pinctrl-single.c        |   5 +-
 drivers/pinctrl/sh-pfc/pinctrl.c        |   4 +-
 drivers/pinctrl/ti/pinctrl-ti-iodelay.c |   5 +-
 include/linux/pinctrl/pinctrl.h         |  15 +++
 8 files changed, 165 insertions(+), 79 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/pinctrl.txt b/Documentation/pinctrl.txt
index 0d3b9ce0a0b9..54bd5faa8782 100644
--- a/Documentation/pinctrl.txt
+++ b/Documentation/pinctrl.txt
@@ -79,9 +79,7 @@ int __init foo_probe(void)
 {
 	struct pinctrl_dev *pctl;
 
-	pctl = pinctrl_register(&foo_desc, <PARENT>, NULL);
-	if (!pctl)
-		pr_err("could not register foo pin driver\n");
+	return pinctrl_register_and_init(&foo_desc, <PARENT>, NULL, &pctl);
 }
 
 To enable the pinctrl subsystem and the subgroups for PINMUX and PINCONF and
diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index 65c0ae0969dc..bc73a63163ee 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -1905,59 +1905,14 @@ static int pinctrl_check_ops(struct pinctrl_dev *pctldev)
 }
 
 /**
- * pinctrl_late_init() - finish pin controller device registration
- * @work: work struct
- */
-static void pinctrl_late_init(struct work_struct *work)
-{
-	struct pinctrl_dev *pctldev;
-
-	pctldev = container_of(work, struct pinctrl_dev, late_init.work);
-
-	/*
-	 * If the pin controller does NOT have hogs, this will report an
-	 * error and we skip over this entire branch. This is why we can
-	 * call this function directly when we do not have hogs on the
-	 * device.
-	 */
-	pctldev->p = create_pinctrl(pctldev->dev, pctldev);
-	if (!IS_ERR(pctldev->p)) {
-		kref_get(&pctldev->p->users);
-		pctldev->hog_default =
-			pinctrl_lookup_state(pctldev->p, PINCTRL_STATE_DEFAULT);
-		if (IS_ERR(pctldev->hog_default)) {
-			dev_dbg(pctldev->dev,
-				"failed to lookup the default state\n");
-		} else {
-			if (pinctrl_select_state(pctldev->p,
-						 pctldev->hog_default))
-				dev_err(pctldev->dev,
-					"failed to select default state\n");
-		}
-
-		pctldev->hog_sleep =
-			pinctrl_lookup_state(pctldev->p,
-					     PINCTRL_STATE_SLEEP);
-		if (IS_ERR(pctldev->hog_sleep))
-			dev_dbg(pctldev->dev,
-				"failed to lookup the sleep state\n");
-	}
-
-	mutex_lock(&pinctrldev_list_mutex);
-	list_add_tail(&pctldev->node, &pinctrldev_list);
-	mutex_unlock(&pinctrldev_list_mutex);
-
-	pinctrl_init_device_debugfs(pctldev);
-}
-
-/**
- * pinctrl_register() - register a pin controller device
+ * pinctrl_init_controller() - init a pin controller device
  * @pctldesc: descriptor for this pin controller
  * @dev: parent device for this pin controller
  * @driver_data: private pin controller data for this pin controller
  */
-struct pinctrl_dev *pinctrl_register(struct pinctrl_desc *pctldesc,
-				    struct device *dev, void *driver_data)
+struct pinctrl_dev *pinctrl_init_controller(struct pinctrl_desc *pctldesc,
+					    struct device *dev,
+					    void *driver_data)
 {
 	struct pinctrl_dev *pctldev;
 	int ret;
@@ -1983,7 +1938,6 @@ struct pinctrl_dev *pinctrl_register(struct pinctrl_desc *pctldesc,
 	INIT_RADIX_TREE(&pctldev->pin_function_tree, GFP_KERNEL);
 #endif
 	INIT_LIST_HEAD(&pctldev->gpio_ranges);
-	INIT_DELAYED_WORK(&pctldev->late_init, pinctrl_late_init);
 	pctldev->dev = dev;
 	mutex_init(&pctldev->mutex);
 
@@ -2018,17 +1972,6 @@ struct pinctrl_dev *pinctrl_register(struct pinctrl_desc *pctldesc,
 		goto out_err;
 	}
 
-	/*
-	 * If the device has hogs we want the probe() function of the driver
-	 * to complete before we go in and hog them and add the pin controller
-	 * to the list of controllers. If it has no hogs, we can just complete
-	 * the registration immediately.
-	 */
-	if (pinctrl_dt_has_hogs(pctldev))
-		schedule_delayed_work(&pctldev->late_init, 0);
-	else
-		pinctrl_late_init(&pctldev->late_init.work);
-
 	return pctldev;
 
 out_err:
@@ -2036,8 +1979,107 @@ out_err:
 	kfree(pctldev);
 	return ERR_PTR(ret);
 }
+
+static int pinctrl_create_and_start(struct pinctrl_dev *pctldev)
+{
+	pctldev->p = create_pinctrl(pctldev->dev, pctldev);
+	if (!IS_ERR(pctldev->p)) {
+		kref_get(&pctldev->p->users);
+		pctldev->hog_default =
+			pinctrl_lookup_state(pctldev->p, PINCTRL_STATE_DEFAULT);
+		if (IS_ERR(pctldev->hog_default)) {
+			dev_dbg(pctldev->dev,
+				"failed to lookup the default state\n");
+		} else {
+			if (pinctrl_select_state(pctldev->p,
+						pctldev->hog_default))
+				dev_err(pctldev->dev,
+					"failed to select default state\n");
+		}
+
+		pctldev->hog_sleep =
+			pinctrl_lookup_state(pctldev->p,
+						    PINCTRL_STATE_SLEEP);
+		if (IS_ERR(pctldev->hog_sleep))
+			dev_dbg(pctldev->dev,
+				"failed to lookup the sleep state\n");
+	}
+
+	mutex_lock(&pinctrldev_list_mutex);
+	list_add_tail(&pctldev->node, &pinctrldev_list);
+	mutex_unlock(&pinctrldev_list_mutex);
+
+	pinctrl_init_device_debugfs(pctldev);
+
+	return 0;
+}
+
+/**
+ * pinctrl_register() - register a pin controller device
+ * @pctldesc: descriptor for this pin controller
+ * @dev: parent device for this pin controller
+ * @driver_data: private pin controller data for this pin controller
+ *
+ * Note that pinctrl_register() is known to have problems as the pin
+ * controller driver functions are called before the driver has a
+ * struct pinctrl_dev handle. To avoid issues later on, please use the
+ * new pinctrl_register_and_init() below instead.
+ */
+struct pinctrl_dev *pinctrl_register(struct pinctrl_desc *pctldesc,
+				    struct device *dev, void *driver_data)
+{
+	struct pinctrl_dev *pctldev;
+	int error;
+
+	pctldev = pinctrl_init_controller(pctldesc, dev, driver_data);
+	if (IS_ERR(pctldev))
+		return pctldev;
+
+	error = pinctrl_create_and_start(pctldev);
+	if (error) {
+		mutex_destroy(&pctldev->mutex);
+		kfree(pctldev);
+
+		return ERR_PTR(error);
+	}
+
+	return pctldev;
+
+}
 EXPORT_SYMBOL_GPL(pinctrl_register);
 
+int pinctrl_register_and_init(struct pinctrl_desc *pctldesc,
+			      struct device *dev, void *driver_data,
+			      struct pinctrl_dev **pctldev)
+{
+	struct pinctrl_dev *p;
+	int error;
+
+	p = pinctrl_init_controller(pctldesc, dev, driver_data);
+	if (IS_ERR(p))
+		return PTR_ERR(p);
+
+	/*
+	 * We have pinctrl_start() call functions in the pin controller
+	 * driver with create_pinctrl() for at least dt_node_to_map(). So
+	 * let's make sure pctldev is properly initialized for the
+	 * pin controller driver before we do anything.
+	 */
+	*pctldev = p;
+
+	error = pinctrl_create_and_start(p);
+	if (error) {
+		mutex_destroy(&p->mutex);
+		kfree(p);
+		*pctldev = NULL;
+
+		return error;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pinctrl_register_and_init);
+
 /**
  * pinctrl_unregister() - unregister pinmux
  * @pctldev: pin controller to unregister
@@ -2052,7 +2094,6 @@ void pinctrl_unregister(struct pinctrl_dev *pctldev)
 	if (pctldev == NULL)
 		return;
 
-	cancel_delayed_work_sync(&pctldev->late_init);
 	mutex_lock(&pctldev->mutex);
 	pinctrl_remove_device_debugfs(pctldev);
 	mutex_unlock(&pctldev->mutex);
@@ -2133,6 +2174,42 @@ struct pinctrl_dev *devm_pinctrl_register(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(devm_pinctrl_register);
 
+/**
+ * devm_pinctrl_register_and_init() - Resource managed pinctrl register and init
+ * @dev: parent device for this pin controller
+ * @pctldesc: descriptor for this pin controller
+ * @driver_data: private pin controller data for this pin controller
+ *
+ * Returns an error pointer if pincontrol register failed. Otherwise
+ * it returns valid pinctrl handle.
+ *
+ * The pinctrl device will be automatically released when the device is unbound.
+ */
+int devm_pinctrl_register_and_init(struct device *dev,
+				   struct pinctrl_desc *pctldesc,
+				   void *driver_data,
+				   struct pinctrl_dev **pctldev)
+{
+	struct pinctrl_dev **ptr;
+	int error;
+
+	ptr = devres_alloc(devm_pinctrl_dev_release, sizeof(*ptr), GFP_KERNEL);
+	if (!ptr)
+		return -ENOMEM;
+
+	error = pinctrl_register_and_init(pctldesc, dev, driver_data, pctldev);
+	if (error) {
+		devres_free(ptr);
+		return error;
+	}
+
+	*ptr = *pctldev;
+	devres_add(dev, ptr);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devm_pinctrl_register_and_init);
+
 /**
  * devm_pinctrl_unregister() - Resource managed version of pinctrl_unregister().
  * @dev: device for which which resource was allocated
diff --git a/drivers/pinctrl/core.h b/drivers/pinctrl/core.h
index ad812a2d7248..1c35de59a658 100644
--- a/drivers/pinctrl/core.h
+++ b/drivers/pinctrl/core.h
@@ -37,7 +37,6 @@ struct pinctrl_gpio_range;
  * @p: result of pinctrl_get() for this device
  * @hog_default: default state for pins hogged by this device
  * @hog_sleep: sleep state for pins hogged by this device
- * @late_init: delayed work for pin controller to finish registration
  * @mutex: mutex taken on each pin controller specific action
  * @device_root: debugfs root for this device
  */
@@ -60,7 +59,6 @@ struct pinctrl_dev {
 	struct pinctrl *p;
 	struct pinctrl_state *hog_default;
 	struct pinctrl_state *hog_sleep;
-	struct delayed_work late_init;
 	struct mutex mutex;
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *device_root;
diff --git a/drivers/pinctrl/freescale/pinctrl-imx.c b/drivers/pinctrl/freescale/pinctrl-imx.c
index bccd9416d44f..a7ace9e1ad81 100644
--- a/drivers/pinctrl/freescale/pinctrl-imx.c
+++ b/drivers/pinctrl/freescale/pinctrl-imx.c
@@ -774,11 +774,11 @@ int imx_pinctrl_probe(struct platform_device *pdev,
 	ipctl->info = info;
 	ipctl->dev = info->dev;
 	platform_set_drvdata(pdev, ipctl);
-	ipctl->pctl = devm_pinctrl_register(&pdev->dev,
-					    imx_pinctrl_desc, ipctl);
-	if (IS_ERR(ipctl->pctl)) {
+	ret = devm_pinctrl_register_and_init(&pdev->dev,
+					     imx_pinctrl_desc, ipctl,
+					     &ipctl->pctl);
+	if (ret) {
 		dev_err(&pdev->dev, "could not register IMX pinctrl driver\n");
-		ret = PTR_ERR(ipctl->pctl);
 		goto free;
 	}
 
diff --git a/drivers/pinctrl/pinctrl-single.c b/drivers/pinctrl/pinctrl-single.c
index 8408263de466..d23e0e0aeb22 100644
--- a/drivers/pinctrl/pinctrl-single.c
+++ b/drivers/pinctrl/pinctrl-single.c
@@ -1747,10 +1747,9 @@ static int pcs_probe(struct platform_device *pdev)
 	if (ret < 0)
 		goto free;
 
-	pcs->pctl = pinctrl_register(&pcs->desc, pcs->dev, pcs);
-	if (IS_ERR(pcs->pctl)) {
+	ret = pinctrl_register_and_init(&pcs->desc, pcs->dev, pcs, &pcs->pctl);
+	if (ret) {
 		dev_err(pcs->dev, "could not register single pinctrl driver\n");
-		ret = PTR_ERR(pcs->pctl);
 		goto free;
 	}
 
diff --git a/drivers/pinctrl/sh-pfc/pinctrl.c b/drivers/pinctrl/sh-pfc/pinctrl.c
index fcacfa73ef6e..08150a321be6 100644
--- a/drivers/pinctrl/sh-pfc/pinctrl.c
+++ b/drivers/pinctrl/sh-pfc/pinctrl.c
@@ -816,6 +816,6 @@ int sh_pfc_register_pinctrl(struct sh_pfc *pfc)
 	pmx->pctl_desc.pins = pmx->pins;
 	pmx->pctl_desc.npins = pfc->info->nr_pins;
 
-	pmx->pctl = devm_pinctrl_register(pfc->dev, &pmx->pctl_desc, pmx);
-	return PTR_ERR_OR_ZERO(pmx->pctl);
+	return devm_pinctrl_register_and_init(pfc->dev, &pmx->pctl_desc, pmx,
+					      &pmx->pctl);
 }
diff --git a/drivers/pinctrl/ti/pinctrl-ti-iodelay.c b/drivers/pinctrl/ti/pinctrl-ti-iodelay.c
index 3b86d3db7358..7f472f123515 100644
--- a/drivers/pinctrl/ti/pinctrl-ti-iodelay.c
+++ b/drivers/pinctrl/ti/pinctrl-ti-iodelay.c
@@ -891,10 +891,9 @@ static int ti_iodelay_probe(struct platform_device *pdev)
 	iod->desc.name = dev_name(dev);
 	iod->desc.owner = THIS_MODULE;
 
-	iod->pctl = pinctrl_register(&iod->desc, dev, iod);
-	if (!iod->pctl) {
+	ret = pinctrl_register_and_init(&iod->desc, dev, iod, &iod->pctl);
+	if (ret) {
 		dev_err(dev, "Failed to register pinctrl\n");
-		ret = -ENODEV;
 		goto exit_out;
 	}
 
diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h
index a42e57da270d..8ce2d87a238b 100644
--- a/include/linux/pinctrl/pinctrl.h
+++ b/include/linux/pinctrl/pinctrl.h
@@ -141,12 +141,27 @@ struct pinctrl_desc {
 };
 
 /* External interface to pin controller */
+
+extern int pinctrl_register_and_init(struct pinctrl_desc *pctldesc,
+				     struct device *dev, void *driver_data,
+				     struct pinctrl_dev **pctldev);
+
+/* Please use pinctrl_register_and_init() instead */
 extern struct pinctrl_dev *pinctrl_register(struct pinctrl_desc *pctldesc,
 				struct device *dev, void *driver_data);
+
 extern void pinctrl_unregister(struct pinctrl_dev *pctldev);
+
+extern int devm_pinctrl_register_and_init(struct device *dev,
+				struct pinctrl_desc *pctldesc,
+				void *driver_data,
+				struct pinctrl_dev **pctldev);
+
+/* Please use devm_pinctrl_register_and_init() instead */
 extern struct pinctrl_dev *devm_pinctrl_register(struct device *dev,
 				struct pinctrl_desc *pctldesc,
 				void *driver_data);
+
 extern void devm_pinctrl_unregister(struct device *dev,
 				struct pinctrl_dev *pctldev);
 
-- 
cgit v1.2.3


From 4fe0395550aeb6709ea5332f46de3644aef7d328 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 9 Jan 2017 21:37:40 +0100
Subject: PCI/MSI: Remove pci_enable_msi_{exact,range}()

All multi-MSI allocations are now done through pci_irq_alloc_vectors(), so
remove the old pci_enable_msi_range() and pci_enable_msi_exact()
interfaces.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 Documentation/PCI/MSI-HOWTO.txt |  6 ++----
 drivers/pci/msi.c               | 26 +++++++++-----------------
 include/linux/pci.h             | 16 ++--------------
 3 files changed, 13 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt
index cd9c9f6a7cd9..1e37138027a3 100644
--- a/Documentation/PCI/MSI-HOWTO.txt
+++ b/Documentation/PCI/MSI-HOWTO.txt
@@ -162,8 +162,6 @@ The following old APIs to enable and disable MSI or MSI-X interrupts should
 not be used in new code:
 
   pci_enable_msi()		/* deprecated */
-  pci_enable_msi_range()	/* deprecated */
-  pci_enable_msi_exact()	/* deprecated */
   pci_disable_msi()		/* deprecated */
   pci_enable_msix_range()	/* deprecated */
   pci_enable_msix_exact()	/* deprecated */
@@ -268,5 +266,5 @@ or disabled (0).  If 0 is found in any of the msi_bus files belonging
 to bridges between the PCI root and the device, MSIs are disabled.
 
 It is also worth checking the device driver to see whether it supports MSIs.
-For example, it may contain calls to pci_enable_msi_range() or
-pci_enable_msix_range().
+For example, it may contain calls to pci_irq_alloc_vectors() with the
+PCI_IRQ_MSI or PCI_IRQ_MSIX flags.
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index ca9112afd53a..b6785c8be44d 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1109,23 +1109,15 @@ static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
 	}
 }
 
-/**
- * pci_enable_msi_range - configure device's MSI capability structure
- * @dev: device to configure
- * @minvec: minimal number of interrupts to configure
- * @maxvec: maximum number of interrupts to configure
- *
- * This function tries to allocate a maximum possible number of interrupts in a
- * range between @minvec and @maxvec. It returns a negative errno if an error
- * occurs. If it succeeds, it returns the actual number of interrupts allocated
- * and updates the @dev's irq member to the lowest new interrupt number;
- * the other interrupt numbers allocated to this device are consecutive.
- **/
-int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec)
+/* deprecated, don't use */
+int pci_enable_msi(struct pci_dev *dev)
 {
-	return __pci_enable_msi_range(dev, minvec, maxvec, NULL);
+	int rc = __pci_enable_msi_range(dev, 1, 1, NULL);
+	if (rc < 0)
+		return rc;
+	return 0;
 }
-EXPORT_SYMBOL(pci_enable_msi_range);
+EXPORT_SYMBOL(pci_enable_msi);
 
 static int __pci_enable_msix_range(struct pci_dev *dev,
 				   struct msix_entry *entries, int minvec,
@@ -1381,7 +1373,7 @@ int pci_msi_domain_check_cap(struct irq_domain *domain,
 {
 	struct msi_desc *desc = first_pci_msi_entry(to_pci_dev(dev));
 
-	/* Special handling to support pci_enable_msi_range() */
+	/* Special handling to support __pci_enable_msi_range() */
 	if (pci_msi_desc_is_multi_msi(desc) &&
 	    !(info->flags & MSI_FLAG_MULTI_PCI_MSI))
 		return 1;
@@ -1394,7 +1386,7 @@ int pci_msi_domain_check_cap(struct irq_domain *domain,
 static int pci_msi_domain_handle_error(struct irq_domain *domain,
 				       struct msi_desc *desc, int error)
 {
-	/* Special handling to support pci_enable_msi_range() */
+	/* Special handling to support __pci_enable_msi_range() */
 	if (pci_msi_desc_is_multi_msi(desc) && error == -ENOSPC)
 		return 1;
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index e2d1a124216a..2159376dc673 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1309,14 +1309,7 @@ void pci_msix_shutdown(struct pci_dev *dev);
 void pci_disable_msix(struct pci_dev *dev);
 void pci_restore_msi_state(struct pci_dev *dev);
 int pci_msi_enabled(void);
-int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec);
-static inline int pci_enable_msi_exact(struct pci_dev *dev, int nvec)
-{
-	int rc = pci_enable_msi_range(dev, nvec, nvec);
-	if (rc < 0)
-		return rc;
-	return 0;
-}
+int pci_enable_msi(struct pci_dev *dev);
 int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries,
 			  int minvec, int maxvec);
 static inline int pci_enable_msix_exact(struct pci_dev *dev,
@@ -1347,10 +1340,7 @@ static inline void pci_msix_shutdown(struct pci_dev *dev) { }
 static inline void pci_disable_msix(struct pci_dev *dev) { }
 static inline void pci_restore_msi_state(struct pci_dev *dev) { }
 static inline int pci_msi_enabled(void) { return 0; }
-static inline int pci_enable_msi_range(struct pci_dev *dev, int minvec,
-				       int maxvec)
-{ return -ENOSYS; }
-static inline int pci_enable_msi_exact(struct pci_dev *dev, int nvec)
+static inline int pci_enable_msi(struct pci_dev *dev)
 { return -ENOSYS; }
 static inline int pci_enable_msix_range(struct pci_dev *dev,
 		      struct msix_entry *entries, int minvec, int maxvec)
@@ -1426,8 +1416,6 @@ static inline void pcie_set_ecrc_checking(struct pci_dev *dev) { }
 static inline void pcie_ecrc_get_policy(char *str) { }
 #endif
 
-#define pci_enable_msi(pdev)	pci_enable_msi_exact(pdev, 1)
-
 #ifdef CONFIG_HT_IRQ
 /* The functions a driver should call */
 int  ht_create_irq(struct pci_dev *dev, int idx);
-- 
cgit v1.2.3


From deed7be78f512d003c6290da0a781479b31b3d74 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 12 Jan 2017 22:11:32 -0800
Subject: tcp: record most recent RTT in RACK loss detection

Record the most recent RTT in RACK. It is often identical to the
"ca_rtt_us" values in tcp_clean_rtx_queue. But when the packet has
been retransmitted, RACK choses to believe the ACK is for the
(latest) retransmitted packet if the RTT is over minimum RTT.

This requires passing the arrival time of the most recent ACK to
RACK routines. The timestamp is now recorded in the "ack_time"
in tcp_sacktag_state during the ACK processing.

This patch does not change the RACK algorithm itself. It only adds
the RTT variable to prepare the next main patch.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h     |  1 +
 include/net/tcp.h       |  7 ++++---
 net/ipv4/tcp_input.c    | 36 ++++++++++++++++++++++--------------
 net/ipv4/tcp_recovery.c | 41 +++++++++++++++++++++++------------------
 4 files changed, 50 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index fc5848dad7a4..1255c592719c 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -207,6 +207,7 @@ struct tcp_sock {
 	/* Information of the most recently (s)acked skb */
 	struct tcp_rack {
 		struct skb_mstamp mstamp; /* (Re)sent time of the skb */
+		u32 rtt_us;  /* Associated RTT */
 		u8 advanced; /* mstamp advanced since last lost marking */
 		u8 reord;    /* reordering detected */
 	} rack;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 51183bba3835..1439107658c2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1863,9 +1863,10 @@ extern int sysctl_tcp_recovery;
 /* Use TCP RACK to detect (some) tail and retransmit losses */
 #define TCP_RACK_LOST_RETRANS  0x1
 
-extern void tcp_rack_mark_lost(struct sock *sk);
-extern void tcp_rack_advance(struct tcp_sock *tp,
-			     const struct skb_mstamp *xmit_time, u8 sacked);
+extern void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now);
+extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked,
+			     const struct skb_mstamp *xmit_time,
+			     const struct skb_mstamp *ack_time);
 
 /*
  * Save and compile IPv4 options, return a pointer to it
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bb24b93e64bc..8ccd171999bf 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1135,6 +1135,7 @@ struct tcp_sacktag_state {
 	 */
 	struct skb_mstamp first_sackt;
 	struct skb_mstamp last_sackt;
+	struct skb_mstamp ack_time; /* Timestamp when the S/ACK was received */
 	struct rate_sample *rate;
 	int	flag;
 };
@@ -1217,7 +1218,7 @@ static u8 tcp_sacktag_one(struct sock *sk,
 		return sacked;
 
 	if (!(sacked & TCPCB_SACKED_ACKED)) {
-		tcp_rack_advance(tp, xmit_time, sacked);
+		tcp_rack_advance(tp, sacked, xmit_time, &state->ack_time);
 
 		if (sacked & TCPCB_SACKED_RETRANS) {
 			/* If the segment is not tagged as lost,
@@ -2813,7 +2814,8 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked)
  * tcp_xmit_retransmit_queue().
  */
 static void tcp_fastretrans_alert(struct sock *sk, const int acked,
-				  bool is_dupack, int *ack_flag, int *rexmit)
+				  bool is_dupack, int *ack_flag, int *rexmit,
+				  const struct skb_mstamp *ack_time)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -2868,7 +2870,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 	if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS) {
 		u32 prior_retrans = tp->retrans_out;
 
-		tcp_rack_mark_lost(sk);
+		tcp_rack_mark_lost(sk, ack_time);
 		if (prior_retrans > tp->retrans_out) {
 			flag |= FLAG_LOST_RETRANS;
 			*ack_flag |= FLAG_LOST_RETRANS;
@@ -3105,11 +3107,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
  */
 static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			       u32 prior_snd_una, int *acked,
-			       struct tcp_sacktag_state *sack,
-			       struct skb_mstamp *now)
+			       struct tcp_sacktag_state *sack)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct skb_mstamp first_ackt, last_ackt;
+	struct skb_mstamp *now = &sack->ack_time;
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 prior_sacked = tp->sacked_out;
 	u32 reord = tp->packets_out;
@@ -3169,7 +3171,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		} else if (tcp_is_sack(tp)) {
 			tp->delivered += acked_pcount;
 			if (!tcp_skb_spurious_retrans(tp, skb))
-				tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
+				tcp_rack_advance(tp, sacked,
+						 &skb->skb_mstamp,
+						 &sack->ack_time);
 		}
 		if (sacked & TCPCB_LOST)
 			tp->lost_out -= acked_pcount;
@@ -3599,7 +3603,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	u32 lost = tp->lost;
 	int acked = 0; /* Number of packets newly acked */
 	int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
-	struct skb_mstamp now;
 
 	sack_state.first_sackt.v64 = 0;
 	sack_state.rate = &rs;
@@ -3625,7 +3628,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (after(ack, tp->snd_nxt))
 		goto invalid_ack;
 
-	skb_mstamp_get(&now);
+	skb_mstamp_get(&sack_state.ack_time);
 
 	if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
@@ -3693,11 +3696,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	/* See if we can take anything off of the retransmit queue. */
 	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
-				    &sack_state, &now);
+				    &sack_state);
 
 	if (tcp_ack_is_dubious(sk, flag)) {
 		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
-		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
+				      &sack_state.ack_time);
 	}
 	if (tp->tlp_high_seq)
 		tcp_process_tlp_ack(sk, ack, flag);
@@ -3712,15 +3716,17 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		tcp_schedule_loss_probe(sk);
 	delivered = tp->delivered - delivered;	/* freshly ACKed or SACKed */
 	lost = tp->lost - lost;			/* freshly marked lost */
-	tcp_rate_gen(sk, delivered, lost, &now, &rs);
-	tcp_cong_control(sk, ack, delivered, flag, &rs);
+	tcp_rate_gen(sk, delivered, lost, &sack_state.ack_time,
+		     sack_state.rate);
+	tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
 	tcp_xmit_recovery(sk, rexmit);
 	return 1;
 
 no_queue:
 	/* If data was DSACKed, see if we can undo a cwnd reduction. */
 	if (flag & FLAG_DSACKING_ACK)
-		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
+				      &sack_state.ack_time);
 	/* If this ack opens up a zero window, clear backoff.  It was
 	 * being used to time the probes, and is probably far higher than
 	 * it needs to be for normal retransmission.
@@ -3741,9 +3747,11 @@ old_ack:
 	 * If data was DSACKed, see if we can undo a cwnd reduction.
 	 */
 	if (TCP_SKB_CB(skb)->sacked) {
+		skb_mstamp_get(&sack_state.ack_time);
 		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
 						&sack_state);
-		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
+				      &sack_state.ack_time);
 		tcp_xmit_recovery(sk, rexmit);
 	}
 
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 7ea0377229c0..557363cde58a 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -32,7 +32,7 @@ static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
  * The current version is only used after recovery starts but can be
  * easily extended to detect the first loss.
  */
-static void tcp_rack_detect_loss(struct sock *sk)
+static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
@@ -62,13 +62,14 @@ static void tcp_rack_detect_loss(struct sock *sk)
 			continue;
 
 		if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) {
-
-			if (skb_mstamp_us_delta(&tp->rack.mstamp,
-						&skb->skb_mstamp) <= reo_wnd)
-				continue;
-
-			/* skb is lost if packet sent later is sacked */
-			tcp_rack_mark_skb_lost(sk, skb);
+			/* Step 3 in draft-cheng-tcpm-rack-00.txt:
+			 * A packet is lost if its elapsed time is beyond
+			 * the recent RTT plus the reordering window.
+			 */
+			if (skb_mstamp_us_delta(now, &skb->skb_mstamp) >
+			    tp->rack.rtt_us + reo_wnd) {
+				tcp_rack_mark_skb_lost(sk, skb);
+			}
 		} else if (!(scb->sacked & TCPCB_RETRANS)) {
 			/* Original data are sent sequentially so stop early
 			 * b/c the rest are all sent after rack_sent
@@ -78,7 +79,7 @@ static void tcp_rack_detect_loss(struct sock *sk)
 	}
 }
 
-void tcp_rack_mark_lost(struct sock *sk)
+void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -86,20 +87,25 @@ void tcp_rack_mark_lost(struct sock *sk)
 		return;
 	/* Reset the advanced flag to avoid unnecessary queue scanning */
 	tp->rack.advanced = 0;
-	tcp_rack_detect_loss(sk);
+	tcp_rack_detect_loss(sk, now);
 }
 
-/* Record the most recently (re)sent time among the (s)acked packets */
-void tcp_rack_advance(struct tcp_sock *tp,
-		      const struct skb_mstamp *xmit_time, u8 sacked)
+/* Record the most recently (re)sent time among the (s)acked packets
+ * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
+ * draft-cheng-tcpm-rack-00.txt
+ */
+void tcp_rack_advance(struct tcp_sock *tp, u8 sacked,
+		      const struct skb_mstamp *xmit_time,
+		      const struct skb_mstamp *ack_time)
 {
+	u32 rtt_us;
+
 	if (tp->rack.mstamp.v64 &&
 	    !skb_mstamp_after(xmit_time, &tp->rack.mstamp))
 		return;
 
+	rtt_us = skb_mstamp_us_delta(ack_time, xmit_time);
 	if (sacked & TCPCB_RETRANS) {
-		struct skb_mstamp now;
-
 		/* If the sacked packet was retransmitted, it's ambiguous
 		 * whether the retransmission or the original (or the prior
 		 * retransmission) was sacked.
@@ -110,11 +116,10 @@ void tcp_rack_advance(struct tcp_sock *tp,
 		 * so it's at least one RTT (i.e., retransmission is at least
 		 * an RTT later).
 		 */
-		skb_mstamp_get(&now);
-		if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp))
+		if (rtt_us < tcp_min_rtt(tp))
 			return;
 	}
-
+	tp->rack.rtt_us = rtt_us;
 	tp->rack.mstamp = *xmit_time;
 	tp->rack.advanced = 1;
 }
-- 
cgit v1.2.3


From 1d0833df594390876647c54c2c88069d29059665 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 12 Jan 2017 22:11:34 -0800
Subject: tcp: use sequence to break TS ties for RACK loss detection

The packets inside a jumbo skb (e.g., TSO) share the same skb
timestamp, even though they are sent sequentially on the wire. Since
RACK is based on time, it can not detect some packets inside the
same skb are lost.  However, we can leverage the packet sequence
numbers as extended timestamps to detect losses. Therefore, when
RACK timestamp is identical to skb's timestamp (i.e., one of the
packets of the skb is acked or sacked), we use the sequence numbers
of the acked and unacked packets to break ties.

We can use the same sequence logic to advance RACK xmit time as
well to detect more losses and avoid timeout.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h     |  1 +
 include/net/tcp.h       |  2 +-
 net/ipv4/tcp_input.c    |  5 +++--
 net/ipv4/tcp_recovery.c | 17 ++++++++++++++---
 4 files changed, 19 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 1255c592719c..970d5f00589f 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -208,6 +208,7 @@ struct tcp_sock {
 	struct tcp_rack {
 		struct skb_mstamp mstamp; /* (Re)sent time of the skb */
 		u32 rtt_us;  /* Associated RTT */
+		u32 end_seq; /* Ending TCP sequence of the skb */
 		u8 advanced; /* mstamp advanced since last lost marking */
 		u8 reord;    /* reordering detected */
 	} rack;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 64fcdeb3358b..5fb1e75a32a9 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1867,7 +1867,7 @@ extern int sysctl_tcp_recovery;
 #define TCP_RACK_LOST_RETRANS  0x1
 
 extern void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now);
-extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked,
+extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
 			     const struct skb_mstamp *xmit_time,
 			     const struct skb_mstamp *ack_time);
 extern void tcp_rack_reo_timeout(struct sock *sk);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index be1191829963..e42ca11c0326 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1218,7 +1218,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
 		return sacked;
 
 	if (!(sacked & TCPCB_SACKED_ACKED)) {
-		tcp_rack_advance(tp, sacked, xmit_time, &state->ack_time);
+		tcp_rack_advance(tp, sacked, end_seq,
+				 xmit_time, &state->ack_time);
 
 		if (sacked & TCPCB_SACKED_RETRANS) {
 			/* If the segment is not tagged as lost,
@@ -3171,7 +3172,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		} else if (tcp_is_sack(tp)) {
 			tp->delivered += acked_pcount;
 			if (!tcp_skb_spurious_retrans(tp, skb))
-				tcp_rack_advance(tp, sacked,
+				tcp_rack_advance(tp, sacked, scb->end_seq,
 						 &skb->skb_mstamp,
 						 &sack->ack_time);
 		}
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index eb39b1b6d1dc..1e330a2f913d 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -16,6 +16,14 @@ static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
 	}
 }
 
+static bool tcp_rack_sent_after(const struct skb_mstamp *t1,
+				const struct skb_mstamp *t2,
+				u32 seq1, u32 seq2)
+{
+	return skb_mstamp_after(t1, t2) ||
+	       (t1->v64 == t2->v64 && after(seq1, seq2));
+}
+
 /* Marks a packet lost, if some packet sent later has been (s)acked.
  * The underlying idea is similar to the traditional dupthresh and FACK
  * but they look at different metrics:
@@ -60,7 +68,8 @@ static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now,
 		    scb->sacked & TCPCB_SACKED_ACKED)
 			continue;
 
-		if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) {
+		if (tcp_rack_sent_after(&tp->rack.mstamp, &skb->skb_mstamp,
+					tp->rack.end_seq, scb->end_seq)) {
 			/* Step 3 in draft-cheng-tcpm-rack-00.txt:
 			 * A packet is lost if its elapsed time is beyond
 			 * the recent RTT plus the reordering window.
@@ -113,14 +122,15 @@ void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now)
  * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
  * draft-cheng-tcpm-rack-00.txt
  */
-void tcp_rack_advance(struct tcp_sock *tp, u8 sacked,
+void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
 		      const struct skb_mstamp *xmit_time,
 		      const struct skb_mstamp *ack_time)
 {
 	u32 rtt_us;
 
 	if (tp->rack.mstamp.v64 &&
-	    !skb_mstamp_after(xmit_time, &tp->rack.mstamp))
+	    !tcp_rack_sent_after(xmit_time, &tp->rack.mstamp,
+				 end_seq, tp->rack.end_seq))
 		return;
 
 	rtt_us = skb_mstamp_us_delta(ack_time, xmit_time);
@@ -140,6 +150,7 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked,
 	}
 	tp->rack.rtt_us = rtt_us;
 	tp->rack.mstamp = *xmit_time;
+	tp->rack.end_seq = end_seq;
 	tp->rack.advanced = 1;
 }
 
-- 
cgit v1.2.3


From 840a3cbe89694fad75578856976f180e852e69aa Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 12 Jan 2017 22:11:38 -0800
Subject: tcp: remove forward retransmit feature

Forward retransmit is an esoteric feature in RFC3517 (condition(3)
in the NextSeg()). Basically if a packet is not considered lost by
the current criteria (# of dupacks etc), but the congestion window
has room for more packets, then retransmit this packet.

However it actually conflicts with the rest of recovery design. For
example, when reordering is detected we want to be conservative
in retransmitting packets but forward-retransmit feature would
break that to force more retransmission. Also the implementation is
fairly complicated inside the retransmission logic inducing extra
iterations in the write queue. With RACK losses are being detected
timely and this heuristic is no longer necessary. There this patch
removes the feature.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h   |  1 -
 net/ipv4/tcp_input.c  |  5 -----
 net/ipv4/tcp_output.c | 61 +++------------------------------------------------
 3 files changed, 3 insertions(+), 64 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 970d5f00589f..8e5f4c15d0e5 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -307,7 +307,6 @@ struct tcp_sock {
 					 */
 
 	int     lost_cnt_hint;
-	u32     retransmit_high;	/* L-bits may be on up to this seqno */
 
 	u32	prior_ssthresh; /* ssthresh saved at recovery start	*/
 	u32	high_seq;	/* snd_nxt at onset of congestion	*/
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9469ce384d3b..a041a92348ee 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -916,10 +916,6 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
 	    before(TCP_SKB_CB(skb)->seq,
 		   TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
 		tp->retransmit_skb_hint = skb;
-
-	if (!tp->lost_out ||
-	    after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
-		tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
 }
 
 /* Sum the number of packets on the wire we have marked as lost.
@@ -1983,7 +1979,6 @@ void tcp_enter_loss(struct sock *sk)
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
 			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
 			tp->lost_out += tcp_skb_pcount(skb);
-			tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
 		}
 	}
 	tcp_verify_left_out(tp);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0ba9026cb70d..6327e4d368a4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2831,36 +2831,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 	return err;
 }
 
-/* Check if we forward retransmits are possible in the current
- * window/congestion state.
- */
-static bool tcp_can_forward_retransmit(struct sock *sk)
-{
-	const struct inet_connection_sock *icsk = inet_csk(sk);
-	const struct tcp_sock *tp = tcp_sk(sk);
-
-	/* Forward retransmissions are possible only during Recovery. */
-	if (icsk->icsk_ca_state != TCP_CA_Recovery)
-		return false;
-
-	/* No forward retransmissions in Reno are possible. */
-	if (tcp_is_reno(tp))
-		return false;
-
-	/* Yeah, we have to make difficult choice between forward transmission
-	 * and retransmission... Both ways have their merits...
-	 *
-	 * For now we do not retransmit anything, while we have some new
-	 * segments to send. In the other cases, follow rule 3 for
-	 * NextSeg() specified in RFC3517.
-	 */
-
-	if (tcp_may_send_now(sk))
-		return false;
-
-	return true;
-}
-
 /* This gets called after a retransmit timeout, and the initially
  * retransmitted data is acknowledged.  It tries to continue
  * resending the rest of the retransmit queue, until either
@@ -2875,24 +2845,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	struct sk_buff *hole = NULL;
-	u32 max_segs, last_lost;
+	u32 max_segs;
 	int mib_idx;
-	int fwd_rexmitting = 0;
 
 	if (!tp->packets_out)
 		return;
 
-	if (!tp->lost_out)
-		tp->retransmit_high = tp->snd_una;
-
 	if (tp->retransmit_skb_hint) {
 		skb = tp->retransmit_skb_hint;
-		last_lost = TCP_SKB_CB(skb)->end_seq;
-		if (after(last_lost, tp->retransmit_high))
-			last_lost = tp->retransmit_high;
 	} else {
 		skb = tcp_write_queue_head(sk);
-		last_lost = tp->snd_una;
 	}
 
 	max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
@@ -2915,31 +2877,14 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 		 */
 		segs = min_t(int, segs, max_segs);
 
-		if (fwd_rexmitting) {
-begin_fwd:
-			if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
-				break;
-			mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
-
-		} else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
-			tp->retransmit_high = last_lost;
-			if (!tcp_can_forward_retransmit(sk))
-				break;
-			/* Backtrack if necessary to non-L'ed skb */
-			if (hole) {
-				skb = hole;
-				hole = NULL;
-			}
-			fwd_rexmitting = 1;
-			goto begin_fwd;
-
+		if (tp->retrans_out >= tp->lost_out) {
+			break;
 		} else if (!(sacked & TCPCB_LOST)) {
 			if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
 				hole = skb;
 			continue;
 
 		} else {
-			last_lost = TCP_SKB_CB(skb)->end_seq;
 			if (icsk->icsk_ca_state != TCP_CA_Loss)
 				mib_idx = LINUX_MIB_TCPFASTRETRANS;
 			else
-- 
cgit v1.2.3


From bec41a11dd3dc8c54f766b4f494140ca92ba7c10 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 12 Jan 2017 22:11:39 -0800
Subject: tcp: remove early retransmit

This patch removes the support of RFC5827 early retransmit (i.e.,
fast recovery on small inflight with <3 dupacks) because it is
subsumed by the new RACK loss detection. More specifically when
RACK receives DUPACKs, it'll arm a reordering timer to start fast
recovery after a quarter of (min)RTT, hence it covers the early
retransmit except RACK does not limit itself to specific inflight
or dupack numbers.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 19 +++--------
 include/linux/tcp.h                    |  3 +-
 include/net/tcp.h                      | 19 -----------
 net/ipv4/inet_diag.c                   |  1 -
 net/ipv4/tcp.c                         |  3 --
 net/ipv4/tcp_input.c                   | 60 ++--------------------------------
 net/ipv4/tcp_ipv4.c                    |  1 -
 net/ipv4/tcp_metrics.c                 |  1 -
 net/ipv4/tcp_minisocks.c               |  1 -
 net/ipv4/tcp_output.c                  | 11 +++----
 net/ipv4/tcp_timer.c                   |  3 --
 net/ipv6/tcp_ipv6.c                    |  1 -
 12 files changed, 12 insertions(+), 111 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 7dd65c9cf707..7de2cf79e16f 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -246,21 +246,12 @@ tcp_dsack - BOOLEAN
 	Allows TCP to send "duplicate" SACKs.
 
 tcp_early_retrans - INTEGER
-	Enable Early Retransmit (ER), per RFC 5827. ER lowers the threshold
-	for triggering fast retransmit when the amount of outstanding data is
-	small and when no previously unsent data can be transmitted (such
-	that limited transmit could be used). Also controls the use of
-	Tail loss probe (TLP) that converts RTOs occurring due to tail
-	losses into fast recovery (draft-dukkipati-tcpm-tcp-loss-probe-01).
+	Tail loss probe (TLP) converts RTOs occurring due to tail
+	losses into fast recovery (draft-ietf-tcpm-rack). Note that
+	TLP requires RACK to function properly (see tcp_recovery below)
 	Possible values:
-		0 disables ER
-		1 enables ER
-		2 enables ER but delays fast recovery and fast retransmit
-		  by a fourth of RTT. This mitigates connection falsely
-		  recovers when network has a small degree of reordering
-		  (less than 3 packets).
-		3 enables delayed ER and TLP.
-		4 enables TLP only.
+		0 disables TLP
+		3 or 4 enables TLP
 	Default: 3
 
 tcp_ecn - INTEGER
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 8e5f4c15d0e5..4733368f953a 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -224,8 +224,7 @@ struct tcp_sock {
 		repair      : 1,
 		frto        : 1;/* F-RTO (RFC5682) activated in CA_Loss */
 	u8	repair_queue;
-	u8	do_early_retrans:1,/* Enable RFC5827 early-retransmit  */
-		syn_data:1,	/* SYN includes data */
+	u8	syn_data:1,	/* SYN includes data */
 		syn_fastopen:1,	/* SYN includes Fast Open option */
 		syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
 		syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 423438dd6fe9..c55d65f74f7f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -565,7 +565,6 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
 			     const struct sk_buff *next_skb);
 
 /* tcp_input.c */
-void tcp_resume_early_retransmit(struct sock *sk);
 void tcp_rearm_rto(struct sock *sk);
 void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
 void tcp_reset(struct sock *sk);
@@ -1037,24 +1036,6 @@ static inline void tcp_enable_fack(struct tcp_sock *tp)
 	tp->rx_opt.sack_ok |= TCP_FACK_ENABLED;
 }
 
-/* TCP early-retransmit (ER) is similar to but more conservative than
- * the thin-dupack feature.  Enable ER only if thin-dupack is disabled.
- */
-static inline void tcp_enable_early_retrans(struct tcp_sock *tp)
-{
-	struct net *net = sock_net((struct sock *)tp);
-
-	tp->do_early_retrans = sysctl_tcp_early_retrans &&
-		sysctl_tcp_early_retrans < 4 && !sysctl_tcp_thin_dupack &&
-		!(sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) &&
-		net->ipv4.sysctl_tcp_reordering == 3;
-}
-
-static inline void tcp_disable_early_retrans(struct tcp_sock *tp)
-{
-	tp->do_early_retrans = 0;
-}
-
 static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
 {
 	return tp->sacked_out + tp->lost_out;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index d216e40623d3..3828b3a805cd 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -215,7 +215,6 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 	}
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
-	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 		r->idiag_timer = 1;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c8d46c140b4a..d9023e8ed53e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -406,7 +406,6 @@ void tcp_init_sock(struct sock *sk)
 	tp->mss_cache = TCP_MSS_DEFAULT;
 
 	tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
-	tcp_enable_early_retrans(tp);
 	tcp_assign_congestion_control(sk);
 
 	tp->tsoffset = 0;
@@ -2477,8 +2476,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 			err = -EINVAL;
 		else {
 			tp->thin_dupack = val;
-			if (tp->thin_dupack)
-				tcp_disable_early_retrans(tp);
 		}
 		break;
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a041a92348ee..79c819077a59 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -904,8 +904,6 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
 		tcp_disable_fack(tp);
 	}
 
-	if (metric > 0)
-		tcp_disable_early_retrans(tp);
 	tp->rack.reord = 1;
 }
 
@@ -2054,30 +2052,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
 	return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
 }
 
-static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	unsigned long delay;
-
-	/* Delay early retransmit and entering fast recovery for
-	 * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
-	 * available, or RTO is scheduled to fire first.
-	 */
-	if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
-	    (flag & FLAG_ECE) || !tp->srtt_us)
-		return false;
-
-	delay = max(usecs_to_jiffies(tp->srtt_us >> 5),
-		    msecs_to_jiffies(2));
-
-	if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
-		return false;
-
-	inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay,
-				  TCP_RTO_MAX);
-	return true;
-}
-
 /* Linux NewReno/SACK/FACK/ECN state machine.
  * --------------------------------------
  *
@@ -2221,16 +2195,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
 	    tcp_is_sack(tp) && !tcp_send_head(sk))
 		return true;
 
-	/* Trick#6: TCP early retransmit, per RFC5827.  To avoid spurious
-	 * retransmissions due to small network reorderings, we implement
-	 * Mitigation A.3 in the RFC and delay the retransmission for a short
-	 * interval if appropriate.
-	 */
-	if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
-	    (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) &&
-	    !tcp_may_send_now(sk))
-		return !tcp_pause_early_retransmit(sk, flag);
-
 	return false;
 }
 
@@ -3050,8 +3014,7 @@ void tcp_rearm_rto(struct sock *sk)
 	} else {
 		u32 rto = inet_csk(sk)->icsk_rto;
 		/* Offset the time elapsed after installing regular RTO */
-		if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
-		    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
+		if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
 		    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 			struct sk_buff *skb = tcp_write_queue_head(sk);
 			const u32 rto_time_stamp =
@@ -3068,24 +3031,6 @@ void tcp_rearm_rto(struct sock *sk)
 	}
 }
 
-/* This function is called when the delayed ER timer fires. TCP enters
- * fast recovery and performs fast-retransmit.
- */
-void tcp_resume_early_retransmit(struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	tcp_rearm_rto(sk);
-
-	/* Stop if ER is disabled after the delayed ER timer is scheduled */
-	if (!tp->do_early_retrans)
-		return;
-
-	tcp_enter_recovery(sk, false);
-	tcp_update_scoreboard(sk, 1);
-	tcp_xmit_retransmit_queue(sk);
-}
-
 /* If we get here, the whole TSO packet has not been acked. */
 static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
 {
@@ -3651,8 +3596,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	skb_mstamp_get(&sack_state.ack_time);
 
-	if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
-	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
+	if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
 		tcp_rearm_rto(sk);
 
 	if (after(ack, prior_snd_una)) {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ebf3e0c4967a..63214136cf1c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2229,7 +2229,6 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
 	int state;
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
-	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 		timer_active	= 1;
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index ba8f02d0f283..b9ed0d50aead 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -522,7 +522,6 @@ void tcp_init_metrics(struct sock *sk)
 	val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
 	if (val && tp->reordering != val) {
 		tcp_disable_fack(tp);
-		tcp_disable_early_retrans(tp);
 		tp->reordering = val;
 	}
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 06fde26a82b7..bdb443471c39 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -468,7 +468,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 		newtp->sacked_out = 0;
 		newtp->fackets_out = 0;
 		newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
-		tcp_enable_early_retrans(newtp);
 		newtp->tlp_high_seq = 0;
 		newtp->lsndtime = treq->snt_synack.stamp_jiffies;
 		newsk->sk_txhash = treq->txhash;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 6327e4d368a4..9a1a1494b9dd 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -76,10 +76,8 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
 	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
 
 	tp->packets_out += tcp_skb_pcount(skb);
-	if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
-	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+	if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
 		tcp_rearm_rto(sk);
-	}
 
 	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
 		      tcp_skb_pcount(skb));
@@ -2289,8 +2287,6 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 	u32 timeout, tlp_time_stamp, rto_time_stamp;
 	u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
 
-	if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
-		return false;
 	/* No consecutive loss probes. */
 	if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
 		tcp_rearm_rto(sk);
@@ -2309,8 +2305,9 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 	/* Schedule a loss probe in 2*RTT for SACK capable connections
 	 * in Open state, that are either limited by cwnd or application.
 	 */
-	if (sysctl_tcp_early_retrans < 3 || !tp->packets_out ||
-	    !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
+	if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) ||
+	    !tp->packets_out || !tcp_is_sack(tp) ||
+	    icsk->icsk_ca_state != TCP_CA_Open)
 		return false;
 
 	if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 953c02a8566e..40d893556e67 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -566,9 +566,6 @@ void tcp_write_timer_handler(struct sock *sk)
 	case ICSK_TIME_REO_TIMEOUT:
 		tcp_rack_reo_timeout(sk);
 		break;
-	case ICSK_TIME_EARLY_RETRANS:
-		tcp_resume_early_retransmit(sk);
-		break;
 	case ICSK_TIME_LOSS_PROBE:
 		tcp_send_loss_probe(sk);
 		break;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index f52c3742b404..fc14e04028bf 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1745,7 +1745,6 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
 	srcp  = ntohs(inet->inet_sport);
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
-	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 		timer_active	= 1;
-- 
cgit v1.2.3


From 4a7f6009441144783e5925551c72e3f2e1b0839b Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 12 Jan 2017 22:11:41 -0800
Subject: tcp: remove thin_dupack feature

Thin stream DUPACK is to start fast recovery on only one DUPACK
provided the connection is a thin stream (i.e., low inflight).  But
this older feature is now subsumed with RACK. If a connection
receives only a single DUPACK, RACK would arm a reordering timer
and soon starts fast recovery instead of timeout if no further
ACKs are received.

The socket option (THIN_DUPACK) is kept as a nop for compatibility.
Note that this patch does not change another thin-stream feature
which enables linear RTO. Although it might be good to generalize
that in the future (i.e., linear RTO for the first say 3 retries).

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 12 ------------
 include/linux/tcp.h                    |  2 +-
 net/ipv4/sysctl_net_ipv4.c             |  7 -------
 net/ipv4/tcp.c                         |  6 ++----
 net/ipv4/tcp_input.c                   | 13 -------------
 5 files changed, 3 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 7de2cf79e16f..aa1bb49f1dc6 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -703,18 +703,6 @@ tcp_thin_linear_timeouts - BOOLEAN
 	Documentation/networking/tcp-thin.txt
 	Default: 0
 
-tcp_thin_dupack - BOOLEAN
-	Enable dynamic triggering of retransmissions after one dupACK
-	for thin streams. If set, a check is performed upon reception
-	of a dupACK to determine if the stream is thin (less than 4
-	packets in flight). As long as the stream is found to be thin,
-	data is retransmitted on the first received dupACK. This
-	improves retransmission latency for non-aggressive thin
-	streams, often found to be time-dependent.
-	For more information on thin streams, see
-	Documentation/networking/tcp-thin.txt
-	Default: 0
-
 tcp_limit_output_bytes - INTEGER
 	Controls TCP Small Queue limit per tcp socket.
 	TCP bulk sender tends to increase packets in flight until it
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 4733368f953a..6c22332afb75 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -220,7 +220,7 @@ struct tcp_sock {
 		unused:5;
 	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
 		thin_lto    : 1,/* Use linear timeouts for thin streams */
-		thin_dupack : 1,/* Fast retransmit on first dupack      */
+		unused1	    : 1,
 		repair      : 1,
 		frto        : 1;/* F-RTO (RFC5682) activated in CA_Loss */
 	u8	repair_queue;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 0f2d37e8e983..c8d283615c6f 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -536,13 +536,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode           = 0644,
 		.proc_handler   = proc_dointvec
 	},
-	{
-		.procname       = "tcp_thin_dupack",
-		.data           = &sysctl_tcp_thin_dupack,
-		.maxlen         = sizeof(int),
-		.mode           = 0644,
-		.proc_handler   = proc_dointvec
-	},
 	{
 		.procname	= "tcp_early_retrans",
 		.data		= &sysctl_tcp_early_retrans,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d9023e8ed53e..aba6ea76338e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2474,9 +2474,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 	case TCP_THIN_DUPACK:
 		if (val < 0 || val > 1)
 			err = -EINVAL;
-		else {
-			tp->thin_dupack = val;
-		}
 		break;
 
 	case TCP_REPAIR:
@@ -2966,8 +2963,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 	case TCP_THIN_LINEAR_TIMEOUTS:
 		val = tp->thin_lto;
 		break;
+
 	case TCP_THIN_DUPACK:
-		val = tp->thin_dupack;
+		val = 0;
 		break;
 
 	case TCP_REPAIR:
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 87315ab1ab1a..39ebc20ca1b2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -95,9 +95,6 @@ int sysctl_tcp_rfc1337 __read_mostly;
 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
 int sysctl_tcp_frto __read_mostly = 2;
 int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
-
-int sysctl_tcp_thin_dupack __read_mostly;
-
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
 int sysctl_tcp_early_retrans __read_mostly = 3;
 int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
@@ -2170,16 +2167,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
 	if (tcp_dupack_heuristics(tp) > tp->reordering)
 		return true;
 
-	/* If a thin stream is detected, retransmit after first
-	 * received dupack. Employ only if SACK is supported in order
-	 * to avoid possible corner-case series of spurious retransmissions
-	 * Use only if there are no unsent data.
-	 */
-	if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
-	    tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
-	    tcp_is_sack(tp) && !tcp_send_head(sk))
-		return true;
-
 	return false;
 }
 
-- 
cgit v1.2.3


From a665ece8b471de45bc19af05d52a1eaa5bc06dca Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 12 Jan 2017 13:23:31 +0200
Subject: x86/platform/intel: Remove PMIC GPIO block support

Moorestown support was removed by commit:

  1a8359e411eb ("x86/mid: Remove Intel Moorestown")

Remove this leftover.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Darren Hart <dvhart@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: platform-driver-x86@vger.kernel.org
Link: http://lkml.kernel.org/r/20170112112331.93236-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/platform/intel-mid/sfi.c      |   1 -
 drivers/platform/x86/Kconfig           |   7 -
 drivers/platform/x86/Makefile          |   1 -
 drivers/platform/x86/intel_pmic_gpio.c | 326 ---------------------------------
 include/linux/intel_pmic_gpio.h        |  15 --
 5 files changed, 350 deletions(-)
 delete mode 100644 drivers/platform/x86/intel_pmic_gpio.c
 delete mode 100644 include/linux/intel_pmic_gpio.h

(limited to 'include/linux')

diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c
index ce1303830231..19b43e3a9f0f 100644
--- a/arch/x86/platform/intel-mid/sfi.c
+++ b/arch/x86/platform/intel-mid/sfi.c
@@ -15,7 +15,6 @@
 #include <linux/interrupt.h>
 #include <linux/scatterlist.h>
 #include <linux/sfi.h>
-#include <linux/intel_pmic_gpio.h>
 #include <linux/spi/spi.h>
 #include <linux/i2c.h>
 #include <linux/skbuff.h>
diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 5fe8be089b8b..fe5a3da3682f 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -816,13 +816,6 @@ config INTEL_SCU_IPC_UTIL
 	  low level access for debug work and updating the firmware. Say
 	  N unless you will be doing this on an Intel MID platform.
 
-config GPIO_INTEL_PMIC
-	bool "Intel PMIC GPIO support"
-	depends on INTEL_SCU_IPC && GPIOLIB
-	---help---
-	  Say Y here to support GPIO via the SCU IPC interface
-	  on Intel MID platforms.
-
 config INTEL_MID_POWER_BUTTON
 	tristate "power button driver for Intel MID platforms"
 	depends on INTEL_SCU_IPC && INPUT
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index d4111f0f8a78..b2f52a7690af 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -50,7 +50,6 @@ obj-$(CONFIG_INTEL_SCU_IPC)	+= intel_scu_ipc.o
 obj-$(CONFIG_INTEL_SCU_IPC_UTIL) += intel_scu_ipcutil.o
 obj-$(CONFIG_INTEL_MFLD_THERMAL) += intel_mid_thermal.o
 obj-$(CONFIG_INTEL_IPS)		+= intel_ips.o
-obj-$(CONFIG_GPIO_INTEL_PMIC)	+= intel_pmic_gpio.o
 obj-$(CONFIG_XO1_RFKILL)	+= xo1-rfkill.o
 obj-$(CONFIG_XO15_EBOOK)	+= xo15-ebook.o
 obj-$(CONFIG_IBM_RTL)		+= ibm_rtl.o
diff --git a/drivers/platform/x86/intel_pmic_gpio.c b/drivers/platform/x86/intel_pmic_gpio.c
deleted file mode 100644
index 91ae58510d92..000000000000
--- a/drivers/platform/x86/intel_pmic_gpio.c
+++ /dev/null
@@ -1,326 +0,0 @@
-/* Moorestown PMIC GPIO (access through IPC) driver
- * Copyright (c) 2008 - 2009, Intel Corporation.
- *
- * Author: Alek Du <alek.du@intel.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-/* Supports:
- * Moorestown platform PMIC chip
- */
-
-#define pr_fmt(fmt) "%s: " fmt, __func__
-
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/delay.h>
-#include <linux/stddef.h>
-#include <linux/slab.h>
-#include <linux/ioport.h>
-#include <linux/init.h>
-#include <linux/io.h>
-#include <linux/gpio/driver.h>
-#include <asm/intel_scu_ipc.h>
-#include <linux/device.h>
-#include <linux/intel_pmic_gpio.h>
-#include <linux/platform_device.h>
-
-#define DRIVER_NAME "pmic_gpio"
-
-/* register offset that IPC driver should use
- * 8 GPIO + 8 GPOSW (6 controllable) + 8GPO
- */
-enum pmic_gpio_register {
-	GPIO0		= 0xE0,
-	GPIO7		= 0xE7,
-	GPIOINT		= 0xE8,
-	GPOSWCTL0	= 0xEC,
-	GPOSWCTL5	= 0xF1,
-	GPO		= 0xF4,
-};
-
-/* bits definition for GPIO & GPOSW */
-#define GPIO_DRV 0x01
-#define GPIO_DIR 0x02
-#define GPIO_DIN 0x04
-#define GPIO_DOU 0x08
-#define GPIO_INTCTL 0x30
-#define GPIO_DBC 0xc0
-
-#define GPOSW_DRV 0x01
-#define GPOSW_DOU 0x08
-#define GPOSW_RDRV 0x30
-
-#define GPIO_UPDATE_TYPE	0x80000000
-
-#define NUM_GPIO 24
-
-struct pmic_gpio {
-	struct mutex		buslock;
-	struct gpio_chip	chip;
-	void			*gpiointr;
-	int			irq;
-	unsigned		irq_base;
-	unsigned int		update_type;
-	u32			trigger_type;
-};
-
-static void pmic_program_irqtype(int gpio, int type)
-{
-	if (type & IRQ_TYPE_EDGE_RISING)
-		intel_scu_ipc_update_register(GPIO0 + gpio, 0x20, 0x20);
-	else
-		intel_scu_ipc_update_register(GPIO0 + gpio, 0x00, 0x20);
-
-	if (type & IRQ_TYPE_EDGE_FALLING)
-		intel_scu_ipc_update_register(GPIO0 + gpio, 0x10, 0x10);
-	else
-		intel_scu_ipc_update_register(GPIO0 + gpio, 0x00, 0x10);
-};
-
-static int pmic_gpio_direction_input(struct gpio_chip *chip, unsigned offset)
-{
-	if (offset >= 8) {
-		pr_err("only pin 0-7 support input\n");
-		return -1;/* we only have 8 GPIO can use as input */
-	}
-	return intel_scu_ipc_update_register(GPIO0 + offset,
-							GPIO_DIR, GPIO_DIR);
-}
-
-static int pmic_gpio_direction_output(struct gpio_chip *chip,
-			unsigned offset, int value)
-{
-	int rc = 0;
-
-	if (offset < 8)/* it is GPIO */
-		rc = intel_scu_ipc_update_register(GPIO0 + offset,
-				GPIO_DRV | (value ? GPIO_DOU : 0),
-				GPIO_DRV | GPIO_DOU | GPIO_DIR);
-	else if (offset < 16)/* it is GPOSW */
-		rc = intel_scu_ipc_update_register(GPOSWCTL0 + offset - 8,
-				GPOSW_DRV | (value ? GPOSW_DOU : 0),
-				GPOSW_DRV | GPOSW_DOU | GPOSW_RDRV);
-	else if (offset > 15 && offset < 24)/* it is GPO */
-		rc = intel_scu_ipc_update_register(GPO,
-				value ? 1 << (offset - 16) : 0,
-				1 << (offset - 16));
-	else {
-		pr_err("invalid PMIC GPIO pin %d!\n", offset);
-		WARN_ON(1);
-	}
-
-	return rc;
-}
-
-static int pmic_gpio_get(struct gpio_chip *chip, unsigned offset)
-{
-	u8 r;
-	int ret;
-
-	/* we only have 8 GPIO pins we can use as input */
-	if (offset >= 8)
-		return -EOPNOTSUPP;
-	ret = intel_scu_ipc_ioread8(GPIO0 + offset, &r);
-	if (ret < 0)
-		return ret;
-	return r & GPIO_DIN;
-}
-
-static void pmic_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
-{
-	if (offset < 8)/* it is GPIO */
-		intel_scu_ipc_update_register(GPIO0 + offset,
-			GPIO_DRV | (value ? GPIO_DOU : 0),
-			GPIO_DRV | GPIO_DOU);
-	else if (offset < 16)/* it is GPOSW */
-		intel_scu_ipc_update_register(GPOSWCTL0 + offset - 8,
-			GPOSW_DRV | (value ? GPOSW_DOU : 0),
-			GPOSW_DRV | GPOSW_DOU | GPOSW_RDRV);
-	else if (offset > 15 && offset < 24) /* it is GPO */
-		intel_scu_ipc_update_register(GPO,
-			value ? 1 << (offset - 16) : 0,
-			1 << (offset - 16));
-}
-
-/*
- * This is called from genirq with pg->buslock locked and
- * irq_desc->lock held. We can not access the scu bus here, so we
- * store the change and update in the bus_sync_unlock() function below
- */
-static int pmic_irq_type(struct irq_data *data, unsigned type)
-{
-	struct pmic_gpio *pg = irq_data_get_irq_chip_data(data);
-	u32 gpio = data->irq - pg->irq_base;
-
-	if (gpio >= pg->chip.ngpio)
-		return -EINVAL;
-
-	pg->trigger_type = type;
-	pg->update_type = gpio | GPIO_UPDATE_TYPE;
-	return 0;
-}
-
-static int pmic_gpio_to_irq(struct gpio_chip *chip, unsigned offset)
-{
-	struct pmic_gpio *pg = gpiochip_get_data(chip);
-
-	return pg->irq_base + offset;
-}
-
-static void pmic_bus_lock(struct irq_data *data)
-{
-	struct pmic_gpio *pg = irq_data_get_irq_chip_data(data);
-
-	mutex_lock(&pg->buslock);
-}
-
-static void pmic_bus_sync_unlock(struct irq_data *data)
-{
-	struct pmic_gpio *pg = irq_data_get_irq_chip_data(data);
-
-	if (pg->update_type) {
-		unsigned int gpio = pg->update_type & ~GPIO_UPDATE_TYPE;
-
-		pmic_program_irqtype(gpio, pg->trigger_type);
-		pg->update_type = 0;
-	}
-	mutex_unlock(&pg->buslock);
-}
-
-/* the gpiointr register is read-clear, so just do nothing. */
-static void pmic_irq_unmask(struct irq_data *data) { }
-
-static void pmic_irq_mask(struct irq_data *data) { }
-
-static struct irq_chip pmic_irqchip = {
-	.name			= "PMIC-GPIO",
-	.irq_mask		= pmic_irq_mask,
-	.irq_unmask		= pmic_irq_unmask,
-	.irq_set_type		= pmic_irq_type,
-	.irq_bus_lock		= pmic_bus_lock,
-	.irq_bus_sync_unlock	= pmic_bus_sync_unlock,
-};
-
-static irqreturn_t pmic_irq_handler(int irq, void *data)
-{
-	struct pmic_gpio *pg = data;
-	u8 intsts = *((u8 *)pg->gpiointr + 4);
-	int gpio;
-	irqreturn_t ret = IRQ_NONE;
-
-	for (gpio = 0; gpio < 8; gpio++) {
-		if (intsts & (1 << gpio)) {
-			pr_debug("pmic pin %d triggered\n", gpio);
-			generic_handle_irq(pg->irq_base + gpio);
-			ret = IRQ_HANDLED;
-		}
-	}
-	return ret;
-}
-
-static int platform_pmic_gpio_probe(struct platform_device *pdev)
-{
-	struct device *dev = &pdev->dev;
-	int irq = platform_get_irq(pdev, 0);
-	struct intel_pmic_gpio_platform_data *pdata = dev->platform_data;
-
-	struct pmic_gpio *pg;
-	int retval;
-	int i;
-
-	if (irq < 0) {
-		dev_dbg(dev, "no IRQ line\n");
-		return -EINVAL;
-	}
-
-	if (!pdata || !pdata->gpio_base || !pdata->irq_base) {
-		dev_dbg(dev, "incorrect or missing platform data\n");
-		return -EINVAL;
-	}
-
-	pg = kzalloc(sizeof(*pg), GFP_KERNEL);
-	if (!pg)
-		return -ENOMEM;
-
-	dev_set_drvdata(dev, pg);
-
-	pg->irq = irq;
-	/* setting up SRAM mapping for GPIOINT register */
-	pg->gpiointr = ioremap_nocache(pdata->gpiointr, 8);
-	if (!pg->gpiointr) {
-		pr_err("Can not map GPIOINT\n");
-		retval = -EINVAL;
-		goto err2;
-	}
-	pg->irq_base = pdata->irq_base;
-	pg->chip.label = "intel_pmic";
-	pg->chip.direction_input = pmic_gpio_direction_input;
-	pg->chip.direction_output = pmic_gpio_direction_output;
-	pg->chip.get = pmic_gpio_get;
-	pg->chip.set = pmic_gpio_set;
-	pg->chip.to_irq = pmic_gpio_to_irq;
-	pg->chip.base = pdata->gpio_base;
-	pg->chip.ngpio = NUM_GPIO;
-	pg->chip.can_sleep = 1;
-	pg->chip.parent = dev;
-
-	mutex_init(&pg->buslock);
-
-	pg->chip.parent = dev;
-	retval = gpiochip_add_data(&pg->chip, pg);
-	if (retval) {
-		pr_err("Can not add pmic gpio chip\n");
-		goto err;
-	}
-
-	retval = request_irq(pg->irq, pmic_irq_handler, 0, "pmic", pg);
-	if (retval) {
-		pr_warn("Interrupt request failed\n");
-		goto fail_request_irq;
-	}
-
-	for (i = 0; i < 8; i++) {
-		irq_set_chip_and_handler_name(i + pg->irq_base,
-					      &pmic_irqchip,
-					      handle_simple_irq,
-					      "demux");
-		irq_set_chip_data(i + pg->irq_base, pg);
-	}
-	return 0;
-
-fail_request_irq:
-	gpiochip_remove(&pg->chip);
-err:
-	iounmap(pg->gpiointr);
-err2:
-	kfree(pg);
-	return retval;
-}
-
-/* at the same time, register a platform driver
- * this supports the sfi 0.81 fw */
-static struct platform_driver platform_pmic_gpio_driver = {
-	.driver = {
-		.name		= DRIVER_NAME,
-	},
-	.probe		= platform_pmic_gpio_probe,
-};
-
-static int __init platform_pmic_gpio_init(void)
-{
-	return platform_driver_register(&platform_pmic_gpio_driver);
-}
-subsys_initcall(platform_pmic_gpio_init);
diff --git a/include/linux/intel_pmic_gpio.h b/include/linux/intel_pmic_gpio.h
deleted file mode 100644
index 920109a29191..000000000000
--- a/include/linux/intel_pmic_gpio.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef LINUX_INTEL_PMIC_H
-#define LINUX_INTEL_PMIC_H
-
-struct intel_pmic_gpio_platform_data {
-	/* the first IRQ of the chip */
-	unsigned	irq_base;
-	/* number assigned to the first GPIO */
-	unsigned	gpio_base;
-	/* sram address for gpiointr register, the langwell chip will map
-	 * the PMIC spi GPIO expander's GPIOINTR register in sram.
-	 */
-	unsigned	gpiointr;
-};
-
-#endif
-- 
cgit v1.2.3


From 5b485629ba0d5d027880769ff467c587b24b4bde Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Sun, 8 Jan 2017 23:58:09 +0900
Subject: kprobes, extable: Identify kprobes trampolines as kernel text area

Improve __kernel_text_address()/kernel_text_address() to return
true if the given address is on a kprobe's instruction slot
trampoline.

This can help stacktraces to determine the address is on a
text area or not.

To implement this atomically in is_kprobe_*_slot(), also change
the insn_cache page list to an RCU list.

This changes timings a bit (it delays page freeing to the RCU garbage
collection phase), but none of that is in the hot path.

Note: this change can add small overhead to stack unwinders because
it adds 2 additional checks to __kernel_text_address(). However, the
impact should be very small, because kprobe_insn_pages list has 1 entry
per 256 probes(on x86, on arm/arm64 it will be 1024 probes),
and kprobe_optinsn_pages has 1 entry per 32 probes(on x86).
In most use cases, the number of kprobe events may be less
than 20, which means that is_kprobe_*_slot() will check just one entry.

Tested-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/148388747896.6869.6354262871751682264.stgit@devbox
[ Improved the changelog and coding style. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/kprobes.h | 30 +++++++++++++++++++-
 kernel/extable.c        |  9 +++++-
 kernel/kprobes.c        | 73 ++++++++++++++++++++++++++++++++++++-------------
 3 files changed, 91 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 8f6849084248..16ddfb8b304a 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -278,9 +278,13 @@ struct kprobe_insn_cache {
 	int nr_garbage;
 };
 
+#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
 extern kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c);
 extern void __free_insn_slot(struct kprobe_insn_cache *c,
 			     kprobe_opcode_t *slot, int dirty);
+/* sleep-less address checking routine  */
+extern bool __is_insn_slot_addr(struct kprobe_insn_cache *c,
+				unsigned long addr);
 
 #define DEFINE_INSN_CACHE_OPS(__name)					\
 extern struct kprobe_insn_cache kprobe_##__name##_slots;		\
@@ -294,6 +298,18 @@ static inline void free_##__name##_slot(kprobe_opcode_t *slot, int dirty)\
 {									\
 	__free_insn_slot(&kprobe_##__name##_slots, slot, dirty);	\
 }									\
+									\
+static inline bool is_kprobe_##__name##_slot(unsigned long addr)	\
+{									\
+	return __is_insn_slot_addr(&kprobe_##__name##_slots, addr);	\
+}
+#else /* __ARCH_WANT_KPROBES_INSN_SLOT */
+#define DEFINE_INSN_CACHE_OPS(__name)					\
+static inline bool is_kprobe_##__name##_slot(unsigned long addr)	\
+{									\
+	return 0;							\
+}
+#endif
 
 DEFINE_INSN_CACHE_OPS(insn);
 
@@ -330,7 +346,6 @@ extern int proc_kprobes_optimization_handler(struct ctl_table *table,
 					     int write, void __user *buffer,
 					     size_t *length, loff_t *ppos);
 #endif
-
 #endif /* CONFIG_OPTPROBES */
 #ifdef CONFIG_KPROBES_ON_FTRACE
 extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
@@ -481,6 +496,19 @@ static inline int enable_jprobe(struct jprobe *jp)
 	return enable_kprobe(&jp->kp);
 }
 
+#ifndef CONFIG_KPROBES
+static inline bool is_kprobe_insn_slot(unsigned long addr)
+{
+	return false;
+}
+#endif
+#ifndef CONFIG_OPTPROBES
+static inline bool is_kprobe_optinsn_slot(unsigned long addr)
+{
+	return false;
+}
+#endif
+
 #ifdef CONFIG_KPROBES
 /*
  * Blacklist ganerating macro. Specify functions which is not probed
diff --git a/kernel/extable.c b/kernel/extable.c
index e3beec4a2339..e1359474baa5 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -20,6 +20,7 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/init.h>
+#include <linux/kprobes.h>
 
 #include <asm/sections.h>
 #include <linux/uaccess.h>
@@ -104,6 +105,8 @@ int __kernel_text_address(unsigned long addr)
 		return 1;
 	if (is_ftrace_trampoline(addr))
 		return 1;
+	if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
+		return 1;
 	/*
 	 * There might be init symbols in saved stacktraces.
 	 * Give those symbols a chance to be printed in
@@ -123,7 +126,11 @@ int kernel_text_address(unsigned long addr)
 		return 1;
 	if (is_module_text_address(addr))
 		return 1;
-	return is_ftrace_trampoline(addr);
+	if (is_ftrace_trampoline(addr))
+		return 1;
+	if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
+		return 1;
+	return 0;
 }
 
 /*
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 43460104f119..ebb4dadca66b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -149,9 +149,11 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
 	struct kprobe_insn_page *kip;
 	kprobe_opcode_t *slot = NULL;
 
+	/* Since the slot array is not protected by rcu, we need a mutex */
 	mutex_lock(&c->mutex);
  retry:
-	list_for_each_entry(kip, &c->pages, list) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(kip, &c->pages, list) {
 		if (kip->nused < slots_per_page(c)) {
 			int i;
 			for (i = 0; i < slots_per_page(c); i++) {
@@ -159,6 +161,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
 					kip->slot_used[i] = SLOT_USED;
 					kip->nused++;
 					slot = kip->insns + (i * c->insn_size);
+					rcu_read_unlock();
 					goto out;
 				}
 			}
@@ -167,6 +170,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
 			WARN_ON(1);
 		}
 	}
+	rcu_read_unlock();
 
 	/* If there are any garbage slots, collect it and try again. */
 	if (c->nr_garbage && collect_garbage_slots(c) == 0)
@@ -193,7 +197,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
 	kip->nused = 1;
 	kip->ngarbage = 0;
 	kip->cache = c;
-	list_add(&kip->list, &c->pages);
+	list_add_rcu(&kip->list, &c->pages);
 	slot = kip->insns;
 out:
 	mutex_unlock(&c->mutex);
@@ -213,7 +217,8 @@ static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
 		 * next time somebody inserts a probe.
 		 */
 		if (!list_is_singular(&kip->list)) {
-			list_del(&kip->list);
+			list_del_rcu(&kip->list);
+			synchronize_rcu();
 			kip->cache->free(kip->insns);
 			kfree(kip);
 		}
@@ -235,8 +240,7 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c)
 			continue;
 		kip->ngarbage = 0;	/* we will collect all garbages */
 		for (i = 0; i < slots_per_page(c); i++) {
-			if (kip->slot_used[i] == SLOT_DIRTY &&
-			    collect_one_slot(kip, i))
+			if (kip->slot_used[i] == SLOT_DIRTY && collect_one_slot(kip, i))
 				break;
 		}
 	}
@@ -248,29 +252,60 @@ void __free_insn_slot(struct kprobe_insn_cache *c,
 		      kprobe_opcode_t *slot, int dirty)
 {
 	struct kprobe_insn_page *kip;
+	long idx;
 
 	mutex_lock(&c->mutex);
-	list_for_each_entry(kip, &c->pages, list) {
-		long idx = ((long)slot - (long)kip->insns) /
-				(c->insn_size * sizeof(kprobe_opcode_t));
-		if (idx >= 0 && idx < slots_per_page(c)) {
-			WARN_ON(kip->slot_used[idx] != SLOT_USED);
-			if (dirty) {
-				kip->slot_used[idx] = SLOT_DIRTY;
-				kip->ngarbage++;
-				if (++c->nr_garbage > slots_per_page(c))
-					collect_garbage_slots(c);
-			} else
-				collect_one_slot(kip, idx);
+	rcu_read_lock();
+	list_for_each_entry_rcu(kip, &c->pages, list) {
+		idx = ((long)slot - (long)kip->insns) /
+			(c->insn_size * sizeof(kprobe_opcode_t));
+		if (idx >= 0 && idx < slots_per_page(c))
 			goto out;
-		}
 	}
-	/* Could not free this slot. */
+	/* Could not find this slot. */
 	WARN_ON(1);
+	kip = NULL;
 out:
+	rcu_read_unlock();
+	/* Mark and sweep: this may sleep */
+	if (kip) {
+		/* Check double free */
+		WARN_ON(kip->slot_used[idx] != SLOT_USED);
+		if (dirty) {
+			kip->slot_used[idx] = SLOT_DIRTY;
+			kip->ngarbage++;
+			if (++c->nr_garbage > slots_per_page(c))
+				collect_garbage_slots(c);
+		} else {
+			collect_one_slot(kip, idx);
+		}
+	}
 	mutex_unlock(&c->mutex);
 }
 
+/*
+ * Check given address is on the page of kprobe instruction slots.
+ * This will be used for checking whether the address on a stack
+ * is on a text area or not.
+ */
+bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr)
+{
+	struct kprobe_insn_page *kip;
+	bool ret = false;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(kip, &c->pages, list) {
+		if (addr >= (unsigned long)kip->insns &&
+		    addr < (unsigned long)kip->insns + PAGE_SIZE) {
+			ret = true;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
 #ifdef CONFIG_OPTPROBES
 /* For optimized_kprobe buffer */
 struct kprobe_insn_cache kprobe_optinsn_slots = {
-- 
cgit v1.2.3


From c31cc6a5187e8b09ccee34f81728a90f80e872e7 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 5 Jan 2017 18:11:43 +0100
Subject: sched/cputime: Allow accounting system time using cpustat index

In order to prepare for CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y to delay
cputime accounting to the tick, let's provide APIs to account system
time to precise contexts: hardirq, softirq, pure system, ...

Inspired-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1483636310-6557-4-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/kernel_stat.h |  2 ++
 kernel/sched/cputime.c      | 10 +++++-----
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 00f776816aa3..14b63bb714d4 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -80,6 +80,8 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu)
 
 extern void account_user_time(struct task_struct *, cputime_t);
 extern void account_system_time(struct task_struct *, int, cputime_t);
+extern void account_system_index_time(struct task_struct *, cputime_t,
+				      enum cpu_usage_stat);
 extern void account_steal_time(cputime_t);
 extern void account_idle_time(cputime_t);
 
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 7700a9cba335..aad42835938c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -176,8 +176,8 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime)
  * @cputime: the cpu time spent in kernel space since the last update
  * @index: pointer to cpustat field that has to be updated
  */
-static inline
-void __account_system_time(struct task_struct *p, cputime_t cputime, int index)
+void account_system_index_time(struct task_struct *p,
+			       cputime_t cputime, enum cpu_usage_stat index)
 {
 	/* Add system time to process. */
 	p->stime += cputime;
@@ -213,7 +213,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 	else
 		index = CPUTIME_SYSTEM;
 
-	__account_system_time(p, cputime, index);
+	account_system_index_time(p, cputime, index);
 }
 
 /*
@@ -400,7 +400,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 		 * So, we have to handle it separately here.
 		 * Also, p->stime needs to be updated for ksoftirqd.
 		 */
-		__account_system_time(p, cputime, CPUTIME_SOFTIRQ);
+		account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
 	} else if (user_tick) {
 		account_user_time(p, cputime);
 	} else if (p == rq->idle) {
@@ -408,7 +408,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 	} else if (p->flags & PF_VCPU) { /* System time or guest time */
 		account_guest_time(p, cputime);
 	} else {
-		__account_system_time(p, cputime, CPUTIME_SYSTEM);
+		account_system_index_time(p, cputime, CPUTIME_SYSTEM);
 	}
 }
 
-- 
cgit v1.2.3


From 1213699ab426608ff1925ab263dd6925102bb92a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 5 Jan 2017 18:11:44 +0100
Subject: sched/cputime: Export account_guest_time()

In order to prepare for CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y to delay
cputime accounting to the tick, let's allow archs to account cputime
directly to gtime.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1483636310-6557-5-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/kernel_stat.h | 1 +
 kernel/sched/cputime.c      | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 14b63bb714d4..cfd6c0c6d4e8 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -79,6 +79,7 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu)
 }
 
 extern void account_user_time(struct task_struct *, cputime_t);
+extern void account_guest_time(struct task_struct *, cputime_t);
 extern void account_system_time(struct task_struct *, int, cputime_t);
 extern void account_system_index_time(struct task_struct *, cputime_t,
 				      enum cpu_usage_stat);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index aad42835938c..5813ee4a5168 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -151,7 +151,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
  * @p: the process that the cpu time gets accounted to
  * @cputime: the cpu time spent in virtual machine since the last update
  */
-static void account_guest_time(struct task_struct *p, cputime_t cputime)
+void account_guest_time(struct task_struct *p, cputime_t cputime)
 {
 	u64 *cpustat = kcpustat_this_cpu->cpustat;
 
-- 
cgit v1.2.3


From c8d7dabf8f91fadd265e6eb87afb201d14ea299b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 5 Jan 2017 18:11:50 +0100
Subject: sched/cputime: Rename vtime_account_user() to vtime_flush()

CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y used to accumulate user time and
account it on ticks and context switches only through the
vtime_account_user() function.

Now this model has been generalized on the 3 archs for all kind of
cputime (system, irq, ...) and all the cputime flushing happens under
vtime_account_user().

So let's rename this function to better reflect its new role.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1483636310-6557-11-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/ia64/kernel/time.c     | 2 +-
 arch/powerpc/kernel/time.c  | 6 ++----
 arch/s390/kernel/vtime.c    | 2 +-
 include/linux/kernel_stat.h | 2 +-
 include/linux/vtime.h       | 7 +++++--
 kernel/sched/cputime.c      | 4 +---
 6 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 37f1b315d9b6..d040f12ea9f9 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -61,7 +61,7 @@ static struct clocksource *itc_clocksource;
 
 extern cputime_t cycle_to_cputime(u64 cyc);
 
-void vtime_account_user(struct task_struct *tsk)
+void vtime_flush(struct task_struct *tsk)
 {
 	struct thread_info *ti = task_thread_info(tsk);
 	cputime_t delta;
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 4255e6930ac1..02e97305d22b 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -382,15 +382,13 @@ void vtime_account_idle(struct task_struct *tsk)
 }
 
 /*
- * Transfer the user time accumulated in the paca
- * by the exception entry and exit code to the generic
- * process user time records.
+ * Account the whole cputime accumulated in the paca
  * Must be called with interrupts disabled.
  * Assumes that vtime_account_system/idle() has been called
  * recently (i.e. since the last entry from usermode) so that
  * get_paca()->user_time_scaled is up to date.
  */
-void vtime_account_user(struct task_struct *tsk)
+void vtime_flush(struct task_struct *tsk)
 {
 	struct cpu_accounting_data *acct = get_accounting(tsk);
 
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index 1a53e0bdc90a..0a9e5d67547d 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -214,7 +214,7 @@ void vtime_task_switch(struct task_struct *prev)
  * accounting system time in order to correctly compute
  * the stolen time accounting.
  */
-void vtime_account_user(struct task_struct *tsk)
+void vtime_flush(struct task_struct *tsk)
 {
 	if (do_account_vtime(tsk))
 		virt_timer_expire();
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index cfd6c0c6d4e8..c3e38ded2d73 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -89,7 +89,7 @@ extern void account_idle_time(cputime_t);
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 static inline void account_process_tick(struct task_struct *tsk, int user)
 {
-	vtime_account_user(tsk);
+	vtime_flush(tsk);
 }
 #else
 extern void account_process_tick(struct task_struct *, int user);
diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index aa9bfea8804a..0681fe25abeb 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -58,27 +58,28 @@ static inline void vtime_task_switch(struct task_struct *prev)
 
 extern void vtime_account_system(struct task_struct *tsk);
 extern void vtime_account_idle(struct task_struct *tsk);
-extern void vtime_account_user(struct task_struct *tsk);
 
 #else /* !CONFIG_VIRT_CPU_ACCOUNTING */
 
 static inline void vtime_task_switch(struct task_struct *prev) { }
 static inline void vtime_account_system(struct task_struct *tsk) { }
-static inline void vtime_account_user(struct task_struct *tsk) { }
 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 extern void arch_vtime_task_switch(struct task_struct *tsk);
+extern void vtime_account_user(struct task_struct *tsk);
 extern void vtime_user_enter(struct task_struct *tsk);
 
 static inline void vtime_user_exit(struct task_struct *tsk)
 {
 	vtime_account_user(tsk);
 }
+
 extern void vtime_guest_enter(struct task_struct *tsk);
 extern void vtime_guest_exit(struct task_struct *tsk);
 extern void vtime_init_idle(struct task_struct *tsk, int cpu);
 #else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN  */
+static inline void vtime_account_user(struct task_struct *tsk) { }
 static inline void vtime_user_enter(struct task_struct *tsk) { }
 static inline void vtime_user_exit(struct task_struct *tsk) { }
 static inline void vtime_guest_enter(struct task_struct *tsk) { }
@@ -93,9 +94,11 @@ static inline void vtime_account_irq_exit(struct task_struct *tsk)
 	/* On hard|softirq exit we always account to hard|softirq cputime */
 	vtime_account_system(tsk);
 }
+extern void vtime_flush(struct task_struct *tsk);
 #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 static inline void vtime_account_irq_enter(struct task_struct *tsk) { }
 static inline void vtime_account_irq_exit(struct task_struct *tsk) { }
+static inline void vtime_flush(struct task_struct *tsk) { }
 #endif
 
 
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 5813ee4a5168..f7c14cc71d06 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -437,9 +437,7 @@ void vtime_common_task_switch(struct task_struct *prev)
 	else
 		vtime_account_system(prev);
 
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	vtime_account_user(prev);
-#endif
+	vtime_flush(prev);
 	arch_vtime_task_switch(prev);
 }
 #endif
-- 
cgit v1.2.3


From 642fa448ae6b3a4e5e8737054a094173405b7643 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 3 Jan 2017 13:43:14 -0800
Subject: sched/core: Remove set_task_state()

This is a nasty interface and setting the state of a foreign task must
not be done. As of the following commit:

  be628be0956 ("bcache: Make gc wakeup sane, remove set_task_state()")

... everyone in the kernel calls set_task_state() with current, allowing
the helper to be removed.

However, as the comment indicates, it is still around for those archs
where computing current is more expensive than using a pointer, at least
in theory. An important arch that is affected is arm64, however this has
been addressed now [1] and performance is up to par making no difference
with either calls.

Of all the callers, if any, it's the locking bits that would care most
about this -- ie: we end up passing a tsk pointer to a lot of the lock
slowpath, and setting ->state on that. The following numbers are based
on two tests: a custom ad-hoc microbenchmark that just measures
latencies (for ~65 million calls) between get_task_state() vs
get_current_state().

Secondly for a higher overview, an unlink microbenchmark was used,
which pounds on a single file with open, close,unlink combos with
increasing thread counts (up to 4x ncpus). While the workload is quite
unrealistic, it does contend a lot on the inode mutex or now rwsem.

[1] https://lkml.kernel.org/r/1483468021-8237-1-git-send-email-mark.rutland@arm.com

== 1. x86-64 ==

Avg runtime set_task_state():    601 msecs
Avg runtime set_current_state(): 552 msecs

                                            vanilla                 dirty
Hmean    unlink1-processes-2      36089.26 (  0.00%)    38977.33 (  8.00%)
Hmean    unlink1-processes-5      28555.01 (  0.00%)    29832.55 (  4.28%)
Hmean    unlink1-processes-8      37323.75 (  0.00%)    44974.57 ( 20.50%)
Hmean    unlink1-processes-12     43571.88 (  0.00%)    44283.01 (  1.63%)
Hmean    unlink1-processes-21     34431.52 (  0.00%)    38284.45 ( 11.19%)
Hmean    unlink1-processes-30     34813.26 (  0.00%)    37975.17 (  9.08%)
Hmean    unlink1-processes-48     37048.90 (  0.00%)    39862.78 (  7.59%)
Hmean    unlink1-processes-79     35630.01 (  0.00%)    36855.30 (  3.44%)
Hmean    unlink1-processes-110    36115.85 (  0.00%)    39843.91 ( 10.32%)
Hmean    unlink1-processes-141    32546.96 (  0.00%)    35418.52 (  8.82%)
Hmean    unlink1-processes-172    34674.79 (  0.00%)    36899.21 (  6.42%)
Hmean    unlink1-processes-203    37303.11 (  0.00%)    36393.04 ( -2.44%)
Hmean    unlink1-processes-224    35712.13 (  0.00%)    36685.96 (  2.73%)

== 2. ppc64le ==

Avg runtime set_task_state():  938 msecs
Avg runtime set_current_state: 940 msecs

                                            vanilla                 dirty
Hmean    unlink1-processes-2      19269.19 (  0.00%)    30704.50 ( 59.35%)
Hmean    unlink1-processes-5      20106.15 (  0.00%)    21804.15 (  8.45%)
Hmean    unlink1-processes-8      17496.97 (  0.00%)    17243.28 ( -1.45%)
Hmean    unlink1-processes-12     14224.15 (  0.00%)    17240.21 ( 21.20%)
Hmean    unlink1-processes-21     14155.66 (  0.00%)    15681.23 ( 10.78%)
Hmean    unlink1-processes-30     14450.70 (  0.00%)    15995.83 ( 10.69%)
Hmean    unlink1-processes-48     16945.57 (  0.00%)    16370.42 ( -3.39%)
Hmean    unlink1-processes-79     15788.39 (  0.00%)    14639.27 ( -7.28%)
Hmean    unlink1-processes-110    14268.48 (  0.00%)    14377.40 (  0.76%)
Hmean    unlink1-processes-141    14023.65 (  0.00%)    16271.69 ( 16.03%)
Hmean    unlink1-processes-172    13417.62 (  0.00%)    16067.55 ( 19.75%)
Hmean    unlink1-processes-203    15293.08 (  0.00%)    15440.40 (  0.96%)
Hmean    unlink1-processes-234    13719.32 (  0.00%)    16190.74 ( 18.01%)
Hmean    unlink1-processes-265    16400.97 (  0.00%)    16115.22 ( -1.74%)
Hmean    unlink1-processes-296    14388.60 (  0.00%)    16216.13 ( 12.70%)
Hmean    unlink1-processes-320    15771.85 (  0.00%)    15905.96 (  0.85%)

x86-64 (known to be fast for get_current()/this_cpu_read_stable() caching)
and ppc64 (with paca) show similar improvements in the unlink microbenches.
The small delta for ppc64 (2ms), does not represent the gains on the unlink
runs. In the case of x86, there was a decent amount of variation in the
latency runs, but always within a 20 to 50ms increase), ppc was more constant.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dave@stgolabs.net
Cc: mark.rutland@arm.com
Link: http://lkml.kernel.org/r/1483479794-14013-5-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/um/drivers/random.c                           |  2 +-
 drivers/md/dm-bufio.c                              |  2 +-
 drivers/md/dm-crypt.c                              |  4 ++--
 drivers/md/persistent-data/dm-block-manager.c      |  4 ++--
 .../staging/lustre/lnet/libcfs/linux/linux-debug.c |  2 +-
 drivers/tty/tty_ldsem.c                            | 10 ++++----
 include/linux/sched.h                              | 27 +---------------------
 kernel/exit.c                                      |  4 ++--
 kernel/locking/mutex.c                             |  8 +++----
 kernel/locking/rwsem-spinlock.c                    |  8 +++----
 kernel/locking/rwsem-xadd.c                        |  4 ++--
 kernel/locking/semaphore.c                         |  2 +-
 12 files changed, 26 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c
index 05523f14d7b2..57f03050c850 100644
--- a/arch/um/drivers/random.c
+++ b/arch/um/drivers/random.c
@@ -76,7 +76,7 @@ static ssize_t rng_dev_read (struct file *filp, char __user *buf, size_t size,
 			add_sigio_fd(random_fd);
 
 			add_wait_queue(&host_read_wait, &wait);
-			set_task_state(current, TASK_INTERRUPTIBLE);
+			set_current_state(TASK_INTERRUPTIBLE);
 
 			schedule();
 			remove_wait_queue(&host_read_wait, &wait);
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 84d2f0e4c754..d36d427a9efb 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -794,7 +794,7 @@ static void __wait_for_free_buffer(struct dm_bufio_client *c)
 	DECLARE_WAITQUEUE(wait, current);
 
 	add_wait_queue(&c->free_buffer_wait, &wait);
-	set_task_state(current, TASK_UNINTERRUPTIBLE);
+	set_current_state(TASK_UNINTERRUPTIBLE);
 	dm_bufio_unlock(c);
 
 	io_schedule();
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 7c6c57216bf2..96692d13a6e4 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1210,14 +1210,14 @@ continue_locked:
 		spin_unlock_irq(&cc->write_thread_wait.lock);
 
 		if (unlikely(kthread_should_stop())) {
-			set_task_state(current, TASK_RUNNING);
+			set_current_state(TASK_RUNNING);
 			remove_wait_queue(&cc->write_thread_wait, &wait);
 			break;
 		}
 
 		schedule();
 
-		set_task_state(current, TASK_RUNNING);
+		set_current_state(TASK_RUNNING);
 		spin_lock_irq(&cc->write_thread_wait.lock);
 		__remove_wait_queue(&cc->write_thread_wait, &wait);
 		goto continue_locked;
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index a6dde7cab458..758d90cc2733 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -120,7 +120,7 @@ static int __check_holder(struct block_lock *lock)
 static void __wait(struct waiter *w)
 {
 	for (;;) {
-		set_task_state(current, TASK_UNINTERRUPTIBLE);
+		set_current_state(TASK_UNINTERRUPTIBLE);
 
 		if (!w->task)
 			break;
@@ -128,7 +128,7 @@ static void __wait(struct waiter *w)
 		schedule();
 	}
 
-	set_task_state(current, TASK_RUNNING);
+	set_current_state(TASK_RUNNING);
 }
 
 static void __wake_waiter(struct waiter *w)
diff --git a/drivers/staging/lustre/lnet/libcfs/linux/linux-debug.c b/drivers/staging/lustre/lnet/libcfs/linux/linux-debug.c
index 39a72e3f0c18..7035356e56b3 100644
--- a/drivers/staging/lustre/lnet/libcfs/linux/linux-debug.c
+++ b/drivers/staging/lustre/lnet/libcfs/linux/linux-debug.c
@@ -107,7 +107,7 @@ void __noreturn lbug_with_loc(struct libcfs_debug_msg_data *msgdata)
 		libcfs_debug_dumplog();
 	if (libcfs_panic_on_lbug)
 		panic("LBUG");
-	set_task_state(current, TASK_UNINTERRUPTIBLE);
+	set_current_state(TASK_UNINTERRUPTIBLE);
 	while (1)
 		schedule();
 }
diff --git a/drivers/tty/tty_ldsem.c b/drivers/tty/tty_ldsem.c
index 3e6722954f29..9229de43e19d 100644
--- a/drivers/tty/tty_ldsem.c
+++ b/drivers/tty/tty_ldsem.c
@@ -231,7 +231,7 @@ down_read_failed(struct ld_semaphore *sem, long count, long timeout)
 
 	/* wait to be given the lock */
 	for (;;) {
-		set_task_state(current, TASK_UNINTERRUPTIBLE);
+		set_current_state(TASK_UNINTERRUPTIBLE);
 
 		if (!waiter.task)
 			break;
@@ -240,7 +240,7 @@ down_read_failed(struct ld_semaphore *sem, long count, long timeout)
 		timeout = schedule_timeout(timeout);
 	}
 
-	__set_task_state(current, TASK_RUNNING);
+	__set_current_state(TASK_RUNNING);
 
 	if (!timeout) {
 		/* lock timed out but check if this task was just
@@ -289,14 +289,14 @@ down_write_failed(struct ld_semaphore *sem, long count, long timeout)
 
 	waiter.task = current;
 
-	set_task_state(current, TASK_UNINTERRUPTIBLE);
+	set_current_state(TASK_UNINTERRUPTIBLE);
 	for (;;) {
 		if (!timeout)
 			break;
 		raw_spin_unlock_irq(&sem->wait_lock);
 		timeout = schedule_timeout(timeout);
 		raw_spin_lock_irq(&sem->wait_lock);
-		set_task_state(current, TASK_UNINTERRUPTIBLE);
+		set_current_state(TASK_UNINTERRUPTIBLE);
 		locked = writer_trylock(sem);
 		if (locked)
 			break;
@@ -307,7 +307,7 @@ down_write_failed(struct ld_semaphore *sem, long count, long timeout)
 	list_del(&waiter.list);
 	raw_spin_unlock_irq(&sem->wait_lock);
 
-	__set_task_state(current, TASK_RUNNING);
+	__set_current_state(TASK_RUNNING);
 
 	/* lock wait may have timed out */
 	if (!locked)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ad3ec9ec61f7..f4f9d32f12b3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -227,7 +227,7 @@ extern void proc_sched_set_task(struct task_struct *p);
 extern char ___assert_task_state[1 - 2*!!(
 		sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
 
-/* Convenience macros for the sake of set_task_state */
+/* Convenience macros for the sake of set_current_state */
 #define TASK_KILLABLE		(TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
 #define TASK_STOPPED		(TASK_WAKEKILL | __TASK_STOPPED)
 #define TASK_TRACED		(TASK_WAKEKILL | __TASK_TRACED)
@@ -254,17 +254,6 @@ extern char ___assert_task_state[1 - 2*!!(
 
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 
-#define __set_task_state(tsk, state_value)			\
-	do {							\
-		(tsk)->task_state_change = _THIS_IP_;		\
-		(tsk)->state = (state_value);			\
-	} while (0)
-#define set_task_state(tsk, state_value)			\
-	do {							\
-		(tsk)->task_state_change = _THIS_IP_;		\
-		smp_store_mb((tsk)->state, (state_value));	\
-	} while (0)
-
 #define __set_current_state(state_value)			\
 	do {							\
 		current->task_state_change = _THIS_IP_;		\
@@ -277,20 +266,6 @@ extern char ___assert_task_state[1 - 2*!!(
 	} while (0)
 
 #else
-
-/*
- * @tsk had better be current, or you get to keep the pieces.
- *
- * The only reason is that computing current can be more expensive than
- * using a pointer that's already available.
- *
- * Therefore, see set_current_state().
- */
-#define __set_task_state(tsk, state_value)		\
-	do { (tsk)->state = (state_value); } while (0)
-#define set_task_state(tsk, state_value)		\
-	smp_store_mb((tsk)->state, (state_value))
-
 /*
  * set_current_state() includes a barrier so that the write of current->state
  * is correctly serialised wrt the caller's subsequent test of whether to
diff --git a/kernel/exit.c b/kernel/exit.c
index 2385d434a46e..27c68653e2fc 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -501,12 +501,12 @@ static void exit_mm(void)
 			complete(&core_state->startup);
 
 		for (;;) {
-			set_task_state(current, TASK_UNINTERRUPTIBLE);
+			set_current_state(TASK_UNINTERRUPTIBLE);
 			if (!self.task) /* see coredump_finish() */
 				break;
 			freezable_schedule();
 		}
-		__set_task_state(current, TASK_RUNNING);
+		__set_current_state(TASK_RUNNING);
 		down_read(&mm->mmap_sem);
 	}
 	atomic_inc(&mm->mm_count);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 4c7d04362c95..97d142486f93 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -666,7 +666,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 
 	lock_contended(&lock->dep_map, ip);
 
-	set_task_state(current, state);
+	set_current_state(state);
 	for (;;) {
 		/*
 		 * Once we hold wait_lock, we're serialized against
@@ -701,7 +701,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 			__mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
 		}
 
-		set_task_state(current, state);
+		set_current_state(state);
 		/*
 		 * Here we order against unlock; we must either see it change
 		 * state back to RUNNING and fall through the next schedule(),
@@ -715,7 +715,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	}
 	spin_lock_mutex(&lock->wait_lock, flags);
 acquired:
-	__set_task_state(current, TASK_RUNNING);
+	__set_current_state(TASK_RUNNING);
 
 	mutex_remove_waiter(lock, &waiter, current);
 	if (likely(list_empty(&lock->wait_list)))
@@ -735,7 +735,7 @@ skip_wait:
 	return 0;
 
 err:
-	__set_task_state(current, TASK_RUNNING);
+	__set_current_state(TASK_RUNNING);
 	mutex_remove_waiter(lock, &waiter, current);
 	spin_unlock_mutex(&lock->wait_lock, flags);
 	debug_mutex_free_waiter(&waiter);
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 60d15d3bb2a8..5eacab880f67 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -139,7 +139,7 @@ void __sched __down_read(struct rw_semaphore *sem)
 		goto out;
 	}
 
-	set_task_state(current, TASK_UNINTERRUPTIBLE);
+	set_current_state(TASK_UNINTERRUPTIBLE);
 
 	/* set up my own style of waitqueue */
 	waiter.task = current;
@@ -156,10 +156,10 @@ void __sched __down_read(struct rw_semaphore *sem)
 		if (!waiter.task)
 			break;
 		schedule();
-		set_task_state(current, TASK_UNINTERRUPTIBLE);
+		set_current_state(TASK_UNINTERRUPTIBLE);
 	}
 
-	__set_task_state(current, TASK_RUNNING);
+	__set_current_state(TASK_RUNNING);
  out:
 	;
 }
@@ -216,7 +216,7 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state)
 			ret = -EINTR;
 			goto out;
 		}
-		set_task_state(current, state);
+		set_current_state(state);
 		raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
 		schedule();
 		raw_spin_lock_irqsave(&sem->wait_lock, flags);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index d3b819c7f07f..a3a381aee24b 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -253,13 +253,13 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
 
 	/* wait to be given the lock */
 	while (true) {
-		set_task_state(current, TASK_UNINTERRUPTIBLE);
+		set_current_state(TASK_UNINTERRUPTIBLE);
 		if (!waiter.task)
 			break;
 		schedule();
 	}
 
-	__set_task_state(current, TASK_RUNNING);
+	__set_current_state(TASK_RUNNING);
 	return sem;
 }
 EXPORT_SYMBOL(rwsem_down_read_failed);
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 29aac9f9e5e4..9512e37637dc 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -215,7 +215,7 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
 			goto interrupted;
 		if (unlikely(timeout <= 0))
 			goto timed_out;
-		__set_task_state(current, state);
+		__set_current_state(state);
 		raw_spin_unlock_irq(&sem->lock);
 		timeout = schedule_timeout(timeout);
 		raw_spin_lock_irq(&sem->lock);
-- 
cgit v1.2.3


From 8f95c90ceb541a38ac16fec48c05142ef1450c25 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Wed, 11 Jan 2017 07:22:25 -0800
Subject: sched/wait, RCU: Introduce rcuwait machinery

rcuwait provides support for (single) RCU-safe task wait/wake functionality,
with the caveat that it must not be called after exit_notify(), such that
we avoid racing with rcu delayed_put_task_struct callbacks, task_struct
being rcu unaware in this context -- for which we similarly have
task_rcu_dereference() magic, but with different return semantics, which
can conflict with the wakeup side.

The interfaces are quite straightforward:

  rcuwait_wait_event()
  rcuwait_wake_up()

More details are in the comments, but it's perhaps worth mentioning at least,
that users must provide proper serialization when waiting on a condition, and
avoid corrupting a concurrent waiter. Also care must be taken between the task
and the condition for when calling the wakeup -- we cannot miss wakeups. When
porting users, this is for example, a given when using waitqueues in that
everything is done under the q->lock. As such, it can remove sources of non
preemptable unbounded work for realtime.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dave@stgolabs.net
Link: http://lkml.kernel.org/r/1484148146-14210-2-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/rcuwait.h | 63 +++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/exit.c           | 30 +++++++++++++++++++++++
 2 files changed, 93 insertions(+)
 create mode 100644 include/linux/rcuwait.h

(limited to 'include/linux')

diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h
new file mode 100644
index 000000000000..0e93d56c7ab2
--- /dev/null
+++ b/include/linux/rcuwait.h
@@ -0,0 +1,63 @@
+#ifndef _LINUX_RCUWAIT_H_
+#define _LINUX_RCUWAIT_H_
+
+#include <linux/rcupdate.h>
+
+/*
+ * rcuwait provides a way of blocking and waking up a single
+ * task in an rcu-safe manner; where it is forbidden to use
+ * after exit_notify(). task_struct is not properly rcu protected,
+ * unless dealing with rcu-aware lists, ie: find_task_by_*().
+ *
+ * Alternatively we have task_rcu_dereference(), but the return
+ * semantics have different implications which would break the
+ * wakeup side. The only time @task is non-nil is when a user is
+ * blocked (or checking if it needs to) on a condition, and reset
+ * as soon as we know that the condition has succeeded and are
+ * awoken.
+ */
+struct rcuwait {
+	struct task_struct *task;
+};
+
+#define __RCUWAIT_INITIALIZER(name)		\
+	{ .task = NULL, }
+
+static inline void rcuwait_init(struct rcuwait *w)
+{
+	w->task = NULL;
+}
+
+extern void rcuwait_wake_up(struct rcuwait *w);
+
+/*
+ * The caller is responsible for locking around rcuwait_wait_event(),
+ * such that writes to @task are properly serialized.
+ */
+#define rcuwait_wait_event(w, condition)				\
+({									\
+	/*								\
+	 * Complain if we are called after do_exit()/exit_notify(),     \
+	 * as we cannot rely on the rcu critical region for the		\
+	 * wakeup side.							\
+	 */                                                             \
+	WARN_ON(current->exit_state);                                   \
+									\
+	rcu_assign_pointer((w)->task, current);				\
+	for (;;) {							\
+		/*							\
+		 * Implicit barrier (A) pairs with (B) in		\
+		 * rcuwait_trywake().					\
+		 */							\
+		set_current_state(TASK_UNINTERRUPTIBLE);		\
+		if (condition)						\
+			break;						\
+									\
+		schedule();						\
+	}								\
+									\
+	WRITE_ONCE((w)->task, NULL);					\
+	__set_current_state(TASK_RUNNING);				\
+})
+
+#endif /* _LINUX_RCUWAIT_H_ */
diff --git a/kernel/exit.c b/kernel/exit.c
index 27c68653e2fc..a9441da69e29 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -55,6 +55,7 @@
 #include <linux/shm.h>
 #include <linux/kcov.h>
 #include <linux/random.h>
+#include <linux/rcuwait.h>
 
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
@@ -282,6 +283,35 @@ retry:
 	return task;
 }
 
+void rcuwait_wake_up(struct rcuwait *w)
+{
+	struct task_struct *task;
+
+	rcu_read_lock();
+
+	/*
+	 * Order condition vs @task, such that everything prior to the load
+	 * of @task is visible. This is the condition as to why the user called
+	 * rcuwait_trywake() in the first place. Pairs with set_current_state()
+	 * barrier (A) in rcuwait_wait_event().
+	 *
+	 *    WAIT                WAKE
+	 *    [S] tsk = current	  [S] cond = true
+	 *        MB (A)	      MB (B)
+	 *    [L] cond		  [L] tsk
+	 */
+	smp_rmb(); /* (B) */
+
+	/*
+	 * Avoid using task_rcu_dereference() magic as long as we are careful,
+	 * see comment in rcuwait_wait_event() regarding ->exit_state.
+	 */
+	task = rcu_dereference(w->task);
+	if (task)
+		wake_up_process(task);
+	rcu_read_unlock();
+}
+
 struct task_struct *try_get_task_struct(struct task_struct **ptask)
 {
 	struct task_struct *task;
-- 
cgit v1.2.3


From 52b94129f274937e4c25dd17b76697664a3c43c9 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Wed, 11 Jan 2017 07:22:26 -0800
Subject: locking/percpu-rwsem: Replace waitqueue with rcuwait

The use of any kind of wait queue is an overkill for pcpu-rwsems.
While one option would be to use the less heavy simple (swait)
flavor, this is still too much for what pcpu-rwsems needs. For one,
we do not care about any sort of queuing in that the only (rare) time
writers (and readers, for that matter) are queued is when trying to
acquire the regular contended rw_sem. There cannot be any further
queuing as writers are serialized by the rw_sem in the first place.

Given that percpu_down_write() must not be called after exit_notify(),
we can replace the bulky waitqueue with rcuwait such that a writer
can wait for its turn to take the lock. As such, we can avoid the
queue handling and locking overhead.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dave@stgolabs.net
Link: http://lkml.kernel.org/r/1484148146-14210-3-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/percpu-rwsem.h  | 8 ++++----
 kernel/locking/percpu-rwsem.c | 7 +++----
 2 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 5b2e6159b744..93664f022ecf 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -4,15 +4,15 @@
 #include <linux/atomic.h>
 #include <linux/rwsem.h>
 #include <linux/percpu.h>
-#include <linux/wait.h>
+#include <linux/rcuwait.h>
 #include <linux/rcu_sync.h>
 #include <linux/lockdep.h>
 
 struct percpu_rw_semaphore {
 	struct rcu_sync		rss;
 	unsigned int __percpu	*read_count;
-	struct rw_semaphore	rw_sem;
-	wait_queue_head_t	writer;
+	struct rw_semaphore	rw_sem; /* slowpath */
+	struct rcuwait          writer; /* blocked writer */
 	int			readers_block;
 };
 
@@ -22,7 +22,7 @@ static struct percpu_rw_semaphore name = {				\
 	.rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC),	\
 	.read_count = &__percpu_rwsem_rc_##name,			\
 	.rw_sem = __RWSEM_INITIALIZER(name.rw_sem),			\
-	.writer = __WAIT_QUEUE_HEAD_INITIALIZER(name.writer),		\
+	.writer = __RCUWAIT_INITIALIZER(name.writer),			\
 }
 
 extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index ce182599cf2e..883cf1b92d90 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -1,7 +1,6 @@
 #include <linux/atomic.h>
 #include <linux/rwsem.h>
 #include <linux/percpu.h>
-#include <linux/wait.h>
 #include <linux/lockdep.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/rcupdate.h>
@@ -18,7 +17,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
 	/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
 	rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
 	__init_rwsem(&sem->rw_sem, name, rwsem_key);
-	init_waitqueue_head(&sem->writer);
+	rcuwait_init(&sem->writer);
 	sem->readers_block = 0;
 	return 0;
 }
@@ -103,7 +102,7 @@ void __percpu_up_read(struct percpu_rw_semaphore *sem)
 	__this_cpu_dec(*sem->read_count);
 
 	/* Prod writer to recheck readers_active */
-	wake_up(&sem->writer);
+	rcuwait_wake_up(&sem->writer);
 }
 EXPORT_SYMBOL_GPL(__percpu_up_read);
 
@@ -160,7 +159,7 @@ void percpu_down_write(struct percpu_rw_semaphore *sem)
 	 */
 
 	/* Wait for all now active readers to complete. */
-	wait_event(sem->writer, readers_active_check(sem));
+	rcuwait_wait_event(&sem->writer, readers_active_check(sem));
 }
 EXPORT_SYMBOL_GPL(percpu_down_write);
 
-- 
cgit v1.2.3


From e274795ea7b7caa0fd74ef651594382a69e2a951 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 11 Jan 2017 14:17:48 +0100
Subject: locking/mutex: Fix mutex handoff

While reviewing the ww_mutex patches, I noticed that it was still
possible to (incorrectly) succeed for (incorrect) code like:

	mutex_lock(&a);
	mutex_lock(&a);

This was possible if the second mutex_lock() would block (as expected)
but then receive a spurious wakeup. At that point it would find itself
at the front of the queue, request a handoff and instantly claim
ownership and continue, since owner would point to itself.

Avoid this scenario and simplify the code by introducing a third low
bit to signal handoff pickup. So once we request handoff, unlock
clears the handoff bit and sets the pickup bit along with the new
owner.

This also removes the need for the .handoff argument to
__mutex_trylock(), since that becomes superfluous with PICKUP.

In order to guarantee enough low bits, ensure task_struct alignment is
at least L1_CACHE_BYTES (which seems a good ideal regardless).

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 9d659ae14b54 ("locking/mutex: Add lock handoff to avoid starvation")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/mutex.h  |   2 +-
 kernel/fork.c          |   6 ++-
 kernel/locking/mutex.c | 108 ++++++++++++++++++++++++-------------------------
 3 files changed, 57 insertions(+), 59 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index b97870f2debd..3e1fccb47f11 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -65,7 +65,7 @@ struct mutex {
 
 static inline struct task_struct *__mutex_owner(struct mutex *lock)
 {
-	return (struct task_struct *)(atomic_long_read(&lock->owner) & ~0x03);
+	return (struct task_struct *)(atomic_long_read(&lock->owner) & ~0x07);
 }
 
 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index 11c5c8ab827c..a90510d0bbf8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -432,11 +432,13 @@ void __init fork_init(void)
 	int i;
 #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
 #ifndef ARCH_MIN_TASKALIGN
-#define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
+#define ARCH_MIN_TASKALIGN	0
 #endif
+	int align = min_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
+
 	/* create a slab on which task_structs can be allocated */
 	task_struct_cachep = kmem_cache_create("task_struct",
-			arch_task_struct_size, ARCH_MIN_TASKALIGN,
+			arch_task_struct_size, align,
 			SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL);
 #endif
 
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 97d142486f93..24284497c425 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -50,16 +50,17 @@ EXPORT_SYMBOL(__mutex_init);
 /*
  * @owner: contains: 'struct task_struct *' to the current lock owner,
  * NULL means not owned. Since task_struct pointers are aligned at
- * ARCH_MIN_TASKALIGN (which is at least sizeof(void *)), we have low
- * bits to store extra state.
+ * at least L1_CACHE_BYTES, we have low bits to store extra state.
  *
  * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup.
  * Bit1 indicates unlock needs to hand the lock to the top-waiter
+ * Bit2 indicates handoff has been done and we're waiting for pickup.
  */
 #define MUTEX_FLAG_WAITERS	0x01
 #define MUTEX_FLAG_HANDOFF	0x02
+#define MUTEX_FLAG_PICKUP	0x04
 
-#define MUTEX_FLAGS		0x03
+#define MUTEX_FLAGS		0x07
 
 static inline struct task_struct *__owner_task(unsigned long owner)
 {
@@ -72,38 +73,29 @@ static inline unsigned long __owner_flags(unsigned long owner)
 }
 
 /*
- * Actual trylock that will work on any unlocked state.
- *
- * When setting the owner field, we must preserve the low flag bits.
- *
- * Be careful with @handoff, only set that in a wait-loop (where you set
- * HANDOFF) to avoid recursive lock attempts.
+ * Trylock variant that retuns the owning task on failure.
  */
-static inline bool __mutex_trylock(struct mutex *lock, const bool handoff)
+static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock)
 {
 	unsigned long owner, curr = (unsigned long)current;
 
 	owner = atomic_long_read(&lock->owner);
 	for (;;) { /* must loop, can race against a flag */
 		unsigned long old, flags = __owner_flags(owner);
+		unsigned long task = owner & ~MUTEX_FLAGS;
+
+		if (task) {
+			if (likely(task != curr))
+				break;
+
+			if (likely(!(flags & MUTEX_FLAG_PICKUP)))
+				break;
 
-		if (__owner_task(owner)) {
-			if (handoff && unlikely(__owner_task(owner) == current)) {
-				/*
-				 * Provide ACQUIRE semantics for the lock-handoff.
-				 *
-				 * We cannot easily use load-acquire here, since
-				 * the actual load is a failed cmpxchg, which
-				 * doesn't imply any barriers.
-				 *
-				 * Also, this is a fairly unlikely scenario, and
-				 * this contains the cost.
-				 */
-				smp_mb(); /* ACQUIRE */
-				return true;
-			}
-
-			return false;
+			flags &= ~MUTEX_FLAG_PICKUP;
+		} else {
+#ifdef CONFIG_DEBUG_MUTEXES
+			DEBUG_LOCKS_WARN_ON(flags & MUTEX_FLAG_PICKUP);
+#endif
 		}
 
 		/*
@@ -111,15 +103,24 @@ static inline bool __mutex_trylock(struct mutex *lock, const bool handoff)
 		 * past the point where we acquire it. This would be possible
 		 * if we (accidentally) set the bit on an unlocked mutex.
 		 */
-		if (handoff)
-			flags &= ~MUTEX_FLAG_HANDOFF;
+		flags &= ~MUTEX_FLAG_HANDOFF;
 
 		old = atomic_long_cmpxchg_acquire(&lock->owner, owner, curr | flags);
 		if (old == owner)
-			return true;
+			return NULL;
 
 		owner = old;
 	}
+
+	return __owner_task(owner);
+}
+
+/*
+ * Actual trylock that will work on any unlocked state.
+ */
+static inline bool __mutex_trylock(struct mutex *lock)
+{
+	return !__mutex_trylock_or_owner(lock);
 }
 
 #ifndef CONFIG_DEBUG_LOCK_ALLOC
@@ -171,9 +172,9 @@ static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_wait
 
 /*
  * Give up ownership to a specific task, when @task = NULL, this is equivalent
- * to a regular unlock. Clears HANDOFF, preserves WAITERS. Provides RELEASE
- * semantics like a regular unlock, the __mutex_trylock() provides matching
- * ACQUIRE semantics for the handoff.
+ * to a regular unlock. Sets PICKUP on a handoff, clears HANDOF, preserves
+ * WAITERS. Provides RELEASE semantics like a regular unlock, the
+ * __mutex_trylock() provides a matching ACQUIRE semantics for the handoff.
  */
 static void __mutex_handoff(struct mutex *lock, struct task_struct *task)
 {
@@ -184,10 +185,13 @@ static void __mutex_handoff(struct mutex *lock, struct task_struct *task)
 
 #ifdef CONFIG_DEBUG_MUTEXES
 		DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current);
+		DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP);
 #endif
 
 		new = (owner & MUTEX_FLAG_WAITERS);
 		new |= (unsigned long)task;
+		if (task)
+			new |= MUTEX_FLAG_PICKUP;
 
 		old = atomic_long_cmpxchg_release(&lock->owner, owner, new);
 		if (old == owner)
@@ -435,8 +439,6 @@ static bool mutex_optimistic_spin(struct mutex *lock,
 				  struct ww_acquire_ctx *ww_ctx,
 				  const bool use_ww_ctx, const bool waiter)
 {
-	struct task_struct *task = current;
-
 	if (!waiter) {
 		/*
 		 * The purpose of the mutex_can_spin_on_owner() function is
@@ -476,24 +478,17 @@ static bool mutex_optimistic_spin(struct mutex *lock,
 				goto fail_unlock;
 		}
 
+		/* Try to acquire the mutex... */
+		owner = __mutex_trylock_or_owner(lock);
+		if (!owner)
+			break;
+
 		/*
-		 * If there's an owner, wait for it to either
+		 * There's an owner, wait for it to either
 		 * release the lock or go to sleep.
 		 */
-		owner = __mutex_owner(lock);
-		if (owner) {
-			if (waiter && owner == task) {
-				smp_mb(); /* ACQUIRE */
-				break;
-			}
-
-			if (!mutex_spin_on_owner(lock, owner))
-				goto fail_unlock;
-		}
-
-		/* Try to acquire the mutex if it is unlocked. */
-		if (__mutex_trylock(lock, waiter))
-			break;
+		if (!mutex_spin_on_owner(lock, owner))
+			goto fail_unlock;
 
 		/*
 		 * The cpu_relax() call is a compiler barrier which forces
@@ -637,7 +632,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	preempt_disable();
 	mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
 
-	if (__mutex_trylock(lock, false) ||
+	if (__mutex_trylock(lock) ||
 	    mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, false)) {
 		/* got the lock, yay! */
 		lock_acquired(&lock->dep_map, ip);
@@ -651,7 +646,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	/*
 	 * After waiting to acquire the wait_lock, try again.
 	 */
-	if (__mutex_trylock(lock, false))
+	if (__mutex_trylock(lock))
 		goto skip_wait;
 
 	debug_mutex_lock_common(lock, &waiter);
@@ -674,7 +669,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		 * before testing the error conditions to make sure we pick up
 		 * the handoff.
 		 */
-		if (__mutex_trylock(lock, first))
+		if (__mutex_trylock(lock))
 			goto acquired;
 
 		/*
@@ -707,8 +702,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		 * state back to RUNNING and fall through the next schedule(),
 		 * or we must see its unlock and acquire.
 		 */
-		if ((first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, true)) ||
-		     __mutex_trylock(lock, first))
+		if (__mutex_trylock(lock) ||
+		    (first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, true)))
 			break;
 
 		spin_lock_mutex(&lock->wait_lock, flags);
@@ -865,6 +860,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
 
 #ifdef CONFIG_DEBUG_MUTEXES
 		DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current);
+		DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP);
 #endif
 
 		if (owner & MUTEX_FLAG_HANDOFF)
@@ -1003,7 +999,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
  */
 int __sched mutex_trylock(struct mutex *lock)
 {
-	bool locked = __mutex_trylock(lock, false);
+	bool locked = __mutex_trylock(lock);
 
 	if (locked)
 		mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
-- 
cgit v1.2.3


From ea9e0fb8fe1bdfca81bd76052a5cce70bb053430 Mon Sep 17 00:00:00 2001
From: Nicolai Hähnle <Nicolai.Haehnle@amd.com>
Date: Wed, 21 Dec 2016 19:46:32 +0100
Subject: locking/ww_mutex: Set use_ww_ctx even when locking without a context
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We will add a new field to struct mutex_waiter.  This field must be
initialized for all waiters if any waiter uses the ww_use_ctx path.

So there is a trade-off: Keep ww_mutex locking without a context on
the faster non-use_ww_ctx path, at the cost of adding the
initialization to all mutex locks (including non-ww_mutexes), or avoid
the additional cost for non-ww_mutex locks, at the cost of adding
additional checks to the use_ww_ctx path.

We take the latter choice.  It may be worth eliminating the users of
ww_mutex_lock(lock, NULL), but there are a lot of them.

Signed-off-by: Nicolai Hähnle <Nicolai.Haehnle@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Maarten Lankhorst <dev@mblankhorst.nl>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dri-devel@lists.freedesktop.org
Link: http://lkml.kernel.org/r/1482346000-9927-5-git-send-email-nhaehnle@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/ww_mutex.h | 11 ++---------
 kernel/locking/mutex.c   | 29 +++++++++++++++++------------
 2 files changed, 19 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h
index 7b0066814fa0..5f2e8379baff 100644
--- a/include/linux/ww_mutex.h
+++ b/include/linux/ww_mutex.h
@@ -222,11 +222,7 @@ extern int __must_check __ww_mutex_lock_interruptible(struct ww_mutex *lock,
  */
 static inline int ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 {
-	if (ctx)
-		return __ww_mutex_lock(lock, ctx);
-
-	mutex_lock(&lock->base);
-	return 0;
+	return __ww_mutex_lock(lock, ctx);
 }
 
 /**
@@ -262,10 +258,7 @@ static inline int ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ct
 static inline int __must_check ww_mutex_lock_interruptible(struct ww_mutex *lock,
 							   struct ww_acquire_ctx *ctx)
 {
-	if (ctx)
-		return __ww_mutex_lock_interruptible(lock, ctx);
-	else
-		return mutex_lock_interruptible(&lock->base);
+	return __ww_mutex_lock_interruptible(lock, ctx);
 }
 
 /**
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 9ad03b9a5f7f..44a64c0a851a 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -469,7 +469,7 @@ static bool mutex_optimistic_spin(struct mutex *lock,
 	for (;;) {
 		struct task_struct *owner;
 
-		if (use_ww_ctx && ww_ctx->acquired > 0) {
+		if (use_ww_ctx && ww_ctx && ww_ctx->acquired > 0) {
 			struct ww_mutex *ww;
 
 			ww = container_of(lock, struct ww_mutex, base);
@@ -629,8 +629,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	struct ww_mutex *ww;
 	int ret;
 
-	if (use_ww_ctx) {
-		ww = container_of(lock, struct ww_mutex, base);
+	ww = container_of(lock, struct ww_mutex, base);
+
+	if (use_ww_ctx && ww_ctx) {
 		if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
 			return -EALREADY;
 	}
@@ -642,7 +643,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	    mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, false)) {
 		/* got the lock, yay! */
 		lock_acquired(&lock->dep_map, ip);
-		if (use_ww_ctx)
+		if (use_ww_ctx && ww_ctx)
 			ww_mutex_set_context_fastpath(ww, ww_ctx);
 		preempt_enable();
 		return 0;
@@ -688,7 +689,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 			goto err;
 		}
 
-		if (use_ww_ctx && ww_ctx->acquired > 0) {
+		if (use_ww_ctx && ww_ctx && ww_ctx->acquired > 0) {
 			ret = __ww_mutex_lock_check_stamp(lock, ww_ctx);
 			if (ret)
 				goto err;
@@ -728,7 +729,7 @@ skip_wait:
 	/* got the lock - cleanup and rejoice! */
 	lock_acquired(&lock->dep_map, ip);
 
-	if (use_ww_ctx)
+	if (use_ww_ctx && ww_ctx)
 		ww_mutex_set_context_slowpath(ww, ww_ctx);
 
 	spin_unlock_mutex(&lock->wait_lock, flags);
@@ -816,8 +817,9 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 
 	might_sleep();
 	ret =  __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE,
-				   0, &ctx->dep_map, _RET_IP_, ctx, 1);
-	if (!ret && ctx->acquired > 1)
+				   0, ctx ? &ctx->dep_map : NULL, _RET_IP_,
+				   ctx, 1);
+	if (!ret && ctx && ctx->acquired > 1)
 		return ww_mutex_deadlock_injection(lock, ctx);
 
 	return ret;
@@ -831,9 +833,10 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 
 	might_sleep();
 	ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE,
-				  0, &ctx->dep_map, _RET_IP_, ctx, 1);
+				  0, ctx ? &ctx->dep_map : NULL, _RET_IP_,
+				  ctx, 1);
 
-	if (!ret && ctx->acquired > 1)
+	if (!ret && ctx && ctx->acquired > 1)
 		return ww_mutex_deadlock_injection(lock, ctx);
 
 	return ret;
@@ -1021,7 +1024,8 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 	might_sleep();
 
 	if (__mutex_trylock_fast(&lock->base)) {
-		ww_mutex_set_context_fastpath(lock, ctx);
+		if (ctx)
+			ww_mutex_set_context_fastpath(lock, ctx);
 		return 0;
 	}
 
@@ -1035,7 +1039,8 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 	might_sleep();
 
 	if (__mutex_trylock_fast(&lock->base)) {
-		ww_mutex_set_context_fastpath(lock, ctx);
+		if (ctx)
+			ww_mutex_set_context_fastpath(lock, ctx);
 		return 0;
 	}
 
-- 
cgit v1.2.3


From c5470b22d1833241cae996d23a8a346ff8ec4d58 Mon Sep 17 00:00:00 2001
From: Nicolai Hähnle <Nicolai.Haehnle@amd.com>
Date: Wed, 21 Dec 2016 19:46:33 +0100
Subject: locking/ww_mutex: Remove the __ww_mutex_lock*() inline wrappers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Keep the documentation in the header file since there is no good place
for it in mutex.c: there are two rather different implementations with
different EXPORT_SYMBOLs for each function.

Signed-off-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <Nicolai.Haehnle@amd.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Maarten Lankhorst <dev@mblankhorst.nl>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dri-devel@lists.freedesktop.org
Link: http://lkml.kernel.org/r/1482346000-9927-6-git-send-email-nhaehnle@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/ww_mutex.h | 18 ++++--------------
 kernel/locking/mutex.c   | 16 ++++++++--------
 2 files changed, 12 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h
index 5f2e8379baff..c07528522bbc 100644
--- a/include/linux/ww_mutex.h
+++ b/include/linux/ww_mutex.h
@@ -186,11 +186,6 @@ static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx)
 #endif
 }
 
-extern int __must_check __ww_mutex_lock(struct ww_mutex *lock,
-					struct ww_acquire_ctx *ctx);
-extern int __must_check __ww_mutex_lock_interruptible(struct ww_mutex *lock,
-						      struct ww_acquire_ctx *ctx);
-
 /**
  * ww_mutex_lock - acquire the w/w mutex
  * @lock: the mutex to be acquired
@@ -220,10 +215,8 @@ extern int __must_check __ww_mutex_lock_interruptible(struct ww_mutex *lock,
  *
  * A mutex acquired with this function must be released with ww_mutex_unlock.
  */
-static inline int ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
-{
-	return __ww_mutex_lock(lock, ctx);
-}
+extern int __must_check ww_mutex_lock(struct ww_mutex *lock,
+				      struct ww_acquire_ctx *ctx);
 
 /**
  * ww_mutex_lock_interruptible - acquire the w/w mutex, interruptible
@@ -255,11 +248,8 @@ static inline int ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ct
  *
  * A mutex acquired with this function must be released with ww_mutex_unlock.
  */
-static inline int __must_check ww_mutex_lock_interruptible(struct ww_mutex *lock,
-							   struct ww_acquire_ctx *ctx)
-{
-	return __ww_mutex_lock_interruptible(lock, ctx);
-}
+extern int __must_check ww_mutex_lock_interruptible(struct ww_mutex *lock,
+						    struct ww_acquire_ctx *ctx);
 
 /**
  * ww_mutex_lock_slow - slowpath acquiring of the w/w mutex
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 44a64c0a851a..c696614a6b8b 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -811,7 +811,7 @@ ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 }
 
 int __sched
-__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 {
 	int ret;
 
@@ -824,10 +824,10 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(__ww_mutex_lock);
+EXPORT_SYMBOL_GPL(ww_mutex_lock);
 
 int __sched
-__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 {
 	int ret;
 
@@ -841,7 +841,7 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
+EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible);
 
 #endif
 
@@ -1019,7 +1019,7 @@ EXPORT_SYMBOL(mutex_trylock);
 
 #ifndef CONFIG_DEBUG_LOCK_ALLOC
 int __sched
-__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 {
 	might_sleep();
 
@@ -1031,10 +1031,10 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 
 	return __ww_mutex_lock_slowpath(lock, ctx);
 }
-EXPORT_SYMBOL(__ww_mutex_lock);
+EXPORT_SYMBOL(ww_mutex_lock);
 
 int __sched
-__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 {
 	might_sleep();
 
@@ -1046,7 +1046,7 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 
 	return __ww_mutex_lock_interruptible_slowpath(lock, ctx);
 }
-EXPORT_SYMBOL(__ww_mutex_lock_interruptible);
+EXPORT_SYMBOL(ww_mutex_lock_interruptible);
 
 #endif
 
-- 
cgit v1.2.3


From 6baa5c60a97d7f1a37a4b5ab5fb3a450e56e8b06 Mon Sep 17 00:00:00 2001
From: Nicolai Hähnle <Nicolai.Haehnle@amd.com>
Date: Wed, 21 Dec 2016 19:46:34 +0100
Subject: locking/ww_mutex: Add waiters in stamp order
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add regular waiters in stamp order. Keep adding waiters that have no
context in FIFO order and take care not to starve them.

While adding our task as a waiter, back off if we detect that there is
a waiter with a lower stamp in front of us.

Make sure to call lock_contended even when we back off early.

For w/w mutexes, being first in the wait list is only stable when
taking the lock without a context. Therefore, the purpose of the first
flag is split into two: 'first' remains to indicate whether we want to
spin optimistically, while 'handoff' indicates that we should be
prepared to accept a handoff.

For w/w locking with a context, we always accept handoffs after the
first schedule(), to handle the following sequence of events:

 1. Task #0 unlocks and hands off to Task #2 which is first in line

 2. Task #1 adds itself in front of Task #2

 3. Task #2 wakes up and must accept the handoff even though it is no
    longer first in line

Signed-off-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <Nicolai.Haehnle@amd.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Maarten Lankhorst <dev@mblankhorst.nl>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dri-devel@lists.freedesktop.org
Link: http://lkml.kernel.org/r/1482346000-9927-7-git-send-email-nhaehnle@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/mutex.h  |  3 ++
 kernel/locking/mutex.c | 76 +++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 72 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 3e1fccb47f11..e17942ffb3fc 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -20,6 +20,8 @@
 #include <linux/osq_lock.h>
 #include <linux/debug_locks.h>
 
+struct ww_acquire_ctx;
+
 /*
  * Simple, straightforward mutexes with strict semantics:
  *
@@ -75,6 +77,7 @@ static inline struct task_struct *__mutex_owner(struct mutex *lock)
 struct mutex_waiter {
 	struct list_head	list;
 	struct task_struct	*task;
+	struct ww_acquire_ctx	*ww_ctx;
 #ifdef CONFIG_DEBUG_MUTEXES
 	void			*magic;
 #endif
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index c696614a6b8b..d0f7628b5a3d 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -615,6 +615,52 @@ __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
 	return 0;
 }
 
+static inline int __sched
+__ww_mutex_add_waiter(struct mutex_waiter *waiter,
+		      struct mutex *lock,
+		      struct ww_acquire_ctx *ww_ctx)
+{
+	struct mutex_waiter *cur;
+	struct list_head *pos;
+
+	if (!ww_ctx) {
+		list_add_tail(&waiter->list, &lock->wait_list);
+		return 0;
+	}
+
+	/*
+	 * Add the waiter before the first waiter with a higher stamp.
+	 * Waiters without a context are skipped to avoid starving
+	 * them.
+	 */
+	pos = &lock->wait_list;
+	list_for_each_entry_reverse(cur, &lock->wait_list, list) {
+		if (!cur->ww_ctx)
+			continue;
+
+		if (__ww_ctx_stamp_after(ww_ctx, cur->ww_ctx)) {
+			/* Back off immediately if necessary. */
+			if (ww_ctx->acquired > 0) {
+#ifdef CONFIG_DEBUG_MUTEXES
+				struct ww_mutex *ww;
+
+				ww = container_of(lock, struct ww_mutex, base);
+				DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock);
+				ww_ctx->contending_lock = ww;
+#endif
+				return -EDEADLK;
+			}
+
+			break;
+		}
+
+		pos = &cur->list;
+	}
+
+	list_add_tail(&waiter->list, pos);
+	return 0;
+}
+
 /*
  * Lock a mutex (possibly interruptible), slowpath:
  */
@@ -659,15 +705,25 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	debug_mutex_lock_common(lock, &waiter);
 	debug_mutex_add_waiter(lock, &waiter, current);
 
-	/* add waiting tasks to the end of the waitqueue (FIFO): */
-	list_add_tail(&waiter.list, &lock->wait_list);
+	lock_contended(&lock->dep_map, ip);
+
+	if (!use_ww_ctx) {
+		/* add waiting tasks to the end of the waitqueue (FIFO): */
+		list_add_tail(&waiter.list, &lock->wait_list);
+	} else {
+		/* Add in stamp order, waking up waiters that must back off. */
+		ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx);
+		if (ret)
+			goto err_early_backoff;
+
+		waiter.ww_ctx = ww_ctx;
+	}
+
 	waiter.task = current;
 
 	if (__mutex_waiter_is_first(lock, &waiter))
 		__mutex_set_flag(lock, MUTEX_FLAG_WAITERS);
 
-	lock_contended(&lock->dep_map, ip);
-
 	set_current_state(state);
 	for (;;) {
 		/*
@@ -698,9 +754,14 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		spin_unlock_mutex(&lock->wait_lock, flags);
 		schedule_preempt_disabled();
 
-		if (!first && __mutex_waiter_is_first(lock, &waiter)) {
-			first = true;
-			__mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
+		/*
+		 * ww_mutex needs to always recheck its position since its waiter
+		 * list is not FIFO ordered.
+		 */
+		if ((use_ww_ctx && ww_ctx) || !first) {
+			first = __mutex_waiter_is_first(lock, &waiter);
+			if (first)
+				__mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
 		}
 
 		set_current_state(state);
@@ -739,6 +800,7 @@ skip_wait:
 err:
 	__set_current_state(TASK_RUNNING);
 	mutex_remove_waiter(lock, &waiter, current);
+err_early_backoff:
 	spin_unlock_mutex(&lock->wait_lock, flags);
 	debug_mutex_free_waiter(&waiter);
 	mutex_release(&lock->dep_map, 1, ip);
-- 
cgit v1.2.3


From 977625a693283dbb35b2a3f674bc0237f7347348 Mon Sep 17 00:00:00 2001
From: Nicolai Hähnle <Nicolai.Haehnle@amd.com>
Date: Wed, 21 Dec 2016 19:46:39 +0100
Subject: locking/mutex: Initialize mutex_waiter::ww_ctx with poison when
 debugging
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Help catch cases where mutex_lock is used directly on w/w mutexes, which
otherwise result in the w/w tasks reading uninitialized data.

Signed-off-by: Nicolai Hähnle <Nicolai.Haehnle@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Maarten Lankhorst <dev@mblankhorst.nl>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dri-devel@lists.freedesktop.org
Link: http://lkml.kernel.org/r/1482346000-9927-12-git-send-email-nhaehnle@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/poison.h | 1 +
 kernel/locking/mutex.c | 4 ++++
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/poison.h b/include/linux/poison.h
index 51334edec506..a39540326417 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -80,6 +80,7 @@
 /********** kernel/mutexes **********/
 #define MUTEX_DEBUG_INIT	0x11
 #define MUTEX_DEBUG_FREE	0x22
+#define MUTEX_POISON_WW_CTX	((void *) 0x500 + POISON_POINTER_DELTA)
 
 /********** lib/flex_array.c **********/
 #define FLEX_ARRAY_FREE	0x6c	/* for use-after-free poisoning */
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index cd8598aa0426..935116723a3d 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -785,6 +785,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	if (!use_ww_ctx) {
 		/* add waiting tasks to the end of the waitqueue (FIFO): */
 		list_add_tail(&waiter.list, &lock->wait_list);
+
+#ifdef CONFIG_DEBUG_MUTEXES
+		waiter.ww_ctx = MUTEX_POISON_WW_CTX;
+#endif
 	} else {
 		/* Add in stamp order, waking up waiters that must back off. */
 		ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx);
-- 
cgit v1.2.3


From 12907fbb1a691807bb0420a27126e15934cb7954 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 15 Dec 2016 11:44:28 +0100
Subject: sched/clock, clocksource: Add optional cs::mark_unstable() method

PeterZ reported that we'd fail to mark the TSC unstable when the
clocksource watchdog finds it unsuitable.

Allow a clocksource to run a custom action when its being marked
unstable and hook up the TSC unstable code.

Reported-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/tsc.c       | 11 +++++++++++
 include/linux/clocksource.h |  3 +++
 kernel/time/clocksource.c   |  4 ++++
 3 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index be3a49ee0356..c8174c815d83 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1106,6 +1106,16 @@ static u64 read_tsc(struct clocksource *cs)
 	return (u64)rdtsc_ordered();
 }
 
+static void tsc_cs_mark_unstable(struct clocksource *cs)
+{
+	if (tsc_unstable)
+		return;
+	tsc_unstable = 1;
+	clear_sched_clock_stable();
+	disable_sched_clock_irqtime();
+	pr_info("Marking TSC unstable due to clocksource watchdog\n");
+}
+
 /*
  * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
  */
@@ -1118,6 +1128,7 @@ static struct clocksource clocksource_tsc = {
 				  CLOCK_SOURCE_MUST_VERIFY,
 	.archdata               = { .vclock_mode = VCLOCK_TSC },
 	.resume			= tsc_resume,
+	.mark_unstable		= tsc_cs_mark_unstable,
 };
 
 void mark_tsc_unstable(char *reason)
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index e315d04a2fd9..cfc75848a35d 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -62,6 +62,8 @@ struct module;
  * @archdata:		arch-specific data
  * @suspend:		suspend function for the clocksource, if necessary
  * @resume:		resume function for the clocksource, if necessary
+ * @mark_unstable:	Optional function to inform the clocksource driver that
+ *			the watchdog marked the clocksource unstable
  * @owner:		module reference, must be set by clocksource in modules
  *
  * Note: This struct is not used in hotpathes of the timekeeping code
@@ -93,6 +95,7 @@ struct clocksource {
 	unsigned long flags;
 	void (*suspend)(struct clocksource *cs);
 	void (*resume)(struct clocksource *cs);
+	void (*mark_unstable)(struct clocksource *cs);
 
 	/* private: */
 #ifdef CONFIG_CLOCKSOURCE_WATCHDOG
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 665985b0a89a..93621ae718d3 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -141,6 +141,10 @@ static void __clocksource_unstable(struct clocksource *cs)
 {
 	cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
 	cs->flags |= CLOCK_SOURCE_UNSTABLE;
+
+	if (cs->mark_unstable)
+		cs->mark_unstable(cs);
+
 	if (finished_booting)
 		schedule_work(&watchdog_work);
 }
-- 
cgit v1.2.3


From 9881b024b7d7671f6a014091bc96506b89081802 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 15 Dec 2016 13:35:52 +0100
Subject: sched/clock: Delay switching sched_clock to stable

Currently we switch to the stable sched_clock if we guess the TSC is
usable, and then switch back to the unstable path if it turns out TSC
isn't stable during SMP bringup after all.

Delay switching to the stable path until after SMP bringup is
complete. This way we'll avoid switching during the time we detect the
worst of the TSC offences.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h |  5 +++++
 init/main.c           |  1 -
 kernel/sched/clock.c  | 50 ++++++++++++++++++++++----------------------------
 kernel/sched/core.c   |  4 ++++
 4 files changed, 31 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ad3ec9ec61f7..94a48bb58297 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2515,6 +2515,10 @@ extern u64 sched_clock_cpu(int cpu);
 extern void sched_clock_init(void);
 
 #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+static inline void sched_clock_init_late(void)
+{
+}
+
 static inline void sched_clock_tick(void)
 {
 }
@@ -2537,6 +2541,7 @@ static inline u64 local_clock(void)
 	return sched_clock();
 }
 #else
+extern void sched_clock_init_late(void);
 /*
  * Architectures can set this to 1 if they have specified
  * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
diff --git a/init/main.c b/init/main.c
index b0c9d6facef9..19228149386c 100644
--- a/init/main.c
+++ b/init/main.c
@@ -625,7 +625,6 @@ asmlinkage __visible void __init start_kernel(void)
 	numa_policy_init();
 	if (late_time_init)
 		late_time_init();
-	sched_clock_init();
 	calibrate_delay();
 	pidmap_init();
 	anon_vma_init();
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 5d6dd38b449c..b3466d4e0cc2 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -77,6 +77,11 @@ EXPORT_SYMBOL_GPL(sched_clock);
 
 __read_mostly int sched_clock_running;
 
+void sched_clock_init(void)
+{
+	sched_clock_running = 1;
+}
+
 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
 static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable);
 static int __sched_clock_stable_early;
@@ -96,12 +101,18 @@ void set_sched_clock_stable(void)
 {
 	__sched_clock_stable_early = 1;
 
-	smp_mb(); /* matches sched_clock_init() */
-
-	if (!sched_clock_running)
-		return;
+	smp_mb(); /* matches sched_clock_init_late() */
 
-	__set_sched_clock_stable();
+	/*
+	 * This really should only be called early (before
+	 * sched_clock_init_late()) when guestimating our sched_clock() is
+	 * solid.
+	 *
+	 * After that we test stability and we can negate our guess using
+	 * clear_sched_clock_stable, possibly from a watchdog.
+	 */
+	if (WARN_ON_ONCE(sched_clock_running == 2))
+		__set_sched_clock_stable();
 }
 
 static void __clear_sched_clock_stable(struct work_struct *work)
@@ -117,12 +128,10 @@ void clear_sched_clock_stable(void)
 {
 	__sched_clock_stable_early = 0;
 
-	smp_mb(); /* matches sched_clock_init() */
-
-	if (!sched_clock_running)
-		return;
+	smp_mb(); /* matches sched_clock_init_late() */
 
-	schedule_work(&sched_clock_work);
+	if (sched_clock_running == 2)
+		schedule_work(&sched_clock_work);
 }
 
 struct sched_clock_data {
@@ -143,20 +152,9 @@ static inline struct sched_clock_data *cpu_sdc(int cpu)
 	return &per_cpu(sched_clock_data, cpu);
 }
 
-void sched_clock_init(void)
+void sched_clock_init_late(void)
 {
-	u64 ktime_now = ktime_to_ns(ktime_get());
-	int cpu;
-
-	for_each_possible_cpu(cpu) {
-		struct sched_clock_data *scd = cpu_sdc(cpu);
-
-		scd->tick_raw = 0;
-		scd->tick_gtod = ktime_now;
-		scd->clock = ktime_now;
-	}
-
-	sched_clock_running = 1;
+	sched_clock_running = 2;
 
 	/*
 	 * Ensure that it is impossible to not do a static_key update.
@@ -362,11 +360,6 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
 #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
 
-void sched_clock_init(void)
-{
-	sched_clock_running = 1;
-}
-
 u64 sched_clock_cpu(int cpu)
 {
 	if (unlikely(!sched_clock_running))
@@ -374,6 +367,7 @@ u64 sched_clock_cpu(int cpu)
 
 	return sched_clock();
 }
+
 #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
 
 /*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a129b34b8206..96a4267e6020 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7498,6 +7498,7 @@ void __init sched_init_smp(void)
 	init_sched_dl_class();
 
 	sched_init_smt();
+	sched_clock_init_late();
 
 	sched_smp_initialized = true;
 }
@@ -7513,6 +7514,7 @@ early_initcall(migration_init);
 void __init sched_init_smp(void)
 {
 	sched_init_granularity();
+	sched_clock_init_late();
 }
 #endif /* CONFIG_SMP */
 
@@ -7556,6 +7558,8 @@ void __init sched_init(void)
 	int i, j;
 	unsigned long alloc_size = 0, ptr;
 
+	sched_clock_init();
+
 	for (i = 0; i < WAIT_TABLE_SIZE; i++)
 		init_waitqueue_head(bit_wait_table + i);
 
-- 
cgit v1.2.3


From 10ab56434f2f633a51e432ee8b7c29e12438e163 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 28 Oct 2016 12:58:10 -0400
Subject: sched/core: Separate out io_schedule_prepare() and
 io_schedule_finish()

Now that IO schedule accounting is done inside __schedule(),
io_schedule() can be split into three steps - prep, schedule, and
finish - where the schedule part doesn't need any special annotation.
This allows marking a sleep as iowait by simply wrapping an existing
blocking function with io_schedule_prepare() and io_schedule_finish().

Because task_struct->in_iowait is single bit, the caller of
io_schedule_prepare() needs to record and the pass its state to
io_schedule_finish() to be safe regarding nesting.  While this isn't
the prettiest, these functions are mostly gonna be used by core
functions and we don't want to use more space for ->in_iowait.

While at it, as it's simple to do now, reimplement io_schedule()
without unnecessarily going through io_schedule_timeout().

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: adilger.kernel@dilger.ca
Cc: jack@suse.com
Cc: kernel-team@fb.com
Cc: mingbo@fb.com
Cc: tytso@mit.edu
Link: http://lkml.kernel.org/r/1477673892-28940-3-git-send-email-tj@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h |  8 +++-----
 kernel/sched/core.c   | 33 ++++++++++++++++++++++++++++-----
 2 files changed, 31 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 94a48bb58297..a8daed914eef 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -461,12 +461,10 @@ extern signed long schedule_timeout_idle(signed long timeout);
 asmlinkage void schedule(void);
 extern void schedule_preempt_disabled(void);
 
+extern int __must_check io_schedule_prepare(void);
+extern void io_schedule_finish(int token);
 extern long io_schedule_timeout(long timeout);
-
-static inline void io_schedule(void)
-{
-	io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
-}
+extern void io_schedule(void);
 
 void __noreturn do_task_dead(void);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9fd37169b302..49ce1cb3d320 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5128,25 +5128,48 @@ out_irq:
 }
 EXPORT_SYMBOL_GPL(yield_to);
 
+int io_schedule_prepare(void)
+{
+	int old_iowait = current->in_iowait;
+
+	current->in_iowait = 1;
+	blk_schedule_flush_plug(current);
+
+	return old_iowait;
+}
+
+void io_schedule_finish(int token)
+{
+	current->in_iowait = token;
+}
+
 /*
  * This task is about to go to sleep on IO. Increment rq->nr_iowait so
  * that process accounting knows that this is a task in IO wait state.
  */
 long __sched io_schedule_timeout(long timeout)
 {
-	int old_iowait = current->in_iowait;
+	int token;
 	long ret;
 
-	current->in_iowait = 1;
-	blk_schedule_flush_plug(current);
-
+	token = io_schedule_prepare();
 	ret = schedule_timeout(timeout);
-	current->in_iowait = old_iowait;
+	io_schedule_finish(token);
 
 	return ret;
 }
 EXPORT_SYMBOL(io_schedule_timeout);
 
+void io_schedule(void)
+{
+	int token;
+
+	token = io_schedule_prepare();
+	schedule();
+	io_schedule_finish(token);
+}
+EXPORT_SYMBOL(io_schedule);
+
 /**
  * sys_sched_get_priority_max - return maximum RT priority.
  * @policy: scheduling class.
-- 
cgit v1.2.3


From 1460cb65a10f6c7a6e3a1c76513338861a0a43b6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 28 Oct 2016 12:58:11 -0400
Subject: locking/mutex, sched/wait: Add mutex_lock_io()

We sometimes end up propagating IO blocking through mutexes; however,
because there currently is no way of annotating mutex sleeps as
iowait, there are cases where iowait and /proc/stat:procs_blocked
report misleading numbers obscuring the actual state of the system.

This patch adds mutex_lock_io() so that mutex sleeps can be marked as
iowait in those cases.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: adilger.kernel@dilger.ca
Cc: jack@suse.com
Cc: kernel-team@fb.com
Cc: mingbo@fb.com
Cc: tytso@mit.edu
Link: http://lkml.kernel.org/r/1477673892-28940-4-git-send-email-tj@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/mutex.h  |  4 ++++
 kernel/locking/mutex.c | 24 ++++++++++++++++++++++++
 2 files changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index b97870f2debd..980ba16b8749 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -156,10 +156,12 @@ extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock,
 					unsigned int subclass);
 extern int __must_check mutex_lock_killable_nested(struct mutex *lock,
 					unsigned int subclass);
+extern void mutex_lock_io_nested(struct mutex *lock, unsigned int subclass);
 
 #define mutex_lock(lock) mutex_lock_nested(lock, 0)
 #define mutex_lock_interruptible(lock) mutex_lock_interruptible_nested(lock, 0)
 #define mutex_lock_killable(lock) mutex_lock_killable_nested(lock, 0)
+#define mutex_lock_io(lock) mutex_lock_io_nested(lock, 0)
 
 #define mutex_lock_nest_lock(lock, nest_lock)				\
 do {									\
@@ -171,11 +173,13 @@ do {									\
 extern void mutex_lock(struct mutex *lock);
 extern int __must_check mutex_lock_interruptible(struct mutex *lock);
 extern int __must_check mutex_lock_killable(struct mutex *lock);
+extern void mutex_lock_io(struct mutex *lock);
 
 # define mutex_lock_nested(lock, subclass) mutex_lock(lock)
 # define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock)
 # define mutex_lock_killable_nested(lock, subclass) mutex_lock_killable(lock)
 # define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
+# define mutex_lock_nest_io(lock, nest_lock) mutex_io(lock)
 #endif
 
 /*
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 9b349619f431..8464a5cbab97 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -783,6 +783,20 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
 }
 EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
 
+void __sched
+mutex_lock_io_nested(struct mutex *lock, unsigned int subclass)
+{
+	int token;
+
+	might_sleep();
+
+	token = io_schedule_prepare();
+	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
+			    subclass, NULL, _RET_IP_, NULL, 0);
+	io_schedule_finish(token);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_io_nested);
+
 static inline int
 ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 {
@@ -950,6 +964,16 @@ int __sched mutex_lock_killable(struct mutex *lock)
 }
 EXPORT_SYMBOL(mutex_lock_killable);
 
+void __sched mutex_lock_io(struct mutex *lock)
+{
+	int token;
+
+	token = io_schedule_prepare();
+	mutex_lock(lock);
+	io_schedule_finish(token);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_io);
+
 static noinline void __sched
 __mutex_lock_slowpath(struct mutex *lock)
 {
-- 
cgit v1.2.3


From 9e3d6223d2093a8903c8f570a06284453ee59944 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 9 Dec 2016 09:30:11 +0100
Subject: math64, timers: Fix 32bit mul_u64_u32_shr() and friends

It turns out that while GCC-4.4 manages to generate 32x32->64 mult
instructions for the 32bit mul_u64_u32_shr() code, any GCC after that
fails horribly.

Fix this by providing an explicit mul_u32_u32() function which can be
architcture provided.

Reported-by: Chris Metcalf <cmetcalf@mellanox.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Chris Metcalf <cmetcalf@mellanox.com> [for tile]
Cc: Christopher S. Hall <christopher.s.hall@intel.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Laurent Vivier <lvivier@redhat.com>
Cc: Liav Rehana <liavr@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Parit Bhargava <prarit@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20161209083011.GD15765@worktop.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/tile/include/asm/Kbuild  |  1 -
 arch/tile/include/asm/div64.h | 14 ++++++++++++++
 arch/x86/include/asm/div64.h  | 11 +++++++++++
 include/linux/math64.h        | 26 ++++++++++++++++++--------
 4 files changed, 43 insertions(+), 9 deletions(-)
 create mode 100644 arch/tile/include/asm/div64.h

(limited to 'include/linux')

diff --git a/arch/tile/include/asm/Kbuild b/arch/tile/include/asm/Kbuild
index 2d1f5638974c..20f2ba6d79be 100644
--- a/arch/tile/include/asm/Kbuild
+++ b/arch/tile/include/asm/Kbuild
@@ -5,7 +5,6 @@ generic-y += bug.h
 generic-y += bugs.h
 generic-y += clkdev.h
 generic-y += cputime.h
-generic-y += div64.h
 generic-y += emergency-restart.h
 generic-y += errno.h
 generic-y += exec.h
diff --git a/arch/tile/include/asm/div64.h b/arch/tile/include/asm/div64.h
new file mode 100644
index 000000000000..bf6161966dfa
--- /dev/null
+++ b/arch/tile/include/asm/div64.h
@@ -0,0 +1,14 @@
+#ifndef _ASM_TILE_DIV64_H
+#define _ASM_TILE_DIV64_H
+
+#ifdef __tilegx__
+static inline u64 mul_u32_u32(u32 a, u32 b)
+{
+	return __insn_mul_lu_lu(a, b);
+}
+#define mul_u32_u32 mul_u32_u32
+#endif
+
+#include <asm-generic/div64.h>
+
+#endif /* _ASM_TILE_DIV64_H */
diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
index ced283ac79df..af95c47d5c9e 100644
--- a/arch/x86/include/asm/div64.h
+++ b/arch/x86/include/asm/div64.h
@@ -59,6 +59,17 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
 }
 #define div_u64_rem	div_u64_rem
 
+static inline u64 mul_u32_u32(u32 a, u32 b)
+{
+	u32 high, low;
+
+	asm ("mull %[b]" : "=a" (low), "=d" (high)
+			 : [a] "a" (a), [b] "rm" (b) );
+
+	return low | ((u64)high) << 32;
+}
+#define mul_u32_u32 mul_u32_u32
+
 #else
 # include <asm-generic/div64.h>
 #endif /* CONFIG_X86_32 */
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 6e8b5b270ffe..80690c96c734 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -133,6 +133,16 @@ __iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
 	return ret;
 }
 
+#ifndef mul_u32_u32
+/*
+ * Many a GCC version messes this up and generates a 64x64 mult :-(
+ */
+static inline u64 mul_u32_u32(u32 a, u32 b)
+{
+	return (u64)a * b;
+}
+#endif
+
 #if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
 
 #ifndef mul_u64_u32_shr
@@ -160,9 +170,9 @@ static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
 	al = a;
 	ah = a >> 32;
 
-	ret = ((u64)al * mul) >> shift;
+	ret = mul_u32_u32(al, mul) >> shift;
 	if (ah)
-		ret += ((u64)ah * mul) << (32 - shift);
+		ret += mul_u32_u32(ah, mul) << (32 - shift);
 
 	return ret;
 }
@@ -186,10 +196,10 @@ static inline u64 mul_u64_u64_shr(u64 a, u64 b, unsigned int shift)
 	a0.ll = a;
 	b0.ll = b;
 
-	rl.ll = (u64)a0.l.low * b0.l.low;
-	rm.ll = (u64)a0.l.low * b0.l.high;
-	rn.ll = (u64)a0.l.high * b0.l.low;
-	rh.ll = (u64)a0.l.high * b0.l.high;
+	rl.ll = mul_u32_u32(a0.l.low, b0.l.low);
+	rm.ll = mul_u32_u32(a0.l.low, b0.l.high);
+	rn.ll = mul_u32_u32(a0.l.high, b0.l.low);
+	rh.ll = mul_u32_u32(a0.l.high, b0.l.high);
 
 	/*
 	 * Each of these lines computes a 64-bit intermediate result into "c",
@@ -229,8 +239,8 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor)
 	} u, rl, rh;
 
 	u.ll = a;
-	rl.ll = (u64)u.l.low * mul;
-	rh.ll = (u64)u.l.high * mul + rl.l.high;
+	rl.ll = mul_u32_u32(u.l.low, mul);
+	rh.ll = mul_u32_u32(u.l.high, mul) + rl.l.high;
 
 	/* Bits 32-63 of the result will be in rh.l.low. */
 	rl.l.high = do_div(rh.ll, divisor);
-- 
cgit v1.2.3


From af2e859edd477fa1ea3d1d106f41a595cff3d162 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Thu, 1 Dec 2016 11:47:04 +0000
Subject: locking/ww_mutex: Fix compilation of __WW_MUTEX_INITIALIZER

From conflicting macro parameters, passing the wrong name to
__MUTEX_INITIALIZER and a stray '\', #define __WW_MUTEX_INITIALIZER was
very unhappy.

One unnecessary change was to choose to pass &ww_class instead of
implicitly taking the address of the class within the macro.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Maarten Lankhorst <maarten.lankhorst@canonical.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 1b375dc30710 ("mutex: Move ww_mutex definitions to ww_mutex.h")
Link: http://lkml.kernel.org/r/20161201114711.28697-2-chris@chris-wilson.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/ww_mutex.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h
index c07528522bbc..083950fd5b0b 100644
--- a/include/linux/ww_mutex.h
+++ b/include/linux/ww_mutex.h
@@ -51,10 +51,10 @@ struct ww_mutex {
 };
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-# define __WW_CLASS_MUTEX_INITIALIZER(lockname, ww_class) \
-		, .ww_class = &ww_class
+# define __WW_CLASS_MUTEX_INITIALIZER(lockname, class) \
+		, .ww_class = class
 #else
-# define __WW_CLASS_MUTEX_INITIALIZER(lockname, ww_class)
+# define __WW_CLASS_MUTEX_INITIALIZER(lockname, class)
 #endif
 
 #define __WW_CLASS_INITIALIZER(ww_class) \
@@ -63,7 +63,7 @@ struct ww_mutex {
 		, .mutex_name = #ww_class "_mutex" }
 
 #define __WW_MUTEX_INITIALIZER(lockname, class) \
-		{ .base = { \__MUTEX_INITIALIZER(lockname) } \
+		{ .base =  __MUTEX_INITIALIZER(lockname.base) \
 		__WW_CLASS_MUTEX_INITIALIZER(lockname, class) }
 
 #define DEFINE_WW_CLASS(classname) \
-- 
cgit v1.2.3


From 1e24edca0557dba6486d39d3c24c288475432bcf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 14 Nov 2016 17:12:23 +0100
Subject: locking/atomic, kref: Add KREF_INIT()

Since we need to change the implementation, stop exposing internals.

Provide KREF_INIT() to allow static initialization of struct kref.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/block/drbd/drbd_bitmap.c | 2 +-
 fs/fuse/fuse_i.h                 | 2 +-
 include/linux/kref.h             | 2 ++
 init/version.c                   | 4 +---
 kernel/pid.c                     | 4 +---
 5 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index ab62b81c2ca7..dece26f119d4 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -1070,7 +1070,7 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned
 		.done = 0,
 		.flags = flags,
 		.error = 0,
-		.kref = { ATOMIC_INIT(2) },
+		.kref = KREF_INIT(2),
 	};
 
 	if (!get_ldev_if_state(device, D_ATTACHING)) {  /* put is in drbd_bm_aio_ctx_destroy() */
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 91307940c8ac..052f8d3c41cb 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -256,7 +256,7 @@ struct fuse_io_priv {
 
 #define FUSE_IO_PRIV_SYNC(f) \
 {					\
-	.refcnt = { ATOMIC_INIT(1) },	\
+	.refcnt = KREF_INIT(1),		\
 	.async = 0,			\
 	.file = f,			\
 }
diff --git a/include/linux/kref.h b/include/linux/kref.h
index e15828fd71f1..9af255ad1e2f 100644
--- a/include/linux/kref.h
+++ b/include/linux/kref.h
@@ -24,6 +24,8 @@ struct kref {
 	atomic_t refcount;
 };
 
+#define KREF_INIT(n)	{ .refcount = ATOMIC_INIT(n), }
+
 /**
  * kref_init - initialize object.
  * @kref: object in question.
diff --git a/init/version.c b/init/version.c
index fe41a63efed6..5606341e9efd 100644
--- a/init/version.c
+++ b/init/version.c
@@ -23,9 +23,7 @@ int version_string(LINUX_VERSION_CODE);
 #endif
 
 struct uts_namespace init_uts_ns = {
-	.kref = {
-		.refcount	= ATOMIC_INIT(2),
-	},
+	.kref = KREF_INIT(2),
 	.name = {
 		.sysname	= UTS_SYSNAME,
 		.nodename	= UTS_NODENAME,
diff --git a/kernel/pid.c b/kernel/pid.c
index f66162f2359b..0291804151b5 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -68,9 +68,7 @@ static inline int mk_pid(struct pid_namespace *pid_ns,
  * the scheme scales to up to 4 million PIDs, runtime.
  */
 struct pid_namespace init_pid_ns = {
-	.kref = {
-		.refcount       = ATOMIC_INIT(2),
-	},
+	.kref = KREF_INIT(2),
 	.pidmap = {
 		[ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
 	},
-- 
cgit v1.2.3


From 2c935bc57221cc2edc787c72ea0e2d30cdcd3d5e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 14 Nov 2016 17:29:48 +0100
Subject: locking/atomic, kref: Add kref_read()

Since we need to change the implementation, stop exposing internals.

Provide kref_read() to read the current reference count; typically
used for debug messages.

Kills two anti-patterns:

	atomic_read(&kref->refcount)
	kref->refcount.counter

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/block/drbd/drbd_req.c                |  2 +-
 drivers/block/rbd.c                          |  8 +++---
 drivers/block/virtio_blk.c                   |  2 +-
 drivers/gpu/drm/drm_gem_cma_helper.c         |  2 +-
 drivers/gpu/drm/drm_info.c                   |  2 +-
 drivers/gpu/drm/drm_mode_object.c            |  4 +--
 drivers/gpu/drm/etnaviv/etnaviv_gem.c        |  2 +-
 drivers/gpu/drm/i915/i915_gem_object.h       |  2 +-
 drivers/gpu/drm/msm/msm_gem.c                |  2 +-
 drivers/gpu/drm/nouveau/nouveau_fence.c      |  2 +-
 drivers/gpu/drm/omapdrm/omap_gem.c           |  2 +-
 drivers/gpu/drm/ttm/ttm_bo.c                 |  4 +--
 drivers/gpu/drm/ttm/ttm_object.c             |  2 +-
 drivers/infiniband/hw/cxgb3/iwch_cm.h        |  6 ++---
 drivers/infiniband/hw/cxgb3/iwch_qp.c        |  2 +-
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h       |  6 ++---
 drivers/infiniband/hw/cxgb4/qp.c             |  2 +-
 drivers/infiniband/hw/usnic/usnic_ib_sysfs.c |  6 ++---
 drivers/infiniband/hw/usnic/usnic_ib_verbs.c |  4 +--
 drivers/misc/genwqe/card_dev.c               |  2 +-
 drivers/misc/mei/debugfs.c                   |  2 +-
 drivers/pci/hotplug/pnv_php.c                |  2 +-
 drivers/pci/slot.c                           |  2 +-
 drivers/scsi/bnx2fc/bnx2fc_io.c              |  8 +++---
 drivers/scsi/cxgbi/libcxgbi.h                |  4 +--
 drivers/scsi/lpfc/lpfc_debugfs.c             |  2 +-
 drivers/scsi/lpfc/lpfc_els.c                 |  2 +-
 drivers/scsi/lpfc/lpfc_hbadisc.c             | 40 ++++++++++++++--------------
 drivers/scsi/lpfc/lpfc_init.c                |  3 +--
 drivers/scsi/qla2xxx/tcm_qla2xxx.c           |  4 +--
 drivers/staging/android/ion/ion.c            |  2 +-
 drivers/staging/comedi/comedi_buf.c          |  2 +-
 drivers/target/target_core_pr.c              | 10 +++----
 drivers/target/tcm_fc/tfc_sess.c             |  2 +-
 drivers/usb/gadget/function/f_fs.c           |  2 +-
 fs/exofs/sys.c                               |  2 +-
 fs/ocfs2/cluster/netdebug.c                  |  2 +-
 fs/ocfs2/cluster/tcp.c                       |  2 +-
 fs/ocfs2/dlm/dlmdebug.c                      | 12 ++++-----
 fs/ocfs2/dlm/dlmdomain.c                     |  2 +-
 fs/ocfs2/dlm/dlmmaster.c                     |  8 +++---
 fs/ocfs2/dlm/dlmunlock.c                     |  2 +-
 include/drm/drm_framebuffer.h                |  2 +-
 include/drm/ttm/ttm_bo_driver.h              |  4 +--
 include/linux/kref.h                         |  5 ++++
 include/linux/sunrpc/cache.h                 |  2 +-
 include/net/bluetooth/hci_core.h             |  4 +--
 net/bluetooth/6lowpan.c                      |  2 +-
 net/bluetooth/a2mp.c                         |  4 +--
 net/bluetooth/amp.c                          |  4 +--
 net/bluetooth/l2cap_core.c                   |  4 +--
 net/ceph/messenger.c                         |  4 +--
 net/ceph/osd_client.c                        | 10 +++----
 net/sunrpc/cache.c                           |  2 +-
 net/sunrpc/svc_xprt.c                        |  6 ++---
 net/sunrpc/xprtrdma/svc_rdma_transport.c     |  4 +--
 56 files changed, 121 insertions(+), 117 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index de279fe4e4fd..74306c054983 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -520,7 +520,7 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
 		/* Completion does it's own kref_put.  If we are going to
 		 * kref_sub below, we need req to be still around then. */
 		int at_least = k_put + !!c_put;
-		int refcount = atomic_read(&req->kref.refcount);
+		int refcount = kref_read(&req->kref);
 		if (refcount < at_least)
 			drbd_err(device,
 				"mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n",
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 36d2b9f4e836..436baa66f701 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1535,7 +1535,7 @@ static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
 static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
 {
 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
-		atomic_read(&obj_request->kref.refcount));
+		kref_read(&obj_request->kref));
 	kref_get(&obj_request->kref);
 }
 
@@ -1544,14 +1544,14 @@ static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
 {
 	rbd_assert(obj_request != NULL);
 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
-		atomic_read(&obj_request->kref.refcount));
+		kref_read(&obj_request->kref));
 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
 }
 
 static void rbd_img_request_get(struct rbd_img_request *img_request)
 {
 	dout("%s: img %p (was %d)\n", __func__, img_request,
-	     atomic_read(&img_request->kref.refcount));
+	     kref_read(&img_request->kref));
 	kref_get(&img_request->kref);
 }
 
@@ -1562,7 +1562,7 @@ static void rbd_img_request_put(struct rbd_img_request *img_request)
 {
 	rbd_assert(img_request != NULL);
 	dout("%s: img %p (was %d)\n", __func__, img_request,
-		atomic_read(&img_request->kref.refcount));
+		kref_read(&img_request->kref));
 	if (img_request_child_test(img_request))
 		kref_put(&img_request->kref, rbd_parent_request_destroy);
 	else
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 5545a679abd8..79a346c97446 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -767,7 +767,7 @@ static void virtblk_remove(struct virtio_device *vdev)
 	/* Stop all the virtqueues. */
 	vdev->config->reset(vdev);
 
-	refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount);
+	refc = kref_read(&disk_to_dev(vblk->disk)->kobj.kref);
 	put_disk(vblk->disk);
 	vdev->config->del_vqs(vdev);
 	kfree(vblk->vqs);
diff --git a/drivers/gpu/drm/drm_gem_cma_helper.c b/drivers/gpu/drm/drm_gem_cma_helper.c
index 1d6c335584ec..33cd51632721 100644
--- a/drivers/gpu/drm/drm_gem_cma_helper.c
+++ b/drivers/gpu/drm/drm_gem_cma_helper.c
@@ -376,7 +376,7 @@ void drm_gem_cma_describe(struct drm_gem_cma_object *cma_obj,
 	off = drm_vma_node_start(&obj->vma_node);
 
 	seq_printf(m, "%2d (%2d) %08llx %pad %p %zu",
-			obj->name, obj->refcount.refcount.counter,
+			obj->name, kref_read(&obj->refcount),
 			off, &cma_obj->paddr, cma_obj->vaddr, obj->size);
 
 	seq_printf(m, "\n");
diff --git a/drivers/gpu/drm/drm_info.c b/drivers/gpu/drm/drm_info.c
index ffb2ab389d1d..6b68e9088436 100644
--- a/drivers/gpu/drm/drm_info.c
+++ b/drivers/gpu/drm/drm_info.c
@@ -118,7 +118,7 @@ static int drm_gem_one_name_info(int id, void *ptr, void *data)
 	seq_printf(m, "%6d %8zd %7d %8d\n",
 		   obj->name, obj->size,
 		   obj->handle_count,
-		   atomic_read(&obj->refcount.refcount));
+		   kref_read(&obj->refcount));
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/drm_mode_object.c b/drivers/gpu/drm/drm_mode_object.c
index 9f17085b1fdd..c6885a4911c0 100644
--- a/drivers/gpu/drm/drm_mode_object.c
+++ b/drivers/gpu/drm/drm_mode_object.c
@@ -159,7 +159,7 @@ EXPORT_SYMBOL(drm_mode_object_find);
 void drm_mode_object_unreference(struct drm_mode_object *obj)
 {
 	if (obj->free_cb) {
-		DRM_DEBUG("OBJ ID: %d (%d)\n", obj->id, atomic_read(&obj->refcount.refcount));
+		DRM_DEBUG("OBJ ID: %d (%d)\n", obj->id, kref_read(&obj->refcount));
 		kref_put(&obj->refcount, obj->free_cb);
 	}
 }
@@ -176,7 +176,7 @@ EXPORT_SYMBOL(drm_mode_object_unreference);
 void drm_mode_object_reference(struct drm_mode_object *obj)
 {
 	if (obj->free_cb) {
-		DRM_DEBUG("OBJ ID: %d (%d)\n", obj->id, atomic_read(&obj->refcount.refcount));
+		DRM_DEBUG("OBJ ID: %d (%d)\n", obj->id, kref_read(&obj->refcount));
 		kref_get(&obj->refcount);
 	}
 }
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
index 114dddbd297b..aa6e35ddc87f 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
@@ -486,7 +486,7 @@ static void etnaviv_gem_describe(struct drm_gem_object *obj, struct seq_file *m)
 
 	seq_printf(m, "%08x: %c %2d (%2d) %08lx %p %zd\n",
 			etnaviv_obj->flags, is_active(etnaviv_obj) ? 'A' : 'I',
-			obj->name, obj->refcount.refcount.counter,
+			obj->name, kref_read(&obj->refcount),
 			off, etnaviv_obj->vaddr, obj->size);
 
 	rcu_read_lock();
diff --git a/drivers/gpu/drm/i915/i915_gem_object.h b/drivers/gpu/drm/i915/i915_gem_object.h
index 6a368de9d81e..ecfefb9d42e4 100644
--- a/drivers/gpu/drm/i915/i915_gem_object.h
+++ b/drivers/gpu/drm/i915/i915_gem_object.h
@@ -256,7 +256,7 @@ extern void drm_gem_object_unreference_unlocked(struct drm_gem_object *);
 static inline bool
 i915_gem_object_is_dead(const struct drm_i915_gem_object *obj)
 {
-	return atomic_read(&obj->base.refcount.refcount) == 0;
+	return kref_read(&obj->base.refcount) == 0;
 }
 
 static inline bool
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
index d8bc59c7e261..4d24d9389036 100644
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -640,7 +640,7 @@ void msm_gem_describe(struct drm_gem_object *obj, struct seq_file *m)
 
 	seq_printf(m, "%08x: %c %2d (%2d) %08llx %p\t",
 			msm_obj->flags, is_active(msm_obj) ? 'A' : 'I',
-			obj->name, obj->refcount.refcount.counter,
+			obj->name, kref_read(&obj->refcount),
 			off, msm_obj->vaddr);
 
 	for (id = 0; id < priv->num_aspaces; id++)
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
index a6126c93f215..88ee60d1b907 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -527,7 +527,7 @@ static bool nouveau_fence_no_signaling(struct dma_fence *f)
 	 * caller should have a reference on the fence,
 	 * else fence could get freed here
 	 */
-	WARN_ON(atomic_read(&fence->base.refcount.refcount) <= 1);
+	WARN_ON(kref_read(&fence->base.refcount) <= 1);
 
 	/*
 	 * This needs uevents to work correctly, but dma_fence_add_callback relies on
diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c
index 4a90c690f09e..74a9968df421 100644
--- a/drivers/gpu/drm/omapdrm/omap_gem.c
+++ b/drivers/gpu/drm/omapdrm/omap_gem.c
@@ -1033,7 +1033,7 @@ void omap_gem_describe(struct drm_gem_object *obj, struct seq_file *m)
 	off = drm_vma_node_start(&obj->vma_node);
 
 	seq_printf(m, "%08x: %2d (%2d) %08llx %pad (%2d) %p %4d",
-			omap_obj->flags, obj->name, obj->refcount.refcount.counter,
+			omap_obj->flags, obj->name, kref_read(&obj->refcount),
 			off, &omap_obj->paddr, omap_obj->paddr_cnt,
 			omap_obj->vaddr, omap_obj->roll);
 
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index d5063618efa7..30aefcc0c969 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -140,8 +140,8 @@ static void ttm_bo_release_list(struct kref *list_kref)
 	struct ttm_bo_device *bdev = bo->bdev;
 	size_t acc_size = bo->acc_size;
 
-	BUG_ON(atomic_read(&bo->list_kref.refcount));
-	BUG_ON(atomic_read(&bo->kref.refcount));
+	BUG_ON(kref_read(&bo->list_kref));
+	BUG_ON(kref_read(&bo->kref));
 	BUG_ON(atomic_read(&bo->cpu_writers));
 	BUG_ON(bo->mem.mm_node != NULL);
 	BUG_ON(!list_empty(&bo->lru));
diff --git a/drivers/gpu/drm/ttm/ttm_object.c b/drivers/gpu/drm/ttm/ttm_object.c
index 4f5fa8d65fe9..fdb451e3ec01 100644
--- a/drivers/gpu/drm/ttm/ttm_object.c
+++ b/drivers/gpu/drm/ttm/ttm_object.c
@@ -304,7 +304,7 @@ bool ttm_ref_object_exists(struct ttm_object_file *tfile,
 	 * Verify that the ref->obj pointer was actually valid!
 	 */
 	rmb();
-	if (unlikely(atomic_read(&ref->kref.refcount) == 0))
+	if (unlikely(kref_read(&ref->kref) == 0))
 		goto out_false;
 
 	rcu_read_unlock();
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.h b/drivers/infiniband/hw/cxgb3/iwch_cm.h
index b9efadfffb4f..e66e75921797 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_cm.h
+++ b/drivers/infiniband/hw/cxgb3/iwch_cm.h
@@ -55,14 +55,14 @@
 
 #define put_ep(ep) { \
 	PDBG("put_ep (via %s:%u) ep %p refcnt %d\n", __func__, __LINE__,  \
-	     ep, atomic_read(&((ep)->kref.refcount))); \
-	WARN_ON(atomic_read(&((ep)->kref.refcount)) < 1); \
+	     ep, kref_read(&((ep)->kref))); \
+	WARN_ON(kref_read(&((ep)->kref)) < 1); \
 	kref_put(&((ep)->kref), __free_ep); \
 }
 
 #define get_ep(ep) { \
 	PDBG("get_ep (via %s:%u) ep %p, refcnt %d\n", __func__, __LINE__, \
-	     ep, atomic_read(&((ep)->kref.refcount))); \
+	     ep, kref_read(&((ep)->kref))); \
 	kref_get(&((ep)->kref));  \
 }
 
diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c
index d939980a708f..a9194db7f9b8 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_qp.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c
@@ -961,7 +961,7 @@ int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp,
 	case IWCH_QP_STATE_RTS:
 		switch (attrs->next_state) {
 		case IWCH_QP_STATE_CLOSING:
-			BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2);
+			BUG_ON(kref_read(&qhp->ep->com.kref) < 2);
 			qhp->attr.state = IWCH_QP_STATE_CLOSING;
 			if (!internal) {
 				abort=0;
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index 4788e1a46fde..4dc141596e57 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -654,14 +654,14 @@ enum c4iw_mmid_state {
 
 #define c4iw_put_ep(ep) { \
 	PDBG("put_ep (via %s:%u) ep %p refcnt %d\n", __func__, __LINE__,  \
-	     ep, atomic_read(&((ep)->kref.refcount))); \
-	WARN_ON(atomic_read(&((ep)->kref.refcount)) < 1); \
+	     ep, kref_read(&((ep)->kref))); \
+	WARN_ON(kref_read(&((ep)->kref)) < 1); \
 	kref_put(&((ep)->kref), _c4iw_free_ep); \
 }
 
 #define c4iw_get_ep(ep) { \
 	PDBG("get_ep (via %s:%u) ep %p, refcnt %d\n", __func__, __LINE__, \
-	     ep, atomic_read(&((ep)->kref.refcount))); \
+	     ep, kref_read(&((ep)->kref))); \
 	kref_get(&((ep)->kref));  \
 }
 void _c4iw_free_ep(struct kref *kref);
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
index cda5542e13a2..347b3c93ffd7 100644
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -1503,7 +1503,7 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp,
 	case C4IW_QP_STATE_RTS:
 		switch (attrs->next_state) {
 		case C4IW_QP_STATE_CLOSING:
-			BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2);
+			BUG_ON(kref_read(&qhp->ep->com.kref) < 2);
 			t4_set_wq_in_error(&qhp->wq);
 			set_state(qhp, C4IW_QP_STATE_CLOSING);
 			ep = qhp->ep;
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
index 80ef3f8998c8..04443242e258 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
@@ -80,7 +80,7 @@ usnic_ib_show_config(struct device *device, struct device_attribute *attr,
 	left = PAGE_SIZE;
 
 	mutex_lock(&us_ibdev->usdev_lock);
-	if (atomic_read(&us_ibdev->vf_cnt.refcount) > 0) {
+	if (kref_read(&us_ibdev->vf_cnt) > 0) {
 		char *busname;
 
 		/*
@@ -99,7 +99,7 @@ usnic_ib_show_config(struct device *device, struct device_attribute *attr,
 			PCI_FUNC(us_ibdev->pdev->devfn),
 			netdev_name(us_ibdev->netdev),
 			us_ibdev->ufdev->mac,
-			atomic_read(&us_ibdev->vf_cnt.refcount));
+			kref_read(&us_ibdev->vf_cnt));
 		UPDATE_PTR_LEFT(n, ptr, left);
 
 		for (res_type = USNIC_VNIC_RES_TYPE_EOL;
@@ -147,7 +147,7 @@ usnic_ib_show_max_vf(struct device *device, struct device_attribute *attr,
 	us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
 
 	return scnprintf(buf, PAGE_SIZE, "%u\n",
-			atomic_read(&us_ibdev->vf_cnt.refcount));
+			kref_read(&us_ibdev->vf_cnt));
 }
 
 static ssize_t
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
index 74819a7951e2..69df8e353123 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
@@ -291,11 +291,11 @@ int usnic_ib_query_device(struct ib_device *ibdev,
 	qp_per_vf = max(us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_WQ],
 			us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_RQ]);
 	props->max_qp = qp_per_vf *
-		atomic_read(&us_ibdev->vf_cnt.refcount);
+		kref_read(&us_ibdev->vf_cnt);
 	props->device_cap_flags = IB_DEVICE_PORT_ACTIVE_EVENT |
 		IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
 	props->max_cq = us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ] *
-		atomic_read(&us_ibdev->vf_cnt.refcount);
+		kref_read(&us_ibdev->vf_cnt);
 	props->max_pd = USNIC_UIOM_MAX_PD_CNT;
 	props->max_mr = USNIC_UIOM_MAX_MR_CNT;
 	props->local_ca_ack_delay = 0;
diff --git a/drivers/misc/genwqe/card_dev.c b/drivers/misc/genwqe/card_dev.c
index 7f1b282d7d96..cb290b8ca0c8 100644
--- a/drivers/misc/genwqe/card_dev.c
+++ b/drivers/misc/genwqe/card_dev.c
@@ -1396,7 +1396,7 @@ int genwqe_device_remove(struct genwqe_dev *cd)
 	 * application which will decrease this reference from
 	 * 1/unused to 0/illegal and not from 2/used 1/empty.
 	 */
-	rc = atomic_read(&cd->cdev_genwqe.kobj.kref.refcount);
+	rc = kref_read(&cd->cdev_genwqe.kobj.kref);
 	if (rc != 1) {
 		dev_err(&pci_dev->dev,
 			"[%s] err: cdev_genwqe...refcount=%d\n", __func__, rc);
diff --git a/drivers/misc/mei/debugfs.c b/drivers/misc/mei/debugfs.c
index c6c051b52f55..e32a92341b54 100644
--- a/drivers/misc/mei/debugfs.c
+++ b/drivers/misc/mei/debugfs.c
@@ -67,7 +67,7 @@ static ssize_t mei_dbgfs_read_meclients(struct file *fp, char __user *ubuf,
 				me_cl->props.max_number_of_connections,
 				me_cl->props.max_msg_length,
 				me_cl->props.single_recv_buf,
-				atomic_read(&me_cl->refcnt.refcount));
+				kref_read(&me_cl->refcnt));
 
 			mei_me_cl_put(me_cl);
 		}
diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c
index 56efaf72d08e..d2961ef39a3a 100644
--- a/drivers/pci/hotplug/pnv_php.c
+++ b/drivers/pci/hotplug/pnv_php.c
@@ -155,7 +155,7 @@ static void pnv_php_detach_device_nodes(struct device_node *parent)
 		pnv_php_detach_device_nodes(dn);
 
 		of_node_put(dn);
-		refcount = atomic_read(&dn->kobj.kref.refcount);
+		refcount = kref_read(&dn->kobj.kref);
 		if (refcount != 1)
 			pr_warn("Invalid refcount %d on <%s>\n",
 				refcount, of_node_full_name(dn));
diff --git a/drivers/pci/slot.c b/drivers/pci/slot.c
index 429d34c348b9..e42909524dee 100644
--- a/drivers/pci/slot.c
+++ b/drivers/pci/slot.c
@@ -345,7 +345,7 @@ EXPORT_SYMBOL_GPL(pci_create_slot);
 void pci_destroy_slot(struct pci_slot *slot)
 {
 	dev_dbg(&slot->bus->dev, "dev %02x, dec refcount to %d\n",
-		slot->number, atomic_read(&slot->kobj.kref.refcount) - 1);
+		slot->number, kref_read(&slot->kobj.kref) - 1);
 
 	mutex_lock(&pci_slot_mutex);
 	kobject_put(&slot->kobj);
diff --git a/drivers/scsi/bnx2fc/bnx2fc_io.c b/drivers/scsi/bnx2fc/bnx2fc_io.c
index f501095f91ac..898461b146cc 100644
--- a/drivers/scsi/bnx2fc/bnx2fc_io.c
+++ b/drivers/scsi/bnx2fc/bnx2fc_io.c
@@ -74,7 +74,7 @@ static void bnx2fc_cmd_timeout(struct work_struct *work)
 				    &io_req->req_flags)) {
 			/* Handle internally generated ABTS timeout */
 			BNX2FC_IO_DBG(io_req, "ABTS timed out refcnt = %d\n",
-					io_req->refcount.refcount.counter);
+					kref_read(&io_req->refcount));
 			if (!(test_and_set_bit(BNX2FC_FLAG_ABTS_DONE,
 					       &io_req->req_flags))) {
 				/*
@@ -1141,7 +1141,7 @@ int bnx2fc_eh_abort(struct scsi_cmnd *sc_cmd)
 		return SUCCESS;
 	}
 	BNX2FC_IO_DBG(io_req, "eh_abort - refcnt = %d\n",
-		      io_req->refcount.refcount.counter);
+		      kref_read(&io_req->refcount));
 
 	/* Hold IO request across abort processing */
 	kref_get(&io_req->refcount);
@@ -1299,7 +1299,7 @@ void bnx2fc_process_cleanup_compl(struct bnx2fc_cmd *io_req,
 {
 	BNX2FC_IO_DBG(io_req, "Entered process_cleanup_compl "
 			      "refcnt = %d, cmd_type = %d\n",
-		   io_req->refcount.refcount.counter, io_req->cmd_type);
+		   kref_read(&io_req->refcount), io_req->cmd_type);
 	bnx2fc_scsi_done(io_req, DID_ERROR);
 	kref_put(&io_req->refcount, bnx2fc_cmd_release);
 	if (io_req->wait_for_comp)
@@ -1318,7 +1318,7 @@ void bnx2fc_process_abts_compl(struct bnx2fc_cmd *io_req,
 	BNX2FC_IO_DBG(io_req, "Entered process_abts_compl xid = 0x%x"
 			      "refcnt = %d, cmd_type = %d\n",
 		   io_req->xid,
-		   io_req->refcount.refcount.counter, io_req->cmd_type);
+		   kref_read(&io_req->refcount), io_req->cmd_type);
 
 	if (test_and_set_bit(BNX2FC_FLAG_ABTS_DONE,
 				       &io_req->req_flags)) {
diff --git a/drivers/scsi/cxgbi/libcxgbi.h b/drivers/scsi/cxgbi/libcxgbi.h
index 95ba99044c3e..18e0ea83d361 100644
--- a/drivers/scsi/cxgbi/libcxgbi.h
+++ b/drivers/scsi/cxgbi/libcxgbi.h
@@ -301,7 +301,7 @@ static inline void __cxgbi_sock_put(const char *fn, struct cxgbi_sock *csk)
 {
 	log_debug(1 << CXGBI_DBG_SOCK,
 		"%s, put csk 0x%p, ref %u-1.\n",
-		fn, csk, atomic_read(&csk->refcnt.refcount));
+		fn, csk, kref_read(&csk->refcnt));
 	kref_put(&csk->refcnt, cxgbi_sock_free);
 }
 #define cxgbi_sock_put(csk)	__cxgbi_sock_put(__func__, csk)
@@ -310,7 +310,7 @@ static inline void __cxgbi_sock_get(const char *fn, struct cxgbi_sock *csk)
 {
 	log_debug(1 << CXGBI_DBG_SOCK,
 		"%s, get csk 0x%p, ref %u+1.\n",
-		fn, csk, atomic_read(&csk->refcnt.refcount));
+		fn, csk, kref_read(&csk->refcnt));
 	kref_get(&csk->refcnt);
 }
 #define cxgbi_sock_get(csk)	__cxgbi_sock_get(__func__, csk)
diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c
index a63542bac153..caa7a7b0ec53 100644
--- a/drivers/scsi/lpfc/lpfc_debugfs.c
+++ b/drivers/scsi/lpfc/lpfc_debugfs.c
@@ -607,7 +607,7 @@ lpfc_debugfs_nodelist_data(struct lpfc_vport *vport, char *buf, int size)
 		len += snprintf(buf+len, size-len, "usgmap:%x ",
 			ndlp->nlp_usg_map);
 		len += snprintf(buf+len, size-len, "refcnt:%x",
-			atomic_read(&ndlp->kref.refcount));
+			kref_read(&ndlp->kref));
 		len +=  snprintf(buf+len, size-len, "\n");
 	}
 	spin_unlock_irq(shost->host_lock);
diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c
index 236e4e51d161..7a8829546068 100644
--- a/drivers/scsi/lpfc/lpfc_els.c
+++ b/drivers/scsi/lpfc/lpfc_els.c
@@ -3688,7 +3688,7 @@ lpfc_mbx_cmpl_dflt_rpi(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
 		lpfc_printf_vlog(ndlp->vport, KERN_INFO, LOG_NODE,
 				 "0006 rpi%x DID:%x flg:%x %d map:%x %p\n",
 				 ndlp->nlp_rpi, ndlp->nlp_DID, ndlp->nlp_flag,
-				 atomic_read(&ndlp->kref.refcount),
+				 kref_read(&ndlp->kref),
 				 ndlp->nlp_usg_map, ndlp);
 		if (NLP_CHK_NODE_ACT(ndlp)) {
 			lpfc_nlp_put(ndlp);
diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c
index ed223937798a..82047070cdc9 100644
--- a/drivers/scsi/lpfc/lpfc_hbadisc.c
+++ b/drivers/scsi/lpfc/lpfc_hbadisc.c
@@ -3440,7 +3440,7 @@ lpfc_mbx_cmpl_reg_login(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
 	lpfc_printf_vlog(vport, KERN_INFO, LOG_SLI,
 			 "0002 rpi:%x DID:%x flg:%x %d map:%x %p\n",
 			 ndlp->nlp_rpi, ndlp->nlp_DID, ndlp->nlp_flag,
-			 atomic_read(&ndlp->kref.refcount),
+			 kref_read(&ndlp->kref),
 			 ndlp->nlp_usg_map, ndlp);
 	if (ndlp->nlp_flag & NLP_REG_LOGIN_SEND)
 		ndlp->nlp_flag &= ~NLP_REG_LOGIN_SEND;
@@ -3861,7 +3861,7 @@ out:
 	lpfc_printf_vlog(vport, KERN_INFO, LOG_SLI,
 			 "0003 rpi:%x DID:%x flg:%x %d map%x %p\n",
 			 ndlp->nlp_rpi, ndlp->nlp_DID, ndlp->nlp_flag,
-			 atomic_read(&ndlp->kref.refcount),
+			 kref_read(&ndlp->kref),
 			 ndlp->nlp_usg_map, ndlp);
 
 	if (vport->port_state < LPFC_VPORT_READY) {
@@ -4238,7 +4238,7 @@ lpfc_enable_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
 				"0277 lpfc_enable_node: ndlp:x%p "
 				"usgmap:x%x refcnt:%d\n",
 				(void *)ndlp, ndlp->nlp_usg_map,
-				atomic_read(&ndlp->kref.refcount));
+				kref_read(&ndlp->kref));
 		return NULL;
 	}
 	/* The ndlp should not already be in active mode */
@@ -4248,7 +4248,7 @@ lpfc_enable_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
 				"0278 lpfc_enable_node: ndlp:x%p "
 				"usgmap:x%x refcnt:%d\n",
 				(void *)ndlp, ndlp->nlp_usg_map,
-				atomic_read(&ndlp->kref.refcount));
+				kref_read(&ndlp->kref));
 		return NULL;
 	}
 
@@ -4272,7 +4272,7 @@ lpfc_enable_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
 				 "0008 rpi:%x DID:%x flg:%x refcnt:%d "
 				 "map:%x %p\n", ndlp->nlp_rpi, ndlp->nlp_DID,
 				 ndlp->nlp_flag,
-				 atomic_read(&ndlp->kref.refcount),
+				 kref_read(&ndlp->kref),
 				 ndlp->nlp_usg_map, ndlp);
 	}
 
@@ -4546,7 +4546,7 @@ lpfc_unreg_rpi(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
 				    (bf_get(lpfc_sli_intf_if_type,
 				     &phba->sli4_hba.sli_intf) ==
 				      LPFC_SLI_INTF_IF_TYPE_2) &&
-				    (atomic_read(&ndlp->kref.refcount) > 0)) {
+				    (kref_read(&ndlp->kref) > 0)) {
 					mbox->context1 = lpfc_nlp_get(ndlp);
 					mbox->mbox_cmpl =
 						lpfc_sli4_unreg_rpi_cmpl_clr;
@@ -4695,14 +4695,14 @@ lpfc_cleanup_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
 				"0280 lpfc_cleanup_node: ndlp:x%p "
 				"usgmap:x%x refcnt:%d\n",
 				(void *)ndlp, ndlp->nlp_usg_map,
-				atomic_read(&ndlp->kref.refcount));
+				kref_read(&ndlp->kref));
 		lpfc_dequeue_node(vport, ndlp);
 	} else {
 		lpfc_printf_vlog(vport, KERN_WARNING, LOG_NODE,
 				"0281 lpfc_cleanup_node: ndlp:x%p "
 				"usgmap:x%x refcnt:%d\n",
 				(void *)ndlp, ndlp->nlp_usg_map,
-				atomic_read(&ndlp->kref.refcount));
+				kref_read(&ndlp->kref));
 		lpfc_disable_node(vport, ndlp);
 	}
 
@@ -4791,7 +4791,7 @@ lpfc_nlp_remove(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
 		lpfc_printf_vlog(vport, KERN_INFO, LOG_NODE,
 				 "0005 rpi:%x DID:%x flg:%x %d map:%x %p\n",
 				 ndlp->nlp_rpi, ndlp->nlp_DID, ndlp->nlp_flag,
-				 atomic_read(&ndlp->kref.refcount),
+				 kref_read(&ndlp->kref),
 				 ndlp->nlp_usg_map, ndlp);
 		if ((mbox = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL))
 			!= NULL) {
@@ -5557,7 +5557,7 @@ lpfc_mbx_cmpl_fdmi_reg_login(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
 	lpfc_printf_vlog(vport, KERN_INFO, LOG_SLI,
 			 "0004 rpi:%x DID:%x flg:%x %d map:%x %p\n",
 			 ndlp->nlp_rpi, ndlp->nlp_DID, ndlp->nlp_flag,
-			 atomic_read(&ndlp->kref.refcount),
+			 kref_read(&ndlp->kref),
 			 ndlp->nlp_usg_map, ndlp);
 	/*
 	 * Start issuing Fabric-Device Management Interface (FDMI) command to
@@ -5728,7 +5728,7 @@ lpfc_nlp_init(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
 				 "0007 rpi:%x DID:%x flg:%x refcnt:%d "
 				 "map:%x %p\n", ndlp->nlp_rpi, ndlp->nlp_DID,
 				 ndlp->nlp_flag,
-				 atomic_read(&ndlp->kref.refcount),
+				 kref_read(&ndlp->kref),
 				 ndlp->nlp_usg_map, ndlp);
 
 		ndlp->active_rrqs_xri_bitmap =
@@ -5767,7 +5767,7 @@ lpfc_nlp_release(struct kref *kref)
 			"0279 lpfc_nlp_release: ndlp:x%p did %x "
 			"usgmap:x%x refcnt:%d rpi:%x\n",
 			(void *)ndlp, ndlp->nlp_DID, ndlp->nlp_usg_map,
-			atomic_read(&ndlp->kref.refcount), ndlp->nlp_rpi);
+			kref_read(&ndlp->kref), ndlp->nlp_rpi);
 
 	/* remove ndlp from action. */
 	lpfc_nlp_remove(ndlp->vport, ndlp);
@@ -5804,7 +5804,7 @@ lpfc_nlp_get(struct lpfc_nodelist *ndlp)
 		lpfc_debugfs_disc_trc(ndlp->vport, LPFC_DISC_TRC_NODE,
 			"node get:        did:x%x flg:x%x refcnt:x%x",
 			ndlp->nlp_DID, ndlp->nlp_flag,
-			atomic_read(&ndlp->kref.refcount));
+			kref_read(&ndlp->kref));
 		/* The check of ndlp usage to prevent incrementing the
 		 * ndlp reference count that is in the process of being
 		 * released.
@@ -5817,7 +5817,7 @@ lpfc_nlp_get(struct lpfc_nodelist *ndlp)
 				"0276 lpfc_nlp_get: ndlp:x%p "
 				"usgmap:x%x refcnt:%d\n",
 				(void *)ndlp, ndlp->nlp_usg_map,
-				atomic_read(&ndlp->kref.refcount));
+				kref_read(&ndlp->kref));
 			return NULL;
 		} else
 			kref_get(&ndlp->kref);
@@ -5844,7 +5844,7 @@ lpfc_nlp_put(struct lpfc_nodelist *ndlp)
 	lpfc_debugfs_disc_trc(ndlp->vport, LPFC_DISC_TRC_NODE,
 	"node put:        did:x%x flg:x%x refcnt:x%x",
 		ndlp->nlp_DID, ndlp->nlp_flag,
-		atomic_read(&ndlp->kref.refcount));
+		kref_read(&ndlp->kref));
 	phba = ndlp->phba;
 	spin_lock_irqsave(&phba->ndlp_lock, flags);
 	/* Check the ndlp memory free acknowledge flag to avoid the
@@ -5857,7 +5857,7 @@ lpfc_nlp_put(struct lpfc_nodelist *ndlp)
 				"0274 lpfc_nlp_put: ndlp:x%p "
 				"usgmap:x%x refcnt:%d\n",
 				(void *)ndlp, ndlp->nlp_usg_map,
-				atomic_read(&ndlp->kref.refcount));
+				kref_read(&ndlp->kref));
 		return 1;
 	}
 	/* Check the ndlp inactivate log flag to avoid the possible
@@ -5870,7 +5870,7 @@ lpfc_nlp_put(struct lpfc_nodelist *ndlp)
 				"0275 lpfc_nlp_put: ndlp:x%p "
 				"usgmap:x%x refcnt:%d\n",
 				(void *)ndlp, ndlp->nlp_usg_map,
-				atomic_read(&ndlp->kref.refcount));
+				kref_read(&ndlp->kref));
 		return 1;
 	}
 	/* For last put, mark the ndlp usage flags to make sure no
@@ -5878,7 +5878,7 @@ lpfc_nlp_put(struct lpfc_nodelist *ndlp)
 	 * in between the process when the final kref_put has been
 	 * invoked on this ndlp.
 	 */
-	if (atomic_read(&ndlp->kref.refcount) == 1) {
+	if (kref_read(&ndlp->kref) == 1) {
 		/* Indicate ndlp is put to inactive state. */
 		NLP_SET_IACT_REQ(ndlp);
 		/* Acknowledge ndlp memory free has been seen. */
@@ -5906,8 +5906,8 @@ lpfc_nlp_not_used(struct lpfc_nodelist *ndlp)
 	lpfc_debugfs_disc_trc(ndlp->vport, LPFC_DISC_TRC_NODE,
 		"node not used:   did:x%x flg:x%x refcnt:x%x",
 		ndlp->nlp_DID, ndlp->nlp_flag,
-		atomic_read(&ndlp->kref.refcount));
-	if (atomic_read(&ndlp->kref.refcount) == 1)
+		kref_read(&ndlp->kref));
+	if (kref_read(&ndlp->kref) == 1)
 		if (lpfc_nlp_put(ndlp))
 			return 1;
 	return 0;
diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
index 4776fd85514f..64717c171b15 100644
--- a/drivers/scsi/lpfc/lpfc_init.c
+++ b/drivers/scsi/lpfc/lpfc_init.c
@@ -2660,8 +2660,7 @@ lpfc_cleanup(struct lpfc_vport *vport)
 						"usgmap:x%x refcnt:%d\n",
 						ndlp->nlp_DID, (void *)ndlp,
 						ndlp->nlp_usg_map,
-						atomic_read(
-							&ndlp->kref.refcount));
+						kref_read(&ndlp->kref));
 			}
 			break;
 		}
diff --git a/drivers/scsi/qla2xxx/tcm_qla2xxx.c b/drivers/scsi/qla2xxx/tcm_qla2xxx.c
index 6643f6fc7795..dd28c69b6a92 100644
--- a/drivers/scsi/qla2xxx/tcm_qla2xxx.c
+++ b/drivers/scsi/qla2xxx/tcm_qla2xxx.c
@@ -371,7 +371,7 @@ static int tcm_qla2xxx_write_pending(struct se_cmd *se_cmd)
 		 */
 		pr_debug("write_pending aborted cmd[%p] refcount %d "
 			"transport_state %x, t_state %x, se_cmd_flags %x\n",
-			cmd,cmd->se_cmd.cmd_kref.refcount.counter,
+			cmd, kref_read(&cmd->se_cmd.cmd_kref),
 			cmd->se_cmd.transport_state,
 			cmd->se_cmd.t_state,
 			cmd->se_cmd.se_cmd_flags);
@@ -584,7 +584,7 @@ static int tcm_qla2xxx_queue_data_in(struct se_cmd *se_cmd)
 		 */
 		pr_debug("queue_data_in aborted cmd[%p] refcount %d "
 			"transport_state %x, t_state %x, se_cmd_flags %x\n",
-			cmd,cmd->se_cmd.cmd_kref.refcount.counter,
+			cmd, kref_read(&cmd->se_cmd.cmd_kref),
 			cmd->se_cmd.transport_state,
 			cmd->se_cmd.t_state,
 			cmd->se_cmd.se_cmd_flags);
diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c
index b653451843c8..937c2d5d7ec3 100644
--- a/drivers/staging/android/ion/ion.c
+++ b/drivers/staging/android/ion/ion.c
@@ -1300,7 +1300,7 @@ static int ion_debug_heap_show(struct seq_file *s, void *unused)
 			seq_printf(s, "%16s %16u %16zu %d %d\n",
 				   buffer->task_comm, buffer->pid,
 				   buffer->size, buffer->kmap_cnt,
-				   atomic_read(&buffer->ref.refcount));
+				   kref_read(&buffer->ref));
 			total_orphaned_size += buffer->size;
 		}
 	}
diff --git a/drivers/staging/comedi/comedi_buf.c b/drivers/staging/comedi/comedi_buf.c
index c7d7682b1412..1e1df89b5018 100644
--- a/drivers/staging/comedi/comedi_buf.c
+++ b/drivers/staging/comedi/comedi_buf.c
@@ -188,7 +188,7 @@ bool comedi_buf_is_mmapped(struct comedi_subdevice *s)
 {
 	struct comedi_buf_map *bm = s->async->buf_map;
 
-	return bm && (atomic_read(&bm->refcount.refcount) > 1);
+	return bm && (kref_read(&bm->refcount) > 1);
 }
 
 int comedi_buf_alloc(struct comedi_device *dev, struct comedi_subdevice *s,
diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c
index d761025144f9..e18051185846 100644
--- a/drivers/target/target_core_pr.c
+++ b/drivers/target/target_core_pr.c
@@ -788,7 +788,7 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration(
 			 * __core_scsi3_add_registration()
 			 */
 			dest_lun = rcu_dereference_check(deve_tmp->se_lun,
-				atomic_read(&deve_tmp->pr_kref.refcount) != 0);
+				kref_read(&deve_tmp->pr_kref) != 0);
 
 			pr_reg_atp = __core_scsi3_do_alloc_registration(dev,
 						nacl_tmp, dest_lun, deve_tmp,
@@ -1463,7 +1463,7 @@ static int core_scsi3_lunacl_depend_item(struct se_dev_entry *se_deve)
 	 * For nacl->dynamic_node_acl=1
 	 */
 	lun_acl = rcu_dereference_check(se_deve->se_lun_acl,
-				atomic_read(&se_deve->pr_kref.refcount) != 0);
+				kref_read(&se_deve->pr_kref) != 0);
 	if (!lun_acl)
 		return 0;
 
@@ -1478,7 +1478,7 @@ static void core_scsi3_lunacl_undepend_item(struct se_dev_entry *se_deve)
 	 * For nacl->dynamic_node_acl=1
 	 */
 	lun_acl = rcu_dereference_check(se_deve->se_lun_acl,
-				atomic_read(&se_deve->pr_kref.refcount) != 0);
+				kref_read(&se_deve->pr_kref) != 0);
 	if (!lun_acl) {
 		kref_put(&se_deve->pr_kref, target_pr_kref_release);
 		return;
@@ -1759,7 +1759,7 @@ core_scsi3_decode_spec_i_port(
 		 * 2nd loop which will never fail.
 		 */
 		dest_lun = rcu_dereference_check(dest_se_deve->se_lun,
-				atomic_read(&dest_se_deve->pr_kref.refcount) != 0);
+				kref_read(&dest_se_deve->pr_kref) != 0);
 
 		dest_pr_reg = __core_scsi3_alloc_registration(cmd->se_dev,
 					dest_node_acl, dest_lun, dest_se_deve,
@@ -3466,7 +3466,7 @@ after_iport_check:
 					iport_ptr);
 	if (!dest_pr_reg) {
 		struct se_lun *dest_lun = rcu_dereference_check(dest_se_deve->se_lun,
-				atomic_read(&dest_se_deve->pr_kref.refcount) != 0);
+				kref_read(&dest_se_deve->pr_kref) != 0);
 
 		spin_unlock(&dev->dev_reservation_lock);
 		if (core_scsi3_alloc_registration(cmd->se_dev, dest_node_acl,
diff --git a/drivers/target/tcm_fc/tfc_sess.c b/drivers/target/tcm_fc/tfc_sess.c
index fd5c3de79470..c91979c1463d 100644
--- a/drivers/target/tcm_fc/tfc_sess.c
+++ b/drivers/target/tcm_fc/tfc_sess.c
@@ -454,7 +454,7 @@ static void ft_sess_free(struct kref *kref)
 
 void ft_sess_put(struct ft_sess *sess)
 {
-	int sess_held = atomic_read(&sess->kref.refcount);
+	int sess_held = kref_read(&sess->kref);
 
 	BUG_ON(!sess_held);
 	kref_put(&sess->kref, ft_sess_free);
diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
index 5e746adc8a2d..365443cae5b1 100644
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -3687,7 +3687,7 @@ static void ffs_closed(struct ffs_data *ffs)
 		goto done;
 
 	if (opts->no_configfs || !opts->func_inst.group.cg_item.ci_parent
-	    || !atomic_read(&opts->func_inst.group.cg_item.ci_kref.refcount))
+	    || !kref_read(&opts->func_inst.group.cg_item.ci_kref))
 		goto done;
 
 	ci = opts->func_inst.group.cg_item.ci_parent->ci_parent;
diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c
index 5e6a2c0a1f0b..1f7d5e46cdda 100644
--- a/fs/exofs/sys.c
+++ b/fs/exofs/sys.c
@@ -122,7 +122,7 @@ void exofs_sysfs_dbg_print(void)
 	list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) {
 		printk(KERN_INFO "%s: name %s ref %d\n",
 			__func__, kobject_name(k_name),
-			(int)atomic_read(&k_name->kref.refcount));
+			(int)kref_read(&k_name->kref));
 	}
 #endif
 }
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 27d1242c8383..564c504d6efd 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -349,7 +349,7 @@ static void sc_show_sock_container(struct seq_file *seq,
 		   "  func key:        0x%08x\n"
 		   "  func type:       %u\n",
 		   sc,
-		   atomic_read(&sc->sc_kref.refcount),
+		   kref_read(&sc->sc_kref),
 		   &saddr, inet ? ntohs(sport) : 0,
 		   &daddr, inet ? ntohs(dport) : 0,
 		   sc->sc_node->nd_name,
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index d4b5c81f0445..ec000575e863 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -97,7 +97,7 @@
 	typeof(sc) __sc = (sc);						\
 	mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p "	\
 	     "pg_off %zu] " fmt, __sc,					\
-	     atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock,	\
+	     kref_read(&__sc->sc_kref), __sc->sc_sock,	\
 	    __sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off ,	\
 	    ##args);							\
 } while (0)
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index e7b760deefae..9b984cae4c4e 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -81,7 +81,7 @@ static void __dlm_print_lock(struct dlm_lock *lock)
 	       lock->ml.type, lock->ml.convert_type, lock->ml.node,
 	       dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
 	       dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
-	       atomic_read(&lock->lock_refs.refcount),
+	       kref_read(&lock->lock_refs),
 	       (list_empty(&lock->ast_list) ? 'y' : 'n'),
 	       (lock->ast_pending ? 'y' : 'n'),
 	       (list_empty(&lock->bast_list) ? 'y' : 'n'),
@@ -106,7 +106,7 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
 	printk("lockres: %s, owner=%u, state=%u\n",
 	       buf, res->owner, res->state);
 	printk("  last used: %lu, refcnt: %u, on purge list: %s\n",
-	       res->last_used, atomic_read(&res->refs.refcount),
+	       res->last_used, kref_read(&res->refs),
 	       list_empty(&res->purge) ? "no" : "yes");
 	printk("  on dirty list: %s, on reco list: %s, "
 	       "migrating pending: %s\n",
@@ -298,7 +298,7 @@ static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
 			mle_type, mle->master, mle->new_master,
 			!list_empty(&mle->hb_events),
 			!!mle->inuse,
-			atomic_read(&mle->mle_refs.refcount));
+			kref_read(&mle->mle_refs));
 
 	out += snprintf(buf + out, len - out, "Maybe=");
 	out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES,
@@ -494,7 +494,7 @@ static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len)
 		       lock->ast_pending, lock->bast_pending,
 		       lock->convert_pending, lock->lock_pending,
 		       lock->cancel_pending, lock->unlock_pending,
-		       atomic_read(&lock->lock_refs.refcount));
+		       kref_read(&lock->lock_refs));
 	spin_unlock(&lock->spinlock);
 
 	return out;
@@ -521,7 +521,7 @@ static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len)
 			!list_empty(&res->recovering),
 			res->inflight_locks, res->migration_pending,
 			atomic_read(&res->asts_reserved),
-			atomic_read(&res->refs.refcount));
+			kref_read(&res->refs));
 
 	/* refmap */
 	out += snprintf(buf + out, len - out, "RMAP:");
@@ -777,7 +777,7 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
 	/* Purge Count: xxx  Refs: xxx */
 	out += snprintf(buf + out, len - out,
 			"Purge Count: %d  Refs: %d\n", dlm->purge_count,
-			atomic_read(&dlm->dlm_refs.refcount));
+			kref_read(&dlm->dlm_refs));
 
 	/* Dead Node: xxx */
 	out += snprintf(buf + out, len - out,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 733e4e79c8e2..32fd261ae13d 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -2072,7 +2072,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	INIT_LIST_HEAD(&dlm->dlm_eviction_callbacks);
 
 	mlog(0, "context init: refcount %u\n",
-		  atomic_read(&dlm->dlm_refs.refcount));
+		  kref_read(&dlm->dlm_refs));
 
 leave:
 	if (ret < 0 && dlm) {
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index a464c8088170..7025d8c27999 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -233,7 +233,7 @@ static void __dlm_put_mle(struct dlm_master_list_entry *mle)
 
 	assert_spin_locked(&dlm->spinlock);
 	assert_spin_locked(&dlm->master_lock);
-	if (!atomic_read(&mle->mle_refs.refcount)) {
+	if (!kref_read(&mle->mle_refs)) {
 		/* this may or may not crash, but who cares.
 		 * it's a BUG. */
 		mlog(ML_ERROR, "bad mle: %p\n", mle);
@@ -1124,9 +1124,9 @@ recheck:
 		unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS);
 
 		/*
-		if (atomic_read(&mle->mle_refs.refcount) < 2)
+		if (kref_read(&mle->mle_refs) < 2)
 			mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle,
-			atomic_read(&mle->mle_refs.refcount),
+			kref_read(&mle->mle_refs),
 			res->lockname.len, res->lockname.name);
 		*/
 		atomic_set(&mle->woken, 0);
@@ -1979,7 +1979,7 @@ ok:
 		 * on this mle. */
 		spin_lock(&dlm->master_lock);
 
-		rr = atomic_read(&mle->mle_refs.refcount);
+		rr = kref_read(&mle->mle_refs);
 		if (mle->inuse > 0) {
 			if (extra_ref && rr < 3)
 				err = 1;
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 1082b2c3014b..63d701cd1e2e 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -251,7 +251,7 @@ leave:
 		mlog(0, "lock %u:%llu should be gone now! refs=%d\n",
 		     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
 		     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
-		     atomic_read(&lock->lock_refs.refcount)-1);
+		     kref_read(&lock->lock_refs)-1);
 		dlm_lock_put(lock);
 	}
 	if (actions & DLM_UNLOCK_CALL_AST)
diff --git a/include/drm/drm_framebuffer.h b/include/drm/drm_framebuffer.h
index 1ddfa2928802..a232e7f0c869 100644
--- a/include/drm/drm_framebuffer.h
+++ b/include/drm/drm_framebuffer.h
@@ -247,7 +247,7 @@ static inline void drm_framebuffer_unreference(struct drm_framebuffer *fb)
  */
 static inline uint32_t drm_framebuffer_read_refcount(struct drm_framebuffer *fb)
 {
-	return atomic_read(&fb->base.refcount.refcount);
+	return kref_read(&fb->base.refcount);
 }
 
 /**
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
index cdbdb40eb5bd..feecf33a1212 100644
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -878,7 +878,7 @@ static inline int ttm_bo_reserve(struct ttm_buffer_object *bo,
 {
 	int ret;
 
-	WARN_ON(!atomic_read(&bo->kref.refcount));
+	WARN_ON(!kref_read(&bo->kref));
 
 	ret = __ttm_bo_reserve(bo, interruptible, no_wait, ticket);
 	if (likely(ret == 0))
@@ -903,7 +903,7 @@ static inline int ttm_bo_reserve_slowpath(struct ttm_buffer_object *bo,
 {
 	int ret = 0;
 
-	WARN_ON(!atomic_read(&bo->kref.refcount));
+	WARN_ON(!kref_read(&bo->kref));
 
 	if (interruptible)
 		ret = ww_mutex_lock_slow_interruptible(&bo->resv->lock,
diff --git a/include/linux/kref.h b/include/linux/kref.h
index 9af255ad1e2f..7c88d865f82f 100644
--- a/include/linux/kref.h
+++ b/include/linux/kref.h
@@ -35,6 +35,11 @@ static inline void kref_init(struct kref *kref)
 	atomic_set(&kref->refcount, 1);
 }
 
+static inline int kref_read(const struct kref *kref)
+{
+	return atomic_read(&kref->refcount);
+}
+
 /**
  * kref_get - increment refcount for object.
  * @kref: object.
diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index 62a60eeacb0a..8a511c0985aa 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -198,7 +198,7 @@ static inline struct cache_head  *cache_get(struct cache_head *h)
 
 static inline void cache_put(struct cache_head *h, struct cache_detail *cd)
 {
-	if (atomic_read(&h->ref.refcount) <= 2 &&
+	if (kref_read(&h->ref) <= 2 &&
 	    h->expiry_time < cd->nextcheck)
 		cd->nextcheck = h->expiry_time;
 	kref_put(&h->ref, cd->cache_put);
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 554671c81f4a..90708f68cc02 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -987,7 +987,7 @@ static inline void hci_conn_drop(struct hci_conn *conn)
 static inline void hci_dev_put(struct hci_dev *d)
 {
 	BT_DBG("%s orig refcnt %d", d->name,
-	       atomic_read(&d->dev.kobj.kref.refcount));
+	       kref_read(&d->dev.kobj.kref));
 
 	put_device(&d->dev);
 }
@@ -995,7 +995,7 @@ static inline void hci_dev_put(struct hci_dev *d)
 static inline struct hci_dev *hci_dev_hold(struct hci_dev *d)
 {
 	BT_DBG("%s orig refcnt %d", d->name,
-	       atomic_read(&d->dev.kobj.kref.refcount));
+	       kref_read(&d->dev.kobj.kref));
 
 	get_device(&d->dev);
 	return d;
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 1904a93f47d5..d491529332f4 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -920,7 +920,7 @@ static void chan_close_cb(struct l2cap_chan *chan)
 			BT_DBG("dev %p removing %speer %p", dev,
 			       last ? "last " : "1 ", peer);
 			BT_DBG("chan %p orig refcnt %d", chan,
-			       atomic_read(&chan->kref.refcount));
+			       kref_read(&chan->kref));
 
 			l2cap_chan_put(chan);
 			break;
diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c
index 5f123c3320a7..f0095fd79818 100644
--- a/net/bluetooth/a2mp.c
+++ b/net/bluetooth/a2mp.c
@@ -810,7 +810,7 @@ static struct l2cap_chan *a2mp_chan_open(struct l2cap_conn *conn, bool locked)
 /* AMP Manager functions */
 struct amp_mgr *amp_mgr_get(struct amp_mgr *mgr)
 {
-	BT_DBG("mgr %p orig refcnt %d", mgr, atomic_read(&mgr->kref.refcount));
+	BT_DBG("mgr %p orig refcnt %d", mgr, kref_read(&mgr->kref));
 
 	kref_get(&mgr->kref);
 
@@ -833,7 +833,7 @@ static void amp_mgr_destroy(struct kref *kref)
 
 int amp_mgr_put(struct amp_mgr *mgr)
 {
-	BT_DBG("mgr %p orig refcnt %d", mgr, atomic_read(&mgr->kref.refcount));
+	BT_DBG("mgr %p orig refcnt %d", mgr, kref_read(&mgr->kref));
 
 	return kref_put(&mgr->kref, &amp_mgr_destroy);
 }
diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c
index e32f34189007..02a4ccc04e1e 100644
--- a/net/bluetooth/amp.c
+++ b/net/bluetooth/amp.c
@@ -24,7 +24,7 @@
 void amp_ctrl_get(struct amp_ctrl *ctrl)
 {
 	BT_DBG("ctrl %p orig refcnt %d", ctrl,
-	       atomic_read(&ctrl->kref.refcount));
+	       kref_read(&ctrl->kref));
 
 	kref_get(&ctrl->kref);
 }
@@ -42,7 +42,7 @@ static void amp_ctrl_destroy(struct kref *kref)
 int amp_ctrl_put(struct amp_ctrl *ctrl)
 {
 	BT_DBG("ctrl %p orig refcnt %d", ctrl,
-	       atomic_read(&ctrl->kref.refcount));
+	       kref_read(&ctrl->kref));
 
 	return kref_put(&ctrl->kref, &amp_ctrl_destroy);
 }
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index ce0b5dd01953..fc7f321a3823 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -481,14 +481,14 @@ static void l2cap_chan_destroy(struct kref *kref)
 
 void l2cap_chan_hold(struct l2cap_chan *c)
 {
-	BT_DBG("chan %p orig refcnt %d", c, atomic_read(&c->kref.refcount));
+	BT_DBG("chan %p orig refcnt %d", c, kref_read(&c->kref));
 
 	kref_get(&c->kref);
 }
 
 void l2cap_chan_put(struct l2cap_chan *c)
 {
-	BT_DBG("chan %p orig refcnt %d", c, atomic_read(&c->kref.refcount));
+	BT_DBG("chan %p orig refcnt %d", c, kref_read(&c->kref));
 
 	kref_put(&c->kref, l2cap_chan_destroy);
 }
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 770c52701efa..bad3d4ae43f6 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -3425,7 +3425,7 @@ static void ceph_msg_release(struct kref *kref)
 struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
 {
 	dout("%s %p (was %d)\n", __func__, msg,
-	     atomic_read(&msg->kref.refcount));
+	     kref_read(&msg->kref));
 	kref_get(&msg->kref);
 	return msg;
 }
@@ -3434,7 +3434,7 @@ EXPORT_SYMBOL(ceph_msg_get);
 void ceph_msg_put(struct ceph_msg *msg)
 {
 	dout("%s %p (was %d)\n", __func__, msg,
-	     atomic_read(&msg->kref.refcount));
+	     kref_read(&msg->kref));
 	kref_put(&msg->kref, ceph_msg_release);
 }
 EXPORT_SYMBOL(ceph_msg_put);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 842f049abb86..f3378ba1a828 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -438,7 +438,7 @@ static void ceph_osdc_release_request(struct kref *kref)
 void ceph_osdc_get_request(struct ceph_osd_request *req)
 {
 	dout("%s %p (was %d)\n", __func__, req,
-	     atomic_read(&req->r_kref.refcount));
+	     kref_read(&req->r_kref));
 	kref_get(&req->r_kref);
 }
 EXPORT_SYMBOL(ceph_osdc_get_request);
@@ -447,7 +447,7 @@ void ceph_osdc_put_request(struct ceph_osd_request *req)
 {
 	if (req) {
 		dout("%s %p (was %d)\n", __func__, req,
-		     atomic_read(&req->r_kref.refcount));
+		     kref_read(&req->r_kref));
 		kref_put(&req->r_kref, ceph_osdc_release_request);
 	}
 }
@@ -487,11 +487,11 @@ static void request_reinit(struct ceph_osd_request *req)
 	struct ceph_msg *reply_msg = req->r_reply;
 
 	dout("%s req %p\n", __func__, req);
-	WARN_ON(atomic_read(&req->r_kref.refcount) != 1);
+	WARN_ON(kref_read(&req->r_kref) != 1);
 	request_release_checks(req);
 
-	WARN_ON(atomic_read(&request_msg->kref.refcount) != 1);
-	WARN_ON(atomic_read(&reply_msg->kref.refcount) != 1);
+	WARN_ON(kref_read(&request_msg->kref) != 1);
+	WARN_ON(kref_read(&reply_msg->kref) != 1);
 	target_destroy(&req->r_t);
 
 	request_init(req);
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 8147e8d56eb2..f39e3e11f9aa 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -1358,7 +1358,7 @@ static int c_show(struct seq_file *m, void *p)
 	ifdebug(CACHE)
 		seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n",
 			   convert_to_wallclock(cp->expiry_time),
-			   atomic_read(&cp->ref.refcount), cp->flags);
+			   kref_read(&cp->ref), cp->flags);
 	cache_get(cp);
 	if (cache_check(cd, cp, NULL))
 		/* cache_check does a cache_put on failure */
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 3bc1d61694cb..04e7f8707d71 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -490,7 +490,7 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
 		svc_xprt_get(xprt);
 
 		dprintk("svc: transport %p dequeued, inuse=%d\n",
-			xprt, atomic_read(&xprt->xpt_ref.refcount));
+			xprt, kref_read(&xprt->xpt_ref));
 	}
 	spin_unlock_bh(&pool->sp_lock);
 out:
@@ -820,7 +820,7 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
 		/* XPT_DATA|XPT_DEFERRED case: */
 		dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
 			rqstp, rqstp->rq_pool->sp_id, xprt,
-			atomic_read(&xprt->xpt_ref.refcount));
+			kref_read(&xprt->xpt_ref));
 		rqstp->rq_deferred = svc_deferred_dequeue(xprt);
 		if (rqstp->rq_deferred)
 			len = svc_deferred_recv(rqstp);
@@ -978,7 +978,7 @@ static void svc_age_temp_xprts(unsigned long closure)
 		 * through, close it. */
 		if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags))
 			continue;
-		if (atomic_read(&xprt->xpt_ref.refcount) > 1 ||
+		if (kref_read(&xprt->xpt_ref) > 1 ||
 		    test_bit(XPT_BUSY, &xprt->xpt_flags))
 			continue;
 		list_del_init(le);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index ca2799af05a6..39652d390a9c 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -1201,9 +1201,9 @@ static void __svc_rdma_free(struct work_struct *work)
 		ib_drain_qp(rdma->sc_qp);
 
 	/* We should only be called from kref_put */
-	if (atomic_read(&xprt->xpt_ref.refcount) != 0)
+	if (kref_read(&xprt->xpt_ref) != 0)
 		pr_err("svcrdma: sc_xprt still in use? (%d)\n",
-		       atomic_read(&xprt->xpt_ref.refcount));
+		       kref_read(&xprt->xpt_ref));
 
 	/*
 	 * Destroy queued, but not processed read completions. Note
-- 
cgit v1.2.3


From bdfafc4ffdd24e491119d81f85ddc4393fa49803 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 14 Nov 2016 17:34:19 +0100
Subject: locking/atomic, kref: Kill kref_sub()

By general sentiment kref_sub() is a bad interface, make it go away.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/block/drbd/drbd_main.c         |  7 ++--
 drivers/block/drbd/drbd_req.c          | 31 ++++++------------
 drivers/gpu/drm/ttm/ttm_bo.c           | 59 +++++++++-------------------------
 drivers/gpu/drm/ttm/ttm_execbuf_util.c |  4 +--
 include/drm/ttm/ttm_bo_api.h           | 15 +--------
 include/linux/kref.h                   | 32 +++---------------
 6 files changed, 36 insertions(+), 112 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 83482721bc01..c3ff60c30dde 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2948,7 +2948,6 @@ void drbd_delete_device(struct drbd_device *device)
 	struct drbd_resource *resource = device->resource;
 	struct drbd_connection *connection;
 	struct drbd_peer_device *peer_device;
-	int refs = 3;
 
 	/* move to free_peer_device() */
 	for_each_peer_device(peer_device, device)
@@ -2956,13 +2955,15 @@ void drbd_delete_device(struct drbd_device *device)
 	drbd_debugfs_device_cleanup(device);
 	for_each_connection(connection, resource) {
 		idr_remove(&connection->peer_devices, device->vnr);
-		refs++;
+		kref_put(&device->kref, drbd_destroy_device);
 	}
 	idr_remove(&resource->devices, device->vnr);
+	kref_put(&device->kref, drbd_destroy_device);
 	idr_remove(&drbd_devices, device_to_minor(device));
+	kref_put(&device->kref, drbd_destroy_device);
 	del_gendisk(device->vdisk);
 	synchronize_rcu();
-	kref_sub(&device->kref, refs, drbd_destroy_device);
+	kref_put(&device->kref, drbd_destroy_device);
 }
 
 static int __init drbd_init(void)
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 74306c054983..b489ac2e9c44 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -421,7 +421,6 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
 	struct drbd_peer_device *peer_device = first_peer_device(device);
 	unsigned s = req->rq_state;
 	int c_put = 0;
-	int k_put = 0;
 
 	if (drbd_suspended(device) && !((s | clear) & RQ_COMPLETION_SUSP))
 		set |= RQ_COMPLETION_SUSP;
@@ -437,6 +436,8 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
 
 	/* intent: get references */
 
+	kref_get(&req->kref);
+
 	if (!(s & RQ_LOCAL_PENDING) && (set & RQ_LOCAL_PENDING))
 		atomic_inc(&req->completion_ref);
 
@@ -473,15 +474,12 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
 
 	if (!(s & RQ_LOCAL_ABORTED) && (set & RQ_LOCAL_ABORTED)) {
 		D_ASSERT(device, req->rq_state & RQ_LOCAL_PENDING);
-		/* local completion may still come in later,
-		 * we need to keep the req object around. */
-		kref_get(&req->kref);
 		++c_put;
 	}
 
 	if ((s & RQ_LOCAL_PENDING) && (clear & RQ_LOCAL_PENDING)) {
 		if (req->rq_state & RQ_LOCAL_ABORTED)
-			++k_put;
+			kref_put(&req->kref, drbd_req_destroy);
 		else
 			++c_put;
 		list_del_init(&req->req_pending_local);
@@ -503,7 +501,7 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
 		if (s & RQ_NET_SENT)
 			atomic_sub(req->i.size >> 9, &device->ap_in_flight);
 		if (s & RQ_EXP_BARR_ACK)
-			++k_put;
+			kref_put(&req->kref, drbd_req_destroy);
 		req->net_done_jif = jiffies;
 
 		/* in ahead/behind mode, or just in case,
@@ -516,25 +514,16 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
 
 	/* potentially complete and destroy */
 
-	if (k_put || c_put) {
-		/* Completion does it's own kref_put.  If we are going to
-		 * kref_sub below, we need req to be still around then. */
-		int at_least = k_put + !!c_put;
-		int refcount = kref_read(&req->kref);
-		if (refcount < at_least)
-			drbd_err(device,
-				"mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n",
-				s, req->rq_state, refcount, at_least);
-	}
-
 	/* If we made progress, retry conflicting peer requests, if any. */
 	if (req->i.waiting)
 		wake_up(&device->misc_wait);
 
-	if (c_put)
-		k_put += drbd_req_put_completion_ref(req, m, c_put);
-	if (k_put)
-		kref_sub(&req->kref, k_put, drbd_req_destroy);
+	if (c_put) {
+		if (drbd_req_put_completion_ref(req, m, c_put))
+			kref_put(&req->kref, drbd_req_destroy);
+	} else {
+		kref_put(&req->kref, drbd_req_destroy);
+	}
 }
 
 static void drbd_report_io_error(struct drbd_device *device, struct drbd_request *req)
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 30aefcc0c969..ffc6cb55c78c 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -181,61 +181,46 @@ void ttm_bo_add_to_lru(struct ttm_buffer_object *bo)
 }
 EXPORT_SYMBOL(ttm_bo_add_to_lru);
 
-int ttm_bo_del_from_lru(struct ttm_buffer_object *bo)
+static void ttm_bo_ref_bug(struct kref *list_kref)
+{
+	BUG();
+}
+
+void ttm_bo_del_from_lru(struct ttm_buffer_object *bo)
 {
 	struct ttm_bo_device *bdev = bo->bdev;
-	int put_count = 0;
 
 	if (bdev->driver->lru_removal)
 		bdev->driver->lru_removal(bo);
 
 	if (!list_empty(&bo->swap)) {
 		list_del_init(&bo->swap);
-		++put_count;
+		kref_put(&bo->list_kref, ttm_bo_ref_bug);
 	}
 	if (!list_empty(&bo->lru)) {
 		list_del_init(&bo->lru);
-		++put_count;
+		kref_put(&bo->list_kref, ttm_bo_ref_bug);
 	}
-
-	return put_count;
-}
-
-static void ttm_bo_ref_bug(struct kref *list_kref)
-{
-	BUG();
-}
-
-void ttm_bo_list_ref_sub(struct ttm_buffer_object *bo, int count,
-			 bool never_free)
-{
-	kref_sub(&bo->list_kref, count,
-		 (never_free) ? ttm_bo_ref_bug : ttm_bo_release_list);
 }
 
 void ttm_bo_del_sub_from_lru(struct ttm_buffer_object *bo)
 {
-	int put_count;
-
 	spin_lock(&bo->glob->lru_lock);
-	put_count = ttm_bo_del_from_lru(bo);
+	ttm_bo_del_from_lru(bo);
 	spin_unlock(&bo->glob->lru_lock);
-	ttm_bo_list_ref_sub(bo, put_count, true);
 }
 EXPORT_SYMBOL(ttm_bo_del_sub_from_lru);
 
 void ttm_bo_move_to_lru_tail(struct ttm_buffer_object *bo)
 {
 	struct ttm_bo_device *bdev = bo->bdev;
-	int put_count = 0;
 
 	lockdep_assert_held(&bo->resv->lock.base);
 
 	if (bdev->driver->lru_removal)
 		bdev->driver->lru_removal(bo);
 
-	put_count = ttm_bo_del_from_lru(bo);
-	ttm_bo_list_ref_sub(bo, put_count, true);
+	ttm_bo_del_from_lru(bo);
 	ttm_bo_add_to_lru(bo);
 }
 EXPORT_SYMBOL(ttm_bo_move_to_lru_tail);
@@ -447,7 +432,6 @@ static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo)
 {
 	struct ttm_bo_device *bdev = bo->bdev;
 	struct ttm_bo_global *glob = bo->glob;
-	int put_count;
 	int ret;
 
 	spin_lock(&glob->lru_lock);
@@ -455,13 +439,10 @@ static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo)
 
 	if (!ret) {
 		if (!ttm_bo_wait(bo, false, true)) {
-			put_count = ttm_bo_del_from_lru(bo);
-
+			ttm_bo_del_from_lru(bo);
 			spin_unlock(&glob->lru_lock);
 			ttm_bo_cleanup_memtype_use(bo);
 
-			ttm_bo_list_ref_sub(bo, put_count, true);
-
 			return;
 		} else
 			ttm_bo_flush_all_fences(bo);
@@ -504,7 +485,6 @@ static int ttm_bo_cleanup_refs_and_unlock(struct ttm_buffer_object *bo,
 					  bool no_wait_gpu)
 {
 	struct ttm_bo_global *glob = bo->glob;
-	int put_count;
 	int ret;
 
 	ret = ttm_bo_wait(bo, false, true);
@@ -554,15 +534,13 @@ static int ttm_bo_cleanup_refs_and_unlock(struct ttm_buffer_object *bo,
 		return ret;
 	}
 
-	put_count = ttm_bo_del_from_lru(bo);
+	ttm_bo_del_from_lru(bo);
 	list_del_init(&bo->ddestroy);
-	++put_count;
+	kref_put(&bo->list_kref, ttm_bo_ref_bug);
 
 	spin_unlock(&glob->lru_lock);
 	ttm_bo_cleanup_memtype_use(bo);
 
-	ttm_bo_list_ref_sub(bo, put_count, true);
-
 	return 0;
 }
 
@@ -740,7 +718,7 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
 	struct ttm_bo_global *glob = bdev->glob;
 	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
 	struct ttm_buffer_object *bo;
-	int ret = -EBUSY, put_count;
+	int ret = -EBUSY;
 
 	spin_lock(&glob->lru_lock);
 	list_for_each_entry(bo, &man->lru, lru) {
@@ -771,13 +749,11 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
 		return ret;
 	}
 
-	put_count = ttm_bo_del_from_lru(bo);
+	ttm_bo_del_from_lru(bo);
 	spin_unlock(&glob->lru_lock);
 
 	BUG_ON(ret != 0);
 
-	ttm_bo_list_ref_sub(bo, put_count, true);
-
 	ret = ttm_bo_evict(bo, interruptible, no_wait_gpu);
 	ttm_bo_unreserve(bo);
 
@@ -1669,7 +1645,6 @@ static int ttm_bo_swapout(struct ttm_mem_shrink *shrink)
 	    container_of(shrink, struct ttm_bo_global, shrink);
 	struct ttm_buffer_object *bo;
 	int ret = -EBUSY;
-	int put_count;
 	uint32_t swap_placement = (TTM_PL_FLAG_CACHED | TTM_PL_FLAG_SYSTEM);
 
 	spin_lock(&glob->lru_lock);
@@ -1692,11 +1667,9 @@ static int ttm_bo_swapout(struct ttm_mem_shrink *shrink)
 		return ret;
 	}
 
-	put_count = ttm_bo_del_from_lru(bo);
+	ttm_bo_del_from_lru(bo);
 	spin_unlock(&glob->lru_lock);
 
-	ttm_bo_list_ref_sub(bo, put_count, true);
-
 	/**
 	 * Move to system cached
 	 */
diff --git a/drivers/gpu/drm/ttm/ttm_execbuf_util.c b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
index d35bc491e8de..5e1bcabffef5 100644
--- a/drivers/gpu/drm/ttm/ttm_execbuf_util.c
+++ b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
@@ -48,9 +48,7 @@ static void ttm_eu_del_from_lru_locked(struct list_head *list)
 
 	list_for_each_entry(entry, list, head) {
 		struct ttm_buffer_object *bo = entry->bo;
-		unsigned put_count = ttm_bo_del_from_lru(bo);
-
-		ttm_bo_list_ref_sub(bo, put_count, true);
+		ttm_bo_del_from_lru(bo);
 	}
 }
 
diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
index 652e45be97c8..9a465314572c 100644
--- a/include/drm/ttm/ttm_bo_api.h
+++ b/include/drm/ttm/ttm_bo_api.h
@@ -332,19 +332,6 @@ extern int ttm_bo_validate(struct ttm_buffer_object *bo,
  */
 extern void ttm_bo_unref(struct ttm_buffer_object **bo);
 
-
-/**
- * ttm_bo_list_ref_sub
- *
- * @bo: The buffer object.
- * @count: The number of references with which to decrease @bo::list_kref;
- * @never_free: The refcount should not reach zero with this operation.
- *
- * Release @count lru list references to this buffer object.
- */
-extern void ttm_bo_list_ref_sub(struct ttm_buffer_object *bo, int count,
-				bool never_free);
-
 /**
  * ttm_bo_add_to_lru
  *
@@ -367,7 +354,7 @@ extern void ttm_bo_add_to_lru(struct ttm_buffer_object *bo);
  * and is usually called just immediately after the bo has been reserved to
  * avoid recursive reservation from lru lists.
  */
-extern int ttm_bo_del_from_lru(struct ttm_buffer_object *bo);
+extern void ttm_bo_del_from_lru(struct ttm_buffer_object *bo);
 
 /**
  * ttm_bo_move_to_lru_tail
diff --git a/include/linux/kref.h b/include/linux/kref.h
index 7c88d865f82f..31c49a64cdf9 100644
--- a/include/linux/kref.h
+++ b/include/linux/kref.h
@@ -54,9 +54,8 @@ static inline void kref_get(struct kref *kref)
 }
 
 /**
- * kref_sub - subtract a number of refcounts for object.
+ * kref_put - decrement refcount for object.
  * @kref: object.
- * @count: Number of recounts to subtract.
  * @release: pointer to the function that will clean up the object when the
  *	     last reference to the object is released.
  *	     This pointer is required, and it is not acceptable to pass kfree
@@ -65,46 +64,23 @@ static inline void kref_get(struct kref *kref)
  *	     maintainer, and anyone else who happens to notice it.  You have
  *	     been warned.
  *
- * Subtract @count from the refcount, and if 0, call release().
+ * Decrement the refcount, and if 0, call release().
  * Return 1 if the object was removed, otherwise return 0.  Beware, if this
  * function returns 0, you still can not count on the kref from remaining in
  * memory.  Only use the return value if you want to see if the kref is now
  * gone, not present.
  */
-static inline int kref_sub(struct kref *kref, unsigned int count,
-	     void (*release)(struct kref *kref))
+static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref))
 {
 	WARN_ON(release == NULL);
 
-	if (atomic_sub_and_test((int) count, &kref->refcount)) {
+	if (atomic_dec_and_test(&kref->refcount)) {
 		release(kref);
 		return 1;
 	}
 	return 0;
 }
 
-/**
- * kref_put - decrement refcount for object.
- * @kref: object.
- * @release: pointer to the function that will clean up the object when the
- *	     last reference to the object is released.
- *	     This pointer is required, and it is not acceptable to pass kfree
- *	     in as this function.  If the caller does pass kfree to this
- *	     function, you will be publicly mocked mercilessly by the kref
- *	     maintainer, and anyone else who happens to notice it.  You have
- *	     been warned.
- *
- * Decrement the refcount, and if 0, call release().
- * Return 1 if the object was removed, otherwise return 0.  Beware, if this
- * function returns 0, you still can not count on the kref from remaining in
- * memory.  Only use the return value if you want to see if the kref is now
- * gone, not present.
- */
-static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref))
-{
-	return kref_sub(kref, 1, release);
-}
-
 static inline int kref_put_mutex(struct kref *kref,
 				 void (*release)(struct kref *kref),
 				 struct mutex *lock)
-- 
cgit v1.2.3


From 23b19ec3377924eb8f303880102e056e9214ba72 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 14 Jan 2017 12:11:59 +0100
Subject: locking/ww_mutex: Turn off __must_check for now
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With the ww_mutex inline wrappers gone there's a lot of dormant
anti-patterns emerging in an x86 allyesconfig build:

  kernel/locking/test-ww_mutex.c:80:2: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  kernel/locking/test-ww_mutex.c:55:3: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  kernel/locking/test-ww_mutex.c:134:2: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  kernel/locking/test-ww_mutex.c:213:2: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  kernel/locking/test-ww_mutex.c:177:2: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  kernel/locking/test-ww_mutex.c:266:2: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:213:19: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:213:19: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:213:19: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:213:19: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:213:19: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:213:19: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:213:19: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:213:19: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:211:20: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:211:20: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:211:20: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:211:20: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:213:19: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:211:20: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:211:20: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  lib/locking-selftest.c:211:20: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  drivers/gpu/drm/drm_modeset_lock.c:430:2: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c:70:2: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  drivers/gpu/drm/vgem/vgem_fence.c:193:2: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  drivers/gpu/drm/i915/i915_gem_batch_pool.c:125:4: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  drivers/gpu/drm/i915/i915_gem_execbuffer.c:1302:2: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  drivers/gpu/drm/radeon/radeon_prime.c:69:2: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]
  drivers/gpu/drm/nouveau/nouveau_prime.c:70:2: warning: ignoring return value of ‘ww_mutex_lock’, declared with attribute warn_unused_result [-Wunused-result]

... but we cannot just litter the kernel build log with such warnings.

These need to be fixed separately - turn off the warning for now.

Cc: Nicolai Hähnle <nicolai.haehnle@amd.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Maarten Lankhorst <dev@mblankhorst.nl>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dri-devel@lists.freedesktop.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/ww_mutex.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h
index 083950fd5b0b..5dd9a7682227 100644
--- a/include/linux/ww_mutex.h
+++ b/include/linux/ww_mutex.h
@@ -215,8 +215,7 @@ static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx)
  *
  * A mutex acquired with this function must be released with ww_mutex_unlock.
  */
-extern int __must_check ww_mutex_lock(struct ww_mutex *lock,
-				      struct ww_acquire_ctx *ctx);
+extern int /* __must_check */ ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx);
 
 /**
  * ww_mutex_lock_interruptible - acquire the w/w mutex, interruptible
-- 
cgit v1.2.3


From f21860bac05b609d71757338361d26209ff0759b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 14 Jan 2017 17:11:36 +0100
Subject: locking/mutex, sched/wait: Fix the mutex_lock_io_nested() define

Mike noticed this bogosity:

 > > +# define mutex_lock_nest_io(lock, nest_lock) mutex_io(lock)
 >                                                 ^^^^^^^^^^^^^^ typo

This new locking API is not used yet, so this didn't trigger in testing.

Fix it.

Reported-by: Mike Galbraith <efault@gmx.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: adilger.kernel@dilger.ca
Cc: jack@suse.com
Cc: kernel-team@fb.com
Cc: mingbo@fb.com
Cc: tytso@mit.edu
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/mutex.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 980ba16b8749..7fffbfcd5430 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -179,7 +179,7 @@ extern void mutex_lock_io(struct mutex *lock);
 # define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock)
 # define mutex_lock_killable_nested(lock, subclass) mutex_lock_killable(lock)
 # define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
-# define mutex_lock_nest_io(lock, nest_lock) mutex_io(lock)
+# define mutex_lock_io_nested(lock, subclass) mutex_lock(lock)
 #endif
 
 /*
-- 
cgit v1.2.3


From de901cc31d151c4c855346c29fb61eaf5ffac3ad Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Sun, 15 Jan 2017 14:51:12 -0800
Subject: Input: mpr121 - switch to device tree probe

This driver currently only supports legacy platform data probe.  This
change adds device tree support and gets rid of platform data probe code
since no one is actually using mpr121 platform data in the mainline.

The device tree property parsing code is based on the work of
atmel_captouch driver.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 .../devicetree/bindings/input/mpr121-touchkey.txt  |  30 +++++
 drivers/input/keyboard/mpr121_touchkey.c           | 137 ++++++++++++++-------
 include/linux/i2c/mpr121_touchkey.h                |  20 ---
 3 files changed, 124 insertions(+), 63 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/input/mpr121-touchkey.txt
 delete mode 100644 include/linux/i2c/mpr121_touchkey.h

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/input/mpr121-touchkey.txt b/Documentation/devicetree/bindings/input/mpr121-touchkey.txt
new file mode 100644
index 000000000000..b7c61ee5841b
--- /dev/null
+++ b/Documentation/devicetree/bindings/input/mpr121-touchkey.txt
@@ -0,0 +1,30 @@
+* Freescale MPR121 Controllor
+
+Required Properties:
+- compatible:		Should be "fsl,mpr121-touchkey"
+- reg:			The I2C slave address of the device.
+- interrupts:		The interrupt number to the cpu.
+- vdd-supply:		Phandle to the Vdd power supply.
+- linux,keycodes:	Specifies an array of numeric keycode values to
+			be used for reporting button presses. The array can
+			contain up to 12 entries.
+
+Optional Properties:
+- wakeup-source:	Use any event on keypad as wakeup event.
+- autorepeat:		Enable autorepeat feature.
+
+Example:
+
+#include "dt-bindings/input/input.h"
+
+	touchkey: mpr121@5a {
+		compatible = "fsl,mpr121-touchkey";
+		reg = <0x5a>;
+		interrupt-parent = <&gpio1>;
+		interrupts = <28 2>;
+		autorepeat;
+		vdd-supply = <&ldo4_reg>;
+		linux,keycodes = <KEY_0>, <KEY_1>, <KEY_2>, <KEY_3>,
+				<KEY_4> <KEY_5>, <KEY_6>, <KEY_7>,
+				<KEY_8>, <KEY_9>, <KEY_A>, <KEY_B>;
+	};
diff --git a/drivers/input/keyboard/mpr121_touchkey.c b/drivers/input/keyboard/mpr121_touchkey.c
index 83dd5616e470..989ca66f63af 100644
--- a/drivers/input/keyboard/mpr121_touchkey.c
+++ b/drivers/input/keyboard/mpr121_touchkey.c
@@ -12,14 +12,16 @@
  *
  */
 
-#include <linux/module.h>
-#include <linux/input.h>
-#include <linux/i2c.h>
-#include <linux/slab.h>
-#include <linux/delay.h>
 #include <linux/bitops.h>
+#include <linux/delay.h>
+#include <linux/i2c.h>
+#include <linux/input.h>
 #include <linux/interrupt.h>
-#include <linux/i2c/mpr121_touchkey.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/property.h>
+#include <linux/regulator/consumer.h>
+#include <linux/slab.h>
 
 /* Register definitions */
 #define ELE_TOUCH_STATUS_0_ADDR	0x0
@@ -61,7 +63,7 @@ struct mpr121_touchkey {
 	struct input_dev	*input_dev;
 	unsigned int		statusbits;
 	unsigned int		keycount;
-	u16			keycodes[MPR121_MAX_KEY_COUNT];
+	u32			keycodes[MPR121_MAX_KEY_COUNT];
 };
 
 struct mpr121_init_register {
@@ -81,6 +83,42 @@ static const struct mpr121_init_register init_reg_table[] = {
 	{ AUTO_CONFIG_CTRL_ADDR, 0x0b },
 };
 
+static void mpr121_vdd_supply_disable(void *data)
+{
+	struct regulator *vdd_supply = data;
+
+	regulator_disable(vdd_supply);
+}
+
+static struct regulator *mpr121_vdd_supply_init(struct device *dev)
+{
+	struct regulator *vdd_supply;
+	int err;
+
+	vdd_supply = devm_regulator_get(dev, "vdd");
+	if (IS_ERR(vdd_supply)) {
+		dev_err(dev, "failed to get vdd regulator: %ld\n",
+			PTR_ERR(vdd_supply));
+		return vdd_supply;
+	}
+
+	err = regulator_enable(vdd_supply);
+	if (err) {
+		dev_err(dev, "failed to enable vdd regulator: %d\n", err);
+		return ERR_PTR(err);
+	}
+
+	err = devm_add_action(dev, mpr121_vdd_supply_disable, vdd_supply);
+	if (err) {
+		regulator_disable(vdd_supply);
+		dev_err(dev, "failed to add disable regulator action: %d\n",
+			err);
+		return ERR_PTR(err);
+	}
+
+	return vdd_supply;
+}
+
 static irqreturn_t mpr_touchkey_interrupt(int irq, void *dev_id)
 {
 	struct mpr121_touchkey *mpr121 = dev_id;
@@ -126,9 +164,8 @@ out:
 	return IRQ_HANDLED;
 }
 
-static int mpr121_phys_init(const struct mpr121_platform_data *pdata,
-				      struct mpr121_touchkey *mpr121,
-				      struct i2c_client *client)
+static int mpr121_phys_init(struct mpr121_touchkey *mpr121,
+			    struct i2c_client *client, int vdd_uv)
 {
 	const struct mpr121_init_register *reg;
 	unsigned char usl, lsl, tl, eleconf;
@@ -158,9 +195,9 @@ static int mpr121_phys_init(const struct mpr121_platform_data *pdata,
 	/*
 	 * Capacitance on sensing input varies and needs to be compensated.
 	 * The internal MPR121-auto-configuration can do this if it's
-	 * registers are set properly (based on pdata->vdd_uv).
+	 * registers are set properly (based on vdd_uv).
 	 */
-	vdd = pdata->vdd_uv / 1000;
+	vdd = vdd_uv / 1000;
 	usl = ((vdd - 700) * 256) / vdd;
 	lsl = (usl * 65) / 100;
 	tl = (usl * 90) / 100;
@@ -191,35 +228,26 @@ err_i2c_write:
 static int mpr_touchkey_probe(struct i2c_client *client,
 			      const struct i2c_device_id *id)
 {
-	const struct mpr121_platform_data *pdata =
-			dev_get_platdata(&client->dev);
+	struct device *dev = &client->dev;
+	struct regulator *vdd_supply;
+	int vdd_uv;
 	struct mpr121_touchkey *mpr121;
 	struct input_dev *input_dev;
 	int error;
 	int i;
 
-	if (!pdata) {
-		dev_err(&client->dev, "no platform data defined\n");
-		return -EINVAL;
-	}
-
-	if (!pdata->keymap || !pdata->keymap_size) {
-		dev_err(&client->dev, "missing keymap data\n");
-		return -EINVAL;
-	}
-
-	if (pdata->keymap_size > MPR121_MAX_KEY_COUNT) {
-		dev_err(&client->dev, "too many keys defined\n");
-		return -EINVAL;
-	}
-
 	if (!client->irq) {
 		dev_err(&client->dev, "irq number should not be zero\n");
 		return -EINVAL;
 	}
 
-	mpr121 = devm_kzalloc(&client->dev, sizeof(*mpr121),
-			      GFP_KERNEL);
+	vdd_supply = mpr121_vdd_supply_init(dev);
+	if (IS_ERR(vdd_supply))
+		return PTR_ERR(vdd_supply);
+
+	vdd_uv = regulator_get_voltage(vdd_supply);
+
+	mpr121 = devm_kzalloc(&client->dev, sizeof(*mpr121), GFP_KERNEL);
 	if (!mpr121)
 		return -ENOMEM;
 
@@ -229,33 +257,46 @@ static int mpr_touchkey_probe(struct i2c_client *client,
 
 	mpr121->client = client;
 	mpr121->input_dev = input_dev;
-	mpr121->keycount = pdata->keymap_size;
+	mpr121->keycount = device_property_read_u32_array(dev, "linux,keycodes",
+							  NULL, 0);
+	if (mpr121->keycount > MPR121_MAX_KEY_COUNT) {
+		dev_err(dev, "too many keys defined (%d)\n", mpr121->keycount);
+		return -EINVAL;
+	}
+
+	error = device_property_read_u32_array(dev, "linux,keycodes",
+					       mpr121->keycodes,
+					       mpr121->keycount);
+	if (error) {
+		dev_err(dev,
+			"failed to read linux,keycode property: %d\n", error);
+		return error;
+	}
 
 	input_dev->name = "Freescale MPR121 Touchkey";
 	input_dev->id.bustype = BUS_I2C;
 	input_dev->dev.parent = &client->dev;
-	input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REP);
+	if (device_property_read_bool(dev, "autorepeat"))
+		__set_bit(EV_REP, input_dev->evbit);
 	input_set_capability(input_dev, EV_MSC, MSC_SCAN);
 
 	input_dev->keycode = mpr121->keycodes;
 	input_dev->keycodesize = sizeof(mpr121->keycodes[0]);
 	input_dev->keycodemax = mpr121->keycount;
 
-	for (i = 0; i < pdata->keymap_size; i++) {
-		input_set_capability(input_dev, EV_KEY, pdata->keymap[i]);
-		mpr121->keycodes[i] = pdata->keymap[i];
-	}
+	for (i = 0; i < mpr121->keycount; i++)
+		input_set_capability(input_dev, EV_KEY, mpr121->keycodes[i]);
 
-	error = mpr121_phys_init(pdata, mpr121, client);
+	error = mpr121_phys_init(mpr121, client, vdd_uv);
 	if (error) {
 		dev_err(&client->dev, "Failed to init register\n");
 		return error;
 	}
 
-	error = devm_request_threaded_irq(&client->dev, client->irq, NULL,
-				     mpr_touchkey_interrupt,
-				     IRQF_TRIGGER_FALLING | IRQF_ONESHOT,
-				     client->dev.driver->name, mpr121);
+	error = devm_request_threaded_irq(&client->dev, client->irq,
+					  NULL, mpr_touchkey_interrupt,
+					  IRQF_TRIGGER_FALLING | IRQF_ONESHOT,
+					  client->dev.driver->name, mpr121);
 	if (error) {
 		dev_err(&client->dev, "Failed to register interrupt\n");
 		return error;
@@ -266,7 +307,8 @@ static int mpr_touchkey_probe(struct i2c_client *client,
 		return error;
 
 	i2c_set_clientdata(client, mpr121);
-	device_init_wakeup(&client->dev, pdata->wakeup);
+	device_init_wakeup(dev,
+			device_property_read_bool(dev, "wakeup-source"));
 
 	return 0;
 }
@@ -305,10 +347,19 @@ static const struct i2c_device_id mpr121_id[] = {
 };
 MODULE_DEVICE_TABLE(i2c, mpr121_id);
 
+#ifdef CONFIG_OF
+static const struct of_device_id mpr121_touchkey_dt_match_table[] = {
+	{ .compatible = "fsl,mpr121-touchkey" },
+	{ },
+};
+MODULE_DEVICE_TABLE(of, mpr121_touchkey_dt_match_table);
+#endif
+
 static struct i2c_driver mpr_touchkey_driver = {
 	.driver = {
 		.name	= "mpr121",
 		.pm	= &mpr121_touchkey_pm_ops,
+		.of_match_table = of_match_ptr(mpr121_touchkey_dt_match_table),
 	},
 	.id_table	= mpr121_id,
 	.probe		= mpr_touchkey_probe,
diff --git a/include/linux/i2c/mpr121_touchkey.h b/include/linux/i2c/mpr121_touchkey.h
deleted file mode 100644
index f0bcc38bbb97..000000000000
--- a/include/linux/i2c/mpr121_touchkey.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Header file for Freescale MPR121 Capacitive Touch Sensor */
-
-#ifndef _MPR121_TOUCHKEY_H
-#define _MPR121_TOUCHKEY_H
-
-/**
- * struct mpr121_platform_data - platform data for mpr121 sensor
- * @keymap: pointer to array of KEY_* values representing keymap
- * @keymap_size: size of the keymap
- * @wakeup: configure the button as a wake-up source
- * @vdd_uv: VDD voltage in uV
- */
-struct mpr121_platform_data {
-	const unsigned short *keymap;
-	unsigned int keymap_size;
-	bool wakeup;
-	int vdd_uv;
-};
-
-#endif /* _MPR121_TOUCHKEY_H */
-- 
cgit v1.2.3


From e12799125074c67cd30bc3d7a5ff5e2f29ea1411 Mon Sep 17 00:00:00 2001
From: Stanimir Varbanov <stanimir.varbanov@linaro.org>
Date: Tue, 22 Nov 2016 19:03:09 +0200
Subject: firmware: qcom: scm: Add empty functions to help compile testing

This will help to compile testing drivers which depends on scm
functions with COMPILE_TEST Kconfig option.

Signed-off-by: Stanimir Varbanov <stanimir.varbanov@linaro.org>
Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Andy Gross <andy.gross@linaro.org>
---
 include/linux/qcom_scm.h | 50 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 35 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/qcom_scm.h b/include/linux/qcom_scm.h
index cc32ab852fbc..7e004fb57fc4 100644
--- a/include/linux/qcom_scm.h
+++ b/include/linux/qcom_scm.h
@@ -13,9 +13,9 @@
 #ifndef __QCOM_SCM_H
 #define __QCOM_SCM_H
 
-extern int qcom_scm_set_cold_boot_addr(void *entry, const cpumask_t *cpus);
-extern int qcom_scm_set_warm_boot_addr(void *entry, const cpumask_t *cpus);
-
+#define QCOM_SCM_VERSION(major, minor)	(((major) << 16) | ((minor) & 0xFF))
+#define QCOM_SCM_CPU_PWR_DOWN_L2_ON	0x0
+#define QCOM_SCM_CPU_PWR_DOWN_L2_OFF	0x1
 #define QCOM_SCM_HDCP_MAX_REQ_CNT	5
 
 struct qcom_scm_hdcp_req {
@@ -23,27 +23,47 @@ struct qcom_scm_hdcp_req {
 	u32 val;
 };
 
+#if IS_ENABLED(CONFIG_QCOM_SCM)
+extern int qcom_scm_set_cold_boot_addr(void *entry, const cpumask_t *cpus);
+extern int qcom_scm_set_warm_boot_addr(void *entry, const cpumask_t *cpus);
 extern bool qcom_scm_is_available(void);
-
 extern bool qcom_scm_hdcp_available(void);
 extern int qcom_scm_hdcp_req(struct qcom_scm_hdcp_req *req, u32 req_cnt,
-		u32 *resp);
-
+			     u32 *resp);
 extern bool qcom_scm_pas_supported(u32 peripheral);
 extern int qcom_scm_pas_init_image(u32 peripheral, const void *metadata,
-		size_t size);
+				   size_t size);
 extern int qcom_scm_pas_mem_setup(u32 peripheral, phys_addr_t addr,
-		phys_addr_t size);
+				  phys_addr_t size);
 extern int qcom_scm_pas_auth_and_reset(u32 peripheral);
 extern int qcom_scm_pas_shutdown(u32 peripheral);
-
-#define QCOM_SCM_CPU_PWR_DOWN_L2_ON	0x0
-#define QCOM_SCM_CPU_PWR_DOWN_L2_OFF	0x1
-
 extern void qcom_scm_cpu_power_down(u32 flags);
-
-#define QCOM_SCM_VERSION(major, minor) (((major) << 16) | ((minor) & 0xFF))
-
 extern u32 qcom_scm_get_version(void);
+#else
+static inline
+int qcom_scm_set_cold_boot_addr(void *entry, const cpumask_t *cpus)
+{
+	return -ENODEV;
+}
+static inline
+int qcom_scm_set_warm_boot_addr(void *entry, const cpumask_t *cpus)
+{
+	return -ENODEV;
+}
+static inline bool qcom_scm_is_available(void) { return false; }
+static inline bool qcom_scm_hdcp_available(void) { return false; }
+static inline int qcom_scm_hdcp_req(struct qcom_scm_hdcp_req *req, u32 req_cnt,
+				    u32 *resp) { return -ENODEV; }
+static inline bool qcom_scm_pas_supported(u32 peripheral) { return false; }
+static inline int qcom_scm_pas_init_image(u32 peripheral, const void *metadata,
+					  size_t size) { return -ENODEV; }
+static inline int qcom_scm_pas_mem_setup(u32 peripheral, phys_addr_t addr,
+					 phys_addr_t size) { return -ENODEV; }
+static inline int
+qcom_scm_pas_auth_and_reset(u32 peripheral) { return -ENODEV; }
+static inline int qcom_scm_pas_shutdown(u32 peripheral) { return -ENODEV; }
+static inline void qcom_scm_cpu_power_down(u32 flags) {}
+static inline u32 qcom_scm_get_version(void) { return 0; }
+#endif
 
 #endif
-- 
cgit v1.2.3


From a811b420b6c13759540070c0e9541b7cd8569168 Mon Sep 17 00:00:00 2001
From: Andy Gross <andy.gross@linaro.org>
Date: Mon, 16 Jan 2017 23:24:15 -0600
Subject: firmware: qcom_scm: Add set remote state API

This patch adds a set remote state SCM API.  This will be used by the
Venus and GPU subsystems to set state on the remote processors.

This work was based on two patch sets by Jordan Crouse and Stanimir
Varbanov.

Signed-off-by: Andy Gross <andy.gross@linaro.org>
---
 drivers/firmware/qcom_scm-32.c | 18 ++++++++++++++++++
 drivers/firmware/qcom_scm-64.c | 16 ++++++++++++++++
 drivers/firmware/qcom_scm.c    |  6 ++++++
 drivers/firmware/qcom_scm.h    |  2 ++
 include/linux/qcom_scm.h       |  4 +++-
 5 files changed, 45 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/qcom_scm-32.c b/drivers/firmware/qcom_scm-32.c
index c6aeedbdcbb0..8ad226c60374 100644
--- a/drivers/firmware/qcom_scm-32.c
+++ b/drivers/firmware/qcom_scm-32.c
@@ -560,3 +560,21 @@ int __qcom_scm_pas_mss_reset(struct device *dev, bool reset)
 
 	return ret ? : le32_to_cpu(out);
 }
+
+int __qcom_scm_set_remote_state(struct device *dev, u32 state, u32 id)
+{
+	struct {
+		__le32 state;
+		__le32 id;
+	} req;
+	__le32 scm_ret = 0;
+	int ret;
+
+	req.state = cpu_to_le32(state);
+	req.id = cpu_to_le32(id);
+
+	ret = qcom_scm_call(dev, QCOM_SCM_SVC_BOOT, QCOM_SCM_SET_REMOTE_STATE,
+			    &req, sizeof(req), &scm_ret, sizeof(scm_ret));
+
+	return ret ? : le32_to_cpu(scm_ret);
+}
diff --git a/drivers/firmware/qcom_scm-64.c b/drivers/firmware/qcom_scm-64.c
index 4a0f5ead4fb5..4b220abaf363 100644
--- a/drivers/firmware/qcom_scm-64.c
+++ b/drivers/firmware/qcom_scm-64.c
@@ -358,3 +358,19 @@ int __qcom_scm_pas_mss_reset(struct device *dev, bool reset)
 
 	return ret ? : res.a1;
 }
+
+int __qcom_scm_set_remote_state(struct device *dev, u32 state, u32 id)
+{
+	struct qcom_scm_desc desc = {0};
+	struct arm_smccc_res res;
+	int ret;
+
+	desc.args[0] = state;
+	desc.args[1] = id;
+	desc.arginfo = QCOM_SCM_ARGS(2);
+
+	ret = qcom_scm_call(dev, QCOM_SCM_SVC_BOOT, QCOM_SCM_SET_REMOTE_STATE,
+			    &desc, &res);
+
+	return ret ? : res.a1;
+}
diff --git a/drivers/firmware/qcom_scm.c b/drivers/firmware/qcom_scm.c
index 65d0d9d64ebb..d987bcc7489d 100644
--- a/drivers/firmware/qcom_scm.c
+++ b/drivers/firmware/qcom_scm.c
@@ -324,6 +324,12 @@ bool qcom_scm_is_available(void)
 }
 EXPORT_SYMBOL(qcom_scm_is_available);
 
+int qcom_scm_set_remote_state(u32 state, u32 id)
+{
+	return __qcom_scm_set_remote_state(__scm->dev, state, id);
+}
+EXPORT_SYMBOL(qcom_scm_set_remote_state);
+
 static int qcom_scm_probe(struct platform_device *pdev)
 {
 	struct qcom_scm *scm;
diff --git a/drivers/firmware/qcom_scm.h b/drivers/firmware/qcom_scm.h
index 3584b00fe7e6..6a0f15469344 100644
--- a/drivers/firmware/qcom_scm.h
+++ b/drivers/firmware/qcom_scm.h
@@ -15,6 +15,8 @@
 #define QCOM_SCM_SVC_BOOT		0x1
 #define QCOM_SCM_BOOT_ADDR		0x1
 #define QCOM_SCM_BOOT_ADDR_MC		0x11
+#define QCOM_SCM_SET_REMOTE_STATE	0xa
+extern int __qcom_scm_set_remote_state(struct device *dev, u32 state, u32 id);
 
 #define QCOM_SCM_FLAG_HLOS		0x01
 #define QCOM_SCM_FLAG_COLDBOOT_MC	0x02
diff --git a/include/linux/qcom_scm.h b/include/linux/qcom_scm.h
index 7e004fb57fc4..d32f6f1a5225 100644
--- a/include/linux/qcom_scm.h
+++ b/include/linux/qcom_scm.h
@@ -39,6 +39,7 @@ extern int qcom_scm_pas_auth_and_reset(u32 peripheral);
 extern int qcom_scm_pas_shutdown(u32 peripheral);
 extern void qcom_scm_cpu_power_down(u32 flags);
 extern u32 qcom_scm_get_version(void);
+extern int qcom_scm_set_remote_state(u32 state, u32 id);
 #else
 static inline
 int qcom_scm_set_cold_boot_addr(void *entry, const cpumask_t *cpus)
@@ -64,6 +65,7 @@ qcom_scm_pas_auth_and_reset(u32 peripheral) { return -ENODEV; }
 static inline int qcom_scm_pas_shutdown(u32 peripheral) { return -ENODEV; }
 static inline void qcom_scm_cpu_power_down(u32 flags) {}
 static inline u32 qcom_scm_get_version(void) { return 0; }
+static inline u32
+qcom_scm_set_remote_state(u32 state,u32 id) { return -ENODEV; }
 #endif
-
 #endif
-- 
cgit v1.2.3


From 5fa23530d4fcc7e84be9a557c58d0e670a15c042 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Mon, 16 Jan 2017 10:40:43 +0000
Subject: of: base: add support to find the level of the last cache

It is useful to have helper function just to get the number of cache
levels for a given logical cpu. We can obtain the same by just checking
the level at which the last cache is present. This patch adds support
to find the level of the last cache for a given cpu.

It will be used on ARM64 platform where the device tree provides the
information for the additional non-architected/transparent/external
last level caches that are not integrated with the processors.

Cc: Mark Rutland <mark.rutland@arm.com>
Suggested-by: Rob Herring <robh+dt@kernel.org>
Acked-by: Rob Herring <robh+dt@kernel.org>
Tested-by: Tan Xiaojun <tanxiaojun@huawei.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
[will: use u32 instead of int for cache_level]
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/of/base.c  | 26 ++++++++++++++++++++++++++
 include/linux/of.h |  1 +
 2 files changed, 27 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/of/base.c b/drivers/of/base.c
index d4bea3c797d6..a641b1faf057 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -25,6 +25,7 @@
 #include <linux/cpu.h>
 #include <linux/module.h>
 #include <linux/of.h>
+#include <linux/of_device.h>
 #include <linux/of_graph.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
@@ -2267,6 +2268,31 @@ struct device_node *of_find_next_cache_node(const struct device_node *np)
 	return NULL;
 }
 
+/**
+ * of_find_last_cache_level - Find the level at which the last cache is
+ * 		present for the given logical cpu
+ *
+ * @cpu: cpu number(logical index) for which the last cache level is needed
+ *
+ * Returns the the level at which the last cache is present. It is exactly
+ * same as  the total number of cache levels for the given logical cpu.
+ */
+int of_find_last_cache_level(unsigned int cpu)
+{
+	u32 cache_level = 0;
+	struct device_node *prev = NULL, *np = of_cpu_device_node_get(cpu);
+
+	while (np) {
+		prev = np;
+		of_node_put(np);
+		np = of_find_next_cache_node(np);
+	}
+
+	of_property_read_u32(prev, "cache-level", &cache_level);
+
+	return cache_level;
+}
+
 /**
  * of_graph_parse_endpoint() - parse common endpoint node properties
  * @node: pointer to endpoint device_node
diff --git a/include/linux/of.h b/include/linux/of.h
index d72f01009297..21e6323de0f3 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -280,6 +280,7 @@ extern struct device_node *of_get_child_by_name(const struct device_node *node,
 
 /* cache lookup */
 extern struct device_node *of_find_next_cache_node(const struct device_node *);
+extern int of_find_last_cache_level(unsigned int cpu);
 extern struct device_node *of_find_node_with_property(
 	struct device_node *from, const char *prop_name);
 
-- 
cgit v1.2.3


From c51ca6cf545bc51ad38bd50816bde37c647d608d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Sat, 10 Dec 2016 15:13:59 -0700
Subject: block: move existing elevator ops to union

Prep patch for adding MQ ops as well, since doing anon unions with
named initializers doesn't work on older compilers.

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
---
 block/blk-ioc.c          |  8 +++----
 block/blk-merge.c        |  4 ++--
 block/blk.h              | 10 ++++----
 block/cfq-iosched.c      |  2 +-
 block/deadline-iosched.c |  2 +-
 block/elevator.c         | 60 ++++++++++++++++++++++++------------------------
 block/noop-iosched.c     |  2 +-
 include/linux/elevator.h |  4 +++-
 8 files changed, 47 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 381cb50a673c..ab372092a57d 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -43,8 +43,8 @@ static void ioc_exit_icq(struct io_cq *icq)
 	if (icq->flags & ICQ_EXITED)
 		return;
 
-	if (et->ops.elevator_exit_icq_fn)
-		et->ops.elevator_exit_icq_fn(icq);
+	if (et->ops.sq.elevator_exit_icq_fn)
+		et->ops.sq.elevator_exit_icq_fn(icq);
 
 	icq->flags |= ICQ_EXITED;
 }
@@ -383,8 +383,8 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
 	if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
 		hlist_add_head(&icq->ioc_node, &ioc->icq_list);
 		list_add(&icq->q_node, &q->icq_list);
-		if (et->ops.elevator_init_icq_fn)
-			et->ops.elevator_init_icq_fn(icq);
+		if (et->ops.sq.elevator_init_icq_fn)
+			et->ops.sq.elevator_init_icq_fn(icq);
 	} else {
 		kmem_cache_free(et->icq_cache, icq);
 		icq = ioc_lookup_icq(ioc, q);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 182398cb1524..480570b691dc 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -763,8 +763,8 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.elevator_allow_rq_merge_fn)
-		if (!e->type->ops.elevator_allow_rq_merge_fn(q, rq, next))
+	if (e->type->ops.sq.elevator_allow_rq_merge_fn)
+		if (!e->type->ops.sq.elevator_allow_rq_merge_fn(q, rq, next))
 			return 0;
 
 	return attempt_merge(q, rq, next);
diff --git a/block/blk.h b/block/blk.h
index 041185e5f129..f46c0ac8ae3d 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -167,7 +167,7 @@ static inline struct request *__elv_next_request(struct request_queue *q)
 			return NULL;
 		}
 		if (unlikely(blk_queue_bypass(q)) ||
-		    !q->elevator->type->ops.elevator_dispatch_fn(q, 0))
+		    !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0))
 			return NULL;
 	}
 }
@@ -176,16 +176,16 @@ static inline void elv_activate_rq(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.elevator_activate_req_fn)
-		e->type->ops.elevator_activate_req_fn(q, rq);
+	if (e->type->ops.sq.elevator_activate_req_fn)
+		e->type->ops.sq.elevator_activate_req_fn(q, rq);
 }
 
 static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.elevator_deactivate_req_fn)
-		e->type->ops.elevator_deactivate_req_fn(q, rq);
+	if (e->type->ops.sq.elevator_deactivate_req_fn)
+		e->type->ops.sq.elevator_deactivate_req_fn(q, rq);
 }
 
 #ifdef CONFIG_FAIL_IO_TIMEOUT
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index c73a6fcaeb9d..37aeb20fa454 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -4837,7 +4837,7 @@ static struct elv_fs_entry cfq_attrs[] = {
 };
 
 static struct elevator_type iosched_cfq = {
-	.ops = {
+	.ops.sq = {
 		.elevator_merge_fn = 		cfq_merge,
 		.elevator_merged_fn =		cfq_merged_request,
 		.elevator_merge_req_fn =	cfq_merged_requests,
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 55e0bb6d7da7..05fc0ea25a98 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -439,7 +439,7 @@ static struct elv_fs_entry deadline_attrs[] = {
 };
 
 static struct elevator_type iosched_deadline = {
-	.ops = {
+	.ops.sq = {
 		.elevator_merge_fn = 		deadline_merge,
 		.elevator_merged_fn =		deadline_merged_request,
 		.elevator_merge_req_fn =	deadline_merged_requests,
diff --git a/block/elevator.c b/block/elevator.c
index 40f0c04e5ad3..022a26830297 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -58,8 +58,8 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
 	struct request_queue *q = rq->q;
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.elevator_allow_bio_merge_fn)
-		return e->type->ops.elevator_allow_bio_merge_fn(q, rq, bio);
+	if (e->type->ops.sq.elevator_allow_bio_merge_fn)
+		return e->type->ops.sq.elevator_allow_bio_merge_fn(q, rq, bio);
 
 	return 1;
 }
@@ -224,7 +224,7 @@ int elevator_init(struct request_queue *q, char *name)
 		}
 	}
 
-	err = e->ops.elevator_init_fn(q, e);
+	err = e->ops.sq.elevator_init_fn(q, e);
 	if (err)
 		elevator_put(e);
 	return err;
@@ -234,8 +234,8 @@ EXPORT_SYMBOL(elevator_init);
 void elevator_exit(struct elevator_queue *e)
 {
 	mutex_lock(&e->sysfs_lock);
-	if (e->type->ops.elevator_exit_fn)
-		e->type->ops.elevator_exit_fn(e);
+	if (e->type->ops.sq.elevator_exit_fn)
+		e->type->ops.sq.elevator_exit_fn(e);
 	mutex_unlock(&e->sysfs_lock);
 
 	kobject_put(&e->kobj);
@@ -443,8 +443,8 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
 		return ELEVATOR_BACK_MERGE;
 	}
 
-	if (e->type->ops.elevator_merge_fn)
-		return e->type->ops.elevator_merge_fn(q, req, bio);
+	if (e->type->ops.sq.elevator_merge_fn)
+		return e->type->ops.sq.elevator_merge_fn(q, req, bio);
 
 	return ELEVATOR_NO_MERGE;
 }
@@ -495,8 +495,8 @@ void elv_merged_request(struct request_queue *q, struct request *rq, int type)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.elevator_merged_fn)
-		e->type->ops.elevator_merged_fn(q, rq, type);
+	if (e->type->ops.sq.elevator_merged_fn)
+		e->type->ops.sq.elevator_merged_fn(q, rq, type);
 
 	if (type == ELEVATOR_BACK_MERGE)
 		elv_rqhash_reposition(q, rq);
@@ -510,8 +510,8 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
 	struct elevator_queue *e = q->elevator;
 	const int next_sorted = next->rq_flags & RQF_SORTED;
 
-	if (next_sorted && e->type->ops.elevator_merge_req_fn)
-		e->type->ops.elevator_merge_req_fn(q, rq, next);
+	if (next_sorted && e->type->ops.sq.elevator_merge_req_fn)
+		e->type->ops.sq.elevator_merge_req_fn(q, rq, next);
 
 	elv_rqhash_reposition(q, rq);
 
@@ -528,8 +528,8 @@ void elv_bio_merged(struct request_queue *q, struct request *rq,
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.elevator_bio_merged_fn)
-		e->type->ops.elevator_bio_merged_fn(q, rq, bio);
+	if (e->type->ops.sq.elevator_bio_merged_fn)
+		e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio);
 }
 
 #ifdef CONFIG_PM
@@ -578,7 +578,7 @@ void elv_drain_elevator(struct request_queue *q)
 
 	lockdep_assert_held(q->queue_lock);
 
-	while (q->elevator->type->ops.elevator_dispatch_fn(q, 1))
+	while (q->elevator->type->ops.sq.elevator_dispatch_fn(q, 1))
 		;
 	if (q->nr_sorted && printed++ < 10) {
 		printk(KERN_ERR "%s: forced dispatching is broken "
@@ -653,7 +653,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
 		 * rq cannot be accessed after calling
 		 * elevator_add_req_fn.
 		 */
-		q->elevator->type->ops.elevator_add_req_fn(q, rq);
+		q->elevator->type->ops.sq.elevator_add_req_fn(q, rq);
 		break;
 
 	case ELEVATOR_INSERT_FLUSH:
@@ -682,8 +682,8 @@ struct request *elv_latter_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.elevator_latter_req_fn)
-		return e->type->ops.elevator_latter_req_fn(q, rq);
+	if (e->type->ops.sq.elevator_latter_req_fn)
+		return e->type->ops.sq.elevator_latter_req_fn(q, rq);
 	return NULL;
 }
 
@@ -691,8 +691,8 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.elevator_former_req_fn)
-		return e->type->ops.elevator_former_req_fn(q, rq);
+	if (e->type->ops.sq.elevator_former_req_fn)
+		return e->type->ops.sq.elevator_former_req_fn(q, rq);
 	return NULL;
 }
 
@@ -701,8 +701,8 @@ int elv_set_request(struct request_queue *q, struct request *rq,
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.elevator_set_req_fn)
-		return e->type->ops.elevator_set_req_fn(q, rq, bio, gfp_mask);
+	if (e->type->ops.sq.elevator_set_req_fn)
+		return e->type->ops.sq.elevator_set_req_fn(q, rq, bio, gfp_mask);
 	return 0;
 }
 
@@ -710,16 +710,16 @@ void elv_put_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.elevator_put_req_fn)
-		e->type->ops.elevator_put_req_fn(rq);
+	if (e->type->ops.sq.elevator_put_req_fn)
+		e->type->ops.sq.elevator_put_req_fn(rq);
 }
 
 int elv_may_queue(struct request_queue *q, unsigned int op)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.elevator_may_queue_fn)
-		return e->type->ops.elevator_may_queue_fn(q, op);
+	if (e->type->ops.sq.elevator_may_queue_fn)
+		return e->type->ops.sq.elevator_may_queue_fn(q, op);
 
 	return ELV_MQUEUE_MAY;
 }
@@ -734,8 +734,8 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
 	if (blk_account_rq(rq)) {
 		q->in_flight[rq_is_sync(rq)]--;
 		if ((rq->rq_flags & RQF_SORTED) &&
-		    e->type->ops.elevator_completed_req_fn)
-			e->type->ops.elevator_completed_req_fn(q, rq);
+		    e->type->ops.sq.elevator_completed_req_fn)
+			e->type->ops.sq.elevator_completed_req_fn(q, rq);
 	}
 }
 
@@ -803,8 +803,8 @@ int elv_register_queue(struct request_queue *q)
 		}
 		kobject_uevent(&e->kobj, KOBJ_ADD);
 		e->registered = 1;
-		if (e->type->ops.elevator_registered_fn)
-			e->type->ops.elevator_registered_fn(q);
+		if (e->type->ops.sq.elevator_registered_fn)
+			e->type->ops.sq.elevator_registered_fn(q);
 	}
 	return error;
 }
@@ -912,7 +912,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 	spin_unlock_irq(q->queue_lock);
 
 	/* allocate, init and register new elevator */
-	err = new_e->ops.elevator_init_fn(q, new_e);
+	err = new_e->ops.sq.elevator_init_fn(q, new_e);
 	if (err)
 		goto fail_init;
 
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index a163c487cf38..2d1b15d89b45 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -92,7 +92,7 @@ static void noop_exit_queue(struct elevator_queue *e)
 }
 
 static struct elevator_type elevator_noop = {
-	.ops = {
+	.ops.sq = {
 		.elevator_merge_req_fn		= noop_merged_requests,
 		.elevator_dispatch_fn		= noop_dispatch,
 		.elevator_add_req_fn		= noop_add_request,
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index b276e9ef0e0b..2a9e966eed03 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -94,7 +94,9 @@ struct elevator_type
 	struct kmem_cache *icq_cache;
 
 	/* fields provided by elevator implementation */
-	struct elevator_ops ops;
+	union {
+		struct elevator_ops sq;
+	} ops;
 	size_t icq_size;	/* see iocontext.h */
 	size_t icq_align;	/* ditto */
 	struct elv_fs_entry *elevator_attrs;
-- 
cgit v1.2.3


From 16a3c2a70cad5ccdc2dc0a4544bff82554807493 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Thu, 15 Dec 2016 14:27:46 -0700
Subject: blk-mq: un-export blk_mq_free_hctx_request()

It's only used in blk-mq, kill it from the main exported header
and kill the symbol export as well.

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Omar Sandoval <osandov@fb.com>
---
 block/blk-mq.c         | 5 ++---
 include/linux/blk-mq.h | 1 -
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 79e1cb0f7b15..f49f6325b332 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -337,15 +337,14 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 	blk_queue_exit(q);
 }
 
-void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
+static void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx,
+				     struct request *rq)
 {
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
 
 	ctx->rq_completed[rq_is_sync(rq)]++;
 	__blk_mq_free_request(hctx, ctx, rq);
-
 }
-EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);
 
 void blk_mq_free_request(struct request *rq)
 {
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index afc81d77e471..2686f9e7302a 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -181,7 +181,6 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 
 void blk_mq_insert_request(struct request *, bool, bool, bool);
 void blk_mq_free_request(struct request *rq);
-void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 
 enum {
-- 
cgit v1.2.3


From fd2d332677c687ca90c12a47d6c377c547100b56 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Thu, 12 Jan 2017 10:04:45 -0700
Subject: blk-mq: add support for carrying internal tag information in blk_qc_t

No functional change in this patch, just in preparation for having
two types of tags available to the block layer for a single request.

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
---
 block/blk-mq.c            | 11 ++++++++---
 include/linux/blk_types.h | 22 +++++++++++++++++-----
 2 files changed, 25 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index fcdeadc55753..d40be641f3d5 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1308,6 +1308,11 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 	return rq;
 }
 
+static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+	return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false);
+}
+
 static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
 {
 	int ret;
@@ -1318,7 +1323,7 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
 		.list = NULL,
 		.last = 1
 	};
-	blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num);
+	blk_qc_t new_cookie = request_to_qc_t(hctx, rq);
 
 	if (blk_mq_hctx_stopped(hctx))
 		goto insert;
@@ -1387,7 +1392,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
 	wbt_track(&rq->issue_stat, wb_acct);
 
-	cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
+	cookie = request_to_qc_t(data.hctx, rq);
 
 	if (unlikely(is_flush_fua)) {
 		blk_mq_bio_to_request(rq, bio);
@@ -1496,7 +1501,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 
 	wbt_track(&rq->issue_stat, wb_acct);
 
-	cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
+	cookie = request_to_qc_t(data.hctx, rq);
 
 	if (unlikely(is_flush_fua)) {
 		blk_mq_bio_to_request(rq, bio);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 519ea2c9df61..0e5b1cd5113c 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -232,22 +232,29 @@ static inline bool op_is_sync(unsigned int op)
 }
 
 typedef unsigned int blk_qc_t;
-#define BLK_QC_T_NONE	-1U
-#define BLK_QC_T_SHIFT	16
+#define BLK_QC_T_NONE		-1U
+#define BLK_QC_T_SHIFT		16
+#define BLK_QC_T_INTERNAL	(1U << 31)
 
 static inline bool blk_qc_t_valid(blk_qc_t cookie)
 {
 	return cookie != BLK_QC_T_NONE;
 }
 
-static inline blk_qc_t blk_tag_to_qc_t(unsigned int tag, unsigned int queue_num)
+static inline blk_qc_t blk_tag_to_qc_t(unsigned int tag, unsigned int queue_num,
+				       bool internal)
 {
-	return tag | (queue_num << BLK_QC_T_SHIFT);
+	blk_qc_t ret = tag | (queue_num << BLK_QC_T_SHIFT);
+
+	if (internal)
+		ret |= BLK_QC_T_INTERNAL;
+
+	return ret;
 }
 
 static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie)
 {
-	return cookie >> BLK_QC_T_SHIFT;
+	return (cookie & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT;
 }
 
 static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie)
@@ -255,6 +262,11 @@ static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie)
 	return cookie & ((1u << BLK_QC_T_SHIFT) - 1);
 }
 
+static inline bool blk_qc_t_is_internal(blk_qc_t cookie)
+{
+	return (cookie & BLK_QC_T_INTERNAL) != 0;
+}
+
 struct blk_issue_stat {
 	u64 time;
 };
-- 
cgit v1.2.3


From bd166ef183c263c5ced656d49ef19c7da4adc774 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 17 Jan 2017 06:03:22 -0700
Subject: blk-mq-sched: add framework for MQ capable IO schedulers

This adds a set of hooks that intercepts the blk-mq path of
allocating/inserting/issuing/completing requests, allowing
us to develop a scheduler within that framework.

We reuse the existing elevator scheduler API on the registration
side, but augment that with the scheduler flagging support for
the blk-mq interfce, and with a separate set of ops hooks for MQ
devices.

We split driver and scheduler tags, so we can run the scheduling
independently of device queue depth.

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
---
 block/Makefile           |   2 +-
 block/blk-cgroup.c       |  24 +++-
 block/blk-core.c         |   4 +-
 block/blk-exec.c         |   3 +-
 block/blk-flush.c        |  12 +-
 block/blk-ioc.c          |   8 +-
 block/blk-merge.c        |   2 +-
 block/blk-mq-sched.c     | 368 +++++++++++++++++++++++++++++++++++++++++++++++
 block/blk-mq-sched.h     | 170 ++++++++++++++++++++++
 block/blk-mq-sysfs.c     |  13 ++
 block/blk-mq.c           | 318 +++++++++++++++++++++++-----------------
 block/blk-mq.h           |   8 +-
 block/blk-tag.c          |   1 +
 block/elevator.c         | 204 ++++++++++++++++++++------
 include/linux/blk-mq.h   |   5 +-
 include/linux/blkdev.h   |   4 +-
 include/linux/elevator.h |  32 +++++
 17 files changed, 984 insertions(+), 194 deletions(-)
 create mode 100644 block/blk-mq-sched.c
 create mode 100644 block/blk-mq-sched.h

(limited to 'include/linux')

diff --git a/block/Makefile b/block/Makefile
index a827f988c4e6..2eee9e1bb6db 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
 			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
 			blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
-			blk-mq-sysfs.o blk-mq-cpumap.o ioctl.o \
+			blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
 			genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
 			badblocks.o partitions/
 
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 8ba0af780e88..2630f64bed19 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1223,7 +1223,11 @@ int blkcg_activate_policy(struct request_queue *q,
 	if (blkcg_policy_enabled(q, pol))
 		return 0;
 
-	blk_queue_bypass_start(q);
+	if (q->mq_ops) {
+		blk_mq_freeze_queue(q);
+		blk_mq_quiesce_queue(q);
+	} else
+		blk_queue_bypass_start(q);
 pd_prealloc:
 	if (!pd_prealloc) {
 		pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
@@ -1261,7 +1265,10 @@ pd_prealloc:
 
 	spin_unlock_irq(q->queue_lock);
 out_bypass_end:
-	blk_queue_bypass_end(q);
+	if (q->mq_ops)
+		blk_mq_unfreeze_queue(q);
+	else
+		blk_queue_bypass_end(q);
 	if (pd_prealloc)
 		pol->pd_free_fn(pd_prealloc);
 	return ret;
@@ -1284,7 +1291,12 @@ void blkcg_deactivate_policy(struct request_queue *q,
 	if (!blkcg_policy_enabled(q, pol))
 		return;
 
-	blk_queue_bypass_start(q);
+	if (q->mq_ops) {
+		blk_mq_freeze_queue(q);
+		blk_mq_quiesce_queue(q);
+	} else
+		blk_queue_bypass_start(q);
+
 	spin_lock_irq(q->queue_lock);
 
 	__clear_bit(pol->plid, q->blkcg_pols);
@@ -1304,7 +1316,11 @@ void blkcg_deactivate_policy(struct request_queue *q,
 	}
 
 	spin_unlock_irq(q->queue_lock);
-	blk_queue_bypass_end(q);
+
+	if (q->mq_ops)
+		blk_mq_unfreeze_queue(q);
+	else
+		blk_queue_bypass_end(q);
 }
 EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
 
diff --git a/block/blk-core.c b/block/blk-core.c
index 92baea07acbc..a61f1407f4f6 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -39,6 +39,7 @@
 
 #include "blk.h"
 #include "blk-mq.h"
+#include "blk-mq-sched.h"
 #include "blk-wbt.h"
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
@@ -134,6 +135,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	rq->cmd = rq->__cmd;
 	rq->cmd_len = BLK_MAX_CDB;
 	rq->tag = -1;
+	rq->internal_tag = -1;
 	rq->start_time = jiffies;
 	set_start_time_ns(rq);
 	rq->part = NULL;
@@ -2127,7 +2129,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 	if (q->mq_ops) {
 		if (blk_queue_io_stat(q))
 			blk_account_io_start(rq, true);
-		blk_mq_insert_request(rq, false, true, false);
+		blk_mq_sched_insert_request(rq, false, true, false);
 		return 0;
 	}
 
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 3ecb00a6cf45..86656fdfa637 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -9,6 +9,7 @@
 #include <linux/sched/sysctl.h>
 
 #include "blk.h"
+#include "blk-mq-sched.h"
 
 /*
  * for max sense size
@@ -65,7 +66,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 	 * be reused after dying flag is set
 	 */
 	if (q->mq_ops) {
-		blk_mq_insert_request(rq, at_head, true, false);
+		blk_mq_sched_insert_request(rq, at_head, true, false);
 		return;
 	}
 
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 20b7c7a02f1c..d7de34ee39c2 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -74,6 +74,7 @@
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
+#include "blk-mq-sched.h"
 
 /* FLUSH/FUA sequences */
 enum {
@@ -391,9 +392,10 @@ static void mq_flush_data_end_io(struct request *rq, int error)
 	 * the comment in flush_end_io().
 	 */
 	spin_lock_irqsave(&fq->mq_flush_lock, flags);
-	if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error))
-		blk_mq_run_hw_queue(hctx, true);
+	blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error);
 	spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
+
+	blk_mq_run_hw_queue(hctx, true);
 }
 
 /**
@@ -453,9 +455,9 @@ void blk_insert_flush(struct request *rq)
 	 */
 	if ((policy & REQ_FSEQ_DATA) &&
 	    !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
-		if (q->mq_ops) {
-			blk_mq_insert_request(rq, false, true, false);
-		} else
+		if (q->mq_ops)
+			blk_mq_sched_insert_request(rq, false, true, false);
+		else
 			list_add_tail(&rq->queuelist, &q->queue_head);
 		return;
 	}
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index ab372092a57d..fe186a9eade9 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -43,7 +43,9 @@ static void ioc_exit_icq(struct io_cq *icq)
 	if (icq->flags & ICQ_EXITED)
 		return;
 
-	if (et->ops.sq.elevator_exit_icq_fn)
+	if (et->uses_mq && et->ops.mq.exit_icq)
+		et->ops.mq.exit_icq(icq);
+	else if (!et->uses_mq && et->ops.sq.elevator_exit_icq_fn)
 		et->ops.sq.elevator_exit_icq_fn(icq);
 
 	icq->flags |= ICQ_EXITED;
@@ -383,7 +385,9 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
 	if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
 		hlist_add_head(&icq->ioc_node, &ioc->icq_list);
 		list_add(&icq->q_node, &q->icq_list);
-		if (et->ops.sq.elevator_init_icq_fn)
+		if (et->uses_mq && et->ops.mq.init_icq)
+			et->ops.mq.init_icq(icq);
+		else if (!et->uses_mq && et->ops.sq.elevator_init_icq_fn)
 			et->ops.sq.elevator_init_icq_fn(icq);
 	} else {
 		kmem_cache_free(et->icq_cache, icq);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 480570b691dc..6aa43dec5af4 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -763,7 +763,7 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.sq.elevator_allow_rq_merge_fn)
+	if (!e->uses_mq && e->type->ops.sq.elevator_allow_rq_merge_fn)
 		if (!e->type->ops.sq.elevator_allow_rq_merge_fn(q, rq, next))
 			return 0;
 
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
new file mode 100644
index 000000000000..26759798a0b3
--- /dev/null
+++ b/block/blk-mq-sched.c
@@ -0,0 +1,368 @@
+/*
+ * blk-mq scheduling framework
+ *
+ * Copyright (C) 2016 Jens Axboe
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/blk-mq.h>
+
+#include <trace/events/block.h>
+
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-sched.h"
+#include "blk-mq-tag.h"
+#include "blk-wbt.h"
+
+void blk_mq_sched_free_hctx_data(struct request_queue *q,
+				 void (*exit)(struct blk_mq_hw_ctx *))
+{
+	struct blk_mq_hw_ctx *hctx;
+	int i;
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		if (exit && hctx->sched_data)
+			exit(hctx);
+		kfree(hctx->sched_data);
+		hctx->sched_data = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
+
+int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
+				int (*init)(struct blk_mq_hw_ctx *),
+				void (*exit)(struct blk_mq_hw_ctx *))
+{
+	struct blk_mq_hw_ctx *hctx;
+	int ret;
+	int i;
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node);
+		if (!hctx->sched_data) {
+			ret = -ENOMEM;
+			goto error;
+		}
+
+		if (init) {
+			ret = init(hctx);
+			if (ret) {
+				/*
+				 * We don't want to give exit() a partially
+				 * initialized sched_data. init() must clean up
+				 * if it fails.
+				 */
+				kfree(hctx->sched_data);
+				hctx->sched_data = NULL;
+				goto error;
+			}
+		}
+	}
+
+	return 0;
+error:
+	blk_mq_sched_free_hctx_data(q, exit);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_init_hctx_data);
+
+static void __blk_mq_sched_assign_ioc(struct request_queue *q,
+				      struct request *rq, struct io_context *ioc)
+{
+	struct io_cq *icq;
+
+	spin_lock_irq(q->queue_lock);
+	icq = ioc_lookup_icq(ioc, q);
+	spin_unlock_irq(q->queue_lock);
+
+	if (!icq) {
+		icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
+		if (!icq)
+			return;
+	}
+
+	rq->elv.icq = icq;
+	if (!blk_mq_sched_get_rq_priv(q, rq)) {
+		rq->rq_flags |= RQF_ELVPRIV;
+		get_io_context(icq->ioc);
+		return;
+	}
+
+	rq->elv.icq = NULL;
+}
+
+static void blk_mq_sched_assign_ioc(struct request_queue *q,
+				    struct request *rq, struct bio *bio)
+{
+	struct io_context *ioc;
+
+	ioc = rq_ioc(bio);
+	if (ioc)
+		__blk_mq_sched_assign_ioc(q, rq, ioc);
+}
+
+struct request *blk_mq_sched_get_request(struct request_queue *q,
+					 struct bio *bio,
+					 unsigned int op,
+					 struct blk_mq_alloc_data *data)
+{
+	struct elevator_queue *e = q->elevator;
+	struct blk_mq_hw_ctx *hctx;
+	struct blk_mq_ctx *ctx;
+	struct request *rq;
+	const bool is_flush = op & (REQ_PREFLUSH | REQ_FUA);
+
+	blk_queue_enter_live(q);
+	ctx = blk_mq_get_ctx(q);
+	hctx = blk_mq_map_queue(q, ctx->cpu);
+
+	blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
+
+	if (e) {
+		data->flags |= BLK_MQ_REQ_INTERNAL;
+
+		/*
+		 * Flush requests are special and go directly to the
+		 * dispatch list.
+		 */
+		if (!is_flush && e->type->ops.mq.get_request) {
+			rq = e->type->ops.mq.get_request(q, op, data);
+			if (rq)
+				rq->rq_flags |= RQF_QUEUED;
+		} else
+			rq = __blk_mq_alloc_request(data, op);
+	} else {
+		rq = __blk_mq_alloc_request(data, op);
+		data->hctx->tags->rqs[rq->tag] = rq;
+	}
+
+	if (rq) {
+		if (!is_flush) {
+			rq->elv.icq = NULL;
+			if (e && e->type->icq_cache)
+				blk_mq_sched_assign_ioc(q, rq, bio);
+		}
+		data->hctx->queued++;
+		return rq;
+	}
+
+	blk_queue_exit(q);
+	return NULL;
+}
+
+void blk_mq_sched_put_request(struct request *rq)
+{
+	struct request_queue *q = rq->q;
+	struct elevator_queue *e = q->elevator;
+
+	if (rq->rq_flags & RQF_ELVPRIV) {
+		blk_mq_sched_put_rq_priv(rq->q, rq);
+		if (rq->elv.icq) {
+			put_io_context(rq->elv.icq->ioc);
+			rq->elv.icq = NULL;
+		}
+	}
+
+	if ((rq->rq_flags & RQF_QUEUED) && e && e->type->ops.mq.put_request)
+		e->type->ops.mq.put_request(rq);
+	else
+		blk_mq_finish_request(rq);
+}
+
+void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
+{
+	struct elevator_queue *e = hctx->queue->elevator;
+	LIST_HEAD(rq_list);
+
+	if (unlikely(blk_mq_hctx_stopped(hctx)))
+		return;
+
+	hctx->run++;
+
+	/*
+	 * If we have previous entries on our dispatch list, grab them first for
+	 * more fair dispatch.
+	 */
+	if (!list_empty_careful(&hctx->dispatch)) {
+		spin_lock(&hctx->lock);
+		if (!list_empty(&hctx->dispatch))
+			list_splice_init(&hctx->dispatch, &rq_list);
+		spin_unlock(&hctx->lock);
+	}
+
+	/*
+	 * Only ask the scheduler for requests, if we didn't have residual
+	 * requests from the dispatch list. This is to avoid the case where
+	 * we only ever dispatch a fraction of the requests available because
+	 * of low device queue depth. Once we pull requests out of the IO
+	 * scheduler, we can no longer merge or sort them. So it's best to
+	 * leave them there for as long as we can. Mark the hw queue as
+	 * needing a restart in that case.
+	 */
+	if (list_empty(&rq_list)) {
+		if (e && e->type->ops.mq.dispatch_requests)
+			e->type->ops.mq.dispatch_requests(hctx, &rq_list);
+		else
+			blk_mq_flush_busy_ctxs(hctx, &rq_list);
+	} else
+		blk_mq_sched_mark_restart(hctx);
+
+	blk_mq_dispatch_rq_list(hctx, &rq_list);
+}
+
+void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
+				   struct list_head *rq_list,
+				   struct request *(*get_rq)(struct blk_mq_hw_ctx *))
+{
+	do {
+		struct request *rq;
+
+		rq = get_rq(hctx);
+		if (!rq)
+			break;
+
+		list_add_tail(&rq->queuelist, rq_list);
+	} while (1);
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_move_to_dispatch);
+
+bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio)
+{
+	struct request *rq;
+	int ret;
+
+	ret = elv_merge(q, &rq, bio);
+	if (ret == ELEVATOR_BACK_MERGE) {
+		if (!blk_mq_sched_allow_merge(q, rq, bio))
+			return false;
+		if (bio_attempt_back_merge(q, rq, bio)) {
+			if (!attempt_back_merge(q, rq))
+				elv_merged_request(q, rq, ret);
+			return true;
+		}
+	} else if (ret == ELEVATOR_FRONT_MERGE) {
+		if (!blk_mq_sched_allow_merge(q, rq, bio))
+			return false;
+		if (bio_attempt_front_merge(q, rq, bio)) {
+			if (!attempt_front_merge(q, rq))
+				elv_merged_request(q, rq, ret);
+			return true;
+		}
+	}
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
+
+bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
+{
+	struct elevator_queue *e = q->elevator;
+
+	if (e->type->ops.mq.bio_merge) {
+		struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
+		struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+
+		blk_mq_put_ctx(ctx);
+		return e->type->ops.mq.bio_merge(hctx, bio);
+	}
+
+	return false;
+}
+
+bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
+{
+	return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
+
+void blk_mq_sched_request_inserted(struct request *rq)
+{
+	trace_block_rq_insert(rq->q, rq);
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);
+
+bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+	if (rq->tag == -1) {
+		rq->rq_flags |= RQF_SORTED;
+		return false;
+	}
+
+	/*
+	 * If we already have a real request tag, send directly to
+	 * the dispatch list.
+	 */
+	spin_lock(&hctx->lock);
+	list_add(&rq->queuelist, &hctx->dispatch);
+	spin_unlock(&hctx->lock);
+	return true;
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_bypass_insert);
+
+static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
+				   struct blk_mq_hw_ctx *hctx,
+				   unsigned int hctx_idx)
+{
+	if (hctx->sched_tags) {
+		blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
+		blk_mq_free_rq_map(hctx->sched_tags);
+		hctx->sched_tags = NULL;
+	}
+}
+
+int blk_mq_sched_setup(struct request_queue *q)
+{
+	struct blk_mq_tag_set *set = q->tag_set;
+	struct blk_mq_hw_ctx *hctx;
+	int ret, i;
+
+	/*
+	 * Default to 256, since we don't split into sync/async like the
+	 * old code did. Additionally, this is a per-hw queue depth.
+	 */
+	q->nr_requests = 2 * BLKDEV_MAX_RQ;
+
+	/*
+	 * We're switching to using an IO scheduler, so setup the hctx
+	 * scheduler tags and switch the request map from the regular
+	 * tags to scheduler tags. First allocate what we need, so we
+	 * can safely fail and fallback, if needed.
+	 */
+	ret = 0;
+	queue_for_each_hw_ctx(q, hctx, i) {
+		hctx->sched_tags = blk_mq_alloc_rq_map(set, i, q->nr_requests, 0);
+		if (!hctx->sched_tags) {
+			ret = -ENOMEM;
+			break;
+		}
+		ret = blk_mq_alloc_rqs(set, hctx->sched_tags, i, q->nr_requests);
+		if (ret)
+			break;
+	}
+
+	/*
+	 * If we failed, free what we did allocate
+	 */
+	if (ret) {
+		queue_for_each_hw_ctx(q, hctx, i) {
+			if (!hctx->sched_tags)
+				continue;
+			blk_mq_sched_free_tags(set, hctx, i);
+		}
+
+		return ret;
+	}
+
+	return 0;
+}
+
+void blk_mq_sched_teardown(struct request_queue *q)
+{
+	struct blk_mq_tag_set *set = q->tag_set;
+	struct blk_mq_hw_ctx *hctx;
+	int i;
+
+	queue_for_each_hw_ctx(q, hctx, i)
+		blk_mq_sched_free_tags(set, hctx, i);
+}
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
new file mode 100644
index 000000000000..35c49e2e008a
--- /dev/null
+++ b/block/blk-mq-sched.h
@@ -0,0 +1,170 @@
+#ifndef BLK_MQ_SCHED_H
+#define BLK_MQ_SCHED_H
+
+#include "blk-mq.h"
+#include "blk-mq-tag.h"
+
+int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
+				int (*init)(struct blk_mq_hw_ctx *),
+				void (*exit)(struct blk_mq_hw_ctx *));
+
+void blk_mq_sched_free_hctx_data(struct request_queue *q,
+				 void (*exit)(struct blk_mq_hw_ctx *));
+
+struct request *blk_mq_sched_get_request(struct request_queue *q, struct bio *bio, unsigned int op, struct blk_mq_alloc_data *data);
+void blk_mq_sched_put_request(struct request *rq);
+
+void blk_mq_sched_request_inserted(struct request *rq);
+bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq);
+bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio);
+bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio);
+bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
+
+void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
+void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
+			struct list_head *rq_list,
+			struct request *(*get_rq)(struct blk_mq_hw_ctx *));
+
+int blk_mq_sched_setup(struct request_queue *q);
+void blk_mq_sched_teardown(struct request_queue *q);
+
+static inline bool
+blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
+{
+	struct elevator_queue *e = q->elevator;
+
+	if (!e || blk_queue_nomerges(q) || !bio_mergeable(bio))
+		return false;
+
+	return __blk_mq_sched_bio_merge(q, bio);
+}
+
+static inline int blk_mq_sched_get_rq_priv(struct request_queue *q,
+					   struct request *rq)
+{
+	struct elevator_queue *e = q->elevator;
+
+	if (e && e->type->ops.mq.get_rq_priv)
+		return e->type->ops.mq.get_rq_priv(q, rq);
+
+	return 0;
+}
+
+static inline void blk_mq_sched_put_rq_priv(struct request_queue *q,
+					    struct request *rq)
+{
+	struct elevator_queue *e = q->elevator;
+
+	if (e && e->type->ops.mq.put_rq_priv)
+		e->type->ops.mq.put_rq_priv(q, rq);
+}
+
+static inline void
+blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue,
+			    bool async)
+{
+	struct request_queue *q = rq->q;
+	struct elevator_queue *e = q->elevator;
+	struct blk_mq_ctx *ctx = rq->mq_ctx;
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+
+	if (e && e->type->ops.mq.insert_requests) {
+		LIST_HEAD(list);
+
+		list_add(&rq->queuelist, &list);
+		e->type->ops.mq.insert_requests(hctx, &list, at_head);
+	} else {
+		spin_lock(&ctx->lock);
+		__blk_mq_insert_request(hctx, rq, at_head);
+		spin_unlock(&ctx->lock);
+	}
+
+	if (run_queue)
+		blk_mq_run_hw_queue(hctx, async);
+}
+
+static inline void
+blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_ctx *ctx,
+			     struct list_head *list, bool run_queue_async)
+{
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+	struct elevator_queue *e = hctx->queue->elevator;
+
+	if (e && e->type->ops.mq.insert_requests)
+		e->type->ops.mq.insert_requests(hctx, list, false);
+	else
+		blk_mq_insert_requests(hctx, ctx, list);
+
+	blk_mq_run_hw_queue(hctx, run_queue_async);
+}
+
+static inline bool
+blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
+			 struct bio *bio)
+{
+	struct elevator_queue *e = q->elevator;
+
+	if (e && e->type->ops.mq.allow_merge)
+		return e->type->ops.mq.allow_merge(q, rq, bio);
+
+	return true;
+}
+
+static inline void
+blk_mq_sched_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+	struct elevator_queue *e = hctx->queue->elevator;
+
+	if (e && e->type->ops.mq.completed_request)
+		e->type->ops.mq.completed_request(hctx, rq);
+
+	BUG_ON(rq->internal_tag == -1);
+
+	blk_mq_put_tag(hctx, hctx->sched_tags, rq->mq_ctx, rq->internal_tag);
+
+	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
+		clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+		blk_mq_run_hw_queue(hctx, true);
+	}
+}
+
+static inline void blk_mq_sched_started_request(struct request *rq)
+{
+	struct request_queue *q = rq->q;
+	struct elevator_queue *e = q->elevator;
+
+	if (e && e->type->ops.mq.started_request)
+		e->type->ops.mq.started_request(rq);
+}
+
+static inline void blk_mq_sched_requeue_request(struct request *rq)
+{
+	struct request_queue *q = rq->q;
+	struct elevator_queue *e = q->elevator;
+
+	if (e && e->type->ops.mq.requeue_request)
+		e->type->ops.mq.requeue_request(rq);
+}
+
+static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
+{
+	struct elevator_queue *e = hctx->queue->elevator;
+
+	if (e && e->type->ops.mq.has_work)
+		return e->type->ops.mq.has_work(hctx);
+
+	return false;
+}
+
+static inline void blk_mq_sched_mark_restart(struct blk_mq_hw_ctx *hctx)
+{
+	if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+		set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+}
+
+static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
+{
+	return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+}
+
+#endif
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index eacd3af72099..2caecaa98e40 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -231,6 +231,14 @@ static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
+static ssize_t blk_mq_hw_sysfs_sched_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
+{
+	if (hctx->sched_tags)
+		return blk_mq_tag_sysfs_show(hctx->sched_tags, page);
+
+	return 0;
+}
+
 static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
 {
 	return blk_mq_tag_sysfs_show(hctx->tags, page);
@@ -345,6 +353,10 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = {
 	.attr = {.name = "pending", .mode = S_IRUGO },
 	.show = blk_mq_hw_sysfs_rq_list_show,
 };
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_sched_tags = {
+	.attr = {.name = "sched_tags", .mode = S_IRUGO },
+	.show = blk_mq_hw_sysfs_sched_tags_show,
+};
 static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = {
 	.attr = {.name = "tags", .mode = S_IRUGO },
 	.show = blk_mq_hw_sysfs_tags_show,
@@ -370,6 +382,7 @@ static struct attribute *default_hw_ctx_attrs[] = {
 	&blk_mq_hw_sysfs_dispatched.attr,
 	&blk_mq_hw_sysfs_pending.attr,
 	&blk_mq_hw_sysfs_tags.attr,
+	&blk_mq_hw_sysfs_sched_tags.attr,
 	&blk_mq_hw_sysfs_cpus.attr,
 	&blk_mq_hw_sysfs_active.attr,
 	&blk_mq_hw_sysfs_poll.attr,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 89b81254201b..45e1707a9f86 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -32,6 +32,7 @@
 #include "blk-mq-tag.h"
 #include "blk-stat.h"
 #include "blk-wbt.h"
+#include "blk-mq-sched.h"
 
 static DEFINE_MUTEX(all_q_mutex);
 static LIST_HEAD(all_q_list);
@@ -41,7 +42,9 @@ static LIST_HEAD(all_q_list);
  */
 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
 {
-	return sbitmap_any_bit_set(&hctx->ctx_map);
+	return sbitmap_any_bit_set(&hctx->ctx_map) ||
+			!list_empty_careful(&hctx->dispatch) ||
+			blk_mq_sched_has_work(hctx);
 }
 
 /*
@@ -223,15 +226,23 @@ struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
 
 	tag = blk_mq_get_tag(data);
 	if (tag != BLK_MQ_TAG_FAIL) {
-		rq = data->hctx->tags->static_rqs[tag];
+		struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+
+		rq = tags->static_rqs[tag];
 
 		if (blk_mq_tag_busy(data->hctx)) {
 			rq->rq_flags = RQF_MQ_INFLIGHT;
 			atomic_inc(&data->hctx->nr_active);
 		}
 
-		rq->tag = tag;
-		data->hctx->tags->rqs[tag] = rq;
+		if (data->flags & BLK_MQ_REQ_INTERNAL) {
+			rq->tag = -1;
+			rq->internal_tag = tag;
+		} else {
+			rq->tag = tag;
+			rq->internal_tag = -1;
+		}
+
 		blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
 		return rq;
 	}
@@ -243,26 +254,21 @@ EXPORT_SYMBOL_GPL(__blk_mq_alloc_request);
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
 		unsigned int flags)
 {
-	struct blk_mq_ctx *ctx;
-	struct blk_mq_hw_ctx *hctx;
-	struct request *rq;
 	struct blk_mq_alloc_data alloc_data;
+	struct request *rq;
 	int ret;
 
 	ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
 	if (ret)
 		return ERR_PTR(ret);
 
-	ctx = blk_mq_get_ctx(q);
-	hctx = blk_mq_map_queue(q, ctx->cpu);
-	blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
-	rq = __blk_mq_alloc_request(&alloc_data, rw);
-	blk_mq_put_ctx(ctx);
+	rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data);
 
-	if (!rq) {
-		blk_queue_exit(q);
+	blk_mq_put_ctx(alloc_data.ctx);
+	blk_queue_exit(q);
+
+	if (!rq)
 		return ERR_PTR(-EWOULDBLOCK);
-	}
 
 	rq->__data_len = 0;
 	rq->__sector = (sector_t) -1;
@@ -322,10 +328,10 @@ out_queue_exit:
 }
 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
 
-void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
-			   struct request *rq)
+void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+			     struct request *rq)
 {
-	const int tag = rq->tag;
+	const int sched_tag = rq->internal_tag;
 	struct request_queue *q = rq->q;
 
 	if (rq->rq_flags & RQF_MQ_INFLIGHT)
@@ -336,22 +342,30 @@ void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 
 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 	clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
-	blk_mq_put_tag(hctx, hctx->tags, ctx, tag);
+	if (rq->tag != -1)
+		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
+	if (sched_tag != -1)
+		blk_mq_sched_completed_request(hctx, rq);
 	blk_queue_exit(q);
 }
 
-static void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx,
+static void blk_mq_finish_hctx_request(struct blk_mq_hw_ctx *hctx,
 				     struct request *rq)
 {
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
 
 	ctx->rq_completed[rq_is_sync(rq)]++;
-	__blk_mq_free_request(hctx, ctx, rq);
+	__blk_mq_finish_request(hctx, ctx, rq);
+}
+
+void blk_mq_finish_request(struct request *rq)
+{
+	blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
 }
 
 void blk_mq_free_request(struct request *rq)
 {
-	blk_mq_free_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
+	blk_mq_sched_put_request(rq);
 }
 EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
@@ -469,6 +483,8 @@ void blk_mq_start_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 
+	blk_mq_sched_started_request(rq);
+
 	trace_block_rq_issue(q, rq);
 
 	rq->resid_len = blk_rq_bytes(rq);
@@ -517,6 +533,7 @@ static void __blk_mq_requeue_request(struct request *rq)
 
 	trace_block_rq_requeue(q, rq);
 	wbt_requeue(q->rq_wb, &rq->issue_stat);
+	blk_mq_sched_requeue_request(rq);
 
 	if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
 		if (q->dma_drain_size && blk_rq_bytes(rq))
@@ -551,13 +568,13 @@ static void blk_mq_requeue_work(struct work_struct *work)
 
 		rq->rq_flags &= ~RQF_SOFTBARRIER;
 		list_del_init(&rq->queuelist);
-		blk_mq_insert_request(rq, true, false, false);
+		blk_mq_sched_insert_request(rq, true, false, false);
 	}
 
 	while (!list_empty(&rq_list)) {
 		rq = list_entry(rq_list.next, struct request, queuelist);
 		list_del_init(&rq->queuelist);
-		blk_mq_insert_request(rq, false, false, false);
+		blk_mq_sched_insert_request(rq, false, false, false);
 	}
 
 	blk_mq_run_hw_queues(q, false);
@@ -765,6 +782,12 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
 			continue;
 
 		el_ret = blk_try_merge(rq, bio);
+		if (el_ret == ELEVATOR_NO_MERGE)
+			continue;
+
+		if (!blk_mq_sched_allow_merge(q, rq, bio))
+			break;
+
 		if (el_ret == ELEVATOR_BACK_MERGE) {
 			if (bio_attempt_back_merge(q, rq, bio)) {
 				ctx->rq_merged++;
@@ -824,6 +847,59 @@ static inline unsigned int queued_to_index(unsigned int queued)
 	return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
 }
 
+static bool blk_mq_get_driver_tag(struct request *rq,
+				  struct blk_mq_hw_ctx **hctx, bool wait)
+{
+	struct blk_mq_alloc_data data = {
+		.q = rq->q,
+		.ctx = rq->mq_ctx,
+		.hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
+		.flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
+	};
+
+	if (blk_mq_hctx_stopped(data.hctx))
+		return false;
+
+	if (rq->tag != -1) {
+done:
+		if (hctx)
+			*hctx = data.hctx;
+		return true;
+	}
+
+	rq->tag = blk_mq_get_tag(&data);
+	if (rq->tag >= 0) {
+		data.hctx->tags->rqs[rq->tag] = rq;
+		goto done;
+	}
+
+	return false;
+}
+
+/*
+ * If we fail getting a driver tag because all the driver tags are already
+ * assigned and on the dispatch list, BUT the first entry does not have a
+ * tag, then we could deadlock. For that case, move entries with assigned
+ * driver tags to the front, leaving the set of tagged requests in the
+ * same order, and the untagged set in the same order.
+ */
+static bool reorder_tags_to_front(struct list_head *list)
+{
+	struct request *rq, *tmp, *first = NULL;
+
+	list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) {
+		if (rq == first)
+			break;
+		if (rq->tag != -1) {
+			list_move(&rq->queuelist, list);
+			if (!first)
+				first = rq;
+		}
+	}
+
+	return first != NULL;
+}
+
 bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 {
 	struct request_queue *q = hctx->queue;
@@ -846,6 +922,12 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 		struct blk_mq_queue_data bd;
 
 		rq = list_first_entry(list, struct request, queuelist);
+		if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
+			if (!queued && reorder_tags_to_front(list))
+				continue;
+			blk_mq_sched_mark_restart(hctx);
+			break;
+		}
 		list_del_init(&rq->queuelist);
 
 		bd.rq = rq;
@@ -899,48 +981,17 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 		 * the requests in rq_list might get lost.
 		 *
 		 * blk_mq_run_hw_queue() already checks the STOPPED bit
-		 **/
-		blk_mq_run_hw_queue(hctx, true);
+		 *
+		 * If RESTART is set, then let completion restart the queue
+		 * instead of potentially looping here.
+		 */
+		if (!blk_mq_sched_needs_restart(hctx))
+			blk_mq_run_hw_queue(hctx, true);
 	}
 
 	return ret != BLK_MQ_RQ_QUEUE_BUSY;
 }
 
-/*
- * Run this hardware queue, pulling any software queues mapped to it in.
- * Note that this function currently has various problems around ordering
- * of IO. In particular, we'd like FIFO behaviour on handling existing
- * items on the hctx->dispatch list. Ignore that for now.
- */
-static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
-{
-	LIST_HEAD(rq_list);
-	LIST_HEAD(driver_list);
-
-	if (unlikely(blk_mq_hctx_stopped(hctx)))
-		return;
-
-	hctx->run++;
-
-	/*
-	 * Touch any software queue that has pending entries.
-	 */
-	blk_mq_flush_busy_ctxs(hctx, &rq_list);
-
-	/*
-	 * If we have previous entries on our dispatch list, grab them
-	 * and stuff them at the front for more fair dispatch.
-	 */
-	if (!list_empty_careful(&hctx->dispatch)) {
-		spin_lock(&hctx->lock);
-		if (!list_empty(&hctx->dispatch))
-			list_splice_init(&hctx->dispatch, &rq_list);
-		spin_unlock(&hctx->lock);
-	}
-
-	blk_mq_dispatch_rq_list(hctx, &rq_list);
-}
-
 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
 	int srcu_idx;
@@ -950,11 +1001,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 
 	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
 		rcu_read_lock();
-		blk_mq_process_rq_list(hctx);
+		blk_mq_sched_dispatch_requests(hctx);
 		rcu_read_unlock();
 	} else {
 		srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
-		blk_mq_process_rq_list(hctx);
+		blk_mq_sched_dispatch_requests(hctx);
 		srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
 	}
 }
@@ -1010,8 +1061,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
 	int i;
 
 	queue_for_each_hw_ctx(q, hctx, i) {
-		if ((!blk_mq_hctx_has_pending(hctx) &&
-		    list_empty_careful(&hctx->dispatch)) ||
+		if (!blk_mq_hctx_has_pending(hctx) ||
 		    blk_mq_hctx_stopped(hctx))
 			continue;
 
@@ -1148,32 +1198,10 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 	blk_mq_hctx_mark_pending(hctx, ctx);
 }
 
-void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
-			   bool async)
-{
-	struct blk_mq_ctx *ctx = rq->mq_ctx;
-	struct request_queue *q = rq->q;
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
-
-	spin_lock(&ctx->lock);
-	__blk_mq_insert_request(hctx, rq, at_head);
-	spin_unlock(&ctx->lock);
-
-	if (run_queue)
-		blk_mq_run_hw_queue(hctx, async);
-}
-
-static void blk_mq_insert_requests(struct request_queue *q,
-				     struct blk_mq_ctx *ctx,
-				     struct list_head *list,
-				     int depth,
-				     bool from_schedule)
+void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+			    struct list_head *list)
 
 {
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
-
-	trace_block_unplug(q, depth, !from_schedule);
-
 	/*
 	 * preemption doesn't flush plug list, so it's possible ctx->cpu is
 	 * offline now
@@ -1189,8 +1217,6 @@ static void blk_mq_insert_requests(struct request_queue *q,
 	}
 	blk_mq_hctx_mark_pending(hctx, ctx);
 	spin_unlock(&ctx->lock);
-
-	blk_mq_run_hw_queue(hctx, from_schedule);
 }
 
 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
@@ -1226,9 +1252,10 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 		BUG_ON(!rq->q);
 		if (rq->mq_ctx != this_ctx) {
 			if (this_ctx) {
-				blk_mq_insert_requests(this_q, this_ctx,
-							&ctx_list, depth,
-							from_schedule);
+				trace_block_unplug(this_q, depth, from_schedule);
+				blk_mq_sched_insert_requests(this_q, this_ctx,
+								&ctx_list,
+								from_schedule);
 			}
 
 			this_ctx = rq->mq_ctx;
@@ -1245,8 +1272,9 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 	 * on 'ctx_list'. Do those.
 	 */
 	if (this_ctx) {
-		blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
-				       from_schedule);
+		trace_block_unplug(this_q, depth, from_schedule);
+		blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
+						from_schedule);
 	}
 }
 
@@ -1284,51 +1312,39 @@ insert_rq:
 		}
 
 		spin_unlock(&ctx->lock);
-		__blk_mq_free_request(hctx, ctx, rq);
+		__blk_mq_finish_request(hctx, ctx, rq);
 		return true;
 	}
 }
 
-static struct request *blk_mq_map_request(struct request_queue *q,
-					  struct bio *bio,
-					  struct blk_mq_alloc_data *data)
-{
-	struct blk_mq_hw_ctx *hctx;
-	struct blk_mq_ctx *ctx;
-	struct request *rq;
-
-	blk_queue_enter_live(q);
-	ctx = blk_mq_get_ctx(q);
-	hctx = blk_mq_map_queue(q, ctx->cpu);
-
-	trace_block_getrq(q, bio, bio->bi_opf);
-	blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
-	rq = __blk_mq_alloc_request(data, bio->bi_opf);
-
-	data->hctx->queued++;
-	return rq;
-}
-
 static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
-	return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false);
+	if (rq->tag != -1)
+		return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false);
+
+	return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
 }
 
 static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
 {
-	int ret;
 	struct request_queue *q = rq->q;
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
 	struct blk_mq_queue_data bd = {
 		.rq = rq,
 		.list = NULL,
 		.last = 1
 	};
-	blk_qc_t new_cookie = request_to_qc_t(hctx, rq);
+	struct blk_mq_hw_ctx *hctx;
+	blk_qc_t new_cookie;
+	int ret;
 
-	if (blk_mq_hctx_stopped(hctx))
+	if (q->elevator)
 		goto insert;
 
+	if (!blk_mq_get_driver_tag(rq, &hctx, false))
+		goto insert;
+
+	new_cookie = request_to_qc_t(hctx, rq);
+
 	/*
 	 * For OK queue, we are done. For error, kill it. Any other
 	 * error (busy), just add it to our list as we previously
@@ -1350,7 +1366,7 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
 	}
 
 insert:
-	blk_mq_insert_request(rq, false, true, true);
+	blk_mq_sched_insert_request(rq, false, true, true);
 }
 
 /*
@@ -1383,9 +1399,14 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	    blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
 		return BLK_QC_T_NONE;
 
+	if (blk_mq_sched_bio_merge(q, bio))
+		return BLK_QC_T_NONE;
+
 	wb_acct = wbt_wait(q->rq_wb, bio, NULL);
 
-	rq = blk_mq_map_request(q, bio, &data);
+	trace_block_getrq(q, bio, bio->bi_opf);
+
+	rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
 	if (unlikely(!rq)) {
 		__wbt_done(q->rq_wb, wb_acct);
 		return BLK_QC_T_NONE;
@@ -1397,6 +1418,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
 	if (unlikely(is_flush_fua)) {
 		blk_mq_bio_to_request(rq, bio);
+		blk_mq_get_driver_tag(rq, NULL, true);
 		blk_insert_flush(rq);
 		goto run_queue;
 	}
@@ -1447,6 +1469,12 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		goto done;
 	}
 
+	if (q->elevator) {
+		blk_mq_put_ctx(data.ctx);
+		blk_mq_bio_to_request(rq, bio);
+		blk_mq_sched_insert_request(rq, false, true, true);
+		goto done;
+	}
 	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
 		/*
 		 * For a SYNC request, send it to the hardware immediately. For
@@ -1492,9 +1520,14 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 	} else
 		request_count = blk_plug_queued_count(q);
 
+	if (blk_mq_sched_bio_merge(q, bio))
+		return BLK_QC_T_NONE;
+
 	wb_acct = wbt_wait(q->rq_wb, bio, NULL);
 
-	rq = blk_mq_map_request(q, bio, &data);
+	trace_block_getrq(q, bio, bio->bi_opf);
+
+	rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
 	if (unlikely(!rq)) {
 		__wbt_done(q->rq_wb, wb_acct);
 		return BLK_QC_T_NONE;
@@ -1506,6 +1539,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 
 	if (unlikely(is_flush_fua)) {
 		blk_mq_bio_to_request(rq, bio);
+		blk_mq_get_driver_tag(rq, NULL, true);
 		blk_insert_flush(rq);
 		goto run_queue;
 	}
@@ -1544,6 +1578,12 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 		return cookie;
 	}
 
+	if (q->elevator) {
+		blk_mq_put_ctx(data.ctx);
+		blk_mq_bio_to_request(rq, bio);
+		blk_mq_sched_insert_request(rq, false, true, true);
+		goto done;
+	}
 	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
 		/*
 		 * For a SYNC request, send it to the hardware immediately. For
@@ -1556,6 +1596,7 @@ run_queue:
 	}
 
 	blk_mq_put_ctx(data.ctx);
+done:
 	return cookie;
 }
 
@@ -1925,9 +1966,11 @@ static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
 static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
 					 unsigned int hctx_idx)
 {
-	blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
-	blk_mq_free_rq_map(set->tags[hctx_idx]);
-	set->tags[hctx_idx] = NULL;
+	if (set->tags[hctx_idx]) {
+		blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
+		blk_mq_free_rq_map(set->tags[hctx_idx]);
+		set->tags[hctx_idx] = NULL;
+	}
 }
 
 static void blk_mq_map_swqueue(struct request_queue *q,
@@ -2084,6 +2127,8 @@ void blk_mq_release(struct request_queue *q)
 	struct blk_mq_hw_ctx *hctx;
 	unsigned int i;
 
+	blk_mq_sched_teardown(q);
+
 	/* hctx kobj stays in hctx */
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (!hctx)
@@ -2504,14 +2549,22 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
 	struct blk_mq_hw_ctx *hctx;
 	int i, ret;
 
-	if (!set || nr > set->queue_depth)
+	if (!set)
 		return -EINVAL;
 
 	ret = 0;
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (!hctx->tags)
 			continue;
-		ret = blk_mq_tag_update_depth(hctx->tags, nr);
+		/*
+		 * If we're using an MQ scheduler, just update the scheduler
+		 * queue depth. This is similar to what the old code would do.
+		 */
+		if (!hctx->sched_tags)
+			ret = blk_mq_tag_update_depth(hctx->tags,
+							min(nr, set->queue_depth));
+		else
+			ret = blk_mq_tag_update_depth(hctx->sched_tags, nr);
 		if (ret)
 			break;
 	}
@@ -2704,7 +2757,10 @@ bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
 		blk_flush_plug_list(plug, false);
 
 	hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
-	rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
+	if (!blk_qc_t_is_internal(cookie))
+		rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
+	else
+		rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
 
 	return __blk_mq_poll(hctx, rq);
 }
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 1b279b02d0f6..0c7c034d9ddd 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -52,6 +52,8 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
  */
 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 				bool at_head);
+void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+				struct list_head *list);
 /*
  * CPU hotplug helpers
  */
@@ -124,6 +126,9 @@ static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data,
 
 static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
 {
+	if (data->flags & BLK_MQ_REQ_INTERNAL)
+		return data->hctx->sched_tags;
+
 	return data->hctx->tags;
 }
 
@@ -132,8 +137,9 @@ static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data
  */
 void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 			struct request *rq, unsigned int op);
-void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 				struct request *rq);
+void blk_mq_finish_request(struct request *rq);
 struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
 					unsigned int op);
 
diff --git a/block/blk-tag.c b/block/blk-tag.c
index bae1decb6ec3..07cc329fa4b0 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -272,6 +272,7 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq)
 	list_del_init(&rq->queuelist);
 	rq->rq_flags &= ~RQF_QUEUED;
 	rq->tag = -1;
+	rq->internal_tag = -1;
 
 	if (unlikely(bqt->tag_index[tag] == NULL))
 		printk(KERN_ERR "%s: tag %d is missing\n",
diff --git a/block/elevator.c b/block/elevator.c
index 022a26830297..0e1ccddab8a2 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -40,6 +40,7 @@
 #include <trace/events/block.h>
 
 #include "blk.h"
+#include "blk-mq-sched.h"
 
 static DEFINE_SPINLOCK(elv_list_lock);
 static LIST_HEAD(elv_list);
@@ -58,7 +59,9 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
 	struct request_queue *q = rq->q;
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.sq.elevator_allow_bio_merge_fn)
+	if (e->uses_mq && e->type->ops.mq.allow_merge)
+		return e->type->ops.mq.allow_merge(q, rq, bio);
+	else if (!e->uses_mq && e->type->ops.sq.elevator_allow_bio_merge_fn)
 		return e->type->ops.sq.elevator_allow_bio_merge_fn(q, rq, bio);
 
 	return 1;
@@ -163,6 +166,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
 	kobject_init(&eq->kobj, &elv_ktype);
 	mutex_init(&eq->sysfs_lock);
 	hash_init(eq->hash);
+	eq->uses_mq = e->uses_mq;
 
 	return eq;
 }
@@ -219,14 +223,26 @@ int elevator_init(struct request_queue *q, char *name)
 		if (!e) {
 			printk(KERN_ERR
 				"Default I/O scheduler not found. " \
-				"Using noop.\n");
+				"Using noop/none.\n");
+			if (q->mq_ops) {
+				elevator_put(e);
+				return 0;
+			}
 			e = elevator_get("noop", false);
 		}
 	}
 
-	err = e->ops.sq.elevator_init_fn(q, e);
-	if (err)
+	if (e->uses_mq) {
+		err = blk_mq_sched_setup(q);
+		if (!err)
+			err = e->ops.mq.init_sched(q, e);
+	} else
+		err = e->ops.sq.elevator_init_fn(q, e);
+	if (err) {
+		if (e->uses_mq)
+			blk_mq_sched_teardown(q);
 		elevator_put(e);
+	}
 	return err;
 }
 EXPORT_SYMBOL(elevator_init);
@@ -234,7 +250,9 @@ EXPORT_SYMBOL(elevator_init);
 void elevator_exit(struct elevator_queue *e)
 {
 	mutex_lock(&e->sysfs_lock);
-	if (e->type->ops.sq.elevator_exit_fn)
+	if (e->uses_mq && e->type->ops.mq.exit_sched)
+		e->type->ops.mq.exit_sched(e);
+	else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn)
 		e->type->ops.sq.elevator_exit_fn(e);
 	mutex_unlock(&e->sysfs_lock);
 
@@ -253,6 +271,7 @@ void elv_rqhash_del(struct request_queue *q, struct request *rq)
 	if (ELV_ON_HASH(rq))
 		__elv_rqhash_del(rq);
 }
+EXPORT_SYMBOL_GPL(elv_rqhash_del);
 
 void elv_rqhash_add(struct request_queue *q, struct request *rq)
 {
@@ -262,6 +281,7 @@ void elv_rqhash_add(struct request_queue *q, struct request *rq)
 	hash_add(e->hash, &rq->hash, rq_hash_key(rq));
 	rq->rq_flags |= RQF_HASHED;
 }
+EXPORT_SYMBOL_GPL(elv_rqhash_add);
 
 void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
 {
@@ -443,7 +463,9 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
 		return ELEVATOR_BACK_MERGE;
 	}
 
-	if (e->type->ops.sq.elevator_merge_fn)
+	if (e->uses_mq && e->type->ops.mq.request_merge)
+		return e->type->ops.mq.request_merge(q, req, bio);
+	else if (!e->uses_mq && e->type->ops.sq.elevator_merge_fn)
 		return e->type->ops.sq.elevator_merge_fn(q, req, bio);
 
 	return ELEVATOR_NO_MERGE;
@@ -456,8 +478,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
  *
  * Returns true if we merged, false otherwise
  */
-static bool elv_attempt_insert_merge(struct request_queue *q,
-				     struct request *rq)
+bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
 {
 	struct request *__rq;
 	bool ret;
@@ -495,7 +516,9 @@ void elv_merged_request(struct request_queue *q, struct request *rq, int type)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.sq.elevator_merged_fn)
+	if (e->uses_mq && e->type->ops.mq.request_merged)
+		e->type->ops.mq.request_merged(q, rq, type);
+	else if (!e->uses_mq && e->type->ops.sq.elevator_merged_fn)
 		e->type->ops.sq.elevator_merged_fn(q, rq, type);
 
 	if (type == ELEVATOR_BACK_MERGE)
@@ -508,10 +531,15 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
 			     struct request *next)
 {
 	struct elevator_queue *e = q->elevator;
-	const int next_sorted = next->rq_flags & RQF_SORTED;
-
-	if (next_sorted && e->type->ops.sq.elevator_merge_req_fn)
-		e->type->ops.sq.elevator_merge_req_fn(q, rq, next);
+	bool next_sorted = false;
+
+	if (e->uses_mq && e->type->ops.mq.requests_merged)
+		e->type->ops.mq.requests_merged(q, rq, next);
+	else if (e->type->ops.sq.elevator_merge_req_fn) {
+		next_sorted = next->rq_flags & RQF_SORTED;
+		if (next_sorted)
+			e->type->ops.sq.elevator_merge_req_fn(q, rq, next);
+	}
 
 	elv_rqhash_reposition(q, rq);
 
@@ -528,6 +556,9 @@ void elv_bio_merged(struct request_queue *q, struct request *rq,
 {
 	struct elevator_queue *e = q->elevator;
 
+	if (WARN_ON_ONCE(e->uses_mq))
+		return;
+
 	if (e->type->ops.sq.elevator_bio_merged_fn)
 		e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio);
 }
@@ -574,11 +605,15 @@ void elv_requeue_request(struct request_queue *q, struct request *rq)
 
 void elv_drain_elevator(struct request_queue *q)
 {
+	struct elevator_queue *e = q->elevator;
 	static int printed;
 
+	if (WARN_ON_ONCE(e->uses_mq))
+		return;
+
 	lockdep_assert_held(q->queue_lock);
 
-	while (q->elevator->type->ops.sq.elevator_dispatch_fn(q, 1))
+	while (e->type->ops.sq.elevator_dispatch_fn(q, 1))
 		;
 	if (q->nr_sorted && printed++ < 10) {
 		printk(KERN_ERR "%s: forced dispatching is broken "
@@ -682,8 +717,11 @@ struct request *elv_latter_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.sq.elevator_latter_req_fn)
+	if (e->uses_mq && e->type->ops.mq.next_request)
+		return e->type->ops.mq.next_request(q, rq);
+	else if (!e->uses_mq && e->type->ops.sq.elevator_latter_req_fn)
 		return e->type->ops.sq.elevator_latter_req_fn(q, rq);
+
 	return NULL;
 }
 
@@ -691,7 +729,9 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.sq.elevator_former_req_fn)
+	if (e->uses_mq && e->type->ops.mq.former_request)
+		return e->type->ops.mq.former_request(q, rq);
+	if (!e->uses_mq && e->type->ops.sq.elevator_former_req_fn)
 		return e->type->ops.sq.elevator_former_req_fn(q, rq);
 	return NULL;
 }
@@ -701,6 +741,9 @@ int elv_set_request(struct request_queue *q, struct request *rq,
 {
 	struct elevator_queue *e = q->elevator;
 
+	if (WARN_ON_ONCE(e->uses_mq))
+		return 0;
+
 	if (e->type->ops.sq.elevator_set_req_fn)
 		return e->type->ops.sq.elevator_set_req_fn(q, rq, bio, gfp_mask);
 	return 0;
@@ -710,6 +753,9 @@ void elv_put_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
+	if (WARN_ON_ONCE(e->uses_mq))
+		return;
+
 	if (e->type->ops.sq.elevator_put_req_fn)
 		e->type->ops.sq.elevator_put_req_fn(rq);
 }
@@ -718,6 +764,9 @@ int elv_may_queue(struct request_queue *q, unsigned int op)
 {
 	struct elevator_queue *e = q->elevator;
 
+	if (WARN_ON_ONCE(e->uses_mq))
+		return 0;
+
 	if (e->type->ops.sq.elevator_may_queue_fn)
 		return e->type->ops.sq.elevator_may_queue_fn(q, op);
 
@@ -728,6 +777,9 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
+	if (WARN_ON_ONCE(e->uses_mq))
+		return;
+
 	/*
 	 * request is released from the driver, io must be done
 	 */
@@ -803,7 +855,7 @@ int elv_register_queue(struct request_queue *q)
 		}
 		kobject_uevent(&e->kobj, KOBJ_ADD);
 		e->registered = 1;
-		if (e->type->ops.sq.elevator_registered_fn)
+		if (!e->uses_mq && e->type->ops.sq.elevator_registered_fn)
 			e->type->ops.sq.elevator_registered_fn(q);
 	}
 	return error;
@@ -891,9 +943,14 @@ EXPORT_SYMBOL_GPL(elv_unregister);
 static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 {
 	struct elevator_queue *old = q->elevator;
-	bool registered = old->registered;
+	bool old_registered = false;
 	int err;
 
+	if (q->mq_ops) {
+		blk_mq_freeze_queue(q);
+		blk_mq_quiesce_queue(q);
+	}
+
 	/*
 	 * Turn on BYPASS and drain all requests w/ elevator private data.
 	 * Block layer doesn't call into a quiesced elevator - all requests
@@ -901,42 +958,76 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 	 * using INSERT_BACK.  All requests have SOFTBARRIER set and no
 	 * merge happens either.
 	 */
-	blk_queue_bypass_start(q);
+	if (old) {
+		old_registered = old->registered;
+
+		if (old->uses_mq)
+			blk_mq_sched_teardown(q);
 
-	/* unregister and clear all auxiliary data of the old elevator */
-	if (registered)
-		elv_unregister_queue(q);
+		if (!q->mq_ops)
+			blk_queue_bypass_start(q);
 
-	spin_lock_irq(q->queue_lock);
-	ioc_clear_queue(q);
-	spin_unlock_irq(q->queue_lock);
+		/* unregister and clear all auxiliary data of the old elevator */
+		if (old_registered)
+			elv_unregister_queue(q);
+
+		spin_lock_irq(q->queue_lock);
+		ioc_clear_queue(q);
+		spin_unlock_irq(q->queue_lock);
+	}
 
 	/* allocate, init and register new elevator */
-	err = new_e->ops.sq.elevator_init_fn(q, new_e);
-	if (err)
-		goto fail_init;
+	if (new_e) {
+		if (new_e->uses_mq) {
+			err = blk_mq_sched_setup(q);
+			if (!err)
+				err = new_e->ops.mq.init_sched(q, new_e);
+		} else
+			err = new_e->ops.sq.elevator_init_fn(q, new_e);
+		if (err)
+			goto fail_init;
 
-	if (registered) {
 		err = elv_register_queue(q);
 		if (err)
 			goto fail_register;
-	}
+	} else
+		q->elevator = NULL;
 
 	/* done, kill the old one and finish */
-	elevator_exit(old);
-	blk_queue_bypass_end(q);
+	if (old) {
+		elevator_exit(old);
+		if (!q->mq_ops)
+			blk_queue_bypass_end(q);
+	}
+
+	if (q->mq_ops) {
+		blk_mq_unfreeze_queue(q);
+		blk_mq_start_stopped_hw_queues(q, true);
+	}
 
-	blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
+	if (new_e)
+		blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
+	else
+		blk_add_trace_msg(q, "elv switch: none");
 
 	return 0;
 
 fail_register:
+	if (q->mq_ops)
+		blk_mq_sched_teardown(q);
 	elevator_exit(q->elevator);
 fail_init:
 	/* switch failed, restore and re-register old elevator */
-	q->elevator = old;
-	elv_register_queue(q);
-	blk_queue_bypass_end(q);
+	if (old) {
+		q->elevator = old;
+		elv_register_queue(q);
+		if (!q->mq_ops)
+			blk_queue_bypass_end(q);
+	}
+	if (q->mq_ops) {
+		blk_mq_unfreeze_queue(q);
+		blk_mq_start_stopped_hw_queues(q, true);
+	}
 
 	return err;
 }
@@ -949,8 +1040,11 @@ static int __elevator_change(struct request_queue *q, const char *name)
 	char elevator_name[ELV_NAME_MAX];
 	struct elevator_type *e;
 
-	if (!q->elevator)
-		return -ENXIO;
+	/*
+	 * Special case for mq, turn off scheduling
+	 */
+	if (q->mq_ops && !strncmp(name, "none", 4))
+		return elevator_switch(q, NULL);
 
 	strlcpy(elevator_name, name, sizeof(elevator_name));
 	e = elevator_get(strstrip(elevator_name), true);
@@ -959,11 +1053,21 @@ static int __elevator_change(struct request_queue *q, const char *name)
 		return -EINVAL;
 	}
 
-	if (!strcmp(elevator_name, q->elevator->type->elevator_name)) {
+	if (q->elevator &&
+	    !strcmp(elevator_name, q->elevator->type->elevator_name)) {
 		elevator_put(e);
 		return 0;
 	}
 
+	if (!e->uses_mq && q->mq_ops) {
+		elevator_put(e);
+		return -EINVAL;
+	}
+	if (e->uses_mq && !q->mq_ops) {
+		elevator_put(e);
+		return -EINVAL;
+	}
+
 	return elevator_switch(q, e);
 }
 
@@ -985,7 +1089,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
 {
 	int ret;
 
-	if (!q->elevator)
+	if (!(q->mq_ops || q->request_fn))
 		return count;
 
 	ret = __elevator_change(q, name);
@@ -999,24 +1103,34 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
 ssize_t elv_iosched_show(struct request_queue *q, char *name)
 {
 	struct elevator_queue *e = q->elevator;
-	struct elevator_type *elv;
+	struct elevator_type *elv = NULL;
 	struct elevator_type *__e;
 	int len = 0;
 
-	if (!q->elevator || !blk_queue_stackable(q))
+	if (!blk_queue_stackable(q))
 		return sprintf(name, "none\n");
 
-	elv = e->type;
+	if (!q->elevator)
+		len += sprintf(name+len, "[none] ");
+	else
+		elv = e->type;
 
 	spin_lock(&elv_list_lock);
 	list_for_each_entry(__e, &elv_list, list) {
-		if (!strcmp(elv->elevator_name, __e->elevator_name))
+		if (elv && !strcmp(elv->elevator_name, __e->elevator_name)) {
 			len += sprintf(name+len, "[%s] ", elv->elevator_name);
-		else
+			continue;
+		}
+		if (__e->uses_mq && q->mq_ops)
+			len += sprintf(name+len, "%s ", __e->elevator_name);
+		else if (!__e->uses_mq && !q->mq_ops)
 			len += sprintf(name+len, "%s ", __e->elevator_name);
 	}
 	spin_unlock(&elv_list_lock);
 
+	if (q->mq_ops && q->elevator)
+		len += sprintf(name+len, "none");
+
 	len += sprintf(len+name, "\n");
 	return len;
 }
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2686f9e7302a..63569eb46d15 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -22,6 +22,7 @@ struct blk_mq_hw_ctx {
 
 	unsigned long		flags;		/* BLK_MQ_F_* flags */
 
+	void			*sched_data;
 	struct request_queue	*queue;
 	struct blk_flush_queue	*fq;
 
@@ -35,6 +36,7 @@ struct blk_mq_hw_ctx {
 	atomic_t		wait_index;
 
 	struct blk_mq_tags	*tags;
+	struct blk_mq_tags	*sched_tags;
 
 	struct srcu_struct	queue_rq_srcu;
 
@@ -156,6 +158,7 @@ enum {
 
 	BLK_MQ_S_STOPPED	= 0,
 	BLK_MQ_S_TAG_ACTIVE	= 1,
+	BLK_MQ_S_SCHED_RESTART	= 2,
 
 	BLK_MQ_MAX_DEPTH	= 10240,
 
@@ -179,13 +182,13 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
 
 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 
-void blk_mq_insert_request(struct request *, bool, bool, bool);
 void blk_mq_free_request(struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 
 enum {
 	BLK_MQ_REQ_NOWAIT	= (1 << 0), /* return when out of requests */
 	BLK_MQ_REQ_RESERVED	= (1 << 1), /* allocate from reserved pool */
+	BLK_MQ_REQ_INTERNAL	= (1 << 2), /* allocate internal/sched tag */
 };
 
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2e99d659b0f1..25564857f5f8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -154,6 +154,7 @@ struct request {
 
 	/* the following two fields are internal, NEVER access directly */
 	unsigned int __data_len;	/* total data len */
+	int tag;
 	sector_t __sector;		/* sector cursor */
 
 	struct bio *bio;
@@ -220,9 +221,10 @@ struct request {
 
 	unsigned short ioprio;
 
+	int internal_tag;
+
 	void *special;		/* opaque pointer available for LLD use */
 
-	int tag;
 	int errors;
 
 	/*
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 2a9e966eed03..ecb96fd67c6d 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -77,6 +77,34 @@ struct elevator_ops
 	elevator_registered_fn *elevator_registered_fn;
 };
 
+struct blk_mq_alloc_data;
+struct blk_mq_hw_ctx;
+
+struct elevator_mq_ops {
+	int (*init_sched)(struct request_queue *, struct elevator_type *);
+	void (*exit_sched)(struct elevator_queue *);
+
+	bool (*allow_merge)(struct request_queue *, struct request *, struct bio *);
+	bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *);
+	int (*request_merge)(struct request_queue *q, struct request **, struct bio *);
+	void (*request_merged)(struct request_queue *, struct request *, int);
+	void (*requests_merged)(struct request_queue *, struct request *, struct request *);
+	struct request *(*get_request)(struct request_queue *, unsigned int, struct blk_mq_alloc_data *);
+	void (*put_request)(struct request *);
+	void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool);
+	void (*dispatch_requests)(struct blk_mq_hw_ctx *, struct list_head *);
+	bool (*has_work)(struct blk_mq_hw_ctx *);
+	void (*completed_request)(struct blk_mq_hw_ctx *, struct request *);
+	void (*started_request)(struct request *);
+	void (*requeue_request)(struct request *);
+	struct request *(*former_request)(struct request_queue *, struct request *);
+	struct request *(*next_request)(struct request_queue *, struct request *);
+	int (*get_rq_priv)(struct request_queue *, struct request *);
+	void (*put_rq_priv)(struct request_queue *, struct request *);
+	void (*init_icq)(struct io_cq *);
+	void (*exit_icq)(struct io_cq *);
+};
+
 #define ELV_NAME_MAX	(16)
 
 struct elv_fs_entry {
@@ -96,12 +124,14 @@ struct elevator_type
 	/* fields provided by elevator implementation */
 	union {
 		struct elevator_ops sq;
+		struct elevator_mq_ops mq;
 	} ops;
 	size_t icq_size;	/* see iocontext.h */
 	size_t icq_align;	/* ditto */
 	struct elv_fs_entry *elevator_attrs;
 	char elevator_name[ELV_NAME_MAX];
 	struct module *elevator_owner;
+	bool uses_mq;
 
 	/* managed by elevator core */
 	char icq_cache_name[ELV_NAME_MAX + 5];	/* elvname + "_io_cq" */
@@ -125,6 +155,7 @@ struct elevator_queue
 	struct kobject kobj;
 	struct mutex sysfs_lock;
 	unsigned int registered:1;
+	unsigned int uses_mq:1;
 	DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
 };
 
@@ -141,6 +172,7 @@ extern void elv_merge_requests(struct request_queue *, struct request *,
 extern void elv_merged_request(struct request_queue *, struct request *, int);
 extern void elv_bio_merged(struct request_queue *q, struct request *,
 				struct bio *);
+extern bool elv_attempt_insert_merge(struct request_queue *, struct request *);
 extern void elv_requeue_request(struct request_queue *, struct request *);
 extern struct request *elv_former_request(struct request_queue *, struct request *);
 extern struct request *elv_latter_request(struct request_queue *, struct request *);
-- 
cgit v1.2.3


From d34849913819a5e0cbfbe724dbe79df89278c524 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 13 Jan 2017 14:43:58 -0700
Subject: blk-mq-sched: allow setting of default IO scheduler

Add Kconfig entries to manage what devices get assigned an MQ
scheduler, and add a blk-mq flag for drivers to opt out of scheduling.
The latter is useful for admin type queues that still allocate a blk-mq
queue and tag set, but aren't use for normal IO.

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
---
 block/Kconfig.iosched   | 56 +++++++++++++++++++++++++++++++++++++++++++------
 block/blk-mq-sched.c    | 20 ++++++++++++++++++
 block/blk-mq-sched.h    |  2 ++
 block/blk-mq.c          |  8 +++++++
 block/elevator.c        |  8 ++++++-
 drivers/nvme/host/pci.c |  1 +
 include/linux/blk-mq.h  |  1 +
 7 files changed, 89 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 490ef2850fae..0715ce93daef 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -32,12 +32,6 @@ config IOSCHED_CFQ
 
 	  This is the default I/O scheduler.
 
-config MQ_IOSCHED_DEADLINE
-	tristate "MQ deadline I/O scheduler"
-	default y
-	---help---
-	  MQ version of the deadline IO scheduler.
-
 config CFQ_GROUP_IOSCHED
 	bool "CFQ Group Scheduling support"
 	depends on IOSCHED_CFQ && BLK_CGROUP
@@ -69,6 +63,56 @@ config DEFAULT_IOSCHED
 	default "cfq" if DEFAULT_CFQ
 	default "noop" if DEFAULT_NOOP
 
+config MQ_IOSCHED_DEADLINE
+	tristate "MQ deadline I/O scheduler"
+	default y
+	---help---
+	  MQ version of the deadline IO scheduler.
+
+config MQ_IOSCHED_NONE
+	bool
+	default y
+
+choice
+	prompt "Default single-queue blk-mq I/O scheduler"
+	default DEFAULT_SQ_NONE
+	help
+	  Select the I/O scheduler which will be used by default for blk-mq
+	  managed block devices with a single queue.
+
+	config DEFAULT_SQ_DEADLINE
+		bool "MQ Deadline" if MQ_IOSCHED_DEADLINE=y
+
+	config DEFAULT_SQ_NONE
+		bool "None"
+
+endchoice
+
+config DEFAULT_SQ_IOSCHED
+	string
+	default "mq-deadline" if DEFAULT_SQ_DEADLINE
+	default "none" if DEFAULT_SQ_NONE
+
+choice
+	prompt "Default multi-queue blk-mq I/O scheduler"
+	default DEFAULT_MQ_NONE
+	help
+	  Select the I/O scheduler which will be used by default for blk-mq
+	  managed block devices with multiple queues.
+
+	config DEFAULT_MQ_DEADLINE
+		bool "MQ Deadline" if MQ_IOSCHED_DEADLINE=y
+
+	config DEFAULT_MQ_NONE
+		bool "None"
+
+endchoice
+
+config DEFAULT_MQ_IOSCHED
+	string
+	default "mq-deadline" if DEFAULT_MQ_DEADLINE
+	default "none" if DEFAULT_MQ_NONE
+
 endmenu
 
 endif
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 26759798a0b3..d05061f27bb1 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -366,3 +366,23 @@ void blk_mq_sched_teardown(struct request_queue *q)
 	queue_for_each_hw_ctx(q, hctx, i)
 		blk_mq_sched_free_tags(set, hctx, i);
 }
+
+int blk_mq_sched_init(struct request_queue *q)
+{
+	int ret;
+
+#if defined(CONFIG_DEFAULT_SQ_NONE)
+	if (q->nr_hw_queues == 1)
+		return 0;
+#endif
+#if defined(CONFIG_DEFAULT_MQ_NONE)
+	if (q->nr_hw_queues > 1)
+		return 0;
+#endif
+
+	mutex_lock(&q->sysfs_lock);
+	ret = elevator_init(q, NULL);
+	mutex_unlock(&q->sysfs_lock);
+
+	return ret;
+}
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 35c49e2e008a..6b465bc7014c 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -28,6 +28,8 @@ void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
 int blk_mq_sched_setup(struct request_queue *q);
 void blk_mq_sched_teardown(struct request_queue *q);
 
+int blk_mq_sched_init(struct request_queue *q);
+
 static inline bool
 blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
 {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 45e1707a9f86..fa1f8619bfe7 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2285,6 +2285,14 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	mutex_unlock(&all_q_mutex);
 	put_online_cpus();
 
+	if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
+		int ret;
+
+		ret = blk_mq_sched_init(q);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+
 	return q;
 
 err_hctxs:
diff --git a/block/elevator.c b/block/elevator.c
index 0e1ccddab8a2..bcba2dd5cb5c 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -219,7 +219,13 @@ int elevator_init(struct request_queue *q, char *name)
 	}
 
 	if (!e) {
-		e = elevator_get(CONFIG_DEFAULT_IOSCHED, false);
+		if (q->mq_ops && q->nr_hw_queues == 1)
+			e = elevator_get(CONFIG_DEFAULT_SQ_IOSCHED, false);
+		else if (q->mq_ops)
+			e = elevator_get(CONFIG_DEFAULT_MQ_IOSCHED, false);
+		else
+			e = elevator_get(CONFIG_DEFAULT_IOSCHED, false);
+
 		if (!e) {
 			printk(KERN_ERR
 				"Default I/O scheduler not found. " \
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 19beeb7b2ac2..e1b4e603b1cf 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1181,6 +1181,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
 		dev->admin_tagset.timeout = ADMIN_TIMEOUT;
 		dev->admin_tagset.numa_node = dev_to_node(dev->dev);
 		dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
+		dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
 		dev->admin_tagset.driver_data = dev;
 
 		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 63569eb46d15..8e4df3d6c8cd 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -153,6 +153,7 @@ enum {
 	BLK_MQ_F_SG_MERGE	= 1 << 2,
 	BLK_MQ_F_DEFER_ISSUE	= 1 << 4,
 	BLK_MQ_F_BLOCKING	= 1 << 5,
+	BLK_MQ_F_NO_SCHED	= 1 << 6,
 	BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
 	BLK_MQ_F_ALLOC_POLICY_BITS = 1,
 
-- 
cgit v1.2.3


From d45ae1f7041ac52ade6c5ec76d96bbed765d67aa Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 17 Jan 2017 12:29:35 -0500
Subject: tracing: Process constants for (un)likely() profiler

When running the likely/unlikely profiler, one of the results did not look
accurate. It noted that the unlikely() in link_path_walk() was 100%
incorrect. When I added a trace_printk() to see what was happening there, it
became 80% correct! Looking deeper into what whas happening, I found that
gcc split that if statement into two paths. One where the if statement
became a constant, the other path a variable. The other path had the if
statement always hit (making the unlikely there, always false), but since
the #define unlikely() has:

  #define unlikely() (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0))

Where constants are ignored by the branch profiler, the "constant" path
made by the compiler was ignored, even though it was hit 80% of the time.

By just passing the constant value to the __branch_check__() function and
tracing it out of line (as always correct, as likely/unlikely isn't a factor
for constants), then we get back the accurate readings of branches that were
optimized by gcc causing part of the execution to become constant.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/compiler.h    | 14 ++++++++------
 kernel/trace/trace_branch.c |  6 +++++-
 2 files changed, 13 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index cf0fa5d86059..bbbe1570de1c 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -107,12 +107,13 @@ struct ftrace_branch_data {
  */
 #if defined(CONFIG_TRACE_BRANCH_PROFILING) \
     && !defined(DISABLE_BRANCH_PROFILING) && !defined(__CHECKER__)
-void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
+void ftrace_likely_update(struct ftrace_branch_data *f, int val,
+			  int expect, int is_constant);
 
 #define likely_notrace(x)	__builtin_expect(!!(x), 1)
 #define unlikely_notrace(x)	__builtin_expect(!!(x), 0)
 
-#define __branch_check__(x, expect) ({					\
+#define __branch_check__(x, expect, is_constant) ({			\
 			int ______r;					\
 			static struct ftrace_branch_data		\
 				__attribute__((__aligned__(4)))		\
@@ -122,8 +123,9 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
 				.file = __FILE__,			\
 				.line = __LINE__,			\
 			};						\
-			______r = likely_notrace(x);			\
-			ftrace_likely_update(&______f, ______r, expect); \
+			______r = __builtin_expect(!!(x), expect);	\
+			ftrace_likely_update(&______f, ______r,		\
+					     expect, is_constant);	\
 			______r;					\
 		})
 
@@ -133,10 +135,10 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
  * written by Daniel Walker.
  */
 # ifndef likely
-#  define likely(x)	(__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 1))
+#  define likely(x)	(__branch_check__(x, 1, __builtin_constant_p(x)))
 # endif
 # ifndef unlikely
-#  define unlikely(x)	(__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0))
+#  define unlikely(x)	(__branch_check__(x, 0, __builtin_constant_p(x)))
 # endif
 
 #ifdef CONFIG_PROFILE_ALL_BRANCHES
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 75489de546b6..7afe426ea528 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -200,8 +200,12 @@ void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 }
 #endif /* CONFIG_BRANCH_TRACER */
 
-void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
+void ftrace_likely_update(struct ftrace_branch_data *f, int val,
+			  int expect, int is_constant)
 {
+	/* A constant is always correct */
+	if (is_constant)
+		val = expect;
 	/*
 	 * I would love to have a trace point here instead, but the
 	 * trace point code is so inundated with unlikely and likely
-- 
cgit v1.2.3


From 0a13cd1a05e7a549259d4f803d2ec2efda07ed7c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 14 Nov 2016 18:03:11 +0100
Subject: locking/atomic, kref: Implement kref_put_lock()

Because home-rolling your own is _awesome_, stop doing it. Provide
kref_put_lock(), just like kref_put_mutex() but for a spinlock.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/kref.h | 22 ++++++++++++++++------
 net/sunrpc/svcauth.c | 15 ++++++++++-----
 2 files changed, 26 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kref.h b/include/linux/kref.h
index 31c49a64cdf9..9db9685fed84 100644
--- a/include/linux/kref.h
+++ b/include/linux/kref.h
@@ -19,6 +19,7 @@
 #include <linux/atomic.h>
 #include <linux/kernel.h>
 #include <linux/mutex.h>
+#include <linux/spinlock.h>
 
 struct kref {
 	atomic_t refcount;
@@ -86,12 +87,21 @@ static inline int kref_put_mutex(struct kref *kref,
 				 struct mutex *lock)
 {
 	WARN_ON(release == NULL);
-	if (unlikely(!atomic_add_unless(&kref->refcount, -1, 1))) {
-		mutex_lock(lock);
-		if (unlikely(!atomic_dec_and_test(&kref->refcount))) {
-			mutex_unlock(lock);
-			return 0;
-		}
+
+	if (atomic_dec_and_mutex_lock(&kref->refcount, lock)) {
+		release(kref);
+		return 1;
+	}
+	return 0;
+}
+
+static inline int kref_put_lock(struct kref *kref,
+				void (*release)(struct kref *kref),
+				spinlock_t *lock)
+{
+	WARN_ON(release == NULL);
+
+	if (atomic_dec_and_lock(&kref->refcount, lock)) {
 		release(kref);
 		return 1;
 	}
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index e112da8005b5..bb8db3cb8032 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -126,13 +126,18 @@ EXPORT_SYMBOL_GPL(svc_auth_unregister);
 static struct hlist_head	auth_domain_table[DN_HASHMAX];
 static DEFINE_SPINLOCK(auth_domain_lock);
 
+static void auth_domain_release(struct kref *kref)
+{
+	struct auth_domain *dom = container_of(kref, struct auth_domain, ref);
+
+	hlist_del(&dom->hash);
+	dom->flavour->domain_release(dom);
+	spin_unlock(&auth_domain_lock);
+}
+
 void auth_domain_put(struct auth_domain *dom)
 {
-	if (atomic_dec_and_lock(&dom->ref.refcount, &auth_domain_lock)) {
-		hlist_del(&dom->hash);
-		dom->flavour->domain_release(dom);
-		spin_unlock(&auth_domain_lock);
-	}
+	kref_put_lock(&dom->ref, auth_domain_release, &auth_domain_lock);
 }
 EXPORT_SYMBOL_GPL(auth_domain_put);
 
-- 
cgit v1.2.3


From 84d58132d285b2edef951d6633c1e5224e8b5283 Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Wed, 11 Jan 2017 06:35:10 -0800
Subject: rpmsg: Introduce "poll" to endpoint ops

This allows rpmsg backends to implement polling of the outgoing buffer,
which provides poll support to user space when using the rpmsg character
device.

Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 drivers/rpmsg/rpmsg_core.c     | 20 ++++++++++++++++++++
 drivers/rpmsg/rpmsg_internal.h |  3 +++
 include/linux/rpmsg.h          | 13 +++++++++++++
 3 files changed, 36 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/rpmsg/rpmsg_core.c b/drivers/rpmsg/rpmsg_core.c
index 1cfb775e8e82..3bf1418683b1 100644
--- a/drivers/rpmsg/rpmsg_core.c
+++ b/drivers/rpmsg/rpmsg_core.c
@@ -239,6 +239,26 @@ int rpmsg_trysendto(struct rpmsg_endpoint *ept, void *data, int len, u32 dst)
 }
 EXPORT_SYMBOL(rpmsg_trysendto);
 
+/**
+ * rpmsg_poll() - poll the endpoint's send buffers
+ * @ept:	the rpmsg endpoint
+ * @filp:	file for poll_wait()
+ * @wait:	poll_table for poll_wait()
+ *
+ * Returns mask representing the current state of the endpoint's send buffers
+ */
+unsigned int rpmsg_poll(struct rpmsg_endpoint *ept, struct file *filp,
+			poll_table *wait)
+{
+	if (WARN_ON(!ept))
+		return 0;
+	if (!ept->ops->poll)
+		return 0;
+
+	return ept->ops->poll(ept, filp, wait);
+}
+EXPORT_SYMBOL(rpmsg_poll);
+
 /**
  * rpmsg_send_offchannel() - send a message using explicit src/dst addresses
  * @ept: the rpmsg endpoint
diff --git a/drivers/rpmsg/rpmsg_internal.h b/drivers/rpmsg/rpmsg_internal.h
index 8075a20f919b..6176f2457b6b 100644
--- a/drivers/rpmsg/rpmsg_internal.h
+++ b/drivers/rpmsg/rpmsg_internal.h
@@ -21,6 +21,7 @@
 #define __RPMSG_INTERNAL_H__
 
 #include <linux/rpmsg.h>
+#include <linux/poll.h>
 
 #define to_rpmsg_device(d) container_of(d, struct rpmsg_device, dev)
 #define to_rpmsg_driver(d) container_of(d, struct rpmsg_driver, drv)
@@ -70,6 +71,8 @@ struct rpmsg_endpoint_ops {
 	int (*trysendto)(struct rpmsg_endpoint *ept, void *data, int len, u32 dst);
 	int (*trysend_offchannel)(struct rpmsg_endpoint *ept, u32 src, u32 dst,
 			     void *data, int len);
+	unsigned int (*poll)(struct rpmsg_endpoint *ept, struct file *filp,
+			     poll_table *wait);
 };
 
 int rpmsg_register_device(struct rpmsg_device *rpdev);
diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h
index 18f9e1ae4b7e..10d6ae8bbb7d 100644
--- a/include/linux/rpmsg.h
+++ b/include/linux/rpmsg.h
@@ -41,6 +41,7 @@
 #include <linux/mod_devicetable.h>
 #include <linux/kref.h>
 #include <linux/mutex.h>
+#include <linux/poll.h>
 
 #define RPMSG_ADDR_ANY		0xFFFFFFFF
 
@@ -156,6 +157,9 @@ int rpmsg_trysendto(struct rpmsg_endpoint *ept, void *data, int len, u32 dst);
 int rpmsg_trysend_offchannel(struct rpmsg_endpoint *ept, u32 src, u32 dst,
 			     void *data, int len);
 
+unsigned int rpmsg_poll(struct rpmsg_endpoint *ept, struct file *filp,
+			poll_table *wait);
+
 #else
 
 static inline int register_rpmsg_device(struct rpmsg_device *dev)
@@ -254,6 +258,15 @@ static inline int rpmsg_trysend_offchannel(struct rpmsg_endpoint *ept, u32 src,
 	return -ENXIO;
 }
 
+static inline unsigned int rpmsg_poll(struct rpmsg_endpoint *ept,
+				      struct file *filp, poll_table *wait)
+{
+	/* This shouldn't be possible */
+	WARN_ON(1);
+
+	return 0;
+}
+
 #endif /* IS_ENABLED(CONFIG_RPMSG) */
 
 /* use a macro to avoid include chaining to get THIS_MODULE */
-- 
cgit v1.2.3


From cc16f00f6529aa2378f2b949a6f68e9dc6dec363 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 18 Jan 2017 00:44:42 +0800
Subject: sctp: add support for generating stream reconf ssn reset request
 chunk

This patch is to add asoc strreset_outseq and strreset_inseq for
saving the reconf request sequence, initialize them when create
assoc and process init, and also to define Incoming and Outgoing
SSN Reset Request Parameter described in rfc6525 section 4.1 and
4.2, As they can be in one same chunk as section rfc6525 3.1-3
describes, it makes them in one function.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h       |  27 ++++++++++
 include/net/sctp/sm.h      |   5 +-
 include/net/sctp/structs.h |   3 ++
 net/sctp/associola.c       |   1 +
 net/sctp/sm_make_chunk.c   | 121 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 156 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index fcb4c3646173..a9e790685af3 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -108,6 +108,7 @@ typedef enum {
 	/* Use hex, as defined in ADDIP sec. 3.1 */
 	SCTP_CID_ASCONF			= 0xC1,
 	SCTP_CID_ASCONF_ACK		= 0x80,
+	SCTP_CID_RECONF			= 0x82,
 } sctp_cid_t; /* enum */
 
 
@@ -199,6 +200,13 @@ typedef enum {
 	SCTP_PARAM_SUCCESS_REPORT	= cpu_to_be16(0xc005),
 	SCTP_PARAM_ADAPTATION_LAYER_IND = cpu_to_be16(0xc006),
 
+	/* RE-CONFIG. Section 4 */
+	SCTP_PARAM_RESET_OUT_REQUEST		= cpu_to_be16(0x000d),
+	SCTP_PARAM_RESET_IN_REQUEST		= cpu_to_be16(0x000e),
+	SCTP_PARAM_RESET_TSN_REQUEST		= cpu_to_be16(0x000f),
+	SCTP_PARAM_RESET_RESPONSE		= cpu_to_be16(0x0010),
+	SCTP_PARAM_RESET_ADD_OUT_STREAMS	= cpu_to_be16(0x0011),
+	SCTP_PARAM_RESET_ADD_IN_STREAMS		= cpu_to_be16(0x0012),
 } sctp_param_t; /* enum */
 
 
@@ -710,4 +718,23 @@ struct sctp_infox {
 	struct sctp_association *asoc;
 };
 
+struct sctp_reconf_chunk {
+	sctp_chunkhdr_t chunk_hdr;
+	__u8 params[0];
+} __packed;
+
+struct sctp_strreset_outreq {
+	sctp_paramhdr_t param_hdr;
+	__u32 request_seq;
+	__u32 response_seq;
+	__u32 send_reset_at_tsn;
+	__u16 list_of_streams[0];
+} __packed;
+
+struct sctp_strreset_inreq {
+	sctp_paramhdr_t param_hdr;
+	__u32 request_seq;
+	__u16 list_of_streams[0];
+} __packed;
+
 #endif /* __LINUX_SCTP_H__ */
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index ca6c971dd74a..3462cb08f51a 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -259,7 +259,10 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc,
 				    __u32 new_cum_tsn, size_t nstreams,
 				    struct sctp_fwdtsn_skip *skiplist);
 struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc);
-
+struct sctp_chunk *sctp_make_strreset_req(
+				const struct sctp_association *asoc,
+				__u16 stream_num, __u16 *stream_list,
+				bool out, bool in);
 void sctp_chunk_assign_tsn(struct sctp_chunk *);
 void sctp_chunk_assign_ssn(struct sctp_chunk *);
 
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 4741ec240caf..3dc983e97564 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1865,6 +1865,9 @@ struct sctp_association {
 	     temp:1,		/* Is it a temporary association? */
 	     prsctp_enable:1;
 
+	__u32 strreset_outseq; /* Update after receiving response */
+	__u32 strreset_inseq; /* Update after receiving request */
+
 	struct sctp_priv_assoc_stats stats;
 
 	int sent_cnt_removable;
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 36294f7fb9a7..42ece6f35b98 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -207,6 +207,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	 * association to the same value as the initial TSN.
 	 */
 	asoc->addip_serial = asoc->c.initial_tsn;
+	asoc->strreset_outseq = asoc->c.initial_tsn;
 
 	INIT_LIST_HEAD(&asoc->addip_chunk_list);
 	INIT_LIST_HEAD(&asoc->asconf_ack_list);
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 80a9088084ac..df81fce08890 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1844,6 +1844,7 @@ no_hmac:
 	retval->next_tsn = retval->c.initial_tsn;
 	retval->ctsn_ack_point = retval->next_tsn - 1;
 	retval->addip_serial = retval->c.initial_tsn;
+	retval->strreset_outseq = retval->c.initial_tsn;
 	retval->adv_peer_ack_point = retval->ctsn_ack_point;
 	retval->peer.prsctp_capable = retval->c.prsctp_capable;
 	retval->peer.adaptation_ind = retval->c.adaptation_ind;
@@ -2387,6 +2388,8 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
 	asoc->peer.i.initial_tsn =
 		ntohl(peer_init->init_hdr.initial_tsn);
 
+	asoc->strreset_inseq = asoc->peer.i.initial_tsn;
+
 	/* Apply the upper bounds for output streams based on peer's
 	 * number of inbound streams.
 	 */
@@ -3524,3 +3527,121 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc,
 
 	return retval;
 }
+
+/* RE-CONFIG 3.1 (RE-CONFIG chunk)
+ *   0                   1                   2                   3
+ *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  | Type = 130    |  Chunk Flags  |      Chunk Length             |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  \                                                               \
+ *  /                  Re-configuration Parameter                   /
+ *  \                                                               \
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  \                                                               \
+ *  /             Re-configuration Parameter (optional)             /
+ *  \                                                               \
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+static struct sctp_chunk *sctp_make_reconf(
+				const struct sctp_association *asoc,
+				int length)
+{
+	struct sctp_reconf_chunk *reconf;
+	struct sctp_chunk *retval;
+
+	retval = sctp_make_control(asoc, SCTP_CID_RECONF, 0, length,
+				   GFP_ATOMIC);
+	if (!retval)
+		return NULL;
+
+	reconf = (struct sctp_reconf_chunk *)retval->chunk_hdr;
+	retval->param_hdr.v = reconf->params;
+
+	return retval;
+}
+
+/* RE-CONFIG 4.1 (STREAM OUT RESET)
+ *   0                   1                   2                   3
+ *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |     Parameter Type = 13       | Parameter Length = 16 + 2 * N |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |           Re-configuration Request Sequence Number            |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |           Re-configuration Response Sequence Number           |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |                Sender's Last Assigned TSN                     |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |  Stream Number 1 (optional)   |    Stream Number 2 (optional) |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  /                            ......                             /
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |  Stream Number N-1 (optional) |    Stream Number N (optional) |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * RE-CONFIG 4.2 (STREAM IN RESET)
+ *   0                   1                   2                   3
+ *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |     Parameter Type = 14       |  Parameter Length = 8 + 2 * N |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |          Re-configuration Request Sequence Number             |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |  Stream Number 1 (optional)   |    Stream Number 2 (optional) |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  /                            ......                             /
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |  Stream Number N-1 (optional) |    Stream Number N (optional) |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+struct sctp_chunk *sctp_make_strreset_req(
+				const struct sctp_association *asoc,
+				__u16 stream_num, __u16 *stream_list,
+				bool out, bool in)
+{
+	struct sctp_strreset_outreq outreq;
+	__u16 stream_len = stream_num * 2;
+	struct sctp_strreset_inreq inreq;
+	struct sctp_chunk *retval;
+	__u16 outlen, inlen, i;
+
+	outlen = (sizeof(outreq) + stream_len) * out;
+	inlen = (sizeof(inreq) + stream_len) * in;
+
+	retval = sctp_make_reconf(asoc, outlen + inlen);
+	if (!retval)
+		return NULL;
+
+	for (i = 0; i < stream_num; i++)
+		stream_list[i] = htons(stream_list[i]);
+
+	if (outlen) {
+		outreq.param_hdr.type = SCTP_PARAM_RESET_OUT_REQUEST;
+		outreq.param_hdr.length = htons(outlen);
+		outreq.request_seq = htonl(asoc->strreset_outseq);
+		outreq.response_seq = htonl(asoc->strreset_inseq - 1);
+		outreq.send_reset_at_tsn = htonl(asoc->next_tsn - 1);
+
+		sctp_addto_chunk(retval, sizeof(outreq), &outreq);
+
+		if (stream_len)
+			sctp_addto_chunk(retval, stream_len, stream_list);
+	}
+
+	if (inlen) {
+		inreq.param_hdr.type = SCTP_PARAM_RESET_IN_REQUEST;
+		inreq.param_hdr.length = htons(inlen);
+		inreq.request_seq = htonl(asoc->strreset_outseq + out);
+
+		sctp_addto_chunk(retval, sizeof(inreq), &inreq);
+
+		if (stream_len)
+			sctp_addto_chunk(retval, stream_len, stream_list);
+	}
+
+	for (i = 0; i < stream_num; i++)
+		stream_list[i] = ntohs(stream_list[i]);
+
+	return retval;
+}
-- 
cgit v1.2.3


From c61f13eaa1ee17728c41370100d2d45c254ce76f Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 13 Jan 2017 11:14:39 -0800
Subject: gcc-plugins: Add structleak for more stack initialization

This plugin detects any structures that contain __user attributes and
makes sure it is being fully initialized so that a specific class of
information exposure is eliminated. (This plugin was originally designed
to block the exposure of siginfo in CVE-2013-2141.)

Ported from grsecurity/PaX. This version adds a verbose option to the
plugin and the Kconfig.

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 arch/Kconfig                            |  22 +++
 include/linux/compiler.h                |   6 +-
 scripts/Makefile.gcc-plugins            |   4 +
 scripts/gcc-plugins/structleak_plugin.c | 246 ++++++++++++++++++++++++++++++++
 4 files changed, 277 insertions(+), 1 deletion(-)
 create mode 100644 scripts/gcc-plugins/structleak_plugin.c

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index 99839c23d453..646ba0f42c5f 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -410,6 +410,28 @@ config GCC_PLUGIN_LATENT_ENTROPY
 	   * https://grsecurity.net/
 	   * https://pax.grsecurity.net/
 
+config GCC_PLUGIN_STRUCTLEAK
+	bool "Force initialization of variables containing userspace addresses"
+	depends on GCC_PLUGINS
+	help
+	  This plugin zero-initializes any structures that containing a
+	  __user attribute. This can prevent some classes of information
+	  exposures.
+
+	  This plugin was ported from grsecurity/PaX. More information at:
+	   * https://grsecurity.net/
+	   * https://pax.grsecurity.net/
+
+config GCC_PLUGIN_STRUCTLEAK_VERBOSE
+	bool "Report forcefully initialized variables"
+	depends on GCC_PLUGIN_STRUCTLEAK
+	depends on !COMPILE_TEST
+	help
+	  This option will cause a warning to be printed each time the
+	  structleak plugin finds a variable it thinks needs to be
+	  initialized. Since not all existing initializers are detected
+	  by the plugin, this can produce false positive warnings.
+
 config HAVE_CC_STACKPROTECTOR
 	bool
 	help
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index cf0fa5d86059..91c30cba984e 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -27,7 +27,11 @@ extern void __chk_user_ptr(const volatile void __user *);
 extern void __chk_io_ptr(const volatile void __iomem *);
 # define ACCESS_PRIVATE(p, member) (*((typeof((p)->member) __force *) &(p)->member))
 #else /* __CHECKER__ */
-# define __user
+# ifdef STRUCTLEAK_PLUGIN
+#  define __user __attribute__((user))
+# else
+#  define __user
+# endif
 # define __kernel
 # define __safe
 # define __force
diff --git a/scripts/Makefile.gcc-plugins b/scripts/Makefile.gcc-plugins
index 060d2cb373db..a084f7a511d8 100644
--- a/scripts/Makefile.gcc-plugins
+++ b/scripts/Makefile.gcc-plugins
@@ -25,6 +25,10 @@ ifdef CONFIG_GCC_PLUGINS
     endif
   endif
 
+  gcc-plugin-$(CONFIG_GCC_PLUGIN_STRUCTLEAK)	+= structleak_plugin.so
+  gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STRUCTLEAK_VERBOSE)	+= -fplugin-arg-structleak_plugin-verbose
+  gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STRUCTLEAK)	+= -DSTRUCTLEAK_PLUGIN
+
   GCC_PLUGINS_CFLAGS := $(strip $(addprefix -fplugin=$(objtree)/scripts/gcc-plugins/, $(gcc-plugin-y)) $(gcc-plugin-cflags-y))
 
   export PLUGINCC GCC_PLUGINS_CFLAGS GCC_PLUGIN GCC_PLUGIN_SUBDIR
diff --git a/scripts/gcc-plugins/structleak_plugin.c b/scripts/gcc-plugins/structleak_plugin.c
new file mode 100644
index 000000000000..fa3d7a4b26f2
--- /dev/null
+++ b/scripts/gcc-plugins/structleak_plugin.c
@@ -0,0 +1,246 @@
+/*
+ * Copyright 2013-2017 by PaX Team <pageexec@freemail.hu>
+ * Licensed under the GPL v2
+ *
+ * Note: the choice of the license means that the compilation process is
+ *       NOT 'eligible' as defined by gcc's library exception to the GPL v3,
+ *       but for the kernel it doesn't matter since it doesn't link against
+ *       any of the gcc libraries
+ *
+ * gcc plugin to forcibly initialize certain local variables that could
+ * otherwise leak kernel stack to userland if they aren't properly initialized
+ * by later code
+ *
+ * Homepage: http://pax.grsecurity.net/
+ *
+ * Options:
+ * -fplugin-arg-structleak_plugin-disable
+ * -fplugin-arg-structleak_plugin-verbose
+ *
+ * Usage:
+ * $ # for 4.5/4.6/C based 4.7
+ * $ gcc -I`gcc -print-file-name=plugin`/include -I`gcc -print-file-name=plugin`/include/c-family -fPIC -shared -O2 -o structleak_plugin.so structleak_plugin.c
+ * $ # for C++ based 4.7/4.8+
+ * $ g++ -I`g++ -print-file-name=plugin`/include -I`g++ -print-file-name=plugin`/include/c-family -fPIC -shared -O2 -o structleak_plugin.so structleak_plugin.c
+ * $ gcc -fplugin=./structleak_plugin.so test.c -O2
+ *
+ * TODO: eliminate redundant initializers
+ *       increase type coverage
+ */
+
+#include "gcc-common.h"
+
+/* unused C type flag in all versions 4.5-6 */
+#define TYPE_USERSPACE(TYPE) TYPE_LANG_FLAG_5(TYPE)
+
+__visible int plugin_is_GPL_compatible;
+
+static struct plugin_info structleak_plugin_info = {
+	.version	= "201607271510vanilla",
+	.help		= "disable\tdo not activate plugin\n"
+			   "verbose\tprint all initialized variables\n",
+};
+
+static bool verbose;
+
+static tree handle_user_attribute(tree *node, tree name, tree args, int flags, bool *no_add_attrs)
+{
+	*no_add_attrs = true;
+
+	/* check for types? for now accept everything linux has to offer */
+	if (TREE_CODE(*node) != FIELD_DECL)
+		return NULL_TREE;
+
+	*no_add_attrs = false;
+	return NULL_TREE;
+}
+
+static struct attribute_spec user_attr = {
+	.name			= "user",
+	.min_length		= 0,
+	.max_length		= 0,
+	.decl_required		= false,
+	.type_required		= false,
+	.function_type_required	= false,
+	.handler		= handle_user_attribute,
+#if BUILDING_GCC_VERSION >= 4007
+	.affects_type_identity	= true
+#endif
+};
+
+static void register_attributes(void *event_data, void *data)
+{
+	register_attribute(&user_attr);
+}
+
+static tree get_field_type(tree field)
+{
+	return strip_array_types(TREE_TYPE(field));
+}
+
+static bool is_userspace_type(tree type)
+{
+	tree field;
+
+	for (field = TYPE_FIELDS(type); field; field = TREE_CHAIN(field)) {
+		tree fieldtype = get_field_type(field);
+		enum tree_code code = TREE_CODE(fieldtype);
+
+		if (code == RECORD_TYPE || code == UNION_TYPE)
+			if (is_userspace_type(fieldtype))
+				return true;
+
+		if (lookup_attribute("user", DECL_ATTRIBUTES(field)))
+			return true;
+	}
+	return false;
+}
+
+static void finish_type(void *event_data, void *data)
+{
+	tree type = (tree)event_data;
+
+	if (type == NULL_TREE || type == error_mark_node)
+		return;
+
+#if BUILDING_GCC_VERSION >= 5000
+	if (TREE_CODE(type) == ENUMERAL_TYPE)
+		return;
+#endif
+
+	if (TYPE_USERSPACE(type))
+		return;
+
+	if (is_userspace_type(type))
+		TYPE_USERSPACE(type) = 1;
+}
+
+static void initialize(tree var)
+{
+	basic_block bb;
+	gimple_stmt_iterator gsi;
+	tree initializer;
+	gimple init_stmt;
+
+	/* this is the original entry bb before the forced split */
+	bb = single_succ(ENTRY_BLOCK_PTR_FOR_FN(cfun));
+
+	/* first check if variable is already initialized, warn otherwise */
+	for (gsi = gsi_start_bb(bb); !gsi_end_p(gsi); gsi_next(&gsi)) {
+		gimple stmt = gsi_stmt(gsi);
+		tree rhs1;
+
+		/* we're looking for an assignment of a single rhs... */
+		if (!gimple_assign_single_p(stmt))
+			continue;
+		rhs1 = gimple_assign_rhs1(stmt);
+#if BUILDING_GCC_VERSION >= 4007
+		/* ... of a non-clobbering expression... */
+		if (TREE_CLOBBER_P(rhs1))
+			continue;
+#endif
+		/* ... to our variable... */
+		if (gimple_get_lhs(stmt) != var)
+			continue;
+		/* if it's an initializer then we're good */
+		if (TREE_CODE(rhs1) == CONSTRUCTOR)
+			return;
+	}
+
+	/* these aren't the 0days you're looking for */
+	if (verbose)
+		inform(DECL_SOURCE_LOCATION(var),
+			"userspace variable will be forcibly initialized");
+
+	/* build the initializer expression */
+	initializer = build_constructor(TREE_TYPE(var), NULL);
+
+	/* build the initializer stmt */
+	init_stmt = gimple_build_assign(var, initializer);
+	gsi = gsi_after_labels(single_succ(ENTRY_BLOCK_PTR_FOR_FN(cfun)));
+	gsi_insert_before(&gsi, init_stmt, GSI_NEW_STMT);
+	update_stmt(init_stmt);
+}
+
+static unsigned int structleak_execute(void)
+{
+	basic_block bb;
+	unsigned int ret = 0;
+	tree var;
+	unsigned int i;
+
+	/* split the first bb where we can put the forced initializers */
+	gcc_assert(single_succ_p(ENTRY_BLOCK_PTR_FOR_FN(cfun)));
+	bb = single_succ(ENTRY_BLOCK_PTR_FOR_FN(cfun));
+	if (!single_pred_p(bb)) {
+		split_edge(single_succ_edge(ENTRY_BLOCK_PTR_FOR_FN(cfun)));
+		gcc_assert(single_succ_p(ENTRY_BLOCK_PTR_FOR_FN(cfun)));
+	}
+
+	/* enumerate all local variables and forcibly initialize our targets */
+	FOR_EACH_LOCAL_DECL(cfun, i, var) {
+		tree type = TREE_TYPE(var);
+
+		gcc_assert(DECL_P(var));
+		if (!auto_var_in_fn_p(var, current_function_decl))
+			continue;
+
+		/* only care about structure types */
+		if (TREE_CODE(type) != RECORD_TYPE && TREE_CODE(type) != UNION_TYPE)
+			continue;
+
+		/* if the type is of interest, examine the variable */
+		if (TYPE_USERSPACE(type))
+			initialize(var);
+	}
+
+	return ret;
+}
+
+#define PASS_NAME structleak
+#define NO_GATE
+#define PROPERTIES_REQUIRED PROP_cfg
+#define TODO_FLAGS_FINISH TODO_verify_il | TODO_verify_ssa | TODO_verify_stmts | TODO_dump_func | TODO_remove_unused_locals | TODO_update_ssa | TODO_ggc_collect | TODO_verify_flow
+#include "gcc-generate-gimple-pass.h"
+
+__visible int plugin_init(struct plugin_name_args *plugin_info, struct plugin_gcc_version *version)
+{
+	int i;
+	const char * const plugin_name = plugin_info->base_name;
+	const int argc = plugin_info->argc;
+	const struct plugin_argument * const argv = plugin_info->argv;
+	bool enable = true;
+
+	PASS_INFO(structleak, "early_optimizations", 1, PASS_POS_INSERT_BEFORE);
+
+	if (!plugin_default_version_check(version, &gcc_version)) {
+		error(G_("incompatible gcc/plugin versions"));
+		return 1;
+	}
+
+	if (strncmp(lang_hooks.name, "GNU C", 5) && !strncmp(lang_hooks.name, "GNU C+", 6)) {
+		inform(UNKNOWN_LOCATION, G_("%s supports C only, not %s"), plugin_name, lang_hooks.name);
+		enable = false;
+	}
+
+	for (i = 0; i < argc; ++i) {
+		if (!strcmp(argv[i].key, "disable")) {
+			enable = false;
+			continue;
+		}
+		if (!strcmp(argv[i].key, "verbose")) {
+			verbose = true;
+			continue;
+		}
+		error(G_("unknown option '-fplugin-arg-%s-%s'"), plugin_name, argv[i].key);
+	}
+
+	register_callback(plugin_name, PLUGIN_INFO, NULL, &structleak_plugin_info);
+	if (enable) {
+		register_callback(plugin_name, PLUGIN_PASS_MANAGER_SETUP, NULL, &structleak_pass_info);
+		register_callback(plugin_name, PLUGIN_FINISH_TYPE, finish_type, NULL);
+	}
+	register_callback(plugin_name, PLUGIN_ATTRIBUTES, register_attributes, NULL);
+
+	return 0;
+}
-- 
cgit v1.2.3


From 62bc306e2083436675e33b5bdeb6a77907d35971 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Tue, 17 Jan 2017 11:07:15 -0500
Subject: audit: log 32-bit socketcalls

32-bit socketcalls were not being logged by audit on x86_64 systems.
Log them.  This is basically a duplicate of the call from
net/socket.c:sys_socketcall(), but it addresses the impedance mismatch
between 32-bit userspace process and 64-bit kernel audit.

See: https://github.com/linux-audit/audit-kernel/issues/14

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/audit.h | 20 ++++++++++++++++++++
 net/compat.c          | 17 ++++++++++++++---
 2 files changed, 34 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 9d4443f93db6..2be99b276d29 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -387,6 +387,20 @@ static inline int audit_socketcall(int nargs, unsigned long *args)
 		return __audit_socketcall(nargs, args);
 	return 0;
 }
+
+static inline int audit_socketcall_compat(int nargs, u32 *args)
+{
+	unsigned long a[AUDITSC_ARGS];
+	int i;
+
+	if (audit_dummy_context())
+		return 0;
+
+	for (i = 0; i < nargs; i++)
+		a[i] = (unsigned long)args[i];
+	return __audit_socketcall(nargs, a);
+}
+
 static inline int audit_sockaddr(int len, void *addr)
 {
 	if (unlikely(!audit_dummy_context()))
@@ -513,6 +527,12 @@ static inline int audit_socketcall(int nargs, unsigned long *args)
 {
 	return 0;
 }
+
+static inline int audit_socketcall_compat(int nargs, u32 *args)
+{
+	return 0;
+}
+
 static inline void audit_fd_pair(int fd1, int fd2)
 { }
 static inline int audit_sockaddr(int len, void *addr)
diff --git a/net/compat.c b/net/compat.c
index 1cd2ec046164..a96fd2f3507b 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -22,6 +22,7 @@
 #include <linux/filter.h>
 #include <linux/compat.h>
 #include <linux/security.h>
+#include <linux/audit.h>
 #include <linux/export.h>
 
 #include <net/scm.h>
@@ -781,14 +782,24 @@ COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg,
 
 COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args)
 {
-	int ret;
-	u32 a[6];
+	u32 a[AUDITSC_ARGS];
+	unsigned int len;
 	u32 a0, a1;
+	int ret;
 
 	if (call < SYS_SOCKET || call > SYS_SENDMMSG)
 		return -EINVAL;
-	if (copy_from_user(a, args, nas[call]))
+	len = nas[call];
+	if (len > sizeof(a))
+		return -EINVAL;
+
+	if (copy_from_user(a, args, len))
 		return -EFAULT;
+
+	ret = audit_socketcall_compat(len / sizeof(a[0]), a);
+	if (ret)
+		return ret;
+
 	a0 = a[0];
 	a1 = a[1];
 
-- 
cgit v1.2.3


From fd61c6ba313de6758aeeab58fe03bd9fbbc8cea9 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Tue, 17 Jan 2017 15:51:07 -0800
Subject: net: ipv6: remove nowait arg to rt6_fill_node

All callers of rt6_fill_node pass 0 for nowait arg. Remove the arg and
simplify rt6_fill_node accordingly.

rt6_fill_node passes the nowait of 0 to ip6mr_get_route. Remove the
nowait arg from it as well.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mroute6.h |  2 +-
 net/ipv6/ip6mr.c        |  9 ++-------
 net/ipv6/route.c        | 27 ++++++++++-----------------
 3 files changed, 13 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h
index 19a1c0c2993b..ce44e3e96d27 100644
--- a/include/linux/mroute6.h
+++ b/include/linux/mroute6.h
@@ -116,7 +116,7 @@ struct mfc6_cache {
 
 struct rtmsg;
 extern int ip6mr_get_route(struct net *net, struct sk_buff *skb,
-			   struct rtmsg *rtm, int nowait, u32 portid);
+			   struct rtmsg *rtm, u32 portid);
 
 #ifdef CONFIG_IPV6_MROUTE
 extern struct sock *mroute6_socket(struct net *net, struct sk_buff *skb);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index e275077e8af2..babaf3ec2742 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2288,7 +2288,7 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
 }
 
 int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
-		    int nowait, u32 portid)
+		    u32 portid)
 {
 	int err;
 	struct mr6_table *mrt;
@@ -2315,11 +2315,6 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
 		struct net_device *dev;
 		int vif;
 
-		if (nowait) {
-			read_unlock(&mrt_lock);
-			return -EAGAIN;
-		}
-
 		dev = skb->dev;
 		if (!dev || (vif = ip6mr_find_vif(mrt, dev)) < 0) {
 			read_unlock(&mrt_lock);
@@ -2357,7 +2352,7 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
 		return err;
 	}
 
-	if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
+	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		cache->mfc_flags |= MFC_NOTIFY;
 
 	err = __ip6mr_fill_mroute(mrt, skb, cache, rtm);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 4f6b067c8753..b2044dd71724 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3169,7 +3169,7 @@ static int rt6_fill_node(struct net *net,
 			 struct sk_buff *skb, struct rt6_info *rt,
 			 struct in6_addr *dst, struct in6_addr *src,
 			 int iif, int type, u32 portid, u32 seq,
-			 int prefix, int nowait, unsigned int flags)
+			 int prefix, unsigned int flags)
 {
 	u32 metrics[RTAX_MAX];
 	struct rtmsg *rtm;
@@ -3261,19 +3261,12 @@ static int rt6_fill_node(struct net *net,
 	if (iif) {
 #ifdef CONFIG_IPV6_MROUTE
 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
-			int err = ip6mr_get_route(net, skb, rtm, nowait,
-						  portid);
-
-			if (err <= 0) {
-				if (!nowait) {
-					if (err == 0)
-						return 0;
-					goto nla_put_failure;
-				} else {
-					if (err == -EMSGSIZE)
-						goto nla_put_failure;
-				}
-			}
+			int err = ip6mr_get_route(net, skb, rtm, portid);
+
+			if (err == 0)
+				return 0;
+			if (err < 0)
+				goto nla_put_failure;
 		} else
 #endif
 			if (nla_put_u32(skb, RTA_IIF, iif))
@@ -3342,7 +3335,7 @@ int rt6_dump_route(struct rt6_info *rt, void *p_arg)
 	return rt6_fill_node(arg->net,
 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
 		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
-		     prefix, 0, NLM_F_MULTI);
+		     prefix, NLM_F_MULTI);
 }
 
 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
@@ -3433,7 +3426,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
 
 	err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
 			    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
-			    nlh->nlmsg_seq, 0, 0, 0);
+			    nlh->nlmsg_seq, 0, 0);
 	if (err < 0) {
 		kfree_skb(skb);
 		goto errout;
@@ -3460,7 +3453,7 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
 		goto errout;
 
 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
-				event, info->portid, seq, 0, 0, nlm_flags);
+				event, info->portid, seq, 0, nlm_flags);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
 		WARN_ON(err == -EMSGSIZE);
-- 
cgit v1.2.3


From 4a7c972644c1151f6dd34ff4b5f7eacb239e22ee Mon Sep 17 00:00:00 2001
From: Tobias Klauser <tklauser@distanz.ch>
Date: Wed, 18 Jan 2017 17:45:01 +0100
Subject: net: Remove usage of net_device last_rx member

The network stack no longer uses the last_rx member of struct net_device
since the bonding driver switched to use its own private last_rx in
commit 9f242738376d ("bonding: use last_arp_rx in slave_last_rx()").

However, some drivers still (ab)use the field for their own purposes and
some driver just update it without actually using it.

Previously, there was an accompanying comment for the last_rx member
added in commit 4dc89133f49b ("net: add a comment on netdev->last_rx")
which asked drivers not to update is, unless really needed. However,
this commend was removed in commit f8ff080dacec ("bonding: remove
useless updating of slave->dev->last_rx"), so some drivers added later
on still did update last_rx.

Remove all usage of last_rx and switch three drivers (sky2, atp and
smc91c92_cs) which actually read and write it to use their own private
copy in netdev_priv.

Compile-tested with allyesconfig and allmodconfig on x86 and arm.

Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Jay Vosburgh <j.vosburgh@gmail.com>
Cc: Veaceslav Falico <vfalico@gmail.com>
Cc: Andy Gospodarek <andy@greyhouse.net>
Cc: Mirko Lindner <mlindner@marvell.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
Acked-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jay Vosburgh <jay.vosburgh@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/m68k/emu/nfeth.c                              | 1 -
 drivers/net/ethernet/cavium/liquidio/lio_main.c    | 1 -
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 1 -
 drivers/net/ethernet/hisilicon/hns/hns_enet.c      | 1 -
 drivers/net/ethernet/intel/e1000e/netdev.c         | 6 +++---
 drivers/net/ethernet/intel/igb/igb_main.c          | 6 +++---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c      | 7 +++----
 drivers/net/ethernet/marvell/sky2.c                | 6 +++---
 drivers/net/ethernet/marvell/sky2.h                | 1 +
 drivers/net/ethernet/qualcomm/emac/emac-mac.c      | 1 -
 drivers/net/ethernet/realtek/atp.c                 | 7 +++----
 drivers/net/ethernet/smsc/smc91c92_cs.c            | 6 ++++--
 drivers/net/irda/bfin_sir.c                        | 5 ++---
 drivers/net/irda/sh_sir.c                          | 1 -
 drivers/staging/ks7010/ks_hostif.c                 | 2 --
 drivers/staging/netlogic/xlr_net.c                 | 1 -
 drivers/staging/rtl8192e/rtllib_rx.c               | 1 -
 drivers/staging/rtl8192u/ieee80211/ieee80211_rx.c  | 4 ----
 drivers/staging/wlan-ng/hfa384x_usb.c              | 1 -
 drivers/staging/wlan-ng/p80211netdev.c             | 2 --
 include/linux/netdevice.h                          | 3 ---
 net/batman-adv/bridge_loop_avoidance.c             | 1 -
 net/batman-adv/distributed-arp-table.c             | 1 -
 net/batman-adv/soft-interface.c                    | 2 --
 24 files changed, 22 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/arch/m68k/emu/nfeth.c b/arch/m68k/emu/nfeth.c
index fc4be028c418..e45ce4243aaa 100644
--- a/arch/m68k/emu/nfeth.c
+++ b/arch/m68k/emu/nfeth.c
@@ -124,7 +124,6 @@ static inline void recv_packet(struct net_device *dev)
 
 	skb->protocol = eth_type_trans(skb, dev);
 	netif_rx(skb);
-	dev->last_rx = jiffies;
 	dev->stats.rx_packets++;
 	dev->stats.rx_bytes += pktlen;
 
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index 2b89ec291b8b..5ee3f007c613 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -2360,7 +2360,6 @@ liquidio_push_packet(u32 octeon_id __attribute__((unused)),
 		if (packet_was_received) {
 			droq->stats.rx_bytes_received += len;
 			droq->stats.rx_pkts_received++;
-			netdev->last_rx = jiffies;
 		} else {
 			droq->stats.rx_dropped++;
 			netif_info(lio, rx_err, lio->netdev,
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index 19d88fb387ce..e96cf6cdecfd 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -1571,7 +1571,6 @@ liquidio_push_packet(u32 octeon_id __attribute__((unused)),
 		if (packet_was_received) {
 			droq->stats.rx_bytes_received += len;
 			droq->stats.rx_pkts_received++;
-			netdev->last_rx = jiffies;
 		} else {
 			droq->stats.rx_dropped++;
 			netif_info(lio, rx_err, lio->netdev,
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
index b7cb61385ad8..f7b75e96c1c3 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
@@ -797,7 +797,6 @@ static void hns_nic_rx_up_pro(struct hns_nic_ring_data *ring_data,
 
 	skb->protocol = eth_type_trans(skb, ndev);
 	(void)napi_gro_receive(&ring_data->napi, skb);
-	ndev->last_rx = jiffies;
 }
 
 static int hns_desc_unused(struct hnae_ring *ring)
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index 79651eb608ff..2175cced402f 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -240,9 +240,9 @@ static void e1000e_dump(struct e1000_adapter *adapter)
 	/* Print netdevice Info */
 	if (netdev) {
 		dev_info(&adapter->pdev->dev, "Net device Info\n");
-		pr_info("Device Name     state            trans_start      last_rx\n");
-		pr_info("%-15s %016lX %016lX %016lX\n", netdev->name,
-			netdev->state, dev_trans_start(netdev), netdev->last_rx);
+		pr_info("Device Name     state            trans_start\n");
+		pr_info("%-15s %016lX %016lX\n", netdev->name,
+			netdev->state, dev_trans_start(netdev));
 	}
 
 	/* Print Registers */
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 7fc95493b692..be456bae8169 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -383,9 +383,9 @@ static void igb_dump(struct igb_adapter *adapter)
 	/* Print netdevice Info */
 	if (netdev) {
 		dev_info(&adapter->pdev->dev, "Net device Info\n");
-		pr_info("Device Name     state            trans_start      last_rx\n");
-		pr_info("%-15s %016lX %016lX %016lX\n", netdev->name,
-			netdev->state, dev_trans_start(netdev), netdev->last_rx);
+		pr_info("Device Name     state            trans_start\n");
+		pr_info("%-15s %016lX %016lX\n", netdev->name,
+			netdev->state, dev_trans_start(netdev));
 	}
 
 	/* Print Registers */
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index ffe7d940d9ff..3b3b52b62a5f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -611,12 +611,11 @@ static void ixgbe_dump(struct ixgbe_adapter *adapter)
 	if (netdev) {
 		dev_info(&adapter->pdev->dev, "Net device Info\n");
 		pr_info("Device Name     state            "
-			"trans_start      last_rx\n");
-		pr_info("%-15s %016lX %016lX %016lX\n",
+			"trans_start\n");
+		pr_info("%-15s %016lX %016lX\n",
 			netdev->name,
 			netdev->state,
-			dev_trans_start(netdev),
-			netdev->last_rx);
+			dev_trans_start(netdev));
 	}
 
 	/* Print Registers */
diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c
index be003c5a4f5f..2b2cc3f3ca10 100644
--- a/drivers/net/ethernet/marvell/sky2.c
+++ b/drivers/net/ethernet/marvell/sky2.c
@@ -2666,7 +2666,7 @@ static inline void sky2_rx_done(struct sky2_hw *hw, unsigned port,
 	sky2->rx_stats.bytes += bytes;
 	u64_stats_update_end(&sky2->rx_stats.syncp);
 
-	dev->last_rx = jiffies;
+	sky2->last_rx = jiffies;
 	sky2_rx_update(netdev_priv(dev), rxqaddr[port]);
 }
 
@@ -2953,7 +2953,7 @@ static int sky2_rx_hung(struct net_device *dev)
 	u8 fifo_lev = sky2_read8(hw, Q_ADDR(rxq, Q_RL));
 
 	/* If idle and MAC or PCI is stuck */
-	if (sky2->check.last == dev->last_rx &&
+	if (sky2->check.last == sky2->last_rx &&
 	    ((mac_rp == sky2->check.mac_rp &&
 	      mac_lev != 0 && mac_lev >= sky2->check.mac_lev) ||
 	     /* Check if the PCI RX hang */
@@ -2965,7 +2965,7 @@ static int sky2_rx_hung(struct net_device *dev)
 			      fifo_rp, sky2_read8(hw, Q_ADDR(rxq, Q_WP)));
 		return 1;
 	} else {
-		sky2->check.last = dev->last_rx;
+		sky2->check.last = sky2->last_rx;
 		sky2->check.mac_rp = mac_rp;
 		sky2->check.mac_lev = mac_lev;
 		sky2->check.fifo_rp = fifo_rp;
diff --git a/drivers/net/ethernet/marvell/sky2.h b/drivers/net/ethernet/marvell/sky2.h
index ec6dcd80152b..0fe160796842 100644
--- a/drivers/net/ethernet/marvell/sky2.h
+++ b/drivers/net/ethernet/marvell/sky2.h
@@ -2247,6 +2247,7 @@ struct sky2_port {
 	u16		     rx_data_size;
 	u16		     rx_nfrags;
 
+	unsigned long	     last_rx;
 	struct {
 		unsigned long last;
 		u32	mac_rp;
diff --git a/drivers/net/ethernet/qualcomm/emac/emac-mac.c b/drivers/net/ethernet/qualcomm/emac/emac-mac.c
index 0b4deb31e742..d297ed961da6 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac-mac.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac-mac.c
@@ -1213,7 +1213,6 @@ void emac_mac_rx_process(struct emac_adapter *adpt, struct emac_rx_queue *rx_q,
 		emac_receive_skb(rx_q, skb, (u16)RRD_CVALN_TAG(&rrd),
 				 (bool)RRD_CVTAG(&rrd));
 
-		netdev->last_rx = jiffies;
 		(*num_pkts)++;
 	} while (*num_pkts < max_pkts);
 
diff --git a/drivers/net/ethernet/realtek/atp.c b/drivers/net/ethernet/realtek/atp.c
index 570ed3bd3cbf..9bcd4aefc9c5 100644
--- a/drivers/net/ethernet/realtek/atp.c
+++ b/drivers/net/ethernet/realtek/atp.c
@@ -170,7 +170,7 @@ struct net_local {
     spinlock_t lock;
     struct net_device *next_module;
     struct timer_list timer;	/* Media selection timer. */
-    long last_rx_time;		/* Last Rx, in jiffies, to handle Rx hang. */
+    unsigned long last_rx_time;	/* Last Rx, in jiffies, to handle Rx hang. */
     int saved_tx_size;
     unsigned int tx_unit_busy:1;
     unsigned char re_tx,	/* Number of packet retransmissions. */
@@ -668,11 +668,11 @@ static irqreturn_t atp_interrupt(int irq, void *dev_instance)
 			}
 			num_tx_since_rx++;
 		} else if (num_tx_since_rx > 8 &&
-			   time_after(jiffies, dev->last_rx + HZ)) {
+			   time_after(jiffies, lp->last_rx_time + HZ)) {
 			if (net_debug > 2)
 				printk(KERN_DEBUG "%s: Missed packet? No Rx after %d Tx and "
 					   "%ld jiffies status %02x  CMR1 %02x.\n", dev->name,
-					   num_tx_since_rx, jiffies - dev->last_rx, status,
+					   num_tx_since_rx, jiffies - lp->last_rx_time, status,
 					   (read_nibble(ioaddr, CMR1) >> 3) & 15);
 			dev->stats.rx_missed_errors++;
 			hardware_init(dev);
@@ -789,7 +789,6 @@ static void net_rx(struct net_device *dev)
 		read_block(ioaddr, pkt_len, skb_put(skb,pkt_len), dev->if_port);
 		skb->protocol = eth_type_trans(skb, dev);
 		netif_rx(skb);
-		dev->last_rx = jiffies;
 		dev->stats.rx_packets++;
 		dev->stats.rx_bytes += pkt_len;
 	}
diff --git a/drivers/net/ethernet/smsc/smc91c92_cs.c b/drivers/net/ethernet/smsc/smc91c92_cs.c
index 67154621abcf..97280daba27f 100644
--- a/drivers/net/ethernet/smsc/smc91c92_cs.c
+++ b/drivers/net/ethernet/smsc/smc91c92_cs.c
@@ -113,6 +113,7 @@ struct smc_private {
     struct mii_if_info		mii_if;
     int				duplex;
     int				rx_ovrn;
+    unsigned long		last_rx;
 };
 
 /* Special definitions for Megahertz multifunction cards */
@@ -1491,6 +1492,7 @@ static void smc_rx(struct net_device *dev)
     if (!(rx_status & RS_ERRORS)) {
 	/* do stuff to make a new packet */
 	struct sk_buff *skb;
+	struct smc_private *smc = netdev_priv(dev);
 	
 	/* Note: packet_length adds 5 or 6 extra bytes here! */
 	skb = netdev_alloc_skb(dev, packet_length+2);
@@ -1509,7 +1511,7 @@ static void smc_rx(struct net_device *dev)
 	skb->protocol = eth_type_trans(skb, dev);
 	
 	netif_rx(skb);
-	dev->last_rx = jiffies;
+	smc->last_rx = jiffies;
 	dev->stats.rx_packets++;
 	dev->stats.rx_bytes += packet_length;
 	if (rx_status & RS_MULTICAST)
@@ -1790,7 +1792,7 @@ static void media_check(u_long arg)
     }
 
     /* Ignore collisions unless we've had no rx's recently */
-    if (time_after(jiffies, dev->last_rx + HZ)) {
+    if (time_after(jiffies, smc->last_rx + HZ)) {
 	if (smc->tx_err || (smc->media_status & EPH_16COL))
 	    media |= EPH_16COL;
     }
diff --git a/drivers/net/irda/bfin_sir.c b/drivers/net/irda/bfin_sir.c
index be5bb0b7f29c..3151b580dbd6 100644
--- a/drivers/net/irda/bfin_sir.c
+++ b/drivers/net/irda/bfin_sir.c
@@ -22,7 +22,7 @@ static int max_rate = 57600;
 static int max_rate = 115200;
 #endif
 
-static void turnaround_delay(unsigned long last_jif, int mtt)
+static void turnaround_delay(int mtt)
 {
 	long ticks;
 
@@ -209,7 +209,6 @@ static void bfin_sir_rx_chars(struct net_device *dev)
 	UART_CLEAR_LSR(port);
 	ch = UART_GET_CHAR(port);
 	async_unwrap_char(dev, &self->stats, &self->rx_buff, ch);
-	dev->last_rx = jiffies;
 }
 
 static irqreturn_t bfin_sir_rx_int(int irq, void *dev_id)
@@ -510,7 +509,7 @@ static void bfin_sir_send_work(struct work_struct *work)
 	int tx_cnt = 10;
 
 	while (bfin_sir_is_receiving(dev) && --tx_cnt)
-		turnaround_delay(dev->last_rx, self->mtt);
+		turnaround_delay(self->mtt);
 
 	bfin_sir_stop_rx(port);
 
diff --git a/drivers/net/irda/sh_sir.c b/drivers/net/irda/sh_sir.c
index e3fe9a286136..fede6864c737 100644
--- a/drivers/net/irda/sh_sir.c
+++ b/drivers/net/irda/sh_sir.c
@@ -547,7 +547,6 @@ static void sh_sir_rx(struct sh_sir_self *self)
 
 		async_unwrap_char(self->ndev, &self->ndev->stats,
 				  &self->rx_buff, (u8)data);
-		self->ndev->last_rx = jiffies;
 
 		if (EOFD & sh_sir_read(self, IRIF_SIR_FRM))
 			continue;
diff --git a/drivers/staging/ks7010/ks_hostif.c b/drivers/staging/ks7010/ks_hostif.c
index 1fbd495e5e63..c7652c35be19 100644
--- a/drivers/staging/ks7010/ks_hostif.c
+++ b/drivers/staging/ks7010/ks_hostif.c
@@ -461,7 +461,6 @@ void hostif_data_indication(struct ks_wlan_private *priv)
 			skb->protocol = eth_type_trans(skb, skb->dev);
 			priv->nstats.rx_packets++;
 			priv->nstats.rx_bytes += rx_ind_size;
-			skb->dev->last_rx = jiffies;
 			netif_rx(skb);
 		} else {
 			priv->nstats.rx_dropped++;
@@ -494,7 +493,6 @@ void hostif_data_indication(struct ks_wlan_private *priv)
 			skb->protocol = eth_type_trans(skb, skb->dev);
 			priv->nstats.rx_packets++;
 			priv->nstats.rx_bytes += rx_ind_size;
-			skb->dev->last_rx = jiffies;
 			netif_rx(skb);
 		} else {
 			priv->nstats.rx_dropped++;
diff --git a/drivers/staging/netlogic/xlr_net.c b/drivers/staging/netlogic/xlr_net.c
index f84069ffa8c6..781ef623233e 100644
--- a/drivers/staging/netlogic/xlr_net.c
+++ b/drivers/staging/netlogic/xlr_net.c
@@ -155,7 +155,6 @@ static void xlr_net_fmn_handler(int bkt, int src_stnid, int size, int code,
 		skb_reserve(skb, BYTE_OFFSET);
 		skb_put(skb, length);
 		skb->protocol = eth_type_trans(skb, skb->dev);
-		skb->dev->last_rx = jiffies;
 		netif_rx(skb);
 		/* Fill rx ring */
 		skb_data = xlr_alloc_skb();
diff --git a/drivers/staging/rtl8192e/rtllib_rx.c b/drivers/staging/rtl8192e/rtllib_rx.c
index e5ba7d1a809f..43a77745e6fb 100644
--- a/drivers/staging/rtl8192e/rtllib_rx.c
+++ b/drivers/staging/rtl8192e/rtllib_rx.c
@@ -1375,7 +1375,6 @@ static int rtllib_rx_InfraAdhoc(struct rtllib_device *ieee, struct sk_buff *skb,
 		ieee->LinkDetectInfo.NumRecvDataInPeriod++;
 		ieee->LinkDetectInfo.NumRxOkInPeriod++;
 	}
-	dev->last_rx = jiffies;
 
 	/* Data frame - extract src/dst addresses */
 	rtllib_rx_extract_addr(ieee, hdr, dst, src, bssid);
diff --git a/drivers/staging/rtl8192u/ieee80211/ieee80211_rx.c b/drivers/staging/rtl8192u/ieee80211/ieee80211_rx.c
index 82f654305414..b1f2fdfcb718 100644
--- a/drivers/staging/rtl8192u/ieee80211/ieee80211_rx.c
+++ b/drivers/staging/rtl8192u/ieee80211/ieee80211_rx.c
@@ -1103,11 +1103,7 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb,
 		stats = hostap_get_stats(dev);
 		from_assoc_ap = 1;
 	}
-#endif
-
-	dev->last_rx = jiffies;
 
-#ifdef NOT_YET
 	if ((ieee->iw_mode == IW_MODE_MASTER ||
 	     ieee->iw_mode == IW_MODE_REPEAT) &&
 	    !from_assoc_ap) {
diff --git a/drivers/staging/wlan-ng/hfa384x_usb.c b/drivers/staging/wlan-ng/hfa384x_usb.c
index 4fe037aeef12..6134eba5cad4 100644
--- a/drivers/staging/wlan-ng/hfa384x_usb.c
+++ b/drivers/staging/wlan-ng/hfa384x_usb.c
@@ -3409,7 +3409,6 @@ static void hfa384x_usbin_rx(struct wlandevice *wlandev, struct sk_buff *skb)
 			&usbin->rxfrm.desc.frame_control, hdrlen);
 
 		skb->dev = wlandev->netdev;
-		skb->dev->last_rx = jiffies;
 
 		/* And set the frame length properly */
 		skb_trim(skb, data_len + hdrlen);
diff --git a/drivers/staging/wlan-ng/p80211netdev.c b/drivers/staging/wlan-ng/p80211netdev.c
index 73fcf07254fe..53dbbd69e552 100644
--- a/drivers/staging/wlan-ng/p80211netdev.c
+++ b/drivers/staging/wlan-ng/p80211netdev.c
@@ -252,7 +252,6 @@ static int p80211_convert_to_ether(struct wlandevice *wlandev,
 	}
 
 	if (skb_p80211_to_ether(wlandev, wlandev->ethconv, skb) == 0) {
-		skb->dev->last_rx = jiffies;
 		wlandev->netdev->stats.rx_packets++;
 		wlandev->netdev->stats.rx_bytes += skb->len;
 		netif_rx_ni(skb);
@@ -287,7 +286,6 @@ static void p80211netdev_rx_bh(unsigned long arg)
 				skb->ip_summed = CHECKSUM_NONE;
 				skb->pkt_type = PACKET_OTHERHOST;
 				skb->protocol = htons(ETH_P_80211_RAW);
-				dev->last_rx = jiffies;
 
 				dev->stats.rx_packets++;
 				dev->stats.rx_bytes += skb->len;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 97ae0ac513ee..3868c32d98af 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1551,7 +1551,6 @@ enum netdev_priv_flags {
  *	@ax25_ptr:	AX.25 specific data
  *	@ieee80211_ptr:	IEEE 802.11 specific data, assign before registering
  *
- *	@last_rx:	Time of last Rx
  *	@dev_addr:	Hw address (before bcast,
  *			because most packets are unicast)
  *
@@ -1777,8 +1776,6 @@ struct net_device {
 /*
  * Cache lines mostly used on receive path (including eth_type_trans())
  */
-	unsigned long		last_rx;
-
 	/* Interface address info used in eth_type_trans() */
 	unsigned char		*dev_addr;
 
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index e7f690b571ea..36917a7b1b59 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -449,7 +449,6 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac,
 	batadv_inc_counter(bat_priv, BATADV_CNT_RX);
 	batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES,
 			   skb->len + ETH_HLEN);
-	soft_iface->last_rx = jiffies;
 
 	netif_rx(skb);
 out:
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 49576c5a3fe3..6394206bfcae 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -1050,7 +1050,6 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
 						   bat_priv->soft_iface);
 		bat_priv->stats.rx_packets++;
 		bat_priv->stats.rx_bytes += skb->len + ETH_HLEN + hdr_size;
-		bat_priv->soft_iface->last_rx = jiffies;
 
 		netif_rx(skb_new);
 		batadv_dbg(BATADV_DBG_DAT, bat_priv, "ARP request replied locally\n");
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 7b3494ae6ad9..420e19b501f2 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -481,8 +481,6 @@ void batadv_interface_rx(struct net_device *soft_iface,
 	batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES,
 			   skb->len + ETH_HLEN);
 
-	soft_iface->last_rx = jiffies;
-
 	/* Let the bridge loop avoidance check the packet. If will
 	 * not handle it, we can safely push it up.
 	 */
-- 
cgit v1.2.3


From d69dece5f5b6bc7a5e39d2b6136ddc69469331fe Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Wed, 18 Jan 2017 17:09:05 -0800
Subject: LSM: Add /sys/kernel/security/lsm

I am still tired of having to find indirect ways to determine
what security modules are active on a system. I have added
/sys/kernel/security/lsm, which contains a comma separated
list of the active security modules. No more groping around
in /proc/filesystems or other clever hacks.

Unchanged from previous versions except for being updated
to the latest security next branch.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Acked-by: John Johansen <john.johansen@canonical.com>
Acked-by: Paul Moore <paul@paul-moore.com>
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 Documentation/security/LSM.txt |  7 +++++++
 include/linux/lsm_hooks.h      | 12 ++++--------
 security/apparmor/lsm.c        |  3 ++-
 security/commoncap.c           |  3 ++-
 security/inode.c               | 26 ++++++++++++++++++++++++--
 security/loadpin/loadpin.c     |  2 +-
 security/security.c            | 38 ++++++++++++++++++++++++++++++++++++++
 security/selinux/hooks.c       |  2 +-
 security/smack/smack_lsm.c     |  2 +-
 security/tomoyo/tomoyo.c       |  2 +-
 security/yama/yama_lsm.c       |  2 +-
 11 files changed, 82 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/security/LSM.txt b/Documentation/security/LSM.txt
index 3db7e671c440..c2683f28ed36 100644
--- a/Documentation/security/LSM.txt
+++ b/Documentation/security/LSM.txt
@@ -22,6 +22,13 @@ system, building their checks on top of the defined capability hooks.
 For more details on capabilities, see capabilities(7) in the Linux
 man-pages project.
 
+A list of the active security modules can be found by reading
+/sys/kernel/security/lsm. This is a comma separated list, and
+will always include the capability module. The list reflects the
+order in which checks are made. The capability module will always
+be first, followed by any "minor" modules (e.g. Yama) and then
+the one "major" module (e.g. SELinux) if there is one configured.
+
 Based on https://lkml.org/lkml/2007/10/26/215,
 a new LSM is accepted into the kernel when its intent (a description of
 what it tries to protect against and in what cases one would expect to
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 9cf50ad2fe20..0f3c309065d7 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1875,6 +1875,7 @@ struct security_hook_list {
 	struct list_head		list;
 	struct list_head		*head;
 	union security_list_options	hook;
+	char				*lsm;
 };
 
 /*
@@ -1887,15 +1888,10 @@ struct security_hook_list {
 	{ .head = &security_hook_heads.HEAD, .hook = { .HEAD = HOOK } }
 
 extern struct security_hook_heads security_hook_heads;
+extern char *lsm_names;
 
-static inline void security_add_hooks(struct security_hook_list *hooks,
-				      int count)
-{
-	int i;
-
-	for (i = 0; i < count; i++)
-		list_add_tail_rcu(&hooks[i].list, hooks[i].head);
-}
+extern void security_add_hooks(struct security_hook_list *hooks, int count,
+				char *lsm);
 
 #ifdef CONFIG_SECURITY_SELINUX_DISABLE
 /*
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index 5217a0a54047..b63d39ca6278 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -999,7 +999,8 @@ static int __init apparmor_init(void)
 		aa_free_root_ns();
 		goto buffers_out;
 	}
-	security_add_hooks(apparmor_hooks, ARRAY_SIZE(apparmor_hooks));
+	security_add_hooks(apparmor_hooks, ARRAY_SIZE(apparmor_hooks),
+				"apparmor");
 
 	/* Report that AppArmor successfully initialized */
 	apparmor_initialized = 1;
diff --git a/security/commoncap.c b/security/commoncap.c
index 8df676fbd393..6d4d586b9356 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -1093,7 +1093,8 @@ struct security_hook_list capability_hooks[] = {
 
 void __init capability_add_hooks(void)
 {
-	security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks));
+	security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks),
+				"capability");
 }
 
 #endif /* CONFIG_SECURITY */
diff --git a/security/inode.c b/security/inode.c
index c83db05c15ab..2cb14162ff8d 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -20,6 +20,7 @@
 #include <linux/init.h>
 #include <linux/namei.h>
 #include <linux/security.h>
+#include <linux/lsm_hooks.h>
 #include <linux/magic.h>
 
 static struct vfsmount *mount;
@@ -204,6 +205,21 @@ void securityfs_remove(struct dentry *dentry)
 }
 EXPORT_SYMBOL_GPL(securityfs_remove);
 
+#ifdef CONFIG_SECURITY
+static struct dentry *lsm_dentry;
+static ssize_t lsm_read(struct file *filp, char __user *buf, size_t count,
+			loff_t *ppos)
+{
+	return simple_read_from_buffer(buf, count, ppos, lsm_names,
+		strlen(lsm_names));
+}
+
+static const struct file_operations lsm_ops = {
+	.read = lsm_read,
+	.llseek = generic_file_llseek,
+};
+#endif
+
 static int __init securityfs_init(void)
 {
 	int retval;
@@ -213,9 +229,15 @@ static int __init securityfs_init(void)
 		return retval;
 
 	retval = register_filesystem(&fs_type);
-	if (retval)
+	if (retval) {
 		sysfs_remove_mount_point(kernel_kobj, "security");
-	return retval;
+		return retval;
+	}
+#ifdef CONFIG_SECURITY
+	lsm_dentry = securityfs_create_file("lsm", 0444, NULL, NULL,
+						&lsm_ops);
+#endif
+	return 0;
 }
 
 core_initcall(securityfs_init);
diff --git a/security/loadpin/loadpin.c b/security/loadpin/loadpin.c
index 89a46f10b8a7..1d82eae3a5b8 100644
--- a/security/loadpin/loadpin.c
+++ b/security/loadpin/loadpin.c
@@ -182,7 +182,7 @@ static struct security_hook_list loadpin_hooks[] = {
 void __init loadpin_add_hooks(void)
 {
 	pr_info("ready to pin (currently %sabled)", enabled ? "en" : "dis");
-	security_add_hooks(loadpin_hooks, ARRAY_SIZE(loadpin_hooks));
+	security_add_hooks(loadpin_hooks, ARRAY_SIZE(loadpin_hooks), "loadpin");
 }
 
 /* Should not be mutable after boot, so not listed in sysfs (perm == 0). */
diff --git a/security/security.c b/security/security.c
index f825304f04a7..f0a802ee29b6 100644
--- a/security/security.c
+++ b/security/security.c
@@ -32,6 +32,7 @@
 /* Maximum number of letters for an LSM name string */
 #define SECURITY_NAME_MAX	10
 
+char *lsm_names;
 /* Boot-time LSM user choice */
 static __initdata char chosen_lsm[SECURITY_NAME_MAX + 1] =
 	CONFIG_DEFAULT_SECURITY;
@@ -78,6 +79,22 @@ static int __init choose_lsm(char *str)
 }
 __setup("security=", choose_lsm);
 
+static int lsm_append(char *new, char **result)
+{
+	char *cp;
+
+	if (*result == NULL) {
+		*result = kstrdup(new, GFP_KERNEL);
+	} else {
+		cp = kasprintf(GFP_KERNEL, "%s,%s", *result, new);
+		if (cp == NULL)
+			return -ENOMEM;
+		kfree(*result);
+		*result = cp;
+	}
+	return 0;
+}
+
 /**
  * security_module_enable - Load given security module on boot ?
  * @module: the name of the module
@@ -97,6 +114,27 @@ int __init security_module_enable(const char *module)
 	return !strcmp(module, chosen_lsm);
 }
 
+/**
+ * security_add_hooks - Add a modules hooks to the hook lists.
+ * @hooks: the hooks to add
+ * @count: the number of hooks to add
+ * @lsm: the name of the security module
+ *
+ * Each LSM has to register its hooks with the infrastructure.
+ */
+void __init security_add_hooks(struct security_hook_list *hooks, int count,
+				char *lsm)
+{
+	int i;
+
+	for (i = 0; i < count; i++) {
+		hooks[i].lsm = lsm;
+		list_add_tail_rcu(&hooks[i].list, hooks[i].head);
+	}
+	if (lsm_append(lsm, &lsm_names) < 0)
+		panic("%s - Cannot get early memory.\n", __func__);
+}
+
 /*
  * Hook list operation macros.
  *
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index c7c6619431d5..fdc24ea65fd0 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -6349,7 +6349,7 @@ static __init int selinux_init(void)
 					    0, SLAB_PANIC, NULL);
 	avc_init();
 
-	security_add_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks));
+	security_add_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks), "selinux");
 
 	if (avc_add_callback(selinux_netcache_avc_callback, AVC_CALLBACK_RESET))
 		panic("SELinux: Unable to register AVC netcache callback\n");
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 94dc9d406ce3..839150e40eb1 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -4819,7 +4819,7 @@ static __init int smack_init(void)
 	/*
 	 * Register with LSM
 	 */
-	security_add_hooks(smack_hooks, ARRAY_SIZE(smack_hooks));
+	security_add_hooks(smack_hooks, ARRAY_SIZE(smack_hooks), "smack");
 
 	return 0;
 }
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index 75c998700190..edc52d620f29 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -542,7 +542,7 @@ static int __init tomoyo_init(void)
 	if (!security_module_enable("tomoyo"))
 		return 0;
 	/* register ourselves with the security framework */
-	security_add_hooks(tomoyo_hooks, ARRAY_SIZE(tomoyo_hooks));
+	security_add_hooks(tomoyo_hooks, ARRAY_SIZE(tomoyo_hooks), "tomoyo");
 	printk(KERN_INFO "TOMOYO Linux initialized\n");
 	cred->security = &tomoyo_kernel_domain;
 	tomoyo_mm_init();
diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c
index 968e5e0a3f81..88271a3bf37f 100644
--- a/security/yama/yama_lsm.c
+++ b/security/yama/yama_lsm.c
@@ -485,6 +485,6 @@ static inline void yama_init_sysctl(void) { }
 void __init yama_add_hooks(void)
 {
 	pr_info("Yama: becoming mindful.\n");
-	security_add_hooks(yama_hooks, ARRAY_SIZE(yama_hooks));
+	security_add_hooks(yama_hooks, ARRAY_SIZE(yama_hooks), "yama");
 	yama_init_sysctl();
 }
-- 
cgit v1.2.3


From 85b36c931ff328297572a3e6136fac573795ad79 Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <mcgrof@kernel.org>
Date: Wed, 18 Jan 2017 09:38:04 -0800
Subject: jump_labels: Move header guard #endif down where it belongs

The ending header guard is misplaced. This has no
functional change, this is just an eye-sore.

Signed-off-by: Luis R. Rodriguez <mcgrof@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bp@alien8.de
Cc: bp@suse.de
Cc: catalin.marinas@arm.com
Cc: jbaron@akamai.com
Cc: pbonzini@redhat.com
Cc: tony.luck@intel.com
Link: http://lkml.kernel.org/r/20170118173804.16281-1-mcgrof@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/jump_label.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index a0547c571800..b63d6b7b0db0 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -402,6 +402,6 @@ extern bool ____wrong_branch_error(void);
 #define static_branch_enable(x)		static_key_enable(&(x)->key)
 #define static_branch_disable(x)	static_key_disable(&(x)->key)
 
-#endif	/* _LINUX_JUMP_LABEL_H */
-
 #endif /* __ASSEMBLY__ */
+
+#endif	/* _LINUX_JUMP_LABEL_H */
-- 
cgit v1.2.3


From f95bd041203b9f27a14e6ab733b9598049d7ffef Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Date: Fri, 13 Jan 2017 11:00:25 +0100
Subject: memory: aemif: allow passing device lookup table as platform data

TI aemif driver creates its own subnodes of the device tree in order
to guarantee that all child devices are probed after the AEMIF timing
parameters are configured.

Some devices (e.g. da850) use struct of_dev_auxdata for clock lookup
but nodes created from within the aemif driver can't access the lookup
table.

Create a platform data structure that holds a pointer to
of_dev_auxdata so that we can use it with of_platform_populate().

Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Acked-by: Sekhar Nori <nsekhar@ti.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/memory/ti-aemif.c              |  8 +++++++-
 include/linux/platform_data/ti-aemif.h | 23 +++++++++++++++++++++++
 2 files changed, 30 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/platform_data/ti-aemif.h

(limited to 'include/linux')

diff --git a/drivers/memory/ti-aemif.c b/drivers/memory/ti-aemif.c
index a579a0f25840..22c1aeeb6421 100644
--- a/drivers/memory/ti-aemif.c
+++ b/drivers/memory/ti-aemif.c
@@ -20,6 +20,7 @@
 #include <linux/of.h>
 #include <linux/of_platform.h>
 #include <linux/platform_device.h>
+#include <linux/platform_data/ti-aemif.h>
 
 #define TA_SHIFT	2
 #define RHOLD_SHIFT	4
@@ -335,6 +336,8 @@ static int aemif_probe(struct platform_device *pdev)
 	struct device_node *np = dev->of_node;
 	struct device_node *child_np;
 	struct aemif_device *aemif;
+	struct aemif_platform_data *pdata;
+	struct of_dev_auxdata *dev_lookup;
 
 	if (np == NULL)
 		return 0;
@@ -343,6 +346,9 @@ static int aemif_probe(struct platform_device *pdev)
 	if (!aemif)
 		return -ENOMEM;
 
+	pdata = dev_get_platdata(&pdev->dev);
+	dev_lookup = pdata ? pdata->dev_lookup : NULL;
+
 	platform_set_drvdata(pdev, aemif);
 
 	aemif->clk = devm_clk_get(dev, NULL);
@@ -390,7 +396,7 @@ static int aemif_probe(struct platform_device *pdev)
 	 * parameters are set.
 	 */
 	for_each_available_child_of_node(np, child_np) {
-		ret = of_platform_populate(child_np, NULL, NULL, dev);
+		ret = of_platform_populate(child_np, NULL, dev_lookup, dev);
 		if (ret < 0)
 			goto error;
 	}
diff --git a/include/linux/platform_data/ti-aemif.h b/include/linux/platform_data/ti-aemif.h
new file mode 100644
index 000000000000..ac72e115093c
--- /dev/null
+++ b/include/linux/platform_data/ti-aemif.h
@@ -0,0 +1,23 @@
+/*
+ * TI DaVinci AEMIF platform glue.
+ *
+ * Copyright (C) 2017 BayLibre SAS
+ *
+ * Author:
+ *   Bartosz Golaszewski <bgolaszewski@baylibre.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __TI_DAVINCI_AEMIF_DATA_H__
+#define __TI_DAVINCI_AEMIF_DATA_H__
+
+#include <linux/of_platform.h>
+
+struct aemif_platform_data {
+	struct of_dev_auxdata *dev_lookup;
+};
+
+#endif /* __TI_DAVINCI_AEMIF_DATA_H__ */
-- 
cgit v1.2.3


From 6d2c5d6c46dd3d9924831efd6c913fdf4d484985 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 16 Jan 2017 17:50:02 +0100
Subject: kmod: make usermodehelper path a const string

This is in preparation for making it so that usermode helper programs
can't be changed, if desired, by userspace.  We will tackle the mess of
cleaning up the write-ability of argv and env later, that's going to
take more work, for much less gain...

Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/kmod.h | 7 ++++---
 kernel/kmod.c        | 4 ++--
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kmod.h b/include/linux/kmod.h
index fcfd2bf14d3f..c4e441e00db5 100644
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -56,7 +56,7 @@ struct file;
 struct subprocess_info {
 	struct work_struct work;
 	struct completion *complete;
-	char *path;
+	const char *path;
 	char **argv;
 	char **envp;
 	int wait;
@@ -67,10 +67,11 @@ struct subprocess_info {
 };
 
 extern int
-call_usermodehelper(char *path, char **argv, char **envp, int wait);
+call_usermodehelper(const char *path, char **argv, char **envp, int wait);
 
 extern struct subprocess_info *
-call_usermodehelper_setup(char *path, char **argv, char **envp, gfp_t gfp_mask,
+call_usermodehelper_setup(const char *path, char **argv, char **envp,
+			  gfp_t gfp_mask,
 			  int (*init)(struct subprocess_info *info, struct cred *new),
 			  void (*cleanup)(struct subprocess_info *), void *data);
 
diff --git a/kernel/kmod.c b/kernel/kmod.c
index d45c96073afb..426a614e97fe 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -516,7 +516,7 @@ static void helper_unlock(void)
  * Function must be runnable in either a process context or the
  * context in which call_usermodehelper_exec is called.
  */
-struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
+struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
 		char **envp, gfp_t gfp_mask,
 		int (*init)(struct subprocess_info *info, struct cred *new),
 		void (*cleanup)(struct subprocess_info *info),
@@ -613,7 +613,7 @@ EXPORT_SYMBOL(call_usermodehelper_exec);
  * This function is the equivalent to use call_usermodehelper_setup() and
  * call_usermodehelper_exec().
  */
-int call_usermodehelper(char *path, char **argv, char **envp, int wait)
+int call_usermodehelper(const char *path, char **argv, char **envp, int wait)
 {
 	struct subprocess_info *info;
 	gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
-- 
cgit v1.2.3


From 219fb0c1436e4893a290ba270bc0e644d02465a3 Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Date: Wed, 11 Jan 2017 16:43:37 +0200
Subject: serial: sh-sci: Remove the platform data dma slave rx/tx channel IDs

Only SH platforms still use platform data for the sh-sci, and none of
them declare DMA channels connected to the SCI. Remove the corresponding
platform data fields and simplify the driver accordingly.

Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/sh-sci.c | 23 ++++++-----------------
 include/linux/serial_sci.h  |  3 ---
 2 files changed, 6 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c
index a8607cacee6c..c6aa5b9c2bfd 100644
--- a/drivers/tty/serial/sh-sci.c
+++ b/drivers/tty/serial/sh-sci.c
@@ -1387,20 +1387,14 @@ static void rx_timer_fn(unsigned long arg)
 }
 
 static struct dma_chan *sci_request_dma_chan(struct uart_port *port,
-					     enum dma_transfer_direction dir,
-					     unsigned int id)
+					     enum dma_transfer_direction dir)
 {
-	dma_cap_mask_t mask;
 	struct dma_chan *chan;
 	struct dma_slave_config cfg;
 	int ret;
 
-	dma_cap_zero(mask);
-	dma_cap_set(DMA_SLAVE, mask);
-
-	chan = dma_request_slave_channel_compat(mask, shdma_chan_filter,
-					(void *)(unsigned long)id, port->dev,
-					dir == DMA_MEM_TO_DEV ? "tx" : "rx");
+	chan = dma_request_slave_channel(port->dev,
+					 dir == DMA_MEM_TO_DEV ? "tx" : "rx");
 	if (!chan) {
 		dev_warn(port->dev,
 			 "dma_request_slave_channel_compat failed\n");
@@ -1436,12 +1430,11 @@ static void sci_request_dma(struct uart_port *port)
 
 	dev_dbg(port->dev, "%s: port %d\n", __func__, port->line);
 
-	if (!port->dev->of_node &&
-	    (s->cfg->dma_slave_tx <= 0 || s->cfg->dma_slave_rx <= 0))
+	if (!port->dev->of_node)
 		return;
 
 	s->cookie_tx = -EINVAL;
-	chan = sci_request_dma_chan(port, DMA_MEM_TO_DEV, s->cfg->dma_slave_tx);
+	chan = sci_request_dma_chan(port, DMA_MEM_TO_DEV);
 	dev_dbg(port->dev, "%s: TX: got channel %p\n", __func__, chan);
 	if (chan) {
 		s->chan_tx = chan;
@@ -1463,7 +1456,7 @@ static void sci_request_dma(struct uart_port *port)
 		INIT_WORK(&s->work_tx, work_fn_tx);
 	}
 
-	chan = sci_request_dma_chan(port, DMA_DEV_TO_MEM, s->cfg->dma_slave_rx);
+	chan = sci_request_dma_chan(port, DMA_DEV_TO_MEM);
 	dev_dbg(port->dev, "%s: RX: got channel %p\n", __func__, chan);
 	if (chan) {
 		unsigned int i;
@@ -2706,10 +2699,6 @@ static int sci_init_single(struct platform_device *dev,
 	port->serial_in		= sci_serial_in;
 	port->serial_out	= sci_serial_out;
 
-	if (p->dma_slave_tx > 0 && p->dma_slave_rx > 0)
-		dev_dbg(port->dev, "DMA tx %d, rx %d\n",
-			p->dma_slave_tx, p->dma_slave_rx);
-
 	return 0;
 }
 
diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h
index 9f2bfd055742..1a894c47bfc0 100644
--- a/include/linux/serial_sci.h
+++ b/include/linux/serial_sci.h
@@ -71,9 +71,6 @@ struct plat_sci_port {
 	unsigned char	regtype;
 
 	struct plat_sci_port_ops	*ops;
-
-	unsigned int	dma_slave_tx;
-	unsigned int	dma_slave_rx;
 };
 
 #endif /* __LINUX_SERIAL_SCI_H */
-- 
cgit v1.2.3


From d5cb1319a91d4f1328b1c70b82c5899acd96af85 Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Date: Wed, 11 Jan 2017 16:43:38 +0200
Subject: serial: sh-sci: Remove manual break debouncing

The sh-sci driver implements manual break debouncing for a few SH
platforms by reading the value of the RX pin port register. This feature
is optional and the driver considers all negative or zero values of the
platform data port_reg field as invalid. As the four platforms that set
the field to a register address all use an address higher than
0x7fffffff, the driver will always consider the value as invalid and
never perform debouncing. The feature is unused, remove it.

Debouncing could be implemented properly in the future using the pinctrl
and GPIO APIs if desired.

Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/sh-sci.c | 124 +++-----------------------------------------
 include/linux/serial_sci.h  |   5 --
 2 files changed, 7 insertions(+), 122 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c
index c6aa5b9c2bfd..ae1404e701ea 100644
--- a/drivers/tty/serial/sh-sci.c
+++ b/drivers/tty/serial/sh-sci.c
@@ -125,10 +125,6 @@ struct sci_port {
 	resource_size_t		reg_size;
 	struct mctrl_gpios	*gpios;
 
-	/* Break timer */
-	struct timer_list	break_timer;
-	int			break_flag;
-
 	/* Clocks */
 	struct clk		*clks[SCI_NUM_CLKS];
 	unsigned long		clk_rates[SCI_NUM_CLKS];
@@ -517,14 +513,6 @@ static void sci_port_disable(struct sci_port *sci_port)
 	if (!sci_port->port.dev)
 		return;
 
-	/* Cancel the break timer to ensure that the timer handler will not try
-	 * to access the hardware with clocks and power disabled. Reset the
-	 * break flag to make the break debouncing state machine ready for the
-	 * next break.
-	 */
-	del_timer_sync(&sci_port->break_timer);
-	sci_port->break_flag = 0;
-
 	for (i = SCI_NUM_CLKS; i-- > 0; )
 		clk_disable_unprepare(sci_port->clks[i]);
 
@@ -751,20 +739,6 @@ static int sci_rxfill(struct uart_port *port)
 	return (serial_port_in(port, SCxSR) & SCxSR_RDxF(port)) != 0;
 }
 
-/*
- * SCI helper for checking the state of the muxed port/RXD pins.
- */
-static inline int sci_rxd_in(struct uart_port *port)
-{
-	struct sci_port *s = to_sci_port(port);
-
-	if (s->cfg->port_reg <= 0)
-		return 1;
-
-	/* Cast for ARM damage */
-	return !!__raw_readb((void __iomem *)(uintptr_t)s->cfg->port_reg);
-}
-
 /* ********************************************************************** *
  *                   the interrupt related routines                       *
  * ********************************************************************** */
@@ -832,7 +806,6 @@ static void sci_transmit_chars(struct uart_port *port)
 
 static void sci_receive_chars(struct uart_port *port)
 {
-	struct sci_port *sci_port = to_sci_port(port);
 	struct tty_port *tport = &port->state->port;
 	int i, count, copied = 0;
 	unsigned short status;
@@ -852,8 +825,7 @@ static void sci_receive_chars(struct uart_port *port)
 
 		if (port->type == PORT_SCI) {
 			char c = serial_port_in(port, SCxRDR);
-			if (uart_handle_sysrq_char(port, c) ||
-			    sci_port->break_flag)
+			if (uart_handle_sysrq_char(port, c))
 				count = 0;
 			else
 				tty_insert_flip_char(tport, c, TTY_NORMAL);
@@ -862,25 +834,6 @@ static void sci_receive_chars(struct uart_port *port)
 				char c = serial_port_in(port, SCxRDR);
 
 				status = serial_port_in(port, SCxSR);
-#if defined(CONFIG_CPU_SH3)
-				/* Skip "chars" during break */
-				if (sci_port->break_flag) {
-					if ((c == 0) &&
-					    (status & SCxSR_FER(port))) {
-						count--; i--;
-						continue;
-					}
-
-					/* Nonzero => end-of-break */
-					dev_dbg(port->dev, "debounce<%02x>\n", c);
-					sci_port->break_flag = 0;
-
-					if (STEPFN(c)) {
-						count--; i--;
-						continue;
-					}
-				}
-#endif /* CONFIG_CPU_SH3 */
 				if (uart_handle_sysrq_char(port, c)) {
 					count--; i--;
 					continue;
@@ -918,37 +871,6 @@ static void sci_receive_chars(struct uart_port *port)
 	}
 }
 
-#define SCI_BREAK_JIFFIES (HZ/20)
-
-/*
- * The sci generates interrupts during the break,
- * 1 per millisecond or so during the break period, for 9600 baud.
- * So dont bother disabling interrupts.
- * But dont want more than 1 break event.
- * Use a kernel timer to periodically poll the rx line until
- * the break is finished.
- */
-static inline void sci_schedule_break_timer(struct sci_port *port)
-{
-	mod_timer(&port->break_timer, jiffies + SCI_BREAK_JIFFIES);
-}
-
-/* Ensure that two consecutive samples find the break over. */
-static void sci_break_timer(unsigned long data)
-{
-	struct sci_port *port = (struct sci_port *)data;
-
-	if (sci_rxd_in(&port->port) == 0) {
-		port->break_flag = 1;
-		sci_schedule_break_timer(port);
-	} else if (port->break_flag == 1) {
-		/* break is over. */
-		port->break_flag = 2;
-		sci_schedule_break_timer(port);
-	} else
-		port->break_flag = 0;
-}
-
 static int sci_handle_errors(struct uart_port *port)
 {
 	int copied = 0;
@@ -968,35 +890,13 @@ static int sci_handle_errors(struct uart_port *port)
 	}
 
 	if (status & SCxSR_FER(port)) {
-		if (sci_rxd_in(port) == 0) {
-			/* Notify of BREAK */
-			struct sci_port *sci_port = to_sci_port(port);
-
-			if (!sci_port->break_flag) {
-				port->icount.brk++;
-
-				sci_port->break_flag = 1;
-				sci_schedule_break_timer(sci_port);
-
-				/* Do sysrq handling. */
-				if (uart_handle_break(port))
-					return 0;
-
-				dev_dbg(port->dev, "BREAK detected\n");
-
-				if (tty_insert_flip_char(tport, 0, TTY_BREAK))
-					copied++;
-			}
-
-		} else {
-			/* frame error */
-			port->icount.frame++;
+		/* frame error */
+		port->icount.frame++;
 
-			if (tty_insert_flip_char(tport, 0, TTY_FRAME))
-				copied++;
+		if (tty_insert_flip_char(tport, 0, TTY_FRAME))
+			copied++;
 
-			dev_notice(port->dev, "frame error\n");
-		}
+		dev_notice(port->dev, "frame error\n");
 	}
 
 	if (status & SCxSR_PER(port)) {
@@ -1049,17 +949,11 @@ static int sci_handle_breaks(struct uart_port *port)
 	int copied = 0;
 	unsigned short status = serial_port_in(port, SCxSR);
 	struct tty_port *tport = &port->state->port;
-	struct sci_port *s = to_sci_port(port);
 
 	if (uart_handle_break(port))
 		return 0;
 
-	if (!s->break_flag && status & SCxSR_BRK(port)) {
-#if defined(CONFIG_CPU_SH3)
-		/* Debounce break */
-		s->break_flag = 1;
-#endif
-
+	if (status & SCxSR_BRK(port)) {
 		port->icount.brk++;
 
 		/* Notify of BREAK */
@@ -2677,10 +2571,6 @@ static int sci_init_single(struct platform_device *dev,
 		pm_runtime_enable(&dev->dev);
 	}
 
-	sci_port->break_timer.data = (unsigned long)sci_port;
-	sci_port->break_timer.function = sci_break_timer;
-	init_timer(&sci_port->break_timer);
-
 	port->type		= p->type;
 	port->flags		= UPF_FIXED_PORT | UPF_BOOT_AUTOCONF | p->flags;
 	port->regshift		= p->regshift;
diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h
index 1a894c47bfc0..b4419931bf4c 100644
--- a/include/linux/serial_sci.h
+++ b/include/linux/serial_sci.h
@@ -9,8 +9,6 @@
  * Generic header for SuperH (H)SCI(F) (used by sh/sh64 and related parts)
  */
 
-#define SCIx_NOT_SUPPORTED	(-1)
-
 /* Serial Control Register (@ = not supported by all parts) */
 #define SCSCR_TIE	BIT(7)	/* Transmit Interrupt Enable */
 #define SCSCR_RIE	BIT(6)	/* Receive Interrupt Enable */
@@ -41,8 +39,6 @@ enum {
 	SCIx_NR_REGTYPES,
 };
 
-struct device;
-
 struct plat_sci_port_ops {
 	void (*init_pins)(struct uart_port *, unsigned int cflag);
 };
@@ -66,7 +62,6 @@ struct plat_sci_port {
 	/*
 	 * Platform overrides if necessary, defaults otherwise.
 	 */
-	int		port_reg;
 	unsigned char	regshift;
 	unsigned char	regtype;
 
-- 
cgit v1.2.3


From 97ed9790c514066bfae67f22e084b505ed5af436 Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Date: Wed, 11 Jan 2017 16:43:39 +0200
Subject: serial: sh-sci: Remove unused platform data capabilities field

The field isn't set by any platform but is only used internally in the
driver to hold data parsed from DT. Move it to the sci_port structure.

Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/sh-sci.c | 11 +++++++----
 include/linux/serial_sci.h  |  6 ------
 2 files changed, 7 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c
index ae1404e701ea..a04ed40279d1 100644
--- a/drivers/tty/serial/sh-sci.c
+++ b/drivers/tty/serial/sh-sci.c
@@ -149,6 +149,7 @@ struct sci_port {
 	unsigned int			rx_timeout;
 #endif
 
+	bool has_rtscts;
 	bool autorts;
 };
 
@@ -680,7 +681,7 @@ static void sci_init_pins(struct uart_port *port, unsigned int cflag)
 
 		/* Enable RXD and TXD pin functions */
 		ctrl &= ~(SCPCR_RXDC | SCPCR_TXDC);
-		if (to_sci_port(port)->cfg->capabilities & SCIx_HAVE_RTSCTS) {
+		if (to_sci_port(port)->has_rtscts) {
 			/* RTS# is output, driven 1 */
 			ctrl |= SCPCR_RTSC;
 			serial_port_out(port, SCPDR,
@@ -1738,7 +1739,7 @@ static void sci_set_mctrl(struct uart_port *port, unsigned int mctrl)
 
 	mctrl_gpio_set(s->gpios, mctrl);
 
-	if (!(s->cfg->capabilities & SCIx_HAVE_RTSCTS))
+	if (!s->has_rtscts)
 		return;
 
 	if (!(mctrl & TIOCM_RTS)) {
@@ -2809,6 +2810,7 @@ sci_parse_dt(struct platform_device *pdev, unsigned int *dev_id)
 	struct device_node *np = pdev->dev.of_node;
 	const struct of_device_id *match;
 	struct plat_sci_port *p;
+	struct sci_port *sp;
 	int id;
 
 	if (!IS_ENABLED(CONFIG_OF) || !np)
@@ -2829,13 +2831,14 @@ sci_parse_dt(struct platform_device *pdev, unsigned int *dev_id)
 		return NULL;
 	}
 
+	sp = &sci_ports[id];
 	*dev_id = id;
 
 	p->type = SCI_OF_TYPE(match->data);
 	p->regtype = SCI_OF_REGTYPE(match->data);
 
 	if (of_find_property(np, "uart-has-rtscts", NULL))
-		p->capabilities |= SCIx_HAVE_RTSCTS;
+		sp->has_rtscts = true;
 
 	return p;
 }
@@ -2863,7 +2866,7 @@ static int sci_probe_single(struct platform_device *dev,
 	if (IS_ERR(sciport->gpios) && PTR_ERR(sciport->gpios) != -ENOSYS)
 		return PTR_ERR(sciport->gpios);
 
-	if (p->capabilities & SCIx_HAVE_RTSCTS) {
+	if (sciport->has_rtscts) {
 		if (!IS_ERR_OR_NULL(mctrl_gpio_to_gpiod(sciport->gpios,
 							UART_GPIO_CTS)) ||
 		    !IS_ERR_OR_NULL(mctrl_gpio_to_gpiod(sciport->gpios,
diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h
index b4419931bf4c..f9a4526f4ec5 100644
--- a/include/linux/serial_sci.h
+++ b/include/linux/serial_sci.h
@@ -43,18 +43,12 @@ struct plat_sci_port_ops {
 	void (*init_pins)(struct uart_port *, unsigned int cflag);
 };
 
-/*
- * Port-specific capabilities
- */
-#define SCIx_HAVE_RTSCTS	BIT(0)
-
 /*
  * Platform device specific platform_data struct
  */
 struct plat_sci_port {
 	unsigned int	type;			/* SCI / SCIF / IRDA / HSCIF */
 	upf_t		flags;			/* UPF_* flags */
-	unsigned long	capabilities;		/* Port features/capabilities */
 
 	unsigned int	sampling_rate;
 	unsigned int	scscr;			/* SCSCR initialization */
-- 
cgit v1.2.3


From dfc80387aefb78161f83732804c6d01c89c24595 Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Date: Wed, 11 Jan 2017 16:43:40 +0200
Subject: serial: sh-sci: Compute the regshift value for SCI ports

SCI instances found in SH SoCs have different spacing between registers
depending on the SoC. The platform data contains a regshift field that
tells the driver by how many bits to shift the register offset to
compute its address. We can compute the regshift value automatically
based on the memory resource size, there's no need to pass the value
through platform data.

Fix the sh7750 SCI and sh7760 SIM port memory resources length to ensure
proper computation of the regshift value.

Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/sh/kernel/cpu/sh3/setup-sh770x.c |  1 -
 arch/sh/kernel/cpu/sh4/setup-sh7750.c |  3 +--
 arch/sh/kernel/cpu/sh4/setup-sh7760.c | 10 ++++++++--
 drivers/tty/serial/sh-sci.c           |  8 +++++++-
 include/linux/serial_sci.h            |  1 -
 5 files changed, 16 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/sh/kernel/cpu/sh3/setup-sh770x.c b/arch/sh/kernel/cpu/sh3/setup-sh770x.c
index 592cd9ab30c4..59a88611df55 100644
--- a/arch/sh/kernel/cpu/sh3/setup-sh770x.c
+++ b/arch/sh/kernel/cpu/sh3/setup-sh770x.c
@@ -111,7 +111,6 @@ static struct platform_device rtc_device = {
 static struct plat_sci_port scif0_platform_data = {
 	.type		= PORT_SCI,
 	.ops		= &sh770x_sci_port_ops,
-	.regshift	= 1,
 };
 
 static struct resource scif0_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh4/setup-sh7750.c b/arch/sh/kernel/cpu/sh4/setup-sh7750.c
index d98a55416306..57d30689204d 100644
--- a/arch/sh/kernel/cpu/sh4/setup-sh7750.c
+++ b/arch/sh/kernel/cpu/sh4/setup-sh7750.c
@@ -39,11 +39,10 @@ static struct platform_device rtc_device = {
 
 static struct plat_sci_port sci_platform_data = {
 	.type		= PORT_SCI,
-	.regshift	= 2,
 };
 
 static struct resource sci_resources[] = {
-	DEFINE_RES_MEM(0xffe00000, 0x100),
+	DEFINE_RES_MEM(0xffe00000, 0x20),
 	DEFINE_RES_IRQ(evt2irq(0x4e0)),
 };
 
diff --git a/arch/sh/kernel/cpu/sh4/setup-sh7760.c b/arch/sh/kernel/cpu/sh4/setup-sh7760.c
index 0c0cdfc69dcc..e51fe1734e13 100644
--- a/arch/sh/kernel/cpu/sh4/setup-sh7760.c
+++ b/arch/sh/kernel/cpu/sh4/setup-sh7760.c
@@ -200,12 +200,18 @@ static struct platform_device scif2_device = {
 };
 
 static struct plat_sci_port scif3_platform_data = {
+	/*
+	 * This is actually a SIM card module serial port, based on an SCI with
+	 * additional registers. The sh-sci driver doesn't support the SIM port
+	 * type, declare it as a SCI. Don't declare the additional registers in
+	 * the memory resource or the driver will compute an incorrect regshift
+	 * value.
+	 */
 	.type		= PORT_SCI,
-	.regshift	= 2,
 };
 
 static struct resource scif3_resources[] = {
-	DEFINE_RES_MEM(0xfe480000, 0x100),
+	DEFINE_RES_MEM(0xfe480000, 0x10),
 	DEFINE_RES_IRQ(evt2irq(0xc00)),
 	DEFINE_RES_IRQ(evt2irq(0xc20)),
 	DEFINE_RES_IRQ(evt2irq(0xc40)),
diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c
index a04ed40279d1..a18f4cb8e1fb 100644
--- a/drivers/tty/serial/sh-sci.c
+++ b/drivers/tty/serial/sh-sci.c
@@ -2574,9 +2574,15 @@ static int sci_init_single(struct platform_device *dev,
 
 	port->type		= p->type;
 	port->flags		= UPF_FIXED_PORT | UPF_BOOT_AUTOCONF | p->flags;
-	port->regshift		= p->regshift;
 	port->fifosize		= sci_port->params->fifosize;
 
+	if (port->type == PORT_SCI) {
+		if (sci_port->reg_size >= 0x20)
+			port->regshift = 2;
+		else
+			port->regshift = 1;
+	}
+
 	/*
 	 * The UART port needs an IRQ value, so we peg this to the RX IRQ
 	 * for the multi-IRQ ports, which is where we are primarily
diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h
index f9a4526f4ec5..e598eaef3962 100644
--- a/include/linux/serial_sci.h
+++ b/include/linux/serial_sci.h
@@ -56,7 +56,6 @@ struct plat_sci_port {
 	/*
 	 * Platform overrides if necessary, defaults otherwise.
 	 */
-	unsigned char	regshift;
 	unsigned char	regtype;
 
 	struct plat_sci_port_ops	*ops;
-- 
cgit v1.2.3


From 9ed90d20449b01beb71a4e125d291a36c80c4ad4 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Mon, 16 Jan 2017 16:54:28 -0600
Subject: tty: move the non-file related parts of tty_release to new
 tty_release_struct

For in-kernel tty users, we need to be able to create and destroy
'struct tty' that are not associated with a file. The creation side is
fine, but tty_release() needs to be split into the file handle portion
and the struct tty portion. Introduce a new function, tty_release_struct,
to handle just the destroying of a struct tty.

Signed-off-by: Rob Herring <robh@kernel.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-By: Sebastian Reichel <sre@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/tty_io.c | 50 ++++++++++++++++++++++++++++++++------------------
 include/linux/tty.h  |  1 +
 2 files changed, 33 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 734a635e7363..4790c0fb5a45 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -1744,6 +1744,37 @@ static int tty_release_checks(struct tty_struct *tty, int idx)
 	return 0;
 }
 
+/**
+ *	tty_release_struct	-	release a tty struct
+ *	@tty: tty device
+ *	@idx: index of the tty
+ *
+ *	Performs the final steps to release and free a tty device. It is
+ *	roughly the reverse of tty_init_dev.
+ */
+void tty_release_struct(struct tty_struct *tty, int idx)
+{
+	/*
+	 * Ask the line discipline code to release its structures
+	 */
+	tty_ldisc_release(tty);
+
+	/* Wait for pending work before tty destruction commmences */
+	tty_flush_works(tty);
+
+	tty_debug_hangup(tty, "freeing structure\n");
+	/*
+	 * The release_tty function takes care of the details of clearing
+	 * the slots and preserving the termios structure. The tty_unlock_pair
+	 * should be safe as we keep a kref while the tty is locked (so the
+	 * unlock never unlocks a freed tty).
+	 */
+	mutex_lock(&tty_mutex);
+	release_tty(tty, idx);
+	mutex_unlock(&tty_mutex);
+}
+EXPORT_SYMBOL_GPL(tty_release_struct);
+
 /**
  *	tty_release		-	vfs callback for close
  *	@inode: inode of tty
@@ -1898,25 +1929,8 @@ int tty_release(struct inode *inode, struct file *filp)
 		return 0;
 
 	tty_debug_hangup(tty, "final close\n");
-	/*
-	 * Ask the line discipline code to release its structures
-	 */
-	tty_ldisc_release(tty);
-
-	/* Wait for pending work before tty destruction commmences */
-	tty_flush_works(tty);
-
-	tty_debug_hangup(tty, "freeing structure\n");
-	/*
-	 * The release_tty function takes care of the details of clearing
-	 * the slots and preserving the termios structure. The tty_unlock_pair
-	 * should be safe as we keep a kref while the tty is locked (so the
-	 * unlock never unlocks a freed tty).
-	 */
-	mutex_lock(&tty_mutex);
-	release_tty(tty, idx);
-	mutex_unlock(&tty_mutex);
 
+	tty_release_struct(tty, idx);
 	return 0;
 }
 
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 40144f382516..86c7853282b7 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -528,6 +528,7 @@ extern int tty_alloc_file(struct file *file);
 extern void tty_add_file(struct tty_struct *tty, struct file *file);
 extern void tty_free_file(struct file *file);
 extern struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx);
+extern void tty_release_struct(struct tty_struct *tty, int idx);
 extern int tty_release(struct inode *inode, struct file *filp);
 extern void tty_init_termios(struct tty_struct *tty);
 extern int tty_standard_install(struct tty_driver *driver,
-- 
cgit v1.2.3


From 134e6a034cb004ed5acd3048792de70ced1c6cf5 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 19 Jan 2017 08:57:14 -0500
Subject: tracing: Show number of constants profiled in likely profiler

Now that constants are traced, it is useful to see the number of constants
that are traced in the likely/unlikely profiler in order to know if they
should be ignored or not.

The likely/unlikely will display a number after the "correct" number if a
"constant" count exists.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/compiler.h    | 15 ++++++----
 kernel/trace/trace_branch.c | 68 ++++++++++++++++++++++++++++++++++++---------
 2 files changed, 65 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index bbbe1570de1c..a73cc9afa784 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -101,13 +101,18 @@ struct ftrace_branch_data {
 	};
 };
 
+struct ftrace_likely_data {
+	struct ftrace_branch_data	data;
+	unsigned long			constant;
+};
+
 /*
  * Note: DISABLE_BRANCH_PROFILING can be used by special lowlevel code
  * to disable branch tracing on a per file basis.
  */
 #if defined(CONFIG_TRACE_BRANCH_PROFILING) \
     && !defined(DISABLE_BRANCH_PROFILING) && !defined(__CHECKER__)
-void ftrace_likely_update(struct ftrace_branch_data *f, int val,
+void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 			  int expect, int is_constant);
 
 #define likely_notrace(x)	__builtin_expect(!!(x), 1)
@@ -115,13 +120,13 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val,
 
 #define __branch_check__(x, expect, is_constant) ({			\
 			int ______r;					\
-			static struct ftrace_branch_data		\
+			static struct ftrace_likely_data		\
 				__attribute__((__aligned__(4)))		\
 				__attribute__((section("_ftrace_annotated_branch"))) \
 				______f = {				\
-				.func = __func__,			\
-				.file = __FILE__,			\
-				.line = __LINE__,			\
+				.data.func = __func__,			\
+				.data.file = __FILE__,			\
+				.data.line = __LINE__,			\
 			};						\
 			______r = __builtin_expect(!!(x), expect);	\
 			ftrace_likely_update(&______f, ______r,		\
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 7afe426ea528..fd483d74f8e1 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -200,25 +200,27 @@ void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 }
 #endif /* CONFIG_BRANCH_TRACER */
 
-void ftrace_likely_update(struct ftrace_branch_data *f, int val,
+void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 			  int expect, int is_constant)
 {
 	/* A constant is always correct */
-	if (is_constant)
+	if (is_constant) {
+		f->constant++;
 		val = expect;
+	}
 	/*
 	 * I would love to have a trace point here instead, but the
 	 * trace point code is so inundated with unlikely and likely
 	 * conditions that the recursive nightmare that exists is too
 	 * much to try to get working. At least for now.
 	 */
-	trace_likely_condition(f, val, expect);
+	trace_likely_condition(&f->data, val, expect);
 
 	/* FIXME: Make this atomic! */
 	if (val == expect)
-		f->correct++;
+		f->data.correct++;
 	else
-		f->incorrect++;
+		f->data.incorrect++;
 }
 EXPORT_SYMBOL(ftrace_likely_update);
 
@@ -249,29 +251,60 @@ static inline long get_incorrect_percent(struct ftrace_branch_data *p)
 	return percent;
 }
 
-static int branch_stat_show(struct seq_file *m, void *v)
+static const char *branch_stat_process_file(struct ftrace_branch_data *p)
 {
-	struct ftrace_branch_data *p = v;
 	const char *f;
-	long percent;
 
 	/* Only print the file, not the path */
 	f = p->file + strlen(p->file);
 	while (f >= p->file && *f != '/')
 		f--;
-	f++;
+	return ++f;
+}
+
+static void branch_stat_show(struct seq_file *m,
+			     struct ftrace_branch_data *p, const char *f)
+{
+	long percent;
 
 	/*
 	 * The miss is overlayed on correct, and hit on incorrect.
 	 */
 	percent = get_incorrect_percent(p);
 
-	seq_printf(m, "%8lu %8lu ",  p->correct, p->incorrect);
 	if (percent < 0)
 		seq_puts(m, "  X ");
 	else
 		seq_printf(m, "%3ld ", percent);
+
 	seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
+}
+
+static int branch_stat_show_normal(struct seq_file *m,
+				   struct ftrace_branch_data *p, const char *f)
+{
+	seq_printf(m, "%8lu %8lu ",  p->correct, p->incorrect);
+	branch_stat_show(m, p, f);
+	return 0;
+}
+
+static int annotate_branch_stat_show(struct seq_file *m, void *v)
+{
+	struct ftrace_likely_data *p = v;
+	const char *f;
+	int l;
+
+	f = branch_stat_process_file(&p->data);
+
+	if (!p->constant)
+		return branch_stat_show_normal(m, &p->data, f);
+
+	l = snprintf(NULL, 0, "/%lu", p->constant);
+	l = l > 8 ? 0 : 8 - l;
+
+	seq_printf(m, "%8lu/%lu %*lu ",
+		   p->data.correct, p->constant, l, p->data.incorrect);
+	branch_stat_show(m, &p->data, f);
 	return 0;
 }
 
@@ -283,7 +316,7 @@ static void *annotated_branch_stat_start(struct tracer_stat *trace)
 static void *
 annotated_branch_stat_next(void *v, int idx)
 {
-	struct ftrace_branch_data *p = v;
+	struct ftrace_likely_data *p = v;
 
 	++p;
 
@@ -332,7 +365,7 @@ static struct tracer_stat annotated_branch_stats = {
 	.stat_next = annotated_branch_stat_next,
 	.stat_cmp = annotated_branch_stat_cmp,
 	.stat_headers = annotated_branch_stat_headers,
-	.stat_show = branch_stat_show
+	.stat_show = annotate_branch_stat_show
 };
 
 __init static int init_annotated_branch_stats(void)
@@ -383,12 +416,21 @@ all_branch_stat_next(void *v, int idx)
 	return p;
 }
 
+static int all_branch_stat_show(struct seq_file *m, void *v)
+{
+	struct ftrace_branch_data *p = v;
+	const char *f;
+
+	f = branch_stat_process_file(p);
+	return branch_stat_show_normal(m, p, f);
+}
+
 static struct tracer_stat all_branch_stats = {
 	.name = "branch_all",
 	.stat_start = all_branch_stat_start,
 	.stat_next = all_branch_stat_next,
 	.stat_headers = all_branch_stat_headers,
-	.stat_show = branch_stat_show
+	.stat_show = all_branch_stat_show
 };
 
 __init static int all_annotated_branch_stats(void)
-- 
cgit v1.2.3


From 579b2a65d245c093d3e63845c320b9321f112b75 Mon Sep 17 00:00:00 2001
From: Mitchel Humpherys <mitchelh@codeaurora.org>
Date: Fri, 6 Jan 2017 18:58:08 +0530
Subject: iommu: add IOMMU_PRIV attribute

Add the IOMMU_PRIV attribute, which is used to indicate privileged
mappings.

Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Tested-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Mitchel Humpherys <mitchelh@codeaurora.org>
Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 include/linux/iommu.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 0ff5111f6959..69e2417a2965 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -31,6 +31,13 @@
 #define IOMMU_CACHE	(1 << 2) /* DMA cache coherency */
 #define IOMMU_NOEXEC	(1 << 3)
 #define IOMMU_MMIO	(1 << 4) /* e.g. things like MSI doorbells */
+/*
+ * This is to make the IOMMU API setup privileged
+ * mapppings accessible by the master only at higher
+ * privileged execution level and inaccessible at
+ * less privileged levels.
+ */
+#define IOMMU_PRIV	(1 << 5)
 
 struct iommu_ops;
 struct iommu_group;
-- 
cgit v1.2.3


From b2fb366425ceb85dca56afa538257ec5a2c4f6d1 Mon Sep 17 00:00:00 2001
From: Mitchel Humpherys <mitchelh@codeaurora.org>
Date: Fri, 6 Jan 2017 18:58:11 +0530
Subject: common: DMA-mapping: add DMA_ATTR_PRIVILEGED attribute

This patch adds the DMA_ATTR_PRIVILEGED attribute to the DMA-mapping
subsystem.

Some advanced peripherals such as remote processors and GPUs perform
accesses to DMA buffers in both privileged "supervisor" and unprivileged
"user" modes.  This attribute is used to indicate to the DMA-mapping
subsystem that the buffer is fully accessible at the elevated privilege
level (and ideally inaccessible or at least read-only at the
lesser-privileged levels).

Cc: linux-doc@vger.kernel.org
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Tested-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Mitchel Humpherys <mitchelh@codeaurora.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 Documentation/DMA-attributes.txt | 10 ++++++++++
 include/linux/dma-mapping.h      |  7 +++++++
 2 files changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/DMA-attributes.txt b/Documentation/DMA-attributes.txt
index 98bf7ac29aad..44c6bc496eee 100644
--- a/Documentation/DMA-attributes.txt
+++ b/Documentation/DMA-attributes.txt
@@ -143,3 +143,13 @@ So, this provides a way for drivers to avoid those error messages on calls
 where allocation failures are not a problem, and shouldn't bother the logs.
 
 NOTE: At the moment DMA_ATTR_NO_WARN is only implemented on PowerPC.
+
+DMA_ATTR_PRIVILEGED
+------------------------------
+
+Some advanced peripherals such as remote processors and GPUs perform
+accesses to DMA buffers in both privileged "supervisor" and unprivileged
+"user" modes.  This attribute is used to indicate to the DMA-mapping
+subsystem that the buffer is fully accessible at the elevated privilege
+level (and ideally inaccessible or at least read-only at the
+lesser-privileged levels).
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 10c5a17b1f51..c24721a33b4c 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -62,6 +62,13 @@
  */
 #define DMA_ATTR_NO_WARN	(1UL << 8)
 
+/*
+ * DMA_ATTR_PRIVILEGED: used to indicate that the buffer is fully
+ * accessible at an elevated privilege level (and ideally inaccessible or
+ * at least read-only at lesser-privileged levels).
+ */
+#define DMA_ATTR_PRIVILEGED		(1UL << 9)
+
 /*
  * A dma_addr_t can hold any valid DMA or bus address for the platform.
  * It can be given to a device to use as a DMA source or target.  A CPU cannot
-- 
cgit v1.2.3


From 737c85ca1c3af4f97acb61cd53415ec039b31111 Mon Sep 17 00:00:00 2001
From: Mitchel Humpherys <mitchelh@codeaurora.org>
Date: Fri, 6 Jan 2017 18:58:12 +0530
Subject: arm64/dma-mapping: Implement DMA_ATTR_PRIVILEGED

The newly added DMA_ATTR_PRIVILEGED is useful for creating mappings that
are only accessible to privileged DMA engines.  Implement it in
dma-iommu.c so that the ARM64 DMA IOMMU mapper can make use of it.

Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Tested-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Mitchel Humpherys <mitchelh@codeaurora.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/mm/dma-mapping.c |  6 +++---
 drivers/iommu/dma-iommu.c   | 12 +++++++++---
 include/linux/dma-iommu.h   |  3 ++-
 3 files changed, 14 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index e04082700bb1..ae9f817eadf2 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -558,7 +558,7 @@ static void *__iommu_alloc_attrs(struct device *dev, size_t size,
 				 unsigned long attrs)
 {
 	bool coherent = is_device_dma_coherent(dev);
-	int ioprot = dma_direction_to_prot(DMA_BIDIRECTIONAL, coherent);
+	int ioprot = dma_info_to_prot(DMA_BIDIRECTIONAL, coherent, attrs);
 	size_t iosize = size;
 	void *addr;
 
@@ -712,7 +712,7 @@ static dma_addr_t __iommu_map_page(struct device *dev, struct page *page,
 				   unsigned long attrs)
 {
 	bool coherent = is_device_dma_coherent(dev);
-	int prot = dma_direction_to_prot(dir, coherent);
+	int prot = dma_info_to_prot(dir, coherent, attrs);
 	dma_addr_t dev_addr = iommu_dma_map_page(dev, page, offset, size, prot);
 
 	if (!iommu_dma_mapping_error(dev, dev_addr) &&
@@ -770,7 +770,7 @@ static int __iommu_map_sg_attrs(struct device *dev, struct scatterlist *sgl,
 		__iommu_sync_sg_for_device(dev, sgl, nelems, dir);
 
 	return iommu_dma_map_sg(dev, sgl, nelems,
-			dma_direction_to_prot(dir, coherent));
+				dma_info_to_prot(dir, coherent, attrs));
 }
 
 static void __iommu_unmap_sg_attrs(struct device *dev,
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 2db0d641cf45..3006eeebf521 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -181,16 +181,22 @@ int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
 EXPORT_SYMBOL(iommu_dma_init_domain);
 
 /**
- * dma_direction_to_prot - Translate DMA API directions to IOMMU API page flags
+ * dma_info_to_prot - Translate DMA API directions and attributes to IOMMU API
+ *                    page flags.
  * @dir: Direction of DMA transfer
  * @coherent: Is the DMA master cache-coherent?
+ * @attrs: DMA attributes for the mapping
  *
  * Return: corresponding IOMMU API page protection flags
  */
-int dma_direction_to_prot(enum dma_data_direction dir, bool coherent)
+int dma_info_to_prot(enum dma_data_direction dir, bool coherent,
+		     unsigned long attrs)
 {
 	int prot = coherent ? IOMMU_CACHE : 0;
 
+	if (attrs & DMA_ATTR_PRIVILEGED)
+		prot |= IOMMU_PRIV;
+
 	switch (dir) {
 	case DMA_BIDIRECTIONAL:
 		return prot | IOMMU_READ | IOMMU_WRITE;
@@ -633,7 +639,7 @@ dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys,
 		size_t size, enum dma_data_direction dir, unsigned long attrs)
 {
 	return __iommu_dma_map(dev, phys, size,
-			dma_direction_to_prot(dir, false) | IOMMU_MMIO);
+			dma_info_to_prot(dir, false, attrs) | IOMMU_MMIO);
 }
 
 void iommu_dma_unmap_resource(struct device *dev, dma_addr_t handle,
diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
index 7f7e9a7e3839..c5511e1d5560 100644
--- a/include/linux/dma-iommu.h
+++ b/include/linux/dma-iommu.h
@@ -34,7 +34,8 @@ int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
 		u64 size, struct device *dev);
 
 /* General helpers for DMA-API <-> IOMMU-API interaction */
-int dma_direction_to_prot(enum dma_data_direction dir, bool coherent);
+int dma_info_to_prot(enum dma_data_direction dir, bool coherent,
+		     unsigned long attrs);
 
 /*
  * These implement the bulk of the relevant DMA mapping callbacks, but require
-- 
cgit v1.2.3


From c92d781f1a5ea19708b1e1e2b85a3fbd4a738b30 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Mon, 16 Jan 2017 16:54:31 -0600
Subject: tty: constify tty_ldisc_receive_buf buffer pointer

This is needed to work with the client operations which uses const ptrs.

Really, the flags pointer could be const, too, but this would be a tree
wide fix.

Signed-off-by: Rob Herring <robh@kernel.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-By: Sebastian Reichel <sre@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/tty_buffer.c | 2 +-
 include/linux/tty.h      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c
index aa80dc94ddc2..f4dc3e296dd5 100644
--- a/drivers/tty/tty_buffer.c
+++ b/drivers/tty/tty_buffer.c
@@ -422,7 +422,7 @@ EXPORT_SYMBOL_GPL(tty_prepare_flip_string);
  *
  *	Returns the number of bytes not processed
  */
-int tty_ldisc_receive_buf(struct tty_ldisc *ld, unsigned char *p,
+int tty_ldisc_receive_buf(struct tty_ldisc *ld, const unsigned char *p,
 			  char *f, int count)
 {
 	if (ld->ops->receive_buf2)
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 86c7853282b7..21c0fabfed60 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -657,7 +657,7 @@ extern int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty);
 extern void tty_ldisc_release(struct tty_struct *tty);
 extern void tty_ldisc_init(struct tty_struct *tty);
 extern void tty_ldisc_deinit(struct tty_struct *tty);
-extern int tty_ldisc_receive_buf(struct tty_ldisc *ld, unsigned char *p,
+extern int tty_ldisc_receive_buf(struct tty_ldisc *ld, const unsigned char *p,
 				 char *f, int count);
 
 /* n_tty.c */
-- 
cgit v1.2.3


From 4567d686f5c6d955e57a3afa1741944c1e7f4033 Mon Sep 17 00:00:00 2001
From: Volodymyr Bendiuga <volodymyr.bendiuga@gmail.com>
Date: Thu, 19 Jan 2017 17:05:04 +0100
Subject: phy: increase size of MII_BUS_ID_SIZE and bus_id
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some bus names are pretty long and do not fit into
17 chars. Increase therefore MII_BUS_ID_SIZE and
phy_fixup.bus_id to larger number. Now mii_bus.id
can host larger name.

Signed-off-by: Volodymyr Bendiuga <volodymyr.bendiuga@gmail.com>
Signed-off-by: Magnus Öberg <magnus.oberg@westermo.se>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index f7d95f644eed..5c9d2529685f 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -158,11 +158,7 @@ static inline const char *phy_modes(phy_interface_t interface)
 /* Used when trying to connect to a specific phy (mii bus id:phy device id) */
 #define PHY_ID_FMT "%s:%02x"
 
-/*
- * Need to be a little smaller than phydev->dev.bus_id to leave room
- * for the ":%02x"
- */
-#define MII_BUS_ID_SIZE	(20 - 3)
+#define MII_BUS_ID_SIZE	61
 
 /* Or MII_ADDR_C45 into regnum for read/write on mii_bus to enable the 21 bit
    IEEE 802.3ae clause 45 addressing mode used by 10GIGE phy chips. */
@@ -632,7 +628,7 @@ struct phy_driver {
 /* A Structure for boards to register fixups with the PHY Lib */
 struct phy_fixup {
 	struct list_head list;
-	char bus_id[20];
+	char bus_id[MII_BUS_ID_SIZE + 3];
 	u32 phy_uid;
 	u32 phy_uid_mask;
 	int (*run)(struct phy_device *phydev);
-- 
cgit v1.2.3


From f9a1ef720e9e32bc6a4a382c15ac77d62749c79e Mon Sep 17 00:00:00 2001
From: Eugenia Emantayev <eugenia@mellanox.com>
Date: Mon, 10 Oct 2016 16:05:53 +0300
Subject: net/mlx5: Add MTPPS and MTPPSE registers infrastructure

Implement query and set functionality for MTPPS and MTPPSE registers.
MTPPS (Management Pulse Per Second) provides the device PPS capabilities,
configures the PPS in and out modules and holds the PPS in time stamp.
Query MTPPS is supported only when HCA_CAP.pps is set and modify is supported
when HCA_CAP.pps_modify is set.

MTPPSE (Management Pulse Per Second Event) configures the different event
generation modes for PPS. Supported when HCA_CAP.pps is set.

Signed-off-by: Eugenia Emantayev <eugenia@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c       |  9 ++++
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |  5 ++
 drivers/net/ethernet/mellanox/mlx5/core/port.c     | 48 +++++++++++++++++
 include/linux/mlx5/device.h                        | 18 +++++++
 include/linux/mlx5/driver.h                        |  3 ++
 include/linux/mlx5/mlx5_ifc.h                      | 60 +++++++++++++++++++++-
 6 files changed, 142 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 5130d65dd41a..ea5d8d37a75c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -154,6 +154,8 @@ static const char *eqe_type_str(u8 type)
 		return "MLX5_EVENT_TYPE_PAGE_REQUEST";
 	case MLX5_EVENT_TYPE_PAGE_FAULT:
 		return "MLX5_EVENT_TYPE_PAGE_FAULT";
+	case MLX5_EVENT_TYPE_PPS_EVENT:
+		return "MLX5_EVENT_TYPE_PPS_EVENT";
 	default:
 		return "Unrecognized event";
 	}
@@ -470,6 +472,10 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr)
 			mlx5_port_module_event(dev, eqe);
 			break;
 
+		case MLX5_EVENT_TYPE_PPS_EVENT:
+			if (dev->event)
+				dev->event(dev, MLX5_DEV_EVENT_PPS, (unsigned long)eqe);
+			break;
 		default:
 			mlx5_core_warn(dev, "Unhandled event 0x%x on EQ 0x%x\n",
 				       eqe->type, eq->eqn);
@@ -684,6 +690,9 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
 	else
 		mlx5_core_dbg(dev, "port_module_event is not set\n");
 
+	if (MLX5_CAP_GEN(dev, pps))
+		async_event_mask |= (1ull << MLX5_EVENT_TYPE_PPS_EVENT);
+
 	err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD,
 				 MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD,
 				 "mlx5_cmd_eq", MLX5_EQ_TYPE_ASYNC);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 74241e82de63..090e3a1dedc2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -138,6 +138,11 @@ void mlx5_encap_dealloc(struct mlx5_core_dev *dev, u32 encap_id);
 
 bool mlx5_lag_intf_add(struct mlx5_interface *intf, struct mlx5_priv *priv);
 
+int mlx5_query_mtpps(struct mlx5_core_dev *dev, u32 *mtpps, u32 mtpps_size);
+int mlx5_set_mtpps(struct mlx5_core_dev *mdev, u32 *mtpps, u32 mtpps_size);
+int mlx5_query_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 *arm, u8 *mode);
+int mlx5_set_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 arm, u8 mode);
+
 void mlx5e_init(void);
 void mlx5e_cleanup(void);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index d2ec9d232a70..5ea85336b2e4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -866,3 +866,51 @@ void mlx5_port_module_event(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe)
 			       module_num, mlx5_pme_status[module_status - 1],
 			       mlx5_pme_error[error_type]);
 }
+
+int mlx5_query_mtpps(struct mlx5_core_dev *mdev, u32 *mtpps, u32 mtpps_size)
+{
+	u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0};
+
+	return mlx5_core_access_reg(mdev, in, sizeof(in), mtpps,
+				    mtpps_size, MLX5_REG_MTPPS, 0, 0);
+}
+
+int mlx5_set_mtpps(struct mlx5_core_dev *mdev, u32 *mtpps, u32 mtpps_size)
+{
+	u32 out[MLX5_ST_SZ_DW(mtpps_reg)] = {0};
+
+	return mlx5_core_access_reg(mdev, mtpps, mtpps_size, out,
+				    sizeof(out), MLX5_REG_MTPPS, 0, 1);
+}
+
+int mlx5_query_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 *arm, u8 *mode)
+{
+	u32 out[MLX5_ST_SZ_DW(mtppse_reg)] = {0};
+	u32 in[MLX5_ST_SZ_DW(mtppse_reg)] = {0};
+	int err = 0;
+
+	MLX5_SET(mtppse_reg, in, pin, pin);
+
+	err = mlx5_core_access_reg(mdev, in, sizeof(in), out,
+				   sizeof(out), MLX5_REG_MTPPSE, 0, 0);
+	if (err)
+		return err;
+
+	*arm = MLX5_GET(mtppse_reg, in, event_arm);
+	*mode = MLX5_GET(mtppse_reg, in, event_generation_mode);
+
+	return err;
+}
+
+int mlx5_set_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 arm, u8 mode)
+{
+	u32 out[MLX5_ST_SZ_DW(mtppse_reg)] = {0};
+	u32 in[MLX5_ST_SZ_DW(mtppse_reg)] = {0};
+
+	MLX5_SET(mtppse_reg, in, pin, pin);
+	MLX5_SET(mtppse_reg, in, event_arm, arm);
+	MLX5_SET(mtppse_reg, in, event_generation_mode, mode);
+
+	return mlx5_core_access_reg(mdev, in, sizeof(in), out,
+				    sizeof(out), MLX5_REG_MTPPSE, 0, 1);
+}
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 7c5265db02a9..6ac8eb5a89ca 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -289,6 +289,7 @@ enum mlx5_event {
 	MLX5_EVENT_TYPE_GPIO_EVENT	   = 0x15,
 	MLX5_EVENT_TYPE_PORT_MODULE_EVENT  = 0x16,
 	MLX5_EVENT_TYPE_REMOTE_CONFIG	   = 0x19,
+	MLX5_EVENT_TYPE_PPS_EVENT          = 0x25,
 
 	MLX5_EVENT_TYPE_DB_BF_CONGESTION   = 0x1a,
 	MLX5_EVENT_TYPE_STALL_EVENT	   = 0x1b,
@@ -569,6 +570,22 @@ struct mlx5_eqe_port_module {
 	u8        error_type;
 } __packed;
 
+struct mlx5_eqe_pps {
+	u8		rsvd0[3];
+	u8		pin;
+	u8		rsvd1[4];
+	union {
+		struct {
+			__be32		time_sec;
+			__be32		time_nsec;
+		};
+		struct {
+			__be64		time_stamp;
+		};
+	};
+	u8		rsvd2[12];
+} __packed;
+
 union ev_data {
 	__be32				raw[7];
 	struct mlx5_eqe_cmd		cmd;
@@ -583,6 +600,7 @@ union ev_data {
 	struct mlx5_eqe_page_fault	page_fault;
 	struct mlx5_eqe_vport_change	vport_change;
 	struct mlx5_eqe_port_module	port_module;
+	struct mlx5_eqe_pps		pps;
 } __packed;
 
 struct mlx5_eqe {
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 3a309f6a4a15..ebbc8834063b 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -125,6 +125,8 @@ enum {
 	MLX5_REG_HOST_ENDIANNESS = 0x7004,
 	MLX5_REG_MCIA		 = 0x9014,
 	MLX5_REG_MLCR		 = 0x902b,
+	MLX5_REG_MTPPS		 = 0x9053,
+	MLX5_REG_MTPPSE		 = 0x9054,
 };
 
 enum mlx5_dcbx_oper_mode {
@@ -172,6 +174,7 @@ enum mlx5_dev_event {
 	MLX5_DEV_EVENT_PKEY_CHANGE,
 	MLX5_DEV_EVENT_GUID_CHANGE,
 	MLX5_DEV_EVENT_CLIENT_REREG,
+	MLX5_DEV_EVENT_PPS,
 };
 
 enum mlx5_port_status {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 37327f6ba9cb..f5292f3b0301 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -835,7 +835,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         port_type[0x2];
 	u8         num_ports[0x8];
 
-	u8         reserved_at_1c0[0x3];
+	u8         reserved_at_1c0[0x1];
+	u8         pps[0x1];
+	u8         pps_modify[0x1];
 	u8         log_max_msg[0x5];
 	u8         reserved_at_1c8[0x4];
 	u8         max_tc[0x4];
@@ -7821,6 +7823,60 @@ struct mlx5_ifc_initial_seg_bits {
 	u8         reserved_at_80a0[0x17fc0];
 };
 
+struct mlx5_ifc_mtpps_reg_bits {
+	u8         reserved_at_0[0xc];
+	u8         cap_number_of_pps_pins[0x4];
+	u8         reserved_at_10[0x4];
+	u8         cap_max_num_of_pps_in_pins[0x4];
+	u8         reserved_at_18[0x4];
+	u8         cap_max_num_of_pps_out_pins[0x4];
+
+	u8         reserved_at_20[0x24];
+	u8         cap_pin_3_mode[0x4];
+	u8         reserved_at_48[0x4];
+	u8         cap_pin_2_mode[0x4];
+	u8         reserved_at_50[0x4];
+	u8         cap_pin_1_mode[0x4];
+	u8         reserved_at_58[0x4];
+	u8         cap_pin_0_mode[0x4];
+
+	u8         reserved_at_60[0x4];
+	u8         cap_pin_7_mode[0x4];
+	u8         reserved_at_68[0x4];
+	u8         cap_pin_6_mode[0x4];
+	u8         reserved_at_70[0x4];
+	u8         cap_pin_5_mode[0x4];
+	u8         reserved_at_78[0x4];
+	u8         cap_pin_4_mode[0x4];
+
+	u8         reserved_at_80[0x80];
+
+	u8         enable[0x1];
+	u8         reserved_at_101[0xb];
+	u8         pattern[0x4];
+	u8         reserved_at_110[0x4];
+	u8         pin_mode[0x4];
+	u8         pin[0x8];
+
+	u8         reserved_at_120[0x20];
+
+	u8         time_stamp[0x40];
+
+	u8         out_pulse_duration[0x10];
+	u8         out_periodic_adjustment[0x10];
+
+	u8         reserved_at_1a0[0x60];
+};
+
+struct mlx5_ifc_mtppse_reg_bits {
+	u8         reserved_at_0[0x18];
+	u8         pin[0x8];
+	u8         event_arm[0x1];
+	u8         reserved_at_21[0x1b];
+	u8         event_generation_mode[0x4];
+	u8         reserved_at_40[0x40];
+};
+
 union mlx5_ifc_ports_control_registers_document_bits {
 	struct mlx5_ifc_bufferx_reg_bits bufferx_reg;
 	struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits eth_2819_cntrs_grp_data_layout;
@@ -7865,6 +7921,8 @@ union mlx5_ifc_ports_control_registers_document_bits {
 	struct mlx5_ifc_pvlc_reg_bits pvlc_reg;
 	struct mlx5_ifc_slrg_reg_bits slrg_reg;
 	struct mlx5_ifc_sltp_reg_bits sltp_reg;
+	struct mlx5_ifc_mtpps_reg_bits mtpps_reg;
+	struct mlx5_ifc_mtppse_reg_bits mtppse_reg;
 	u8         reserved_at_0[0x60e0];
 };
 
-- 
cgit v1.2.3


From 105433659d394b70dc3b6ceb65d0c1673e14cee1 Mon Sep 17 00:00:00 2001
From: Mohamad Haj Yahia <mohamad@mellanox.com>
Date: Sun, 9 Oct 2016 16:25:43 +0300
Subject: net/mlx5: Add support to s-tag in mlx5 firmware interface

Add svlan_tag and rename vlan_tag to cvlan_tag in flow table entry
match param.

Signed-off-by: Mohamad Haj Yahia <mohamad@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/hw/mlx5/main.c                       |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_fs.c         | 10 +++++-----
 drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c         |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c       | 12 ++++++------
 include/linux/mlx5/mlx5_ifc.h                           | 12 +++++++-----
 6 files changed, 24 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index a191b9327b0c..8727116a4cab 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1704,9 +1704,9 @@ static int parse_flow_attr(u32 *match_c, u32 *match_v,
 
 		if (ib_spec->eth.mask.vlan_tag) {
 			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
-				 vlan_tag, 1);
+				 cvlan_tag, 1);
 			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
-				 vlan_tag, 1);
+				 cvlan_tag, 1);
 
 			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
 				 first_vid, ntohs(ib_spec->eth.mask.vlan_tag));
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
index 1fe80de5d68f..7ae0744ea50f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
@@ -172,7 +172,7 @@ static int __mlx5e_add_vlan_rule(struct mlx5e_priv *priv,
 	dest.ft = priv->fs.l2.ft.t;
 
 	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
-	MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.vlan_tag);
+	MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.cvlan_tag);
 
 	switch (rule_type) {
 	case MLX5E_VLAN_RULE_TYPE_UNTAGGED:
@@ -180,11 +180,11 @@ static int __mlx5e_add_vlan_rule(struct mlx5e_priv *priv,
 		break;
 	case MLX5E_VLAN_RULE_TYPE_ANY_VID:
 		rule_p = &priv->fs.vlan.any_vlan_rule;
-		MLX5_SET(fte_match_param, spec->match_value, outer_headers.vlan_tag, 1);
+		MLX5_SET(fte_match_param, spec->match_value, outer_headers.cvlan_tag, 1);
 		break;
 	default: /* MLX5E_VLAN_RULE_TYPE_MATCH_VID */
 		rule_p = &priv->fs.vlan.active_vlans_rule[vid];
-		MLX5_SET(fte_match_param, spec->match_value, outer_headers.vlan_tag, 1);
+		MLX5_SET(fte_match_param, spec->match_value, outer_headers.cvlan_tag, 1);
 		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria,
 				 outer_headers.first_vid);
 		MLX5_SET(fte_match_param, spec->match_value, outer_headers.first_vid,
@@ -991,7 +991,7 @@ static int __mlx5e_create_vlan_table_groups(struct mlx5e_flow_table *ft, u32 *in
 
 	memset(in, 0, inlen);
 	MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
-	MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.vlan_tag);
+	MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.cvlan_tag);
 	MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.first_vid);
 	MLX5_SET_CFG(in, start_flow_index, ix);
 	ix += MLX5E_VLAN_GROUP0_SIZE;
@@ -1003,7 +1003,7 @@ static int __mlx5e_create_vlan_table_groups(struct mlx5e_flow_table *ft, u32 *in
 
 	memset(in, 0, inlen);
 	MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
-	MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.vlan_tag);
+	MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.cvlan_tag);
 	MLX5_SET_CFG(in, start_flow_index, ix);
 	ix += MLX5E_VLAN_GROUP1_SIZE;
 	MLX5_SET_CFG(in, end_flow_index, ix - 1);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
index d088effd7160..4b4323f3c158 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
@@ -237,9 +237,9 @@ static int set_flow_attrs(u32 *match_c, u32 *match_v,
 	if ((fs->flow_type & FLOW_EXT) &&
 	    (fs->m_ext.vlan_tci & cpu_to_be16(VLAN_VID_MASK))) {
 		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
-			 vlan_tag, 1);
+			 cvlan_tag, 1);
 		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
-			 vlan_tag, 1);
+			 cvlan_tag, 1);
 		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
 			 first_vid, 0xfff);
 		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 46bef6a26a8c..d4af5507679f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -460,8 +460,8 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
 						  FLOW_DISSECTOR_KEY_VLAN,
 						  f->mask);
 		if (mask->vlan_id || mask->vlan_priority) {
-			MLX5_SET(fte_match_set_lyr_2_4, headers_c, vlan_tag, 1);
-			MLX5_SET(fte_match_set_lyr_2_4, headers_v, vlan_tag, 1);
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c, cvlan_tag, 1);
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v, cvlan_tag, 1);
 
 			MLX5_SET(fte_match_set_lyr_2_4, headers_c, first_vid, mask->vlan_id);
 			MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_vid, key->vlan_id);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index f14d9c9ba773..4b3b60be319d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -979,7 +979,7 @@ static int esw_vport_enable_egress_acl(struct mlx5_eswitch *esw,
 
 	MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
 	match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria);
-	MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.vlan_tag);
+	MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag);
 	MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.first_vid);
 	MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0);
 	MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 0);
@@ -1098,7 +1098,7 @@ static int esw_vport_enable_ingress_acl(struct mlx5_eswitch *esw,
 	match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria);
 
 	MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
-	MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.vlan_tag);
+	MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag);
 	MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_47_16);
 	MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_15_0);
 	MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0);
@@ -1115,7 +1115,7 @@ static int esw_vport_enable_ingress_acl(struct mlx5_eswitch *esw,
 
 	memset(flow_group_in, 0, inlen);
 	MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
-	MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.vlan_tag);
+	MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag);
 	MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 1);
 	MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 1);
 
@@ -1254,7 +1254,7 @@ static int esw_vport_ingress_config(struct mlx5_eswitch *esw,
 	}
 
 	if (vport->info.vlan || vport->info.qos)
-		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.vlan_tag);
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.cvlan_tag);
 
 	if (vport->info.spoofchk) {
 		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.smac_47_16);
@@ -1335,8 +1335,8 @@ static int esw_vport_egress_config(struct mlx5_eswitch *esw,
 	}
 
 	/* Allowed vlan rule */
-	MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.vlan_tag);
-	MLX5_SET_TO_ONES(fte_match_param, spec->match_value, outer_headers.vlan_tag);
+	MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.cvlan_tag);
+	MLX5_SET_TO_ONES(fte_match_param, spec->match_value, outer_headers.cvlan_tag);
 	MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.first_vid);
 	MLX5_SET(fte_match_param, spec->match_value, outer_headers.first_vid, vport->info.vlan);
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index f5292f3b0301..6f19e4b8574f 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -365,8 +365,8 @@ struct mlx5_ifc_fte_match_set_lyr_2_4_bits {
 	u8         ip_protocol[0x8];
 	u8         ip_dscp[0x6];
 	u8         ip_ecn[0x2];
-	u8         vlan_tag[0x1];
-	u8         reserved_at_91[0x1];
+	u8         cvlan_tag[0x1];
+	u8         svlan_tag[0x1];
 	u8         frag[0x1];
 	u8         reserved_at_93[0x4];
 	u8         tcp_flags[0x9];
@@ -398,9 +398,11 @@ struct mlx5_ifc_fte_match_set_misc_bits {
 	u8         inner_second_cfi[0x1];
 	u8         inner_second_vid[0xc];
 
-	u8         outer_second_vlan_tag[0x1];
-	u8         inner_second_vlan_tag[0x1];
-	u8         reserved_at_62[0xe];
+	u8         outer_second_cvlan_tag[0x1];
+	u8         inner_second_cvlan_tag[0x1];
+	u8         outer_second_svlan_tag[0x1];
+	u8         inner_second_svlan_tag[0x1];
+	u8         reserved_at_64[0xc];
 	u8         gre_protocol[0x10];
 
 	u8         gre_key_h[0x18];
-- 
cgit v1.2.3


From cfdcbceaeffc669b70d904d80a2df9c86c232566 Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Thu, 8 Dec 2016 15:52:00 +0200
Subject: net/mlx5: Expose PCAM, MCAM registers infrastructure

PCAM: Ports capabilities mask register.
MCAM: Management capabilities mask register.

PCAM and MCAM registers will provide information regarding firmware
support for different features, in order to avoid cases where new driver
combined with old firmware results in syndromes (for ex. PCIe counters
before this patchset).

Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/device.h   | 16 ++++++++++++
 include/linux/mlx5/driver.h   |  2 ++
 include/linux/mlx5/mlx5_ifc.h | 60 ++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 77 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 6ac8eb5a89ca..79f38e67fe2f 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -970,6 +970,22 @@ enum mlx5_cap_type {
 	MLX5_CAP_NUM
 };
 
+enum mlx5_pcam_reg_groups {
+	MLX5_PCAM_REGS_5000_TO_507F                 = 0x0,
+};
+
+enum mlx5_pcam_feature_groups {
+	MLX5_PCAM_FEATURE_ENHANCED_FEATURES         = 0x0,
+};
+
+enum mlx5_mcam_reg_groups {
+	MLX5_MCAM_REGS_FIRST_128                    = 0x0,
+};
+
+enum mlx5_mcam_feature_groups {
+	MLX5_MCAM_FEATURE_ENHANCED_FEATURES         = 0x0,
+};
+
 /* GET Dev Caps macros */
 #define MLX5_CAP_GEN(mdev, cap) \
 	MLX5_GET(cmd_hca_cap, mdev->hca_caps_cur[MLX5_CAP_GENERAL], cap)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index ebbc8834063b..60c2b156da8c 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -121,12 +121,14 @@ enum {
 	MLX5_REG_PVLC		 = 0x500f,
 	MLX5_REG_PCMR		 = 0x5041,
 	MLX5_REG_PMLP		 = 0x5002,
+	MLX5_REG_PCAM		 = 0x507f,
 	MLX5_REG_NODE_DESC	 = 0x6001,
 	MLX5_REG_HOST_ENDIANNESS = 0x7004,
 	MLX5_REG_MCIA		 = 0x9014,
 	MLX5_REG_MLCR		 = 0x902b,
 	MLX5_REG_MTPPS		 = 0x9053,
 	MLX5_REG_MTPPSE		 = 0x9054,
+	MLX5_REG_MCAM		 = 0x907f,
 };
 
 enum mlx5_dcbx_oper_mode {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 6f19e4b8574f..e8061a95326a 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -826,7 +826,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         nic_flow_table[0x1];
 	u8         eswitch_flow_table[0x1];
 	u8	   early_vf_enable[0x1];
-	u8         reserved_at_1a9[0x2];
+	u8         mcam_reg[0x1];
+	u8         pcam_reg[0x1];
 	u8         local_ca_ack_delay[0x5];
 	u8         port_module_event[0x1];
 	u8         reserved_at_1b1[0x1];
@@ -7481,6 +7482,63 @@ struct mlx5_ifc_peir_reg_bits {
 	u8         error_type[0x8];
 };
 
+struct mlx5_ifc_pcam_enhanced_features_bits {
+	u8         reserved_at_0[0x7e];
+
+	u8         ppcnt_discard_group[0x1];
+	u8         ppcnt_statistical_group[0x1];
+};
+
+struct mlx5_ifc_pcam_reg_bits {
+	u8         reserved_at_0[0x8];
+	u8         feature_group[0x8];
+	u8         reserved_at_10[0x8];
+	u8         access_reg_group[0x8];
+
+	u8         reserved_at_20[0x20];
+
+	union {
+		u8         reserved_at_0[0x80];
+	} port_access_reg_cap_mask;
+
+	u8         reserved_at_c0[0x80];
+
+	union {
+		struct mlx5_ifc_pcam_enhanced_features_bits enhanced_features;
+		u8         reserved_at_0[0x80];
+	} feature_cap_mask;
+
+	u8         reserved_at_1c0[0xc0];
+};
+
+struct mlx5_ifc_mcam_enhanced_features_bits {
+	u8         reserved_at_0[0x7f];
+
+	u8         pcie_performance_group[0x1];
+};
+
+struct mlx5_ifc_mcam_reg_bits {
+	u8         reserved_at_0[0x8];
+	u8         feature_group[0x8];
+	u8         reserved_at_10[0x8];
+	u8         access_reg_group[0x8];
+
+	u8         reserved_at_20[0x20];
+
+	union {
+		u8         reserved_at_0[0x80];
+	} mng_access_reg_cap_mask;
+
+	u8         reserved_at_c0[0x80];
+
+	union {
+		struct mlx5_ifc_mcam_enhanced_features_bits enhanced_features;
+		u8         reserved_at_0[0x80];
+	} mng_feature_cap_mask;
+
+	u8         reserved_at_1c0[0x80];
+};
+
 struct mlx5_ifc_pcap_reg_bits {
 	u8         reserved_at_0[0x8];
 	u8         local_port[0x8];
-- 
cgit v1.2.3


From 71862561f3a62015a11de16d1c306481e8415c08 Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Thu, 8 Dec 2016 16:03:31 +0200
Subject: net/mlx5: Query and cache PCAM, MCAM registers on initialization

On load_one, we now cache our capabilities registers internally, similar
to QUERY_HCA_CAP. Capabilities can later be queried using macros
introduced in this patch.

Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fw.c | 20 ++++++++++++++++++++
 include/linux/mlx5/device.h                  |  6 ++++++
 include/linux/mlx5/driver.h                  |  4 ++++
 3 files changed, 30 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
index 5718aada6605..d0bbefa08af7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
@@ -91,6 +91,20 @@ out:
 }
 EXPORT_SYMBOL(mlx5_core_query_vendor_id);
 
+static int mlx5_get_pcam_reg(struct mlx5_core_dev *dev)
+{
+	return mlx5_query_pcam_reg(dev, dev->caps.pcam,
+				   MLX5_PCAM_FEATURE_ENHANCED_FEATURES,
+				   MLX5_PCAM_REGS_5000_TO_507F);
+}
+
+static int mlx5_get_mcam_reg(struct mlx5_core_dev *dev)
+{
+	return mlx5_query_mcam_reg(dev, dev->caps.mcam,
+				   MLX5_MCAM_FEATURE_ENHANCED_FEATURES,
+				   MLX5_MCAM_REGS_FIRST_128);
+}
+
 int mlx5_query_hca_caps(struct mlx5_core_dev *dev)
 {
 	int err;
@@ -154,6 +168,12 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev)
 			return err;
 	}
 
+	if (MLX5_CAP_GEN(dev, pcam_reg))
+		mlx5_get_pcam_reg(dev);
+
+	if (MLX5_CAP_GEN(dev, mcam_reg))
+		mlx5_get_mcam_reg(dev);
+
 	return 0;
 }
 
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 79f38e67fe2f..f21528abfb74 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1081,6 +1081,12 @@ enum mlx5_mcam_feature_groups {
 #define MLX5_CAP_QOS(mdev, cap)\
 	MLX5_GET(qos_cap, mdev->hca_caps_cur[MLX5_CAP_QOS], cap)
 
+#define MLX5_CAP_PCAM_FEATURE(mdev, fld) \
+	MLX5_GET(pcam_reg, (mdev)->caps.pcam, feature_cap_mask.enhanced_features.fld)
+
+#define MLX5_CAP_MCAM_FEATURE(mdev, fld) \
+	MLX5_GET(mcam_reg, (mdev)->caps.mcam, mng_feature_cap_mask.enhanced_features.fld)
+
 enum {
 	MLX5_CMD_STAT_OK			= 0x0,
 	MLX5_CMD_STAT_INT_ERR			= 0x1,
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 60c2b156da8c..69c4661d391e 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -739,6 +739,10 @@ struct mlx5_core_dev {
 	struct mlx5_port_caps	port_caps[MLX5_MAX_PORTS];
 	u32 hca_caps_cur[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)];
 	u32 hca_caps_max[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)];
+	struct {
+		u32 pcam[MLX5_ST_SZ_DW(pcam_reg)];
+		u32 mcam[MLX5_ST_SZ_DW(mcam_reg)];
+	} caps;
 	phys_addr_t		iseg_base;
 	struct mlx5_init_seg __iomem *iseg;
 	enum mlx5_device_state	state;
-- 
cgit v1.2.3


From d8dc0508c50f838f8abcf023cd74ca9a0f86b5a8 Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Tue, 27 Sep 2016 17:04:51 +0300
Subject: net/mlx5: Add PPCNT physical layer statistical group infrastructure

Add the needed infrastructure for future use of PPCNT physical layer
statistical group.

Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/device.h   |  1 +
 include/linux/mlx5/mlx5_ifc.h | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index f21528abfb74..5ad1d3ca47dc 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1115,6 +1115,7 @@ enum {
 	MLX5_PER_PRIORITY_COUNTERS_GROUP      = 0x10,
 	MLX5_PER_TRAFFIC_CLASS_COUNTERS_GROUP = 0x11,
 	MLX5_PHYSICAL_LAYER_COUNTERS_GROUP    = 0x12,
+	MLX5_PHYSICAL_LAYER_STATISTICAL_GROUP = 0x16,
 	MLX5_INFINIBAND_PORT_COUNTERS_GROUP   = 0x20,
 };
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index e8061a95326a..f4860fa719d9 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1384,6 +1384,42 @@ struct mlx5_ifc_phys_layer_cntrs_bits {
 	u8         reserved_at_640[0x180];
 };
 
+struct mlx5_ifc_phys_layer_statistical_cntrs_bits {
+	u8         time_since_last_clear_high[0x20];
+
+	u8         time_since_last_clear_low[0x20];
+
+	u8         phy_received_bits_high[0x20];
+
+	u8         phy_received_bits_low[0x20];
+
+	u8         phy_symbol_errors_high[0x20];
+
+	u8         phy_symbol_errors_low[0x20];
+
+	u8         phy_corrected_bits_high[0x20];
+
+	u8         phy_corrected_bits_low[0x20];
+
+	u8         phy_corrected_bits_lane0_high[0x20];
+
+	u8         phy_corrected_bits_lane0_low[0x20];
+
+	u8         phy_corrected_bits_lane1_high[0x20];
+
+	u8         phy_corrected_bits_lane1_low[0x20];
+
+	u8         phy_corrected_bits_lane2_high[0x20];
+
+	u8         phy_corrected_bits_lane2_low[0x20];
+
+	u8         phy_corrected_bits_lane3_high[0x20];
+
+	u8         phy_corrected_bits_lane3_low[0x20];
+
+	u8         reserved_at_200[0x5c0];
+};
+
 struct mlx5_ifc_ib_port_cntrs_grp_data_layout_bits {
 	u8	   symbol_error_counter[0x10];
 
@@ -2928,6 +2964,7 @@ union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits {
 	struct mlx5_ifc_eth_per_traffic_grp_data_layout_bits eth_per_traffic_grp_data_layout;
 	struct mlx5_ifc_ib_port_cntrs_grp_data_layout_bits ib_port_cntrs_grp_data_layout;
 	struct mlx5_ifc_phys_layer_cntrs_bits phys_layer_cntrs;
+	struct mlx5_ifc_phys_layer_statistical_cntrs_bits phys_layer_statistical_cntrs;
 	u8         reserved_at_0[0x7c0];
 };
 
-- 
cgit v1.2.3


From 8ed1a6306dc7892b63be7cdb1e3b1123265f42ff Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Thu, 17 Nov 2016 13:46:01 +0200
Subject: net/mlx5: Add MPCNT register infrastructure

Add the needed infrastructure for future use of MPCNT register.

Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/device.h   |  4 ++++
 include/linux/mlx5/driver.h   |  1 +
 include/linux/mlx5/mlx5_ifc.h | 42 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 47 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 5ad1d3ca47dc..1cf97dce8215 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1119,6 +1119,10 @@ enum {
 	MLX5_INFINIBAND_PORT_COUNTERS_GROUP   = 0x20,
 };
 
+enum {
+	MLX5_PCIE_PERFORMANCE_COUNTERS_GROUP       = 0x0,
+};
+
 static inline u16 mlx5_to_sw_pkey_sz(int pkey_sz)
 {
 	if (pkey_sz > MLX5_MAX_LOG_PKEY_TABLE)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 69c4661d391e..f4d6d390a9cf 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -126,6 +126,7 @@ enum {
 	MLX5_REG_HOST_ENDIANNESS = 0x7004,
 	MLX5_REG_MCIA		 = 0x9014,
 	MLX5_REG_MLCR		 = 0x902b,
+	MLX5_REG_MPCNT		 = 0x9051,
 	MLX5_REG_MTPPS		 = 0x9053,
 	MLX5_REG_MTPPSE		 = 0x9054,
 	MLX5_REG_MCAM		 = 0x907f,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index f4860fa719d9..d96ebc319d63 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1802,6 +1802,30 @@ struct mlx5_ifc_eth_802_3_cntrs_grp_data_layout_bits {
 	u8         reserved_at_4c0[0x300];
 };
 
+struct mlx5_ifc_pcie_perf_cntrs_grp_data_layout_bits {
+	u8         life_time_counter_high[0x20];
+
+	u8         life_time_counter_low[0x20];
+
+	u8         rx_errors[0x20];
+
+	u8         tx_errors[0x20];
+
+	u8         l0_to_recovery_eieos[0x20];
+
+	u8         l0_to_recovery_ts[0x20];
+
+	u8         l0_to_recovery_framing[0x20];
+
+	u8         l0_to_recovery_retrain[0x20];
+
+	u8         crc_error_dllp[0x20];
+
+	u8         crc_error_tlp[0x20];
+
+	u8         reserved_at_140[0x680];
+};
+
 struct mlx5_ifc_cmd_inter_comp_event_bits {
 	u8         command_completion_vector[0x20];
 
@@ -2968,6 +2992,11 @@ union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits {
 	u8         reserved_at_0[0x7c0];
 };
 
+union mlx5_ifc_pcie_cntrs_grp_data_layout_auto_bits {
+	struct mlx5_ifc_pcie_perf_cntrs_grp_data_layout_bits pcie_perf_cntrs_grp_data_layout;
+	u8         reserved_at_0[0x7c0];
+};
+
 union mlx5_ifc_event_auto_bits {
 	struct mlx5_ifc_comp_event_bits comp_event;
 	struct mlx5_ifc_dct_events_bits dct_events;
@@ -7290,6 +7319,18 @@ struct mlx5_ifc_ppcnt_reg_bits {
 	union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits counter_set;
 };
 
+struct mlx5_ifc_mpcnt_reg_bits {
+	u8         reserved_at_0[0x8];
+	u8         pcie_index[0x8];
+	u8         reserved_at_10[0xa];
+	u8         grp[0x6];
+
+	u8         clr[0x1];
+	u8         reserved_at_21[0x1f];
+
+	union mlx5_ifc_pcie_cntrs_grp_data_layout_auto_bits counter_set;
+};
+
 struct mlx5_ifc_ppad_reg_bits {
 	u8         reserved_at_0[0x3];
 	u8         single_mac[0x1];
@@ -8006,6 +8047,7 @@ union mlx5_ifc_ports_control_registers_document_bits {
 	struct mlx5_ifc_pmtu_reg_bits pmtu_reg;
 	struct mlx5_ifc_ppad_reg_bits ppad_reg;
 	struct mlx5_ifc_ppcnt_reg_bits ppcnt_reg;
+	struct mlx5_ifc_mpcnt_reg_bits mpcnt_reg;
 	struct mlx5_ifc_pplm_reg_bits pplm_reg;
 	struct mlx5_ifc_pplr_reg_bits pplr_reg;
 	struct mlx5_ifc_ppsc_reg_bits ppsc_reg;
-- 
cgit v1.2.3


From 701052c578195e6e02a22647fa6fd1c90c31dafd Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Wed, 14 Dec 2016 17:40:41 +0200
Subject: net/mlx5: Move cached hca caps to designated caps struct

The caps structure consists of hca caps and port/management caps,
all under one roof.

Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/main.c    |  6 ++--
 include/linux/mlx5/device.h                       | 34 +++++++++++------------
 include/linux/mlx5/driver.h                       |  4 +--
 4 files changed, 23 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 0ac7a2fc916c..85ff4b843b4b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -1665,7 +1665,7 @@ static int create_leaf_prios(struct mlx5_flow_namespace *ns, int prio,
 
 #define FLOW_TABLE_BIT_SZ 1
 #define GET_FLOW_TABLE_CAP(dev, offset) \
-	((be32_to_cpu(*((__be32 *)(dev->hca_caps_cur[MLX5_CAP_FLOW_TABLE]) +	\
+	((be32_to_cpu(*((__be32 *)(dev->caps.hca_cur[MLX5_CAP_FLOW_TABLE]) +	\
 			offset / 32)) >>					\
 	  (32 - FLOW_TABLE_BIT_SZ - (offset & 0x1f))) & FLOW_TABLE_BIT_SZ)
 static bool has_required_caps(struct mlx5_core_dev *dev, struct node_caps *caps)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 5d160f33bc17..84f7970c5080 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -418,11 +418,11 @@ static int mlx5_core_get_caps_mode(struct mlx5_core_dev *dev,
 
 	switch (cap_mode) {
 	case HCA_CAP_OPMOD_GET_MAX:
-		memcpy(dev->hca_caps_max[cap_type], hca_caps,
+		memcpy(dev->caps.hca_max[cap_type], hca_caps,
 		       MLX5_UN_SZ_BYTES(hca_cap_union));
 		break;
 	case HCA_CAP_OPMOD_GET_CUR:
-		memcpy(dev->hca_caps_cur[cap_type], hca_caps,
+		memcpy(dev->caps.hca_cur[cap_type], hca_caps,
 		       MLX5_UN_SZ_BYTES(hca_cap_union));
 		break;
 	default:
@@ -513,7 +513,7 @@ static int handle_hca_cap(struct mlx5_core_dev *dev)
 
 	set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx,
 				   capability);
-	memcpy(set_hca_cap, dev->hca_caps_cur[MLX5_CAP_GENERAL],
+	memcpy(set_hca_cap, dev->caps.hca_cur[MLX5_CAP_GENERAL],
 	       MLX5_ST_SZ_BYTES(cmd_hca_cap));
 
 	mlx5_core_dbg(dev, "Current Pkey table size %d Setting new size %d\n",
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 1cf97dce8215..7b6cd67a263f 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -988,36 +988,36 @@ enum mlx5_mcam_feature_groups {
 
 /* GET Dev Caps macros */
 #define MLX5_CAP_GEN(mdev, cap) \
-	MLX5_GET(cmd_hca_cap, mdev->hca_caps_cur[MLX5_CAP_GENERAL], cap)
+	MLX5_GET(cmd_hca_cap, mdev->caps.hca_cur[MLX5_CAP_GENERAL], cap)
 
 #define MLX5_CAP_GEN_MAX(mdev, cap) \
-	MLX5_GET(cmd_hca_cap, mdev->hca_caps_max[MLX5_CAP_GENERAL], cap)
+	MLX5_GET(cmd_hca_cap, mdev->caps.hca_max[MLX5_CAP_GENERAL], cap)
 
 #define MLX5_CAP_ETH(mdev, cap) \
 	MLX5_GET(per_protocol_networking_offload_caps,\
-		 mdev->hca_caps_cur[MLX5_CAP_ETHERNET_OFFLOADS], cap)
+		 mdev->caps.hca_cur[MLX5_CAP_ETHERNET_OFFLOADS], cap)
 
 #define MLX5_CAP_ETH_MAX(mdev, cap) \
 	MLX5_GET(per_protocol_networking_offload_caps,\
-		 mdev->hca_caps_max[MLX5_CAP_ETHERNET_OFFLOADS], cap)
+		 mdev->caps.hca_max[MLX5_CAP_ETHERNET_OFFLOADS], cap)
 
 #define MLX5_CAP_ROCE(mdev, cap) \
-	MLX5_GET(roce_cap, mdev->hca_caps_cur[MLX5_CAP_ROCE], cap)
+	MLX5_GET(roce_cap, mdev->caps.hca_cur[MLX5_CAP_ROCE], cap)
 
 #define MLX5_CAP_ROCE_MAX(mdev, cap) \
-	MLX5_GET(roce_cap, mdev->hca_caps_max[MLX5_CAP_ROCE], cap)
+	MLX5_GET(roce_cap, mdev->caps.hca_max[MLX5_CAP_ROCE], cap)
 
 #define MLX5_CAP_ATOMIC(mdev, cap) \
-	MLX5_GET(atomic_caps, mdev->hca_caps_cur[MLX5_CAP_ATOMIC], cap)
+	MLX5_GET(atomic_caps, mdev->caps.hca_cur[MLX5_CAP_ATOMIC], cap)
 
 #define MLX5_CAP_ATOMIC_MAX(mdev, cap) \
-	MLX5_GET(atomic_caps, mdev->hca_caps_max[MLX5_CAP_ATOMIC], cap)
+	MLX5_GET(atomic_caps, mdev->caps.hca_max[MLX5_CAP_ATOMIC], cap)
 
 #define MLX5_CAP_FLOWTABLE(mdev, cap) \
-	MLX5_GET(flow_table_nic_cap, mdev->hca_caps_cur[MLX5_CAP_FLOW_TABLE], cap)
+	MLX5_GET(flow_table_nic_cap, mdev->caps.hca_cur[MLX5_CAP_FLOW_TABLE], cap)
 
 #define MLX5_CAP_FLOWTABLE_MAX(mdev, cap) \
-	MLX5_GET(flow_table_nic_cap, mdev->hca_caps_max[MLX5_CAP_FLOW_TABLE], cap)
+	MLX5_GET(flow_table_nic_cap, mdev->caps.hca_max[MLX5_CAP_FLOW_TABLE], cap)
 
 #define MLX5_CAP_FLOWTABLE_NIC_RX(mdev, cap) \
 	MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_receive.cap)
@@ -1039,11 +1039,11 @@ enum mlx5_mcam_feature_groups {
 
 #define MLX5_CAP_ESW_FLOWTABLE(mdev, cap) \
 	MLX5_GET(flow_table_eswitch_cap, \
-		 mdev->hca_caps_cur[MLX5_CAP_ESWITCH_FLOW_TABLE], cap)
+		 mdev->caps.hca_cur[MLX5_CAP_ESWITCH_FLOW_TABLE], cap)
 
 #define MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, cap) \
 	MLX5_GET(flow_table_eswitch_cap, \
-		 mdev->hca_caps_max[MLX5_CAP_ESWITCH_FLOW_TABLE], cap)
+		 mdev->caps.hca_max[MLX5_CAP_ESWITCH_FLOW_TABLE], cap)
 
 #define MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, cap) \
 	MLX5_CAP_ESW_FLOWTABLE(mdev, flow_table_properties_nic_esw_fdb.cap)
@@ -1065,21 +1065,21 @@ enum mlx5_mcam_feature_groups {
 
 #define MLX5_CAP_ESW(mdev, cap) \
 	MLX5_GET(e_switch_cap, \
-		 mdev->hca_caps_cur[MLX5_CAP_ESWITCH], cap)
+		 mdev->caps.hca_cur[MLX5_CAP_ESWITCH], cap)
 
 #define MLX5_CAP_ESW_MAX(mdev, cap) \
 	MLX5_GET(e_switch_cap, \
-		 mdev->hca_caps_max[MLX5_CAP_ESWITCH], cap)
+		 mdev->caps.hca_max[MLX5_CAP_ESWITCH], cap)
 
 #define MLX5_CAP_ODP(mdev, cap)\
-	MLX5_GET(odp_cap, mdev->hca_caps_cur[MLX5_CAP_ODP], cap)
+	MLX5_GET(odp_cap, mdev->caps.hca_cur[MLX5_CAP_ODP], cap)
 
 #define MLX5_CAP_VECTOR_CALC(mdev, cap) \
 	MLX5_GET(vector_calc_cap, \
-		 mdev->hca_caps_cur[MLX5_CAP_VECTOR_CALC], cap)
+		 mdev->caps.hca_cur[MLX5_CAP_VECTOR_CALC], cap)
 
 #define MLX5_CAP_QOS(mdev, cap)\
-	MLX5_GET(qos_cap, mdev->hca_caps_cur[MLX5_CAP_QOS], cap)
+	MLX5_GET(qos_cap, mdev->caps.hca_cur[MLX5_CAP_QOS], cap)
 
 #define MLX5_CAP_PCAM_FEATURE(mdev, fld) \
 	MLX5_GET(pcam_reg, (mdev)->caps.pcam, feature_cap_mask.enhanced_features.fld)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f4d6d390a9cf..1bc4641734da 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -738,9 +738,9 @@ struct mlx5_core_dev {
 	char			board_id[MLX5_BOARD_ID_LEN];
 	struct mlx5_cmd		cmd;
 	struct mlx5_port_caps	port_caps[MLX5_MAX_PORTS];
-	u32 hca_caps_cur[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)];
-	u32 hca_caps_max[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)];
 	struct {
+		u32 hca_cur[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)];
+		u32 hca_max[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)];
 		u32 pcam[MLX5_ST_SZ_DW(pcam_reg)];
 		u32 mcam[MLX5_ST_SZ_DW(mcam_reg)];
 	} caps;
-- 
cgit v1.2.3


From acb04058de49458010c44bb35b849d45113fd668 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 19 Jan 2017 14:36:33 +0100
Subject: sched/clock: Fix hotplug crash

Mike reported that he could trigger the WARN_ON_ONCE() in
set_sched_clock_stable() using hotplug.

This exposed a fundamental problem with the interface, we should never
mark the TSC stable if we ever find it to be unstable. Therefore
set_sched_clock_stable() is a broken interface.

The reason it existed is that not having it is a pain, it means all
relevant architecture code needs to call clear_sched_clock_stable()
where appropriate.

Of the three architectures that select HAVE_UNSTABLE_SCHED_CLOCK ia64
and parisc are trivial in that they never called
set_sched_clock_stable(), so add an unconditional call to
clear_sched_clock_stable() to them.

For x86 the story is a lot more involved, and what this patch tries to
do is ensure we preserve the status quo. So even is Cyrix or Transmeta
have usable TSC they never called set_sched_clock_stable() so they now
get an explicit mark unstable.

Reported-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 9881b024b7d7 ("sched/clock: Delay switching sched_clock to stable")
Link: http://lkml.kernel.org/r/20170119133633.GB6536@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/ia64/kernel/setup.c        |  2 ++
 arch/parisc/kernel/setup.c      |  2 ++
 arch/x86/kernel/cpu/amd.c       |  6 ++++--
 arch/x86/kernel/cpu/centaur.c   |  6 ++++--
 arch/x86/kernel/cpu/common.c    |  3 +++
 arch/x86/kernel/cpu/cyrix.c     |  2 ++
 arch/x86/kernel/cpu/intel.c     |  6 ++++--
 arch/x86/kernel/cpu/transmeta.c |  3 +++
 arch/x86/kernel/kvmclock.c      |  2 +-
 include/linux/sched.h           |  1 -
 kernel/sched/clock.c            | 29 ++++++++---------------------
 11 files changed, 33 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index 7ec7acc844c2..c483ece3eb84 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -619,6 +619,8 @@ setup_arch (char **cmdline_p)
 	check_sal_cache_flush();
 #endif
 	paging_init();
+
+	clear_sched_clock_stable();
 }
 
 /*
diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c
index 2e66a887788e..068ed3607bac 100644
--- a/arch/parisc/kernel/setup.c
+++ b/arch/parisc/kernel/setup.c
@@ -36,6 +36,7 @@
 #undef PCI_DEBUG
 #include <linux/proc_fs.h>
 #include <linux/export.h>
+#include <linux/sched.h>
 
 #include <asm/processor.h>
 #include <asm/sections.h>
@@ -176,6 +177,7 @@ void __init setup_arch(char **cmdline_p)
 	conswitchp = &dummy_con;	/* we use do_take_over_console() later ! */
 #endif
 
+	clear_sched_clock_stable();
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 71cae73a5076..1bb253a6ee4d 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -548,8 +548,10 @@ static void early_init_amd(struct cpuinfo_x86 *c)
 	if (c->x86_power & (1 << 8)) {
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
-		if (!check_tsc_unstable())
-			set_sched_clock_stable();
+		if (check_tsc_unstable())
+			clear_sched_clock_stable();
+	} else {
+		clear_sched_clock_stable();
 	}
 
 	/* Bit 12 of 8000_0007 edx is accumulated power mechanism. */
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 1661d8ec9280..2c234a6d94c4 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -1,5 +1,5 @@
-#include <linux/bitops.h>
-#include <linux/kernel.h>
+
+#include <linux/sched.h>
 
 #include <asm/cpufeature.h>
 #include <asm/e820.h>
@@ -104,6 +104,8 @@ static void early_init_centaur(struct cpuinfo_x86 *c)
 #ifdef CONFIG_X86_64
 	set_cpu_cap(c, X86_FEATURE_SYSENTER32);
 #endif
+
+	clear_sched_clock_stable();
 }
 
 static void init_centaur(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index dc1697ca5191..3457186275a0 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -83,6 +83,7 @@ static void default_init(struct cpuinfo_x86 *c)
 			strcpy(c->x86_model_id, "386");
 	}
 #endif
+	clear_sched_clock_stable();
 }
 
 static const struct cpu_dev default_cpu = {
@@ -1055,6 +1056,8 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 	 */
 	if (this_cpu->c_init)
 		this_cpu->c_init(c);
+	else
+		clear_sched_clock_stable();
 
 	/* Disable the PN if appropriate */
 	squash_the_stupid_serial_number(c);
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index bd9dcd6b712d..47416f959a48 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -9,6 +9,7 @@
 #include <asm/pci-direct.h>
 #include <asm/tsc.h>
 #include <asm/cpufeature.h>
+#include <linux/sched.h>
 
 #include "cpu.h"
 
@@ -183,6 +184,7 @@ static void early_init_cyrix(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
 		break;
 	}
+	clear_sched_clock_stable();
 }
 
 static void init_cyrix(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index fcd484d2bb03..26eaff4907b2 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -124,8 +124,10 @@ static void early_init_intel(struct cpuinfo_x86 *c)
 	if (c->x86_power & (1 << 8)) {
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
-		if (!check_tsc_unstable())
-			set_sched_clock_stable();
+		if (check_tsc_unstable())
+			clear_sched_clock_stable();
+	} else {
+		clear_sched_clock_stable();
 	}
 
 	/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index 34178564be2a..c1ea5b999839 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -1,4 +1,5 @@
 #include <linux/kernel.h>
+#include <linux/sched.h>
 #include <linux/mm.h>
 #include <asm/cpufeature.h>
 #include <asm/msr.h>
@@ -14,6 +15,8 @@ static void early_init_transmeta(struct cpuinfo_x86 *c)
 		if (xlvl >= 0x80860001)
 			c->x86_capability[CPUID_8086_0001_EDX] = cpuid_edx(0x80860001);
 	}
+
+	clear_sched_clock_stable();
 }
 
 static void init_transmeta(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 2a5cafdf8808..542710b99f52 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -107,12 +107,12 @@ static inline void kvm_sched_clock_init(bool stable)
 {
 	if (!stable) {
 		pv_time_ops.sched_clock = kvm_clock_read;
+		clear_sched_clock_stable();
 		return;
 	}
 
 	kvm_sched_clock_offset = kvm_clock_read();
 	pv_time_ops.sched_clock = kvm_sched_clock_read;
-	set_sched_clock_stable();
 
 	printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n",
 			kvm_sched_clock_offset);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a8daed914eef..69e6852fede1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2547,7 +2547,6 @@ extern void sched_clock_init_late(void);
  * is reliable after all:
  */
 extern int sched_clock_stable(void);
-extern void set_sched_clock_stable(void);
 extern void clear_sched_clock_stable(void);
 
 extern void sched_clock_tick(void);
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 7713b2b53f61..ad64efe41722 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -83,8 +83,15 @@ void sched_clock_init(void)
 }
 
 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+/*
+ * We must start with !__sched_clock_stable because the unstable -> stable
+ * transition is accurate, while the stable -> unstable transition is not.
+ *
+ * Similarly we start with __sched_clock_stable_early, thereby assuming we
+ * will become stable, such that there's only a single 1 -> 0 transition.
+ */
 static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable);
-static int __sched_clock_stable_early;
+static int __sched_clock_stable_early = 1;
 
 /*
  * We want: ktime_get_ns() + gtod_offset == sched_clock() + raw_offset
@@ -132,24 +139,6 @@ static void __set_sched_clock_stable(void)
 	tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
 }
 
-void set_sched_clock_stable(void)
-{
-	__sched_clock_stable_early = 1;
-
-	smp_mb(); /* matches sched_clock_init_late() */
-
-	/*
-	 * This really should only be called early (before
-	 * sched_clock_init_late()) when guestimating our sched_clock() is
-	 * solid.
-	 *
-	 * After that we test stability and we can negate our guess using
-	 * clear_sched_clock_stable, possibly from a watchdog.
-	 */
-	if (WARN_ON_ONCE(sched_clock_running == 2))
-		__set_sched_clock_stable();
-}
-
 static void __clear_sched_clock_stable(struct work_struct *work)
 {
 	struct sched_clock_data *scd = this_scd();
@@ -199,8 +188,6 @@ void sched_clock_init_late(void)
 
 	if (__sched_clock_stable_early)
 		__set_sched_clock_stable();
-	else
-		__clear_sched_clock_stable(NULL);
 }
 
 /*
-- 
cgit v1.2.3


From 9c829c097f2f1cbebcfff69a7254b1df9852fe4e Mon Sep 17 00:00:00 2001
From: Stephen Boyd <stephen.boyd@linaro.org>
Date: Wed, 28 Dec 2016 14:56:47 -0800
Subject: of: device: Support loading a module with OF based modalias

In the case of ULPI devices, we want to be able to load the
driver before registering the device so that we don't get stuck
in a loop waiting for the phy module to appear and failing usb
controller probe. Currently we request the ulpi module via the
ulpi ids, but in the DT case we might need to request it with the
OF based modalias instead. Add a common function that allows
anyone to request a module with the OF based modalias.

Acked-by: Rob Herring <robh@kernel.org>
Cc: <devicetree@vger.kernel.org>
Signed-off-by: Stephen Boyd <stephen.boyd@linaro.org>
Signed-off-by: Peter Chen <peter.chen@nxp.com>
---
 drivers/of/device.c       | 23 +++++++++++++++++++++++
 include/linux/of_device.h |  6 ++++++
 2 files changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/of/device.c b/drivers/of/device.c
index fd5cfad7c403..8a22a253a830 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -226,6 +226,29 @@ ssize_t of_device_get_modalias(struct device *dev, char *str, ssize_t len)
 	return tsize;
 }
 
+int of_device_request_module(struct device *dev)
+{
+	char *str;
+	ssize_t size;
+	int ret;
+
+	size = of_device_get_modalias(dev, NULL, 0);
+	if (size < 0)
+		return size;
+
+	str = kmalloc(size + 1, GFP_KERNEL);
+	if (!str)
+		return -ENOMEM;
+
+	of_device_get_modalias(dev, str, size);
+	str[size] = '\0';
+	ret = request_module(str);
+	kfree(str);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(of_device_request_module);
+
 /**
  * of_device_uevent - Display OF related uevent information
  */
diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index cc7dd687a89d..e9afbcc8de12 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -37,6 +37,7 @@ extern const void *of_device_get_match_data(const struct device *dev);
 
 extern ssize_t of_device_get_modalias(struct device *dev,
 					char *str, ssize_t len);
+extern int of_device_request_module(struct device *dev);
 
 extern void of_device_uevent(struct device *dev, struct kobj_uevent_env *env);
 extern int of_device_uevent_modalias(struct device *dev, struct kobj_uevent_env *env);
@@ -78,6 +79,11 @@ static inline int of_device_get_modalias(struct device *dev,
 	return -ENODEV;
 }
 
+static inline int of_device_request_module(struct device *dev)
+{
+	return -ENODEV;
+}
+
 static inline int of_device_uevent_modalias(struct device *dev,
 				   struct kobj_uevent_env *env)
 {
-- 
cgit v1.2.3


From a89b94b53371bbfa582787c2fa3378000ea4263d Mon Sep 17 00:00:00 2001
From: Stephen Boyd <stephen.boyd@linaro.org>
Date: Wed, 28 Dec 2016 14:56:51 -0800
Subject: usb: chipidea: Handle extcon events properly

We're currently emulating the vbus and id interrupts in the OTGSC
read API, but we also need to make sure that if we're handling
the events with extcon that we don't enable the interrupts for
those events in the hardware. Therefore, properly emulate this
register if we're using extcon, but don't enable the interrupts.
This allows me to get my cable connect/disconnect working
properly without getting spurious interrupts on my device that
uses an extcon for these two events.

Acked-by: Peter Chen <peter.chen@nxp.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Ivan T. Ivanov" <iivanov.xz@gmail.com>
Fixes: 3ecb3e09b042 ("usb: chipidea: Use extcon framework for VBUS and ID detect")
Signed-off-by: Stephen Boyd <stephen.boyd@linaro.org>
Signed-off-by: Peter Chen <peter.chen@nxp.com>
---
 drivers/usb/chipidea/otg.c   | 46 +++++++++++++++++++++++++++++++++++++++-----
 include/linux/usb/chipidea.h |  2 ++
 2 files changed, 43 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/chipidea/otg.c b/drivers/usb/chipidea/otg.c
index a829607c3e4d..0cf149edddd8 100644
--- a/drivers/usb/chipidea/otg.c
+++ b/drivers/usb/chipidea/otg.c
@@ -44,12 +44,15 @@ u32 hw_read_otgsc(struct ci_hdrc *ci, u32 mask)
 		else
 			val &= ~OTGSC_BSVIS;
 
-		cable->changed = false;
-
 		if (cable->state)
 			val |= OTGSC_BSV;
 		else
 			val &= ~OTGSC_BSV;
+
+		if (cable->enabled)
+			val |= OTGSC_BSVIE;
+		else
+			val &= ~OTGSC_BSVIE;
 	}
 
 	cable = &ci->platdata->id_extcon;
@@ -59,15 +62,18 @@ u32 hw_read_otgsc(struct ci_hdrc *ci, u32 mask)
 		else
 			val &= ~OTGSC_IDIS;
 
-		cable->changed = false;
-
 		if (cable->state)
 			val |= OTGSC_ID;
 		else
 			val &= ~OTGSC_ID;
+
+		if (cable->enabled)
+			val |= OTGSC_IDIE;
+		else
+			val &= ~OTGSC_IDIE;
 	}
 
-	return val;
+	return val & mask;
 }
 
 /**
@@ -77,6 +83,36 @@ u32 hw_read_otgsc(struct ci_hdrc *ci, u32 mask)
  */
 void hw_write_otgsc(struct ci_hdrc *ci, u32 mask, u32 data)
 {
+	struct ci_hdrc_cable *cable;
+
+	cable = &ci->platdata->vbus_extcon;
+	if (!IS_ERR(cable->edev)) {
+		if (data & mask & OTGSC_BSVIS)
+			cable->changed = false;
+
+		/* Don't enable vbus interrupt if using external notifier */
+		if (data & mask & OTGSC_BSVIE) {
+			cable->enabled = true;
+			data &= ~OTGSC_BSVIE;
+		} else if (mask & OTGSC_BSVIE) {
+			cable->enabled = false;
+		}
+	}
+
+	cable = &ci->platdata->id_extcon;
+	if (!IS_ERR(cable->edev)) {
+		if (data & mask & OTGSC_IDIS)
+			cable->changed = false;
+
+		/* Don't enable id interrupt if using external notifier */
+		if (data & mask & OTGSC_IDIE) {
+			cable->enabled = true;
+			data &= ~OTGSC_IDIE;
+		} else if (mask & OTGSC_IDIE) {
+			cable->enabled = false;
+		}
+	}
+
 	hw_write(ci, OP_OTGSC, mask | OTGSC_INT_STATUS_BITS, data);
 }
 
diff --git a/include/linux/usb/chipidea.h b/include/linux/usb/chipidea.h
index 5dd75fa47dd8..f9be467d6695 100644
--- a/include/linux/usb/chipidea.h
+++ b/include/linux/usb/chipidea.h
@@ -14,6 +14,7 @@ struct ci_hdrc;
  * struct ci_hdrc_cable - structure for external connector cable state tracking
  * @state: current state of the line
  * @changed: set to true when extcon event happen
+ * @enabled: set to true if we've enabled the vbus or id interrupt
  * @edev: device which generate events
  * @ci: driver state of the chipidea device
  * @nb: hold event notification callback
@@ -22,6 +23,7 @@ struct ci_hdrc;
 struct ci_hdrc_cable {
 	bool				state;
 	bool				changed;
+	bool				enabled;
 	struct extcon_dev		*edev;
 	struct ci_hdrc			*ci;
 	struct notifier_block		nb;
-- 
cgit v1.2.3


From 8feb3680bd0363a8d784fa0d065e0a6cdc9e0cff Mon Sep 17 00:00:00 2001
From: Stephen Boyd <stephen.boyd@linaro.org>
Date: Wed, 28 Dec 2016 14:56:52 -0800
Subject: usb: chipidea: Add platform flag for wrapper phy management

The ULPI phy on qcom platforms needs to be initialized and
powered on after a USB reset and before we toggle the run/stop
bit. Otherwise, the phy locks up and doesn't work properly.
Therefore, add a flag to skip any phy power management in the
core layer, leaving it up to the glue driver to manage.

Acked-by: Peter Chen <peter.chen@nxp.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Stephen Boyd <stephen.boyd@linaro.org>
Signed-off-by: Peter Chen <peter.chen@nxp.com>
---
 drivers/usb/chipidea/core.c  | 6 ++++++
 include/linux/usb/chipidea.h | 1 +
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/chipidea/core.c b/drivers/usb/chipidea/core.c
index 24859b44c45c..b399326db08f 100644
--- a/drivers/usb/chipidea/core.c
+++ b/drivers/usb/chipidea/core.c
@@ -361,6 +361,9 @@ static int _ci_usb_phy_init(struct ci_hdrc *ci)
  */
 static void ci_usb_phy_exit(struct ci_hdrc *ci)
 {
+	if (ci->platdata->flags & CI_HDRC_OVERRIDE_PHY_CONTROL)
+		return;
+
 	if (ci->phy) {
 		phy_power_off(ci->phy);
 		phy_exit(ci->phy);
@@ -379,6 +382,9 @@ static int ci_usb_phy_init(struct ci_hdrc *ci)
 {
 	int ret;
 
+	if (ci->platdata->flags & CI_HDRC_OVERRIDE_PHY_CONTROL)
+		return 0;
+
 	switch (ci->platdata->phy_mode) {
 	case USBPHY_INTERFACE_MODE_UTMI:
 	case USBPHY_INTERFACE_MODE_UTMIW:
diff --git a/include/linux/usb/chipidea.h b/include/linux/usb/chipidea.h
index f9be467d6695..d07b162073f7 100644
--- a/include/linux/usb/chipidea.h
+++ b/include/linux/usb/chipidea.h
@@ -57,6 +57,7 @@ struct ci_hdrc_platform_data {
 #define CI_HDRC_OVERRIDE_AHB_BURST	BIT(9)
 #define CI_HDRC_OVERRIDE_TX_BURST	BIT(10)
 #define CI_HDRC_OVERRIDE_RX_BURST	BIT(11)
+#define CI_HDRC_OVERRIDE_PHY_CONTROL	BIT(12) /* Glue layer manages phy */
 	enum usb_dr_mode	dr_mode;
 #define CI_HDRC_CONTROLLER_RESET_EVENT		0
 #define CI_HDRC_CONTROLLER_STOPPED_EVENT	1
-- 
cgit v1.2.3


From 5cc49268995a1f063a7a569299393e4cf9d06923 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <stephen.boyd@linaro.org>
Date: Fri, 20 Jan 2017 15:11:55 +0800
Subject: usb: chipidea: Consolidate extcon notifiers

The two extcon notifiers are almost the same except for the
variable name for the cable structure and the id notifier inverts
the cable->state logic. Make it the same and replace two
functions with one to save some lines. This also makes it so that
the id cable state is true when the id pin is pulled low, so we
change the name of ->state to ->connected to properly reflect
that we're interested in the cable being connected.

Acked-by: Peter Chen <peter.chen@nxp.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Ivan T. Ivanov" <iivanov.xz@gmail.com>
Signed-off-by: Stephen Boyd <stephen.boyd@linaro.org>
Signed-off-by: Peter Chen <peter.chen@nxp.com>
---
 drivers/usb/chipidea/core.c  | 45 ++++++++++++--------------------------------
 drivers/usb/chipidea/otg.c   |  8 ++++----
 include/linux/usb/chipidea.h |  4 ++--
 3 files changed, 18 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/chipidea/core.c b/drivers/usb/chipidea/core.c
index ad4e01cd0477..367d02a02145 100644
--- a/drivers/usb/chipidea/core.c
+++ b/drivers/usb/chipidea/core.c
@@ -577,35 +577,14 @@ static irqreturn_t ci_irq(int irq, void *data)
 	return ret;
 }
 
-static int ci_vbus_notifier(struct notifier_block *nb, unsigned long event,
-			    void *ptr)
+static int ci_cable_notifier(struct notifier_block *nb, unsigned long event,
+			     void *ptr)
 {
-	struct ci_hdrc_cable *vbus = container_of(nb, struct ci_hdrc_cable, nb);
-	struct ci_hdrc *ci = vbus->ci;
+	struct ci_hdrc_cable *cbl = container_of(nb, struct ci_hdrc_cable, nb);
+	struct ci_hdrc *ci = cbl->ci;
 
-	if (event)
-		vbus->state = true;
-	else
-		vbus->state = false;
-
-	vbus->changed = true;
-
-	ci_irq(ci->irq, ci);
-	return NOTIFY_DONE;
-}
-
-static int ci_id_notifier(struct notifier_block *nb, unsigned long event,
-			  void *ptr)
-{
-	struct ci_hdrc_cable *id = container_of(nb, struct ci_hdrc_cable, nb);
-	struct ci_hdrc *ci = id->ci;
-
-	if (event)
-		id->state = false;
-	else
-		id->state = true;
-
-	id->changed = true;
+	cbl->connected = event;
+	cbl->changed = true;
 
 	ci_irq(ci->irq, ci);
 	return NOTIFY_DONE;
@@ -714,27 +693,27 @@ static int ci_get_platdata(struct device *dev,
 	}
 
 	cable = &platdata->vbus_extcon;
-	cable->nb.notifier_call = ci_vbus_notifier;
+	cable->nb.notifier_call = ci_cable_notifier;
 	cable->edev = ext_vbus;
 
 	if (!IS_ERR(ext_vbus)) {
 		ret = extcon_get_state(cable->edev, EXTCON_USB);
 		if (ret)
-			cable->state = true;
+			cable->connected = true;
 		else
-			cable->state = false;
+			cable->connected = false;
 	}
 
 	cable = &platdata->id_extcon;
-	cable->nb.notifier_call = ci_id_notifier;
+	cable->nb.notifier_call = ci_cable_notifier;
 	cable->edev = ext_id;
 
 	if (!IS_ERR(ext_id)) {
 		ret = extcon_get_state(cable->edev, EXTCON_USB_HOST);
 		if (ret)
-			cable->state = false;
+			cable->connected = true;
 		else
-			cable->state = true;
+			cable->connected = false;
 	}
 	return 0;
 }
diff --git a/drivers/usb/chipidea/otg.c b/drivers/usb/chipidea/otg.c
index 0cf149edddd8..695f3fe3ae21 100644
--- a/drivers/usb/chipidea/otg.c
+++ b/drivers/usb/chipidea/otg.c
@@ -44,7 +44,7 @@ u32 hw_read_otgsc(struct ci_hdrc *ci, u32 mask)
 		else
 			val &= ~OTGSC_BSVIS;
 
-		if (cable->state)
+		if (cable->connected)
 			val |= OTGSC_BSV;
 		else
 			val &= ~OTGSC_BSV;
@@ -62,10 +62,10 @@ u32 hw_read_otgsc(struct ci_hdrc *ci, u32 mask)
 		else
 			val &= ~OTGSC_IDIS;
 
-		if (cable->state)
-			val |= OTGSC_ID;
+		if (cable->connected)
+			val &= ~OTGSC_ID; /* host */
 		else
-			val &= ~OTGSC_ID;
+			val |= OTGSC_ID; /* device */
 
 		if (cable->enabled)
 			val |= OTGSC_IDIE;
diff --git a/include/linux/usb/chipidea.h b/include/linux/usb/chipidea.h
index d07b162073f7..7e3daa37cf60 100644
--- a/include/linux/usb/chipidea.h
+++ b/include/linux/usb/chipidea.h
@@ -12,7 +12,7 @@ struct ci_hdrc;
 
 /**
  * struct ci_hdrc_cable - structure for external connector cable state tracking
- * @state: current state of the line
+ * @connected: true if cable is connected, false otherwise
  * @changed: set to true when extcon event happen
  * @enabled: set to true if we've enabled the vbus or id interrupt
  * @edev: device which generate events
@@ -21,7 +21,7 @@ struct ci_hdrc;
  * @conn: used for notification registration
  */
 struct ci_hdrc_cable {
-	bool				state;
+	bool				connected;
 	bool				changed;
 	bool				enabled;
 	struct extcon_dev		*edev;
-- 
cgit v1.2.3


From 11893dae63da0f5b251cf7f9a24d64c8ff4771ff Mon Sep 17 00:00:00 2001
From: Stephen Boyd <stephen.boyd@linaro.org>
Date: Wed, 28 Dec 2016 14:57:06 -0800
Subject: usb: chipidea: msm: Handle phy power states

The ULPI phy on qcom platforms needs to be initialized and
powered on after a USB reset and before we toggle the run/stop
bit. Otherwise, the phy locks up and doesn't work properly. Hook
the phy initialization into the RESET event and the phy power off
into the STOPPED event.

Acked-by: Peter Chen <peter.chen@nxp.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Stephen Boyd <stephen.boyd@linaro.org>
Signed-off-by: Peter Chen <peter.chen@nxp.com>
---
 drivers/usb/chipidea/ci_hdrc_msm.c | 40 +++++++++++++++++++-------------------
 drivers/usb/chipidea/core.c        |  8 ++++++--
 drivers/usb/chipidea/host.c        |  8 ++++++--
 include/linux/usb/chipidea.h       |  2 +-
 4 files changed, 33 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/chipidea/ci_hdrc_msm.c b/drivers/usb/chipidea/ci_hdrc_msm.c
index fe96df7b530c..316044dba0ac 100644
--- a/drivers/usb/chipidea/ci_hdrc_msm.c
+++ b/drivers/usb/chipidea/ci_hdrc_msm.c
@@ -80,20 +80,33 @@ static const struct reset_control_ops ci_hdrc_msm_reset_ops = {
 	.reset = ci_hdrc_msm_por_reset,
 };
 
-static void ci_hdrc_msm_notify_event(struct ci_hdrc *ci, unsigned event)
+static int ci_hdrc_msm_notify_event(struct ci_hdrc *ci, unsigned event)
 {
 	struct device *dev = ci->dev->parent;
 	struct ci_hdrc_msm *msm_ci = dev_get_drvdata(dev);
+	int ret;
 
 	switch (event) {
 	case CI_HDRC_CONTROLLER_RESET_EVENT:
 		dev_dbg(dev, "CI_HDRC_CONTROLLER_RESET_EVENT received\n");
+
+		hw_phymode_configure(ci);
 		if (msm_ci->secondary_phy) {
 			u32 val = readl_relaxed(msm_ci->base + HS_PHY_SEC_CTRL);
 			val |= HS_PHY_DIG_CLAMP_N;
 			writel_relaxed(val, msm_ci->base + HS_PHY_SEC_CTRL);
 		}
 
+		ret = phy_init(ci->phy);
+		if (ret)
+			return ret;
+
+		ret = phy_power_on(ci->phy);
+		if (ret) {
+			phy_exit(ci->phy);
+			return ret;
+		}
+
 		/* use AHB transactor, allow posted data writes */
 		hw_write_id_reg(ci, HS_PHY_AHB_MODE, 0xffffffff, 0x8);
 
@@ -113,21 +126,18 @@ static void ci_hdrc_msm_notify_event(struct ci_hdrc *ci, unsigned event)
 				 HSPHY_SESS_VLD_CTRL);
 
 		}
-
-		usb_phy_init(ci->usb_phy);
 		break;
 	case CI_HDRC_CONTROLLER_STOPPED_EVENT:
 		dev_dbg(dev, "CI_HDRC_CONTROLLER_STOPPED_EVENT received\n");
-		/*
-		 * Put the phy in non-driving mode. Otherwise host
-		 * may not detect soft-disconnection.
-		 */
-		usb_phy_notify_disconnect(ci->usb_phy, USB_SPEED_UNKNOWN);
+		phy_power_off(ci->phy);
+		phy_exit(ci->phy);
 		break;
 	default:
 		dev_dbg(dev, "unknown ci_hdrc event\n");
 		break;
 	}
+
+	return 0;
 }
 
 static int ci_hdrc_msm_mux_phy(struct ci_hdrc_msm *ci,
@@ -167,7 +177,6 @@ static int ci_hdrc_msm_probe(struct platform_device *pdev)
 {
 	struct ci_hdrc_msm *ci;
 	struct platform_device *plat_ci;
-	struct usb_phy *phy;
 	struct clk *clk;
 	struct reset_control *reset;
 	struct resource *res;
@@ -181,21 +190,12 @@ static int ci_hdrc_msm_probe(struct platform_device *pdev)
 		return -ENOMEM;
 	platform_set_drvdata(pdev, ci);
 
-	/*
-	 * OTG(PHY) driver takes care of PHY initialization, clock management,
-	 * powering up VBUS, mapping of registers address space and power
-	 * management.
-	 */
-	phy = devm_usb_get_phy_by_phandle(&pdev->dev, "usb-phy", 0);
-	if (IS_ERR(phy))
-		return PTR_ERR(phy);
-
 	ci->pdata.name = "ci_hdrc_msm";
 	ci->pdata.capoffset = DEF_CAPOFFSET;
 	ci->pdata.flags	= CI_HDRC_REGS_SHARED | CI_HDRC_DISABLE_STREAMING |
-			  CI_HDRC_OVERRIDE_AHB_BURST;
+			  CI_HDRC_OVERRIDE_AHB_BURST |
+			  CI_HDRC_OVERRIDE_PHY_CONTROL;
 	ci->pdata.notify_event = ci_hdrc_msm_notify_event;
-	ci->pdata.usb_phy = phy;
 
 	reset = devm_reset_control_get(&pdev->dev, "core");
 	if (IS_ERR(reset))
diff --git a/drivers/usb/chipidea/core.c b/drivers/usb/chipidea/core.c
index 367d02a02145..db69f6a0bf98 100644
--- a/drivers/usb/chipidea/core.c
+++ b/drivers/usb/chipidea/core.c
@@ -327,6 +327,7 @@ void hw_phymode_configure(struct ci_hdrc *ci)
 			hw_write(ci, OP_PORTSC, PORTSC_STS, PORTSC_STS);
 	}
 }
+EXPORT_SYMBOL_GPL(hw_phymode_configure);
 
 /**
  * _ci_usb_phy_init: initialize phy taking in account both phy and usb_phy
@@ -503,9 +504,12 @@ int hw_device_reset(struct ci_hdrc *ci)
 		return ret;
 	}
 
-	if (ci->platdata->notify_event)
-		ci->platdata->notify_event(ci,
+	if (ci->platdata->notify_event) {
+		ret = ci->platdata->notify_event(ci,
 			CI_HDRC_CONTROLLER_RESET_EVENT);
+		if (ret)
+			return ret;
+	}
 
 	/* USBMODE should be configured step by step */
 	hw_write(ci, OP_USBMODE, USBMODE_CM, USBMODE_CM_IDLE);
diff --git a/drivers/usb/chipidea/host.c b/drivers/usb/chipidea/host.c
index f884c0877bca..915f3e91586e 100644
--- a/drivers/usb/chipidea/host.c
+++ b/drivers/usb/chipidea/host.c
@@ -90,8 +90,12 @@ static int ehci_ci_reset(struct usb_hcd *hcd)
 
 	ehci->need_io_watchdog = 0;
 
-	if (ci->platdata->notify_event)
-		ci->platdata->notify_event(ci, CI_HDRC_CONTROLLER_RESET_EVENT);
+	if (ci->platdata->notify_event) {
+		ret = ci->platdata->notify_event(ci,
+				CI_HDRC_CONTROLLER_RESET_EVENT);
+		if (ret)
+			return ret;
+	}
 
 	ci_platform_configure(ci);
 
diff --git a/include/linux/usb/chipidea.h b/include/linux/usb/chipidea.h
index 7e3daa37cf60..c5fdfcf99828 100644
--- a/include/linux/usb/chipidea.h
+++ b/include/linux/usb/chipidea.h
@@ -61,7 +61,7 @@ struct ci_hdrc_platform_data {
 	enum usb_dr_mode	dr_mode;
 #define CI_HDRC_CONTROLLER_RESET_EVENT		0
 #define CI_HDRC_CONTROLLER_STOPPED_EVENT	1
-	void	(*notify_event) (struct ci_hdrc *ci, unsigned event);
+	int	(*notify_event) (struct ci_hdrc *ci, unsigned event);
 	struct regulator	*reg_vbus;
 	struct usb_otg_caps	ci_otg_caps;
 	bool			tpl_support;
-- 
cgit v1.2.3


From ee48c726d0b014ac8dd5ec803fb63ce0596a42cf Mon Sep 17 00:00:00 2001
From: Ramiro Oliveira <Ramiro.Oliveira@synopsys.com>
Date: Fri, 13 Jan 2017 17:57:40 +0000
Subject: reset: Change shared flag from int to bool

Since the new parameter being added is going to be a bool this patch
changes the shared flag from int to bool to match the new parameter.

Signed-off-by: Ramiro Oliveira <Ramiro.Oliveira@synopsys.com>
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
---
 drivers/reset/core.c  |  8 ++++----
 include/linux/reset.h | 32 ++++++++++++++++----------------
 2 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/reset/core.c b/drivers/reset/core.c
index 10368ed8fd13..272c1e4ecb5c 100644
--- a/drivers/reset/core.c
+++ b/drivers/reset/core.c
@@ -41,7 +41,7 @@ struct reset_control {
 	struct list_head list;
 	unsigned int id;
 	unsigned int refcnt;
-	int shared;
+	bool shared;
 	atomic_t deassert_count;
 	atomic_t triggered_count;
 };
@@ -254,7 +254,7 @@ EXPORT_SYMBOL_GPL(reset_control_status);
 
 static struct reset_control *__reset_control_get(
 				struct reset_controller_dev *rcdev,
-				unsigned int index, int shared)
+				unsigned int index, bool shared)
 {
 	struct reset_control *rstc;
 
@@ -299,7 +299,7 @@ static void __reset_control_put(struct reset_control *rstc)
 }
 
 struct reset_control *__of_reset_control_get(struct device_node *node,
-				     const char *id, int index, int shared)
+				     const char *id, int index, bool shared)
 {
 	struct reset_control *rstc;
 	struct reset_controller_dev *r, *rcdev;
@@ -379,7 +379,7 @@ static void devm_reset_control_release(struct device *dev, void *res)
 }
 
 struct reset_control *__devm_reset_control_get(struct device *dev,
-				     const char *id, int index, int shared)
+				     const char *id, int index, bool shared)
 {
 	struct reset_control **ptr, *rstc;
 
diff --git a/include/linux/reset.h b/include/linux/reset.h
index 5daff15722d3..ec1d1fd28f5f 100644
--- a/include/linux/reset.h
+++ b/include/linux/reset.h
@@ -13,10 +13,10 @@ int reset_control_deassert(struct reset_control *rstc);
 int reset_control_status(struct reset_control *rstc);
 
 struct reset_control *__of_reset_control_get(struct device_node *node,
-				     const char *id, int index, int shared);
+				     const char *id, int index, bool shared);
 void reset_control_put(struct reset_control *rstc);
 struct reset_control *__devm_reset_control_get(struct device *dev,
-				     const char *id, int index, int shared);
+				     const char *id, int index, bool shared);
 
 int __must_check device_reset(struct device *dev);
 
@@ -69,14 +69,14 @@ static inline int device_reset_optional(struct device *dev)
 
 static inline struct reset_control *__of_reset_control_get(
 					struct device_node *node,
-					const char *id, int index, int shared)
+					const char *id, int index, bool shared)
 {
 	return ERR_PTR(-ENOTSUPP);
 }
 
 static inline struct reset_control *__devm_reset_control_get(
 					struct device *dev,
-					const char *id, int index, int shared)
+					const char *id, int index, bool shared)
 {
 	return ERR_PTR(-ENOTSUPP);
 }
@@ -132,19 +132,19 @@ __must_check reset_control_get_exclusive(struct device *dev, const char *id)
 static inline struct reset_control *reset_control_get_shared(
 					struct device *dev, const char *id)
 {
-	return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, 1);
+	return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, true);
 }
 
 static inline struct reset_control *reset_control_get_optional_exclusive(
 					struct device *dev, const char *id)
 {
-	return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, 0);
+	return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, false);
 }
 
 static inline struct reset_control *reset_control_get_optional_shared(
 					struct device *dev, const char *id)
 {
-	return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, 1);
+	return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, true);
 }
 
 /**
@@ -185,7 +185,7 @@ static inline struct reset_control *of_reset_control_get_exclusive(
 static inline struct reset_control *of_reset_control_get_shared(
 				struct device_node *node, const char *id)
 {
-	return __of_reset_control_get(node, id, 0, 1);
+	return __of_reset_control_get(node, id, 0, true);
 }
 
 /**
@@ -202,7 +202,7 @@ static inline struct reset_control *of_reset_control_get_shared(
 static inline struct reset_control *of_reset_control_get_exclusive_by_index(
 					struct device_node *node, int index)
 {
-	return __of_reset_control_get(node, NULL, index, 0);
+	return __of_reset_control_get(node, NULL, index, false);
 }
 
 /**
@@ -230,7 +230,7 @@ static inline struct reset_control *of_reset_control_get_exclusive_by_index(
 static inline struct reset_control *of_reset_control_get_shared_by_index(
 					struct device_node *node, int index)
 {
-	return __of_reset_control_get(node, NULL, index, 1);
+	return __of_reset_control_get(node, NULL, index, true);
 }
 
 /**
@@ -252,7 +252,7 @@ __must_check devm_reset_control_get_exclusive(struct device *dev,
 #ifndef CONFIG_RESET_CONTROLLER
 	WARN_ON(1);
 #endif
-	return __devm_reset_control_get(dev, id, 0, 0);
+	return __devm_reset_control_get(dev, id, 0, false);
 }
 
 /**
@@ -267,19 +267,19 @@ __must_check devm_reset_control_get_exclusive(struct device *dev,
 static inline struct reset_control *devm_reset_control_get_shared(
 					struct device *dev, const char *id)
 {
-	return __devm_reset_control_get(dev, id, 0, 1);
+	return __devm_reset_control_get(dev, id, 0, true);
 }
 
 static inline struct reset_control *devm_reset_control_get_optional_exclusive(
 					struct device *dev, const char *id)
 {
-	return __devm_reset_control_get(dev, id, 0, 0);
+	return __devm_reset_control_get(dev, id, 0, false);
 }
 
 static inline struct reset_control *devm_reset_control_get_optional_shared(
 					struct device *dev, const char *id)
 {
-	return __devm_reset_control_get(dev, id, 0, 1);
+	return __devm_reset_control_get(dev, id, 0, true);
 }
 
 /**
@@ -297,7 +297,7 @@ static inline struct reset_control *devm_reset_control_get_optional_shared(
 static inline struct reset_control *
 devm_reset_control_get_exclusive_by_index(struct device *dev, int index)
 {
-	return __devm_reset_control_get(dev, NULL, index, 0);
+	return __devm_reset_control_get(dev, NULL, index, false);
 }
 
 /**
@@ -313,7 +313,7 @@ devm_reset_control_get_exclusive_by_index(struct device *dev, int index)
 static inline struct reset_control *
 devm_reset_control_get_shared_by_index(struct device *dev, int index)
 {
-	return __devm_reset_control_get(dev, NULL, index, 1);
+	return __devm_reset_control_get(dev, NULL, index, true);
 }
 
 /*
-- 
cgit v1.2.3


From bb475230b8e59a547ab66ac3b02572df21a580e9 Mon Sep 17 00:00:00 2001
From: Ramiro Oliveira <Ramiro.Oliveira@synopsys.com>
Date: Fri, 13 Jan 2017 17:57:41 +0000
Subject: reset: make optional functions really optional

The *_get_optional_* functions weren't really optional so this patch
makes them really optional.

These *_get_optional_* functions will now return NULL instead of an error
if no matching reset phandle is found in the DT, and all the
reset_control_* functions now accept NULL rstc pointers.

Signed-off-by: Ramiro Oliveira <Ramiro.Oliveira@synopsys.com>
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
---
 drivers/reset/core.c  | 49 +++++++++++++++++++++++++++++++++++++++----------
 include/linux/reset.h | 45 ++++++++++++++++++++++++++-------------------
 2 files changed, 65 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/reset/core.c b/drivers/reset/core.c
index 272c1e4ecb5c..c79cce3a7b6d 100644
--- a/drivers/reset/core.c
+++ b/drivers/reset/core.c
@@ -143,12 +143,18 @@ EXPORT_SYMBOL_GPL(devm_reset_controller_register);
  * a no-op.
  * Consumers must not use reset_control_(de)assert on shared reset lines when
  * reset_control_reset has been used.
+ *
+ * If rstc is NULL it is an optional reset and the function will just
+ * return 0.
  */
 int reset_control_reset(struct reset_control *rstc)
 {
 	int ret;
 
-	if (WARN_ON(IS_ERR_OR_NULL(rstc)))
+	if (!rstc)
+		return 0;
+
+	if (WARN_ON(IS_ERR(rstc)))
 		return -EINVAL;
 
 	if (!rstc->rcdev->ops->reset)
@@ -182,10 +188,17 @@ EXPORT_SYMBOL_GPL(reset_control_reset);
  * internal state to be reset, but must be prepared for this to happen.
  * Consumers must not use reset_control_reset on shared reset lines when
  * reset_control_(de)assert has been used.
+ * return 0.
+ *
+ * If rstc is NULL it is an optional reset and the function will just
+ * return 0.
  */
 int reset_control_assert(struct reset_control *rstc)
 {
-	if (WARN_ON(IS_ERR_OR_NULL(rstc)))
+	if (!rstc)
+		return 0;
+
+	if (WARN_ON(IS_ERR(rstc)))
 		return -EINVAL;
 
 	if (!rstc->rcdev->ops->assert)
@@ -213,10 +226,17 @@ EXPORT_SYMBOL_GPL(reset_control_assert);
  * After calling this function, the reset is guaranteed to be deasserted.
  * Consumers must not use reset_control_reset on shared reset lines when
  * reset_control_(de)assert has been used.
+ * return 0.
+ *
+ * If rstc is NULL it is an optional reset and the function will just
+ * return 0.
  */
 int reset_control_deassert(struct reset_control *rstc)
 {
-	if (WARN_ON(IS_ERR_OR_NULL(rstc)))
+	if (!rstc)
+		return 0;
+
+	if (WARN_ON(IS_ERR(rstc)))
 		return -EINVAL;
 
 	if (!rstc->rcdev->ops->deassert)
@@ -237,12 +257,15 @@ EXPORT_SYMBOL_GPL(reset_control_deassert);
 /**
  * reset_control_status - returns a negative errno if not supported, a
  * positive value if the reset line is asserted, or zero if the reset
- * line is not asserted.
+ * line is not asserted or if the desc is NULL (optional reset).
  * @rstc: reset controller
  */
 int reset_control_status(struct reset_control *rstc)
 {
-	if (WARN_ON(IS_ERR_OR_NULL(rstc)))
+	if (!rstc)
+		return 0;
+
+	if (WARN_ON(IS_ERR(rstc)))
 		return -EINVAL;
 
 	if (rstc->rcdev->ops->status)
@@ -299,7 +322,8 @@ static void __reset_control_put(struct reset_control *rstc)
 }
 
 struct reset_control *__of_reset_control_get(struct device_node *node,
-				     const char *id, int index, bool shared)
+				     const char *id, int index, bool shared,
+				     bool optional)
 {
 	struct reset_control *rstc;
 	struct reset_controller_dev *r, *rcdev;
@@ -313,14 +337,18 @@ struct reset_control *__of_reset_control_get(struct device_node *node,
 	if (id) {
 		index = of_property_match_string(node,
 						 "reset-names", id);
+		if (index == -EILSEQ)
+			return ERR_PTR(index);
 		if (index < 0)
-			return ERR_PTR(-ENOENT);
+			return optional ? NULL : ERR_PTR(-ENOENT);
 	}
 
 	ret = of_parse_phandle_with_args(node, "resets", "#reset-cells",
 					 index, &args);
-	if (ret)
+	if (ret == -EINVAL)
 		return ERR_PTR(ret);
+	if (ret)
+		return optional ? NULL : ERR_PTR(ret);
 
 	mutex_lock(&reset_list_mutex);
 	rcdev = NULL;
@@ -379,7 +407,8 @@ static void devm_reset_control_release(struct device *dev, void *res)
 }
 
 struct reset_control *__devm_reset_control_get(struct device *dev,
-				     const char *id, int index, bool shared)
+				     const char *id, int index, bool shared,
+				     bool optional)
 {
 	struct reset_control **ptr, *rstc;
 
@@ -389,7 +418,7 @@ struct reset_control *__devm_reset_control_get(struct device *dev,
 		return ERR_PTR(-ENOMEM);
 
 	rstc = __of_reset_control_get(dev ? dev->of_node : NULL,
-				      id, index, shared);
+				      id, index, shared, optional);
 	if (!IS_ERR(rstc)) {
 		*ptr = rstc;
 		devres_add(dev, ptr);
diff --git a/include/linux/reset.h b/include/linux/reset.h
index ec1d1fd28f5f..86b4ed75359e 100644
--- a/include/linux/reset.h
+++ b/include/linux/reset.h
@@ -13,10 +13,12 @@ int reset_control_deassert(struct reset_control *rstc);
 int reset_control_status(struct reset_control *rstc);
 
 struct reset_control *__of_reset_control_get(struct device_node *node,
-				     const char *id, int index, bool shared);
+				     const char *id, int index, bool shared,
+				     bool optional);
 void reset_control_put(struct reset_control *rstc);
 struct reset_control *__devm_reset_control_get(struct device *dev,
-				     const char *id, int index, bool shared);
+				     const char *id, int index, bool shared,
+				     bool optional);
 
 int __must_check device_reset(struct device *dev);
 
@@ -69,14 +71,15 @@ static inline int device_reset_optional(struct device *dev)
 
 static inline struct reset_control *__of_reset_control_get(
 					struct device_node *node,
-					const char *id, int index, bool shared)
+					const char *id, int index, bool shared,
+					bool optional)
 {
 	return ERR_PTR(-ENOTSUPP);
 }
 
 static inline struct reset_control *__devm_reset_control_get(
-					struct device *dev,
-					const char *id, int index, bool shared)
+					struct device *dev, const char *id,
+					int index, bool shared, bool optional)
 {
 	return ERR_PTR(-ENOTSUPP);
 }
@@ -104,7 +107,8 @@ __must_check reset_control_get_exclusive(struct device *dev, const char *id)
 #ifndef CONFIG_RESET_CONTROLLER
 	WARN_ON(1);
 #endif
-	return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, 0);
+	return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, false,
+									false);
 }
 
 /**
@@ -132,19 +136,22 @@ __must_check reset_control_get_exclusive(struct device *dev, const char *id)
 static inline struct reset_control *reset_control_get_shared(
 					struct device *dev, const char *id)
 {
-	return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, true);
+	return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, true,
+									false);
 }
 
 static inline struct reset_control *reset_control_get_optional_exclusive(
 					struct device *dev, const char *id)
 {
-	return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, false);
+	return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, false,
+									true);
 }
 
 static inline struct reset_control *reset_control_get_optional_shared(
 					struct device *dev, const char *id)
 {
-	return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, true);
+	return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, true,
+									true);
 }
 
 /**
@@ -160,7 +167,7 @@ static inline struct reset_control *reset_control_get_optional_shared(
 static inline struct reset_control *of_reset_control_get_exclusive(
 				struct device_node *node, const char *id)
 {
-	return __of_reset_control_get(node, id, 0, 0);
+	return __of_reset_control_get(node, id, 0, false, false);
 }
 
 /**
@@ -185,7 +192,7 @@ static inline struct reset_control *of_reset_control_get_exclusive(
 static inline struct reset_control *of_reset_control_get_shared(
 				struct device_node *node, const char *id)
 {
-	return __of_reset_control_get(node, id, 0, true);
+	return __of_reset_control_get(node, id, 0, true, false);
 }
 
 /**
@@ -202,7 +209,7 @@ static inline struct reset_control *of_reset_control_get_shared(
 static inline struct reset_control *of_reset_control_get_exclusive_by_index(
 					struct device_node *node, int index)
 {
-	return __of_reset_control_get(node, NULL, index, false);
+	return __of_reset_control_get(node, NULL, index, false, false);
 }
 
 /**
@@ -230,7 +237,7 @@ static inline struct reset_control *of_reset_control_get_exclusive_by_index(
 static inline struct reset_control *of_reset_control_get_shared_by_index(
 					struct device_node *node, int index)
 {
-	return __of_reset_control_get(node, NULL, index, true);
+	return __of_reset_control_get(node, NULL, index, true, false);
 }
 
 /**
@@ -252,7 +259,7 @@ __must_check devm_reset_control_get_exclusive(struct device *dev,
 #ifndef CONFIG_RESET_CONTROLLER
 	WARN_ON(1);
 #endif
-	return __devm_reset_control_get(dev, id, 0, false);
+	return __devm_reset_control_get(dev, id, 0, false, false);
 }
 
 /**
@@ -267,19 +274,19 @@ __must_check devm_reset_control_get_exclusive(struct device *dev,
 static inline struct reset_control *devm_reset_control_get_shared(
 					struct device *dev, const char *id)
 {
-	return __devm_reset_control_get(dev, id, 0, true);
+	return __devm_reset_control_get(dev, id, 0, true, false);
 }
 
 static inline struct reset_control *devm_reset_control_get_optional_exclusive(
 					struct device *dev, const char *id)
 {
-	return __devm_reset_control_get(dev, id, 0, false);
+	return __devm_reset_control_get(dev, id, 0, false, true);
 }
 
 static inline struct reset_control *devm_reset_control_get_optional_shared(
 					struct device *dev, const char *id)
 {
-	return __devm_reset_control_get(dev, id, 0, true);
+	return __devm_reset_control_get(dev, id, 0, true, true);
 }
 
 /**
@@ -297,7 +304,7 @@ static inline struct reset_control *devm_reset_control_get_optional_shared(
 static inline struct reset_control *
 devm_reset_control_get_exclusive_by_index(struct device *dev, int index)
 {
-	return __devm_reset_control_get(dev, NULL, index, false);
+	return __devm_reset_control_get(dev, NULL, index, false, false);
 }
 
 /**
@@ -313,7 +320,7 @@ devm_reset_control_get_exclusive_by_index(struct device *dev, int index)
 static inline struct reset_control *
 devm_reset_control_get_shared_by_index(struct device *dev, int index)
 {
-	return __devm_reset_control_get(dev, NULL, index, true);
+	return __devm_reset_control_get(dev, NULL, index, true, false);
 }
 
 /*
-- 
cgit v1.2.3


From a62a77881b1b6708ffeddd9bf0529494f7b199e3 Mon Sep 17 00:00:00 2001
From: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Date: Mon, 16 Jan 2017 11:17:57 +0100
Subject: brcmfmac: add support for BCM43455 with modalias sdio:c00v02D0dA9BF
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

BCM43455 is a more recent revision of the BCM4345. Some of the BCM43455
got a dedicated SDIO device ID which is currently not supported by
brcmfmac.
Adding the new sdio_device_id to brcmfmac is enough to get the BCM43455
supported because the chip itself is already supported (due to BCM4345
support in the driver).

Signed-off-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Acked-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Reviewed-by: Andreas Färber <afaerber@suse.de>
Tested-by: Andreas Färber <afaerber@suse.de>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c | 1 +
 include/linux/mmc/sdio_ids.h                              | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
index 72139b579b18..5bc2ba214735 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
@@ -1104,6 +1104,7 @@ static const struct sdio_device_id brcmf_sdmmc_ids[] = {
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4339),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43430),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4345),
+	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43455),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4354),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4356),
 	{ /* end: all zeroes */ }
diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h
index d43ef96bf075..71b113e1223f 100644
--- a/include/linux/mmc/sdio_ids.h
+++ b/include/linux/mmc/sdio_ids.h
@@ -36,6 +36,7 @@
 #define SDIO_DEVICE_ID_BROADCOM_43362		0xa962
 #define SDIO_DEVICE_ID_BROADCOM_43430		0xa9a6
 #define SDIO_DEVICE_ID_BROADCOM_4345		0x4345
+#define SDIO_DEVICE_ID_BROADCOM_43455		0xa9bf
 #define SDIO_DEVICE_ID_BROADCOM_4354		0x4354
 #define SDIO_DEVICE_ID_BROADCOM_4356		0x4356
 
-- 
cgit v1.2.3


From 76640b84bd7a9d68c70d8bc8ecd02cdc4bd8855e Mon Sep 17 00:00:00 2001
From: Marek Szyprowski <m.szyprowski@samsung.com>
Date: Thu, 19 Jan 2017 14:48:41 +0100
Subject: soc: samsung: pmu: Provide global function to get PMU regmap

PMU is something like a SoC wide service, so add a helper function to get
PMU regmap. This will be used by other Exynos device drivers. This way it
can be avoided to model this dependency in device tree (as phandles to PMU
node) for almost every device in the SoC.

Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Reviewed-by: Tomasz Figa <tomasz.figa@gmail.com>
Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
---
 drivers/soc/samsung/exynos-pmu.c       | 11 +++++++++++
 include/linux/soc/samsung/exynos-pmu.h | 10 ++++++++++
 2 files changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/soc/samsung/exynos-pmu.c b/drivers/soc/samsung/exynos-pmu.c
index 0acdfd82e751..5c269bf23210 100644
--- a/drivers/soc/samsung/exynos-pmu.c
+++ b/drivers/soc/samsung/exynos-pmu.c
@@ -11,6 +11,7 @@
 
 #include <linux/of.h>
 #include <linux/of_address.h>
+#include <linux/mfd/syscon.h>
 #include <linux/platform_device.h>
 #include <linux/delay.h>
 
@@ -92,6 +93,16 @@ static const struct of_device_id exynos_pmu_of_device_ids[] = {
 	{ /*sentinel*/ },
 };
 
+struct regmap *exynos_get_pmu_regmap(void)
+{
+	struct device_node *np = of_find_matching_node(NULL,
+						      exynos_pmu_of_device_ids);
+	if (np)
+		return syscon_node_to_regmap(np);
+	return ERR_PTR(-ENODEV);
+}
+EXPORT_SYMBOL_GPL(exynos_get_pmu_regmap);
+
 static int exynos_pmu_probe(struct platform_device *pdev)
 {
 	const struct of_device_id *match;
diff --git a/include/linux/soc/samsung/exynos-pmu.h b/include/linux/soc/samsung/exynos-pmu.h
index e2e9de1acc5b..e57eb4b6cc5a 100644
--- a/include/linux/soc/samsung/exynos-pmu.h
+++ b/include/linux/soc/samsung/exynos-pmu.h
@@ -12,6 +12,8 @@
 #ifndef __LINUX_SOC_EXYNOS_PMU_H
 #define __LINUX_SOC_EXYNOS_PMU_H
 
+struct regmap;
+
 enum sys_powerdown {
 	SYS_AFTR,
 	SYS_LPA,
@@ -20,5 +22,13 @@ enum sys_powerdown {
 };
 
 extern void exynos_sys_powerdown_conf(enum sys_powerdown mode);
+#ifdef CONFIG_EXYNOS_PMU
+extern struct regmap *exynos_get_pmu_regmap(void);
+#else
+static inline struct regmap *exynos_get_pmu_regmap(void)
+{
+	return ERR_PTR(-ENODEV);
+}
+#endif
 
 #endif /* __LINUX_SOC_EXYNOS_PMU_H */
-- 
cgit v1.2.3


From 582a686f52d2e002da64093fe4b9aa5befcaf4e4 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Wed, 18 Jan 2017 14:04:37 +0100
Subject: device: bus_type: Introduce num_vf callback

This allows for bus types to implement their own method of retrieving
the number of virtual functions a NIC on that type of bus supports.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/device.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index 491b4c0ca633..6d73b70a4a5d 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -88,6 +88,8 @@ extern void bus_remove_file(struct bus_type *, struct bus_attribute *);
  *
  * @suspend:	Called when a device on this bus wants to go to sleep mode.
  * @resume:	Called to bring a device on this bus out of sleep mode.
+ * @num_vf:	Called to find out how many virtual functions a device on this
+ *		bus supports.
  * @pm:		Power management operations of this bus, callback the specific
  *		device driver's pm-ops.
  * @iommu_ops:  IOMMU specific operations for this bus, used to attach IOMMU
@@ -127,6 +129,8 @@ struct bus_type {
 	int (*suspend)(struct device *dev, pm_message_t state);
 	int (*resume)(struct device *dev);
 
+	int (*num_vf)(struct device *dev);
+
 	const struct dev_pm_ops *pm;
 
 	const struct iommu_ops *iommu_ops;
-- 
cgit v1.2.3


From 9af15c38254d81c9991eba89335ca7c537d7f2c3 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Wed, 18 Jan 2017 14:04:39 +0100
Subject: device: Implement a bus agnostic dev_num_vf routine

Now that pci_bus_type has num_vf callback set, dev_num_vf can be
implemented in a bus type independent way and the check for whether a
PCI device is being handled in rtnetlink can be dropped.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/device.h | 7 +++++++
 include/linux/pci.h    | 2 --
 net/core/rtnetlink.c   | 3 +--
 3 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index 6d73b70a4a5d..bd684fc8ec1d 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1144,6 +1144,13 @@ extern int device_online(struct device *dev);
 extern void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode);
 extern void set_secondary_fwnode(struct device *dev, struct fwnode_handle *fwnode);
 
+static inline int dev_num_vf(struct device *dev)
+{
+	if (dev->bus && dev->bus->num_vf)
+		return dev->bus->num_vf(dev);
+	return 0;
+}
+
 /*
  * Root device objects for grouping under /sys/devices
  */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index e2d1a124216a..adbc859fe7c4 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -885,7 +885,6 @@ void pcibios_setup_bridge(struct pci_bus *bus, unsigned long type);
 void pci_sort_breadthfirst(void);
 #define dev_is_pci(d) ((d)->bus == &pci_bus_type)
 #define dev_is_pf(d) ((dev_is_pci(d) ? to_pci_dev(d)->is_physfn : false))
-#define dev_num_vf(d) ((dev_is_pci(d) ? pci_num_vf(to_pci_dev(d)) : 0))
 
 /* Generic PCI functions exported to card drivers */
 
@@ -1630,7 +1629,6 @@ static inline int pci_get_new_domain_nr(void) { return -ENOSYS; }
 
 #define dev_is_pci(d) (false)
 #define dev_is_pf(d) (false)
-#define dev_num_vf(d) (0)
 #endif /* CONFIG_PCI */
 
 /* Include architecture-dependent settings and functions */
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index f538f764fca6..152744643074 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -837,8 +837,7 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
 static inline int rtnl_vfinfo_size(const struct net_device *dev,
 				   u32 ext_filter_mask)
 {
-	if (dev->dev.parent && dev_is_pci(dev->dev.parent) &&
-	    (ext_filter_mask & RTEXT_FILTER_VF)) {
+	if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF)) {
 		int num_vfs = dev_num_vf(dev->dev.parent);
 		size_t size = nla_total_size(0);
 		size += num_vfs *
-- 
cgit v1.2.3


From 9f8197980d87a28ec3d0b3b986f770e7e7878485 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Thu, 19 Jan 2017 10:59:13 +0000
Subject: delay: Add explanation of udelay() inaccuracy

There seems to be some misunderstanding that udelay() and friends will
always guarantee the specified delay.  This is a false understanding.
When udelay() is based on CPU cycles, it can return early for many
reasons which are detailed by Linus' reply to me in a thread in 2011:

  http://lists.openwall.net/linux-kernel/2011/01/12/372

However, a udelay test module was created in 2014 which allows udelay()
to only be 0.5% fast, which is outside of the CPU-cycles udelay()
results I measured back in 2011, which were deemed to be in the "we
don't care" region.

test_udelay() should be fixed to reflect the real allowable tolerance
on udelay(), rather than 0.5%.

Cc: David Riley <davidriley@chromium.org>
Cc: John Stultz <john.stultz@linaro.org>
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/delay.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/delay.h b/include/linux/delay.h
index a6ecb34cf547..2ecb3c46b20a 100644
--- a/include/linux/delay.h
+++ b/include/linux/delay.h
@@ -5,6 +5,17 @@
  * Copyright (C) 1993 Linus Torvalds
  *
  * Delay routines, using a pre-computed "loops_per_jiffy" value.
+ *
+ * Please note that ndelay(), udelay() and mdelay() may return early for
+ * several reasons:
+ *  1. computed loops_per_jiffy too low (due to the time taken to
+ *     execute the timer interrupt.)
+ *  2. cache behaviour affecting the time it takes to execute the
+ *     loop function.
+ *  3. CPU clock rate changes.
+ *
+ * Please see this thread:
+ *   http://lists.openwall.net/linux-kernel/2011/01/09/56
  */
 
 #include <linux/kernel.h>
-- 
cgit v1.2.3


From bcc9a76d5ac426bc45c9e863b1830347827ca77a Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Sat, 21 Jan 2017 21:33:35 -0500
Subject: locking/rwsem: Reinit wake_q after use

In __rwsem_down_write_failed_common(), the same wake_q variable name
is defined twice, with the inner wake_q hiding the one in outer scope.
We can either use different names for the two wake_q's.

Even better, we can use the same wake_q twice, if necessary.

To enable the latter change, we need to define a new helper function
wake_q_init() to enable reinitalization of wake_q after use.

Signed-off-by: Waiman Long <longman@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1485052415-9611-1-git-send-email-longman@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h       | 6 ++++++
 kernel/locking/rwsem-xadd.c | 7 +++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f4f9d32f12b3..4e62b378bd65 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1021,6 +1021,12 @@ struct wake_q_head {
 #define DEFINE_WAKE_Q(name)				\
 	struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
 
+static inline void wake_q_init(struct wake_q_head *head)
+{
+	head->first = WAKE_Q_TAIL;
+	head->lastp = &head->first;
+}
+
 extern void wake_q_add(struct wake_q_head *head,
 		       struct task_struct *task);
 extern void wake_up_q(struct wake_q_head *head);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index a3a381aee24b..2ad8d8dc3bb1 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -502,8 +502,6 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
 		 * wake any read locks that were queued ahead of us.
 		 */
 		if (count > RWSEM_WAITING_BIAS) {
-			DEFINE_WAKE_Q(wake_q);
-
 			__rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q);
 			/*
 			 * The wakeup is normally called _after_ the wait_lock
@@ -513,6 +511,11 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
 			 * for attempting rwsem_try_write_lock().
 			 */
 			wake_up_q(&wake_q);
+
+			/*
+			 * Reinitialize wake_q after use.
+			 */
+			wake_q_init(&wake_q);
 		}
 
 	} else
-- 
cgit v1.2.3


From 582d0ac397ca02a6a3fb653b13154ac87b884beb Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Fri, 20 Jan 2017 12:36:33 -0800
Subject: net: phy: bcm7xxx: Add entry for BCM7278

Add support for the BCM7278 28nm process Gigabit Ethernet PHY.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/bcm7xxx.c | 2 ++
 include/linux/brcmphy.h   | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/bcm7xxx.c b/drivers/net/phy/bcm7xxx.c
index 264b085d796b..fb11927de0ff 100644
--- a/drivers/net/phy/bcm7xxx.c
+++ b/drivers/net/phy/bcm7xxx.c
@@ -416,6 +416,7 @@ static int bcm7xxx_28nm_probe(struct phy_device *phydev)
 
 static struct phy_driver bcm7xxx_driver[] = {
 	BCM7XXX_28NM_GPHY(PHY_ID_BCM7250, "Broadcom BCM7250"),
+	BCM7XXX_28NM_GPHY(PHY_ID_BCM7278, "Broadcom BCM7278"),
 	BCM7XXX_28NM_GPHY(PHY_ID_BCM7364, "Broadcom BCM7364"),
 	BCM7XXX_28NM_GPHY(PHY_ID_BCM7366, "Broadcom BCM7366"),
 	BCM7XXX_28NM_GPHY(PHY_ID_BCM7439, "Broadcom BCM7439"),
@@ -430,6 +431,7 @@ static struct phy_driver bcm7xxx_driver[] = {
 
 static struct mdio_device_id __maybe_unused bcm7xxx_tbl[] = {
 	{ PHY_ID_BCM7250, 0xfffffff0, },
+	{ PHY_ID_BCM7278, 0xfffffff0, },
 	{ PHY_ID_BCM7364, 0xfffffff0, },
 	{ PHY_ID_BCM7366, 0xfffffff0, },
 	{ PHY_ID_BCM7346, 0xfffffff0, },
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 4f7d8be9ddbf..295fb3e73de5 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -24,6 +24,7 @@
 #define PHY_ID_BCM57780			0x03625d90
 
 #define PHY_ID_BCM7250			0xae025280
+#define PHY_ID_BCM7278			0xae0251a0
 #define PHY_ID_BCM7364			0xae025260
 #define PHY_ID_BCM7366			0x600d8490
 #define PHY_ID_BCM7346			0x600d8650
-- 
cgit v1.2.3


From fdbe574eb69312a7fbe09674d69c01b80e4ed9dc Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Thu, 19 Jan 2017 20:57:46 +0000
Subject: iommu/dma: Allow MSI-only cookies

IOMMU domain users such as VFIO face a similar problem to DMA API ops
with regard to mapping MSI messages in systems where the MSI write is
subject to IOMMU translation. With the relevant infrastructure now in
place for managed DMA domains, it's actually really simple for other
users to piggyback off that and reap the benefits without giving up
their own IOVA management, and without having to reinvent their own
wheel in the MSI layer.

Allow such users to opt into automatic MSI remapping by dedicating a
region of their IOVA space to a managed cookie, and extend the mapping
routine to implement a trivial linear allocator in such cases, to avoid
the needless overhead of a full-blown IOVA domain.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Reviewed-by: Tomasz Nowicki <tomasz.nowicki@caviumnetworks.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Tomasz Nowicki <tomasz.nowicki@caviumnetworks.com>
Tested-by: Bharat Bhushan <bharat.bhushan@nxp.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/iommu/dma-iommu.c | 119 +++++++++++++++++++++++++++++++++++++---------
 include/linux/dma-iommu.h |   6 +++
 2 files changed, 102 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 2db0d641cf45..de41ead6542a 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -37,15 +37,50 @@ struct iommu_dma_msi_page {
 	phys_addr_t		phys;
 };
 
+enum iommu_dma_cookie_type {
+	IOMMU_DMA_IOVA_COOKIE,
+	IOMMU_DMA_MSI_COOKIE,
+};
+
 struct iommu_dma_cookie {
-	struct iova_domain	iovad;
-	struct list_head	msi_page_list;
-	spinlock_t		msi_lock;
+	enum iommu_dma_cookie_type	type;
+	union {
+		/* Full allocator for IOMMU_DMA_IOVA_COOKIE */
+		struct iova_domain	iovad;
+		/* Trivial linear page allocator for IOMMU_DMA_MSI_COOKIE */
+		dma_addr_t		msi_iova;
+	};
+	struct list_head		msi_page_list;
+	spinlock_t			msi_lock;
 };
 
+static inline size_t cookie_msi_granule(struct iommu_dma_cookie *cookie)
+{
+	if (cookie->type == IOMMU_DMA_IOVA_COOKIE)
+		return cookie->iovad.granule;
+	return PAGE_SIZE;
+}
+
 static inline struct iova_domain *cookie_iovad(struct iommu_domain *domain)
 {
-	return &((struct iommu_dma_cookie *)domain->iova_cookie)->iovad;
+	struct iommu_dma_cookie *cookie = domain->iova_cookie;
+
+	if (cookie->type == IOMMU_DMA_IOVA_COOKIE)
+		return &cookie->iovad;
+	return NULL;
+}
+
+static struct iommu_dma_cookie *cookie_alloc(enum iommu_dma_cookie_type type)
+{
+	struct iommu_dma_cookie *cookie;
+
+	cookie = kzalloc(sizeof(*cookie), GFP_KERNEL);
+	if (cookie) {
+		spin_lock_init(&cookie->msi_lock);
+		INIT_LIST_HEAD(&cookie->msi_page_list);
+		cookie->type = type;
+	}
+	return cookie;
 }
 
 int iommu_dma_init(void)
@@ -61,26 +96,54 @@ int iommu_dma_init(void)
  * callback when domain->type == IOMMU_DOMAIN_DMA.
  */
 int iommu_get_dma_cookie(struct iommu_domain *domain)
+{
+	if (domain->iova_cookie)
+		return -EEXIST;
+
+	domain->iova_cookie = cookie_alloc(IOMMU_DMA_IOVA_COOKIE);
+	if (!domain->iova_cookie)
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL(iommu_get_dma_cookie);
+
+/**
+ * iommu_get_msi_cookie - Acquire just MSI remapping resources
+ * @domain: IOMMU domain to prepare
+ * @base: Start address of IOVA region for MSI mappings
+ *
+ * Users who manage their own IOVA allocation and do not want DMA API support,
+ * but would still like to take advantage of automatic MSI remapping, can use
+ * this to initialise their own domain appropriately. Users should reserve a
+ * contiguous IOVA region, starting at @base, large enough to accommodate the
+ * number of PAGE_SIZE mappings necessary to cover every MSI doorbell address
+ * used by the devices attached to @domain.
+ */
+int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base)
 {
 	struct iommu_dma_cookie *cookie;
 
+	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
+		return -EINVAL;
+
 	if (domain->iova_cookie)
 		return -EEXIST;
 
-	cookie = kzalloc(sizeof(*cookie), GFP_KERNEL);
+	cookie = cookie_alloc(IOMMU_DMA_MSI_COOKIE);
 	if (!cookie)
 		return -ENOMEM;
 
-	spin_lock_init(&cookie->msi_lock);
-	INIT_LIST_HEAD(&cookie->msi_page_list);
+	cookie->msi_iova = base;
 	domain->iova_cookie = cookie;
 	return 0;
 }
-EXPORT_SYMBOL(iommu_get_dma_cookie);
+EXPORT_SYMBOL(iommu_get_msi_cookie);
 
 /**
  * iommu_put_dma_cookie - Release a domain's DMA mapping resources
- * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie()
+ * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie() or
+ *          iommu_get_msi_cookie()
  *
  * IOMMU drivers should normally call this from their domain_free callback.
  */
@@ -92,7 +155,7 @@ void iommu_put_dma_cookie(struct iommu_domain *domain)
 	if (!cookie)
 		return;
 
-	if (cookie->iovad.granule)
+	if (cookie->type == IOMMU_DMA_IOVA_COOKIE && cookie->iovad.granule)
 		put_iova_domain(&cookie->iovad);
 
 	list_for_each_entry_safe(msi, tmp, &cookie->msi_page_list, list) {
@@ -137,11 +200,12 @@ static void iova_reserve_pci_windows(struct pci_dev *dev,
 int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
 		u64 size, struct device *dev)
 {
-	struct iova_domain *iovad = cookie_iovad(domain);
+	struct iommu_dma_cookie *cookie = domain->iova_cookie;
+	struct iova_domain *iovad = &cookie->iovad;
 	unsigned long order, base_pfn, end_pfn;
 
-	if (!iovad)
-		return -ENODEV;
+	if (!cookie || cookie->type != IOMMU_DMA_IOVA_COOKIE)
+		return -EINVAL;
 
 	/* Use the smallest supported page size for IOVA granularity */
 	order = __ffs(domain->pgsize_bitmap);
@@ -662,11 +726,12 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
 {
 	struct iommu_dma_cookie *cookie = domain->iova_cookie;
 	struct iommu_dma_msi_page *msi_page;
-	struct iova_domain *iovad = &cookie->iovad;
+	struct iova_domain *iovad = cookie_iovad(domain);
 	struct iova *iova;
 	int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
+	size_t size = cookie_msi_granule(cookie);
 
-	msi_addr &= ~(phys_addr_t)iova_mask(iovad);
+	msi_addr &= ~(phys_addr_t)(size - 1);
 	list_for_each_entry(msi_page, &cookie->msi_page_list, list)
 		if (msi_page->phys == msi_addr)
 			return msi_page;
@@ -675,13 +740,18 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
 	if (!msi_page)
 		return NULL;
 
-	iova = __alloc_iova(domain, iovad->granule, dma_get_mask(dev));
-	if (!iova)
-		goto out_free_page;
-
 	msi_page->phys = msi_addr;
-	msi_page->iova = iova_dma_addr(iovad, iova);
-	if (iommu_map(domain, msi_page->iova, msi_addr, iovad->granule, prot))
+	if (iovad) {
+		iova = __alloc_iova(domain, size, dma_get_mask(dev));
+		if (!iova)
+			goto out_free_page;
+		msi_page->iova = iova_dma_addr(iovad, iova);
+	} else {
+		msi_page->iova = cookie->msi_iova;
+		cookie->msi_iova += size;
+	}
+
+	if (iommu_map(domain, msi_page->iova, msi_addr, size, prot))
 		goto out_free_iova;
 
 	INIT_LIST_HEAD(&msi_page->list);
@@ -689,7 +759,10 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
 	return msi_page;
 
 out_free_iova:
-	__free_iova(iovad, iova);
+	if (iovad)
+		__free_iova(iovad, iova);
+	else
+		cookie->msi_iova -= size;
 out_free_page:
 	kfree(msi_page);
 	return NULL;
@@ -730,7 +803,7 @@ void iommu_dma_map_msi_msg(int irq, struct msi_msg *msg)
 		msg->data = ~0U;
 	} else {
 		msg->address_hi = upper_32_bits(msi_page->iova);
-		msg->address_lo &= iova_mask(&cookie->iovad);
+		msg->address_lo &= cookie_msi_granule(cookie) - 1;
 		msg->address_lo += lower_32_bits(msi_page->iova);
 	}
 }
diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
index 7f7e9a7e3839..28df844a23b6 100644
--- a/include/linux/dma-iommu.h
+++ b/include/linux/dma-iommu.h
@@ -27,6 +27,7 @@ int iommu_dma_init(void);
 
 /* Domain management interface for IOMMU drivers */
 int iommu_get_dma_cookie(struct iommu_domain *domain);
+int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base);
 void iommu_put_dma_cookie(struct iommu_domain *domain);
 
 /* Setup call for arch DMA mapping code */
@@ -86,6 +87,11 @@ static inline int iommu_get_dma_cookie(struct iommu_domain *domain)
 	return -ENODEV;
 }
 
+static inline int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base)
+{
+	return -ENODEV;
+}
+
 static inline void iommu_put_dma_cookie(struct iommu_domain *domain)
 {
 }
-- 
cgit v1.2.3


From e5b5234a36ca283158721d3d2e0cddfa324abdf9 Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Thu, 19 Jan 2017 20:57:47 +0000
Subject: iommu: Rename iommu_dm_regions into iommu_resv_regions

We want to extend the callbacks used for dm regions and
use them for reserved regions. Reserved regions can be
- directly mapped regions
- regions that cannot be iommu mapped (PCI host bridge windows, ...)
- MSI regions (because they belong to another address space or because
  they are not translated by the IOMMU and need special handling)

So let's rename the struct and also the callbacks.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Acked-by: Robin Murphy <robin.murphy@arm.com>
Reviewed-by: Tomasz Nowicki <tomasz.nowicki@caviumnetworks.com>
Tested-by: Tomasz Nowicki <tomasz.nowicki@caviumnetworks.com>
Tested-by: Bharat Bhushan <bharat.bhushan@nxp.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/iommu/amd_iommu.c | 20 ++++++++++----------
 drivers/iommu/iommu.c     | 22 +++++++++++-----------
 include/linux/iommu.h     | 29 +++++++++++++++--------------
 3 files changed, 36 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 3ef0f42984f2..f7a024f1e1dc 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -3161,8 +3161,8 @@ static bool amd_iommu_capable(enum iommu_cap cap)
 	return false;
 }
 
-static void amd_iommu_get_dm_regions(struct device *dev,
-				     struct list_head *head)
+static void amd_iommu_get_resv_regions(struct device *dev,
+				       struct list_head *head)
 {
 	struct unity_map_entry *entry;
 	int devid;
@@ -3172,7 +3172,7 @@ static void amd_iommu_get_dm_regions(struct device *dev,
 		return;
 
 	list_for_each_entry(entry, &amd_iommu_unity_map, list) {
-		struct iommu_dm_region *region;
+		struct iommu_resv_region *region;
 
 		if (devid < entry->devid_start || devid > entry->devid_end)
 			continue;
@@ -3195,18 +3195,18 @@ static void amd_iommu_get_dm_regions(struct device *dev,
 	}
 }
 
-static void amd_iommu_put_dm_regions(struct device *dev,
+static void amd_iommu_put_resv_regions(struct device *dev,
 				     struct list_head *head)
 {
-	struct iommu_dm_region *entry, *next;
+	struct iommu_resv_region *entry, *next;
 
 	list_for_each_entry_safe(entry, next, head, list)
 		kfree(entry);
 }
 
-static void amd_iommu_apply_dm_region(struct device *dev,
+static void amd_iommu_apply_resv_region(struct device *dev,
 				      struct iommu_domain *domain,
-				      struct iommu_dm_region *region)
+				      struct iommu_resv_region *region)
 {
 	struct dma_ops_domain *dma_dom = to_dma_ops_domain(to_pdomain(domain));
 	unsigned long start, end;
@@ -3230,9 +3230,9 @@ static const struct iommu_ops amd_iommu_ops = {
 	.add_device = amd_iommu_add_device,
 	.remove_device = amd_iommu_remove_device,
 	.device_group = amd_iommu_device_group,
-	.get_dm_regions = amd_iommu_get_dm_regions,
-	.put_dm_regions = amd_iommu_put_dm_regions,
-	.apply_dm_region = amd_iommu_apply_dm_region,
+	.get_resv_regions = amd_iommu_get_resv_regions,
+	.put_resv_regions = amd_iommu_put_resv_regions,
+	.apply_resv_region = amd_iommu_apply_resv_region,
 	.pgsize_bitmap	= AMD_IOMMU_PGSIZES,
 };
 
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index dbe7f653bb7c..1cee5c361c21 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -318,7 +318,7 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group,
 					      struct device *dev)
 {
 	struct iommu_domain *domain = group->default_domain;
-	struct iommu_dm_region *entry;
+	struct iommu_resv_region *entry;
 	struct list_head mappings;
 	unsigned long pg_size;
 	int ret = 0;
@@ -331,14 +331,14 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group,
 	pg_size = 1UL << __ffs(domain->pgsize_bitmap);
 	INIT_LIST_HEAD(&mappings);
 
-	iommu_get_dm_regions(dev, &mappings);
+	iommu_get_resv_regions(dev, &mappings);
 
 	/* We need to consider overlapping regions for different devices */
 	list_for_each_entry(entry, &mappings, list) {
 		dma_addr_t start, end, addr;
 
-		if (domain->ops->apply_dm_region)
-			domain->ops->apply_dm_region(dev, domain, entry);
+		if (domain->ops->apply_resv_region)
+			domain->ops->apply_resv_region(dev, domain, entry);
 
 		start = ALIGN(entry->start, pg_size);
 		end   = ALIGN(entry->start + entry->length, pg_size);
@@ -358,7 +358,7 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group,
 	}
 
 out:
-	iommu_put_dm_regions(dev, &mappings);
+	iommu_put_resv_regions(dev, &mappings);
 
 	return ret;
 }
@@ -1559,20 +1559,20 @@ int iommu_domain_set_attr(struct iommu_domain *domain,
 }
 EXPORT_SYMBOL_GPL(iommu_domain_set_attr);
 
-void iommu_get_dm_regions(struct device *dev, struct list_head *list)
+void iommu_get_resv_regions(struct device *dev, struct list_head *list)
 {
 	const struct iommu_ops *ops = dev->bus->iommu_ops;
 
-	if (ops && ops->get_dm_regions)
-		ops->get_dm_regions(dev, list);
+	if (ops && ops->get_resv_regions)
+		ops->get_resv_regions(dev, list);
 }
 
-void iommu_put_dm_regions(struct device *dev, struct list_head *list)
+void iommu_put_resv_regions(struct device *dev, struct list_head *list)
 {
 	const struct iommu_ops *ops = dev->bus->iommu_ops;
 
-	if (ops && ops->put_dm_regions)
-		ops->put_dm_regions(dev, list);
+	if (ops && ops->put_resv_regions)
+		ops->put_resv_regions(dev, list);
 }
 
 /* Request that a device is direct mapped by the IOMMU */
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 0ff5111f6959..bfecb8b74078 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -118,13 +118,13 @@ enum iommu_attr {
 };
 
 /**
- * struct iommu_dm_region - descriptor for a direct mapped memory region
+ * struct iommu_resv_region - descriptor for a reserved memory region
  * @list: Linked list pointers
  * @start: System physical start address of the region
  * @length: Length of the region in bytes
  * @prot: IOMMU Protection flags (READ/WRITE/...)
  */
-struct iommu_dm_region {
+struct iommu_resv_region {
 	struct list_head	list;
 	phys_addr_t		start;
 	size_t			length;
@@ -150,9 +150,9 @@ struct iommu_dm_region {
  * @device_group: find iommu group for a particular device
  * @domain_get_attr: Query domain attributes
  * @domain_set_attr: Change domain attributes
- * @get_dm_regions: Request list of direct mapping requirements for a device
- * @put_dm_regions: Free list of direct mapping requirements for a device
- * @apply_dm_region: Temporary helper call-back for iova reserved ranges
+ * @get_resv_regions: Request list of reserved regions for a device
+ * @put_resv_regions: Free list of reserved regions for a device
+ * @apply_resv_region: Temporary helper call-back for iova reserved ranges
  * @domain_window_enable: Configure and enable a particular window for a domain
  * @domain_window_disable: Disable a particular window for a domain
  * @domain_set_windows: Set the number of windows for a domain
@@ -184,11 +184,12 @@ struct iommu_ops {
 	int (*domain_set_attr)(struct iommu_domain *domain,
 			       enum iommu_attr attr, void *data);
 
-	/* Request/Free a list of direct mapping requirements for a device */
-	void (*get_dm_regions)(struct device *dev, struct list_head *list);
-	void (*put_dm_regions)(struct device *dev, struct list_head *list);
-	void (*apply_dm_region)(struct device *dev, struct iommu_domain *domain,
-				struct iommu_dm_region *region);
+	/* Request/Free a list of reserved regions for a device */
+	void (*get_resv_regions)(struct device *dev, struct list_head *list);
+	void (*put_resv_regions)(struct device *dev, struct list_head *list);
+	void (*apply_resv_region)(struct device *dev,
+				  struct iommu_domain *domain,
+				  struct iommu_resv_region *region);
 
 	/* Window handling functions */
 	int (*domain_window_enable)(struct iommu_domain *domain, u32 wnd_nr,
@@ -233,8 +234,8 @@ extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t io
 extern void iommu_set_fault_handler(struct iommu_domain *domain,
 			iommu_fault_handler_t handler, void *token);
 
-extern void iommu_get_dm_regions(struct device *dev, struct list_head *list);
-extern void iommu_put_dm_regions(struct device *dev, struct list_head *list);
+extern void iommu_get_resv_regions(struct device *dev, struct list_head *list);
+extern void iommu_put_resv_regions(struct device *dev, struct list_head *list);
 extern int iommu_request_dm_for_dev(struct device *dev);
 
 extern int iommu_attach_group(struct iommu_domain *domain,
@@ -443,12 +444,12 @@ static inline void iommu_set_fault_handler(struct iommu_domain *domain,
 {
 }
 
-static inline void iommu_get_dm_regions(struct device *dev,
+static inline void iommu_get_resv_regions(struct device *dev,
 					struct list_head *list)
 {
 }
 
-static inline void iommu_put_dm_regions(struct device *dev,
+static inline void iommu_put_resv_regions(struct device *dev,
 					struct list_head *list)
 {
 }
-- 
cgit v1.2.3


From d30ddcaa7b028049cdfee3a40248002d07b2bbf3 Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Thu, 19 Jan 2017 20:57:48 +0000
Subject: iommu: Add a new type field in iommu_resv_region

We introduce a new field to differentiate the reserved region
types and specialize the apply_resv_region implementation.

Legacy direct mapped regions have IOMMU_RESV_DIRECT type.
We introduce 2 new reserved memory types:
- IOMMU_RESV_MSI will characterize MSI regions that are mapped
- IOMMU_RESV_RESERVED characterize regions that cannot by mapped.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Tomasz Nowicki <tomasz.nowicki@caviumnetworks.com>
Tested-by: Bharat Bhushan <bharat.bhushan@nxp.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/iommu/amd_iommu.c | 1 +
 include/linux/iommu.h     | 7 +++++++
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index f7a024f1e1dc..5f7ea4faa505 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -3186,6 +3186,7 @@ static void amd_iommu_get_resv_regions(struct device *dev,
 
 		region->start = entry->address_start;
 		region->length = entry->address_end - entry->address_start;
+		region->type = IOMMU_RESV_DIRECT;
 		if (entry->prot & IOMMU_PROT_IR)
 			region->prot |= IOMMU_READ;
 		if (entry->prot & IOMMU_PROT_IW)
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index bfecb8b74078..233a6bf093bf 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -117,18 +117,25 @@ enum iommu_attr {
 	DOMAIN_ATTR_MAX,
 };
 
+/* These are the possible reserved region types */
+#define IOMMU_RESV_DIRECT	(1 << 0)
+#define IOMMU_RESV_RESERVED	(1 << 1)
+#define IOMMU_RESV_MSI		(1 << 2)
+
 /**
  * struct iommu_resv_region - descriptor for a reserved memory region
  * @list: Linked list pointers
  * @start: System physical start address of the region
  * @length: Length of the region in bytes
  * @prot: IOMMU Protection flags (READ/WRITE/...)
+ * @type: Type of the reserved region
  */
 struct iommu_resv_region {
 	struct list_head	list;
 	phys_addr_t		start;
 	size_t			length;
 	int			prot;
+	int			type;
 };
 
 #ifdef CONFIG_IOMMU_API
-- 
cgit v1.2.3


From 2b20cbba3390a55c511acba2f0f517dd27a528b2 Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Thu, 19 Jan 2017 20:57:49 +0000
Subject: iommu: iommu_alloc_resv_region

Introduce a new helper serving the purpose to allocate a reserved
region. This will be used in iommu driver implementing reserved
region callbacks.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Tomasz Nowicki <tomasz.nowicki@caviumnetworks.com>
Tested-by: Tomasz Nowicki <tomasz.nowicki@caviumnetworks.com>
Tested-by: Bharat Bhushan <bharat.bhushan@nxp.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/iommu/iommu.c | 18 ++++++++++++++++++
 include/linux/iommu.h |  2 ++
 2 files changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 1cee5c361c21..927878d0a612 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1575,6 +1575,24 @@ void iommu_put_resv_regions(struct device *dev, struct list_head *list)
 		ops->put_resv_regions(dev, list);
 }
 
+struct iommu_resv_region *iommu_alloc_resv_region(phys_addr_t start,
+						  size_t length,
+						  int prot, int type)
+{
+	struct iommu_resv_region *region;
+
+	region = kzalloc(sizeof(*region), GFP_KERNEL);
+	if (!region)
+		return NULL;
+
+	INIT_LIST_HEAD(&region->list);
+	region->start = start;
+	region->length = length;
+	region->prot = prot;
+	region->type = type;
+	return region;
+}
+
 /* Request that a device is direct mapped by the IOMMU */
 int iommu_request_dm_for_dev(struct device *dev)
 {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 233a6bf093bf..f6bb55d3e606 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -244,6 +244,8 @@ extern void iommu_set_fault_handler(struct iommu_domain *domain,
 extern void iommu_get_resv_regions(struct device *dev, struct list_head *list);
 extern void iommu_put_resv_regions(struct device *dev, struct list_head *list);
 extern int iommu_request_dm_for_dev(struct device *dev);
+extern struct iommu_resv_region *
+iommu_alloc_resv_region(phys_addr_t start, size_t length, int prot, int type);
 
 extern int iommu_attach_group(struct iommu_domain *domain,
 			      struct iommu_group *group);
-- 
cgit v1.2.3


From 6c65fb318e8bbf21e939e651028b955324f1d873 Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Thu, 19 Jan 2017 20:57:51 +0000
Subject: iommu: iommu_get_group_resv_regions

Introduce iommu_get_group_resv_regions whose role consists in
enumerating all devices from the group and collecting their
reserved regions. The list is sorted and overlaps between
regions of the same type are handled by merging the regions.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Tomasz Nowicki <tomasz.nowicki@caviumnetworks.com>
Tested-by: Tomasz Nowicki <tomasz.nowicki@caviumnetworks.com>
Tested-by: Bharat Bhushan <bharat.bhushan@nxp.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/iommu/iommu.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/iommu.h |  8 +++++
 2 files changed, 106 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 41c190695749..640056ba46c2 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -133,6 +133,104 @@ static ssize_t iommu_group_show_name(struct iommu_group *group, char *buf)
 	return sprintf(buf, "%s\n", group->name);
 }
 
+/**
+ * iommu_insert_resv_region - Insert a new region in the
+ * list of reserved regions.
+ * @new: new region to insert
+ * @regions: list of regions
+ *
+ * The new element is sorted by address with respect to the other
+ * regions of the same type. In case it overlaps with another
+ * region of the same type, regions are merged. In case it
+ * overlaps with another region of different type, regions are
+ * not merged.
+ */
+static int iommu_insert_resv_region(struct iommu_resv_region *new,
+				    struct list_head *regions)
+{
+	struct iommu_resv_region *region;
+	phys_addr_t start = new->start;
+	phys_addr_t end = new->start + new->length - 1;
+	struct list_head *pos = regions->next;
+
+	while (pos != regions) {
+		struct iommu_resv_region *entry =
+			list_entry(pos, struct iommu_resv_region, list);
+		phys_addr_t a = entry->start;
+		phys_addr_t b = entry->start + entry->length - 1;
+		int type = entry->type;
+
+		if (end < a) {
+			goto insert;
+		} else if (start > b) {
+			pos = pos->next;
+		} else if ((start >= a) && (end <= b)) {
+			if (new->type == type)
+				goto done;
+			else
+				pos = pos->next;
+		} else {
+			if (new->type == type) {
+				phys_addr_t new_start = min(a, start);
+				phys_addr_t new_end = max(b, end);
+
+				list_del(&entry->list);
+				entry->start = new_start;
+				entry->length = new_end - new_start + 1;
+				iommu_insert_resv_region(entry, regions);
+			} else {
+				pos = pos->next;
+			}
+		}
+	}
+insert:
+	region = iommu_alloc_resv_region(new->start, new->length,
+					 new->prot, new->type);
+	if (!region)
+		return -ENOMEM;
+
+	list_add_tail(&region->list, pos);
+done:
+	return 0;
+}
+
+static int
+iommu_insert_device_resv_regions(struct list_head *dev_resv_regions,
+				 struct list_head *group_resv_regions)
+{
+	struct iommu_resv_region *entry;
+	int ret;
+
+	list_for_each_entry(entry, dev_resv_regions, list) {
+		ret = iommu_insert_resv_region(entry, group_resv_regions);
+		if (ret)
+			break;
+	}
+	return ret;
+}
+
+int iommu_get_group_resv_regions(struct iommu_group *group,
+				 struct list_head *head)
+{
+	struct iommu_device *device;
+	int ret = 0;
+
+	mutex_lock(&group->mutex);
+	list_for_each_entry(device, &group->devices, list) {
+		struct list_head dev_resv_regions;
+
+		INIT_LIST_HEAD(&dev_resv_regions);
+		iommu_get_resv_regions(device->dev, &dev_resv_regions);
+		ret = iommu_insert_device_resv_regions(&dev_resv_regions, head);
+		iommu_put_resv_regions(device->dev, &dev_resv_regions);
+		if (ret)
+			break;
+	}
+	mutex_unlock(&group->mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_get_group_resv_regions);
+
 static IOMMU_GROUP_ATTR(name, S_IRUGO, iommu_group_show_name, NULL);
 
 static void iommu_group_release(struct kobject *kobj)
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index f6bb55d3e606..bec3730dc009 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -246,6 +246,8 @@ extern void iommu_put_resv_regions(struct device *dev, struct list_head *list);
 extern int iommu_request_dm_for_dev(struct device *dev);
 extern struct iommu_resv_region *
 iommu_alloc_resv_region(phys_addr_t start, size_t length, int prot, int type);
+extern int iommu_get_group_resv_regions(struct iommu_group *group,
+					struct list_head *head);
 
 extern int iommu_attach_group(struct iommu_domain *domain,
 			      struct iommu_group *group);
@@ -463,6 +465,12 @@ static inline void iommu_put_resv_regions(struct device *dev,
 {
 }
 
+static inline int iommu_get_group_resv_regions(struct iommu_group *group,
+					       struct list_head *head)
+{
+	return -ENODEV;
+}
+
 static inline int iommu_request_dm_for_dev(struct device *dev)
 {
 	return -ENODEV;
-- 
cgit v1.2.3


From d0f949e220fdf5ed1033e9f6e80749b05f2e7b70 Mon Sep 17 00:00:00 2001
From: Benjamin Gaignard <benjamin.gaignard@linaro.org>
Date: Fri, 20 Jan 2017 10:15:03 +0100
Subject: mfd: Add STM32 Timers driver

This hardware block could at used at same time for PWM generation
and IIO timers.
PWM and IIO timer configuration are mixed in the same registers
so we need a multi fonction driver to be able to share those registers.

version 7:
- rebase on v4.10-rc2

version 6:
- rename files to stm32-timers
- rename functions to stm32_timers_xxx

version 5:
- fix Lee comments about detect function
- add missing dependency on REGMAP_MMIO

version 4:
- add a function to detect Auto Reload Register (ARR) size
- rename the structure shared with other drivers

version 2:
- rename driver "stm32-gptimer" to be align with SoC documentation
- only keep one compatible
- use of_platform_populate() instead of devm_mfd_add_devices()

Signed-off-by: Benjamin Gaignard <benjamin.gaignard@st.com>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/Kconfig              | 11 ++++++
 drivers/mfd/Makefile             |  2 +
 drivers/mfd/stm32-timers.c       | 80 ++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/stm32-timers.h | 71 +++++++++++++++++++++++++++++++++++
 4 files changed, 164 insertions(+)
 create mode 100644 drivers/mfd/stm32-timers.c
 create mode 100644 include/linux/mfd/stm32-timers.h

(limited to 'include/linux')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 4ce3b6f11830..d0c14b88b24e 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -1621,6 +1621,17 @@ config MFD_STW481X
 	  in various ST Microelectronics and ST-Ericsson embedded
 	  Nomadik series.
 
+config MFD_STM32_TIMERS
+	tristate "Support for STM32 Timers"
+	depends on (ARCH_STM32 && OF) || COMPILE_TEST
+	select MFD_CORE
+	select REGMAP
+	select REGMAP_MMIO
+	help
+	  Select this option to enable STM32 timers driver used
+	  for PWM and IIO Timer. This driver allow to share the
+	  registers between the others drivers.
+
 menu "Multimedia Capabilities Port drivers"
 	depends on ARCH_SA1100
 
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index dda4d4f73ad7..876ca8600c51 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -212,3 +212,5 @@ obj-$(CONFIG_MFD_MT6397)	+= mt6397-core.o
 
 obj-$(CONFIG_MFD_ALTERA_A10SR)	+= altera-a10sr.o
 obj-$(CONFIG_MFD_SUN4I_GPADC)	+= sun4i-gpadc.o
+
+obj-$(CONFIG_MFD_STM32_TIMERS) 	+= stm32-timers.o
diff --git a/drivers/mfd/stm32-timers.c b/drivers/mfd/stm32-timers.c
new file mode 100644
index 000000000000..41bd9017f3d0
--- /dev/null
+++ b/drivers/mfd/stm32-timers.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) STMicroelectronics 2016
+ *
+ * Author: Benjamin Gaignard <benjamin.gaignard@st.com>
+ *
+ * License terms:  GNU General Public License (GPL), version 2
+ */
+
+#include <linux/mfd/stm32-timers.h>
+#include <linux/module.h>
+#include <linux/of_platform.h>
+#include <linux/reset.h>
+
+static const struct regmap_config stm32_timers_regmap_cfg = {
+	.reg_bits = 32,
+	.val_bits = 32,
+	.reg_stride = sizeof(u32),
+	.max_register = 0x400,
+};
+
+static void stm32_timers_get_arr_size(struct stm32_timers *ddata)
+{
+	/*
+	 * Only the available bits will be written so when readback
+	 * we get the maximum value of auto reload register
+	 */
+	regmap_write(ddata->regmap, TIM_ARR, ~0L);
+	regmap_read(ddata->regmap, TIM_ARR, &ddata->max_arr);
+	regmap_write(ddata->regmap, TIM_ARR, 0x0);
+}
+
+static int stm32_timers_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct stm32_timers *ddata;
+	struct resource *res;
+	void __iomem *mmio;
+
+	ddata = devm_kzalloc(dev, sizeof(*ddata), GFP_KERNEL);
+	if (!ddata)
+		return -ENOMEM;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	mmio = devm_ioremap_resource(dev, res);
+	if (IS_ERR(mmio))
+		return PTR_ERR(mmio);
+
+	ddata->regmap = devm_regmap_init_mmio_clk(dev, "int", mmio,
+						  &stm32_timers_regmap_cfg);
+	if (IS_ERR(ddata->regmap))
+		return PTR_ERR(ddata->regmap);
+
+	ddata->clk = devm_clk_get(dev, NULL);
+	if (IS_ERR(ddata->clk))
+		return PTR_ERR(ddata->clk);
+
+	stm32_timers_get_arr_size(ddata);
+
+	platform_set_drvdata(pdev, ddata);
+
+	return of_platform_populate(pdev->dev.of_node, NULL, NULL, &pdev->dev);
+}
+
+static const struct of_device_id stm32_timers_of_match[] = {
+	{ .compatible = "st,stm32-timers", },
+	{ /* end node */ },
+};
+MODULE_DEVICE_TABLE(of, stm32_timers_of_match);
+
+static struct platform_driver stm32_timers_driver = {
+	.probe = stm32_timers_probe,
+	.driver	= {
+		.name = "stm32-timers",
+		.of_match_table = stm32_timers_of_match,
+	},
+};
+module_platform_driver(stm32_timers_driver);
+
+MODULE_DESCRIPTION("STMicroelectronics STM32 Timers");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/mfd/stm32-timers.h b/include/linux/mfd/stm32-timers.h
new file mode 100644
index 000000000000..d0300045f04a
--- /dev/null
+++ b/include/linux/mfd/stm32-timers.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) STMicroelectronics 2016
+ *
+ * Author: Benjamin Gaignard <benjamin.gaignard@st.com>
+ *
+ * License terms:  GNU General Public License (GPL), version 2
+ */
+
+#ifndef _LINUX_STM32_GPTIMER_H_
+#define _LINUX_STM32_GPTIMER_H_
+
+#include <linux/clk.h>
+#include <linux/regmap.h>
+
+#define TIM_CR1		0x00	/* Control Register 1      */
+#define TIM_CR2		0x04	/* Control Register 2      */
+#define TIM_SMCR	0x08	/* Slave mode control reg  */
+#define TIM_DIER	0x0C	/* DMA/interrupt register  */
+#define TIM_SR		0x10	/* Status register	   */
+#define TIM_EGR		0x14	/* Event Generation Reg    */
+#define TIM_CCMR1	0x18	/* Capt/Comp 1 Mode Reg    */
+#define TIM_CCMR2	0x1C	/* Capt/Comp 2 Mode Reg    */
+#define TIM_CCER	0x20	/* Capt/Comp Enable Reg    */
+#define TIM_PSC		0x28	/* Prescaler               */
+#define TIM_ARR		0x2c	/* Auto-Reload Register    */
+#define TIM_CCR1	0x34	/* Capt/Comp Register 1    */
+#define TIM_CCR2	0x38	/* Capt/Comp Register 2    */
+#define TIM_CCR3	0x3C	/* Capt/Comp Register 3    */
+#define TIM_CCR4	0x40	/* Capt/Comp Register 4    */
+#define TIM_BDTR	0x44	/* Break and Dead-Time Reg */
+
+#define TIM_CR1_CEN	BIT(0)	/* Counter Enable	   */
+#define TIM_CR1_ARPE	BIT(7)	/* Auto-reload Preload Ena */
+#define TIM_CR2_MMS	(BIT(4) | BIT(5) | BIT(6)) /* Master mode selection */
+#define TIM_SMCR_SMS	(BIT(0) | BIT(1) | BIT(2)) /* Slave mode selection */
+#define TIM_SMCR_TS	(BIT(4) | BIT(5) | BIT(6)) /* Trigger selection */
+#define TIM_DIER_UIE	BIT(0)	/* Update interrupt	   */
+#define TIM_SR_UIF	BIT(0)	/* Update interrupt flag   */
+#define TIM_EGR_UG	BIT(0)	/* Update Generation       */
+#define TIM_CCMR_PE	BIT(3)	/* Channel Preload Enable  */
+#define TIM_CCMR_M1	(BIT(6) | BIT(5))  /* Channel PWM Mode 1 */
+#define TIM_CCER_CC1E	BIT(0)	/* Capt/Comp 1  out Ena    */
+#define TIM_CCER_CC1P	BIT(1)	/* Capt/Comp 1  Polarity   */
+#define TIM_CCER_CC1NE	BIT(2)	/* Capt/Comp 1N out Ena    */
+#define TIM_CCER_CC1NP	BIT(3)	/* Capt/Comp 1N Polarity   */
+#define TIM_CCER_CC2E	BIT(4)	/* Capt/Comp 2  out Ena    */
+#define TIM_CCER_CC3E	BIT(8)	/* Capt/Comp 3  out Ena    */
+#define TIM_CCER_CC4E	BIT(12)	/* Capt/Comp 4  out Ena    */
+#define TIM_CCER_CCXE	(BIT(0) | BIT(4) | BIT(8) | BIT(12))
+#define TIM_BDTR_BKE	BIT(12) /* Break input enable	   */
+#define TIM_BDTR_BKP	BIT(13) /* Break input polarity	   */
+#define TIM_BDTR_AOE	BIT(14)	/* Automatic Output Enable */
+#define TIM_BDTR_MOE	BIT(15)	/* Main Output Enable      */
+#define TIM_BDTR_BKF	(BIT(16) | BIT(17) | BIT(18) | BIT(19))
+#define TIM_BDTR_BK2F	(BIT(20) | BIT(21) | BIT(22) | BIT(23))
+#define TIM_BDTR_BK2E	BIT(24) /* Break 2 input enable	   */
+#define TIM_BDTR_BK2P	BIT(25) /* Break 2 input polarity  */
+
+#define MAX_TIM_PSC		0xFFFF
+#define TIM_CR2_MMS_SHIFT	4
+#define TIM_SMCR_TS_SHIFT	4
+#define TIM_BDTR_BKF_MASK	0xF
+#define TIM_BDTR_BKF_SHIFT	16
+#define TIM_BDTR_BK2F_SHIFT	20
+
+struct stm32_timers {
+	struct clk *clk;
+	struct regmap *regmap;
+	u32 max_arr;
+};
+#endif
-- 
cgit v1.2.3


From 631a9639ac413da6242cb15558ebd661cf633622 Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Thu, 19 Jan 2017 20:57:57 +0000
Subject: irqdomain: Add irq domain MSI and MSI_REMAP flags

We introduce two new enum values for the irq domain flag:
- IRQ_DOMAIN_FLAG_MSI indicates the irq domain corresponds to
  an MSI domain
- IRQ_DOMAIN_FLAG_MSI_REMAP indicates the irq domain has MSI
  remapping capabilities.

Those values will be useful to check all MSI irq domains have
MSI remapping support when assessing the safety of IRQ assignment
to a guest.

irq_domain_hierarchical_is_msi_remap() allows to check if an
irq domain or any parent implements MSI remapping.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Reviewed-by: Tomasz Nowicki <tomasz.nowicki@caviumnetworks.com>
Tested-by: Tomasz Nowicki <tomasz.nowicki@caviumnetworks.com>
Tested-by: Bharat Bhushan <bharat.bhushan@nxp.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 include/linux/irqdomain.h | 35 +++++++++++++++++++++++++++++++++++
 kernel/irq/irqdomain.c    | 14 ++++++++++++++
 2 files changed, 49 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index ffb84604c1de..bc2f5719dace 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -183,6 +183,12 @@ enum {
 	/* Irq domain is an IPI domain with single virq */
 	IRQ_DOMAIN_FLAG_IPI_SINGLE	= (1 << 3),
 
+	/* Irq domain implements MSIs */
+	IRQ_DOMAIN_FLAG_MSI		= (1 << 4),
+
+	/* Irq domain implements MSI remapping */
+	IRQ_DOMAIN_FLAG_MSI_REMAP	= (1 << 5),
+
 	/*
 	 * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved
 	 * for implementation specific purposes and ignored by the
@@ -446,6 +452,19 @@ static inline bool irq_domain_is_ipi_single(struct irq_domain *domain)
 {
 	return domain->flags & IRQ_DOMAIN_FLAG_IPI_SINGLE;
 }
+
+static inline bool irq_domain_is_msi(struct irq_domain *domain)
+{
+	return domain->flags & IRQ_DOMAIN_FLAG_MSI;
+}
+
+static inline bool irq_domain_is_msi_remap(struct irq_domain *domain)
+{
+	return domain->flags & IRQ_DOMAIN_FLAG_MSI_REMAP;
+}
+
+extern bool irq_domain_hierarchical_is_msi_remap(struct irq_domain *domain);
+
 #else	/* CONFIG_IRQ_DOMAIN_HIERARCHY */
 static inline void irq_domain_activate_irq(struct irq_data *data) { }
 static inline void irq_domain_deactivate_irq(struct irq_data *data) { }
@@ -477,6 +496,22 @@ static inline bool irq_domain_is_ipi_single(struct irq_domain *domain)
 {
 	return false;
 }
+
+static inline bool irq_domain_is_msi(struct irq_domain *domain)
+{
+	return false;
+}
+
+static inline bool irq_domain_is_msi_remap(struct irq_domain *domain)
+{
+	return false;
+}
+
+static inline bool
+irq_domain_hierarchical_is_msi_remap(struct irq_domain *domain)
+{
+	return false;
+}
 #endif	/* CONFIG_IRQ_DOMAIN_HIERARCHY */
 
 #else /* CONFIG_IRQ_DOMAIN */
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8c0a0ae43521..876e13172dc8 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1392,6 +1392,20 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain)
 	if (domain->ops->alloc)
 		domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY;
 }
+
+/**
+ * irq_domain_hierarchical_is_msi_remap - Check if the domain or any
+ * parent has MSI remapping support
+ * @domain: domain pointer
+ */
+bool irq_domain_hierarchical_is_msi_remap(struct irq_domain *domain)
+{
+	for (; domain; domain = domain->parent) {
+		if (irq_domain_is_msi_remap(domain))
+			return true;
+	}
+	return false;
+}
 #else	/* CONFIG_IRQ_DOMAIN_HIERARCHY */
 /**
  * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain
-- 
cgit v1.2.3


From c7b41f0af38f53e46050b56a5b0e96710097b83c Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Thu, 19 Jan 2017 20:57:59 +0000
Subject: irqdomain: irq_domain_check_msi_remap

This new function checks whether all MSI irq domains
implement IRQ remapping. This is useful to understand
whether VFIO passthrough is safe with respect to interrupts.

On ARM typically an MSI controller can sit downstream
to the IOMMU without preventing VFIO passthrough.
As such any assigned device can write into the MSI doorbell.
In case the MSI controller implements IRQ remapping, assigned
devices will not be able to trigger interrupts towards the
host. On the contrary, the assignment must be emphasized as
unsafe with respect to interrupts.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Reviewed-by: Tomasz Nowicki <tomasz.nowicki@caviumnetworks.com>
Tested-by: Tomasz Nowicki <tomasz.nowicki@caviumnetworks.com>
Tested-by: Bharat Bhushan <bharat.bhushan@nxp.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 include/linux/irqdomain.h |  1 +
 kernel/irq/irqdomain.c    | 25 +++++++++++++++++++++++++
 2 files changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index bc2f5719dace..188eced6813e 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -222,6 +222,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
 					 void *host_data);
 extern struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec,
 						   enum irq_domain_bus_token bus_token);
+extern bool irq_domain_check_msi_remap(void);
 extern void irq_set_default_host(struct irq_domain *host);
 extern int irq_domain_alloc_descs(int virq, unsigned int nr_irqs,
 				  irq_hw_number_t hwirq, int node,
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 876e13172dc8..80c4f9312187 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -277,6 +277,31 @@ struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec,
 }
 EXPORT_SYMBOL_GPL(irq_find_matching_fwspec);
 
+/**
+ * irq_domain_check_msi_remap - Check whether all MSI irq domains implement
+ * IRQ remapping
+ *
+ * Return: false if any MSI irq domain does not support IRQ remapping,
+ * true otherwise (including if there is no MSI irq domain)
+ */
+bool irq_domain_check_msi_remap(void)
+{
+	struct irq_domain *h;
+	bool ret = true;
+
+	mutex_lock(&irq_domain_mutex);
+	list_for_each_entry(h, &irq_domain_list, link) {
+		if (irq_domain_is_msi(h) &&
+		    !irq_domain_hierarchical_is_msi_remap(h)) {
+			ret = false;
+			break;
+		}
+	}
+	mutex_unlock(&irq_domain_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(irq_domain_check_msi_remap);
+
 /**
  * irq_set_default_host() - Set a "default" irq domain
  * @domain: default domain pointer
-- 
cgit v1.2.3


From d78973c32a210b0057b51880f7119bf2b61d5a65 Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelaf@google.com>
Date: Mon, 12 Dec 2016 18:01:45 -0800
Subject: llist: Clarify comments about when locking is needed

llist.h comments are confusing about when locking is needed versus when it
isn't. Clarify these comments by being more descriptive about why locking is
needed for llist_del_first.

Cc: Ingo Molnar <mingo@kernel.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Huang Ying <ying.huang@intel.com>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Joel Fernandes <joelaf@google.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
---
 include/linux/llist.h | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/llist.h b/include/linux/llist.h
index fd4ca0b4fe0f..171baa90f6f6 100644
--- a/include/linux/llist.h
+++ b/include/linux/llist.h
@@ -3,28 +3,33 @@
 /*
  * Lock-less NULL terminated single linked list
  *
- * If there are multiple producers and multiple consumers, llist_add
- * can be used in producers and llist_del_all can be used in
- * consumers.  They can work simultaneously without lock.  But
- * llist_del_first can not be used here.  Because llist_del_first
- * depends on list->first->next does not changed if list->first is not
- * changed during its operation, but llist_del_first, llist_add,
- * llist_add (or llist_del_all, llist_add, llist_add) sequence in
- * another consumer may violate that.
- *
- * If there are multiple producers and one consumer, llist_add can be
- * used in producers and llist_del_all or llist_del_first can be used
- * in the consumer.
- *
- * This can be summarized as follow:
+ * Cases where locking is not needed:
+ * If there are multiple producers and multiple consumers, llist_add can be
+ * used in producers and llist_del_all can be used in consumers simultaneously
+ * without locking. Also a single consumer can use llist_del_first while
+ * multiple producers simultaneously use llist_add, without any locking.
+ *
+ * Cases where locking is needed:
+ * If we have multiple consumers with llist_del_first used in one consumer, and
+ * llist_del_first or llist_del_all used in other consumers, then a lock is
+ * needed.  This is because llist_del_first depends on list->first->next not
+ * changing, but without lock protection, there's no way to be sure about that
+ * if a preemption happens in the middle of the delete operation and on being
+ * preempted back, the list->first is the same as before causing the cmpxchg in
+ * llist_del_first to succeed. For example, while a llist_del_first operation
+ * is in progress in one consumer, then a llist_del_first, llist_add,
+ * llist_add (or llist_del_all, llist_add, llist_add) sequence in another
+ * consumer may cause violations.
+ *
+ * This can be summarized as follows:
  *
  *           |   add    | del_first |  del_all
  * add       |    -     |     -     |     -
  * del_first |          |     L     |     L
  * del_all   |          |           |     -
  *
- * Where "-" stands for no lock is needed, while "L" stands for lock
- * is needed.
+ * Where, a particular row's operation can happen concurrently with a column's
+ * operation, with "-" being no lock needed, while "L" being lock is needed.
  *
  * The list entries deleted via llist_del_all can be traversed with
  * traversing function such as llist_for_each etc.  But the list
-- 
cgit v1.2.3


From 02a5c550b2738f2bfea8e1e00aa75944d71c9e18 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 2 Nov 2016 17:25:06 -0700
Subject: rcu: Abstract extended quiescent state determination

This commit is the fourth step towards full abstraction of all accesses
to the ->dynticks counter, implementing previously open-coded checks and
comparisons in new rcu_dynticks_in_eqs() and rcu_dynticks_in_eqs_since()
functions.  This abstraction will ease changes to the ->dynticks counter
operation.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
---
 include/linux/rcutiny.h  |  6 ++++++
 kernel/rcu/tree.c        | 52 +++++++++++++++++++++++++++++++++++-------------
 kernel/rcu/tree.h        |  2 ++
 kernel/rcu/tree_exp.h    |  6 +++---
 kernel/rcu/tree_plugin.h |  2 +-
 kernel/rcu/tree_trace.c  |  2 +-
 6 files changed, 51 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index ac81e4063b40..4f9b2fa2173d 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -27,6 +27,12 @@
 
 #include <linux/cache.h>
 
+struct rcu_dynticks;
+static inline int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
+{
+	return 0;
+}
+
 static inline unsigned long get_state_synchronize_rcu(void)
 {
 	return 0;
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 3169d5a21b55..8b970319c75b 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -336,17 +336,48 @@ static void rcu_dynticks_eqs_online(void)
 	atomic_add(0x1, &rdtp->dynticks);
 }
 
+/*
+ * Is the current CPU in an extended quiescent state?
+ *
+ * No ordering, as we are sampling CPU-local information.
+ */
+bool rcu_dynticks_curr_cpu_in_eqs(void)
+{
+	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+
+	return !(atomic_read(&rdtp->dynticks) & 0x1);
+}
+
 /*
  * Snapshot the ->dynticks counter with full ordering so as to allow
  * stable comparison of this counter with past and future snapshots.
  */
-static int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
+int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
 {
 	int snap = atomic_add_return(0, &rdtp->dynticks);
 
 	return snap;
 }
 
+/*
+ * Return true if the snapshot returned from rcu_dynticks_snap()
+ * indicates that RCU is in an extended quiescent state.
+ */
+static bool rcu_dynticks_in_eqs(int snap)
+{
+	return !(snap & 0x1);
+}
+
+/*
+ * Return true if the CPU corresponding to the specified rcu_dynticks
+ * structure has spent some time in an extended quiescent state since
+ * rcu_dynticks_snap() returned the specified snapshot.
+ */
+static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap)
+{
+	return snap != rcu_dynticks_snap(rdtp);
+}
+
 /*
  * Do a double-increment of the ->dynticks counter to emulate a
  * momentary idle-CPU quiescent state.
@@ -1045,7 +1076,7 @@ void rcu_nmi_enter(void)
 	 * to be in the outermost NMI handler that interrupted an RCU-idle
 	 * period (observation due to Andy Lutomirski).
 	 */
-	if (!(atomic_read(&rdtp->dynticks) & 0x1)) {
+	if (rcu_dynticks_curr_cpu_in_eqs()) {
 		rcu_dynticks_eqs_exit();
 		incby = 1;
 	}
@@ -1071,7 +1102,7 @@ void rcu_nmi_exit(void)
 	 * to us!)
 	 */
 	WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0);
-	WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+	WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());
 
 	/*
 	 * If the nesting level is not 1, the CPU wasn't RCU-idle, so
@@ -1097,9 +1128,7 @@ void rcu_nmi_exit(void)
  */
 bool notrace __rcu_is_watching(void)
 {
-	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
-
-	return atomic_read(&rdtp->dynticks) & 0x1;
+	return !rcu_dynticks_curr_cpu_in_eqs();
 }
 
 /**
@@ -1184,7 +1213,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
 {
 	rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks);
 	rcu_sysidle_check_cpu(rdp, isidle, maxj);
-	if ((rdp->dynticks_snap & 0x1) == 0) {
+	if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
 		trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
 		if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
 				 rdp->mynode->gpnum))
@@ -1203,12 +1232,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
 static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
 				    bool *isidle, unsigned long *maxj)
 {
-	unsigned int curr;
 	int *rcrmp;
-	unsigned int snap;
-
-	curr = (unsigned int)rcu_dynticks_snap(rdp->dynticks);
-	snap = (unsigned int)rdp->dynticks_snap;
 
 	/*
 	 * If the CPU passed through or entered a dynticks idle phase with
@@ -1218,7 +1242,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
 	 * read-side critical section that started before the beginning
 	 * of the current RCU grace period.
 	 */
-	if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
+	if (rcu_dynticks_in_eqs_since(rdp->dynticks, rdp->dynticks_snap)) {
 		trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
 		rdp->dynticks_fqs++;
 		return 1;
@@ -3807,7 +3831,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
 	rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
 	rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
 	WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
-	WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
+	WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp->dynticks)));
 	rdp->cpu = cpu;
 	rdp->rsp = rsp;
 	rcu_boot_init_nocb_percpu_data(rdp);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index fe98dd24adf8..3b953dcf6afc 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -595,6 +595,8 @@ extern struct rcu_state rcu_bh_state;
 extern struct rcu_state rcu_preempt_state;
 #endif /* #ifdef CONFIG_PREEMPT_RCU */
 
+int rcu_dynticks_snap(struct rcu_dynticks *rdtp);
+
 #ifdef CONFIG_RCU_BOOST
 DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
 DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 011f626b2fd8..e155a465cf84 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -360,7 +360,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
 			rdp->exp_dynticks_snap =
 				rcu_dynticks_snap(rdp->dynticks);
 			if (raw_smp_processor_id() == cpu ||
-			    !(rdp->exp_dynticks_snap & 0x1) ||
+			    rcu_dynticks_in_eqs(rdp->exp_dynticks_snap) ||
 			    !(rnp->qsmaskinitnext & rdp->grpmask))
 				mask_ofl_test |= rdp->grpmask;
 		}
@@ -383,8 +383,8 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
 			if (!(mask_ofl_ipi & mask))
 				continue;
 retry_ipi:
-			if (rcu_dynticks_snap(rdp->dynticks) !=
-			    rdp->exp_dynticks_snap) {
+			if (rcu_dynticks_in_eqs_since(rdp->dynticks,
+						      rdp->exp_dynticks_snap)) {
 				mask_ofl_test |= mask;
 				continue;
 			}
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 56583e764ebf..652209589adf 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1643,7 +1643,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
 	       "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
 	       "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],
 	       ticks_value, ticks_title,
-	       atomic_read(&rdtp->dynticks) & 0xfff,
+	       rcu_dynticks_snap(rdtp) & 0xfff,
 	       rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
 	       rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
 	       READ_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index b1f28972872c..b833cd0a29e8 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -124,7 +124,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
 		   rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu),
 		   rdp->core_needs_qs);
 	seq_printf(m, " dt=%d/%llx/%d df=%lu",
-		   atomic_read(&rdp->dynticks->dynticks),
+		   rcu_dynticks_snap(rdp->dynticks),
 		   rdp->dynticks->dynticks_nesting,
 		   rdp->dynticks->dynticks_nmi_nesting,
 		   rdp->dynticks_fqs);
-- 
cgit v1.2.3


From 1cce1eea0aff51201753fcaca421df825b0813b6 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <n.borisov.lkml@gmail.com>
Date: Wed, 14 Dec 2016 15:56:33 +0200
Subject: inotify: Convert to using per-namespace limits

This patchset converts inotify to using the newly introduced
per-userns sysctl infrastructure.

Currently the inotify instances/watches are being accounted in the
user_struct structure. This means that in setups where multiple
users in unprivileged containers map to the same underlying
real user (i.e. pointing to the same user_struct) the inotify limits
are going to be shared as well, allowing one user(or application) to exhaust
all others limits.

Fix this by switching the inotify sysctls to using the
per-namespace/per-user limits. This will allow the server admin to
set sensible global limits, which can further be tuned inside every
individual user namespace. Additionally, in order to preserve the
sysctl ABI make the existing inotify instances/watches sysctls
modify the values of the initial user namespace.

Signed-off-by: Nikolay Borisov <n.borisov.lkml@gmail.com>
Acked-by: Jan Kara <jack@suse.cz>
Acked-by: Serge Hallyn <serge@hallyn.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/notify/inotify/inotify.h          | 17 +++++++++++++++++
 fs/notify/inotify/inotify_fsnotify.c |  6 ++----
 fs/notify/inotify/inotify_user.c     | 34 +++++++++++++++++-----------------
 include/linux/fsnotify_backend.h     |  3 ++-
 include/linux/sched.h                |  4 ----
 include/linux/user_namespace.h       |  4 ++++
 kernel/ucount.c                      |  6 +++++-
 7 files changed, 47 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index a6f5907a3fee..7c461fd49c4c 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -30,3 +30,20 @@ extern int inotify_handle_event(struct fsnotify_group *group,
 				const unsigned char *file_name, u32 cookie);
 
 extern const struct fsnotify_ops inotify_fsnotify_ops;
+
+#ifdef CONFIG_INOTIFY_USER
+static inline void dec_inotify_instances(struct ucounts *ucounts)
+{
+	dec_ucount(ucounts, UCOUNT_INOTIFY_INSTANCES);
+}
+
+static inline struct ucounts *inc_inotify_watches(struct ucounts *ucounts)
+{
+	return inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_INOTIFY_WATCHES);
+}
+
+static inline void dec_inotify_watches(struct ucounts *ucounts)
+{
+	dec_ucount(ucounts, UCOUNT_INOTIFY_WATCHES);
+}
+#endif
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 19e7ec109a75..f36c29398de3 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -165,10 +165,8 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
 	/* ideally the idr is empty and we won't hit the BUG in the callback */
 	idr_for_each(&group->inotify_data.idr, idr_callback, group);
 	idr_destroy(&group->inotify_data.idr);
-	if (group->inotify_data.user) {
-		atomic_dec(&group->inotify_data.user->inotify_devs);
-		free_uid(group->inotify_data.user);
-	}
+	if (group->inotify_data.ucounts)
+		dec_inotify_instances(group->inotify_data.ucounts);
 }
 
 static void inotify_free_event(struct fsnotify_event *fsn_event)
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 69d1ea3d292a..1cf41c623be1 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -44,10 +44,8 @@
 
 #include <asm/ioctls.h>
 
-/* these are configurable via /proc/sys/fs/inotify/ */
-static int inotify_max_user_instances __read_mostly;
+/* configurable via /proc/sys/fs/inotify/ */
 static int inotify_max_queued_events __read_mostly;
-static int inotify_max_user_watches __read_mostly;
 
 static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
 
@@ -60,7 +58,7 @@ static int zero;
 struct ctl_table inotify_table[] = {
 	{
 		.procname	= "max_user_instances",
-		.data		= &inotify_max_user_instances,
+		.data		= &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES],
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
@@ -68,7 +66,7 @@ struct ctl_table inotify_table[] = {
 	},
 	{
 		.procname	= "max_user_watches",
-		.data		= &inotify_max_user_watches,
+		.data		= &init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES],
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
@@ -500,7 +498,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
 	/* remove this mark from the idr */
 	inotify_remove_from_idr(group, i_mark);
 
-	atomic_dec(&group->inotify_data.user->inotify_watches);
+	dec_inotify_watches(group->inotify_data.ucounts);
 }
 
 /* ding dong the mark is dead */
@@ -584,14 +582,17 @@ static int inotify_new_watch(struct fsnotify_group *group,
 	tmp_i_mark->fsn_mark.mask = mask;
 	tmp_i_mark->wd = -1;
 
-	ret = -ENOSPC;
-	if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
-		goto out_err;
-
 	ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark);
 	if (ret)
 		goto out_err;
 
+	/* increment the number of watches the user has */
+	if (!inc_inotify_watches(group->inotify_data.ucounts)) {
+		inotify_remove_from_idr(group, tmp_i_mark);
+		ret = -ENOSPC;
+		goto out_err;
+	}
+
 	/* we are on the idr, now get on the inode */
 	ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode,
 				       NULL, 0);
@@ -601,8 +602,6 @@ static int inotify_new_watch(struct fsnotify_group *group,
 		goto out_err;
 	}
 
-	/* increment the number of watches the user has */
-	atomic_inc(&group->inotify_data.user->inotify_watches);
 
 	/* return the watch descriptor for this new mark */
 	ret = tmp_i_mark->wd;
@@ -653,10 +652,11 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
 
 	spin_lock_init(&group->inotify_data.idr_lock);
 	idr_init(&group->inotify_data.idr);
-	group->inotify_data.user = get_current_user();
+	group->inotify_data.ucounts = inc_ucount(current_user_ns(),
+						 current_euid(),
+						 UCOUNT_INOTIFY_INSTANCES);
 
-	if (atomic_inc_return(&group->inotify_data.user->inotify_devs) >
-	    inotify_max_user_instances) {
+	if (!group->inotify_data.ucounts) {
 		fsnotify_destroy_group(group);
 		return ERR_PTR(-EMFILE);
 	}
@@ -819,8 +819,8 @@ static int __init inotify_user_setup(void)
 	inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
 
 	inotify_max_queued_events = 16384;
-	inotify_max_user_instances = 128;
-	inotify_max_user_watches = 8192;
+	init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128;
+	init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = 8192;
 
 	return 0;
 }
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 0cf34d6cc253..c8f2738113f4 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -16,6 +16,7 @@
 #include <linux/spinlock.h>
 #include <linux/types.h>
 #include <linux/atomic.h>
+#include <linux/user_namespace.h>
 
 /*
  * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily
@@ -170,7 +171,7 @@ struct fsnotify_group {
 		struct inotify_group_private_data {
 			spinlock_t	idr_lock;
 			struct idr      idr;
-			struct user_struct      *user;
+			struct ucounts *ucounts;
 		} inotify_data;
 #endif
 #ifdef CONFIG_FANOTIFY
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4d1905245c7a..d2334229167f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -868,10 +868,6 @@ struct user_struct {
 	atomic_t __count;	/* reference count */
 	atomic_t processes;	/* How many processes does this user have? */
 	atomic_t sigpending;	/* How many pending signals does this user have? */
-#ifdef CONFIG_INOTIFY_USER
-	atomic_t inotify_watches; /* How many inotify watches does this user have? */
-	atomic_t inotify_devs;	/* How many inotify devs does this user have opened? */
-#endif
 #ifdef CONFIG_FANOTIFY
 	atomic_t fanotify_listeners;
 #endif
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index eb209d4523f5..363e0e8082a9 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -32,6 +32,10 @@ enum ucount_type {
 	UCOUNT_NET_NAMESPACES,
 	UCOUNT_MNT_NAMESPACES,
 	UCOUNT_CGROUP_NAMESPACES,
+#ifdef CONFIG_INOTIFY_USER
+	UCOUNT_INOTIFY_INSTANCES,
+	UCOUNT_INOTIFY_WATCHES,
+#endif
 	UCOUNT_COUNTS,
 };
 
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 4bbd38ec3788..68716403b261 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -57,7 +57,7 @@ static struct ctl_table_root set_root = {
 
 static int zero = 0;
 static int int_max = INT_MAX;
-#define UCOUNT_ENTRY(name) 				\
+#define UCOUNT_ENTRY(name)				\
 	{						\
 		.procname	= name,			\
 		.maxlen		= sizeof(int),		\
@@ -74,6 +74,10 @@ static struct ctl_table user_table[] = {
 	UCOUNT_ENTRY("max_net_namespaces"),
 	UCOUNT_ENTRY("max_mnt_namespaces"),
 	UCOUNT_ENTRY("max_cgroup_namespaces"),
+#ifdef CONFIG_INOTIFY_USER
+	UCOUNT_ENTRY("max_inotify_instances"),
+	UCOUNT_ENTRY("max_inotify_watches"),
+#endif
 	{ }
 };
 #endif /* CONFIG_SYSCTL */
-- 
cgit v1.2.3


From 9227dd2a84a765fcfef1677ff17de0958b192eda Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 23 Jan 2017 17:26:31 +1300
Subject: exec: Remove LSM_UNSAFE_PTRACE_CAP

With previous changes every location that tests for
LSM_UNSAFE_PTRACE_CAP also tests for LSM_UNSAFE_PTRACE making the
LSM_UNSAFE_PTRACE_CAP redundant, so remove it.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/exec.c                  | 8 ++------
 include/linux/security.h   | 3 +--
 security/apparmor/domain.c | 2 +-
 security/commoncap.c       | 2 +-
 security/selinux/hooks.c   | 3 +--
 security/smack/smack_lsm.c | 2 +-
 6 files changed, 7 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index e57946610733..c195ebb8e2aa 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1426,12 +1426,8 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
 	struct task_struct *p = current, *t;
 	unsigned n_fs;
 
-	if (p->ptrace) {
-		if (ptracer_capable(p, current_user_ns()))
-			bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP;
-		else
-			bprm->unsafe |= LSM_UNSAFE_PTRACE;
-	}
+	if (p->ptrace)
+		bprm->unsafe |= LSM_UNSAFE_PTRACE;
 
 	/*
 	 * This isn't strictly necessary, but it makes it harder for LSMs to
diff --git a/include/linux/security.h b/include/linux/security.h
index c2125e9093e8..9d9ee90f1f35 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -140,8 +140,7 @@ struct request_sock;
 /* bprm->unsafe reasons */
 #define LSM_UNSAFE_SHARE	1
 #define LSM_UNSAFE_PTRACE	2
-#define LSM_UNSAFE_PTRACE_CAP	4
-#define LSM_UNSAFE_NO_NEW_PRIVS	8
+#define LSM_UNSAFE_NO_NEW_PRIVS	4
 
 #ifdef CONFIG_MMU
 extern int mmap_min_addr_handler(struct ctl_table *table, int write,
diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c
index a4d90aa1045a..04185b7fd38a 100644
--- a/security/apparmor/domain.c
+++ b/security/apparmor/domain.c
@@ -469,7 +469,7 @@ int apparmor_bprm_set_creds(struct linux_binprm *bprm)
 		;
 	}
 
-	if (bprm->unsafe & (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) {
+	if (bprm->unsafe & LSM_UNSAFE_PTRACE) {
 		error = may_change_ptraced_domain(new_profile);
 		if (error)
 			goto audit;
diff --git a/security/commoncap.c b/security/commoncap.c
index cbb203c91406..8ec6b7fe909e 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -548,7 +548,7 @@ skip:
 
 	if ((is_setid ||
 	     !cap_issubset(new->cap_permitted, old->cap_permitted)) &&
-	    ((bprm->unsafe & ~(LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) ||
+	    ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) ||
 	     !ptracer_capable(current, new->user_ns))) {
 		/* downgrade; they get no more than they had, and maybe less */
 		if (!ns_capable(new->user_ns, CAP_SETUID) ||
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index c7c6619431d5..cece6fe55f02 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2404,8 +2404,7 @@ static int selinux_bprm_set_creds(struct linux_binprm *bprm)
 
 		/* Make sure that anyone attempting to ptrace over a task that
 		 * changes its SID has the appropriate permit */
-		if (bprm->unsafe &
-		    (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) {
+		if (bprm->unsafe & LSM_UNSAFE_PTRACE) {
 			u32 ptsid = ptrace_parent_sid(current);
 			if (ptsid != 0) {
 				rc = avc_has_perm(ptsid, new_tsec->sid,
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 94dc9d406ce3..bc2ff09f1494 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -934,7 +934,7 @@ static int smack_bprm_set_creds(struct linux_binprm *bprm)
 	    isp->smk_task != sbsp->smk_root)
 		return 0;
 
-	if (bprm->unsafe & (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) {
+	if (bprm->unsafe & LSM_UNSAFE_PTRACE) {
 		struct task_struct *tracer;
 		rc = 0;
 
-- 
cgit v1.2.3


From 1331b62c97217459780e040e8a66bb609f2acd20 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 2 Jan 2017 16:01:57 +0100
Subject: rfkill: remove rfkill-regulator

There are no users of this ("vrfkill") in the tree, so it's just
dead code - remove it.

This also isn't really how rfkill is supposed to be used - it's
intended as a signalling mechanism to/from the device, which the
driver (and partially cfg80211) will handle - having a separate
rfkill instance for a regulator is confusing, the driver should
use the regulator instead to turn off the device when requested.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/rfkill-regulator.h |  48 ------------
 net/rfkill/Kconfig               |  11 ---
 net/rfkill/Makefile              |   1 -
 net/rfkill/rfkill-regulator.c    | 154 ---------------------------------------
 4 files changed, 214 deletions(-)
 delete mode 100644 include/linux/rfkill-regulator.h
 delete mode 100644 net/rfkill/rfkill-regulator.c

(limited to 'include/linux')

diff --git a/include/linux/rfkill-regulator.h b/include/linux/rfkill-regulator.h
deleted file mode 100644
index aca36bc83315..000000000000
--- a/include/linux/rfkill-regulator.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * rfkill-regulator.c - Regulator consumer driver for rfkill
- *
- * Copyright (C) 2009  Guiming Zhuo <gmzhuo@gmail.com>
- * Copyright (C) 2011  Antonio Ospite <ospite@studenti.unina.it>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-
-#ifndef __LINUX_RFKILL_REGULATOR_H
-#define __LINUX_RFKILL_REGULATOR_H
-
-/*
- * Use "vrfkill" as supply id when declaring the regulator consumer:
- *
- * static struct regulator_consumer_supply pcap_regulator_V6_consumers [] = {
- * 	{ .dev_name = "rfkill-regulator.0", .supply = "vrfkill" },
- * };
- *
- * If you have several regulator driven rfkill, you can append a numerical id to
- * .dev_name as done above, and use the same id when declaring the platform
- * device:
- *
- * static struct rfkill_regulator_platform_data ezx_rfkill_bt_data = {
- * 	.name  = "ezx-bluetooth",
- * 	.type  = RFKILL_TYPE_BLUETOOTH,
- * };
- *
- * static struct platform_device a910_rfkill = {
- * 	.name  = "rfkill-regulator",
- * 	.id    = 0,
- * 	.dev   = {
- * 		.platform_data = &ezx_rfkill_bt_data,
- * 	},
- * };
- */
-
-#include <linux/rfkill.h>
-
-struct rfkill_regulator_platform_data {
-	char *name;             /* the name for the rfkill switch */
-	enum rfkill_type type;  /* the type as specified in rfkill.h */
-};
-
-#endif /* __LINUX_RFKILL_REGULATOR_H */
diff --git a/net/rfkill/Kconfig b/net/rfkill/Kconfig
index 868f1ad0415a..060600b03fad 100644
--- a/net/rfkill/Kconfig
+++ b/net/rfkill/Kconfig
@@ -23,17 +23,6 @@ config RFKILL_INPUT
 	depends on INPUT = y || RFKILL = INPUT
 	default y if !EXPERT
 
-config RFKILL_REGULATOR
-	tristate "Generic rfkill regulator driver"
-	depends on RFKILL || !RFKILL
-	depends on REGULATOR
-	help
-          This options enable controlling radio transmitters connected to
-          voltage regulator using the regulator framework.
-
-          To compile this driver as a module, choose M here: the module will
-          be called rfkill-regulator.
-
 config RFKILL_GPIO
 	tristate "GPIO RFKILL driver"
 	depends on RFKILL
diff --git a/net/rfkill/Makefile b/net/rfkill/Makefile
index 311768783f4a..87a80aded0b3 100644
--- a/net/rfkill/Makefile
+++ b/net/rfkill/Makefile
@@ -5,5 +5,4 @@
 rfkill-y			+= core.o
 rfkill-$(CONFIG_RFKILL_INPUT)	+= input.o
 obj-$(CONFIG_RFKILL)		+= rfkill.o
-obj-$(CONFIG_RFKILL_REGULATOR)	+= rfkill-regulator.o
 obj-$(CONFIG_RFKILL_GPIO)	+= rfkill-gpio.o
diff --git a/net/rfkill/rfkill-regulator.c b/net/rfkill/rfkill-regulator.c
deleted file mode 100644
index 50cd26a48e87..000000000000
--- a/net/rfkill/rfkill-regulator.c
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * rfkill-regulator.c - Regulator consumer driver for rfkill
- *
- * Copyright (C) 2009  Guiming Zhuo <gmzhuo@gmail.com>
- * Copyright (C) 2011  Antonio Ospite <ospite@studenti.unina.it>
- *
- * Implementation inspired by leds-regulator driver.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-
-#include <linux/module.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-#include <linux/platform_device.h>
-#include <linux/regulator/consumer.h>
-#include <linux/rfkill.h>
-#include <linux/rfkill-regulator.h>
-
-struct rfkill_regulator_data {
-	struct rfkill *rf_kill;
-	bool reg_enabled;
-
-	struct regulator *vcc;
-};
-
-static int rfkill_regulator_set_block(void *data, bool blocked)
-{
-	struct rfkill_regulator_data *rfkill_data = data;
-	int ret = 0;
-
-	pr_debug("%s: blocked: %d\n", __func__, blocked);
-
-	if (blocked) {
-		if (rfkill_data->reg_enabled) {
-			regulator_disable(rfkill_data->vcc);
-			rfkill_data->reg_enabled = false;
-		}
-	} else {
-		if (!rfkill_data->reg_enabled) {
-			ret = regulator_enable(rfkill_data->vcc);
-			if (!ret)
-				rfkill_data->reg_enabled = true;
-		}
-	}
-
-	pr_debug("%s: regulator_is_enabled after set_block: %d\n", __func__,
-		regulator_is_enabled(rfkill_data->vcc));
-
-	return ret;
-}
-
-static struct rfkill_ops rfkill_regulator_ops = {
-	.set_block = rfkill_regulator_set_block,
-};
-
-static int rfkill_regulator_probe(struct platform_device *pdev)
-{
-	struct rfkill_regulator_platform_data *pdata = pdev->dev.platform_data;
-	struct rfkill_regulator_data *rfkill_data;
-	struct regulator *vcc;
-	struct rfkill *rf_kill;
-	int ret = 0;
-
-	if (pdata == NULL) {
-		dev_err(&pdev->dev, "no platform data\n");
-		return -ENODEV;
-	}
-
-	if (pdata->name == NULL || pdata->type == 0) {
-		dev_err(&pdev->dev, "invalid name or type in platform data\n");
-		return -EINVAL;
-	}
-
-	vcc = regulator_get_exclusive(&pdev->dev, "vrfkill");
-	if (IS_ERR(vcc)) {
-		dev_err(&pdev->dev, "Cannot get vcc for %s\n", pdata->name);
-		ret = PTR_ERR(vcc);
-		goto out;
-	}
-
-	rfkill_data = kzalloc(sizeof(*rfkill_data), GFP_KERNEL);
-	if (rfkill_data == NULL) {
-		ret = -ENOMEM;
-		goto err_data_alloc;
-	}
-
-	rf_kill = rfkill_alloc(pdata->name, &pdev->dev,
-				pdata->type,
-				&rfkill_regulator_ops, rfkill_data);
-	if (rf_kill == NULL) {
-		ret = -ENOMEM;
-		goto err_rfkill_alloc;
-	}
-
-	if (regulator_is_enabled(vcc)) {
-		dev_dbg(&pdev->dev, "Regulator already enabled\n");
-		rfkill_data->reg_enabled = true;
-	}
-	rfkill_data->vcc = vcc;
-	rfkill_data->rf_kill = rf_kill;
-
-	ret = rfkill_register(rf_kill);
-	if (ret) {
-		dev_err(&pdev->dev, "Cannot register rfkill device\n");
-		goto err_rfkill_register;
-	}
-
-	platform_set_drvdata(pdev, rfkill_data);
-	dev_info(&pdev->dev, "%s initialized\n", pdata->name);
-
-	return 0;
-
-err_rfkill_register:
-	rfkill_destroy(rf_kill);
-err_rfkill_alloc:
-	kfree(rfkill_data);
-err_data_alloc:
-	regulator_put(vcc);
-out:
-	return ret;
-}
-
-static int rfkill_regulator_remove(struct platform_device *pdev)
-{
-	struct rfkill_regulator_data *rfkill_data = platform_get_drvdata(pdev);
-	struct rfkill *rf_kill = rfkill_data->rf_kill;
-
-	rfkill_unregister(rf_kill);
-	rfkill_destroy(rf_kill);
-	regulator_put(rfkill_data->vcc);
-	kfree(rfkill_data);
-
-	return 0;
-}
-
-static struct platform_driver rfkill_regulator_driver = {
-	.probe = rfkill_regulator_probe,
-	.remove = rfkill_regulator_remove,
-	.driver = {
-		.name = "rfkill-regulator",
-	},
-};
-
-module_platform_driver(rfkill_regulator_driver);
-
-MODULE_AUTHOR("Guiming Zhuo <gmzhuo@gmail.com>");
-MODULE_AUTHOR("Antonio Ospite <ospite@studenti.unina.it>");
-MODULE_DESCRIPTION("Regulator consumer driver for rfkill");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:rfkill-regulator");
-- 
cgit v1.2.3


From 12a6075cabc0d9ffbc0366b44daa22f278606312 Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp <socketcan@hartkopp.net>
Date: Tue, 10 Jan 2017 18:52:06 +0100
Subject: can: dev: add CAN interface termination API

This patch adds a netlink interface to configure the CAN bus termination of
CAN interfaces.

Inside the driver an array of supported termination values is defined:

const u16 drvname_termination[] = { 60, 120, CAN_TERMINATION_DISABLED };

struct drvname_priv *priv;
priv = netdev_priv(dev);

priv->termination_const = drvname_termination;
priv->termination_const_cnt = ARRAY_SIZE(drvname_termination);
priv->termination = CAN_TERMINATION_DISABLED;

And the funtion to set the value has to be defined:

priv->do_set_termination = drvname_set_termination;

Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Reviewed-by: Ramesh Shanmugasundaram <Ramesh.shanmugasundaram@bp.renesas.com>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev.c            | 49 +++++++++++++++++++++++++++++++++++++++-
 include/linux/can/dev.h          |  4 ++++
 include/uapi/linux/can/netlink.h |  5 ++++
 3 files changed, 57 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c
index 8d6208c0b400..fefe2cd17721 100644
--- a/drivers/net/can/dev.c
+++ b/drivers/net/can/dev.c
@@ -958,6 +958,30 @@ static int can_changelink(struct net_device *dev,
 		}
 	}
 
+	if (data[IFLA_CAN_TERMINATION]) {
+		const u16 termval = nla_get_u16(data[IFLA_CAN_TERMINATION]);
+		const unsigned int num_term = priv->termination_const_cnt;
+		unsigned int i;
+
+		if (!priv->do_set_termination)
+			return -EOPNOTSUPP;
+
+		/* check whether given value is supported by the interface */
+		for (i = 0; i < num_term; i++) {
+			if (termval == priv->termination_const[i])
+				break;
+		}
+		if (i >= num_term)
+			return -EINVAL;
+
+		/* Finally, set the termination value */
+		err = priv->do_set_termination(dev, termval);
+		if (err)
+			return err;
+
+		priv->termination = termval;
+	}
+
 	return 0;
 }
 
@@ -980,6 +1004,11 @@ static size_t can_get_size(const struct net_device *dev)
 		size += nla_total_size(sizeof(struct can_bittiming));
 	if (priv->data_bittiming_const)				/* IFLA_CAN_DATA_BITTIMING_CONST */
 		size += nla_total_size(sizeof(struct can_bittiming_const));
+	if (priv->termination_const) {
+		size += nla_total_size(sizeof(priv->termination));		/* IFLA_CAN_TERMINATION */
+		size += nla_total_size(sizeof(*priv->termination_const) *	/* IFLA_CAN_TERMINATION_CONST */
+				       priv->termination_const_cnt);
+	}
 
 	return size;
 }
@@ -1018,7 +1047,15 @@ static int can_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	    (priv->data_bittiming_const &&
 	     nla_put(skb, IFLA_CAN_DATA_BITTIMING_CONST,
 		     sizeof(*priv->data_bittiming_const),
-		     priv->data_bittiming_const)))
+		     priv->data_bittiming_const)) ||
+
+	    (priv->termination_const &&
+	     (nla_put_u16(skb, IFLA_CAN_TERMINATION, priv->termination) ||
+	      nla_put(skb, IFLA_CAN_TERMINATION_CONST,
+		      sizeof(*priv->termination_const) *
+		      priv->termination_const_cnt,
+		      priv->termination_const))))
+
 		return -EMSGSIZE;
 
 	return 0;
@@ -1073,6 +1110,16 @@ static struct rtnl_link_ops can_link_ops __read_mostly = {
  */
 int register_candev(struct net_device *dev)
 {
+	struct can_priv *priv = netdev_priv(dev);
+
+	/* Ensure termination_const, termination_const_cnt and
+	 * do_set_termination consistency. All must be either set or
+	 * unset.
+	 */
+	if ((!priv->termination_const != !priv->termination_const_cnt) ||
+	    (!priv->termination_const != !priv->do_set_termination))
+		return -EINVAL;
+
 	dev->rtnl_link_ops = &can_link_ops;
 	return register_netdev(dev);
 }
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 5f5270941ba0..f6a57f322f00 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -38,6 +38,9 @@ struct can_priv {
 	struct can_bittiming bittiming, data_bittiming;
 	const struct can_bittiming_const *bittiming_const,
 		*data_bittiming_const;
+	const u16 *termination_const;
+	unsigned int termination_const_cnt;
+	u16 termination;
 	struct can_clock clock;
 
 	enum can_state state;
@@ -53,6 +56,7 @@ struct can_priv {
 	int (*do_set_bittiming)(struct net_device *dev);
 	int (*do_set_data_bittiming)(struct net_device *dev);
 	int (*do_set_mode)(struct net_device *dev, enum can_mode mode);
+	int (*do_set_termination)(struct net_device *dev, u16 term);
 	int (*do_get_state)(const struct net_device *dev,
 			    enum can_state *state);
 	int (*do_get_berr_counter)(const struct net_device *dev,
diff --git a/include/uapi/linux/can/netlink.h b/include/uapi/linux/can/netlink.h
index 94ffe0c83ce7..7414771926fb 100644
--- a/include/uapi/linux/can/netlink.h
+++ b/include/uapi/linux/can/netlink.h
@@ -127,9 +127,14 @@ enum {
 	IFLA_CAN_BERR_COUNTER,
 	IFLA_CAN_DATA_BITTIMING,
 	IFLA_CAN_DATA_BITTIMING_CONST,
+	IFLA_CAN_TERMINATION,
+	IFLA_CAN_TERMINATION_CONST,
 	__IFLA_CAN_MAX
 };
 
 #define IFLA_CAN_MAX	(__IFLA_CAN_MAX - 1)
 
+/* u16 termination range: 1..65535 Ohms */
+#define CAN_TERMINATION_DISABLED 0
+
 #endif /* !_UAPI_CAN_NETLINK_H */
-- 
cgit v1.2.3


From 431af779256cd6cb8328ac23c5696bae63c33a51 Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Wed, 11 Jan 2017 17:05:35 +0100
Subject: can: dev: add CAN interface API for fixed bitrates

Some CAN interfaces only support fixed fixed bitrates. This patch adds a
netlink interface to get the list of the CAN interface's fixed bitrates and
data bitrates.

Inside the driver arrays of supported data- bitrate values are defined.

const u32 drvname_bitrate[] = { 20000, 50000, 100000 };
const u32 drvname_data_bitrate[] = { 200000, 500000, 1000000 };

struct drvname_priv *priv;
priv = netdev_priv(dev);

priv->bitrate_const = drvname_bitrate;
priv->bitrate_const_cnt = ARRAY_SIZE(drvname_bitrate);
priv->data_bitrate_const = drvname_data_bitrate;
priv->data_bitrate_const_cnt = ARRAY_SIZE(drvname_data_bitrate);

Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev.c            | 81 ++++++++++++++++++++++++++++++++--------
 include/linux/can/dev.h          |  4 ++
 include/uapi/linux/can/netlink.h |  2 +
 3 files changed, 71 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c
index afcf487382c9..611d16a7061d 100644
--- a/drivers/net/can/dev.c
+++ b/drivers/net/can/dev.c
@@ -279,8 +279,29 @@ static int can_fixup_bittiming(struct net_device *dev, struct can_bittiming *bt,
 	return 0;
 }
 
+/* Checks the validity of predefined bitrate settings */
+static int can_validate_bitrate(struct net_device *dev, struct can_bittiming *bt,
+				const u32 *bitrate_const,
+				const unsigned int bitrate_const_cnt)
+{
+	struct can_priv *priv = netdev_priv(dev);
+	unsigned int i;
+
+	for (i = 0; i < bitrate_const_cnt; i++) {
+		if (bt->bitrate == bitrate_const[i])
+			break;
+	}
+
+	if (i >= priv->bitrate_const_cnt)
+		return -EINVAL;
+
+	return 0;
+}
+
 static int can_get_bittiming(struct net_device *dev, struct can_bittiming *bt,
-			     const struct can_bittiming_const *btc)
+			     const struct can_bittiming_const *btc,
+			     const u32 *bitrate_const,
+			     const unsigned int bitrate_const_cnt)
 {
 	int err;
 
@@ -290,10 +311,13 @@ static int can_get_bittiming(struct net_device *dev, struct can_bittiming *bt,
 	 * alternatively the CAN timing parameters (tq, prop_seg, etc.) are
 	 * provided directly which are then checked and fixed up.
 	 */
-	if (!bt->tq && bt->bitrate)
+	if (!bt->tq && bt->bitrate && btc)
 		err = can_calc_bittiming(dev, bt, btc);
-	else if (bt->tq && !bt->bitrate)
+	else if (bt->tq && !bt->bitrate && btc)
 		err = can_fixup_bittiming(dev, bt, btc);
+	else if (!bt->tq && bt->bitrate && bitrate_const)
+		err = can_validate_bitrate(dev, bt, bitrate_const,
+					   bitrate_const_cnt);
 	else
 		err = -EINVAL;
 
@@ -878,12 +902,12 @@ static int can_changelink(struct net_device *dev,
 			return -EOPNOTSUPP;
 
 		memcpy(&bt, nla_data(data[IFLA_CAN_BITTIMING]), sizeof(bt));
-		if (priv->bittiming_const) {
-			err = can_get_bittiming(dev, &bt,
-						priv->bittiming_const);
-			if (err)
-				return err;
-		}
+		err = can_get_bittiming(dev, &bt,
+					priv->bittiming_const,
+					priv->bitrate_const,
+					priv->bitrate_const_cnt);
+		if (err)
+			return err;
 		memcpy(&priv->bittiming, &bt, sizeof(bt));
 
 		if (priv->do_set_bittiming) {
@@ -962,12 +986,12 @@ static int can_changelink(struct net_device *dev,
 
 		memcpy(&dbt, nla_data(data[IFLA_CAN_DATA_BITTIMING]),
 		       sizeof(dbt));
-		if (priv->data_bittiming_const) {
-			err = can_get_bittiming(dev, &dbt,
-						priv->data_bittiming_const);
-			if (err)
-				return err;
-		}
+		err = can_get_bittiming(dev, &dbt,
+					priv->data_bittiming_const,
+					priv->data_bitrate_const,
+					priv->data_bitrate_const_cnt);
+		if (err)
+			return err;
 		memcpy(&priv->data_bittiming, &dbt, sizeof(dbt));
 
 		if (priv->do_set_data_bittiming) {
@@ -1029,6 +1053,12 @@ static size_t can_get_size(const struct net_device *dev)
 		size += nla_total_size(sizeof(*priv->termination_const) *	/* IFLA_CAN_TERMINATION_CONST */
 				       priv->termination_const_cnt);
 	}
+	if (priv->bitrate_const)				/* IFLA_CAN_BITRATE_CONST */
+		size += nla_total_size(sizeof(*priv->bitrate_const) *
+				       priv->bitrate_const_cnt);
+	if (priv->data_bitrate_const)				/* IFLA_CAN_DATA_BITRATE_CONST */
+		size += nla_total_size(sizeof(*priv->data_bitrate_const) *
+				       priv->data_bitrate_const_cnt);
 
 	return size;
 }
@@ -1074,7 +1104,20 @@ static int can_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	      nla_put(skb, IFLA_CAN_TERMINATION_CONST,
 		      sizeof(*priv->termination_const) *
 		      priv->termination_const_cnt,
-		      priv->termination_const))))
+		      priv->termination_const))) ||
+
+	    (priv->bitrate_const &&
+	     nla_put(skb, IFLA_CAN_BITRATE_CONST,
+		     sizeof(*priv->bitrate_const) *
+		     priv->bitrate_const_cnt,
+		     priv->bitrate_const)) ||
+
+	    (priv->data_bitrate_const &&
+	     nla_put(skb, IFLA_CAN_DATA_BITRATE_CONST,
+		     sizeof(*priv->data_bitrate_const) *
+		     priv->data_bitrate_const_cnt,
+		     priv->data_bitrate_const))
+	    )
 
 		return -EMSGSIZE;
 
@@ -1140,6 +1183,12 @@ int register_candev(struct net_device *dev)
 	    (!priv->termination_const != !priv->do_set_termination))
 		return -EINVAL;
 
+	if (!priv->bitrate_const != !priv->bitrate_const_cnt)
+		return -EINVAL;
+
+	if (!priv->data_bitrate_const != !priv->data_bitrate_const_cnt)
+		return -EINVAL;
+
 	dev->rtnl_link_ops = &can_link_ops;
 	return register_netdev(dev);
 }
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index f6a57f322f00..141b05aade81 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -41,6 +41,10 @@ struct can_priv {
 	const u16 *termination_const;
 	unsigned int termination_const_cnt;
 	u16 termination;
+	const u32 *bitrate_const;
+	unsigned int bitrate_const_cnt;
+	const u32 *data_bitrate_const;
+	unsigned int data_bitrate_const_cnt;
 	struct can_clock clock;
 
 	enum can_state state;
diff --git a/include/uapi/linux/can/netlink.h b/include/uapi/linux/can/netlink.h
index 7414771926fb..fdf75f74fdaf 100644
--- a/include/uapi/linux/can/netlink.h
+++ b/include/uapi/linux/can/netlink.h
@@ -129,6 +129,8 @@ enum {
 	IFLA_CAN_DATA_BITTIMING_CONST,
 	IFLA_CAN_TERMINATION,
 	IFLA_CAN_TERMINATION_CONST,
+	IFLA_CAN_BITRATE_CONST,
+	IFLA_CAN_DATA_BITRATE_CONST,
 	__IFLA_CAN_MAX
 };
 
-- 
cgit v1.2.3


From 5299709d0a87342dadc1fc9850484fadeb488bf8 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Fri, 20 Jan 2017 13:04:01 -0800
Subject: treewide: Constify most dma_map_ops structures

Most dma_map_ops structures are never modified. Constify these
structures such that these can be write-protected. This patch
has been generated as follows:

git grep -l 'struct dma_map_ops' |
  xargs -d\\n sed -i \
    -e 's/struct dma_map_ops/const struct dma_map_ops/g' \
    -e 's/const struct dma_map_ops {/struct dma_map_ops {/g' \
    -e 's/^const struct dma_map_ops;$/struct dma_map_ops;/' \
    -e 's/const const struct dma_map_ops /const struct dma_map_ops /g';
sed -i -e 's/const \(struct dma_map_ops intel_dma_ops\)/\1/' \
  $(git grep -l 'struct dma_map_ops intel_dma_ops');
sed -i -e 's/const \(struct dma_map_ops dma_iommu_ops\)/\1/' \
  $(git grep -l 'struct dma_map_ops' | grep ^arch/powerpc);
sed -i -e '/^struct vmd_dev {$/,/^};$/ s/const \(struct dma_map_ops[[:blank:]]dma_ops;\)/\1/' \
       -e '/^static void vmd_setup_dma_ops/,/^}$/ s/const \(struct dma_map_ops \*dest\)/\1/' \
       -e 's/const \(struct dma_map_ops \*dest = \&vmd->dma_ops\)/\1/' \
    drivers/pci/host/*.c
sed -i -e '/^void __init pci_iommu_alloc(void)$/,/^}$/ s/dma_ops->/intel_dma_ops./' arch/ia64/kernel/pci-dma.c
sed -i -e 's/static const struct dma_map_ops sn_dma_ops/static struct dma_map_ops sn_dma_ops/' arch/ia64/sn/pci/pci_dma.c
sed -i -e 's/(const struct dma_map_ops \*)//' drivers/misc/mic/bus/vop_bus.c

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Juergen Gross <jgross@suse.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: linux-arch@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: Russell King <linux@armlinux.org.uk>
Cc: x86@kernel.org
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 arch/alpha/include/asm/dma-mapping.h               |  4 +--
 arch/alpha/kernel/pci-noop.c                       |  4 +--
 arch/alpha/kernel/pci_iommu.c                      |  4 +--
 arch/arc/include/asm/dma-mapping.h                 |  4 +--
 arch/arc/mm/dma.c                                  |  2 +-
 arch/arm/common/dmabounce.c                        |  2 +-
 arch/arm/include/asm/device.h                      |  2 +-
 arch/arm/include/asm/dma-mapping.h                 | 10 +++---
 arch/arm/mm/dma-mapping.c                          | 22 ++++++------
 arch/arm/xen/mm.c                                  |  4 +--
 arch/arm64/include/asm/device.h                    |  2 +-
 arch/arm64/include/asm/dma-mapping.h               |  6 ++--
 arch/arm64/mm/dma-mapping.c                        |  6 ++--
 arch/avr32/include/asm/dma-mapping.h               |  4 +--
 arch/avr32/mm/dma-coherent.c                       |  2 +-
 arch/blackfin/include/asm/dma-mapping.h            |  4 +--
 arch/blackfin/kernel/dma-mapping.c                 |  2 +-
 arch/c6x/include/asm/dma-mapping.h                 |  4 +--
 arch/c6x/kernel/dma.c                              |  2 +-
 arch/cris/arch-v32/drivers/pci/dma.c               |  2 +-
 arch/cris/include/asm/dma-mapping.h                |  6 ++--
 arch/frv/include/asm/dma-mapping.h                 |  4 +--
 arch/frv/mb93090-mb00/pci-dma-nommu.c              |  2 +-
 arch/frv/mb93090-mb00/pci-dma.c                    |  2 +-
 arch/h8300/include/asm/dma-mapping.h               |  4 +--
 arch/h8300/kernel/dma.c                            |  2 +-
 arch/hexagon/include/asm/dma-mapping.h             |  4 +--
 arch/hexagon/kernel/dma.c                          |  4 +--
 arch/ia64/hp/common/hwsw_iommu.c                   |  4 +--
 arch/ia64/hp/common/sba_iommu.c                    |  4 +--
 arch/ia64/include/asm/dma-mapping.h                |  2 +-
 arch/ia64/include/asm/machvec.h                    |  4 +--
 arch/ia64/kernel/dma-mapping.c                     |  4 +--
 arch/ia64/kernel/pci-dma.c                         | 10 +++---
 arch/ia64/kernel/pci-swiotlb.c                     |  2 +-
 arch/m32r/include/asm/device.h                     |  2 +-
 arch/m32r/include/asm/dma-mapping.h                |  2 +-
 arch/m68k/include/asm/dma-mapping.h                |  4 +--
 arch/m68k/kernel/dma.c                             |  2 +-
 arch/metag/include/asm/dma-mapping.h               |  4 +--
 arch/metag/kernel/dma.c                            |  2 +-
 arch/microblaze/include/asm/dma-mapping.h          |  4 +--
 arch/microblaze/kernel/dma.c                       |  2 +-
 arch/mips/cavium-octeon/dma-octeon.c               |  4 +--
 arch/mips/include/asm/device.h                     |  2 +-
 arch/mips/include/asm/dma-mapping.h                |  4 +--
 .../include/asm/mach-cavium-octeon/dma-coherence.h |  2 +-
 arch/mips/include/asm/netlogic/common.h            |  2 +-
 arch/mips/loongson64/common/dma-swiotlb.c          |  2 +-
 arch/mips/mm/dma-default.c                         |  4 +--
 arch/mips/netlogic/common/nlm-dma.c                |  2 +-
 arch/mn10300/include/asm/dma-mapping.h             |  4 +--
 arch/mn10300/mm/dma-alloc.c                        |  2 +-
 arch/nios2/include/asm/dma-mapping.h               |  4 +--
 arch/nios2/mm/dma-mapping.c                        |  2 +-
 arch/openrisc/include/asm/dma-mapping.h            |  4 +--
 arch/openrisc/kernel/dma.c                         |  2 +-
 arch/parisc/include/asm/dma-mapping.h              |  8 ++---
 arch/parisc/kernel/drivers.c                       |  2 +-
 arch/parisc/kernel/pci-dma.c                       |  4 +--
 arch/powerpc/include/asm/device.h                  |  2 +-
 arch/powerpc/include/asm/dma-mapping.h             |  6 ++--
 arch/powerpc/include/asm/pci.h                     |  4 +--
 arch/powerpc/include/asm/swiotlb.h                 |  2 +-
 arch/powerpc/kernel/dma-swiotlb.c                  |  2 +-
 arch/powerpc/kernel/dma.c                          |  6 ++--
 arch/powerpc/kernel/pci-common.c                   |  6 ++--
 arch/powerpc/platforms/cell/iommu.c                |  4 +--
 arch/powerpc/platforms/powernv/npu-dma.c           |  2 +-
 arch/powerpc/platforms/ps3/system-bus.c            |  4 +--
 arch/powerpc/platforms/pseries/ibmebus.c           |  2 +-
 arch/powerpc/platforms/pseries/vio.c               |  2 +-
 arch/s390/include/asm/device.h                     |  2 +-
 arch/s390/include/asm/dma-mapping.h                |  4 +--
 arch/s390/pci/pci_dma.c                            |  2 +-
 arch/sh/include/asm/dma-mapping.h                  |  4 +--
 arch/sh/kernel/dma-nommu.c                         |  2 +-
 arch/sh/mm/consistent.c                            |  2 +-
 arch/sparc/include/asm/dma-mapping.h               |  8 ++---
 arch/sparc/kernel/iommu.c                          |  4 +--
 arch/sparc/kernel/ioport.c                         |  8 ++---
 arch/sparc/kernel/pci_sun4v.c                      |  2 +-
 arch/tile/include/asm/device.h                     |  2 +-
 arch/tile/include/asm/dma-mapping.h                | 12 +++----
 arch/tile/kernel/pci-dma.c                         | 24 ++++++-------
 arch/unicore32/include/asm/dma-mapping.h           |  4 +--
 arch/unicore32/mm/dma-swiotlb.c                    |  2 +-
 arch/x86/include/asm/device.h                      |  4 +--
 arch/x86/include/asm/dma-mapping.h                 |  4 +--
 arch/x86/include/asm/iommu.h                       |  2 +-
 arch/x86/kernel/amd_gart_64.c                      |  2 +-
 arch/x86/kernel/pci-calgary_64.c                   |  2 +-
 arch/x86/kernel/pci-dma.c                          |  4 +--
 arch/x86/kernel/pci-nommu.c                        |  2 +-
 arch/x86/kernel/pci-swiotlb.c                      |  2 +-
 arch/x86/pci/sta2x11-fixup.c                       |  2 +-
 arch/x86/xen/pci-swiotlb-xen.c                     |  2 +-
 arch/xtensa/include/asm/device.h                   |  2 +-
 arch/xtensa/include/asm/dma-mapping.h              |  4 +--
 arch/xtensa/kernel/pci-dma.c                       |  2 +-
 drivers/iommu/amd_iommu.c                          |  4 +--
 drivers/misc/mic/bus/mic_bus.c                     |  2 +-
 drivers/misc/mic/bus/scif_bus.c                    |  2 +-
 drivers/misc/mic/bus/scif_bus.h                    |  2 +-
 drivers/misc/mic/bus/vop_bus.c                     |  2 +-
 drivers/misc/mic/host/mic_boot.c                   |  4 +--
 drivers/parisc/ccio-dma.c                          |  2 +-
 drivers/parisc/sba_iommu.c                         |  2 +-
 drivers/pci/host/vmd.c                             |  2 +-
 include/linux/dma-mapping.h                        | 42 +++++++++++-----------
 include/linux/mic_bus.h                            |  2 +-
 include/xen/arm/hypervisor.h                       |  2 +-
 lib/dma-noop.c                                     |  2 +-
 113 files changed, 227 insertions(+), 227 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/dma-mapping.h b/arch/alpha/include/asm/dma-mapping.h
index c63b6ac19ee5..d3480562411d 100644
--- a/arch/alpha/include/asm/dma-mapping.h
+++ b/arch/alpha/include/asm/dma-mapping.h
@@ -1,9 +1,9 @@
 #ifndef _ALPHA_DMA_MAPPING_H
 #define _ALPHA_DMA_MAPPING_H
 
-extern struct dma_map_ops *dma_ops;
+extern const struct dma_map_ops *dma_ops;
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	return dma_ops;
 }
diff --git a/arch/alpha/kernel/pci-noop.c b/arch/alpha/kernel/pci-noop.c
index bb152e21e5ae..ffbdb3fb672f 100644
--- a/arch/alpha/kernel/pci-noop.c
+++ b/arch/alpha/kernel/pci-noop.c
@@ -128,7 +128,7 @@ static int alpha_noop_supported(struct device *dev, u64 mask)
 	return mask < 0x00ffffffUL ? 0 : 1;
 }
 
-struct dma_map_ops alpha_noop_ops = {
+const struct dma_map_ops alpha_noop_ops = {
 	.alloc			= alpha_noop_alloc_coherent,
 	.free			= dma_noop_free_coherent,
 	.map_page		= dma_noop_map_page,
@@ -137,5 +137,5 @@ struct dma_map_ops alpha_noop_ops = {
 	.dma_supported		= alpha_noop_supported,
 };
 
-struct dma_map_ops *dma_ops = &alpha_noop_ops;
+const struct dma_map_ops *dma_ops = &alpha_noop_ops;
 EXPORT_SYMBOL(dma_ops);
diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c
index 451fc9cdd323..7fd2329038a3 100644
--- a/arch/alpha/kernel/pci_iommu.c
+++ b/arch/alpha/kernel/pci_iommu.c
@@ -939,7 +939,7 @@ static int alpha_pci_mapping_error(struct device *dev, dma_addr_t dma_addr)
 	return dma_addr == 0;
 }
 
-struct dma_map_ops alpha_pci_ops = {
+const struct dma_map_ops alpha_pci_ops = {
 	.alloc			= alpha_pci_alloc_coherent,
 	.free			= alpha_pci_free_coherent,
 	.map_page		= alpha_pci_map_page,
@@ -950,5 +950,5 @@ struct dma_map_ops alpha_pci_ops = {
 	.dma_supported		= alpha_pci_supported,
 };
 
-struct dma_map_ops *dma_ops = &alpha_pci_ops;
+const struct dma_map_ops *dma_ops = &alpha_pci_ops;
 EXPORT_SYMBOL(dma_ops);
diff --git a/arch/arc/include/asm/dma-mapping.h b/arch/arc/include/asm/dma-mapping.h
index 266f11c9bd59..fdff3aa60052 100644
--- a/arch/arc/include/asm/dma-mapping.h
+++ b/arch/arc/include/asm/dma-mapping.h
@@ -18,9 +18,9 @@
 #include <plat/dma.h>
 #endif
 
-extern struct dma_map_ops arc_dma_ops;
+extern const struct dma_map_ops arc_dma_ops;
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	return &arc_dma_ops;
 }
diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c
index 08450a1a5b5f..2a07e6ecafbd 100644
--- a/arch/arc/mm/dma.c
+++ b/arch/arc/mm/dma.c
@@ -218,7 +218,7 @@ static int arc_dma_supported(struct device *dev, u64 dma_mask)
 	return dma_mask == DMA_BIT_MASK(32);
 }
 
-struct dma_map_ops arc_dma_ops = {
+const struct dma_map_ops arc_dma_ops = {
 	.alloc			= arc_dma_alloc,
 	.free			= arc_dma_free,
 	.mmap			= arc_dma_mmap,
diff --git a/arch/arm/common/dmabounce.c b/arch/arm/common/dmabounce.c
index 75055df1cda3..9b1b7be2ec0e 100644
--- a/arch/arm/common/dmabounce.c
+++ b/arch/arm/common/dmabounce.c
@@ -452,7 +452,7 @@ static int dmabounce_set_mask(struct device *dev, u64 dma_mask)
 	return arm_dma_ops.set_dma_mask(dev, dma_mask);
 }
 
-static struct dma_map_ops dmabounce_ops = {
+static const struct dma_map_ops dmabounce_ops = {
 	.alloc			= arm_dma_alloc,
 	.free			= arm_dma_free,
 	.mmap			= arm_dma_mmap,
diff --git a/arch/arm/include/asm/device.h b/arch/arm/include/asm/device.h
index 4111592f0130..d8a572f9c187 100644
--- a/arch/arm/include/asm/device.h
+++ b/arch/arm/include/asm/device.h
@@ -7,7 +7,7 @@
 #define ASMARM_DEVICE_H
 
 struct dev_archdata {
-	struct dma_map_ops	*dma_ops;
+	const struct dma_map_ops	*dma_ops;
 #ifdef CONFIG_DMABOUNCE
 	struct dmabounce_device_info *dmabounce;
 #endif
diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index bf02dbd9ccda..1aabd781306f 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -13,17 +13,17 @@
 #include <asm/xen/hypervisor.h>
 
 #define DMA_ERROR_CODE	(~(dma_addr_t)0x0)
-extern struct dma_map_ops arm_dma_ops;
-extern struct dma_map_ops arm_coherent_dma_ops;
+extern const struct dma_map_ops arm_dma_ops;
+extern const struct dma_map_ops arm_coherent_dma_ops;
 
-static inline struct dma_map_ops *__generic_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *__generic_dma_ops(struct device *dev)
 {
 	if (dev && dev->archdata.dma_ops)
 		return dev->archdata.dma_ops;
 	return &arm_dma_ops;
 }
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	if (xen_initial_domain())
 		return xen_dma_ops;
@@ -31,7 +31,7 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 		return __generic_dma_ops(dev);
 }
 
-static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops)
+static inline void set_dma_ops(struct device *dev, const struct dma_map_ops *ops)
 {
 	BUG_ON(!dev);
 	dev->archdata.dma_ops = ops;
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index ab7710002ba6..d26fe1a35687 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -180,7 +180,7 @@ static void arm_dma_sync_single_for_device(struct device *dev,
 	__dma_page_cpu_to_dev(page, offset, size, dir);
 }
 
-struct dma_map_ops arm_dma_ops = {
+const struct dma_map_ops arm_dma_ops = {
 	.alloc			= arm_dma_alloc,
 	.free			= arm_dma_free,
 	.mmap			= arm_dma_mmap,
@@ -204,7 +204,7 @@ static int arm_coherent_dma_mmap(struct device *dev, struct vm_area_struct *vma,
 		 void *cpu_addr, dma_addr_t dma_addr, size_t size,
 		 unsigned long attrs);
 
-struct dma_map_ops arm_coherent_dma_ops = {
+const struct dma_map_ops arm_coherent_dma_ops = {
 	.alloc			= arm_coherent_dma_alloc,
 	.free			= arm_coherent_dma_free,
 	.mmap			= arm_coherent_dma_mmap,
@@ -1067,7 +1067,7 @@ static void __dma_page_dev_to_cpu(struct page *page, unsigned long off,
 int arm_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
 		enum dma_data_direction dir, unsigned long attrs)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 	struct scatterlist *s;
 	int i, j;
 
@@ -1101,7 +1101,7 @@ int arm_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
 void arm_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
 		enum dma_data_direction dir, unsigned long attrs)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 	struct scatterlist *s;
 
 	int i;
@@ -1120,7 +1120,7 @@ void arm_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
 void arm_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 			int nents, enum dma_data_direction dir)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 	struct scatterlist *s;
 	int i;
 
@@ -1139,7 +1139,7 @@ void arm_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 void arm_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 			int nents, enum dma_data_direction dir)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 	struct scatterlist *s;
 	int i;
 
@@ -2099,7 +2099,7 @@ static void arm_iommu_sync_single_for_device(struct device *dev,
 	__dma_page_cpu_to_dev(page, offset, size, dir);
 }
 
-struct dma_map_ops iommu_ops = {
+const struct dma_map_ops iommu_ops = {
 	.alloc		= arm_iommu_alloc_attrs,
 	.free		= arm_iommu_free_attrs,
 	.mmap		= arm_iommu_mmap_attrs,
@@ -2119,7 +2119,7 @@ struct dma_map_ops iommu_ops = {
 	.unmap_resource		= arm_iommu_unmap_resource,
 };
 
-struct dma_map_ops iommu_coherent_ops = {
+const struct dma_map_ops iommu_coherent_ops = {
 	.alloc		= arm_coherent_iommu_alloc_attrs,
 	.free		= arm_coherent_iommu_free_attrs,
 	.mmap		= arm_coherent_iommu_mmap_attrs,
@@ -2319,7 +2319,7 @@ void arm_iommu_detach_device(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(arm_iommu_detach_device);
 
-static struct dma_map_ops *arm_get_iommu_dma_map_ops(bool coherent)
+static const struct dma_map_ops *arm_get_iommu_dma_map_ops(bool coherent)
 {
 	return coherent ? &iommu_coherent_ops : &iommu_ops;
 }
@@ -2374,7 +2374,7 @@ static void arm_teardown_iommu_dma_ops(struct device *dev) { }
 
 #endif	/* CONFIG_ARM_DMA_USE_IOMMU */
 
-static struct dma_map_ops *arm_get_dma_map_ops(bool coherent)
+static const struct dma_map_ops *arm_get_dma_map_ops(bool coherent)
 {
 	return coherent ? &arm_coherent_dma_ops : &arm_dma_ops;
 }
@@ -2382,7 +2382,7 @@ static struct dma_map_ops *arm_get_dma_map_ops(bool coherent)
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 			const struct iommu_ops *iommu, bool coherent)
 {
-	struct dma_map_ops *dma_ops;
+	const struct dma_map_ops *dma_ops;
 
 	dev->archdata.dma_coherent = coherent;
 	if (arm_setup_iommu_dma_ops(dev, dma_base, size, iommu))
diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c
index bd62d94f8ac5..ce18c91b50a1 100644
--- a/arch/arm/xen/mm.c
+++ b/arch/arm/xen/mm.c
@@ -182,10 +182,10 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
 }
 EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
 
-struct dma_map_ops *xen_dma_ops;
+const struct dma_map_ops *xen_dma_ops;
 EXPORT_SYMBOL(xen_dma_ops);
 
-static struct dma_map_ops xen_swiotlb_dma_ops = {
+static const struct dma_map_ops xen_swiotlb_dma_ops = {
 	.alloc = xen_swiotlb_alloc_coherent,
 	.free = xen_swiotlb_free_coherent,
 	.sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
diff --git a/arch/arm64/include/asm/device.h b/arch/arm64/include/asm/device.h
index 243ef256b8c9..00c678cc31e1 100644
--- a/arch/arm64/include/asm/device.h
+++ b/arch/arm64/include/asm/device.h
@@ -17,7 +17,7 @@
 #define __ASM_DEVICE_H
 
 struct dev_archdata {
-	struct dma_map_ops *dma_ops;
+	const struct dma_map_ops *dma_ops;
 #ifdef CONFIG_IOMMU_API
 	void *iommu;			/* private IOMMU data */
 #endif
diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h
index ccea82c2b089..1fedb43be712 100644
--- a/arch/arm64/include/asm/dma-mapping.h
+++ b/arch/arm64/include/asm/dma-mapping.h
@@ -25,9 +25,9 @@
 #include <asm/xen/hypervisor.h>
 
 #define DMA_ERROR_CODE	(~(dma_addr_t)0)
-extern struct dma_map_ops dummy_dma_ops;
+extern const struct dma_map_ops dummy_dma_ops;
 
-static inline struct dma_map_ops *__generic_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *__generic_dma_ops(struct device *dev)
 {
 	if (dev && dev->archdata.dma_ops)
 		return dev->archdata.dma_ops;
@@ -39,7 +39,7 @@ static inline struct dma_map_ops *__generic_dma_ops(struct device *dev)
 	return &dummy_dma_ops;
 }
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	if (xen_initial_domain())
 		return xen_dma_ops;
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index e04082700bb1..bcef6368d48f 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -352,7 +352,7 @@ static int __swiotlb_dma_supported(struct device *hwdev, u64 mask)
 	return 1;
 }
 
-static struct dma_map_ops swiotlb_dma_ops = {
+static const struct dma_map_ops swiotlb_dma_ops = {
 	.alloc = __dma_alloc,
 	.free = __dma_free,
 	.mmap = __swiotlb_mmap,
@@ -505,7 +505,7 @@ static int __dummy_dma_supported(struct device *hwdev, u64 mask)
 	return 0;
 }
 
-struct dma_map_ops dummy_dma_ops = {
+const struct dma_map_ops dummy_dma_ops = {
 	.alloc                  = __dummy_alloc,
 	.free                   = __dummy_free,
 	.mmap                   = __dummy_mmap,
@@ -784,7 +784,7 @@ static void __iommu_unmap_sg_attrs(struct device *dev,
 	iommu_dma_unmap_sg(dev, sgl, nelems, dir, attrs);
 }
 
-static struct dma_map_ops iommu_dma_ops = {
+static const struct dma_map_ops iommu_dma_ops = {
 	.alloc = __iommu_alloc_attrs,
 	.free = __iommu_free_attrs,
 	.mmap = __iommu_mmap_attrs,
diff --git a/arch/avr32/include/asm/dma-mapping.h b/arch/avr32/include/asm/dma-mapping.h
index 1115f2a645d1..b2b43c0e0774 100644
--- a/arch/avr32/include/asm/dma-mapping.h
+++ b/arch/avr32/include/asm/dma-mapping.h
@@ -4,9 +4,9 @@
 extern void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 	int direction);
 
-extern struct dma_map_ops avr32_dma_ops;
+extern const struct dma_map_ops avr32_dma_ops;
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	return &avr32_dma_ops;
 }
diff --git a/arch/avr32/mm/dma-coherent.c b/arch/avr32/mm/dma-coherent.c
index 54534e5d0781..555222d4f414 100644
--- a/arch/avr32/mm/dma-coherent.c
+++ b/arch/avr32/mm/dma-coherent.c
@@ -191,7 +191,7 @@ static void avr32_dma_sync_sg_for_device(struct device *dev,
 		dma_cache_sync(dev, sg_virt(sg), sg->length, direction);
 }
 
-struct dma_map_ops avr32_dma_ops = {
+const struct dma_map_ops avr32_dma_ops = {
 	.alloc			= avr32_dma_alloc,
 	.free			= avr32_dma_free,
 	.map_page		= avr32_dma_map_page,
diff --git a/arch/blackfin/include/asm/dma-mapping.h b/arch/blackfin/include/asm/dma-mapping.h
index 3490570aaa82..320fb50fbd41 100644
--- a/arch/blackfin/include/asm/dma-mapping.h
+++ b/arch/blackfin/include/asm/dma-mapping.h
@@ -36,9 +36,9 @@ _dma_sync(dma_addr_t addr, size_t size, enum dma_data_direction dir)
 		__dma_sync(addr, size, dir);
 }
 
-extern struct dma_map_ops bfin_dma_ops;
+extern const struct dma_map_ops bfin_dma_ops;
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	return &bfin_dma_ops;
 }
diff --git a/arch/blackfin/kernel/dma-mapping.c b/arch/blackfin/kernel/dma-mapping.c
index a27a74a18fb0..477bb29a7987 100644
--- a/arch/blackfin/kernel/dma-mapping.c
+++ b/arch/blackfin/kernel/dma-mapping.c
@@ -159,7 +159,7 @@ static inline void bfin_dma_sync_single_for_device(struct device *dev,
 	_dma_sync(handle, size, dir);
 }
 
-struct dma_map_ops bfin_dma_ops = {
+const struct dma_map_ops bfin_dma_ops = {
 	.alloc			= bfin_dma_alloc,
 	.free			= bfin_dma_free,
 
diff --git a/arch/c6x/include/asm/dma-mapping.h b/arch/c6x/include/asm/dma-mapping.h
index 5717b1e52d96..88258b9ebc8e 100644
--- a/arch/c6x/include/asm/dma-mapping.h
+++ b/arch/c6x/include/asm/dma-mapping.h
@@ -17,9 +17,9 @@
  */
 #define DMA_ERROR_CODE ~0
 
-extern struct dma_map_ops c6x_dma_ops;
+extern const struct dma_map_ops c6x_dma_ops;
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	return &c6x_dma_ops;
 }
diff --git a/arch/c6x/kernel/dma.c b/arch/c6x/kernel/dma.c
index 6752df32ef06..9fff8be75f58 100644
--- a/arch/c6x/kernel/dma.c
+++ b/arch/c6x/kernel/dma.c
@@ -123,7 +123,7 @@ static void c6x_dma_sync_sg_for_device(struct device *dev,
 
 }
 
-struct dma_map_ops c6x_dma_ops = {
+const struct dma_map_ops c6x_dma_ops = {
 	.alloc			= c6x_dma_alloc,
 	.free			= c6x_dma_free,
 	.map_page		= c6x_dma_map_page,
diff --git a/arch/cris/arch-v32/drivers/pci/dma.c b/arch/cris/arch-v32/drivers/pci/dma.c
index 1f0636793f0c..7072341995ff 100644
--- a/arch/cris/arch-v32/drivers/pci/dma.c
+++ b/arch/cris/arch-v32/drivers/pci/dma.c
@@ -69,7 +69,7 @@ static inline int v32_dma_supported(struct device *dev, u64 mask)
 	return 1;
 }
 
-struct dma_map_ops v32_dma_ops = {
+const struct dma_map_ops v32_dma_ops = {
 	.alloc			= v32_dma_alloc,
 	.free			= v32_dma_free,
 	.map_page		= v32_dma_map_page,
diff --git a/arch/cris/include/asm/dma-mapping.h b/arch/cris/include/asm/dma-mapping.h
index 5a370178a0e9..aae4fbc0a656 100644
--- a/arch/cris/include/asm/dma-mapping.h
+++ b/arch/cris/include/asm/dma-mapping.h
@@ -2,14 +2,14 @@
 #define _ASM_CRIS_DMA_MAPPING_H
 
 #ifdef CONFIG_PCI
-extern struct dma_map_ops v32_dma_ops;
+extern const struct dma_map_ops v32_dma_ops;
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	return &v32_dma_ops;
 }
 #else
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	BUG();
 	return NULL;
diff --git a/arch/frv/include/asm/dma-mapping.h b/arch/frv/include/asm/dma-mapping.h
index 9a82bfa4303b..150cc00544a8 100644
--- a/arch/frv/include/asm/dma-mapping.h
+++ b/arch/frv/include/asm/dma-mapping.h
@@ -7,9 +7,9 @@
 extern unsigned long __nongprelbss dma_coherent_mem_start;
 extern unsigned long __nongprelbss dma_coherent_mem_end;
 
-extern struct dma_map_ops frv_dma_ops;
+extern const struct dma_map_ops frv_dma_ops;
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	return &frv_dma_ops;
 }
diff --git a/arch/frv/mb93090-mb00/pci-dma-nommu.c b/arch/frv/mb93090-mb00/pci-dma-nommu.c
index 187688128c65..4a96de7f0af4 100644
--- a/arch/frv/mb93090-mb00/pci-dma-nommu.c
+++ b/arch/frv/mb93090-mb00/pci-dma-nommu.c
@@ -164,7 +164,7 @@ static int frv_dma_supported(struct device *dev, u64 mask)
 	return 1;
 }
 
-struct dma_map_ops frv_dma_ops = {
+const struct dma_map_ops frv_dma_ops = {
 	.alloc			= frv_dma_alloc,
 	.free			= frv_dma_free,
 	.map_page		= frv_dma_map_page,
diff --git a/arch/frv/mb93090-mb00/pci-dma.c b/arch/frv/mb93090-mb00/pci-dma.c
index dba7df918144..e7130abc0dae 100644
--- a/arch/frv/mb93090-mb00/pci-dma.c
+++ b/arch/frv/mb93090-mb00/pci-dma.c
@@ -106,7 +106,7 @@ static int frv_dma_supported(struct device *dev, u64 mask)
 	return 1;
 }
 
-struct dma_map_ops frv_dma_ops = {
+const struct dma_map_ops frv_dma_ops = {
 	.alloc			= frv_dma_alloc,
 	.free			= frv_dma_free,
 	.map_page		= frv_dma_map_page,
diff --git a/arch/h8300/include/asm/dma-mapping.h b/arch/h8300/include/asm/dma-mapping.h
index 7ac7fadffed0..f804bca4c13f 100644
--- a/arch/h8300/include/asm/dma-mapping.h
+++ b/arch/h8300/include/asm/dma-mapping.h
@@ -1,9 +1,9 @@
 #ifndef _H8300_DMA_MAPPING_H
 #define _H8300_DMA_MAPPING_H
 
-extern struct dma_map_ops h8300_dma_map_ops;
+extern const struct dma_map_ops h8300_dma_map_ops;
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	return &h8300_dma_map_ops;
 }
diff --git a/arch/h8300/kernel/dma.c b/arch/h8300/kernel/dma.c
index 3651da045806..225dd0a188dc 100644
--- a/arch/h8300/kernel/dma.c
+++ b/arch/h8300/kernel/dma.c
@@ -60,7 +60,7 @@ static int map_sg(struct device *dev, struct scatterlist *sgl,
 	return nents;
 }
 
-struct dma_map_ops h8300_dma_map_ops = {
+const struct dma_map_ops h8300_dma_map_ops = {
 	.alloc = dma_alloc,
 	.free = dma_free,
 	.map_page = map_page,
diff --git a/arch/hexagon/include/asm/dma-mapping.h b/arch/hexagon/include/asm/dma-mapping.h
index 7ef58df909fc..b812e917cd95 100644
--- a/arch/hexagon/include/asm/dma-mapping.h
+++ b/arch/hexagon/include/asm/dma-mapping.h
@@ -32,9 +32,9 @@ struct device;
 extern int bad_dma_address;
 #define DMA_ERROR_CODE bad_dma_address
 
-extern struct dma_map_ops *dma_ops;
+extern const struct dma_map_ops *dma_ops;
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	if (unlikely(dev == NULL))
 		return NULL;
diff --git a/arch/hexagon/kernel/dma.c b/arch/hexagon/kernel/dma.c
index dbc4f1003da4..e74b65009587 100644
--- a/arch/hexagon/kernel/dma.c
+++ b/arch/hexagon/kernel/dma.c
@@ -25,7 +25,7 @@
 #include <linux/module.h>
 #include <asm/page.h>
 
-struct dma_map_ops *dma_ops;
+const struct dma_map_ops *dma_ops;
 EXPORT_SYMBOL(dma_ops);
 
 int bad_dma_address;  /*  globals are automatically initialized to zero  */
@@ -203,7 +203,7 @@ static void hexagon_sync_single_for_device(struct device *dev,
 	dma_sync(dma_addr_to_virt(dma_handle), size, dir);
 }
 
-struct dma_map_ops hexagon_dma_ops = {
+const struct dma_map_ops hexagon_dma_ops = {
 	.alloc		= hexagon_dma_alloc_coherent,
 	.free		= hexagon_free_coherent,
 	.map_sg		= hexagon_map_sg,
diff --git a/arch/ia64/hp/common/hwsw_iommu.c b/arch/ia64/hp/common/hwsw_iommu.c
index 1e4cae5ae053..0310078a95f8 100644
--- a/arch/ia64/hp/common/hwsw_iommu.c
+++ b/arch/ia64/hp/common/hwsw_iommu.c
@@ -18,7 +18,7 @@
 #include <linux/export.h>
 #include <asm/machvec.h>
 
-extern struct dma_map_ops sba_dma_ops, swiotlb_dma_ops;
+extern const struct dma_map_ops sba_dma_ops, swiotlb_dma_ops;
 
 /* swiotlb declarations & definitions: */
 extern int swiotlb_late_init_with_default_size (size_t size);
@@ -34,7 +34,7 @@ static inline int use_swiotlb(struct device *dev)
 		!sba_dma_ops.dma_supported(dev, *dev->dma_mask);
 }
 
-struct dma_map_ops *hwsw_dma_get_ops(struct device *dev)
+const struct dma_map_ops *hwsw_dma_get_ops(struct device *dev)
 {
 	if (use_swiotlb(dev))
 		return &swiotlb_dma_ops;
diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index 630ee8073899..aec4a3354abe 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -2096,7 +2096,7 @@ static int __init acpi_sba_ioc_init_acpi(void)
 /* This has to run before acpi_scan_init(). */
 arch_initcall(acpi_sba_ioc_init_acpi);
 
-extern struct dma_map_ops swiotlb_dma_ops;
+extern const struct dma_map_ops swiotlb_dma_ops;
 
 static int __init
 sba_init(void)
@@ -2216,7 +2216,7 @@ sba_page_override(char *str)
 
 __setup("sbapagesize=",sba_page_override);
 
-struct dma_map_ops sba_dma_ops = {
+const struct dma_map_ops sba_dma_ops = {
 	.alloc			= sba_alloc_coherent,
 	.free			= sba_free_coherent,
 	.map_page		= sba_map_page,
diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h
index d472805edfa9..05e467d56d86 100644
--- a/arch/ia64/include/asm/dma-mapping.h
+++ b/arch/ia64/include/asm/dma-mapping.h
@@ -14,7 +14,7 @@
 
 #define DMA_ERROR_CODE 0
 
-extern struct dma_map_ops *dma_ops;
+extern const struct dma_map_ops *dma_ops;
 extern struct ia64_machine_vector ia64_mv;
 extern void set_iommu_machvec(void);
 
diff --git a/arch/ia64/include/asm/machvec.h b/arch/ia64/include/asm/machvec.h
index ed7f09089f12..af285c423e1e 100644
--- a/arch/ia64/include/asm/machvec.h
+++ b/arch/ia64/include/asm/machvec.h
@@ -44,7 +44,7 @@ typedef void ia64_mv_kernel_launch_event_t(void);
 /* DMA-mapping interface: */
 typedef void ia64_mv_dma_init (void);
 typedef u64 ia64_mv_dma_get_required_mask (struct device *);
-typedef struct dma_map_ops *ia64_mv_dma_get_ops(struct device *);
+typedef const struct dma_map_ops *ia64_mv_dma_get_ops(struct device *);
 
 /*
  * WARNING: The legacy I/O space is _architected_.  Platforms are
@@ -248,7 +248,7 @@ extern void machvec_init_from_cmdline(const char *cmdline);
 # endif /* CONFIG_IA64_GENERIC */
 
 extern void swiotlb_dma_init(void);
-extern struct dma_map_ops *dma_get_ops(struct device *);
+extern const struct dma_map_ops *dma_get_ops(struct device *);
 
 /*
  * Define default versions so we can extend machvec for new platforms without having
diff --git a/arch/ia64/kernel/dma-mapping.c b/arch/ia64/kernel/dma-mapping.c
index 7f7916238208..e0dd97f4eb69 100644
--- a/arch/ia64/kernel/dma-mapping.c
+++ b/arch/ia64/kernel/dma-mapping.c
@@ -4,7 +4,7 @@
 /* Set this to 1 if there is a HW IOMMU in the system */
 int iommu_detected __read_mostly;
 
-struct dma_map_ops *dma_ops;
+const struct dma_map_ops *dma_ops;
 EXPORT_SYMBOL(dma_ops);
 
 #define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
@@ -17,7 +17,7 @@ static int __init dma_init(void)
 }
 fs_initcall(dma_init);
 
-struct dma_map_ops *dma_get_ops(struct device *dev)
+const struct dma_map_ops *dma_get_ops(struct device *dev)
 {
 	return dma_ops;
 }
diff --git a/arch/ia64/kernel/pci-dma.c b/arch/ia64/kernel/pci-dma.c
index 992c1098c522..9094a73f996f 100644
--- a/arch/ia64/kernel/pci-dma.c
+++ b/arch/ia64/kernel/pci-dma.c
@@ -90,11 +90,11 @@ void __init pci_iommu_alloc(void)
 {
 	dma_ops = &intel_dma_ops;
 
-	dma_ops->sync_single_for_cpu = machvec_dma_sync_single;
-	dma_ops->sync_sg_for_cpu = machvec_dma_sync_sg;
-	dma_ops->sync_single_for_device = machvec_dma_sync_single;
-	dma_ops->sync_sg_for_device = machvec_dma_sync_sg;
-	dma_ops->dma_supported = iommu_dma_supported;
+	intel_dma_ops.sync_single_for_cpu = machvec_dma_sync_single;
+	intel_dma_ops.sync_sg_for_cpu = machvec_dma_sync_sg;
+	intel_dma_ops.sync_single_for_device = machvec_dma_sync_single;
+	intel_dma_ops.sync_sg_for_device = machvec_dma_sync_sg;
+	intel_dma_ops.dma_supported = iommu_dma_supported;
 
 	/*
 	 * The order of these functions is important for
diff --git a/arch/ia64/kernel/pci-swiotlb.c b/arch/ia64/kernel/pci-swiotlb.c
index 2933208c0285..a14989dacded 100644
--- a/arch/ia64/kernel/pci-swiotlb.c
+++ b/arch/ia64/kernel/pci-swiotlb.c
@@ -30,7 +30,7 @@ static void ia64_swiotlb_free_coherent(struct device *dev, size_t size,
 	swiotlb_free_coherent(dev, size, vaddr, dma_addr);
 }
 
-struct dma_map_ops swiotlb_dma_ops = {
+const struct dma_map_ops swiotlb_dma_ops = {
 	.alloc = ia64_swiotlb_alloc_coherent,
 	.free = ia64_swiotlb_free_coherent,
 	.map_page = swiotlb_map_page,
diff --git a/arch/m32r/include/asm/device.h b/arch/m32r/include/asm/device.h
index 4a9f35e0973f..7955a9799466 100644
--- a/arch/m32r/include/asm/device.h
+++ b/arch/m32r/include/asm/device.h
@@ -4,7 +4,7 @@
  * This file is released under the GPLv2
  */
 struct dev_archdata {
-	struct dma_map_ops *dma_ops;
+	const struct dma_map_ops *dma_ops;
 };
 
 struct pdev_archdata {
diff --git a/arch/m32r/include/asm/dma-mapping.h b/arch/m32r/include/asm/dma-mapping.h
index 2c43a77fe942..99c43d2f05dc 100644
--- a/arch/m32r/include/asm/dma-mapping.h
+++ b/arch/m32r/include/asm/dma-mapping.h
@@ -10,7 +10,7 @@
 
 #define DMA_ERROR_CODE (~(dma_addr_t)0x0)
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	if (dev && dev->archdata.dma_ops)
 		return dev->archdata.dma_ops;
diff --git a/arch/m68k/include/asm/dma-mapping.h b/arch/m68k/include/asm/dma-mapping.h
index 96c536194287..863509939d5a 100644
--- a/arch/m68k/include/asm/dma-mapping.h
+++ b/arch/m68k/include/asm/dma-mapping.h
@@ -1,9 +1,9 @@
 #ifndef _M68K_DMA_MAPPING_H
 #define _M68K_DMA_MAPPING_H
 
-extern struct dma_map_ops m68k_dma_ops;
+extern const struct dma_map_ops m68k_dma_ops;
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
         return &m68k_dma_ops;
 }
diff --git a/arch/m68k/kernel/dma.c b/arch/m68k/kernel/dma.c
index 07070065a425..0fc5dabb4a42 100644
--- a/arch/m68k/kernel/dma.c
+++ b/arch/m68k/kernel/dma.c
@@ -158,7 +158,7 @@ static int m68k_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 	return nents;
 }
 
-struct dma_map_ops m68k_dma_ops = {
+const struct dma_map_ops m68k_dma_ops = {
 	.alloc			= m68k_dma_alloc,
 	.free			= m68k_dma_free,
 	.map_page		= m68k_dma_map_page,
diff --git a/arch/metag/include/asm/dma-mapping.h b/arch/metag/include/asm/dma-mapping.h
index 27af5d479ce6..c156a7ac732f 100644
--- a/arch/metag/include/asm/dma-mapping.h
+++ b/arch/metag/include/asm/dma-mapping.h
@@ -1,9 +1,9 @@
 #ifndef _ASM_METAG_DMA_MAPPING_H
 #define _ASM_METAG_DMA_MAPPING_H
 
-extern struct dma_map_ops metag_dma_ops;
+extern const struct dma_map_ops metag_dma_ops;
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	return &metag_dma_ops;
 }
diff --git a/arch/metag/kernel/dma.c b/arch/metag/kernel/dma.c
index 91968d92652b..f0ab3a498328 100644
--- a/arch/metag/kernel/dma.c
+++ b/arch/metag/kernel/dma.c
@@ -575,7 +575,7 @@ static void metag_dma_sync_sg_for_device(struct device *dev,
 		dma_sync_for_device(sg_virt(sg), sg->length, direction);
 }
 
-struct dma_map_ops metag_dma_ops = {
+const struct dma_map_ops metag_dma_ops = {
 	.alloc			= metag_dma_alloc,
 	.free			= metag_dma_free,
 	.map_page		= metag_dma_map_page,
diff --git a/arch/microblaze/include/asm/dma-mapping.h b/arch/microblaze/include/asm/dma-mapping.h
index 1768d4bdc8d3..c7faf2fb51d6 100644
--- a/arch/microblaze/include/asm/dma-mapping.h
+++ b/arch/microblaze/include/asm/dma-mapping.h
@@ -36,9 +36,9 @@
 /*
  * Available generic sets of operations
  */
-extern struct dma_map_ops dma_direct_ops;
+extern const struct dma_map_ops dma_direct_ops;
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	return &dma_direct_ops;
 }
diff --git a/arch/microblaze/kernel/dma.c b/arch/microblaze/kernel/dma.c
index 818daf230eb4..12e093a03e60 100644
--- a/arch/microblaze/kernel/dma.c
+++ b/arch/microblaze/kernel/dma.c
@@ -187,7 +187,7 @@ int dma_direct_mmap_coherent(struct device *dev, struct vm_area_struct *vma,
 #endif
 }
 
-struct dma_map_ops dma_direct_ops = {
+const struct dma_map_ops dma_direct_ops = {
 	.alloc		= dma_direct_alloc_coherent,
 	.free		= dma_direct_free_coherent,
 	.mmap		= dma_direct_mmap_coherent,
diff --git a/arch/mips/cavium-octeon/dma-octeon.c b/arch/mips/cavium-octeon/dma-octeon.c
index fd69528b24fb..897d32c888ee 100644
--- a/arch/mips/cavium-octeon/dma-octeon.c
+++ b/arch/mips/cavium-octeon/dma-octeon.c
@@ -205,7 +205,7 @@ static phys_addr_t octeon_unity_dma_to_phys(struct device *dev, dma_addr_t daddr
 }
 
 struct octeon_dma_map_ops {
-	struct dma_map_ops dma_map_ops;
+	const struct dma_map_ops dma_map_ops;
 	dma_addr_t (*phys_to_dma)(struct device *dev, phys_addr_t paddr);
 	phys_addr_t (*dma_to_phys)(struct device *dev, dma_addr_t daddr);
 };
@@ -333,7 +333,7 @@ static struct octeon_dma_map_ops _octeon_pci_dma_map_ops = {
 	},
 };
 
-struct dma_map_ops *octeon_pci_dma_map_ops;
+const struct dma_map_ops *octeon_pci_dma_map_ops;
 
 void __init octeon_pci_dma_init(void)
 {
diff --git a/arch/mips/include/asm/device.h b/arch/mips/include/asm/device.h
index 21c2082a0dfb..ebc5c1265473 100644
--- a/arch/mips/include/asm/device.h
+++ b/arch/mips/include/asm/device.h
@@ -10,7 +10,7 @@ struct dma_map_ops;
 
 struct dev_archdata {
 	/* DMA operations on that device */
-	struct dma_map_ops *dma_ops;
+	const struct dma_map_ops *dma_ops;
 
 #ifdef CONFIG_DMA_PERDEV_COHERENT
 	/* Non-zero if DMA is coherent with CPU caches */
diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h
index 7aa71b9b0258..b59b084a7569 100644
--- a/arch/mips/include/asm/dma-mapping.h
+++ b/arch/mips/include/asm/dma-mapping.h
@@ -9,9 +9,9 @@
 #include <dma-coherence.h>
 #endif
 
-extern struct dma_map_ops *mips_dma_map_ops;
+extern const struct dma_map_ops *mips_dma_map_ops;
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	if (dev && dev->archdata.dma_ops)
 		return dev->archdata.dma_ops;
diff --git a/arch/mips/include/asm/mach-cavium-octeon/dma-coherence.h b/arch/mips/include/asm/mach-cavium-octeon/dma-coherence.h
index 460042ee5d6f..9110988b92a1 100644
--- a/arch/mips/include/asm/mach-cavium-octeon/dma-coherence.h
+++ b/arch/mips/include/asm/mach-cavium-octeon/dma-coherence.h
@@ -65,7 +65,7 @@ dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr);
 phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr);
 
 struct dma_map_ops;
-extern struct dma_map_ops *octeon_pci_dma_map_ops;
+extern const struct dma_map_ops *octeon_pci_dma_map_ops;
 extern char *octeon_swiotlb;
 
 #endif /* __ASM_MACH_CAVIUM_OCTEON_DMA_COHERENCE_H */
diff --git a/arch/mips/include/asm/netlogic/common.h b/arch/mips/include/asm/netlogic/common.h
index be52c2125d71..e0717d10e650 100644
--- a/arch/mips/include/asm/netlogic/common.h
+++ b/arch/mips/include/asm/netlogic/common.h
@@ -88,7 +88,7 @@ extern struct plat_smp_ops nlm_smp_ops;
 extern char nlm_reset_entry[], nlm_reset_entry_end[];
 
 /* SWIOTLB */
-extern struct dma_map_ops nlm_swiotlb_dma_ops;
+extern const struct dma_map_ops nlm_swiotlb_dma_ops;
 
 extern unsigned int nlm_threads_per_core;
 extern cpumask_t nlm_cpumask;
diff --git a/arch/mips/loongson64/common/dma-swiotlb.c b/arch/mips/loongson64/common/dma-swiotlb.c
index aab4fd681e1f..7296df043d92 100644
--- a/arch/mips/loongson64/common/dma-swiotlb.c
+++ b/arch/mips/loongson64/common/dma-swiotlb.c
@@ -122,7 +122,7 @@ phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 	return daddr;
 }
 
-static struct dma_map_ops loongson_dma_map_ops = {
+static const struct dma_map_ops loongson_dma_map_ops = {
 	.alloc = loongson_dma_alloc_coherent,
 	.free = loongson_dma_free_coherent,
 	.map_page = loongson_dma_map_page,
diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c
index a39c36af97ad..1cb84472cb58 100644
--- a/arch/mips/mm/dma-default.c
+++ b/arch/mips/mm/dma-default.c
@@ -417,7 +417,7 @@ void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 
 EXPORT_SYMBOL(dma_cache_sync);
 
-static struct dma_map_ops mips_default_dma_map_ops = {
+static const struct dma_map_ops mips_default_dma_map_ops = {
 	.alloc = mips_dma_alloc_coherent,
 	.free = mips_dma_free_coherent,
 	.mmap = mips_dma_mmap,
@@ -433,7 +433,7 @@ static struct dma_map_ops mips_default_dma_map_ops = {
 	.dma_supported = mips_dma_supported
 };
 
-struct dma_map_ops *mips_dma_map_ops = &mips_default_dma_map_ops;
+const struct dma_map_ops *mips_dma_map_ops = &mips_default_dma_map_ops;
 EXPORT_SYMBOL(mips_dma_map_ops);
 
 #define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
diff --git a/arch/mips/netlogic/common/nlm-dma.c b/arch/mips/netlogic/common/nlm-dma.c
index 0630693bec2a..0ec9d9da6d51 100644
--- a/arch/mips/netlogic/common/nlm-dma.c
+++ b/arch/mips/netlogic/common/nlm-dma.c
@@ -67,7 +67,7 @@ static void nlm_dma_free_coherent(struct device *dev, size_t size,
 	swiotlb_free_coherent(dev, size, vaddr, dma_handle);
 }
 
-struct dma_map_ops nlm_swiotlb_dma_ops = {
+const struct dma_map_ops nlm_swiotlb_dma_ops = {
 	.alloc = nlm_dma_alloc_coherent,
 	.free = nlm_dma_free_coherent,
 	.map_page = swiotlb_map_page,
diff --git a/arch/mn10300/include/asm/dma-mapping.h b/arch/mn10300/include/asm/dma-mapping.h
index 1dcd44757f32..564e3927e005 100644
--- a/arch/mn10300/include/asm/dma-mapping.h
+++ b/arch/mn10300/include/asm/dma-mapping.h
@@ -14,9 +14,9 @@
 #include <asm/cache.h>
 #include <asm/io.h>
 
-extern struct dma_map_ops mn10300_dma_ops;
+extern const struct dma_map_ops mn10300_dma_ops;
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	return &mn10300_dma_ops;
 }
diff --git a/arch/mn10300/mm/dma-alloc.c b/arch/mn10300/mm/dma-alloc.c
index 4f4b9029f0ea..86108d2496b3 100644
--- a/arch/mn10300/mm/dma-alloc.c
+++ b/arch/mn10300/mm/dma-alloc.c
@@ -121,7 +121,7 @@ static int mn10300_dma_supported(struct device *dev, u64 mask)
 	return 1;
 }
 
-struct dma_map_ops mn10300_dma_ops = {
+const struct dma_map_ops mn10300_dma_ops = {
 	.alloc			= mn10300_dma_alloc,
 	.free			= mn10300_dma_free,
 	.map_page		= mn10300_dma_map_page,
diff --git a/arch/nios2/include/asm/dma-mapping.h b/arch/nios2/include/asm/dma-mapping.h
index bec8ac8e6ad2..aa00d839a64b 100644
--- a/arch/nios2/include/asm/dma-mapping.h
+++ b/arch/nios2/include/asm/dma-mapping.h
@@ -10,9 +10,9 @@
 #ifndef _ASM_NIOS2_DMA_MAPPING_H
 #define _ASM_NIOS2_DMA_MAPPING_H
 
-extern struct dma_map_ops nios2_dma_ops;
+extern const struct dma_map_ops nios2_dma_ops;
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	return &nios2_dma_ops;
 }
diff --git a/arch/nios2/mm/dma-mapping.c b/arch/nios2/mm/dma-mapping.c
index f6a5dcf9d682..7040c1adbb5e 100644
--- a/arch/nios2/mm/dma-mapping.c
+++ b/arch/nios2/mm/dma-mapping.c
@@ -192,7 +192,7 @@ static void nios2_dma_sync_sg_for_device(struct device *dev,
 
 }
 
-struct dma_map_ops nios2_dma_ops = {
+const struct dma_map_ops nios2_dma_ops = {
 	.alloc			= nios2_dma_alloc,
 	.free			= nios2_dma_free,
 	.map_page		= nios2_dma_map_page,
diff --git a/arch/openrisc/include/asm/dma-mapping.h b/arch/openrisc/include/asm/dma-mapping.h
index 1f260bccb368..88acbedb4947 100644
--- a/arch/openrisc/include/asm/dma-mapping.h
+++ b/arch/openrisc/include/asm/dma-mapping.h
@@ -28,9 +28,9 @@
 
 #define DMA_ERROR_CODE		(~(dma_addr_t)0x0)
 
-extern struct dma_map_ops or1k_dma_map_ops;
+extern const struct dma_map_ops or1k_dma_map_ops;
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	return &or1k_dma_map_ops;
 }
diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c
index 906998bac957..b10369b7e31b 100644
--- a/arch/openrisc/kernel/dma.c
+++ b/arch/openrisc/kernel/dma.c
@@ -232,7 +232,7 @@ or1k_sync_single_for_device(struct device *dev,
 		mtspr(SPR_DCBFR, cl);
 }
 
-struct dma_map_ops or1k_dma_map_ops = {
+const struct dma_map_ops or1k_dma_map_ops = {
 	.alloc = or1k_dma_alloc,
 	.free = or1k_dma_free,
 	.map_page = or1k_map_page,
diff --git a/arch/parisc/include/asm/dma-mapping.h b/arch/parisc/include/asm/dma-mapping.h
index 16e024602737..1749073e44fc 100644
--- a/arch/parisc/include/asm/dma-mapping.h
+++ b/arch/parisc/include/asm/dma-mapping.h
@@ -21,13 +21,13 @@
 */
 
 #ifdef CONFIG_PA11
-extern struct dma_map_ops pcxl_dma_ops;
-extern struct dma_map_ops pcx_dma_ops;
+extern const struct dma_map_ops pcxl_dma_ops;
+extern const struct dma_map_ops pcx_dma_ops;
 #endif
 
-extern struct dma_map_ops *hppa_dma_ops;
+extern const struct dma_map_ops *hppa_dma_ops;
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	return hppa_dma_ops;
 }
diff --git a/arch/parisc/kernel/drivers.c b/arch/parisc/kernel/drivers.c
index 700e2d2da096..fa78419100c8 100644
--- a/arch/parisc/kernel/drivers.c
+++ b/arch/parisc/kernel/drivers.c
@@ -40,7 +40,7 @@
 #include <asm/parisc-device.h>
 
 /* See comments in include/asm-parisc/pci.h */
-struct dma_map_ops *hppa_dma_ops __read_mostly;
+const struct dma_map_ops *hppa_dma_ops __read_mostly;
 EXPORT_SYMBOL(hppa_dma_ops);
 
 static struct device root = {
diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c
index 697c53543a4d..5f0067a62738 100644
--- a/arch/parisc/kernel/pci-dma.c
+++ b/arch/parisc/kernel/pci-dma.c
@@ -572,7 +572,7 @@ static void pa11_dma_sync_sg_for_device(struct device *dev, struct scatterlist *
 		flush_kernel_vmap_range(sg_virt(sg), sg->length);
 }
 
-struct dma_map_ops pcxl_dma_ops = {
+const struct dma_map_ops pcxl_dma_ops = {
 	.dma_supported =	pa11_dma_supported,
 	.alloc =		pa11_dma_alloc,
 	.free =			pa11_dma_free,
@@ -608,7 +608,7 @@ static void pcx_dma_free(struct device *dev, size_t size, void *vaddr,
 	return;
 }
 
-struct dma_map_ops pcx_dma_ops = {
+const struct dma_map_ops pcx_dma_ops = {
 	.dma_supported =	pa11_dma_supported,
 	.alloc =		pcx_dma_alloc,
 	.free =			pcx_dma_free,
diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h
index 406c2b1ff82d..49cbb0fca233 100644
--- a/arch/powerpc/include/asm/device.h
+++ b/arch/powerpc/include/asm/device.h
@@ -21,7 +21,7 @@ struct iommu_table;
  */
 struct dev_archdata {
 	/* DMA operations on that device */
-	struct dma_map_ops	*dma_ops;
+	const struct dma_map_ops	*dma_ops;
 
 	/*
 	 * These two used to be a union. However, with the hybrid ops we need
diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
index 84e3f8dd5e4f..2ec3eadf336f 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -76,9 +76,9 @@ static inline unsigned long device_to_mask(struct device *dev)
 #ifdef CONFIG_PPC64
 extern struct dma_map_ops dma_iommu_ops;
 #endif
-extern struct dma_map_ops dma_direct_ops;
+extern const struct dma_map_ops dma_direct_ops;
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	/* We don't handle the NULL dev case for ISA for now. We could
 	 * do it via an out of line call but it is not needed for now. The
@@ -91,7 +91,7 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 	return dev->archdata.dma_ops;
 }
 
-static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops)
+static inline void set_dma_ops(struct device *dev, const struct dma_map_ops *ops)
 {
 	dev->archdata.dma_ops = ops;
 }
diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
index e9bd6cf0212f..93eded8d3843 100644
--- a/arch/powerpc/include/asm/pci.h
+++ b/arch/powerpc/include/asm/pci.h
@@ -53,8 +53,8 @@ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
 }
 
 #ifdef CONFIG_PCI
-extern void set_pci_dma_ops(struct dma_map_ops *dma_ops);
-extern struct dma_map_ops *get_pci_dma_ops(void);
+extern void set_pci_dma_ops(const struct dma_map_ops *dma_ops);
+extern const struct dma_map_ops *get_pci_dma_ops(void);
 #else	/* CONFIG_PCI */
 #define set_pci_dma_ops(d)
 #define get_pci_dma_ops()	NULL
diff --git a/arch/powerpc/include/asm/swiotlb.h b/arch/powerpc/include/asm/swiotlb.h
index de99d6e29430..01d45a5fd00b 100644
--- a/arch/powerpc/include/asm/swiotlb.h
+++ b/arch/powerpc/include/asm/swiotlb.h
@@ -13,7 +13,7 @@
 
 #include <linux/swiotlb.h>
 
-extern struct dma_map_ops swiotlb_dma_ops;
+extern const struct dma_map_ops swiotlb_dma_ops;
 
 static inline void dma_mark_clean(void *addr, size_t size) {}
 
diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c
index c6689f658b50..d0ea7860e02b 100644
--- a/arch/powerpc/kernel/dma-swiotlb.c
+++ b/arch/powerpc/kernel/dma-swiotlb.c
@@ -46,7 +46,7 @@ static u64 swiotlb_powerpc_get_required(struct device *dev)
  * map_page, and unmap_page on highmem, use normal dma_ops
  * for everything else.
  */
-struct dma_map_ops swiotlb_dma_ops = {
+const struct dma_map_ops swiotlb_dma_ops = {
 	.alloc = __dma_direct_alloc_coherent,
 	.free = __dma_direct_free_coherent,
 	.mmap = dma_direct_mmap_coherent,
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index 6877e3fa95bb..03b98f1f98ec 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -274,7 +274,7 @@ static inline void dma_direct_sync_single(struct device *dev,
 }
 #endif
 
-struct dma_map_ops dma_direct_ops = {
+const struct dma_map_ops dma_direct_ops = {
 	.alloc				= dma_direct_alloc_coherent,
 	.free				= dma_direct_free_coherent,
 	.mmap				= dma_direct_mmap_coherent,
@@ -316,7 +316,7 @@ EXPORT_SYMBOL(dma_set_coherent_mask);
 
 int __dma_set_mask(struct device *dev, u64 dma_mask)
 {
-	struct dma_map_ops *dma_ops = get_dma_ops(dev);
+	const struct dma_map_ops *dma_ops = get_dma_ops(dev);
 
 	if ((dma_ops != NULL) && (dma_ops->set_dma_mask != NULL))
 		return dma_ops->set_dma_mask(dev, dma_mask);
@@ -344,7 +344,7 @@ EXPORT_SYMBOL(dma_set_mask);
 
 u64 __dma_get_required_mask(struct device *dev)
 {
-	struct dma_map_ops *dma_ops = get_dma_ops(dev);
+	const struct dma_map_ops *dma_ops = get_dma_ops(dev);
 
 	if (unlikely(dma_ops == NULL))
 		return 0;
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 74bec5498972..09db4778435c 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -59,14 +59,14 @@ resource_size_t isa_mem_base;
 EXPORT_SYMBOL(isa_mem_base);
 
 
-static struct dma_map_ops *pci_dma_ops = &dma_direct_ops;
+static const struct dma_map_ops *pci_dma_ops = &dma_direct_ops;
 
-void set_pci_dma_ops(struct dma_map_ops *dma_ops)
+void set_pci_dma_ops(const struct dma_map_ops *dma_ops)
 {
 	pci_dma_ops = dma_ops;
 }
 
-struct dma_map_ops *get_pci_dma_ops(void)
+const struct dma_map_ops *get_pci_dma_ops(void)
 {
 	return pci_dma_ops;
 }
diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
index 7ff51f96a00e..e1413e69e5fe 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -651,7 +651,7 @@ static int dma_fixed_dma_supported(struct device *dev, u64 mask)
 
 static int dma_set_mask_and_switch(struct device *dev, u64 dma_mask);
 
-static struct dma_map_ops dma_iommu_fixed_ops = {
+static const struct dma_map_ops dma_iommu_fixed_ops = {
 	.alloc          = dma_fixed_alloc_coherent,
 	.free           = dma_fixed_free_coherent,
 	.map_sg         = dma_fixed_map_sg,
@@ -1172,7 +1172,7 @@ __setup("iommu_fixed=", setup_iommu_fixed);
 
 static u64 cell_dma_get_required_mask(struct device *dev)
 {
-	struct dma_map_ops *dma_ops;
+	const struct dma_map_ops *dma_ops;
 
 	if (!dev->dma_mask)
 		return 0;
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index 73b155fd4481..1c383f38031d 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -115,7 +115,7 @@ static u64 dma_npu_get_required_mask(struct device *dev)
 	return 0;
 }
 
-static struct dma_map_ops dma_npu_ops = {
+static const struct dma_map_ops dma_npu_ops = {
 	.map_page		= dma_npu_map_page,
 	.map_sg			= dma_npu_map_sg,
 	.alloc			= dma_npu_alloc,
diff --git a/arch/powerpc/platforms/ps3/system-bus.c b/arch/powerpc/platforms/ps3/system-bus.c
index 8af1c15aef85..c81450d98794 100644
--- a/arch/powerpc/platforms/ps3/system-bus.c
+++ b/arch/powerpc/platforms/ps3/system-bus.c
@@ -701,7 +701,7 @@ static u64 ps3_dma_get_required_mask(struct device *_dev)
 	return DMA_BIT_MASK(32);
 }
 
-static struct dma_map_ops ps3_sb_dma_ops = {
+static const struct dma_map_ops ps3_sb_dma_ops = {
 	.alloc = ps3_alloc_coherent,
 	.free = ps3_free_coherent,
 	.map_sg = ps3_sb_map_sg,
@@ -712,7 +712,7 @@ static struct dma_map_ops ps3_sb_dma_ops = {
 	.unmap_page = ps3_unmap_page,
 };
 
-static struct dma_map_ops ps3_ioc0_dma_ops = {
+static const struct dma_map_ops ps3_ioc0_dma_ops = {
 	.alloc = ps3_alloc_coherent,
 	.free = ps3_free_coherent,
 	.map_sg = ps3_ioc0_map_sg,
diff --git a/arch/powerpc/platforms/pseries/ibmebus.c b/arch/powerpc/platforms/pseries/ibmebus.c
index 614c28537141..2e36a0b8944a 100644
--- a/arch/powerpc/platforms/pseries/ibmebus.c
+++ b/arch/powerpc/platforms/pseries/ibmebus.c
@@ -136,7 +136,7 @@ static u64 ibmebus_dma_get_required_mask(struct device *dev)
 	return DMA_BIT_MASK(64);
 }
 
-static struct dma_map_ops ibmebus_dma_ops = {
+static const struct dma_map_ops ibmebus_dma_ops = {
 	.alloc              = ibmebus_alloc_coherent,
 	.free               = ibmebus_free_coherent,
 	.map_sg             = ibmebus_map_sg,
diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c
index 2c8fb3ec989e..720493932486 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -615,7 +615,7 @@ static u64 vio_dma_get_required_mask(struct device *dev)
         return dma_iommu_ops.get_required_mask(dev);
 }
 
-static struct dma_map_ops vio_dma_mapping_ops = {
+static const struct dma_map_ops vio_dma_mapping_ops = {
 	.alloc             = vio_dma_iommu_alloc_coherent,
 	.free              = vio_dma_iommu_free_coherent,
 	.mmap		   = dma_direct_mmap_coherent,
diff --git a/arch/s390/include/asm/device.h b/arch/s390/include/asm/device.h
index 4a9f35e0973f..7955a9799466 100644
--- a/arch/s390/include/asm/device.h
+++ b/arch/s390/include/asm/device.h
@@ -4,7 +4,7 @@
  * This file is released under the GPLv2
  */
 struct dev_archdata {
-	struct dma_map_ops *dma_ops;
+	const struct dma_map_ops *dma_ops;
 };
 
 struct pdev_archdata {
diff --git a/arch/s390/include/asm/dma-mapping.h b/arch/s390/include/asm/dma-mapping.h
index ffaba07f50ab..2776d205b1ff 100644
--- a/arch/s390/include/asm/dma-mapping.h
+++ b/arch/s390/include/asm/dma-mapping.h
@@ -10,9 +10,9 @@
 
 #define DMA_ERROR_CODE		(~(dma_addr_t) 0x0)
 
-extern struct dma_map_ops s390_pci_dma_ops;
+extern const struct dma_map_ops s390_pci_dma_ops;
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	if (dev && dev->archdata.dma_ops)
 		return dev->archdata.dma_ops;
diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
index 1d7a9c71944a..9081a57fa340 100644
--- a/arch/s390/pci/pci_dma.c
+++ b/arch/s390/pci/pci_dma.c
@@ -650,7 +650,7 @@ static int __init dma_debug_do_init(void)
 }
 fs_initcall(dma_debug_do_init);
 
-struct dma_map_ops s390_pci_dma_ops = {
+const struct dma_map_ops s390_pci_dma_ops = {
 	.alloc		= s390_dma_alloc,
 	.free		= s390_dma_free,
 	.map_sg		= s390_dma_map_sg,
diff --git a/arch/sh/include/asm/dma-mapping.h b/arch/sh/include/asm/dma-mapping.h
index 0052ad40e86d..a7382c34c241 100644
--- a/arch/sh/include/asm/dma-mapping.h
+++ b/arch/sh/include/asm/dma-mapping.h
@@ -1,10 +1,10 @@
 #ifndef __ASM_SH_DMA_MAPPING_H
 #define __ASM_SH_DMA_MAPPING_H
 
-extern struct dma_map_ops *dma_ops;
+extern const struct dma_map_ops *dma_ops;
 extern void no_iommu_init(void);
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	return dma_ops;
 }
diff --git a/arch/sh/kernel/dma-nommu.c b/arch/sh/kernel/dma-nommu.c
index 47fee3b6e29c..d24c707b2181 100644
--- a/arch/sh/kernel/dma-nommu.c
+++ b/arch/sh/kernel/dma-nommu.c
@@ -65,7 +65,7 @@ static void nommu_sync_sg(struct device *dev, struct scatterlist *sg,
 }
 #endif
 
-struct dma_map_ops nommu_dma_ops = {
+const struct dma_map_ops nommu_dma_ops = {
 	.alloc			= dma_generic_alloc_coherent,
 	.free			= dma_generic_free_coherent,
 	.map_page		= nommu_map_page,
diff --git a/arch/sh/mm/consistent.c b/arch/sh/mm/consistent.c
index 92b6976fde59..d1275adfa0ef 100644
--- a/arch/sh/mm/consistent.c
+++ b/arch/sh/mm/consistent.c
@@ -22,7 +22,7 @@
 
 #define PREALLOC_DMA_DEBUG_ENTRIES	4096
 
-struct dma_map_ops *dma_ops;
+const struct dma_map_ops *dma_ops;
 EXPORT_SYMBOL(dma_ops);
 
 static int __init dma_init(void)
diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h
index 1180ae254154..3d2babc0c4c6 100644
--- a/arch/sparc/include/asm/dma-mapping.h
+++ b/arch/sparc/include/asm/dma-mapping.h
@@ -18,13 +18,13 @@ static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 	 */
 }
 
-extern struct dma_map_ops *dma_ops;
-extern struct dma_map_ops *leon_dma_ops;
-extern struct dma_map_ops pci32_dma_ops;
+extern const struct dma_map_ops *dma_ops;
+extern const struct dma_map_ops *leon_dma_ops;
+extern const struct dma_map_ops pci32_dma_ops;
 
 extern struct bus_type pci_bus_type;
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 #ifdef CONFIG_SPARC_LEON
 	if (sparc_cpu_model == sparc_leon)
diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c
index 9df997995f6b..c63ba99ca551 100644
--- a/arch/sparc/kernel/iommu.c
+++ b/arch/sparc/kernel/iommu.c
@@ -741,7 +741,7 @@ static void dma_4u_sync_sg_for_cpu(struct device *dev,
 	spin_unlock_irqrestore(&iommu->lock, flags);
 }
 
-static struct dma_map_ops sun4u_dma_ops = {
+static const struct dma_map_ops sun4u_dma_ops = {
 	.alloc			= dma_4u_alloc_coherent,
 	.free			= dma_4u_free_coherent,
 	.map_page		= dma_4u_map_page,
@@ -752,7 +752,7 @@ static struct dma_map_ops sun4u_dma_ops = {
 	.sync_sg_for_cpu	= dma_4u_sync_sg_for_cpu,
 };
 
-struct dma_map_ops *dma_ops = &sun4u_dma_ops;
+const struct dma_map_ops *dma_ops = &sun4u_dma_ops;
 EXPORT_SYMBOL(dma_ops);
 
 int dma_supported(struct device *dev, u64 device_mask)
diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c
index 6ffaec44931a..cf20033a1458 100644
--- a/arch/sparc/kernel/ioport.c
+++ b/arch/sparc/kernel/ioport.c
@@ -401,7 +401,7 @@ static void sbus_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 	BUG();
 }
 
-static struct dma_map_ops sbus_dma_ops = {
+static const struct dma_map_ops sbus_dma_ops = {
 	.alloc			= sbus_alloc_coherent,
 	.free			= sbus_free_coherent,
 	.map_page		= sbus_map_page,
@@ -637,7 +637,7 @@ static void pci32_sync_sg_for_device(struct device *device, struct scatterlist *
 	}
 }
 
-struct dma_map_ops pci32_dma_ops = {
+const struct dma_map_ops pci32_dma_ops = {
 	.alloc			= pci32_alloc_coherent,
 	.free			= pci32_free_coherent,
 	.map_page		= pci32_map_page,
@@ -652,10 +652,10 @@ struct dma_map_ops pci32_dma_ops = {
 EXPORT_SYMBOL(pci32_dma_ops);
 
 /* leon re-uses pci32_dma_ops */
-struct dma_map_ops *leon_dma_ops = &pci32_dma_ops;
+const struct dma_map_ops *leon_dma_ops = &pci32_dma_ops;
 EXPORT_SYMBOL(leon_dma_ops);
 
-struct dma_map_ops *dma_ops = &sbus_dma_ops;
+const struct dma_map_ops *dma_ops = &sbus_dma_ops;
 EXPORT_SYMBOL(dma_ops);
 
 
diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c
index f4daccd12bf5..68bec7c97cb8 100644
--- a/arch/sparc/kernel/pci_sun4v.c
+++ b/arch/sparc/kernel/pci_sun4v.c
@@ -669,7 +669,7 @@ static void dma_4v_unmap_sg(struct device *dev, struct scatterlist *sglist,
 	local_irq_restore(flags);
 }
 
-static struct dma_map_ops sun4v_dma_ops = {
+static const struct dma_map_ops sun4v_dma_ops = {
 	.alloc				= dma_4v_alloc_coherent,
 	.free				= dma_4v_free_coherent,
 	.map_page			= dma_4v_map_page,
diff --git a/arch/tile/include/asm/device.h b/arch/tile/include/asm/device.h
index 6ab8bf146d4c..25f23ac7d361 100644
--- a/arch/tile/include/asm/device.h
+++ b/arch/tile/include/asm/device.h
@@ -18,7 +18,7 @@
 
 struct dev_archdata {
 	/* DMA operations on that device */
-        struct dma_map_ops	*dma_ops;
+        const struct dma_map_ops	*dma_ops;
 
 	/* Offset of the DMA address from the PA. */
 	dma_addr_t		dma_offset;
diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h
index 01ceb4a895b0..4a06cc75b856 100644
--- a/arch/tile/include/asm/dma-mapping.h
+++ b/arch/tile/include/asm/dma-mapping.h
@@ -24,12 +24,12 @@
 #define ARCH_HAS_DMA_GET_REQUIRED_MASK
 #endif
 
-extern struct dma_map_ops *tile_dma_map_ops;
-extern struct dma_map_ops *gx_pci_dma_map_ops;
-extern struct dma_map_ops *gx_legacy_pci_dma_map_ops;
-extern struct dma_map_ops *gx_hybrid_pci_dma_map_ops;
+extern const struct dma_map_ops *tile_dma_map_ops;
+extern const struct dma_map_ops *gx_pci_dma_map_ops;
+extern const struct dma_map_ops *gx_legacy_pci_dma_map_ops;
+extern const struct dma_map_ops *gx_hybrid_pci_dma_map_ops;
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	if (dev && dev->archdata.dma_ops)
 		return dev->archdata.dma_ops;
@@ -59,7 +59,7 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 
 static inline void dma_mark_clean(void *addr, size_t size) {}
 
-static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops)
+static inline void set_dma_ops(struct device *dev, const struct dma_map_ops *ops)
 {
 	dev->archdata.dma_ops = ops;
 }
diff --git a/arch/tile/kernel/pci-dma.c b/arch/tile/kernel/pci-dma.c
index 24e0f8c21f2f..569bb6dd154a 100644
--- a/arch/tile/kernel/pci-dma.c
+++ b/arch/tile/kernel/pci-dma.c
@@ -329,7 +329,7 @@ tile_dma_supported(struct device *dev, u64 mask)
 	return 1;
 }
 
-static struct dma_map_ops tile_default_dma_map_ops = {
+static const struct dma_map_ops tile_default_dma_map_ops = {
 	.alloc = tile_dma_alloc_coherent,
 	.free = tile_dma_free_coherent,
 	.map_page = tile_dma_map_page,
@@ -344,7 +344,7 @@ static struct dma_map_ops tile_default_dma_map_ops = {
 	.dma_supported = tile_dma_supported
 };
 
-struct dma_map_ops *tile_dma_map_ops = &tile_default_dma_map_ops;
+const struct dma_map_ops *tile_dma_map_ops = &tile_default_dma_map_ops;
 EXPORT_SYMBOL(tile_dma_map_ops);
 
 /* Generic PCI DMA mapping functions */
@@ -516,7 +516,7 @@ tile_pci_dma_supported(struct device *dev, u64 mask)
 	return 1;
 }
 
-static struct dma_map_ops tile_pci_default_dma_map_ops = {
+static const struct dma_map_ops tile_pci_default_dma_map_ops = {
 	.alloc = tile_pci_dma_alloc_coherent,
 	.free = tile_pci_dma_free_coherent,
 	.map_page = tile_pci_dma_map_page,
@@ -531,7 +531,7 @@ static struct dma_map_ops tile_pci_default_dma_map_ops = {
 	.dma_supported = tile_pci_dma_supported
 };
 
-struct dma_map_ops *gx_pci_dma_map_ops = &tile_pci_default_dma_map_ops;
+const struct dma_map_ops *gx_pci_dma_map_ops = &tile_pci_default_dma_map_ops;
 EXPORT_SYMBOL(gx_pci_dma_map_ops);
 
 /* PCI DMA mapping functions for legacy PCI devices */
@@ -552,7 +552,7 @@ static void tile_swiotlb_free_coherent(struct device *dev, size_t size,
 	swiotlb_free_coherent(dev, size, vaddr, dma_addr);
 }
 
-static struct dma_map_ops pci_swiotlb_dma_ops = {
+static const struct dma_map_ops pci_swiotlb_dma_ops = {
 	.alloc = tile_swiotlb_alloc_coherent,
 	.free = tile_swiotlb_free_coherent,
 	.map_page = swiotlb_map_page,
@@ -567,7 +567,7 @@ static struct dma_map_ops pci_swiotlb_dma_ops = {
 	.mapping_error = swiotlb_dma_mapping_error,
 };
 
-static struct dma_map_ops pci_hybrid_dma_ops = {
+static const struct dma_map_ops pci_hybrid_dma_ops = {
 	.alloc = tile_swiotlb_alloc_coherent,
 	.free = tile_swiotlb_free_coherent,
 	.map_page = tile_pci_dma_map_page,
@@ -582,18 +582,18 @@ static struct dma_map_ops pci_hybrid_dma_ops = {
 	.dma_supported = tile_pci_dma_supported
 };
 
-struct dma_map_ops *gx_legacy_pci_dma_map_ops = &pci_swiotlb_dma_ops;
-struct dma_map_ops *gx_hybrid_pci_dma_map_ops = &pci_hybrid_dma_ops;
+const struct dma_map_ops *gx_legacy_pci_dma_map_ops = &pci_swiotlb_dma_ops;
+const struct dma_map_ops *gx_hybrid_pci_dma_map_ops = &pci_hybrid_dma_ops;
 #else
-struct dma_map_ops *gx_legacy_pci_dma_map_ops;
-struct dma_map_ops *gx_hybrid_pci_dma_map_ops;
+const struct dma_map_ops *gx_legacy_pci_dma_map_ops;
+const struct dma_map_ops *gx_hybrid_pci_dma_map_ops;
 #endif
 EXPORT_SYMBOL(gx_legacy_pci_dma_map_ops);
 EXPORT_SYMBOL(gx_hybrid_pci_dma_map_ops);
 
 int dma_set_mask(struct device *dev, u64 mask)
 {
-	struct dma_map_ops *dma_ops = get_dma_ops(dev);
+	const struct dma_map_ops *dma_ops = get_dma_ops(dev);
 
 	/*
 	 * For PCI devices with 64-bit DMA addressing capability, promote
@@ -623,7 +623,7 @@ EXPORT_SYMBOL(dma_set_mask);
 #ifdef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
 int dma_set_coherent_mask(struct device *dev, u64 mask)
 {
-	struct dma_map_ops *dma_ops = get_dma_ops(dev);
+	const struct dma_map_ops *dma_ops = get_dma_ops(dev);
 
 	/*
 	 * For PCI devices with 64-bit DMA addressing capability, promote
diff --git a/arch/unicore32/include/asm/dma-mapping.h b/arch/unicore32/include/asm/dma-mapping.h
index 4749854afd03..14d7729c7b73 100644
--- a/arch/unicore32/include/asm/dma-mapping.h
+++ b/arch/unicore32/include/asm/dma-mapping.h
@@ -21,9 +21,9 @@
 #include <asm/memory.h>
 #include <asm/cacheflush.h>
 
-extern struct dma_map_ops swiotlb_dma_map_ops;
+extern const struct dma_map_ops swiotlb_dma_map_ops;
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	return &swiotlb_dma_map_ops;
 }
diff --git a/arch/unicore32/mm/dma-swiotlb.c b/arch/unicore32/mm/dma-swiotlb.c
index 3e9f6489ba38..525413d6690e 100644
--- a/arch/unicore32/mm/dma-swiotlb.c
+++ b/arch/unicore32/mm/dma-swiotlb.c
@@ -31,7 +31,7 @@ static void unicore_swiotlb_free_coherent(struct device *dev, size_t size,
 	swiotlb_free_coherent(dev, size, vaddr, dma_addr);
 }
 
-struct dma_map_ops swiotlb_dma_map_ops = {
+const struct dma_map_ops swiotlb_dma_map_ops = {
 	.alloc = unicore_swiotlb_alloc_coherent,
 	.free = unicore_swiotlb_free_coherent,
 	.map_sg = swiotlb_map_sg_attrs,
diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h
index 684ed6c3aa67..b2d0b4ced7e3 100644
--- a/arch/x86/include/asm/device.h
+++ b/arch/x86/include/asm/device.h
@@ -3,7 +3,7 @@
 
 struct dev_archdata {
 #ifdef CONFIG_X86_DEV_DMA_OPS
-	struct dma_map_ops *dma_ops;
+	const struct dma_map_ops *dma_ops;
 #endif
 #if defined(CONFIG_INTEL_IOMMU) || defined(CONFIG_AMD_IOMMU)
 	void *iommu; /* hook for IOMMU specific extension */
@@ -13,7 +13,7 @@ struct dev_archdata {
 #if defined(CONFIG_X86_DEV_DMA_OPS) && defined(CONFIG_PCI_DOMAINS)
 struct dma_domain {
 	struct list_head node;
-	struct dma_map_ops *dma_ops;
+	const struct dma_map_ops *dma_ops;
 	int domain_nr;
 };
 void add_dma_domain(struct dma_domain *domain);
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 44461626830e..5e4772886a1e 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -25,9 +25,9 @@ extern int iommu_merge;
 extern struct device x86_dma_fallback_dev;
 extern int panic_on_overflow;
 
-extern struct dma_map_ops *dma_ops;
+extern const struct dma_map_ops *dma_ops;
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 #ifndef CONFIG_X86_DEV_DMA_OPS
 	return dma_ops;
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index 345c99cef152..793869879464 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -1,7 +1,7 @@
 #ifndef _ASM_X86_IOMMU_H
 #define _ASM_X86_IOMMU_H
 
-extern struct dma_map_ops nommu_dma_ops;
+extern const struct dma_map_ops nommu_dma_ops;
 extern int force_iommu, no_iommu;
 extern int iommu_detected;
 extern int iommu_pass_through;
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 63ff468a7986..82dfe32faaf4 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -695,7 +695,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info)
 	return -1;
 }
 
-static struct dma_map_ops gart_dma_ops = {
+static const struct dma_map_ops gart_dma_ops = {
 	.map_sg				= gart_map_sg,
 	.unmap_sg			= gart_unmap_sg,
 	.map_page			= gart_map_page,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 5d400ba1349d..17f180148c80 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -478,7 +478,7 @@ static void calgary_free_coherent(struct device *dev, size_t size,
 	free_pages((unsigned long)vaddr, get_order(size));
 }
 
-static struct dma_map_ops calgary_dma_ops = {
+static const struct dma_map_ops calgary_dma_ops = {
 	.alloc = calgary_alloc_coherent,
 	.free = calgary_free_coherent,
 	.map_sg = calgary_map_sg,
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index d30c37750765..76f4c039baae 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -17,7 +17,7 @@
 
 static int forbid_dac __read_mostly;
 
-struct dma_map_ops *dma_ops = &nommu_dma_ops;
+const struct dma_map_ops *dma_ops = &nommu_dma_ops;
 EXPORT_SYMBOL(dma_ops);
 
 static int iommu_sac_force __read_mostly;
@@ -214,7 +214,7 @@ early_param("iommu", iommu_setup);
 
 int dma_supported(struct device *dev, u64 mask)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 #ifdef CONFIG_PCI
 	if (mask > 0xffffffff && forbid_dac > 0) {
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 00e71ce396a8..a88952ef371c 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -88,7 +88,7 @@ static void nommu_sync_sg_for_device(struct device *dev,
 	flush_write_buffers();
 }
 
-struct dma_map_ops nommu_dma_ops = {
+const struct dma_map_ops nommu_dma_ops = {
 	.alloc			= dma_generic_alloc_coherent,
 	.free			= dma_generic_free_coherent,
 	.map_sg			= nommu_map_sg,
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 410efb2c7b80..1e23577e17cf 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -45,7 +45,7 @@ void x86_swiotlb_free_coherent(struct device *dev, size_t size,
 		dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs);
 }
 
-static struct dma_map_ops swiotlb_dma_ops = {
+static const struct dma_map_ops swiotlb_dma_ops = {
 	.mapping_error = swiotlb_dma_mapping_error,
 	.alloc = x86_swiotlb_alloc_coherent,
 	.free = x86_swiotlb_free_coherent,
diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c
index 052c1cb76305..aa3828823170 100644
--- a/arch/x86/pci/sta2x11-fixup.c
+++ b/arch/x86/pci/sta2x11-fixup.c
@@ -179,7 +179,7 @@ static void *sta2x11_swiotlb_alloc_coherent(struct device *dev,
 }
 
 /* We have our own dma_ops: the same as swiotlb but from alloc (above) */
-static struct dma_map_ops sta2x11_dma_ops = {
+static const struct dma_map_ops sta2x11_dma_ops = {
 	.alloc = sta2x11_swiotlb_alloc_coherent,
 	.free = x86_swiotlb_free_coherent,
 	.map_page = swiotlb_map_page,
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
index a0b36a9d5df1..42b08f8fc2ca 100644
--- a/arch/x86/xen/pci-swiotlb-xen.c
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -18,7 +18,7 @@
 
 int xen_swiotlb __read_mostly;
 
-static struct dma_map_ops xen_swiotlb_dma_ops = {
+static const struct dma_map_ops xen_swiotlb_dma_ops = {
 	.alloc = xen_swiotlb_alloc_coherent,
 	.free = xen_swiotlb_free_coherent,
 	.sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
diff --git a/arch/xtensa/include/asm/device.h b/arch/xtensa/include/asm/device.h
index fe1f5c878493..a77d45d39f35 100644
--- a/arch/xtensa/include/asm/device.h
+++ b/arch/xtensa/include/asm/device.h
@@ -10,7 +10,7 @@ struct dma_map_ops;
 
 struct dev_archdata {
 	/* DMA operations on that device */
-	struct dma_map_ops *dma_ops;
+	const struct dma_map_ops *dma_ops;
 };
 
 struct pdev_archdata {
diff --git a/arch/xtensa/include/asm/dma-mapping.h b/arch/xtensa/include/asm/dma-mapping.h
index 3fc1170a6488..50d23106cce0 100644
--- a/arch/xtensa/include/asm/dma-mapping.h
+++ b/arch/xtensa/include/asm/dma-mapping.h
@@ -18,9 +18,9 @@
 
 #define DMA_ERROR_CODE		(~(dma_addr_t)0x0)
 
-extern struct dma_map_ops xtensa_dma_map_ops;
+extern const struct dma_map_ops xtensa_dma_map_ops;
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	if (dev && dev->archdata.dma_ops)
 		return dev->archdata.dma_ops;
diff --git a/arch/xtensa/kernel/pci-dma.c b/arch/xtensa/kernel/pci-dma.c
index 70e362e6038e..ecec5265a66d 100644
--- a/arch/xtensa/kernel/pci-dma.c
+++ b/arch/xtensa/kernel/pci-dma.c
@@ -249,7 +249,7 @@ int xtensa_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 	return 0;
 }
 
-struct dma_map_ops xtensa_dma_map_ops = {
+const struct dma_map_ops xtensa_dma_map_ops = {
 	.alloc = xtensa_dma_alloc,
 	.free = xtensa_dma_free,
 	.map_page = xtensa_map_page,
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 3ef0f42984f2..3703fb9db419 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -117,7 +117,7 @@ static const struct iommu_ops amd_iommu_ops;
 static ATOMIC_NOTIFIER_HEAD(ppr_notifier);
 int amd_iommu_max_glx_val = -1;
 
-static struct dma_map_ops amd_iommu_dma_ops;
+static const struct dma_map_ops amd_iommu_dma_ops;
 
 /*
  * This struct contains device specific data for the IOMMU
@@ -2728,7 +2728,7 @@ static int amd_iommu_dma_supported(struct device *dev, u64 mask)
 	return check_device(dev);
 }
 
-static struct dma_map_ops amd_iommu_dma_ops = {
+static const struct dma_map_ops amd_iommu_dma_ops = {
 	.alloc		= alloc_coherent,
 	.free		= free_coherent,
 	.map_page	= map_page,
diff --git a/drivers/misc/mic/bus/mic_bus.c b/drivers/misc/mic/bus/mic_bus.c
index be37890abb93..c4b27a25662a 100644
--- a/drivers/misc/mic/bus/mic_bus.c
+++ b/drivers/misc/mic/bus/mic_bus.c
@@ -143,7 +143,7 @@ static void mbus_release_dev(struct device *d)
 }
 
 struct mbus_device *
-mbus_register_device(struct device *pdev, int id, struct dma_map_ops *dma_ops,
+mbus_register_device(struct device *pdev, int id, const struct dma_map_ops *dma_ops,
 		     struct mbus_hw_ops *hw_ops, int index,
 		     void __iomem *mmio_va)
 {
diff --git a/drivers/misc/mic/bus/scif_bus.c b/drivers/misc/mic/bus/scif_bus.c
index ff6e01c25810..e5d377e97c86 100644
--- a/drivers/misc/mic/bus/scif_bus.c
+++ b/drivers/misc/mic/bus/scif_bus.c
@@ -138,7 +138,7 @@ static void scif_release_dev(struct device *d)
 }
 
 struct scif_hw_dev *
-scif_register_device(struct device *pdev, int id, struct dma_map_ops *dma_ops,
+scif_register_device(struct device *pdev, int id, const struct dma_map_ops *dma_ops,
 		     struct scif_hw_ops *hw_ops, u8 dnode, u8 snode,
 		     struct mic_mw *mmio, struct mic_mw *aper, void *dp,
 		     void __iomem *rdp, struct dma_chan **chan, int num_chan,
diff --git a/drivers/misc/mic/bus/scif_bus.h b/drivers/misc/mic/bus/scif_bus.h
index 94f29ac608b6..ff59568219ad 100644
--- a/drivers/misc/mic/bus/scif_bus.h
+++ b/drivers/misc/mic/bus/scif_bus.h
@@ -113,7 +113,7 @@ int scif_register_driver(struct scif_driver *driver);
 void scif_unregister_driver(struct scif_driver *driver);
 struct scif_hw_dev *
 scif_register_device(struct device *pdev, int id,
-		     struct dma_map_ops *dma_ops,
+		     const struct dma_map_ops *dma_ops,
 		     struct scif_hw_ops *hw_ops, u8 dnode, u8 snode,
 		     struct mic_mw *mmio, struct mic_mw *aper,
 		     void *dp, void __iomem *rdp,
diff --git a/drivers/misc/mic/bus/vop_bus.c b/drivers/misc/mic/bus/vop_bus.c
index 303da222f5b6..e3caa6c53922 100644
--- a/drivers/misc/mic/bus/vop_bus.c
+++ b/drivers/misc/mic/bus/vop_bus.c
@@ -154,7 +154,7 @@ vop_register_device(struct device *pdev, int id,
 	vdev->dev.parent = pdev;
 	vdev->id.device = id;
 	vdev->id.vendor = VOP_DEV_ANY_ID;
-	vdev->dev.archdata.dma_ops = (struct dma_map_ops *)dma_ops;
+	vdev->dev.archdata.dma_ops = dma_ops;
 	vdev->dev.dma_mask = &vdev->dev.coherent_dma_mask;
 	dma_set_mask(&vdev->dev, DMA_BIT_MASK(64));
 	vdev->dev.release = vop_release_dev;
diff --git a/drivers/misc/mic/host/mic_boot.c b/drivers/misc/mic/host/mic_boot.c
index 9599d732aff3..c327985c9523 100644
--- a/drivers/misc/mic/host/mic_boot.c
+++ b/drivers/misc/mic/host/mic_boot.c
@@ -245,7 +245,7 @@ static void __mic_dma_unmap_sg(struct device *dev,
 	dma_unmap_sg(&mdev->pdev->dev, sg, nents, dir);
 }
 
-static struct dma_map_ops __mic_dma_ops = {
+static const struct dma_map_ops __mic_dma_ops = {
 	.alloc = __mic_dma_alloc,
 	.free = __mic_dma_free,
 	.map_page = __mic_dma_map_page,
@@ -344,7 +344,7 @@ mic_dma_unmap_page(struct device *dev, dma_addr_t dma_addr,
 	mic_unmap_single(mdev, dma_addr, size);
 }
 
-static struct dma_map_ops mic_dma_ops = {
+static const struct dma_map_ops mic_dma_ops = {
 	.map_page = mic_dma_map_page,
 	.unmap_page = mic_dma_unmap_page,
 };
diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c
index 553ef8a5d588..aeb073b5fe16 100644
--- a/drivers/parisc/ccio-dma.c
+++ b/drivers/parisc/ccio-dma.c
@@ -1011,7 +1011,7 @@ ccio_unmap_sg(struct device *dev, struct scatterlist *sglist, int nents,
 	DBG_RUN_SG("%s() DONE (nents %d)\n", __func__, nents);
 }
 
-static struct dma_map_ops ccio_ops = {
+static const struct dma_map_ops ccio_ops = {
 	.dma_supported =	ccio_dma_supported,
 	.alloc =		ccio_alloc,
 	.free =			ccio_free,
diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c
index 151b86b6d2e2..33385e574433 100644
--- a/drivers/parisc/sba_iommu.c
+++ b/drivers/parisc/sba_iommu.c
@@ -1069,7 +1069,7 @@ sba_unmap_sg(struct device *dev, struct scatterlist *sglist, int nents,
 
 }
 
-static struct dma_map_ops sba_ops = {
+static const struct dma_map_ops sba_ops = {
 	.dma_supported =	sba_dma_supported,
 	.alloc =		sba_alloc,
 	.free =			sba_free,
diff --git a/drivers/pci/host/vmd.c b/drivers/pci/host/vmd.c
index 18ef1a93c10a..e27ad2a3bd33 100644
--- a/drivers/pci/host/vmd.c
+++ b/drivers/pci/host/vmd.c
@@ -282,7 +282,7 @@ static struct device *to_vmd_dev(struct device *dev)
 	return &vmd->dev->dev;
 }
 
-static struct dma_map_ops *vmd_dma_ops(struct device *dev)
+static const struct dma_map_ops *vmd_dma_ops(struct device *dev)
 {
 	return get_dma_ops(to_vmd_dev(dev));
 }
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 10c5a17b1f51..f1da68b82c63 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -127,7 +127,7 @@ struct dma_map_ops {
 	int is_phys;
 };
 
-extern struct dma_map_ops dma_noop_ops;
+extern const struct dma_map_ops dma_noop_ops;
 
 #define DMA_BIT_MASK(n)	(((n) == 64) ? ~0ULL : ((1ULL<<(n))-1))
 
@@ -170,8 +170,8 @@ int dma_mmap_from_coherent(struct device *dev, struct vm_area_struct *vma,
  * dma dependent code.  Code that depends on the dma-mapping
  * API needs to set 'depends on HAS_DMA' in its Kconfig
  */
-extern struct dma_map_ops bad_dma_ops;
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+extern const struct dma_map_ops bad_dma_ops;
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	return &bad_dma_ops;
 }
@@ -182,7 +182,7 @@ static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
 					      enum dma_data_direction dir,
 					      unsigned long attrs)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 	dma_addr_t addr;
 
 	kmemcheck_mark_initialized(ptr, size);
@@ -201,7 +201,7 @@ static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr,
 					  enum dma_data_direction dir,
 					  unsigned long attrs)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
 	if (ops->unmap_page)
@@ -217,7 +217,7 @@ static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
 				   int nents, enum dma_data_direction dir,
 				   unsigned long attrs)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 	int i, ents;
 	struct scatterlist *s;
 
@@ -235,7 +235,7 @@ static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg
 				      int nents, enum dma_data_direction dir,
 				      unsigned long attrs)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
 	debug_dma_unmap_sg(dev, sg, nents, dir);
@@ -249,7 +249,7 @@ static inline dma_addr_t dma_map_page_attrs(struct device *dev,
 					    enum dma_data_direction dir,
 					    unsigned long attrs)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 	dma_addr_t addr;
 
 	kmemcheck_mark_initialized(page_address(page) + offset, size);
@@ -265,7 +265,7 @@ static inline void dma_unmap_page_attrs(struct device *dev,
 					enum dma_data_direction dir,
 					unsigned long attrs)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
 	if (ops->unmap_page)
@@ -279,7 +279,7 @@ static inline dma_addr_t dma_map_resource(struct device *dev,
 					  enum dma_data_direction dir,
 					  unsigned long attrs)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 	dma_addr_t addr;
 
 	BUG_ON(!valid_dma_direction(dir));
@@ -300,7 +300,7 @@ static inline void dma_unmap_resource(struct device *dev, dma_addr_t addr,
 				      size_t size, enum dma_data_direction dir,
 				      unsigned long attrs)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
 	if (ops->unmap_resource)
@@ -312,7 +312,7 @@ static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
 					   size_t size,
 					   enum dma_data_direction dir)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
 	if (ops->sync_single_for_cpu)
@@ -324,7 +324,7 @@ static inline void dma_sync_single_for_device(struct device *dev,
 					      dma_addr_t addr, size_t size,
 					      enum dma_data_direction dir)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
 	if (ops->sync_single_for_device)
@@ -364,7 +364,7 @@ static inline void
 dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 		    int nelems, enum dma_data_direction dir)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
 	if (ops->sync_sg_for_cpu)
@@ -376,7 +376,7 @@ static inline void
 dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 		       int nelems, enum dma_data_direction dir)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
 	if (ops->sync_sg_for_device)
@@ -421,7 +421,7 @@ static inline int
 dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr,
 	       dma_addr_t dma_addr, size_t size, unsigned long attrs)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 	BUG_ON(!ops);
 	if (ops->mmap)
 		return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
@@ -439,7 +439,7 @@ dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr,
 		      dma_addr_t dma_addr, size_t size,
 		      unsigned long attrs)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 	BUG_ON(!ops);
 	if (ops->get_sgtable)
 		return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size,
@@ -457,7 +457,7 @@ static inline void *dma_alloc_attrs(struct device *dev, size_t size,
 				       dma_addr_t *dma_handle, gfp_t flag,
 				       unsigned long attrs)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 	void *cpu_addr;
 
 	BUG_ON(!ops);
@@ -479,7 +479,7 @@ static inline void dma_free_attrs(struct device *dev, size_t size,
 				     void *cpu_addr, dma_addr_t dma_handle,
 				     unsigned long attrs)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!ops);
 	WARN_ON(irqs_disabled());
@@ -537,7 +537,7 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 #ifndef HAVE_ARCH_DMA_SUPPORTED
 static inline int dma_supported(struct device *dev, u64 mask)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	if (!ops)
 		return 0;
@@ -550,7 +550,7 @@ static inline int dma_supported(struct device *dev, u64 mask)
 #ifndef HAVE_ARCH_DMA_SET_MASK
 static inline int dma_set_mask(struct device *dev, u64 mask)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	if (ops->set_dma_mask)
 		return ops->set_dma_mask(dev, mask);
diff --git a/include/linux/mic_bus.h b/include/linux/mic_bus.h
index 27d7c95fd0da..504d54c71bdb 100644
--- a/include/linux/mic_bus.h
+++ b/include/linux/mic_bus.h
@@ -90,7 +90,7 @@ struct mbus_hw_ops {
 };
 
 struct mbus_device *
-mbus_register_device(struct device *pdev, int id, struct dma_map_ops *dma_ops,
+mbus_register_device(struct device *pdev, int id, const struct dma_map_ops *dma_ops,
 		     struct mbus_hw_ops *hw_ops, int index,
 		     void __iomem *mmio_va);
 void mbus_unregister_device(struct mbus_device *mbdev);
diff --git a/include/xen/arm/hypervisor.h b/include/xen/arm/hypervisor.h
index 95251512e2c4..44b587b49904 100644
--- a/include/xen/arm/hypervisor.h
+++ b/include/xen/arm/hypervisor.h
@@ -18,7 +18,7 @@ static inline enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
 	return PARAVIRT_LAZY_NONE;
 }
 
-extern struct dma_map_ops *xen_dma_ops;
+extern const struct dma_map_ops *xen_dma_ops;
 
 #ifdef CONFIG_XEN
 void __init xen_early_init(void);
diff --git a/lib/dma-noop.c b/lib/dma-noop.c
index 3d766e78fbe2..65e49dd35b7b 100644
--- a/lib/dma-noop.c
+++ b/lib/dma-noop.c
@@ -64,7 +64,7 @@ static int dma_noop_supported(struct device *dev, u64 mask)
 	return 1;
 }
 
-struct dma_map_ops dma_noop_ops = {
+const struct dma_map_ops dma_noop_ops = {
 	.alloc			= dma_noop_alloc,
 	.free			= dma_noop_free,
 	.map_page		= dma_noop_map_page,
-- 
cgit v1.2.3


From 5657933dbb6e25feaf5d8df8c88f96cdade693a3 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Fri, 20 Jan 2017 13:04:02 -0800
Subject: treewide: Move dma_ops from struct dev_archdata into struct device

Some but not all architectures provide set_dma_ops(). Move dma_ops
from struct dev_archdata into struct device such that it becomes
possible on all architectures to configure dma_ops per device.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Juergen Gross <jgross@suse.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: linux-arch@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: Russell King <linux@armlinux.org.uk>
Cc: x86@kernel.org
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 arch/arm/include/asm/device.h            | 1 -
 arch/arm/include/asm/dma-mapping.h       | 6 +++---
 arch/arm64/include/asm/device.h          | 1 -
 arch/arm64/include/asm/dma-mapping.h     | 4 ++--
 arch/arm64/mm/dma-mapping.c              | 8 ++++----
 arch/m32r/include/asm/device.h           | 1 -
 arch/m32r/include/asm/dma-mapping.h      | 4 ++--
 arch/mips/include/asm/device.h           | 5 -----
 arch/mips/include/asm/dma-mapping.h      | 4 ++--
 arch/mips/pci/pci-octeon.c               | 2 +-
 arch/powerpc/include/asm/device.h        | 4 ----
 arch/powerpc/include/asm/dma-mapping.h   | 4 ++--
 arch/powerpc/kernel/dma.c                | 2 +-
 arch/powerpc/platforms/cell/iommu.c      | 2 +-
 arch/powerpc/platforms/pasemi/iommu.c    | 2 +-
 arch/powerpc/platforms/pasemi/setup.c    | 2 +-
 arch/powerpc/platforms/ps3/system-bus.c  | 4 ++--
 arch/powerpc/platforms/pseries/ibmebus.c | 2 +-
 arch/s390/include/asm/device.h           | 1 -
 arch/s390/include/asm/dma-mapping.h      | 4 ++--
 arch/s390/pci/pci.c                      | 2 +-
 arch/tile/include/asm/device.h           | 3 ---
 arch/tile/include/asm/dma-mapping.h      | 6 +++---
 arch/x86/include/asm/device.h            | 3 ---
 arch/x86/include/asm/dma-mapping.h       | 4 ++--
 arch/x86/kernel/pci-calgary_64.c         | 4 ++--
 arch/x86/pci/common.c                    | 2 +-
 arch/x86/pci/sta2x11-fixup.c             | 8 ++++----
 arch/xtensa/include/asm/device.h         | 4 ----
 arch/xtensa/include/asm/dma-mapping.h    | 4 ++--
 drivers/infiniband/ulp/srpt/ib_srpt.c    | 2 +-
 drivers/iommu/amd_iommu.c                | 6 +++---
 drivers/misc/mic/bus/mic_bus.c           | 2 +-
 drivers/misc/mic/bus/scif_bus.c          | 2 +-
 drivers/misc/mic/bus/vop_bus.c           | 2 +-
 include/linux/device.h                   | 1 +
 36 files changed, 48 insertions(+), 70 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/device.h b/arch/arm/include/asm/device.h
index d8a572f9c187..220ba207be91 100644
--- a/arch/arm/include/asm/device.h
+++ b/arch/arm/include/asm/device.h
@@ -7,7 +7,6 @@
 #define ASMARM_DEVICE_H
 
 struct dev_archdata {
-	const struct dma_map_ops	*dma_ops;
 #ifdef CONFIG_DMABOUNCE
 	struct dmabounce_device_info *dmabounce;
 #endif
diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index 1aabd781306f..312f4d0564d6 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -18,8 +18,8 @@ extern const struct dma_map_ops arm_coherent_dma_ops;
 
 static inline const struct dma_map_ops *__generic_dma_ops(struct device *dev)
 {
-	if (dev && dev->archdata.dma_ops)
-		return dev->archdata.dma_ops;
+	if (dev && dev->dma_ops)
+		return dev->dma_ops;
 	return &arm_dma_ops;
 }
 
@@ -34,7 +34,7 @@ static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 static inline void set_dma_ops(struct device *dev, const struct dma_map_ops *ops)
 {
 	BUG_ON(!dev);
-	dev->archdata.dma_ops = ops;
+	dev->dma_ops = ops;
 }
 
 #define HAVE_ARCH_DMA_SUPPORTED 1
diff --git a/arch/arm64/include/asm/device.h b/arch/arm64/include/asm/device.h
index 00c678cc31e1..73d5bab015eb 100644
--- a/arch/arm64/include/asm/device.h
+++ b/arch/arm64/include/asm/device.h
@@ -17,7 +17,6 @@
 #define __ASM_DEVICE_H
 
 struct dev_archdata {
-	const struct dma_map_ops *dma_ops;
 #ifdef CONFIG_IOMMU_API
 	void *iommu;			/* private IOMMU data */
 #endif
diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h
index 1fedb43be712..58ae36cc3b60 100644
--- a/arch/arm64/include/asm/dma-mapping.h
+++ b/arch/arm64/include/asm/dma-mapping.h
@@ -29,8 +29,8 @@ extern const struct dma_map_ops dummy_dma_ops;
 
 static inline const struct dma_map_ops *__generic_dma_ops(struct device *dev)
 {
-	if (dev && dev->archdata.dma_ops)
-		return dev->archdata.dma_ops;
+	if (dev && dev->dma_ops)
+		return dev->dma_ops;
 
 	/*
 	 * We expect no ISA devices, and all other DMA masters are expected to
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index bcef6368d48f..dbab4c6c084b 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -837,7 +837,7 @@ static bool do_iommu_attach(struct device *dev, const struct iommu_ops *ops,
 		return false;
 	}
 
-	dev->archdata.dma_ops = &iommu_dma_ops;
+	dev->dma_ops = &iommu_dma_ops;
 	return true;
 }
 
@@ -941,7 +941,7 @@ static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 
 void arch_teardown_dma_ops(struct device *dev)
 {
-	dev->archdata.dma_ops = NULL;
+	dev->dma_ops = NULL;
 }
 
 #else
@@ -955,8 +955,8 @@ static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 			const struct iommu_ops *iommu, bool coherent)
 {
-	if (!dev->archdata.dma_ops)
-		dev->archdata.dma_ops = &swiotlb_dma_ops;
+	if (!dev->dma_ops)
+		dev->dma_ops = &swiotlb_dma_ops;
 
 	dev->archdata.dma_coherent = coherent;
 	__iommu_setup_dma_ops(dev, dma_base, size, iommu);
diff --git a/arch/m32r/include/asm/device.h b/arch/m32r/include/asm/device.h
index 7955a9799466..5203fc87f080 100644
--- a/arch/m32r/include/asm/device.h
+++ b/arch/m32r/include/asm/device.h
@@ -4,7 +4,6 @@
  * This file is released under the GPLv2
  */
 struct dev_archdata {
-	const struct dma_map_ops *dma_ops;
 };
 
 struct pdev_archdata {
diff --git a/arch/m32r/include/asm/dma-mapping.h b/arch/m32r/include/asm/dma-mapping.h
index 99c43d2f05dc..27b1597ac563 100644
--- a/arch/m32r/include/asm/dma-mapping.h
+++ b/arch/m32r/include/asm/dma-mapping.h
@@ -12,8 +12,8 @@
 
 static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
-	if (dev && dev->archdata.dma_ops)
-		return dev->archdata.dma_ops;
+	if (dev && dev->dma_ops)
+		return dev->dma_ops;
 	return &dma_noop_ops;
 }
 
diff --git a/arch/mips/include/asm/device.h b/arch/mips/include/asm/device.h
index ebc5c1265473..6aa796f1081a 100644
--- a/arch/mips/include/asm/device.h
+++ b/arch/mips/include/asm/device.h
@@ -6,12 +6,7 @@
 #ifndef _ASM_MIPS_DEVICE_H
 #define _ASM_MIPS_DEVICE_H
 
-struct dma_map_ops;
-
 struct dev_archdata {
-	/* DMA operations on that device */
-	const struct dma_map_ops *dma_ops;
-
 #ifdef CONFIG_DMA_PERDEV_COHERENT
 	/* Non-zero if DMA is coherent with CPU caches */
 	bool dma_coherent;
diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h
index b59b084a7569..dad3b09fe993 100644
--- a/arch/mips/include/asm/dma-mapping.h
+++ b/arch/mips/include/asm/dma-mapping.h
@@ -13,8 +13,8 @@ extern const struct dma_map_ops *mips_dma_map_ops;
 
 static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
-	if (dev && dev->archdata.dma_ops)
-		return dev->archdata.dma_ops;
+	if (dev && dev->dma_ops)
+		return dev->dma_ops;
 	else
 		return mips_dma_map_ops;
 }
diff --git a/arch/mips/pci/pci-octeon.c b/arch/mips/pci/pci-octeon.c
index 308d051fc45c..9ee01936862e 100644
--- a/arch/mips/pci/pci-octeon.c
+++ b/arch/mips/pci/pci-octeon.c
@@ -167,7 +167,7 @@ int pcibios_plat_dev_init(struct pci_dev *dev)
 		pci_write_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, dconfig);
 	}
 
-	dev->dev.archdata.dma_ops = octeon_pci_dma_map_ops;
+	dev->dev.dma_ops = octeon_pci_dma_map_ops;
 
 	return 0;
 }
diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h
index 49cbb0fca233..0245bfcaac32 100644
--- a/arch/powerpc/include/asm/device.h
+++ b/arch/powerpc/include/asm/device.h
@@ -6,7 +6,6 @@
 #ifndef _ASM_POWERPC_DEVICE_H
 #define _ASM_POWERPC_DEVICE_H
 
-struct dma_map_ops;
 struct device_node;
 #ifdef CONFIG_PPC64
 struct pci_dn;
@@ -20,9 +19,6 @@ struct iommu_table;
  * drivers/macintosh/macio_asic.c
  */
 struct dev_archdata {
-	/* DMA operations on that device */
-	const struct dma_map_ops	*dma_ops;
-
 	/*
 	 * These two used to be a union. However, with the hybrid ops we need
 	 * both so here we store both a DMA offset for direct mappings and
diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
index 2ec3eadf336f..59fbd4abcbf8 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -88,12 +88,12 @@ static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 	if (unlikely(dev == NULL))
 		return NULL;
 
-	return dev->archdata.dma_ops;
+	return dev->dma_ops;
 }
 
 static inline void set_dma_ops(struct device *dev, const struct dma_map_ops *ops)
 {
-	dev->archdata.dma_ops = ops;
+	dev->dma_ops = ops;
 }
 
 /*
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index 03b98f1f98ec..41c749586bd2 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -33,7 +33,7 @@ static u64 __maybe_unused get_pfn_limit(struct device *dev)
 	struct dev_archdata __maybe_unused *sd = &dev->archdata;
 
 #ifdef CONFIG_SWIOTLB
-	if (sd->max_direct_dma_addr && sd->dma_ops == &swiotlb_dma_ops)
+	if (sd->max_direct_dma_addr && dev->dma_ops == &swiotlb_dma_ops)
 		pfn = min_t(u64, pfn, sd->max_direct_dma_addr >> PAGE_SHIFT);
 #endif
 
diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
index e1413e69e5fe..71b995bbcae0 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -692,7 +692,7 @@ static int cell_of_bus_notify(struct notifier_block *nb, unsigned long action,
 		return 0;
 
 	/* We use the PCI DMA ops */
-	dev->archdata.dma_ops = get_pci_dma_ops();
+	dev->dma_ops = get_pci_dma_ops();
 
 	cell_dma_dev_setup(dev);
 
diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c
index e74adc4e7fd8..7fec04de27fc 100644
--- a/arch/powerpc/platforms/pasemi/iommu.c
+++ b/arch/powerpc/platforms/pasemi/iommu.c
@@ -186,7 +186,7 @@ static void pci_dma_dev_setup_pasemi(struct pci_dev *dev)
 	 */
 	if (dev->vendor == 0x1959 && dev->device == 0xa007 &&
 	    !firmware_has_feature(FW_FEATURE_LPAR)) {
-		dev->dev.archdata.dma_ops = &dma_direct_ops;
+		dev->dev.dma_ops = &dma_direct_ops;
 		/*
 		 * Set the coherent DMA mask to prevent the iommu
 		 * being used unnecessarily
diff --git a/arch/powerpc/platforms/pasemi/setup.c b/arch/powerpc/platforms/pasemi/setup.c
index 3182400cf48f..c4a3e93dc324 100644
--- a/arch/powerpc/platforms/pasemi/setup.c
+++ b/arch/powerpc/platforms/pasemi/setup.c
@@ -363,7 +363,7 @@ static int pcmcia_notify(struct notifier_block *nb, unsigned long action,
 		return 0;
 
 	/* We use the direct ops for localbus */
-	dev->archdata.dma_ops = &dma_direct_ops;
+	dev->dma_ops = &dma_direct_ops;
 
 	return 0;
 }
diff --git a/arch/powerpc/platforms/ps3/system-bus.c b/arch/powerpc/platforms/ps3/system-bus.c
index c81450d98794..2d2e5f80a3d3 100644
--- a/arch/powerpc/platforms/ps3/system-bus.c
+++ b/arch/powerpc/platforms/ps3/system-bus.c
@@ -756,11 +756,11 @@ int ps3_system_bus_device_register(struct ps3_system_bus_device *dev)
 
 	switch (dev->dev_type) {
 	case PS3_DEVICE_TYPE_IOC0:
-		dev->core.archdata.dma_ops = &ps3_ioc0_dma_ops;
+		dev->core.dma_ops = &ps3_ioc0_dma_ops;
 		dev_set_name(&dev->core, "ioc0_%02x", ++dev_ioc0_count);
 		break;
 	case PS3_DEVICE_TYPE_SB:
-		dev->core.archdata.dma_ops = &ps3_sb_dma_ops;
+		dev->core.dma_ops = &ps3_sb_dma_ops;
 		dev_set_name(&dev->core, "sb_%02x", ++dev_sb_count);
 
 		break;
diff --git a/arch/powerpc/platforms/pseries/ibmebus.c b/arch/powerpc/platforms/pseries/ibmebus.c
index 2e36a0b8944a..99a6bf7f3bcf 100644
--- a/arch/powerpc/platforms/pseries/ibmebus.c
+++ b/arch/powerpc/platforms/pseries/ibmebus.c
@@ -169,7 +169,7 @@ static int ibmebus_create_device(struct device_node *dn)
 		return -ENOMEM;
 
 	dev->dev.bus = &ibmebus_bus_type;
-	dev->dev.archdata.dma_ops = &ibmebus_dma_ops;
+	dev->dev.dma_ops = &ibmebus_dma_ops;
 
 	ret = of_device_add(dev);
 	if (ret)
diff --git a/arch/s390/include/asm/device.h b/arch/s390/include/asm/device.h
index 7955a9799466..5203fc87f080 100644
--- a/arch/s390/include/asm/device.h
+++ b/arch/s390/include/asm/device.h
@@ -4,7 +4,6 @@
  * This file is released under the GPLv2
  */
 struct dev_archdata {
-	const struct dma_map_ops *dma_ops;
 };
 
 struct pdev_archdata {
diff --git a/arch/s390/include/asm/dma-mapping.h b/arch/s390/include/asm/dma-mapping.h
index 2776d205b1ff..a872027d0c1b 100644
--- a/arch/s390/include/asm/dma-mapping.h
+++ b/arch/s390/include/asm/dma-mapping.h
@@ -14,8 +14,8 @@ extern const struct dma_map_ops s390_pci_dma_ops;
 
 static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
-	if (dev && dev->archdata.dma_ops)
-		return dev->archdata.dma_ops;
+	if (dev && dev->dma_ops)
+		return dev->dma_ops;
 	return &dma_noop_ops;
 }
 
diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index 38e17d4d9884..82abef8b8574 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -641,7 +641,7 @@ int pcibios_add_device(struct pci_dev *pdev)
 	int i;
 
 	pdev->dev.groups = zpci_attr_groups;
-	pdev->dev.archdata.dma_ops = &s390_pci_dma_ops;
+	pdev->dev.dma_ops = &s390_pci_dma_ops;
 	zpci_map_resources(pdev);
 
 	for (i = 0; i < PCI_BAR_COUNT; i++) {
diff --git a/arch/tile/include/asm/device.h b/arch/tile/include/asm/device.h
index 25f23ac7d361..1cf45422a0df 100644
--- a/arch/tile/include/asm/device.h
+++ b/arch/tile/include/asm/device.h
@@ -17,9 +17,6 @@
 #define _ASM_TILE_DEVICE_H
 
 struct dev_archdata {
-	/* DMA operations on that device */
-        const struct dma_map_ops	*dma_ops;
-
 	/* Offset of the DMA address from the PA. */
 	dma_addr_t		dma_offset;
 
diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h
index 4a06cc75b856..c0620697eaad 100644
--- a/arch/tile/include/asm/dma-mapping.h
+++ b/arch/tile/include/asm/dma-mapping.h
@@ -31,8 +31,8 @@ extern const struct dma_map_ops *gx_hybrid_pci_dma_map_ops;
 
 static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
-	if (dev && dev->archdata.dma_ops)
-		return dev->archdata.dma_ops;
+	if (dev && dev->dma_ops)
+		return dev->dma_ops;
 	else
 		return tile_dma_map_ops;
 }
@@ -61,7 +61,7 @@ static inline void dma_mark_clean(void *addr, size_t size) {}
 
 static inline void set_dma_ops(struct device *dev, const struct dma_map_ops *ops)
 {
-	dev->archdata.dma_ops = ops;
+	dev->dma_ops = ops;
 }
 
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h
index b2d0b4ced7e3..1b3ef26e77df 100644
--- a/arch/x86/include/asm/device.h
+++ b/arch/x86/include/asm/device.h
@@ -2,9 +2,6 @@
 #define _ASM_X86_DEVICE_H
 
 struct dev_archdata {
-#ifdef CONFIG_X86_DEV_DMA_OPS
-	const struct dma_map_ops *dma_ops;
-#endif
 #if defined(CONFIG_INTEL_IOMMU) || defined(CONFIG_AMD_IOMMU)
 	void *iommu; /* hook for IOMMU specific extension */
 #endif
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 5e4772886a1e..94b5b96966cb 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -32,10 +32,10 @@ static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 #ifndef CONFIG_X86_DEV_DMA_OPS
 	return dma_ops;
 #else
-	if (unlikely(!dev) || !dev->archdata.dma_ops)
+	if (unlikely(!dev) || !dev->dma_ops)
 		return dma_ops;
 	else
-		return dev->archdata.dma_ops;
+		return dev->dma_ops;
 #endif
 }
 
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 17f180148c80..5070320780c6 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1177,7 +1177,7 @@ static int __init calgary_init(void)
 		tbl = find_iommu_table(&dev->dev);
 
 		if (translation_enabled(tbl))
-			dev->dev.archdata.dma_ops = &calgary_dma_ops;
+			dev->dev.dma_ops = &calgary_dma_ops;
 	}
 
 	return ret;
@@ -1201,7 +1201,7 @@ error:
 		calgary_disable_translation(dev);
 		calgary_free_bus(dev);
 		pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
-		dev->dev.archdata.dma_ops = NULL;
+		dev->dev.dma_ops = NULL;
 	} while (1);
 
 	return ret;
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index a4fdfa7dcc1b..0cb52ae0a8f0 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -667,7 +667,7 @@ static void set_dma_domain_ops(struct pci_dev *pdev)
 	spin_lock(&dma_domain_list_lock);
 	list_for_each_entry(domain, &dma_domain_list, node) {
 		if (pci_domain_nr(pdev->bus) == domain->domain_nr) {
-			pdev->dev.archdata.dma_ops = domain->dma_ops;
+			pdev->dev.dma_ops = domain->dma_ops;
 			break;
 		}
 	}
diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c
index aa3828823170..ec008e800b45 100644
--- a/arch/x86/pci/sta2x11-fixup.c
+++ b/arch/x86/pci/sta2x11-fixup.c
@@ -203,7 +203,7 @@ static void sta2x11_setup_pdev(struct pci_dev *pdev)
 		return;
 	pci_set_consistent_dma_mask(pdev, STA2X11_AMBA_SIZE - 1);
 	pci_set_dma_mask(pdev, STA2X11_AMBA_SIZE - 1);
-	pdev->dev.archdata.dma_ops = &sta2x11_dma_ops;
+	pdev->dev.dma_ops = &sta2x11_dma_ops;
 
 	/* We must enable all devices as master, for audio DMA to work */
 	pci_set_master(pdev);
@@ -223,7 +223,7 @@ bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 {
 	struct sta2x11_mapping *map;
 
-	if (dev->archdata.dma_ops != &sta2x11_dma_ops) {
+	if (dev->dma_ops != &sta2x11_dma_ops) {
 		if (!dev->dma_mask)
 			return false;
 		return addr + size - 1 <= *dev->dma_mask;
@@ -247,7 +247,7 @@ bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
  */
 dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
 {
-	if (dev->archdata.dma_ops != &sta2x11_dma_ops)
+	if (dev->dma_ops != &sta2x11_dma_ops)
 		return paddr;
 	return p2a(paddr, to_pci_dev(dev));
 }
@@ -259,7 +259,7 @@ dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
  */
 phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 {
-	if (dev->archdata.dma_ops != &sta2x11_dma_ops)
+	if (dev->dma_ops != &sta2x11_dma_ops)
 		return daddr;
 	return a2p(daddr, to_pci_dev(dev));
 }
diff --git a/arch/xtensa/include/asm/device.h b/arch/xtensa/include/asm/device.h
index a77d45d39f35..1deeb8ebbb1b 100644
--- a/arch/xtensa/include/asm/device.h
+++ b/arch/xtensa/include/asm/device.h
@@ -6,11 +6,7 @@
 #ifndef _ASM_XTENSA_DEVICE_H
 #define _ASM_XTENSA_DEVICE_H
 
-struct dma_map_ops;
-
 struct dev_archdata {
-	/* DMA operations on that device */
-	const struct dma_map_ops *dma_ops;
 };
 
 struct pdev_archdata {
diff --git a/arch/xtensa/include/asm/dma-mapping.h b/arch/xtensa/include/asm/dma-mapping.h
index 50d23106cce0..9eecfc3c5dc4 100644
--- a/arch/xtensa/include/asm/dma-mapping.h
+++ b/arch/xtensa/include/asm/dma-mapping.h
@@ -22,8 +22,8 @@ extern const struct dma_map_ops xtensa_dma_map_ops;
 
 static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
-	if (dev && dev->archdata.dma_ops)
-		return dev->archdata.dma_ops;
+	if (dev && dev->dma_ops)
+		return dev->dma_ops;
 	else
 		return &xtensa_dma_map_ops;
 }
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
index d21ba9d857c3..dfc24f19178b 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -2465,7 +2465,7 @@ static void srpt_add_one(struct ib_device *device)
 	int i;
 
 	pr_debug("device = %p, device->dma_ops = %p\n", device,
-		 device->dma_ops);
+		 device->dma_device->dma_ops);
 
 	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
 	if (!sdev)
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 3703fb9db419..f7b86679bafe 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -515,7 +515,7 @@ static void iommu_uninit_device(struct device *dev)
 	iommu_group_remove_device(dev);
 
 	/* Remove dma-ops */
-	dev->archdata.dma_ops = NULL;
+	dev->dma_ops = NULL;
 
 	/*
 	 * We keep dev_data around for unplugged devices and reuse it when the
@@ -2164,7 +2164,7 @@ static int amd_iommu_add_device(struct device *dev)
 				dev_name(dev));
 
 		iommu_ignore_device(dev);
-		dev->archdata.dma_ops = &nommu_dma_ops;
+		dev->dma_ops = &nommu_dma_ops;
 		goto out;
 	}
 	init_iommu_group(dev);
@@ -2181,7 +2181,7 @@ static int amd_iommu_add_device(struct device *dev)
 	if (domain->type == IOMMU_DOMAIN_IDENTITY)
 		dev_data->passthrough = true;
 	else
-		dev->archdata.dma_ops = &amd_iommu_dma_ops;
+		dev->dma_ops = &amd_iommu_dma_ops;
 
 out:
 	iommu_completion_wait(iommu);
diff --git a/drivers/misc/mic/bus/mic_bus.c b/drivers/misc/mic/bus/mic_bus.c
index c4b27a25662a..77b16ca66846 100644
--- a/drivers/misc/mic/bus/mic_bus.c
+++ b/drivers/misc/mic/bus/mic_bus.c
@@ -158,7 +158,7 @@ mbus_register_device(struct device *pdev, int id, const struct dma_map_ops *dma_
 	mbdev->dev.parent = pdev;
 	mbdev->id.device = id;
 	mbdev->id.vendor = MBUS_DEV_ANY_ID;
-	mbdev->dev.archdata.dma_ops = dma_ops;
+	mbdev->dev.dma_ops = dma_ops;
 	mbdev->dev.dma_mask = &mbdev->dev.coherent_dma_mask;
 	dma_set_mask(&mbdev->dev, DMA_BIT_MASK(64));
 	mbdev->dev.release = mbus_release_dev;
diff --git a/drivers/misc/mic/bus/scif_bus.c b/drivers/misc/mic/bus/scif_bus.c
index e5d377e97c86..a444db5f61fe 100644
--- a/drivers/misc/mic/bus/scif_bus.c
+++ b/drivers/misc/mic/bus/scif_bus.c
@@ -154,7 +154,7 @@ scif_register_device(struct device *pdev, int id, const struct dma_map_ops *dma_
 	sdev->dev.parent = pdev;
 	sdev->id.device = id;
 	sdev->id.vendor = SCIF_DEV_ANY_ID;
-	sdev->dev.archdata.dma_ops = dma_ops;
+	sdev->dev.dma_ops = dma_ops;
 	sdev->dev.release = scif_release_dev;
 	sdev->hw_ops = hw_ops;
 	sdev->dnode = dnode;
diff --git a/drivers/misc/mic/bus/vop_bus.c b/drivers/misc/mic/bus/vop_bus.c
index e3caa6c53922..fd7f2a6049f8 100644
--- a/drivers/misc/mic/bus/vop_bus.c
+++ b/drivers/misc/mic/bus/vop_bus.c
@@ -154,7 +154,7 @@ vop_register_device(struct device *pdev, int id,
 	vdev->dev.parent = pdev;
 	vdev->id.device = id;
 	vdev->id.vendor = VOP_DEV_ANY_ID;
-	vdev->dev.archdata.dma_ops = dma_ops;
+	vdev->dev.dma_ops = dma_ops;
 	vdev->dev.dma_mask = &vdev->dev.coherent_dma_mask;
 	dma_set_mask(&vdev->dev, DMA_BIT_MASK(64));
 	vdev->dev.release = vop_release_dev;
diff --git a/include/linux/device.h b/include/linux/device.h
index 491b4c0ca633..46a567261ccc 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -921,6 +921,7 @@ struct device {
 #ifdef CONFIG_NUMA
 	int		numa_node;	/* NUMA node this device is close to */
 #endif
+	const struct dma_map_ops *dma_ops;
 	u64		*dma_mask;	/* dma mask (if dma'able device) */
 	u64		coherent_dma_mask;/* Like dma_mask, but for
 					     alloc_coherent mappings as
-- 
cgit v1.2.3


From ca6e8e1031419549f67291ca31b43126f07cecdb Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Fri, 20 Jan 2017 13:04:03 -0800
Subject: treewide: Consolidate set_dma_ops() implementations

Now that all set_dma_ops() implementations are identical (ignoring
BUG_ON() statements), remove the architecture specific definitions
and add a definition in <linux/dma-mapping.h>.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: linux-arch@vger.kernel.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Paul Mackerras <paulus@samba.org>
Cc: Russell King <linux@armlinux.org.uk>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 arch/arm/include/asm/dma-mapping.h     | 6 ------
 arch/powerpc/include/asm/dma-mapping.h | 5 -----
 arch/tile/include/asm/dma-mapping.h    | 5 -----
 include/linux/dma-mapping.h            | 5 +++++
 4 files changed, 5 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index 312f4d0564d6..c7432d647e53 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -31,12 +31,6 @@ static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 		return __generic_dma_ops(dev);
 }
 
-static inline void set_dma_ops(struct device *dev, const struct dma_map_ops *ops)
-{
-	BUG_ON(!dev);
-	dev->dma_ops = ops;
-}
-
 #define HAVE_ARCH_DMA_SUPPORTED 1
 extern int dma_supported(struct device *dev, u64 mask);
 
diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
index 59fbd4abcbf8..8275603ba4d5 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -91,11 +91,6 @@ static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 	return dev->dma_ops;
 }
 
-static inline void set_dma_ops(struct device *dev, const struct dma_map_ops *ops)
-{
-	dev->dma_ops = ops;
-}
-
 /*
  * get_dma_offset()
  *
diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h
index c0620697eaad..2562995a6ac9 100644
--- a/arch/tile/include/asm/dma-mapping.h
+++ b/arch/tile/include/asm/dma-mapping.h
@@ -59,11 +59,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 
 static inline void dma_mark_clean(void *addr, size_t size) {}
 
-static inline void set_dma_ops(struct device *dev, const struct dma_map_ops *ops)
-{
-	dev->dma_ops = ops;
-}
-
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 {
 	if (!dev->dma_mask)
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index f1da68b82c63..e97f23e8b2d9 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -164,6 +164,11 @@ int dma_mmap_from_coherent(struct device *dev, struct vm_area_struct *vma,
 
 #ifdef CONFIG_HAS_DMA
 #include <asm/dma-mapping.h>
+static inline void set_dma_ops(struct device *dev,
+			       const struct dma_map_ops *dma_ops)
+{
+	dev->dma_ops = dma_ops;
+}
 #else
 /*
  * Define the dma api to allow compilation but not linking of
-- 
cgit v1.2.3


From 815dd18788fe0d41899f51b91d0560279cf16b0d Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Fri, 20 Jan 2017 13:04:04 -0800
Subject: treewide: Consolidate get_dma_ops() implementations

Introduce a new architecture-specific get_arch_dma_ops() function
that takes a struct bus_type * argument. Add get_dma_ops() in
<linux/dma-mapping.h>.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Juergen Gross <jgross@suse.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: linux-arch@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: Russell King <linux@armlinux.org.uk>
Cc: x86@kernel.org
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 arch/alpha/include/asm/dma-mapping.h      | 2 +-
 arch/arc/include/asm/dma-mapping.h        | 2 +-
 arch/arm/include/asm/dma-mapping.h        | 4 ++--
 arch/arm64/include/asm/dma-mapping.h      | 4 ++--
 arch/avr32/include/asm/dma-mapping.h      | 2 +-
 arch/blackfin/include/asm/dma-mapping.h   | 2 +-
 arch/c6x/include/asm/dma-mapping.h        | 2 +-
 arch/cris/include/asm/dma-mapping.h       | 4 ++--
 arch/frv/include/asm/dma-mapping.h        | 2 +-
 arch/h8300/include/asm/dma-mapping.h      | 2 +-
 arch/hexagon/include/asm/dma-mapping.h    | 5 +----
 arch/ia64/include/asm/dma-mapping.h       | 5 ++++-
 arch/m32r/include/asm/dma-mapping.h       | 4 +---
 arch/m68k/include/asm/dma-mapping.h       | 2 +-
 arch/metag/include/asm/dma-mapping.h      | 2 +-
 arch/microblaze/include/asm/dma-mapping.h | 2 +-
 arch/mips/include/asm/dma-mapping.h       | 7 ++-----
 arch/mn10300/include/asm/dma-mapping.h    | 2 +-
 arch/nios2/include/asm/dma-mapping.h      | 2 +-
 arch/openrisc/include/asm/dma-mapping.h   | 2 +-
 arch/parisc/include/asm/dma-mapping.h     | 2 +-
 arch/powerpc/include/asm/dma-mapping.h    | 7 ++-----
 arch/powerpc/include/asm/ps3.h            | 2 +-
 arch/s390/include/asm/dma-mapping.h       | 4 +---
 arch/sh/include/asm/dma-mapping.h         | 2 +-
 arch/sparc/include/asm/dma-mapping.h      | 4 ++--
 arch/tile/include/asm/dma-mapping.h       | 7 ++-----
 arch/unicore32/include/asm/dma-mapping.h  | 2 +-
 arch/x86/include/asm/dma-mapping.h        | 9 +--------
 arch/xtensa/include/asm/dma-mapping.h     | 7 ++-----
 include/linux/dma-mapping.h               | 7 +++++++
 31 files changed, 48 insertions(+), 64 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/dma-mapping.h b/arch/alpha/include/asm/dma-mapping.h
index d3480562411d..5d53666935e6 100644
--- a/arch/alpha/include/asm/dma-mapping.h
+++ b/arch/alpha/include/asm/dma-mapping.h
@@ -3,7 +3,7 @@
 
 extern const struct dma_map_ops *dma_ops;
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 	return dma_ops;
 }
diff --git a/arch/arc/include/asm/dma-mapping.h b/arch/arc/include/asm/dma-mapping.h
index fdff3aa60052..94285031c4fb 100644
--- a/arch/arc/include/asm/dma-mapping.h
+++ b/arch/arc/include/asm/dma-mapping.h
@@ -20,7 +20,7 @@
 
 extern const struct dma_map_ops arc_dma_ops;
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 	return &arc_dma_ops;
 }
diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index c7432d647e53..716656925975 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -23,12 +23,12 @@ static inline const struct dma_map_ops *__generic_dma_ops(struct device *dev)
 	return &arm_dma_ops;
 }
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 	if (xen_initial_domain())
 		return xen_dma_ops;
 	else
-		return __generic_dma_ops(dev);
+		return __generic_dma_ops(NULL);
 }
 
 #define HAVE_ARCH_DMA_SUPPORTED 1
diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h
index 58ae36cc3b60..505756cdc67a 100644
--- a/arch/arm64/include/asm/dma-mapping.h
+++ b/arch/arm64/include/asm/dma-mapping.h
@@ -39,12 +39,12 @@ static inline const struct dma_map_ops *__generic_dma_ops(struct device *dev)
 	return &dummy_dma_ops;
 }
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 	if (xen_initial_domain())
 		return xen_dma_ops;
 	else
-		return __generic_dma_ops(dev);
+		return __generic_dma_ops(NULL);
 }
 
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
diff --git a/arch/avr32/include/asm/dma-mapping.h b/arch/avr32/include/asm/dma-mapping.h
index b2b43c0e0774..7388451f9905 100644
--- a/arch/avr32/include/asm/dma-mapping.h
+++ b/arch/avr32/include/asm/dma-mapping.h
@@ -6,7 +6,7 @@ extern void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 
 extern const struct dma_map_ops avr32_dma_ops;
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 	return &avr32_dma_ops;
 }
diff --git a/arch/blackfin/include/asm/dma-mapping.h b/arch/blackfin/include/asm/dma-mapping.h
index 320fb50fbd41..04254ac36bed 100644
--- a/arch/blackfin/include/asm/dma-mapping.h
+++ b/arch/blackfin/include/asm/dma-mapping.h
@@ -38,7 +38,7 @@ _dma_sync(dma_addr_t addr, size_t size, enum dma_data_direction dir)
 
 extern const struct dma_map_ops bfin_dma_ops;
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 	return &bfin_dma_ops;
 }
diff --git a/arch/c6x/include/asm/dma-mapping.h b/arch/c6x/include/asm/dma-mapping.h
index 88258b9ebc8e..aca9f755e4f8 100644
--- a/arch/c6x/include/asm/dma-mapping.h
+++ b/arch/c6x/include/asm/dma-mapping.h
@@ -19,7 +19,7 @@
 
 extern const struct dma_map_ops c6x_dma_ops;
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 	return &c6x_dma_ops;
 }
diff --git a/arch/cris/include/asm/dma-mapping.h b/arch/cris/include/asm/dma-mapping.h
index aae4fbc0a656..256169de3743 100644
--- a/arch/cris/include/asm/dma-mapping.h
+++ b/arch/cris/include/asm/dma-mapping.h
@@ -4,12 +4,12 @@
 #ifdef CONFIG_PCI
 extern const struct dma_map_ops v32_dma_ops;
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 	return &v32_dma_ops;
 }
 #else
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 	BUG();
 	return NULL;
diff --git a/arch/frv/include/asm/dma-mapping.h b/arch/frv/include/asm/dma-mapping.h
index 150cc00544a8..354900917585 100644
--- a/arch/frv/include/asm/dma-mapping.h
+++ b/arch/frv/include/asm/dma-mapping.h
@@ -9,7 +9,7 @@ extern unsigned long __nongprelbss dma_coherent_mem_end;
 
 extern const struct dma_map_ops frv_dma_ops;
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 	return &frv_dma_ops;
 }
diff --git a/arch/h8300/include/asm/dma-mapping.h b/arch/h8300/include/asm/dma-mapping.h
index f804bca4c13f..847c7562e046 100644
--- a/arch/h8300/include/asm/dma-mapping.h
+++ b/arch/h8300/include/asm/dma-mapping.h
@@ -3,7 +3,7 @@
 
 extern const struct dma_map_ops h8300_dma_map_ops;
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 	return &h8300_dma_map_ops;
 }
diff --git a/arch/hexagon/include/asm/dma-mapping.h b/arch/hexagon/include/asm/dma-mapping.h
index b812e917cd95..d3a87bd9b686 100644
--- a/arch/hexagon/include/asm/dma-mapping.h
+++ b/arch/hexagon/include/asm/dma-mapping.h
@@ -34,11 +34,8 @@ extern int bad_dma_address;
 
 extern const struct dma_map_ops *dma_ops;
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
-	if (unlikely(dev == NULL))
-		return NULL;
-
 	return dma_ops;
 }
 
diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h
index 05e467d56d86..73ec3c6f4cfe 100644
--- a/arch/ia64/include/asm/dma-mapping.h
+++ b/arch/ia64/include/asm/dma-mapping.h
@@ -23,7 +23,10 @@ extern void machvec_dma_sync_single(struct device *, dma_addr_t, size_t,
 extern void machvec_dma_sync_sg(struct device *, struct scatterlist *, int,
 				enum dma_data_direction);
 
-#define get_dma_ops(dev) platform_dma_get_ops(dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
+{
+	return platform_dma_get_ops(NULL);
+}
 
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 {
diff --git a/arch/m32r/include/asm/dma-mapping.h b/arch/m32r/include/asm/dma-mapping.h
index 27b1597ac563..c01d9f52d228 100644
--- a/arch/m32r/include/asm/dma-mapping.h
+++ b/arch/m32r/include/asm/dma-mapping.h
@@ -10,10 +10,8 @@
 
 #define DMA_ERROR_CODE (~(dma_addr_t)0x0)
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
-	if (dev && dev->dma_ops)
-		return dev->dma_ops;
 	return &dma_noop_ops;
 }
 
diff --git a/arch/m68k/include/asm/dma-mapping.h b/arch/m68k/include/asm/dma-mapping.h
index 863509939d5a..9210e470771b 100644
--- a/arch/m68k/include/asm/dma-mapping.h
+++ b/arch/m68k/include/asm/dma-mapping.h
@@ -3,7 +3,7 @@
 
 extern const struct dma_map_ops m68k_dma_ops;
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
         return &m68k_dma_ops;
 }
diff --git a/arch/metag/include/asm/dma-mapping.h b/arch/metag/include/asm/dma-mapping.h
index c156a7ac732f..fad3dc3cb210 100644
--- a/arch/metag/include/asm/dma-mapping.h
+++ b/arch/metag/include/asm/dma-mapping.h
@@ -3,7 +3,7 @@
 
 extern const struct dma_map_ops metag_dma_ops;
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 	return &metag_dma_ops;
 }
diff --git a/arch/microblaze/include/asm/dma-mapping.h b/arch/microblaze/include/asm/dma-mapping.h
index c7faf2fb51d6..3fad5e722a66 100644
--- a/arch/microblaze/include/asm/dma-mapping.h
+++ b/arch/microblaze/include/asm/dma-mapping.h
@@ -38,7 +38,7 @@
  */
 extern const struct dma_map_ops dma_direct_ops;
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 	return &dma_direct_ops;
 }
diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h
index dad3b09fe993..aba71385f9d1 100644
--- a/arch/mips/include/asm/dma-mapping.h
+++ b/arch/mips/include/asm/dma-mapping.h
@@ -11,12 +11,9 @@
 
 extern const struct dma_map_ops *mips_dma_map_ops;
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
-	if (dev && dev->dma_ops)
-		return dev->dma_ops;
-	else
-		return mips_dma_map_ops;
+	return mips_dma_map_ops;
 }
 
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
diff --git a/arch/mn10300/include/asm/dma-mapping.h b/arch/mn10300/include/asm/dma-mapping.h
index 564e3927e005..737ef574b3ea 100644
--- a/arch/mn10300/include/asm/dma-mapping.h
+++ b/arch/mn10300/include/asm/dma-mapping.h
@@ -16,7 +16,7 @@
 
 extern const struct dma_map_ops mn10300_dma_ops;
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 	return &mn10300_dma_ops;
 }
diff --git a/arch/nios2/include/asm/dma-mapping.h b/arch/nios2/include/asm/dma-mapping.h
index aa00d839a64b..7b3c6f280293 100644
--- a/arch/nios2/include/asm/dma-mapping.h
+++ b/arch/nios2/include/asm/dma-mapping.h
@@ -12,7 +12,7 @@
 
 extern const struct dma_map_ops nios2_dma_ops;
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 	return &nios2_dma_ops;
 }
diff --git a/arch/openrisc/include/asm/dma-mapping.h b/arch/openrisc/include/asm/dma-mapping.h
index 88acbedb4947..0c0075f17145 100644
--- a/arch/openrisc/include/asm/dma-mapping.h
+++ b/arch/openrisc/include/asm/dma-mapping.h
@@ -30,7 +30,7 @@
 
 extern const struct dma_map_ops or1k_dma_map_ops;
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 	return &or1k_dma_map_ops;
 }
diff --git a/arch/parisc/include/asm/dma-mapping.h b/arch/parisc/include/asm/dma-mapping.h
index 1749073e44fc..5404c6a726b2 100644
--- a/arch/parisc/include/asm/dma-mapping.h
+++ b/arch/parisc/include/asm/dma-mapping.h
@@ -27,7 +27,7 @@ extern const struct dma_map_ops pcx_dma_ops;
 
 extern const struct dma_map_ops *hppa_dma_ops;
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 	return hppa_dma_ops;
 }
diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
index 8275603ba4d5..181a095468e4 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -78,17 +78,14 @@ extern struct dma_map_ops dma_iommu_ops;
 #endif
 extern const struct dma_map_ops dma_direct_ops;
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 	/* We don't handle the NULL dev case for ISA for now. We could
 	 * do it via an out of line call but it is not needed for now. The
 	 * only ISA DMA device we support is the floppy and we have a hack
 	 * in the floppy driver directly to get a device for us.
 	 */
-	if (unlikely(dev == NULL))
-		return NULL;
-
-	return dev->dma_ops;
+	return NULL;
 }
 
 /*
diff --git a/arch/powerpc/include/asm/ps3.h b/arch/powerpc/include/asm/ps3.h
index a19f831a4cc9..17ee719e799f 100644
--- a/arch/powerpc/include/asm/ps3.h
+++ b/arch/powerpc/include/asm/ps3.h
@@ -435,7 +435,7 @@ static inline void *ps3_system_bus_get_drvdata(
 	return dev_get_drvdata(&dev->core);
 }
 
-/* These two need global scope for get_dma_ops(). */
+/* These two need global scope for get_arch_dma_ops(). */
 
 extern struct bus_type ps3_system_bus_type;
 
diff --git a/arch/s390/include/asm/dma-mapping.h b/arch/s390/include/asm/dma-mapping.h
index a872027d0c1b..3108b8dbe266 100644
--- a/arch/s390/include/asm/dma-mapping.h
+++ b/arch/s390/include/asm/dma-mapping.h
@@ -12,10 +12,8 @@
 
 extern const struct dma_map_ops s390_pci_dma_ops;
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
-	if (dev && dev->dma_ops)
-		return dev->dma_ops;
 	return &dma_noop_ops;
 }
 
diff --git a/arch/sh/include/asm/dma-mapping.h b/arch/sh/include/asm/dma-mapping.h
index a7382c34c241..d99008af5f73 100644
--- a/arch/sh/include/asm/dma-mapping.h
+++ b/arch/sh/include/asm/dma-mapping.h
@@ -4,7 +4,7 @@
 extern const struct dma_map_ops *dma_ops;
 extern void no_iommu_init(void);
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 	return dma_ops;
 }
diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h
index 3d2babc0c4c6..69cc627779f2 100644
--- a/arch/sparc/include/asm/dma-mapping.h
+++ b/arch/sparc/include/asm/dma-mapping.h
@@ -24,14 +24,14 @@ extern const struct dma_map_ops pci32_dma_ops;
 
 extern struct bus_type pci_bus_type;
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 #ifdef CONFIG_SPARC_LEON
 	if (sparc_cpu_model == sparc_leon)
 		return leon_dma_ops;
 #endif
 #if defined(CONFIG_SPARC32) && defined(CONFIG_PCI)
-	if (dev->bus == &pci_bus_type)
+	if (bus == &pci_bus_type)
 		return &pci32_dma_ops;
 #endif
 	return dma_ops;
diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h
index 2562995a6ac9..bbc71a29b2c6 100644
--- a/arch/tile/include/asm/dma-mapping.h
+++ b/arch/tile/include/asm/dma-mapping.h
@@ -29,12 +29,9 @@ extern const struct dma_map_ops *gx_pci_dma_map_ops;
 extern const struct dma_map_ops *gx_legacy_pci_dma_map_ops;
 extern const struct dma_map_ops *gx_hybrid_pci_dma_map_ops;
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
-	if (dev && dev->dma_ops)
-		return dev->dma_ops;
-	else
-		return tile_dma_map_ops;
+	return tile_dma_map_ops;
 }
 
 static inline dma_addr_t get_dma_offset(struct device *dev)
diff --git a/arch/unicore32/include/asm/dma-mapping.h b/arch/unicore32/include/asm/dma-mapping.h
index 14d7729c7b73..518ba5848dd6 100644
--- a/arch/unicore32/include/asm/dma-mapping.h
+++ b/arch/unicore32/include/asm/dma-mapping.h
@@ -23,7 +23,7 @@
 
 extern const struct dma_map_ops swiotlb_dma_map_ops;
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 	return &swiotlb_dma_map_ops;
 }
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 94b5b96966cb..08a0838b83fb 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -27,16 +27,9 @@ extern int panic_on_overflow;
 
 extern const struct dma_map_ops *dma_ops;
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
-#ifndef CONFIG_X86_DEV_DMA_OPS
 	return dma_ops;
-#else
-	if (unlikely(!dev) || !dev->dma_ops)
-		return dma_ops;
-	else
-		return dev->dma_ops;
-#endif
 }
 
 bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp);
diff --git a/arch/xtensa/include/asm/dma-mapping.h b/arch/xtensa/include/asm/dma-mapping.h
index 9eecfc3c5dc4..c6140fa8c0be 100644
--- a/arch/xtensa/include/asm/dma-mapping.h
+++ b/arch/xtensa/include/asm/dma-mapping.h
@@ -20,12 +20,9 @@
 
 extern const struct dma_map_ops xtensa_dma_map_ops;
 
-static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
-	if (dev && dev->dma_ops)
-		return dev->dma_ops;
-	else
-		return &xtensa_dma_map_ops;
+	return &xtensa_dma_map_ops;
 }
 
 void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index e97f23e8b2d9..ab8710888ddf 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -164,6 +164,13 @@ int dma_mmap_from_coherent(struct device *dev, struct vm_area_struct *vma,
 
 #ifdef CONFIG_HAS_DMA
 #include <asm/dma-mapping.h>
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+{
+	if (dev && dev->dma_ops)
+		return dev->dma_ops;
+	return get_arch_dma_ops(dev ? dev->bus : NULL);
+}
+
 static inline void set_dma_ops(struct device *dev,
 			       const struct dma_map_ops *dma_ops)
 {
-- 
cgit v1.2.3


From 551199aca1c3102ebed390566d681cc1290284ca Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Fri, 20 Jan 2017 13:04:07 -0800
Subject: lib/dma-virt: Add dma_virt_ops

Several RDMA drivers (hfi1, qib and rxe) expect that ib_sge.addr
is a virtual address. Provide DMA mapping operations that are
suitable for these drivers.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Joerg Roedel <jroedel@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 include/linux/dma-mapping.h |  1 +
 lib/Kconfig                 |  5 ++++
 lib/Makefile                |  1 +
 lib/dma-virt.c              | 72 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 79 insertions(+)
 create mode 100644 lib/dma-virt.c

(limited to 'include/linux')

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index ab8710888ddf..426c43d4fdbf 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -128,6 +128,7 @@ struct dma_map_ops {
 };
 
 extern const struct dma_map_ops dma_noop_ops;
+extern const struct dma_map_ops dma_virt_ops;
 
 #define DMA_BIT_MASK(n)	(((n) == 64) ? ~0ULL : ((1ULL<<(n))-1))
 
diff --git a/lib/Kconfig b/lib/Kconfig
index b6baf0609e8b..97af23b26f33 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -400,6 +400,11 @@ config DMA_NOOP_OPS
 	depends on HAS_DMA && (!64BIT || ARCH_DMA_ADDR_T_64BIT)
 	default n
 
+config DMA_VIRT_OPS
+	bool
+	depends on HAS_DMA && (!64BIT || ARCH_DMA_ADDR_T_64BIT)
+	default n
+
 config CHECK_SIGNATURE
 	bool
 
diff --git a/lib/Makefile b/lib/Makefile
index ada06d5652e8..b97e9a814854 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -27,6 +27,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
 lib-$(CONFIG_DMA_NOOP_OPS) += dma-noop.o
+lib-$(CONFIG_DMA_VIRT_OPS) += dma-virt.o
 
 lib-y	+= kobject.o klist.o
 obj-y	+= lockref.o
diff --git a/lib/dma-virt.c b/lib/dma-virt.c
new file mode 100644
index 000000000000..dcd4df1f7174
--- /dev/null
+++ b/lib/dma-virt.c
@@ -0,0 +1,72 @@
+/*
+ *	lib/dma-virt.c
+ *
+ * DMA operations that map to virtual addresses without flushing memory.
+ */
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/scatterlist.h>
+
+static void *dma_virt_alloc(struct device *dev, size_t size,
+			    dma_addr_t *dma_handle, gfp_t gfp,
+			    unsigned long attrs)
+{
+	void *ret;
+
+	ret = (void *)__get_free_pages(gfp, get_order(size));
+	if (ret)
+		*dma_handle = (uintptr_t)ret;
+	return ret;
+}
+
+static void dma_virt_free(struct device *dev, size_t size,
+			  void *cpu_addr, dma_addr_t dma_addr,
+			  unsigned long attrs)
+{
+	free_pages((unsigned long)cpu_addr, get_order(size));
+}
+
+static dma_addr_t dma_virt_map_page(struct device *dev, struct page *page,
+				    unsigned long offset, size_t size,
+				    enum dma_data_direction dir,
+				    unsigned long attrs)
+{
+	return (uintptr_t)(page_address(page) + offset);
+}
+
+static int dma_virt_map_sg(struct device *dev, struct scatterlist *sgl,
+			   int nents, enum dma_data_direction dir,
+			   unsigned long attrs)
+{
+	int i;
+	struct scatterlist *sg;
+
+	for_each_sg(sgl, sg, nents, i) {
+		BUG_ON(!sg_page(sg));
+		sg_dma_address(sg) = (uintptr_t)sg_virt(sg);
+		sg_dma_len(sg) = sg->length;
+	}
+
+	return nents;
+}
+
+static int dma_virt_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+	return false;
+}
+
+static int dma_virt_supported(struct device *dev, u64 mask)
+{
+	return true;
+}
+
+const struct dma_map_ops dma_virt_ops = {
+	.alloc			= dma_virt_alloc,
+	.free			= dma_virt_free,
+	.map_page		= dma_virt_map_page,
+	.map_sg			= dma_virt_map_sg,
+	.mapping_error		= dma_virt_mapping_error,
+	.dma_supported		= dma_virt_supported,
+};
+EXPORT_SYMBOL(dma_virt_ops);
-- 
cgit v1.2.3


From 6db6f0eae6052b70885562e1733896647ec1d807 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Sat, 21 Jan 2017 21:01:32 +0100
Subject: bridge: multicast to unicast
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implements an optional, per bridge port flag and feature to deliver
multicast packets to any host on the according port via unicast
individually. This is done by copying the packet per host and
changing the multicast destination MAC to a unicast one accordingly.

multicast-to-unicast works on top of the multicast snooping feature of
the bridge. Which means unicast copies are only delivered to hosts which
are interested in it and signalized this via IGMP/MLD reports
previously.

This feature is intended for interface types which have a more reliable
and/or efficient way to deliver unicast packets than broadcast ones
(e.g. wifi).

However, it should only be enabled on interfaces where no IGMPv2/MLDv1
report suppression takes place. This feature is disabled by default.

The initial patch and idea is from Felix Fietkau.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
[linus.luessing@c0d3.blue: various bug + style fixes, commit message]
Signed-off-by: Linus Lüssing <linus.luessing@c0d3.blue>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_bridge.h    |  1 +
 include/uapi/linux/if_link.h |  1 +
 net/bridge/br_forward.c      | 39 ++++++++++++++++++-
 net/bridge/br_mdb.c          |  2 +-
 net/bridge/br_multicast.c    | 90 ++++++++++++++++++++++++++++++++------------
 net/bridge/br_netlink.c      |  5 +++
 net/bridge/br_private.h      |  3 +-
 net/bridge/br_sysfs_if.c     |  2 +
 8 files changed, 114 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index c6587c01d951..debc9d5904e5 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -46,6 +46,7 @@ struct br_ip_list {
 #define BR_LEARNING_SYNC	BIT(9)
 #define BR_PROXYARP_WIFI	BIT(10)
 #define BR_MCAST_FLOOD		BIT(11)
+#define BR_MULTICAST_TO_UNICAST	BIT(12)
 
 #define BR_DEFAULT_AGEING_TIME	(300 * HZ)
 
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 184b16ed2b84..b9aa5641ebe5 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -321,6 +321,7 @@ enum {
 	IFLA_BRPORT_MULTICAST_ROUTER,
 	IFLA_BRPORT_PAD,
 	IFLA_BRPORT_MCAST_FLOOD,
+	IFLA_BRPORT_MCAST_TO_UCAST,
 	__IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 7cb41aee4c82..a0f9d0037d24 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -174,6 +174,31 @@ out:
 	return p;
 }
 
+static void maybe_deliver_addr(struct net_bridge_port *p, struct sk_buff *skb,
+			       const unsigned char *addr, bool local_orig)
+{
+	struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;
+	const unsigned char *src = eth_hdr(skb)->h_source;
+
+	if (!should_deliver(p, skb))
+		return;
+
+	/* Even with hairpin, no soliloquies - prevent breaking IPv6 DAD */
+	if (skb->dev == p->dev && ether_addr_equal(src, addr))
+		return;
+
+	skb = skb_copy(skb, GFP_ATOMIC);
+	if (!skb) {
+		dev->stats.tx_dropped++;
+		return;
+	}
+
+	if (!is_broadcast_ether_addr(addr))
+		memcpy(eth_hdr(skb)->h_dest, addr, ETH_ALEN);
+
+	__br_forward(p, skb, local_orig);
+}
+
 /* called under rcu_read_lock */
 void br_flood(struct net_bridge *br, struct sk_buff *skb,
 	      enum br_pkt_type pkt_type, bool local_rcv, bool local_orig)
@@ -241,10 +266,20 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
 		rport = rp ? hlist_entry(rp, struct net_bridge_port, rlist) :
 			     NULL;
 
-		port = (unsigned long)lport > (unsigned long)rport ?
-		       lport : rport;
+		if ((unsigned long)lport > (unsigned long)rport) {
+			port = lport;
+
+			if (port->flags & BR_MULTICAST_TO_UNICAST) {
+				maybe_deliver_addr(lport, skb, p->eth_addr,
+						   local_orig);
+				goto delivered;
+			}
+		} else {
+			port = rport;
+		}
 
 		prev = maybe_deliver(prev, port, skb, local_orig);
+delivered:
 		if (IS_ERR(prev))
 			goto out;
 		if (prev == port)
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 7dbc80d01eb0..056e6ac49d8f 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -531,7 +531,7 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
 			break;
 	}
 
-	p = br_multicast_new_port_group(port, group, *pp, state);
+	p = br_multicast_new_port_group(port, group, *pp, state, NULL);
 	if (unlikely(!p))
 		return -ENOMEM;
 	rcu_assign_pointer(*pp, p);
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index f66346122dc4..1de3438e36bf 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -43,12 +43,14 @@ static void br_multicast_add_router(struct net_bridge *br,
 static void br_ip4_multicast_leave_group(struct net_bridge *br,
 					 struct net_bridge_port *port,
 					 __be32 group,
-					 __u16 vid);
+					 __u16 vid,
+					 const unsigned char *src);
+
 #if IS_ENABLED(CONFIG_IPV6)
 static void br_ip6_multicast_leave_group(struct net_bridge *br,
 					 struct net_bridge_port *port,
 					 const struct in6_addr *group,
-					 __u16 vid);
+					 __u16 vid, const unsigned char *src);
 #endif
 unsigned int br_mdb_rehash_seq;
 
@@ -711,7 +713,8 @@ struct net_bridge_port_group *br_multicast_new_port_group(
 			struct net_bridge_port *port,
 			struct br_ip *group,
 			struct net_bridge_port_group __rcu *next,
-			unsigned char flags)
+			unsigned char flags,
+			const unsigned char *src)
 {
 	struct net_bridge_port_group *p;
 
@@ -726,12 +729,32 @@ struct net_bridge_port_group *br_multicast_new_port_group(
 	hlist_add_head(&p->mglist, &port->mglist);
 	setup_timer(&p->timer, br_multicast_port_group_expired,
 		    (unsigned long)p);
+
+	if (src)
+		memcpy(p->eth_addr, src, ETH_ALEN);
+	else
+		memset(p->eth_addr, 0xff, ETH_ALEN);
+
 	return p;
 }
 
+static bool br_port_group_equal(struct net_bridge_port_group *p,
+				struct net_bridge_port *port,
+				const unsigned char *src)
+{
+	if (p->port != port)
+		return false;
+
+	if (!(port->flags & BR_MULTICAST_TO_UNICAST))
+		return true;
+
+	return ether_addr_equal(src, p->eth_addr);
+}
+
 static int br_multicast_add_group(struct net_bridge *br,
 				  struct net_bridge_port *port,
-				  struct br_ip *group)
+				  struct br_ip *group,
+				  const unsigned char *src)
 {
 	struct net_bridge_port_group __rcu **pp;
 	struct net_bridge_port_group *p;
@@ -758,13 +781,13 @@ static int br_multicast_add_group(struct net_bridge *br,
 	for (pp = &mp->ports;
 	     (p = mlock_dereference(*pp, br)) != NULL;
 	     pp = &p->next) {
-		if (p->port == port)
+		if (br_port_group_equal(p, port, src))
 			goto found;
 		if ((unsigned long)p->port < (unsigned long)port)
 			break;
 	}
 
-	p = br_multicast_new_port_group(port, group, *pp, 0);
+	p = br_multicast_new_port_group(port, group, *pp, 0, src);
 	if (unlikely(!p))
 		goto err;
 	rcu_assign_pointer(*pp, p);
@@ -783,7 +806,8 @@ err:
 static int br_ip4_multicast_add_group(struct net_bridge *br,
 				      struct net_bridge_port *port,
 				      __be32 group,
-				      __u16 vid)
+				      __u16 vid,
+				      const unsigned char *src)
 {
 	struct br_ip br_group;
 
@@ -794,14 +818,15 @@ static int br_ip4_multicast_add_group(struct net_bridge *br,
 	br_group.proto = htons(ETH_P_IP);
 	br_group.vid = vid;
 
-	return br_multicast_add_group(br, port, &br_group);
+	return br_multicast_add_group(br, port, &br_group, src);
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
 static int br_ip6_multicast_add_group(struct net_bridge *br,
 				      struct net_bridge_port *port,
 				      const struct in6_addr *group,
-				      __u16 vid)
+				      __u16 vid,
+				      const unsigned char *src)
 {
 	struct br_ip br_group;
 
@@ -812,7 +837,7 @@ static int br_ip6_multicast_add_group(struct net_bridge *br,
 	br_group.proto = htons(ETH_P_IPV6);
 	br_group.vid = vid;
 
-	return br_multicast_add_group(br, port, &br_group);
+	return br_multicast_add_group(br, port, &br_group, src);
 }
 #endif
 
@@ -1081,6 +1106,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
 					 struct sk_buff *skb,
 					 u16 vid)
 {
+	const unsigned char *src;
 	struct igmpv3_report *ih;
 	struct igmpv3_grec *grec;
 	int i;
@@ -1121,12 +1147,14 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
 			continue;
 		}
 
+		src = eth_hdr(skb)->h_source;
 		if ((type == IGMPV3_CHANGE_TO_INCLUDE ||
 		     type == IGMPV3_MODE_IS_INCLUDE) &&
 		    ntohs(grec->grec_nsrcs) == 0) {
-			br_ip4_multicast_leave_group(br, port, group, vid);
+			br_ip4_multicast_leave_group(br, port, group, vid, src);
 		} else {
-			err = br_ip4_multicast_add_group(br, port, group, vid);
+			err = br_ip4_multicast_add_group(br, port, group, vid,
+							 src);
 			if (err)
 				break;
 		}
@@ -1141,6 +1169,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
 					struct sk_buff *skb,
 					u16 vid)
 {
+	const unsigned char *src;
 	struct icmp6hdr *icmp6h;
 	struct mld2_grec *grec;
 	int i;
@@ -1188,14 +1217,16 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
 			continue;
 		}
 
+		src = eth_hdr(skb)->h_source;
 		if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE ||
 		     grec->grec_type == MLD2_MODE_IS_INCLUDE) &&
 		    ntohs(*nsrcs) == 0) {
 			br_ip6_multicast_leave_group(br, port, &grec->grec_mca,
-						     vid);
+						     vid, src);
 		} else {
 			err = br_ip6_multicast_add_group(br, port,
-							 &grec->grec_mca, vid);
+							 &grec->grec_mca, vid,
+							 src);
 			if (err)
 				break;
 		}
@@ -1511,7 +1542,8 @@ br_multicast_leave_group(struct net_bridge *br,
 			 struct net_bridge_port *port,
 			 struct br_ip *group,
 			 struct bridge_mcast_other_query *other_query,
-			 struct bridge_mcast_own_query *own_query)
+			 struct bridge_mcast_own_query *own_query,
+			 const unsigned char *src)
 {
 	struct net_bridge_mdb_htable *mdb;
 	struct net_bridge_mdb_entry *mp;
@@ -1535,7 +1567,7 @@ br_multicast_leave_group(struct net_bridge *br,
 		for (pp = &mp->ports;
 		     (p = mlock_dereference(*pp, br)) != NULL;
 		     pp = &p->next) {
-			if (p->port != port)
+			if (!br_port_group_equal(p, port, src))
 				continue;
 
 			rcu_assign_pointer(*pp, p->next);
@@ -1566,7 +1598,7 @@ br_multicast_leave_group(struct net_bridge *br,
 		for (p = mlock_dereference(mp->ports, br);
 		     p != NULL;
 		     p = mlock_dereference(p->next, br)) {
-			if (p->port != port)
+			if (!br_port_group_equal(p, port, src))
 				continue;
 
 			if (!hlist_unhashed(&p->mglist) &&
@@ -1617,7 +1649,8 @@ out:
 static void br_ip4_multicast_leave_group(struct net_bridge *br,
 					 struct net_bridge_port *port,
 					 __be32 group,
-					 __u16 vid)
+					 __u16 vid,
+					 const unsigned char *src)
 {
 	struct br_ip br_group;
 	struct bridge_mcast_own_query *own_query;
@@ -1632,14 +1665,15 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br,
 	br_group.vid = vid;
 
 	br_multicast_leave_group(br, port, &br_group, &br->ip4_other_query,
-				 own_query);
+				 own_query, src);
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
 static void br_ip6_multicast_leave_group(struct net_bridge *br,
 					 struct net_bridge_port *port,
 					 const struct in6_addr *group,
-					 __u16 vid)
+					 __u16 vid,
+					 const unsigned char *src)
 {
 	struct br_ip br_group;
 	struct bridge_mcast_own_query *own_query;
@@ -1654,7 +1688,7 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br,
 	br_group.vid = vid;
 
 	br_multicast_leave_group(br, port, &br_group, &br->ip6_other_query,
-				 own_query);
+				 own_query, src);
 }
 #endif
 
@@ -1712,6 +1746,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
 				 u16 vid)
 {
 	struct sk_buff *skb_trimmed = NULL;
+	const unsigned char *src;
 	struct igmphdr *ih;
 	int err;
 
@@ -1731,13 +1766,14 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
 	}
 
 	ih = igmp_hdr(skb);
+	src = eth_hdr(skb)->h_source;
 	BR_INPUT_SKB_CB(skb)->igmp = ih->type;
 
 	switch (ih->type) {
 	case IGMP_HOST_MEMBERSHIP_REPORT:
 	case IGMPV2_HOST_MEMBERSHIP_REPORT:
 		BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
-		err = br_ip4_multicast_add_group(br, port, ih->group, vid);
+		err = br_ip4_multicast_add_group(br, port, ih->group, vid, src);
 		break;
 	case IGMPV3_HOST_MEMBERSHIP_REPORT:
 		err = br_ip4_multicast_igmp3_report(br, port, skb_trimmed, vid);
@@ -1746,7 +1782,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
 		err = br_ip4_multicast_query(br, port, skb_trimmed, vid);
 		break;
 	case IGMP_HOST_LEAVE_MESSAGE:
-		br_ip4_multicast_leave_group(br, port, ih->group, vid);
+		br_ip4_multicast_leave_group(br, port, ih->group, vid, src);
 		break;
 	}
 
@@ -1766,6 +1802,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
 				 u16 vid)
 {
 	struct sk_buff *skb_trimmed = NULL;
+	const unsigned char *src;
 	struct mld_msg *mld;
 	int err;
 
@@ -1785,8 +1822,10 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
 
 	switch (mld->mld_type) {
 	case ICMPV6_MGM_REPORT:
+		src = eth_hdr(skb)->h_source;
 		BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
-		err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid);
+		err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid,
+						 src);
 		break;
 	case ICMPV6_MLD2_REPORT:
 		err = br_ip6_multicast_mld2_report(br, port, skb_trimmed, vid);
@@ -1795,7 +1834,8 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
 		err = br_ip6_multicast_query(br, port, skb_trimmed, vid);
 		break;
 	case ICMPV6_MGM_REDUCTION:
-		br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid);
+		src = eth_hdr(skb)->h_source;
+		br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid, src);
 		break;
 	}
 
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 71c7453268c1..6c087cd049b9 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -123,6 +123,7 @@ static inline size_t br_port_info_size(void)
 		+ nla_total_size(1)	/* IFLA_BRPORT_GUARD */
 		+ nla_total_size(1)	/* IFLA_BRPORT_PROTECT */
 		+ nla_total_size(1)	/* IFLA_BRPORT_FAST_LEAVE */
+		+ nla_total_size(1)	/* IFLA_BRPORT_MCAST_TO_UCAST */
 		+ nla_total_size(1)	/* IFLA_BRPORT_LEARNING */
 		+ nla_total_size(1)	/* IFLA_BRPORT_UNICAST_FLOOD */
 		+ nla_total_size(1)	/* IFLA_BRPORT_PROXYARP */
@@ -173,6 +174,8 @@ static int br_port_fill_attrs(struct sk_buff *skb,
 		       !!(p->flags & BR_ROOT_BLOCK)) ||
 	    nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE,
 		       !!(p->flags & BR_MULTICAST_FAST_LEAVE)) ||
+	    nla_put_u8(skb, IFLA_BRPORT_MCAST_TO_UCAST,
+		       !!(p->flags & BR_MULTICAST_TO_UNICAST)) ||
 	    nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) ||
 	    nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD,
 		       !!(p->flags & BR_FLOOD)) ||
@@ -586,6 +589,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
 	[IFLA_BRPORT_PROXYARP]	= { .type = NLA_U8 },
 	[IFLA_BRPORT_PROXYARP_WIFI] = { .type = NLA_U8 },
 	[IFLA_BRPORT_MULTICAST_ROUTER] = { .type = NLA_U8 },
+	[IFLA_BRPORT_MCAST_TO_UCAST] = { .type = NLA_U8 },
 };
 
 /* Change the state of the port and notify spanning tree */
@@ -636,6 +640,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
 	br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING);
 	br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD);
 	br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD);
+	br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_TO_UCAST, BR_MULTICAST_TO_UNICAST);
 	br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP);
 	br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI);
 
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 8ce621e8345c..0b82a227fc34 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -177,6 +177,7 @@ struct net_bridge_port_group {
 	struct timer_list		timer;
 	struct br_ip			addr;
 	unsigned char			flags;
+	unsigned char			eth_addr[ETH_ALEN];
 };
 
 struct net_bridge_mdb_entry
@@ -599,7 +600,7 @@ void br_multicast_free_pg(struct rcu_head *head);
 struct net_bridge_port_group *
 br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group,
 			    struct net_bridge_port_group __rcu *next,
-			    unsigned char flags);
+			    unsigned char flags, const unsigned char *src);
 void br_mdb_init(void);
 void br_mdb_uninit(void);
 void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index 8bd569695e76..05e8946ccc03 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -188,6 +188,7 @@ static BRPORT_ATTR(multicast_router, S_IRUGO | S_IWUSR, show_multicast_router,
 		   store_multicast_router);
 
 BRPORT_ATTR_FLAG(multicast_fast_leave, BR_MULTICAST_FAST_LEAVE);
+BRPORT_ATTR_FLAG(multicast_to_unicast, BR_MULTICAST_TO_UNICAST);
 #endif
 
 static const struct brport_attribute *brport_attrs[] = {
@@ -214,6 +215,7 @@ static const struct brport_attribute *brport_attrs[] = {
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	&brport_attr_multicast_router,
 	&brport_attr_multicast_fast_leave,
+	&brport_attr_multicast_to_unicast,
 #endif
 	&brport_attr_proxyarp,
 	&brport_attr_proxyarp_wifi,
-- 
cgit v1.2.3


From b70f43a16182c7dbc0779bc5bd9f621711f13eb3 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Sun, 22 Jan 2017 21:17:32 -0800
Subject: net: phy: Fix typo for MDIO module boilerplate comment

The module boilerplate macro is named mdio_module_driver and not
module_mdio_driver, fix that.

Fixes: a9049e0c513c ("mdio: Add support for mdio drivers.")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mdio.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mdio.h b/include/linux/mdio.h
index b6587a4b32e7..55a80d73cfc1 100644
--- a/include/linux/mdio.h
+++ b/include/linux/mdio.h
@@ -265,7 +265,7 @@ bool mdiobus_is_registered_device(struct mii_bus *bus, int addr);
 struct phy_device *mdiobus_get_phy(struct mii_bus *bus, int addr);
 
 /**
- * module_mdio_driver() - Helper macro for registering mdio drivers
+ * mdio_module_driver() - Helper macro for registering mdio drivers
  *
  * Helper macro for MDIO drivers which do not do anything special in module
  * init/exit. Each module may only use this macro once, and calling it
-- 
cgit v1.2.3


From c9497c98901c689bf6c357f812bf864ed8f50ace Mon Sep 17 00:00:00 2001
From: Mohamad Haj Yahia <mohamad@mellanox.com>
Date: Thu, 15 Dec 2016 14:02:53 +0200
Subject: net/mlx5: Add support for setting VF min rate

Add support for SRIOV VF min rate guarantee by using the TSAR BW share
weights mechanism.

The TSAR BW share vport attribute represents the weight of that vport
among the other vports weights which means that the actual vport BW
percentage is the same vport weight percentage among the total vports
weights sum.

Signed-off-by: Mohamad Haj Yahia <mohamad@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |  5 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 97 +++++++++++++++++++++--
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 11 ++-
 include/linux/mlx5/mlx5_ifc.h                     |  4 +-
 4 files changed, 104 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 3a06c81ef85e..c819d07fbdb3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3021,11 +3021,8 @@ static int mlx5e_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate,
 	struct mlx5e_priv *priv = netdev_priv(dev);
 	struct mlx5_core_dev *mdev = priv->mdev;
 
-	if (min_tx_rate)
-		return -EOPNOTSUPP;
-
 	return mlx5_eswitch_set_vport_rate(mdev->priv.eswitch, vf + 1,
-					   max_tx_rate);
+					   max_tx_rate, min_tx_rate);
 }
 
 static int mlx5_vport_link2ifla(u8 esw_link)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 4b3b60be319d..efa1a7a76d8a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1415,7 +1415,7 @@ static void esw_destroy_tsar(struct mlx5_eswitch *esw)
 }
 
 static int esw_vport_enable_qos(struct mlx5_eswitch *esw, int vport_num,
-				u32 initial_max_rate)
+				u32 initial_max_rate, u32 initial_bw_share)
 {
 	u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {0};
 	struct mlx5_vport *vport = &esw->vports[vport_num];
@@ -1439,6 +1439,7 @@ static int esw_vport_enable_qos(struct mlx5_eswitch *esw, int vport_num,
 		 esw->qos.root_tsar_id);
 	MLX5_SET(scheduling_context, &sched_ctx, max_average_bw,
 		 initial_max_rate);
+	MLX5_SET(scheduling_context, &sched_ctx, bw_share, initial_bw_share);
 
 	err = mlx5_create_scheduling_element_cmd(dev,
 						 SCHEDULING_HIERARCHY_E_SWITCH,
@@ -1473,7 +1474,7 @@ static void esw_vport_disable_qos(struct mlx5_eswitch *esw, int vport_num)
 }
 
 static int esw_vport_qos_config(struct mlx5_eswitch *esw, int vport_num,
-				u32 max_rate)
+				u32 max_rate, u32 bw_share)
 {
 	u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {0};
 	struct mlx5_vport *vport = &esw->vports[vport_num];
@@ -1497,7 +1498,9 @@ static int esw_vport_qos_config(struct mlx5_eswitch *esw, int vport_num,
 		 esw->qos.root_tsar_id);
 	MLX5_SET(scheduling_context, &sched_ctx, max_average_bw,
 		 max_rate);
+	MLX5_SET(scheduling_context, &sched_ctx, bw_share, bw_share);
 	bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_MAX_AVERAGE_BW;
+	bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_BW_SHARE;
 
 	err = mlx5_modify_scheduling_element_cmd(dev,
 						 SCHEDULING_HIERARCHY_E_SWITCH,
@@ -1563,7 +1566,8 @@ static void esw_enable_vport(struct mlx5_eswitch *esw, int vport_num,
 	esw_apply_vport_conf(esw, vport);
 
 	/* Attach vport to the eswitch rate limiter */
-	if (esw_vport_enable_qos(esw, vport_num, vport->info.max_rate))
+	if (esw_vport_enable_qos(esw, vport_num, vport->info.max_rate,
+				 vport->qos.bw_share))
 		esw_warn(esw->dev, "Failed to attach vport %d to eswitch rate limiter", vport_num);
 
 	/* Sync with current vport context */
@@ -1952,6 +1956,7 @@ int mlx5_eswitch_get_vport_config(struct mlx5_eswitch *esw,
 	ivi->qos = evport->info.qos;
 	ivi->spoofchk = evport->info.spoofchk;
 	ivi->trusted = evport->info.trusted;
+	ivi->min_tx_rate = evport->info.min_rate;
 	ivi->max_tx_rate = evport->info.max_rate;
 	mutex_unlock(&esw->state_lock);
 
@@ -2046,23 +2051,103 @@ int mlx5_eswitch_set_vport_trust(struct mlx5_eswitch *esw,
 	return 0;
 }
 
-int mlx5_eswitch_set_vport_rate(struct mlx5_eswitch *esw,
-				int vport, u32 max_rate)
+static u32 calculate_vports_min_rate_divider(struct mlx5_eswitch *esw)
 {
+	u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share);
 	struct mlx5_vport *evport;
+	u32 max_guarantee = 0;
+	int i;
+
+	for (i = 0; i <= esw->total_vports; i++) {
+		evport = &esw->vports[i];
+		if (!evport->enabled || evport->info.min_rate < max_guarantee)
+			continue;
+		max_guarantee = evport->info.min_rate;
+	}
+
+	return max_t(u32, max_guarantee / fw_max_bw_share, 1);
+}
+
+static int normalize_vports_min_rate(struct mlx5_eswitch *esw, u32 divider)
+{
+	u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share);
+	struct mlx5_vport *evport;
+	u32 vport_max_rate;
+	u32 vport_min_rate;
+	u32 bw_share;
+	int err;
+	int i;
+
+	for (i = 0; i <= esw->total_vports; i++) {
+		evport = &esw->vports[i];
+		if (!evport->enabled)
+			continue;
+		vport_min_rate = evport->info.min_rate;
+		vport_max_rate = evport->info.max_rate;
+		bw_share = MLX5_MIN_BW_SHARE;
+
+		if (vport_min_rate)
+			bw_share = MLX5_RATE_TO_BW_SHARE(vport_min_rate,
+							 divider,
+							 fw_max_bw_share);
+
+		if (bw_share == evport->qos.bw_share)
+			continue;
+
+		err = esw_vport_qos_config(esw, i, vport_max_rate,
+					   bw_share);
+		if (!err)
+			evport->qos.bw_share = bw_share;
+		else
+			return err;
+	}
+
+	return 0;
+}
+
+int mlx5_eswitch_set_vport_rate(struct mlx5_eswitch *esw, int vport,
+				u32 max_rate, u32 min_rate)
+{
+	u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share);
+	bool min_rate_supported = MLX5_CAP_QOS(esw->dev, esw_bw_share) &&
+					fw_max_bw_share >= MLX5_MIN_BW_SHARE;
+	bool max_rate_supported = MLX5_CAP_QOS(esw->dev, esw_rate_limit);
+	struct mlx5_vport *evport;
+	u32 previous_min_rate;
+	u32 divider;
 	int err = 0;
 
 	if (!ESW_ALLOWED(esw))
 		return -EPERM;
 	if (!LEGAL_VPORT(esw, vport))
 		return -EINVAL;
+	if ((min_rate && !min_rate_supported) || (max_rate && !max_rate_supported))
+		return -EOPNOTSUPP;
 
 	mutex_lock(&esw->state_lock);
 	evport = &esw->vports[vport];
-	err = esw_vport_qos_config(esw, vport, max_rate);
+
+	if (min_rate == evport->info.min_rate)
+		goto set_max_rate;
+
+	previous_min_rate = evport->info.min_rate;
+	evport->info.min_rate = min_rate;
+	divider = calculate_vports_min_rate_divider(esw);
+	err = normalize_vports_min_rate(esw, divider);
+	if (err) {
+		evport->info.min_rate = previous_min_rate;
+		goto unlock;
+	}
+
+set_max_rate:
+	if (max_rate == evport->info.max_rate)
+		goto unlock;
+
+	err = esw_vport_qos_config(esw, vport, max_rate, evport->qos.bw_share);
 	if (!err)
 		evport->info.max_rate = max_rate;
 
+unlock:
 	mutex_unlock(&esw->state_lock);
 	return err;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index cea1660d59ac..5b78883d5654 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -50,6 +50,11 @@
 
 #define FDB_UPLINK_VPORT 0xffff
 
+#define MLX5_MIN_BW_SHARE 1
+
+#define MLX5_RATE_TO_BW_SHARE(rate, divider, limit) \
+	min_t(u32, max_t(u32, (rate) / (divider), MLX5_MIN_BW_SHARE), limit)
+
 /* L2 -mac address based- hash helpers */
 struct l2addr_node {
 	struct hlist_node hlist;
@@ -116,6 +121,7 @@ struct mlx5_vport_info {
 	u8                      qos;
 	u64                     node_guid;
 	int                     link_state;
+	u32                     min_rate;
 	u32                     max_rate;
 	bool                    spoofchk;
 	bool                    trusted;
@@ -138,6 +144,7 @@ struct mlx5_vport {
 	struct {
 		bool            enabled;
 		u32             esw_tsar_ix;
+		u32             bw_share;
 	} qos;
 
 	bool                    enabled;
@@ -249,8 +256,8 @@ int mlx5_eswitch_set_vport_spoofchk(struct mlx5_eswitch *esw,
 				    int vport, bool spoofchk);
 int mlx5_eswitch_set_vport_trust(struct mlx5_eswitch *esw,
 				 int vport_num, bool setting);
-int mlx5_eswitch_set_vport_rate(struct mlx5_eswitch *esw,
-				int vport, u32 max_rate);
+int mlx5_eswitch_set_vport_rate(struct mlx5_eswitch *esw, int vport,
+				u32 max_rate, u32 min_rate);
 int mlx5_eswitch_get_vport_config(struct mlx5_eswitch *esw,
 				  int vport, struct ifla_vf_info *ivi);
 int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index d96ebc319d63..a919dfb920ae 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -547,7 +547,9 @@ struct mlx5_ifc_e_switch_cap_bits {
 struct mlx5_ifc_qos_cap_bits {
 	u8         packet_pacing[0x1];
 	u8         esw_scheduling[0x1];
-	u8         reserved_at_2[0x1e];
+	u8         esw_bw_share[0x1];
+	u8         esw_rate_limit[0x1];
+	u8         reserved_at_4[0x1c];
 
 	u8         reserved_at_20[0x20];
 
-- 
cgit v1.2.3


From 8c7245a60ef8f8c4a427349690c5a141cfed6217 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Wed, 30 Nov 2016 20:23:51 +0200
Subject: net/mlx5: Push min-inline mode resolution helper into the core

So we can use that from the IB driver too in downstream patches.

This patch doesn't change any functionality.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 18 +-----------------
 drivers/net/ethernet/mellanox/mlx5/core/vport.c   | 17 +++++++++++++++++
 include/linux/mlx5/vport.h                        |  1 +
 3 files changed, 19 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index c819d07fbdb3..636372873e64 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3458,22 +3458,6 @@ void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode)
 			MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE;
 }
 
-static void mlx5e_query_min_inline(struct mlx5_core_dev *mdev,
-				   u8 *min_inline_mode)
-{
-	switch (MLX5_CAP_ETH(mdev, wqe_inline_mode)) {
-	case MLX5_CAP_INLINE_MODE_L2:
-		*min_inline_mode = MLX5_INLINE_MODE_L2;
-		break;
-	case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT:
-		mlx5_query_nic_vport_min_inline(mdev, 0, min_inline_mode);
-		break;
-	case MLX5_CAP_INLINE_MODE_NOT_REQUIRED:
-		*min_inline_mode = MLX5_INLINE_MODE_NONE;
-		break;
-	}
-}
-
 u32 mlx5e_choose_lro_timeout(struct mlx5_core_dev *mdev, u32 wanted_timeout)
 {
 	int i;
@@ -3533,7 +3517,7 @@ static void mlx5e_build_nic_netdev_priv(struct mlx5_core_dev *mdev,
 	priv->params.tx_cq_moderation.pkts =
 		MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS;
 	priv->params.tx_max_inline         = mlx5e_get_max_inline_cap(mdev);
-	mlx5e_query_min_inline(mdev, &priv->params.tx_min_inline_mode);
+	mlx5_query_min_inline(mdev, &priv->params.tx_min_inline_mode);
 	priv->params.num_tc                = 1;
 	priv->params.rss_hfunc             = ETH_RSS_HASH_XOR;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index 269e4401c342..b49cfc895c10 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -127,6 +127,23 @@ int mlx5_query_nic_vport_min_inline(struct mlx5_core_dev *mdev,
 }
 EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_min_inline);
 
+void mlx5_query_min_inline(struct mlx5_core_dev *mdev,
+			   u8 *min_inline_mode)
+{
+	switch (MLX5_CAP_ETH(mdev, wqe_inline_mode)) {
+	case MLX5_CAP_INLINE_MODE_L2:
+		*min_inline_mode = MLX5_INLINE_MODE_L2;
+		break;
+	case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT:
+		mlx5_query_nic_vport_min_inline(mdev, 0, min_inline_mode);
+		break;
+	case MLX5_CAP_INLINE_MODE_NOT_REQUIRED:
+		*min_inline_mode = MLX5_INLINE_MODE_NONE;
+		break;
+	}
+}
+EXPORT_SYMBOL_GPL(mlx5_query_min_inline);
+
 int mlx5_modify_nic_vport_min_inline(struct mlx5_core_dev *mdev,
 				     u16 vport, u8 min_inline)
 {
diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
index ec35157ea725..656c70b65dd2 100644
--- a/include/linux/mlx5/vport.h
+++ b/include/linux/mlx5/vport.h
@@ -51,6 +51,7 @@ int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev,
 				     u16 vport, u8 *addr);
 int mlx5_query_nic_vport_min_inline(struct mlx5_core_dev *mdev,
 				    u16 vport, u8 *min_inline);
+void mlx5_query_min_inline(struct mlx5_core_dev *mdev, u8 *min_inline);
 int mlx5_modify_nic_vport_min_inline(struct mlx5_core_dev *mdev,
 				     u16 vport, u8 min_inline);
 int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *dev,
-- 
cgit v1.2.3


From 08d62f58aa2587132a930afbe8664379b430e2dd Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 17 Jan 2017 13:57:26 +0200
Subject: dmaengine: dw: register IRQ and DMA pool with instance ID

It is really useful not only for debugging to have an IRQ line and DMA
pool labeled with driver and its instance ID. Do this for DesignWare DMA
driver.

All current users of this IP would be enhanced later on.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/dw/core.c     | 7 +++++--
 drivers/dma/dw/pci.c      | 1 +
 drivers/dma/dw/platform.c | 1 +
 drivers/dma/dw/regs.h     | 1 +
 include/linux/dma/dw.h    | 2 ++
 5 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c
index 45bb608a1b7c..87516246c445 100644
--- a/drivers/dma/dw/core.c
+++ b/drivers/dma/dw/core.c
@@ -1502,8 +1502,11 @@ int dw_dma_probe(struct dw_dma_chip *chip)
 	/* Force dma off, just in case */
 	dw_dma_off(dw);
 
+	/* Device and instance ID for IRQ and DMA pool */
+	snprintf(dw->name, sizeof(dw->name), "dw:dmac%d", chip->id);
+
 	/* Create a pool of consistent memory blocks for hardware descriptors */
-	dw->desc_pool = dmam_pool_create("dw_dmac_desc_pool", chip->dev,
+	dw->desc_pool = dmam_pool_create(dw->name, chip->dev,
 					 sizeof(struct dw_desc), 4, 0);
 	if (!dw->desc_pool) {
 		dev_err(chip->dev, "No memory for descriptors dma pool\n");
@@ -1514,7 +1517,7 @@ int dw_dma_probe(struct dw_dma_chip *chip)
 	tasklet_init(&dw->tasklet, dw_dma_tasklet, (unsigned long)dw);
 
 	err = request_irq(chip->irq, dw_dma_interrupt, IRQF_SHARED,
-			  "dw_dmac", dw);
+			  dw->name, dw);
 	if (err)
 		goto err_pdata;
 
diff --git a/drivers/dma/dw/pci.c b/drivers/dma/dw/pci.c
index 7558a9c38fda..47194372f738 100644
--- a/drivers/dma/dw/pci.c
+++ b/drivers/dma/dw/pci.c
@@ -47,6 +47,7 @@ static int dw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *pid)
 		return -ENOMEM;
 
 	chip->dev = &pdev->dev;
+	chip->id = pdev->devfn;
 	chip->regs = pcim_iomap_table(pdev)[0];
 	chip->irq = pdev->irq;
 	chip->pdata = pdata;
diff --git a/drivers/dma/dw/platform.c b/drivers/dma/dw/platform.c
index b1655e40cfa2..c639c60b825a 100644
--- a/drivers/dma/dw/platform.c
+++ b/drivers/dma/dw/platform.c
@@ -202,6 +202,7 @@ static int dw_probe(struct platform_device *pdev)
 		pdata = dw_dma_parse_dt(pdev);
 
 	chip->dev = dev;
+	chip->id = pdev->id;
 	chip->pdata = pdata;
 
 	chip->clk = devm_clk_get(chip->dev, "hclk");
diff --git a/drivers/dma/dw/regs.h b/drivers/dma/dw/regs.h
index 4e0128c62704..6087eba2c275 100644
--- a/drivers/dma/dw/regs.h
+++ b/drivers/dma/dw/regs.h
@@ -270,6 +270,7 @@ static inline struct dw_dma_chan *to_dw_dma_chan(struct dma_chan *chan)
 
 struct dw_dma {
 	struct dma_device	dma;
+	char			name[20];
 	void __iomem		*regs;
 	struct dma_pool		*desc_pool;
 	struct tasklet_struct	tasklet;
diff --git a/include/linux/dma/dw.h b/include/linux/dma/dw.h
index ccfd0c3777df..b63b25814d77 100644
--- a/include/linux/dma/dw.h
+++ b/include/linux/dma/dw.h
@@ -23,6 +23,7 @@ struct dw_dma;
 /**
  * struct dw_dma_chip - representation of DesignWare DMA controller hardware
  * @dev:		struct device of the DMA controller
+ * @id:			instance ID
  * @irq:		irq line
  * @regs:		memory mapped I/O space
  * @clk:		hclk clock
@@ -31,6 +32,7 @@ struct dw_dma;
  */
 struct dw_dma_chip {
 	struct device	*dev;
+	int		id;
 	int		irq;
 	void __iomem	*regs;
 	struct clk	*clk;
-- 
cgit v1.2.3


From 199244d69458770770890f8b5988a1b6cad701ad Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 17 Jan 2017 13:57:31 +0200
Subject: dmaengine: dw: add support of iDMA 32-bit hardware

iDMA 32-bit is Intel designed DMA controller that behaves like Synopsys
Designware DMA. This patch adds a support of the new Intel hardware.

Due to iDMA 32-bit has no autoconfiguration the platform code must
provide a platform data to dw_dma_probe().

By default full FIFO (1024 bytes) is assigned to channel 0. Here we
slice FIFO on equal parts between channels for iDMA 32-bit case.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/dw/core.c                | 101 +++++++++++++++++++++++++++++++----
 include/linux/platform_data/dma-dw.h |   2 +
 2 files changed, 94 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c
index 0186d6732294..e500950dad82 100644
--- a/drivers/dma/dw/core.c
+++ b/drivers/dma/dw/core.c
@@ -138,16 +138,32 @@ static void dwc_desc_put(struct dw_dma_chan *dwc, struct dw_desc *desc)
 	dwc->descs_allocated--;
 }
 
-static void dwc_initialize(struct dw_dma_chan *dwc)
+static void dwc_initialize_chan_idma32(struct dw_dma_chan *dwc)
+{
+	u32 cfghi = 0;
+	u32 cfglo = 0;
+
+	/* Set default burst alignment */
+	cfglo |= IDMA32C_CFGL_DST_BURST_ALIGN | IDMA32C_CFGL_SRC_BURST_ALIGN;
+
+	/* Low 4 bits of the request lines */
+	cfghi |= IDMA32C_CFGH_DST_PER(dwc->dws.dst_id & 0xf);
+	cfghi |= IDMA32C_CFGH_SRC_PER(dwc->dws.src_id & 0xf);
+
+	/* Request line extension (2 bits) */
+	cfghi |= IDMA32C_CFGH_DST_PER_EXT(dwc->dws.dst_id >> 4 & 0x3);
+	cfghi |= IDMA32C_CFGH_SRC_PER_EXT(dwc->dws.src_id >> 4 & 0x3);
+
+	channel_writel(dwc, CFG_LO, cfglo);
+	channel_writel(dwc, CFG_HI, cfghi);
+}
+
+static void dwc_initialize_chan_dw(struct dw_dma_chan *dwc)
 {
-	struct dw_dma *dw = to_dw_dma(dwc->chan.device);
 	u32 cfghi = DWC_CFGH_FIFO_MODE;
 	u32 cfglo = DWC_CFGL_CH_PRIOR(dwc->priority);
 	bool hs_polarity = dwc->dws.hs_polarity;
 
-	if (test_bit(DW_DMA_IS_INITIALIZED, &dwc->flags))
-		return;
-
 	cfghi |= DWC_CFGH_DST_PER(dwc->dws.dst_id);
 	cfghi |= DWC_CFGH_SRC_PER(dwc->dws.src_id);
 
@@ -156,6 +172,19 @@ static void dwc_initialize(struct dw_dma_chan *dwc)
 
 	channel_writel(dwc, CFG_LO, cfglo);
 	channel_writel(dwc, CFG_HI, cfghi);
+}
+
+static void dwc_initialize(struct dw_dma_chan *dwc)
+{
+	struct dw_dma *dw = to_dw_dma(dwc->chan.device);
+
+	if (test_bit(DW_DMA_IS_INITIALIZED, &dwc->flags))
+		return;
+
+	if (dw->pdata->is_idma32)
+		dwc_initialize_chan_idma32(dwc);
+	else
+		dwc_initialize_chan_dw(dwc);
 
 	/* Enable interrupts */
 	channel_set_bit(dw, MASK.XFER, dwc->mask);
@@ -187,8 +216,13 @@ static inline void dwc_chan_disable(struct dw_dma *dw, struct dw_dma_chan *dwc)
 static u32 bytes2block(struct dw_dma_chan *dwc, size_t bytes,
 			  unsigned int width, size_t *len)
 {
+	struct dw_dma *dw = to_dw_dma(dwc->chan.device);
 	u32 block;
 
+	/* Always in bytes for iDMA 32-bit */
+	if (dw->pdata->is_idma32)
+		width = 0;
+
 	if ((bytes >> width) > dwc->block_size) {
 		block = dwc->block_size;
 		*len = block << width;
@@ -202,6 +236,11 @@ static u32 bytes2block(struct dw_dma_chan *dwc, size_t bytes,
 
 static size_t block2bytes(struct dw_dma_chan *dwc, u32 block, u32 width)
 {
+	struct dw_dma *dw = to_dw_dma(dwc->chan.device);
+
+	if (dw->pdata->is_idma32)
+		return IDMA32C_CTLH_BLOCK_TS(block);
+
 	return DWC_CTLH_BLOCK_TS(block) << width;
 }
 
@@ -915,14 +954,16 @@ static int dwc_config(struct dma_chan *chan, struct dma_slave_config *sconfig)
 {
 	struct dw_dma_chan *dwc = to_dw_dma_chan(chan);
 	struct dma_slave_config *sc = &dwc->dma_sconfig;
+	struct dw_dma *dw = to_dw_dma(chan->device);
 	/*
 	 * Fix sconfig's burst size according to dw_dmac. We need to convert
 	 * them as:
 	 * 1 -> 0, 4 -> 1, 8 -> 2, 16 -> 3.
 	 *
 	 * NOTE: burst size 2 is not supported by DesignWare controller.
+	 *       iDMA 32-bit supports it.
 	 */
-	u32 s = 2;
+	u32 s = dw->pdata->is_idma32 ? 1 : 2;
 
 	/* Check if chan will be configured for slave transfers */
 	if (!is_slave_direction(sconfig->direction))
@@ -937,12 +978,19 @@ static int dwc_config(struct dma_chan *chan, struct dma_slave_config *sconfig)
 	return 0;
 }
 
-static void dwc_chan_pause(struct dw_dma_chan *dwc)
+static void dwc_chan_pause(struct dw_dma_chan *dwc, bool drain)
 {
+	struct dw_dma *dw = to_dw_dma(dwc->chan.device);
 	unsigned int		count = 20;	/* timeout iterations */
 	u32			cfglo;
 
 	cfglo = channel_readl(dwc, CFG_LO);
+	if (dw->pdata->is_idma32) {
+		if (drain)
+			cfglo |= IDMA32C_CFGL_CH_DRAIN;
+		else
+			cfglo &= ~IDMA32C_CFGL_CH_DRAIN;
+	}
 	channel_writel(dwc, CFG_LO, cfglo | DWC_CFGL_CH_SUSP);
 	while (!(channel_readl(dwc, CFG_LO) & DWC_CFGL_FIFO_EMPTY) && count--)
 		udelay(2);
@@ -956,7 +1004,7 @@ static int dwc_pause(struct dma_chan *chan)
 	unsigned long		flags;
 
 	spin_lock_irqsave(&dwc->lock, flags);
-	dwc_chan_pause(dwc);
+	dwc_chan_pause(dwc, false);
 	spin_unlock_irqrestore(&dwc->lock, flags);
 
 	return 0;
@@ -998,6 +1046,8 @@ static int dwc_terminate_all(struct dma_chan *chan)
 
 	clear_bit(DW_DMA_IS_SOFT_LLP, &dwc->flags);
 
+	dwc_chan_pause(dwc, true);
+
 	dwc_chan_disable(dw, dwc);
 
 	dwc_chan_resume(dwc);
@@ -1090,6 +1140,32 @@ static void dwc_issue_pending(struct dma_chan *chan)
 
 /*----------------------------------------------------------------------*/
 
+/*
+ * Program FIFO size of channels.
+ *
+ * By default full FIFO (1024 bytes) is assigned to channel 0. Here we
+ * slice FIFO on equal parts between channels.
+ */
+static void idma32_fifo_partition(struct dw_dma *dw)
+{
+	u64 value = IDMA32C_FP_PSIZE_CH0(128) | IDMA32C_FP_PSIZE_CH1(128) |
+		    IDMA32C_FP_UPDATE;
+	u64 fifo_partition = 0;
+
+	if (!dw->pdata->is_idma32)
+		return;
+
+	/* Fill FIFO_PARTITION low bits (Channels 0..1, 4..5) */
+	fifo_partition |= value << 0;
+
+	/* Fill FIFO_PARTITION high bits (Channels 2..3, 6..7) */
+	fifo_partition |= value << 32;
+
+	/* Program FIFO Partition registers - 128 bytes for each channel */
+	idma32_writeq(dw, FIFO_PARTITION1, fifo_partition);
+	idma32_writeq(dw, FIFO_PARTITION0, fifo_partition);
+}
+
 static void dw_dma_off(struct dw_dma *dw)
 {
 	unsigned int i;
@@ -1509,8 +1585,13 @@ int dw_dma_probe(struct dw_dma_chip *chip)
 	/* Force dma off, just in case */
 	dw_dma_off(dw);
 
+	idma32_fifo_partition(dw);
+
 	/* Device and instance ID for IRQ and DMA pool */
-	snprintf(dw->name, sizeof(dw->name), "dw:dmac%d", chip->id);
+	if (pdata->is_idma32)
+		snprintf(dw->name, sizeof(dw->name), "idma32:dmac%d", chip->id);
+	else
+		snprintf(dw->name, sizeof(dw->name), "dw:dmac%d", chip->id);
 
 	/* Create a pool of consistent memory blocks for hardware descriptors */
 	dw->desc_pool = dmam_pool_create(dw->name, chip->dev,
@@ -1673,6 +1754,8 @@ int dw_dma_enable(struct dw_dma_chip *chip)
 {
 	struct dw_dma *dw = chip->dw;
 
+	idma32_fifo_partition(dw);
+
 	dw_dma_on(dw);
 	return 0;
 }
diff --git a/include/linux/platform_data/dma-dw.h b/include/linux/platform_data/dma-dw.h
index e69e415d0d98..896cb71a382c 100644
--- a/include/linux/platform_data/dma-dw.h
+++ b/include/linux/platform_data/dma-dw.h
@@ -41,6 +41,7 @@ struct dw_dma_slave {
  * @is_private: The device channels should be marked as private and not for
  *	by the general purpose DMA channel allocator.
  * @is_memcpy: The device channels do support memory-to-memory transfers.
+ * @is_idma32: The type of the DMA controller is iDMA32
  * @chan_allocation_order: Allocate channels starting from 0 or 7
  * @chan_priority: Set channel priority increasing from 0 to 7 or 7 to 0.
  * @block_size: Maximum block size supported by the controller
@@ -53,6 +54,7 @@ struct dw_dma_platform_data {
 	unsigned int	nr_channels;
 	bool		is_private;
 	bool		is_memcpy;
+	bool		is_idma32;
 #define CHAN_ALLOCATION_ASCENDING	0	/* zero to seven */
 #define CHAN_ALLOCATION_DESCENDING	1	/* seven to zero */
 	unsigned char	chan_allocation_order;
-- 
cgit v1.2.3


From 728bbe75c82fdc6bdf157652a4351856ed08afc4 Mon Sep 17 00:00:00 2001
From: Dave Gerlach <d-gerlach@ti.com>
Date: Thu, 12 Jan 2017 14:52:19 -0600
Subject: misc: sram: Introduce support code for protect-exec sram type

Some platforms, like many ARM SoCs, require the ability to run code from
on-chip memory like SRAM for tasks like reconfiguring the SDRAM
controller or entering low-power sleep modes. In order to do this we
must be able to allocate memory that the code can be copied to but then
change the mapping to be read-only and executable so that no memory is
both writable and executable at the same time to avoid opening any
unneccesary security holes.

By using the existing "pool" partition type that the SRAM driver allows
we can create a memory space that will already be exposed by the
genalloc framework to allow for allocating memory but we must extend
this to meet the executable requirements. By making use of various
set_memory_* APIs we can change the attributes of pages to make them
writable for code upload but then read-only and executable when we want
to actually run code.  Because SRAM is a shared resource we need a
centralized manager of these set memory calls. Because the SRAM driver
itself is responsible for allocating the memory we can introduce a
sram_copy_exec API for the driver that works like memcpy but also
manages the page attributes and locking to allow multiple users of the
same SRAM space to all copy their code over independent of other each
before starting execution.

It is maintained in a separate file from the core SRAM driver to allow
it to be selectively built depending on whether or not a platform has
the appropriate set_memory_* APIs. A future patch will integrate it with
the core SRAM driver.

Signed-off-by: Dave Gerlach <d-gerlach@ti.com>
Acked-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/sram-exec.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/misc/sram.h      |  18 ++++++++
 include/linux/sram.h     |  27 ++++++++++++
 3 files changed, 150 insertions(+)
 create mode 100644 drivers/misc/sram-exec.c
 create mode 100644 include/linux/sram.h

(limited to 'include/linux')

diff --git a/drivers/misc/sram-exec.c b/drivers/misc/sram-exec.c
new file mode 100644
index 000000000000..ac522417c462
--- /dev/null
+++ b/drivers/misc/sram-exec.c
@@ -0,0 +1,105 @@
+/*
+ * SRAM protect-exec region helper functions
+ *
+ * Copyright (C) 2017 Texas Instruments Incorporated - http://www.ti.com/
+ *	Dave Gerlach
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed "as is" WITHOUT ANY WARRANTY of any
+ * kind, whether express or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/device.h>
+#include <linux/genalloc.h>
+#include <linux/sram.h>
+
+#include <asm/cacheflush.h>
+
+#include "sram.h"
+
+static DEFINE_MUTEX(exec_pool_list_mutex);
+static LIST_HEAD(exec_pool_list);
+
+int sram_check_protect_exec(struct sram_dev *sram, struct sram_reserve *block,
+			    struct sram_partition *part)
+{
+	unsigned long base = (unsigned long)part->base;
+	unsigned long end = base + block->size;
+
+	if (!PAGE_ALIGNED(base) || !PAGE_ALIGNED(end)) {
+		dev_err(sram->dev,
+			"SRAM pool marked with 'protect-exec' is not page aligned and will not be created.\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+int sram_add_protect_exec(struct sram_partition *part)
+{
+	mutex_lock(&exec_pool_list_mutex);
+	list_add_tail(&part->list, &exec_pool_list);
+	mutex_unlock(&exec_pool_list_mutex);
+
+	return 0;
+}
+
+/**
+ * sram_exec_copy - copy data to a protected executable region of sram
+ *
+ * @pool: struct gen_pool retrieved that is part of this sram
+ * @dst: Destination address for the copy, that must be inside pool
+ * @src: Source address for the data to copy
+ * @size: Size of copy to perform, which starting from dst, must reside in pool
+ *
+ * This helper function allows sram driver to act as central control location
+ * of 'protect-exec' pools which are normal sram pools but are always set
+ * read-only and executable except when copying data to them, at which point
+ * they are set to read-write non-executable, to make sure no memory is
+ * writeable and executable at the same time. This region must be page-aligned
+ * and is checked during probe, otherwise page attribute manipulation would
+ * not be possible.
+ */
+int sram_exec_copy(struct gen_pool *pool, void *dst, void *src,
+		   size_t size)
+{
+	struct sram_partition *part = NULL, *p;
+	unsigned long base;
+	int pages;
+
+	mutex_lock(&exec_pool_list_mutex);
+	list_for_each_entry(p, &exec_pool_list, list) {
+		if (p->pool == pool)
+			part = p;
+	}
+	mutex_unlock(&exec_pool_list_mutex);
+
+	if (!part)
+		return -EINVAL;
+
+	if (!addr_in_gen_pool(pool, (unsigned long)dst, size))
+		return -EINVAL;
+
+	base = (unsigned long)part->base;
+	pages = PAGE_ALIGN(size) / PAGE_SIZE;
+
+	mutex_lock(&part->lock);
+
+	set_memory_nx((unsigned long)base, pages);
+	set_memory_rw((unsigned long)base, pages);
+
+	memcpy(dst, src, size);
+
+	set_memory_ro((unsigned long)base, pages);
+	set_memory_x((unsigned long)base, pages);
+
+	mutex_unlock(&part->lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sram_exec_copy);
diff --git a/drivers/misc/sram.h b/drivers/misc/sram.h
index 52501a84468c..b268cd3f55bb 100644
--- a/drivers/misc/sram.h
+++ b/drivers/misc/sram.h
@@ -36,4 +36,22 @@ struct sram_reserve {
 	bool pool;
 	const char *label;
 };
+
+#ifdef CONFIG_SRAM_EXEC
+int sram_check_protect_exec(struct sram_dev *sram, struct sram_reserve *block,
+			    struct sram_partition *part);
+int sram_add_protect_exec(struct sram_partition *part);
+#else
+static inline int sram_check_protect_exec(struct sram_dev *sram,
+					  struct sram_reserve *block,
+					  struct sram_partition *part)
+{
+	return -ENODEV;
+}
+
+static inline int sram_add_protect_exec(struct sram_partition *part)
+{
+	return -ENODEV;
+}
+#endif /* CONFIG_SRAM_EXEC */
 #endif /* __SRAM_H */
diff --git a/include/linux/sram.h b/include/linux/sram.h
new file mode 100644
index 000000000000..c97dcbe8ce25
--- /dev/null
+++ b/include/linux/sram.h
@@ -0,0 +1,27 @@
+/*
+ * Generic SRAM Driver Interface
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation version 2.
+ *
+ * This program is distributed "as is" WITHOUT ANY WARRANTY of any
+ * kind, whether express or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#ifndef __LINUX_SRAM_H__
+#define __LINUX_SRAM_H__
+
+struct gen_pool;
+
+#ifdef CONFIG_SRAM_EXEC
+int sram_exec_copy(struct gen_pool *pool, void *dst, void *src, size_t size);
+#else
+static inline int sram_exec_copy(struct gen_pool *pool, void *dst, void *src,
+				 size_t size)
+{
+	return -ENODEV;
+}
+#endif /* CONFIG_SRAM_EXEC */
+#endif /* __LINUX_SRAM_H__ */
-- 
cgit v1.2.3


From bcd375f7f71f7106c97516bf5395149954ef8810 Mon Sep 17 00:00:00 2001
From: Manuel Schölling <manuel.schoelling@gmx.de>
Date: Fri, 13 Jan 2017 21:07:56 +0100
Subject: console: Add callback to flush scrollback buffer to consw struct
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This new callback is in preparation for persistent scrollback buffer
support for VGA consoles.
With a single scrollback buffer for all consoles, we could flush the
buffer just by invocating consw->con_switch(). But when each VGA console
has its own scrollback buffer, we need a new callback to tell the
video console driver which buffer to flush.

Signed-off-by: Manuel Schölling <manuel.schoelling@gmx.de>
Reviewed-by: Andrey Utkin <andrey_utkin@fastmail.com>
Tested-by: Andrey Utkin <andrey_utkin@fastmail.com>
Tested-by: Adam Borowski <kilobyte@angband.pl>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/vt/vt.c            |  9 +++++++++
 drivers/video/console/vgacon.c | 24 +++++++++++++++++++++++-
 include/linux/console.h        |  4 ++++
 3 files changed, 36 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index 4c10a9df3b91..9d3ce505e7ab 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -625,6 +625,14 @@ static void save_screen(struct vc_data *vc)
 		vc->vc_sw->con_save_screen(vc);
 }
 
+static void flush_scrollback(struct vc_data *vc)
+{
+	WARN_CONSOLE_UNLOCKED();
+
+	if (vc->vc_sw->con_flush_scrollback)
+		vc->vc_sw->con_flush_scrollback(vc);
+}
+
 /*
  *	Redrawing of screen
  */
@@ -1171,6 +1179,7 @@ static void csi_J(struct vc_data *vc, int vpar)
 		case 3: /* erase scroll-back buffer (and whole display) */
 			scr_memsetw(vc->vc_screenbuf, vc->vc_video_erase_char,
 				    vc->vc_screenbuf_size);
+			flush_scrollback(vc);
 			set_origin(vc);
 			if (con_is_visible(vc))
 				update_screen(vc);
diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c
index 48b97648d4af..9a7c2bbc5326 100644
--- a/drivers/video/console/vgacon.c
+++ b/drivers/video/console/vgacon.c
@@ -173,6 +173,16 @@ static struct vgacon_scrollback_info {
 	int restore;
 } vgacon_scrollback;
 
+static void vgacon_scrollback_reset(size_t reset_size)
+{
+	if (vgacon_scrollback.data && reset_size > 0)
+		memset(vgacon_scrollback.data, 0, reset_size);
+
+	vgacon_scrollback.cnt  = 0;
+	vgacon_scrollback.tail = 0;
+	vgacon_scrollback.cur  = 0;
+}
+
 static void vgacon_scrollback_init(int pitch)
 {
 	int rows = CONFIG_VGACON_SOFT_SCROLLBACK_SIZE * 1024/pitch;
@@ -305,6 +315,14 @@ static void vgacon_scrolldelta(struct vc_data *c, int lines)
 	} else
 		vgacon_cursor(c, CM_MOVE);
 }
+
+static void vgacon_flush_scrollback(struct vc_data *c)
+{
+	size_t size = CONFIG_VGACON_SOFT_SCROLLBACK_SIZE * 1024;
+
+	if (c->vc_num == fg_console)
+		vgacon_scrollback_reset(size);
+}
 #else
 #define vgacon_scrollback_startup(...) do { } while (0)
 #define vgacon_scrollback_init(...)    do { } while (0)
@@ -322,6 +340,10 @@ static void vgacon_scrolldelta(struct vc_data *c, int lines)
 			vga_vram_size);
 	vga_set_mem_top(c);
 }
+
+static void vgacon_flush_scrollback(struct vc_data *c)
+{
+}
 #endif /* CONFIG_VGACON_SOFT_SCROLLBACK */
 
 static const char *vgacon_startup(void)
@@ -1329,7 +1351,6 @@ static bool vgacon_scroll(struct vc_data *c, unsigned int t, unsigned int b,
 	return true;
 }
 
-
 /*
  *  The console `switch' structure for the VGA based console
  */
@@ -1362,6 +1383,7 @@ const struct consw vga_con = {
 	.con_save_screen = vgacon_save_screen,
 	.con_build_attr = vgacon_build_attr,
 	.con_invert_region = vgacon_invert_region,
+	.con_flush_scrollback = vgacon_flush_scrollback,
 };
 EXPORT_SYMBOL(vga_con);
 
diff --git a/include/linux/console.h b/include/linux/console.h
index 9c26c6685587..5949d1855589 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -72,6 +72,10 @@ struct consw {
 	void	(*con_invert_region)(struct vc_data *, u16 *, int);
 	u16    *(*con_screen_pos)(struct vc_data *, int);
 	unsigned long (*con_getxy)(struct vc_data *, unsigned long, int *, int *);
+	/*
+	 * Flush the video console driver's scrollback buffer
+	 */
+	void	(*con_flush_scrollback)(struct vc_data *);
 	/*
 	 * Prepare the console for the debugger.  This includes, but is not
 	 * limited to, unblanking the console, loading an appropriate
-- 
cgit v1.2.3


From 93fbe91b552194af970256ce72934745d01df435 Mon Sep 17 00:00:00 2001
From: Benjamin Gaignard <benjamin.gaignard@linaro.org>
Date: Fri, 20 Jan 2017 10:15:07 +0100
Subject: iio: Add STM32 timer trigger driver

Timers IPs can be used to generate triggers for other IPs like
DAC or ADC.
Each trigger may result of timer internals signals like counter enable,
reset or edge, this configuration could be done through "master_mode"
device attribute.

Since triggers could be used by DAC or ADC their names are defined
in include/ nux/iio/timer/stm32-timer-trigger.h and is_stm32_iio_timer_trigger
function could be used to check if the trigger is valid or not.

"trgo" trigger have a "sampling_frequency" attribute which allow to configure
timer sampling frequency.

version 8:
- change kernel version from 4.10 to 4.11 in ABI documentation

version 7:
- remove all iio_device related code
- move driver into trigger directory

version 5:
- simplify tables of triggers
- only create an IIO device when needed

version 4:
- get triggers configuration from "reg" in DT
- add tables of triggers
- sampling frequency is enable/disable when writing in trigger
  sampling_frequency attribute
- no more use of interruptions

version 3:
- change compatible to "st,stm32-timer-trigger"
- fix attributes access right
- use string instead of int for master_mode and slave_mode
- document device attributes in sysfs-bus-iio-timer-stm32

version 2:
- keep only one compatible
- use st,input-triggers-names and st,output-triggers-names
  to know which triggers are accepted and/or create by the device

Signed-off-by: Benjamin Gaignard <benjamin.gaignard@st.com>
Acked-by: Jonathan Cameron <jic23@kernel.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 .../ABI/testing/sysfs-bus-iio-timer-stm32          |  29 ++
 drivers/iio/trigger/Kconfig                        |   9 +
 drivers/iio/trigger/Makefile                       |   1 +
 drivers/iio/trigger/stm32-timer-trigger.c          | 342 +++++++++++++++++++++
 include/linux/iio/timer/stm32-timer-trigger.h      |  62 ++++
 5 files changed, 443 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-bus-iio-timer-stm32
 create mode 100644 drivers/iio/trigger/stm32-timer-trigger.c
 create mode 100644 include/linux/iio/timer/stm32-timer-trigger.h

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-iio-timer-stm32 b/Documentation/ABI/testing/sysfs-bus-iio-timer-stm32
new file mode 100644
index 000000000000..6534a60037ff
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-iio-timer-stm32
@@ -0,0 +1,29 @@
+What:		/sys/bus/iio/devices/triggerX/master_mode_available
+KernelVersion:	4.11
+Contact:	benjamin.gaignard@st.com
+Description:
+		Reading returns the list possible master modes which are:
+		- "reset"     :	The UG bit from the TIMx_EGR register is used as trigger output (TRGO).
+		- "enable"    : The Counter Enable signal CNT_EN is used as trigger output.
+		- "update"    : The update event is selected as trigger output.
+				For instance a master timer can then be used as a prescaler for a slave timer.
+		- "compare_pulse" : The trigger output send a positive pulse when the CC1IF flag is to be set.
+		- "OC1REF"    : OC1REF signal is used as trigger output.
+		- "OC2REF"    : OC2REF signal is used as trigger output.
+		- "OC3REF"    : OC3REF signal is used as trigger output.
+		- "OC4REF"    : OC4REF signal is used as trigger output.
+
+What:		/sys/bus/iio/devices/triggerX/master_mode
+KernelVersion:	4.11
+Contact:	benjamin.gaignard@st.com
+Description:
+		Reading returns the current master modes.
+		Writing set the master mode
+
+What:		/sys/bus/iio/devices/triggerX/sampling_frequency
+KernelVersion:	4.11
+Contact:	benjamin.gaignard@st.com
+Description:
+		Reading returns the current sampling frequency.
+		Writing an value different of 0 set and start sampling.
+		Writing 0 stop sampling.
diff --git a/drivers/iio/trigger/Kconfig b/drivers/iio/trigger/Kconfig
index 809b2e7d58fa..e4d4e63434db 100644
--- a/drivers/iio/trigger/Kconfig
+++ b/drivers/iio/trigger/Kconfig
@@ -24,6 +24,15 @@ config IIO_INTERRUPT_TRIGGER
 	  To compile this driver as a module, choose M here: the
 	  module will be called iio-trig-interrupt.
 
+config IIO_STM32_TIMER_TRIGGER
+	tristate "STM32 Timer Trigger"
+	depends on (ARCH_STM32 && OF && MFD_STM32_TIMERS) || COMPILE_TEST
+	help
+	  Select this option to enable STM32 Timer Trigger
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called stm32-timer-trigger.
+
 config IIO_TIGHTLOOP_TRIGGER
 	tristate "A kthread based hammering loop trigger"
 	depends on IIO_SW_TRIGGER
diff --git a/drivers/iio/trigger/Makefile b/drivers/iio/trigger/Makefile
index aab4dc23303d..5c4ecd380653 100644
--- a/drivers/iio/trigger/Makefile
+++ b/drivers/iio/trigger/Makefile
@@ -6,5 +6,6 @@
 
 obj-$(CONFIG_IIO_HRTIMER_TRIGGER) += iio-trig-hrtimer.o
 obj-$(CONFIG_IIO_INTERRUPT_TRIGGER) += iio-trig-interrupt.o
+obj-$(CONFIG_IIO_STM32_TIMER_TRIGGER) += stm32-timer-trigger.o
 obj-$(CONFIG_IIO_SYSFS_TRIGGER) += iio-trig-sysfs.o
 obj-$(CONFIG_IIO_TIGHTLOOP_TRIGGER) += iio-trig-loop.o
diff --git a/drivers/iio/trigger/stm32-timer-trigger.c b/drivers/iio/trigger/stm32-timer-trigger.c
new file mode 100644
index 000000000000..994b96d19750
--- /dev/null
+++ b/drivers/iio/trigger/stm32-timer-trigger.c
@@ -0,0 +1,342 @@
+/*
+ * Copyright (C) STMicroelectronics 2016
+ *
+ * Author: Benjamin Gaignard <benjamin.gaignard@st.com>
+ *
+ * License terms:  GNU General Public License (GPL), version 2
+ */
+
+#include <linux/iio/iio.h>
+#include <linux/iio/sysfs.h>
+#include <linux/iio/timer/stm32-timer-trigger.h>
+#include <linux/iio/trigger.h>
+#include <linux/mfd/stm32-timers.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+
+#define MAX_TRIGGERS 6
+
+/* List the triggers created by each timer */
+static const void *triggers_table[][MAX_TRIGGERS] = {
+	{ TIM1_TRGO, TIM1_CH1, TIM1_CH2, TIM1_CH3, TIM1_CH4,},
+	{ TIM2_TRGO, TIM2_CH1, TIM2_CH2, TIM2_CH3, TIM2_CH4,},
+	{ TIM3_TRGO, TIM3_CH1, TIM3_CH2, TIM3_CH3, TIM3_CH4,},
+	{ TIM4_TRGO, TIM4_CH1, TIM4_CH2, TIM4_CH3, TIM4_CH4,},
+	{ TIM5_TRGO, TIM5_CH1, TIM5_CH2, TIM5_CH3, TIM5_CH4,},
+	{ TIM6_TRGO,},
+	{ TIM7_TRGO,},
+	{ TIM8_TRGO, TIM8_CH1, TIM8_CH2, TIM8_CH3, TIM8_CH4,},
+	{ TIM9_TRGO, TIM9_CH1, TIM9_CH2,},
+	{ }, /* timer 10 */
+	{ }, /* timer 11 */
+	{ TIM12_TRGO, TIM12_CH1, TIM12_CH2,},
+};
+
+struct stm32_timer_trigger {
+	struct device *dev;
+	struct regmap *regmap;
+	struct clk *clk;
+	u32 max_arr;
+	const void *triggers;
+};
+
+static int stm32_timer_start(struct stm32_timer_trigger *priv,
+			     unsigned int frequency)
+{
+	unsigned long long prd, div;
+	int prescaler = 0;
+	u32 ccer, cr1;
+
+	/* Period and prescaler values depends of clock rate */
+	div = (unsigned long long)clk_get_rate(priv->clk);
+
+	do_div(div, frequency);
+
+	prd = div;
+
+	/*
+	 * Increase prescaler value until we get a result that fit
+	 * with auto reload register maximum value.
+	 */
+	while (div > priv->max_arr) {
+		prescaler++;
+		div = prd;
+		do_div(div, (prescaler + 1));
+	}
+	prd = div;
+
+	if (prescaler > MAX_TIM_PSC) {
+		dev_err(priv->dev, "prescaler exceeds the maximum value\n");
+		return -EINVAL;
+	}
+
+	/* Check if nobody else use the timer */
+	regmap_read(priv->regmap, TIM_CCER, &ccer);
+	if (ccer & TIM_CCER_CCXE)
+		return -EBUSY;
+
+	regmap_read(priv->regmap, TIM_CR1, &cr1);
+	if (!(cr1 & TIM_CR1_CEN))
+		clk_enable(priv->clk);
+
+	regmap_write(priv->regmap, TIM_PSC, prescaler);
+	regmap_write(priv->regmap, TIM_ARR, prd - 1);
+	regmap_update_bits(priv->regmap, TIM_CR1, TIM_CR1_ARPE, TIM_CR1_ARPE);
+
+	/* Force master mode to update mode */
+	regmap_update_bits(priv->regmap, TIM_CR2, TIM_CR2_MMS, 0x20);
+
+	/* Make sure that registers are updated */
+	regmap_update_bits(priv->regmap, TIM_EGR, TIM_EGR_UG, TIM_EGR_UG);
+
+	/* Enable controller */
+	regmap_update_bits(priv->regmap, TIM_CR1, TIM_CR1_CEN, TIM_CR1_CEN);
+
+	return 0;
+}
+
+static void stm32_timer_stop(struct stm32_timer_trigger *priv)
+{
+	u32 ccer, cr1;
+
+	regmap_read(priv->regmap, TIM_CCER, &ccer);
+	if (ccer & TIM_CCER_CCXE)
+		return;
+
+	regmap_read(priv->regmap, TIM_CR1, &cr1);
+	if (cr1 & TIM_CR1_CEN)
+		clk_disable(priv->clk);
+
+	/* Stop timer */
+	regmap_update_bits(priv->regmap, TIM_CR1, TIM_CR1_CEN, 0);
+	regmap_write(priv->regmap, TIM_PSC, 0);
+	regmap_write(priv->regmap, TIM_ARR, 0);
+
+	/* Make sure that registers are updated */
+	regmap_update_bits(priv->regmap, TIM_EGR, TIM_EGR_UG, TIM_EGR_UG);
+}
+
+static ssize_t stm32_tt_store_frequency(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t len)
+{
+	struct iio_trigger *trig = to_iio_trigger(dev);
+	struct stm32_timer_trigger *priv = iio_trigger_get_drvdata(trig);
+	unsigned int freq;
+	int ret;
+
+	ret = kstrtouint(buf, 10, &freq);
+	if (ret)
+		return ret;
+
+	if (freq == 0) {
+		stm32_timer_stop(priv);
+	} else {
+		ret = stm32_timer_start(priv, freq);
+		if (ret)
+			return ret;
+	}
+
+	return len;
+}
+
+static ssize_t stm32_tt_read_frequency(struct device *dev,
+				       struct device_attribute *attr, char *buf)
+{
+	struct iio_trigger *trig = to_iio_trigger(dev);
+	struct stm32_timer_trigger *priv = iio_trigger_get_drvdata(trig);
+	u32 psc, arr, cr1;
+	unsigned long long freq = 0;
+
+	regmap_read(priv->regmap, TIM_CR1, &cr1);
+	regmap_read(priv->regmap, TIM_PSC, &psc);
+	regmap_read(priv->regmap, TIM_ARR, &arr);
+
+	if (psc && arr && (cr1 & TIM_CR1_CEN)) {
+		freq = (unsigned long long)clk_get_rate(priv->clk);
+		do_div(freq, psc);
+		do_div(freq, arr);
+	}
+
+	return sprintf(buf, "%d\n", (unsigned int)freq);
+}
+
+static IIO_DEV_ATTR_SAMP_FREQ(0660,
+			      stm32_tt_read_frequency,
+			      stm32_tt_store_frequency);
+
+static char *master_mode_table[] = {
+	"reset",
+	"enable",
+	"update",
+	"compare_pulse",
+	"OC1REF",
+	"OC2REF",
+	"OC3REF",
+	"OC4REF"
+};
+
+static ssize_t stm32_tt_show_master_mode(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf)
+{
+	struct iio_dev *indio_dev = dev_to_iio_dev(dev);
+	struct stm32_timer_trigger *priv = iio_priv(indio_dev);
+	u32 cr2;
+
+	regmap_read(priv->regmap, TIM_CR2, &cr2);
+	cr2 = (cr2 & TIM_CR2_MMS) >> TIM_CR2_MMS_SHIFT;
+
+	return snprintf(buf, PAGE_SIZE, "%s\n", master_mode_table[cr2]);
+}
+
+static ssize_t stm32_tt_store_master_mode(struct device *dev,
+					  struct device_attribute *attr,
+					  const char *buf, size_t len)
+{
+	struct iio_dev *indio_dev = dev_to_iio_dev(dev);
+	struct stm32_timer_trigger *priv = iio_priv(indio_dev);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(master_mode_table); i++) {
+		if (!strncmp(master_mode_table[i], buf,
+			     strlen(master_mode_table[i]))) {
+			regmap_update_bits(priv->regmap, TIM_CR2,
+					   TIM_CR2_MMS, i << TIM_CR2_MMS_SHIFT);
+			/* Make sure that registers are updated */
+			regmap_update_bits(priv->regmap, TIM_EGR,
+					   TIM_EGR_UG, TIM_EGR_UG);
+			return len;
+		}
+	}
+
+	return -EINVAL;
+}
+
+static IIO_CONST_ATTR(master_mode_available,
+	"reset enable update compare_pulse OC1REF OC2REF OC3REF OC4REF");
+
+static IIO_DEVICE_ATTR(master_mode, 0660,
+		       stm32_tt_show_master_mode,
+		       stm32_tt_store_master_mode,
+		       0);
+
+static struct attribute *stm32_trigger_attrs[] = {
+	&iio_dev_attr_sampling_frequency.dev_attr.attr,
+	&iio_dev_attr_master_mode.dev_attr.attr,
+	&iio_const_attr_master_mode_available.dev_attr.attr,
+	NULL,
+};
+
+static const struct attribute_group stm32_trigger_attr_group = {
+	.attrs = stm32_trigger_attrs,
+};
+
+static const struct attribute_group *stm32_trigger_attr_groups[] = {
+	&stm32_trigger_attr_group,
+	NULL,
+};
+
+static const struct iio_trigger_ops timer_trigger_ops = {
+	.owner = THIS_MODULE,
+};
+
+static int stm32_setup_iio_triggers(struct stm32_timer_trigger *priv)
+{
+	int ret;
+	const char * const *cur = priv->triggers;
+
+	while (cur && *cur) {
+		struct iio_trigger *trig;
+
+		trig = devm_iio_trigger_alloc(priv->dev, "%s", *cur);
+		if  (!trig)
+			return -ENOMEM;
+
+		trig->dev.parent = priv->dev->parent;
+		trig->ops = &timer_trigger_ops;
+
+		/*
+		 * sampling frequency and master mode attributes
+		 * should only be available on trgo trigger which
+		 * is always the first in the list.
+		 */
+		if (cur == priv->triggers)
+			trig->dev.groups = stm32_trigger_attr_groups;
+
+		iio_trigger_set_drvdata(trig, priv);
+
+		ret = devm_iio_trigger_register(priv->dev, trig);
+		if (ret)
+			return ret;
+		cur++;
+	}
+
+	return 0;
+}
+
+/**
+ * is_stm32_timer_trigger
+ * @trig: trigger to be checked
+ *
+ * return true if the trigger is a valid stm32 iio timer trigger
+ * either return false
+ */
+bool is_stm32_timer_trigger(struct iio_trigger *trig)
+{
+	return (trig->ops == &timer_trigger_ops);
+}
+EXPORT_SYMBOL(is_stm32_timer_trigger);
+
+static int stm32_timer_trigger_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct stm32_timer_trigger *priv;
+	struct stm32_timers *ddata = dev_get_drvdata(pdev->dev.parent);
+	unsigned int index;
+	int ret;
+
+	if (of_property_read_u32(dev->of_node, "reg", &index))
+		return -EINVAL;
+
+	if (index >= ARRAY_SIZE(triggers_table))
+		return -EINVAL;
+
+	priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+
+	if (!priv)
+		return -ENOMEM;
+
+	priv->dev = dev;
+	priv->regmap = ddata->regmap;
+	priv->clk = ddata->clk;
+	priv->max_arr = ddata->max_arr;
+	priv->triggers = triggers_table[index];
+
+	ret = stm32_setup_iio_triggers(priv);
+	if (ret)
+		return ret;
+
+	platform_set_drvdata(pdev, priv);
+
+	return 0;
+}
+
+static const struct of_device_id stm32_trig_of_match[] = {
+	{ .compatible = "st,stm32-timer-trigger", },
+	{ /* end node */ },
+};
+MODULE_DEVICE_TABLE(of, stm32_trig_of_match);
+
+static struct platform_driver stm32_timer_trigger_driver = {
+	.probe = stm32_timer_trigger_probe,
+	.driver = {
+		.name = "stm32-timer-trigger",
+		.of_match_table = stm32_trig_of_match,
+	},
+};
+module_platform_driver(stm32_timer_trigger_driver);
+
+MODULE_ALIAS("platform: stm32-timer-trigger");
+MODULE_DESCRIPTION("STMicroelectronics STM32 Timer Trigger driver");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/iio/timer/stm32-timer-trigger.h b/include/linux/iio/timer/stm32-timer-trigger.h
new file mode 100644
index 000000000000..55535aef2e6c
--- /dev/null
+++ b/include/linux/iio/timer/stm32-timer-trigger.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) STMicroelectronics 2016
+ *
+ * Author: Benjamin Gaignard <benjamin.gaignard@st.com>
+ *
+ * License terms:  GNU General Public License (GPL), version 2
+ */
+
+#ifndef _STM32_TIMER_TRIGGER_H_
+#define _STM32_TIMER_TRIGGER_H_
+
+#define TIM1_TRGO	"tim1_trgo"
+#define TIM1_CH1	"tim1_ch1"
+#define TIM1_CH2	"tim1_ch2"
+#define TIM1_CH3	"tim1_ch3"
+#define TIM1_CH4	"tim1_ch4"
+
+#define TIM2_TRGO	"tim2_trgo"
+#define TIM2_CH1	"tim2_ch1"
+#define TIM2_CH2	"tim2_ch2"
+#define TIM2_CH3	"tim2_ch3"
+#define TIM2_CH4	"tim2_ch4"
+
+#define TIM3_TRGO	"tim3_trgo"
+#define TIM3_CH1	"tim3_ch1"
+#define TIM3_CH2	"tim3_ch2"
+#define TIM3_CH3	"tim3_ch3"
+#define TIM3_CH4	"tim3_ch4"
+
+#define TIM4_TRGO	"tim4_trgo"
+#define TIM4_CH1	"tim4_ch1"
+#define TIM4_CH2	"tim4_ch2"
+#define TIM4_CH3	"tim4_ch3"
+#define TIM4_CH4	"tim4_ch4"
+
+#define TIM5_TRGO	"tim5_trgo"
+#define TIM5_CH1	"tim5_ch1"
+#define TIM5_CH2	"tim5_ch2"
+#define TIM5_CH3	"tim5_ch3"
+#define TIM5_CH4	"tim5_ch4"
+
+#define TIM6_TRGO	"tim6_trgo"
+
+#define TIM7_TRGO	"tim7_trgo"
+
+#define TIM8_TRGO	"tim8_trgo"
+#define TIM8_CH1	"tim8_ch1"
+#define TIM8_CH2	"tim8_ch2"
+#define TIM8_CH3	"tim8_ch3"
+#define TIM8_CH4	"tim8_ch4"
+
+#define TIM9_TRGO	"tim9_trgo"
+#define TIM9_CH1	"tim9_ch1"
+#define TIM9_CH2	"tim9_ch2"
+
+#define TIM12_TRGO	"tim12_trgo"
+#define TIM12_CH1	"tim12_ch1"
+#define TIM12_CH2	"tim12_ch2"
+
+bool is_stm32_timer_trigger(struct iio_trigger *trig);
+
+#endif
-- 
cgit v1.2.3


From 2acae0d5b0f73a8fb4b180bd13491feb96e55fc6 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 25 Jan 2017 02:28:16 +0100
Subject: trace: add variant without spacing in trace_print_hex_seq

For upcoming tracepoint support for BPF, we want to dump the program's
tag. Format should be similar to __print_hex(), but without spacing.
Add a __print_hex_str() variant for exactly that purpose that reuses
trace_print_hex_seq().

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/trace_events.h | 3 ++-
 include/trace/trace_events.h | 8 +++++++-
 kernel/trace/trace_output.c  | 7 ++++---
 3 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index be007610ceb0..cfa475a0e9ca 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -33,7 +33,8 @@ const char *trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
 				    unsigned int bitmask_size);
 
 const char *trace_print_hex_seq(struct trace_seq *p,
-				const unsigned char *buf, int len);
+				const unsigned char *buf, int len,
+				bool spacing);
 
 const char *trace_print_array_seq(struct trace_seq *p,
 				   const void *buf, int count,
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index 467e12f780d8..9f684629c3cf 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -297,7 +297,12 @@ TRACE_MAKE_SYSTEM_STR();
 #endif
 
 #undef __print_hex
-#define __print_hex(buf, buf_len) trace_print_hex_seq(p, buf, buf_len)
+#define __print_hex(buf, buf_len)					\
+	trace_print_hex_seq(p, buf, buf_len, true)
+
+#undef __print_hex_str
+#define __print_hex_str(buf, buf_len)					\
+	trace_print_hex_seq(p, buf, buf_len, false)
 
 #undef __print_array
 #define __print_array(array, count, el_size)				\
@@ -711,6 +716,7 @@ static inline void ftrace_test_probe_##call(void)			\
 #undef __print_flags
 #undef __print_symbolic
 #undef __print_hex
+#undef __print_hex_str
 #undef __get_dynamic_array
 #undef __get_dynamic_array_len
 #undef __get_str
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 5d33a7352919..30a144b1b9ee 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -163,14 +163,15 @@ trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
 EXPORT_SYMBOL_GPL(trace_print_bitmask_seq);
 
 const char *
-trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
+trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len,
+		    bool spacing)
 {
 	int i;
 	const char *ret = trace_seq_buffer_ptr(p);
 
 	for (i = 0; i < buf_len; i++)
-		trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]);
-
+		trace_seq_printf(p, "%s%2.2x", !spacing || i == 0 ? "" : " ",
+				 buf[i]);
 	trace_seq_putc(p, 0);
 
 	return ret;
-- 
cgit v1.2.3


From a67edbf4fb6deadcfe57a04a134abed4a5ba3bb5 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 25 Jan 2017 02:28:18 +0100
Subject: bpf: add initial bpf tracepoints

This work adds a number of tracepoints to paths that are either
considered slow-path or exception-like states, where monitoring or
inspecting them would be desirable.

For bpf(2) syscall, tracepoints have been placed for main commands
when they succeed. In XDP case, tracepoint is for exceptions, that
is, f.e. on abnormal BPF program exit such as unknown or XDP_ABORTED
return code, or when error occurs during XDP_TX action and the packet
could not be forwarded.

Both have been split into separate event headers, and can be further
extended. Worst case, if they unexpectedly should get into our way in
future, they can also removed [1]. Of course, these tracepoints (like
any other) can be analyzed by eBPF itself, etc. Example output:

  # ./perf record -a -e bpf:* sleep 10
  # ./perf script
  sock_example  6197 [005]   283.980322:      bpf:bpf_map_create: map type=ARRAY ufd=4 key=4 val=8 max=256 flags=0
  sock_example  6197 [005]   283.980721:       bpf:bpf_prog_load: prog=a5ea8fa30ea6849c type=SOCKET_FILTER ufd=5
  sock_example  6197 [005]   283.988423:   bpf:bpf_prog_get_type: prog=a5ea8fa30ea6849c type=SOCKET_FILTER
  sock_example  6197 [005]   283.988443: bpf:bpf_map_lookup_elem: map type=ARRAY ufd=4 key=[06 00 00 00] val=[00 00 00 00 00 00 00 00]
  [...]
  sock_example  6197 [005]   288.990868: bpf:bpf_map_lookup_elem: map type=ARRAY ufd=4 key=[01 00 00 00] val=[14 00 00 00 00 00 00 00]
       swapper     0 [005]   289.338243:    bpf:bpf_prog_put_rcu: prog=a5ea8fa30ea6849c type=SOCKET_FILTER

  [1] https://lwn.net/Articles/705270/

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c         |   3 +
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c    |  12 +-
 .../net/ethernet/netronome/nfp/nfp_net_common.c    |  15 +-
 drivers/net/ethernet/qlogic/qede/qede_fp.c         |   4 +
 drivers/net/virtio_net.c                           |  12 +-
 include/linux/bpf_trace.h                          |   7 +
 include/trace/events/bpf.h                         | 347 +++++++++++++++++++++
 include/trace/events/xdp.h                         |  53 ++++
 kernel/bpf/core.c                                  |   9 +
 kernel/bpf/inode.c                                 |  17 +-
 kernel/bpf/syscall.c                               |  19 +-
 11 files changed, 483 insertions(+), 15 deletions(-)
 create mode 100644 include/linux/bpf_trace.h
 create mode 100644 include/trace/events/bpf.h
 create mode 100644 include/trace/events/xdp.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index e362f99334d0..f15ddba3659a 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -33,6 +33,7 @@
 
 #include <net/busy_poll.h>
 #include <linux/bpf.h>
+#include <linux/bpf_trace.h>
 #include <linux/mlx4/cq.h>
 #include <linux/slab.h>
 #include <linux/mlx4/qp.h>
@@ -926,10 +927,12 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 							length, cq->ring,
 							&doorbell_pending)))
 					goto consumed;
+				trace_xdp_exception(dev, xdp_prog, act);
 				goto xdp_drop_no_cnt; /* Drop on xmit failure */
 			default:
 				bpf_warn_invalid_xdp_action(act);
 			case XDP_ABORTED:
+				trace_xdp_exception(dev, xdp_prog, act);
 			case XDP_DROP:
 				ring->xdp_drop++;
 xdp_drop_no_cnt:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index ba50583ea3ed..3d2e1a1886a5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -33,6 +33,7 @@
 #include <linux/ip.h>
 #include <linux/ipv6.h>
 #include <linux/tcp.h>
+#include <linux/bpf_trace.h>
 #include <net/busy_poll.h>
 #include "en.h"
 #include "en_tc.h"
@@ -640,7 +641,7 @@ static inline void mlx5e_xmit_xdp_doorbell(struct mlx5e_sq *sq)
 	mlx5e_tx_notify_hw(sq, &wqe->ctrl, 0);
 }
 
-static inline void mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq,
+static inline bool mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq,
 					struct mlx5e_dma_info *di,
 					const struct xdp_buff *xdp)
 {
@@ -662,7 +663,7 @@ static inline void mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq,
 		     MLX5E_SW2HW_MTU(rq->netdev->mtu) < dma_len)) {
 		rq->stats.xdp_drop++;
 		mlx5e_page_release(rq, di, true);
-		return;
+		return false;
 	}
 
 	if (unlikely(!mlx5e_sq_has_room_for(sq, MLX5E_XDP_TX_WQEBBS))) {
@@ -673,7 +674,7 @@ static inline void mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq,
 		}
 		rq->stats.xdp_tx_full++;
 		mlx5e_page_release(rq, di, true);
-		return;
+		return false;
 	}
 
 	dma_len -= MLX5E_XDP_MIN_INLINE;
@@ -703,6 +704,7 @@ static inline void mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq,
 
 	sq->db.xdp.doorbell = true;
 	rq->stats.xdp_tx++;
+	return true;
 }
 
 /* returns true if packet was consumed by xdp */
@@ -728,11 +730,13 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq,
 		*len = xdp.data_end - xdp.data;
 		return false;
 	case XDP_TX:
-		mlx5e_xmit_xdp_frame(rq, di, &xdp);
+		if (unlikely(!mlx5e_xmit_xdp_frame(rq, di, &xdp)))
+			trace_xdp_exception(rq->netdev, prog, act);
 		return true;
 	default:
 		bpf_warn_invalid_xdp_action(act);
 	case XDP_ABORTED:
+		trace_xdp_exception(rq->netdev, prog, act);
 	case XDP_DROP:
 		rq->stats.xdp_drop++;
 		mlx5e_page_release(rq, di, true);
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 67afd95ffb93..6ac43abf561b 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -42,6 +42,7 @@
  */
 
 #include <linux/bpf.h>
+#include <linux/bpf_trace.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
@@ -1459,7 +1460,7 @@ nfp_net_rx_drop(struct nfp_net_r_vector *r_vec, struct nfp_net_rx_ring *rx_ring,
 		dev_kfree_skb_any(skb);
 }
 
-static void
+static bool
 nfp_net_tx_xdp_buf(struct nfp_net *nn, struct nfp_net_rx_ring *rx_ring,
 		   struct nfp_net_tx_ring *tx_ring,
 		   struct nfp_net_rx_buf *rxbuf, unsigned int pkt_off,
@@ -1473,13 +1474,13 @@ nfp_net_tx_xdp_buf(struct nfp_net *nn, struct nfp_net_rx_ring *rx_ring,
 
 	if (unlikely(nfp_net_tx_full(tx_ring, 1))) {
 		nfp_net_rx_drop(rx_ring->r_vec, rx_ring, rxbuf, NULL);
-		return;
+		return false;
 	}
 
 	new_frag = nfp_net_napi_alloc_one(nn, DMA_BIDIRECTIONAL, &new_dma_addr);
 	if (unlikely(!new_frag)) {
 		nfp_net_rx_drop(rx_ring->r_vec, rx_ring, rxbuf, NULL);
-		return;
+		return false;
 	}
 	nfp_net_rx_give_one(rx_ring, new_frag, new_dma_addr);
 
@@ -1509,6 +1510,7 @@ nfp_net_tx_xdp_buf(struct nfp_net *nn, struct nfp_net_rx_ring *rx_ring,
 
 	tx_ring->wr_p++;
 	tx_ring->wr_ptr_add++;
+	return true;
 }
 
 static int nfp_net_run_xdp(struct bpf_prog *prog, void *data, unsigned int len)
@@ -1613,12 +1615,15 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget)
 			case XDP_PASS:
 				break;
 			case XDP_TX:
-				nfp_net_tx_xdp_buf(nn, rx_ring, tx_ring, rxbuf,
-						   pkt_off, pkt_len);
+				if (unlikely(!nfp_net_tx_xdp_buf(nn, rx_ring,
+								 tx_ring, rxbuf,
+								 pkt_off, pkt_len)))
+					trace_xdp_exception(nn->netdev, xdp_prog, act);
 				continue;
 			default:
 				bpf_warn_invalid_xdp_action(act);
 			case XDP_ABORTED:
+				trace_xdp_exception(nn->netdev, xdp_prog, act);
 			case XDP_DROP:
 				nfp_net_rx_give_one(rx_ring, rxbuf->frag,
 						    rxbuf->dma_addr);
diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c
index 1a6ca4884fad..445d4d2492c3 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_fp.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c
@@ -32,6 +32,7 @@
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/skbuff.h>
+#include <linux/bpf_trace.h>
 #include <net/udp_tunnel.h>
 #include <linux/ip.h>
 #include <net/ipv6.h>
@@ -1016,6 +1017,7 @@ static bool qede_rx_xdp(struct qede_dev *edev,
 		/* We need the replacement buffer before transmit. */
 		if (qede_alloc_rx_buffer(rxq, true)) {
 			qede_recycle_rx_bd_ring(rxq, 1);
+			trace_xdp_exception(edev->ndev, prog, act);
 			return false;
 		}
 
@@ -1026,6 +1028,7 @@ static bool qede_rx_xdp(struct qede_dev *edev,
 			dma_unmap_page(rxq->dev, bd->mapping,
 				       PAGE_SIZE, DMA_BIDIRECTIONAL);
 			__free_page(bd->data);
+			trace_xdp_exception(edev->ndev, prog, act);
 		}
 
 		/* Regardless, we've consumed an Rx BD */
@@ -1035,6 +1038,7 @@ static bool qede_rx_xdp(struct qede_dev *edev,
 	default:
 		bpf_warn_invalid_xdp_action(act);
 	case XDP_ABORTED:
+		trace_xdp_exception(edev->ndev, prog, act);
 	case XDP_DROP:
 		qede_recycle_rx_bd_ring(rxq, cqe->bd_num);
 	}
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 37db91d1a0a3..f9bf94887ff1 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -23,6 +23,7 @@
 #include <linux/virtio.h>
 #include <linux/virtio_net.h>
 #include <linux/bpf.h>
+#include <linux/bpf_trace.h>
 #include <linux/scatterlist.h>
 #include <linux/if_vlan.h>
 #include <linux/slab.h>
@@ -330,7 +331,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 	return skb;
 }
 
-static void virtnet_xdp_xmit(struct virtnet_info *vi,
+static bool virtnet_xdp_xmit(struct virtnet_info *vi,
 			     struct receive_queue *rq,
 			     struct send_queue *sq,
 			     struct xdp_buff *xdp,
@@ -382,10 +383,12 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
 			put_page(page);
 		} else /* small buffer */
 			kfree_skb(data);
-		return; // On error abort to avoid unnecessary kick
+		/* On error abort to avoid unnecessary kick */
+		return false;
 	}
 
 	virtqueue_kick(sq->vq);
+	return true;
 }
 
 static u32 do_xdp_prog(struct virtnet_info *vi,
@@ -421,11 +424,14 @@ static u32 do_xdp_prog(struct virtnet_info *vi,
 			vi->xdp_queue_pairs +
 			smp_processor_id();
 		xdp.data = buf;
-		virtnet_xdp_xmit(vi, rq, &vi->sq[qp], &xdp, data);
+		if (unlikely(!virtnet_xdp_xmit(vi, rq, &vi->sq[qp], &xdp,
+					       data)))
+			trace_xdp_exception(vi->dev, xdp_prog, act);
 		return XDP_TX;
 	default:
 		bpf_warn_invalid_xdp_action(act);
 	case XDP_ABORTED:
+		trace_xdp_exception(vi->dev, xdp_prog, act);
 	case XDP_DROP:
 		return XDP_DROP;
 	}
diff --git a/include/linux/bpf_trace.h b/include/linux/bpf_trace.h
new file mode 100644
index 000000000000..b22efbdd2eb4
--- /dev/null
+++ b/include/linux/bpf_trace.h
@@ -0,0 +1,7 @@
+#ifndef __LINUX_BPF_TRACE_H__
+#define __LINUX_BPF_TRACE_H__
+
+#include <trace/events/bpf.h>
+#include <trace/events/xdp.h>
+
+#endif /* __LINUX_BPF_TRACE_H__ */
diff --git a/include/trace/events/bpf.h b/include/trace/events/bpf.h
new file mode 100644
index 000000000000..c3a53fd47ff1
--- /dev/null
+++ b/include/trace/events/bpf.h
@@ -0,0 +1,347 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM bpf
+
+#if !defined(_TRACE_BPF_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_BPF_H
+
+#include <linux/filter.h>
+#include <linux/bpf.h>
+#include <linux/fs.h>
+#include <linux/tracepoint.h>
+
+#define __PROG_TYPE_MAP(FN)	\
+	FN(SOCKET_FILTER)	\
+	FN(KPROBE)		\
+	FN(SCHED_CLS)		\
+	FN(SCHED_ACT)		\
+	FN(TRACEPOINT)		\
+	FN(XDP)			\
+	FN(PERF_EVENT)		\
+	FN(CGROUP_SKB)		\
+	FN(CGROUP_SOCK)		\
+	FN(LWT_IN)		\
+	FN(LWT_OUT)		\
+	FN(LWT_XMIT)
+
+#define __MAP_TYPE_MAP(FN)	\
+	FN(HASH)		\
+	FN(ARRAY)		\
+	FN(PROG_ARRAY)		\
+	FN(PERF_EVENT_ARRAY)	\
+	FN(PERCPU_HASH)		\
+	FN(PERCPU_ARRAY)	\
+	FN(STACK_TRACE)		\
+	FN(CGROUP_ARRAY)	\
+	FN(LRU_HASH)		\
+	FN(LRU_PERCPU_HASH)	\
+	FN(LPM_TRIE)
+
+#define __PROG_TYPE_TP_FN(x)	\
+	TRACE_DEFINE_ENUM(BPF_PROG_TYPE_##x);
+#define __PROG_TYPE_SYM_FN(x)	\
+	{ BPF_PROG_TYPE_##x, #x },
+#define __PROG_TYPE_SYM_TAB	\
+	__PROG_TYPE_MAP(__PROG_TYPE_SYM_FN) { -1, 0 }
+__PROG_TYPE_MAP(__PROG_TYPE_TP_FN)
+
+#define __MAP_TYPE_TP_FN(x)	\
+	TRACE_DEFINE_ENUM(BPF_MAP_TYPE_##x);
+#define __MAP_TYPE_SYM_FN(x)	\
+	{ BPF_MAP_TYPE_##x, #x },
+#define __MAP_TYPE_SYM_TAB	\
+	__MAP_TYPE_MAP(__MAP_TYPE_SYM_FN) { -1, 0 }
+__MAP_TYPE_MAP(__MAP_TYPE_TP_FN)
+
+DECLARE_EVENT_CLASS(bpf_prog_event,
+
+	TP_PROTO(const struct bpf_prog *prg),
+
+	TP_ARGS(prg),
+
+	TP_STRUCT__entry(
+		__array(u8, prog_tag, 8)
+		__field(u32, type)
+	),
+
+	TP_fast_assign(
+		BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag));
+		memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag));
+		__entry->type = prg->type;
+	),
+
+	TP_printk("prog=%s type=%s",
+		  __print_hex_str(__entry->prog_tag, 8),
+		  __print_symbolic(__entry->type, __PROG_TYPE_SYM_TAB))
+);
+
+DEFINE_EVENT(bpf_prog_event, bpf_prog_get_type,
+
+	TP_PROTO(const struct bpf_prog *prg),
+
+	TP_ARGS(prg)
+);
+
+DEFINE_EVENT(bpf_prog_event, bpf_prog_put_rcu,
+
+	TP_PROTO(const struct bpf_prog *prg),
+
+	TP_ARGS(prg)
+);
+
+TRACE_EVENT(bpf_prog_load,
+
+	TP_PROTO(const struct bpf_prog *prg, int ufd),
+
+	TP_ARGS(prg, ufd),
+
+	TP_STRUCT__entry(
+		__array(u8, prog_tag, 8)
+		__field(u32, type)
+		__field(int, ufd)
+	),
+
+	TP_fast_assign(
+		BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag));
+		memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag));
+		__entry->type = prg->type;
+		__entry->ufd  = ufd;
+	),
+
+	TP_printk("prog=%s type=%s ufd=%d",
+		  __print_hex_str(__entry->prog_tag, 8),
+		  __print_symbolic(__entry->type, __PROG_TYPE_SYM_TAB),
+		  __entry->ufd)
+);
+
+TRACE_EVENT(bpf_map_create,
+
+	TP_PROTO(const struct bpf_map *map, int ufd),
+
+	TP_ARGS(map, ufd),
+
+	TP_STRUCT__entry(
+		__field(u32, type)
+		__field(u32, size_key)
+		__field(u32, size_value)
+		__field(u32, max_entries)
+		__field(u32, flags)
+		__field(int, ufd)
+	),
+
+	TP_fast_assign(
+		__entry->type        = map->map_type;
+		__entry->size_key    = map->key_size;
+		__entry->size_value  = map->value_size;
+		__entry->max_entries = map->max_entries;
+		__entry->flags       = map->map_flags;
+		__entry->ufd         = ufd;
+	),
+
+	TP_printk("map type=%s ufd=%d key=%u val=%u max=%u flags=%x",
+		  __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB),
+		  __entry->ufd, __entry->size_key, __entry->size_value,
+		  __entry->max_entries, __entry->flags)
+);
+
+DECLARE_EVENT_CLASS(bpf_obj_prog,
+
+	TP_PROTO(const struct bpf_prog *prg, int ufd,
+		 const struct filename *pname),
+
+	TP_ARGS(prg, ufd, pname),
+
+	TP_STRUCT__entry(
+		__array(u8, prog_tag, 8)
+		__field(int, ufd)
+		__string(path, pname->name)
+	),
+
+	TP_fast_assign(
+		BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag));
+		memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag));
+		__assign_str(path, pname->name);
+		__entry->ufd = ufd;
+	),
+
+	TP_printk("prog=%s path=%s ufd=%d",
+		  __print_hex_str(__entry->prog_tag, 8),
+		  __get_str(path), __entry->ufd)
+);
+
+DEFINE_EVENT(bpf_obj_prog, bpf_obj_pin_prog,
+
+	TP_PROTO(const struct bpf_prog *prg, int ufd,
+		 const struct filename *pname),
+
+	TP_ARGS(prg, ufd, pname)
+);
+
+DEFINE_EVENT(bpf_obj_prog, bpf_obj_get_prog,
+
+	TP_PROTO(const struct bpf_prog *prg, int ufd,
+		 const struct filename *pname),
+
+	TP_ARGS(prg, ufd, pname)
+);
+
+DECLARE_EVENT_CLASS(bpf_obj_map,
+
+	TP_PROTO(const struct bpf_map *map, int ufd,
+		 const struct filename *pname),
+
+	TP_ARGS(map, ufd, pname),
+
+	TP_STRUCT__entry(
+		__field(u32, type)
+		__field(int, ufd)
+		__string(path, pname->name)
+	),
+
+	TP_fast_assign(
+		__assign_str(path, pname->name);
+		__entry->type = map->map_type;
+		__entry->ufd  = ufd;
+	),
+
+	TP_printk("map type=%s ufd=%d path=%s",
+		  __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB),
+		  __entry->ufd, __get_str(path))
+);
+
+DEFINE_EVENT(bpf_obj_map, bpf_obj_pin_map,
+
+	TP_PROTO(const struct bpf_map *map, int ufd,
+		 const struct filename *pname),
+
+	TP_ARGS(map, ufd, pname)
+);
+
+DEFINE_EVENT(bpf_obj_map, bpf_obj_get_map,
+
+	TP_PROTO(const struct bpf_map *map, int ufd,
+		 const struct filename *pname),
+
+	TP_ARGS(map, ufd, pname)
+);
+
+DECLARE_EVENT_CLASS(bpf_map_keyval,
+
+	TP_PROTO(const struct bpf_map *map, int ufd,
+		 const void *key, const void *val),
+
+	TP_ARGS(map, ufd, key, val),
+
+	TP_STRUCT__entry(
+		__field(u32, type)
+		__field(u32, key_len)
+		__dynamic_array(u8, key, map->key_size)
+		__field(bool, key_trunc)
+		__field(u32, val_len)
+		__dynamic_array(u8, val, map->value_size)
+		__field(bool, val_trunc)
+		__field(int, ufd)
+	),
+
+	TP_fast_assign(
+		memcpy(__get_dynamic_array(key), key, map->key_size);
+		memcpy(__get_dynamic_array(val), val, map->value_size);
+		__entry->type      = map->map_type;
+		__entry->key_len   = min(map->key_size, 16U);
+		__entry->key_trunc = map->key_size != __entry->key_len;
+		__entry->val_len   = min(map->value_size, 16U);
+		__entry->val_trunc = map->value_size != __entry->val_len;
+		__entry->ufd       = ufd;
+	),
+
+	TP_printk("map type=%s ufd=%d key=[%s%s] val=[%s%s]",
+		  __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB),
+		  __entry->ufd,
+		  __print_hex(__get_dynamic_array(key), __entry->key_len),
+		  __entry->key_trunc ? " ..." : "",
+		  __print_hex(__get_dynamic_array(val), __entry->val_len),
+		  __entry->val_trunc ? " ..." : "")
+);
+
+DEFINE_EVENT(bpf_map_keyval, bpf_map_lookup_elem,
+
+	TP_PROTO(const struct bpf_map *map, int ufd,
+		 const void *key, const void *val),
+
+	TP_ARGS(map, ufd, key, val)
+);
+
+DEFINE_EVENT(bpf_map_keyval, bpf_map_update_elem,
+
+	TP_PROTO(const struct bpf_map *map, int ufd,
+		 const void *key, const void *val),
+
+	TP_ARGS(map, ufd, key, val)
+);
+
+TRACE_EVENT(bpf_map_delete_elem,
+
+	TP_PROTO(const struct bpf_map *map, int ufd,
+		 const void *key),
+
+	TP_ARGS(map, ufd, key),
+
+	TP_STRUCT__entry(
+		__field(u32, type)
+		__field(u32, key_len)
+		__dynamic_array(u8, key, map->key_size)
+		__field(bool, key_trunc)
+		__field(int, ufd)
+	),
+
+	TP_fast_assign(
+		memcpy(__get_dynamic_array(key), key, map->key_size);
+		__entry->type      = map->map_type;
+		__entry->key_len   = min(map->key_size, 16U);
+		__entry->key_trunc = map->key_size != __entry->key_len;
+		__entry->ufd       = ufd;
+	),
+
+	TP_printk("map type=%s ufd=%d key=[%s%s]",
+		  __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB),
+		  __entry->ufd,
+		  __print_hex(__get_dynamic_array(key), __entry->key_len),
+		  __entry->key_trunc ? " ..." : "")
+);
+
+TRACE_EVENT(bpf_map_next_key,
+
+	TP_PROTO(const struct bpf_map *map, int ufd,
+		 const void *key, const void *key_next),
+
+	TP_ARGS(map, ufd, key, key_next),
+
+	TP_STRUCT__entry(
+		__field(u32, type)
+		__field(u32, key_len)
+		__dynamic_array(u8, key, map->key_size)
+		__dynamic_array(u8, nxt, map->key_size)
+		__field(bool, key_trunc)
+		__field(int, ufd)
+	),
+
+	TP_fast_assign(
+		memcpy(__get_dynamic_array(key), key, map->key_size);
+		memcpy(__get_dynamic_array(nxt), key_next, map->key_size);
+		__entry->type      = map->map_type;
+		__entry->key_len   = min(map->key_size, 16U);
+		__entry->key_trunc = map->key_size != __entry->key_len;
+		__entry->ufd       = ufd;
+	),
+
+	TP_printk("map type=%s ufd=%d key=[%s%s] next=[%s%s]",
+		  __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB),
+		  __entry->ufd,
+		  __print_hex(__get_dynamic_array(key), __entry->key_len),
+		  __entry->key_trunc ? " ..." : "",
+		  __print_hex(__get_dynamic_array(nxt), __entry->key_len),
+		  __entry->key_trunc ? " ..." : "")
+);
+
+#endif /* _TRACE_BPF_H */
+
+#include <trace/define_trace.h>
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
new file mode 100644
index 000000000000..1b61357d3f57
--- /dev/null
+++ b/include/trace/events/xdp.h
@@ -0,0 +1,53 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM xdp
+
+#if !defined(_TRACE_XDP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_XDP_H
+
+#include <linux/netdevice.h>
+#include <linux/filter.h>
+#include <linux/tracepoint.h>
+
+#define __XDP_ACT_MAP(FN)	\
+	FN(ABORTED)		\
+	FN(DROP)		\
+	FN(PASS)		\
+	FN(TX)
+
+#define __XDP_ACT_TP_FN(x)	\
+	TRACE_DEFINE_ENUM(XDP_##x);
+#define __XDP_ACT_SYM_FN(x)	\
+	{ XDP_##x, #x },
+#define __XDP_ACT_SYM_TAB	\
+	__XDP_ACT_MAP(__XDP_ACT_SYM_FN) { -1, 0 }
+__XDP_ACT_MAP(__XDP_ACT_TP_FN)
+
+TRACE_EVENT(xdp_exception,
+
+	TP_PROTO(const struct net_device *dev,
+		 const struct bpf_prog *xdp, u32 act),
+
+	TP_ARGS(dev, xdp, act),
+
+	TP_STRUCT__entry(
+		__string(name, dev->name)
+		__array(u8, prog_tag, 8)
+		__field(u32, act)
+	),
+
+	TP_fast_assign(
+		BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(xdp->tag));
+		memcpy(__entry->prog_tag, xdp->tag, sizeof(xdp->tag));
+		__assign_str(name, dev->name);
+		__entry->act = act;
+	),
+
+	TP_printk("prog=%s device=%s action=%s",
+		  __print_hex_str(__entry->prog_tag, 8),
+		  __get_str(name),
+		  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB))
+);
+
+#endif /* _TRACE_XDP_H */
+
+#include <trace/define_trace.h>
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 503d4211988a..fddd76b1b627 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1173,3 +1173,12 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
 {
 	return -EFAULT;
 }
+
+/* All definitions of tracepoints related to BPF. */
+#define CREATE_TRACE_POINTS
+#include <linux/bpf_trace.h>
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu);
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 0b030c9126d3..fddcae801724 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -21,6 +21,7 @@
 #include <linux/parser.h>
 #include <linux/filter.h>
 #include <linux/bpf.h>
+#include <linux/bpf_trace.h>
 
 enum bpf_type {
 	BPF_TYPE_UNSPEC	= 0,
@@ -281,6 +282,13 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
 	ret = bpf_obj_do_pin(pname, raw, type);
 	if (ret != 0)
 		bpf_any_put(raw, type);
+	if ((trace_bpf_obj_pin_prog_enabled() ||
+	     trace_bpf_obj_pin_map_enabled()) && !ret) {
+		if (type == BPF_TYPE_PROG)
+			trace_bpf_obj_pin_prog(raw, ufd, pname);
+		if (type == BPF_TYPE_MAP)
+			trace_bpf_obj_pin_map(raw, ufd, pname);
+	}
 out:
 	putname(pname);
 	return ret;
@@ -342,8 +350,15 @@ int bpf_obj_get_user(const char __user *pathname)
 	else
 		goto out;
 
-	if (ret < 0)
+	if (ret < 0) {
 		bpf_any_put(raw, type);
+	} else if (trace_bpf_obj_get_prog_enabled() ||
+		   trace_bpf_obj_get_map_enabled()) {
+		if (type == BPF_TYPE_PROG)
+			trace_bpf_obj_get_prog(raw, ret, pname);
+		if (type == BPF_TYPE_MAP)
+			trace_bpf_obj_get_map(raw, ret, pname);
+	}
 out:
 	putname(pname);
 	return ret;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1d6b29e4e2c3..05ad086ab71d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -10,6 +10,7 @@
  * General Public License for more details.
  */
 #include <linux/bpf.h>
+#include <linux/bpf_trace.h>
 #include <linux/syscalls.h>
 #include <linux/slab.h>
 #include <linux/anon_inodes.h>
@@ -215,6 +216,7 @@ static int map_create(union bpf_attr *attr)
 		/* failed to allocate fd */
 		goto free_map;
 
+	trace_bpf_map_create(map, err);
 	return err;
 
 free_map:
@@ -339,6 +341,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (copy_to_user(uvalue, value, value_size) != 0)
 		goto free_value;
 
+	trace_bpf_map_lookup_elem(map, ufd, key, value);
 	err = 0;
 
 free_value:
@@ -421,6 +424,8 @@ static int map_update_elem(union bpf_attr *attr)
 	__this_cpu_dec(bpf_prog_active);
 	preempt_enable();
 
+	if (!err)
+		trace_bpf_map_update_elem(map, ufd, key, value);
 free_value:
 	kfree(value);
 free_key:
@@ -466,6 +471,8 @@ static int map_delete_elem(union bpf_attr *attr)
 	__this_cpu_dec(bpf_prog_active);
 	preempt_enable();
 
+	if (!err)
+		trace_bpf_map_delete_elem(map, ufd, key);
 free_key:
 	kfree(key);
 err_put:
@@ -518,6 +525,7 @@ static int map_get_next_key(union bpf_attr *attr)
 	if (copy_to_user(unext_key, next_key, map->key_size) != 0)
 		goto free_next_key;
 
+	trace_bpf_map_next_key(map, ufd, key, next_key);
 	err = 0;
 
 free_next_key:
@@ -671,8 +679,10 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 
 void bpf_prog_put(struct bpf_prog *prog)
 {
-	if (atomic_dec_and_test(&prog->aux->refcnt))
+	if (atomic_dec_and_test(&prog->aux->refcnt)) {
+		trace_bpf_prog_put_rcu(prog);
 		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
+	}
 }
 EXPORT_SYMBOL_GPL(bpf_prog_put);
 
@@ -781,7 +791,11 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
 
 struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
 {
-	return __bpf_prog_get(ufd, &type);
+	struct bpf_prog *prog = __bpf_prog_get(ufd, &type);
+
+	if (!IS_ERR(prog))
+		trace_bpf_prog_get_type(prog);
+	return prog;
 }
 EXPORT_SYMBOL_GPL(bpf_prog_get_type);
 
@@ -863,6 +877,7 @@ static int bpf_prog_load(union bpf_attr *attr)
 		/* failed to allocate fd */
 		goto free_used_maps;
 
+	trace_bpf_prog_load(prog, err);
 	return err;
 
 free_used_maps:
-- 
cgit v1.2.3


From 19f6d3f3c8422d65b5e3d2162e30ef07c6e21ea2 Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Mon, 23 Jan 2017 10:59:22 -0800
Subject: net/tcp-fastopen: Add new API support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds a new socket option, TCP_FASTOPEN_CONNECT, as an
alternative way to perform Fast Open on the active side (client). Prior
to this patch, a client needs to replace the connect() call with
sendto(MSG_FASTOPEN). This can be cumbersome for applications who want
to use Fast Open: these socket operations are often done in lower layer
libraries used by many other applications. Changing these libraries
and/or the socket call sequences are not trivial. A more convenient
approach is to perform Fast Open by simply enabling a socket option when
the socket is created w/o changing other socket calls sequence:
  s = socket()
    create a new socket
  setsockopt(s, IPPROTO_TCP, TCP_FASTOPEN_CONNECT …);
    newly introduced sockopt
    If set, new functionality described below will be used.
    Return ENOTSUPP if TFO is not supported or not enabled in the
    kernel.

  connect()
    With cookie present, return 0 immediately.
    With no cookie, initiate 3WHS with TFO cookie-request option and
    return -1 with errno = EINPROGRESS.

  write()/sendmsg()
    With cookie present, send out SYN with data and return the number of
    bytes buffered.
    With no cookie, and 3WHS not yet completed, return -1 with errno =
    EINPROGRESS.
    No MSG_FASTOPEN flag is needed.

  read()
    Return -1 with errno = EWOULDBLOCK/EAGAIN if connect() is called but
    write() is not called yet.
    Return -1 with errno = EWOULDBLOCK/EAGAIN if connection is
    established but no msg is received yet.
    Return number of bytes read if socket is established and there is
    msg received.

The new API simplifies life for applications that always perform a write()
immediately after a successful connect(). Such applications can now take
advantage of Fast Open by merely making one new setsockopt() call at the time
of creating the socket. Nothing else about the application's socket call
sequence needs to change.

Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  3 ++-
 include/net/inet_sock.h  |  6 +++++-
 include/net/tcp.h        |  1 +
 include/uapi/linux/tcp.h |  1 +
 net/ipv4/af_inet.c       | 31 ++++++++++++++++++++++++-------
 net/ipv4/tcp.c           | 35 ++++++++++++++++++++++++++++++++++-
 net/ipv4/tcp_fastopen.c  | 33 +++++++++++++++++++++++++++++++++
 net/ipv4/tcp_ipv4.c      |  7 ++++++-
 net/ipv6/tcp_ipv6.c      |  5 +++++
 9 files changed, 111 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 5371b3d70cfe..f88f4649ba6f 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -222,7 +222,8 @@ struct tcp_sock {
 	u32	chrono_stat[3];	/* Time in jiffies for chrono_stat stats */
 	u8	chrono_type:2,	/* current chronograph type */
 		rate_app_limited:1,  /* rate_{delivered,interval_us} limited? */
-		unused:5;
+		fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
+		unused:4;
 	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
 		thin_lto    : 1,/* Use linear timeouts for thin streams */
 		unused1	    : 1,
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index c9cff977a7fb..aa95053dfc78 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -206,7 +206,11 @@ struct inet_sock {
 				transparent:1,
 				mc_all:1,
 				nodefrag:1;
-	__u8			bind_address_no_port:1;
+	__u8			bind_address_no_port:1,
+				defer_connect:1; /* Indicates that fastopen_connect is set
+						  * and cookie exists so we defer connect
+						  * until first data frame is written
+						  */
 	__u8			rcv_tos;
 	__u8			convert_csum;
 	int			uc_index;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index de67541d7adf..6ec4ea652f3f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1495,6 +1495,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 void tcp_fastopen_init_key_once(bool publish);
 bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
 			     struct tcp_fastopen_cookie *cookie);
+bool tcp_fastopen_defer_connect(struct sock *sk, int *err);
 #define TCP_FASTOPEN_KEY_LENGTH 16
 
 /* Fastopen key context */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index c53de2691cec..6ff35eb48d10 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -116,6 +116,7 @@ enum {
 #define TCP_SAVE_SYN		27	/* Record SYN headers for new connections */
 #define TCP_SAVED_SYN		28	/* Get SYN headers recorded for connection */
 #define TCP_REPAIR_WINDOW	29	/* Get/set window parameters */
+#define TCP_FASTOPEN_CONNECT	30	/* Attempt FastOpen with connect */
 
 struct tcp_repair_opt {
 	__u32	opt_code;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 28fe8da4e1ac..92e7f3e957fa 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -576,13 +576,24 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	int err;
 	long timeo;
 
-	if (addr_len < sizeof(uaddr->sa_family))
-		return -EINVAL;
+	/*
+	 * uaddr can be NULL and addr_len can be 0 if:
+	 * sk is a TCP fastopen active socket and
+	 * TCP_FASTOPEN_CONNECT sockopt is set and
+	 * we already have a valid cookie for this socket.
+	 * In this case, user can call write() after connect().
+	 * write() will invoke tcp_sendmsg_fastopen() which calls
+	 * __inet_stream_connect().
+	 */
+	if (uaddr) {
+		if (addr_len < sizeof(uaddr->sa_family))
+			return -EINVAL;
 
-	if (uaddr->sa_family == AF_UNSPEC) {
-		err = sk->sk_prot->disconnect(sk, flags);
-		sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
-		goto out;
+		if (uaddr->sa_family == AF_UNSPEC) {
+			err = sk->sk_prot->disconnect(sk, flags);
+			sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+			goto out;
+		}
 	}
 
 	switch (sock->state) {
@@ -593,7 +604,10 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 		err = -EISCONN;
 		goto out;
 	case SS_CONNECTING:
-		err = -EALREADY;
+		if (inet_sk(sk)->defer_connect)
+			err = -EINPROGRESS;
+		else
+			err = -EALREADY;
 		/* Fall out of switch with err, set for this state */
 		break;
 	case SS_UNCONNECTED:
@@ -607,6 +621,9 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 
 		sock->state = SS_CONNECTING;
 
+		if (!err && inet_sk(sk)->defer_connect)
+			goto out;
+
 		/* Just entered SS_CONNECTING state; the only
 		 * difference is that return value in non-blocking
 		 * case is EINPROGRESS, rather than EALREADY.
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c43eb1a831d7..d9735b76d073 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -533,6 +533,12 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 
 		if (tp->urg_data & TCP_URG_VALID)
 			mask |= POLLPRI;
+	} else if (sk->sk_state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
+		/* Active TCP fastopen socket with defer_connect
+		 * Return POLLOUT so application can call write()
+		 * in order for kernel to generate SYN+data
+		 */
+		mask |= POLLOUT | POLLWRNORM;
 	}
 	/* This barrier is coupled with smp_wmb() in tcp_reset() */
 	smp_rmb();
@@ -1071,6 +1077,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
 				int *copied, size_t size)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_sock *inet = inet_sk(sk);
 	int err, flags;
 
 	if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
@@ -1085,9 +1092,19 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
 	tp->fastopen_req->data = msg;
 	tp->fastopen_req->size = size;
 
+	if (inet->defer_connect) {
+		err = tcp_connect(sk);
+		/* Same failure procedure as in tcp_v4/6_connect */
+		if (err) {
+			tcp_set_state(sk, TCP_CLOSE);
+			inet->inet_dport = 0;
+			sk->sk_route_caps = 0;
+		}
+	}
 	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
 	err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
 				    msg->msg_namelen, flags);
+	inet->defer_connect = 0;
 	*copied = tp->fastopen_req->copied;
 	tcp_free_fastopen_req(tp);
 	return err;
@@ -1107,7 +1124,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 	lock_sock(sk);
 
 	flags = msg->msg_flags;
-	if (flags & MSG_FASTOPEN) {
+	if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) {
 		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
 		if (err == -EINPROGRESS && copied_syn > 0)
 			goto out;
@@ -2656,6 +2673,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 			err = -EINVAL;
 		}
 		break;
+	case TCP_FASTOPEN_CONNECT:
+		if (val > 1 || val < 0) {
+			err = -EINVAL;
+		} else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
+			if (sk->sk_state == TCP_CLOSE)
+				tp->fastopen_connect = val;
+			else
+				err = -EINVAL;
+		} else {
+			err = -EOPNOTSUPP;
+		}
+		break;
 	case TCP_TIMESTAMP:
 		if (!tp->repair)
 			err = -EPERM;
@@ -3016,6 +3045,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 		val = icsk->icsk_accept_queue.fastopenq.max_qlen;
 		break;
 
+	case TCP_FASTOPEN_CONNECT:
+		val = tp->fastopen_connect;
+		break;
+
 	case TCP_TIMESTAMP:
 		val = tcp_time_stamp + tp->tsoffset;
 		break;
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index f90e09e1ff4c..9674bec4a0f8 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -346,3 +346,36 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
 	}
 	return cookie->len > 0;
 }
+
+/* This function checks if we want to defer sending SYN until the first
+ * write().  We defer under the following conditions:
+ * 1. fastopen_connect sockopt is set
+ * 2. we have a valid cookie
+ * Return value: return true if we want to defer until application writes data
+ *               return false if we want to send out SYN immediately
+ */
+bool tcp_fastopen_defer_connect(struct sock *sk, int *err)
+{
+	struct tcp_fastopen_cookie cookie = { .len = 0 };
+	struct tcp_sock *tp = tcp_sk(sk);
+	u16 mss;
+
+	if (tp->fastopen_connect && !tp->fastopen_req) {
+		if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) {
+			inet_sk(sk)->defer_connect = 1;
+			return true;
+		}
+
+		/* Alloc fastopen_req in order for FO option to be included
+		 * in SYN
+		 */
+		tp->fastopen_req = kzalloc(sizeof(*tp->fastopen_req),
+					   sk->sk_allocation);
+		if (tp->fastopen_req)
+			tp->fastopen_req->cookie = cookie;
+		else
+			*err = -ENOBUFS;
+	}
+	return false;
+}
+EXPORT_SYMBOL(tcp_fastopen_defer_connect);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a90b4540c11e..8c9e9aa17d66 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -232,6 +232,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	/* OK, now commit destination to socket.  */
 	sk->sk_gso_type = SKB_GSO_TCPV4;
 	sk_setup_caps(sk, &rt->dst);
+	rt = NULL;
 
 	if (!tp->write_seq && likely(!tp->repair))
 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
@@ -242,9 +243,13 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 
 	inet->inet_id = tp->write_seq ^ jiffies;
 
+	if (tcp_fastopen_defer_connect(sk, &err))
+		return err;
+	if (err)
+		goto failure;
+
 	err = tcp_connect(sk);
 
-	rt = NULL;
 	if (err)
 		goto failure;
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 0b7cd3d009b6..95c05e5293b1 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -287,6 +287,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 							     inet->inet_dport,
 							     &tp->tsoffset);
 
+	if (tcp_fastopen_defer_connect(sk, &err))
+		return err;
+	if (err)
+		goto late_failure;
+
 	err = tcp_connect(sk);
 	if (err)
 		goto late_failure;
-- 
cgit v1.2.3


From f2c4689640e9a34bc45c013032185ed4ce47e7ff Mon Sep 17 00:00:00 2001
From: Lance Roy <ldr709@gmail.com>
Date: Mon, 23 Jan 2017 13:35:18 -0800
Subject: srcu: Implement more-efficient reader counts

SRCU uses two per-cpu counters: a nesting counter to count the number of
active critical sections, and a sequence counter to ensure that the nesting
counters don't change while they are being added together in
srcu_readers_active_idx_check().

This patch instead uses per-cpu lock and unlock counters. Because both
counters only increase and srcu_readers_active_idx_check() reads the unlock
counter before the lock counter, this achieves the same end without having
to increment two different counters in srcu_read_lock(). This also saves a
smp_mb() in srcu_readers_active_idx_check().

Possible bug: There is no guarantee that the lock counter won't overflow
during srcu_readers_active_idx_check(), as there are no memory barriers
around srcu_flip() (see comment in srcu_readers_active_idx_check() for
details). However, this problem was already present before this patch.

Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Lance Roy <ldr709@gmail.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/srcu.h    |  10 ++--
 kernel/rcu/rcutorture.c |  19 +++++++-
 kernel/rcu/srcu.c       | 122 +++++++++++++++++-------------------------------
 3 files changed, 66 insertions(+), 85 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index dc8eb63c6568..a598cf3ac70c 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -33,9 +33,9 @@
 #include <linux/rcupdate.h>
 #include <linux/workqueue.h>
 
-struct srcu_struct_array {
-	unsigned long c[2];
-	unsigned long seq[2];
+struct srcu_array {
+	unsigned long lock_count[2];
+	unsigned long unlock_count[2];
 };
 
 struct rcu_batch {
@@ -46,7 +46,7 @@ struct rcu_batch {
 
 struct srcu_struct {
 	unsigned long completed;
-	struct srcu_struct_array __percpu *per_cpu_ref;
+	struct srcu_array __percpu *per_cpu_ref;
 	spinlock_t queue_lock; /* protect ->batch_queue, ->running */
 	bool running;
 	/* callbacks just queued */
@@ -118,7 +118,7 @@ void process_srcu(struct work_struct *work);
  * See include/linux/percpu-defs.h for the rules on per-CPU variables.
  */
 #define __DEFINE_SRCU(name, is_static)					\
-	static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
+	static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\
 	is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
 #define DEFINE_SRCU(name)		__DEFINE_SRCU(name, /* not static */)
 #define DEFINE_STATIC_SRCU(name)	__DEFINE_SRCU(name, static)
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 87c51225ceec..d81345be730e 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -564,10 +564,25 @@ static void srcu_torture_stats(void)
 	pr_alert("%s%s per-CPU(idx=%d):",
 		 torture_type, TORTURE_FLAG, idx);
 	for_each_possible_cpu(cpu) {
+		unsigned long l0, l1;
+		unsigned long u0, u1;
 		long c0, c1;
+		struct srcu_array *counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu);
 
-		c0 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[!idx];
-		c1 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[idx];
+		u0 = counts->unlock_count[!idx];
+		u1 = counts->unlock_count[idx];
+
+		/*
+		 * Make sure that a lock is always counted if the corresponding
+		 * unlock is counted.
+		 */
+		smp_rmb();
+
+		l0 = counts->lock_count[!idx];
+		l1 = counts->lock_count[idx];
+
+		c0 = l0 - u0;
+		c1 = l1 - u1;
 		pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
 	}
 	pr_cont("\n");
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index 9b9cdd549caa..c9a0015e1c2e 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -106,7 +106,7 @@ static int init_srcu_struct_fields(struct srcu_struct *sp)
 	rcu_batch_init(&sp->batch_check1);
 	rcu_batch_init(&sp->batch_done);
 	INIT_DELAYED_WORK(&sp->work, process_srcu);
-	sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
+	sp->per_cpu_ref = alloc_percpu(struct srcu_array);
 	return sp->per_cpu_ref ? 0 : -ENOMEM;
 }
 
@@ -141,114 +141,77 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 /*
- * Returns approximate total of the readers' ->seq[] values for the
+ * Returns approximate total of the readers' ->lock_count[] values for the
  * rank of per-CPU counters specified by idx.
  */
-static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
+static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
 {
 	int cpu;
 	unsigned long sum = 0;
-	unsigned long t;
 
 	for_each_possible_cpu(cpu) {
-		t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
-		sum += t;
+		struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
+
+		sum += READ_ONCE(cpuc->lock_count[idx]);
 	}
 	return sum;
 }
 
 /*
- * Returns approximate number of readers active on the specified rank
- * of the per-CPU ->c[] counters.
+ * Returns approximate total of the readers' ->unlock_count[] values for the
+ * rank of per-CPU counters specified by idx.
  */
-static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
+static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
 {
 	int cpu;
 	unsigned long sum = 0;
-	unsigned long t;
 
 	for_each_possible_cpu(cpu) {
-		t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
-		sum += t;
+		struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
+
+		sum += READ_ONCE(cpuc->unlock_count[idx]);
 	}
 	return sum;
 }
 
 /*
  * Return true if the number of pre-existing readers is determined to
- * be stably zero.  An example unstable zero can occur if the call
- * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
- * but due to task migration, sees the corresponding __srcu_read_unlock()
- * decrement.  This can happen because srcu_readers_active_idx() takes
- * time to sum the array, and might in fact be interrupted or preempted
- * partway through the summation.
+ * be zero.
  */
 static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
 {
-	unsigned long seq;
+	unsigned long unlocks;
 
-	seq = srcu_readers_seq_idx(sp, idx);
+	unlocks = srcu_readers_unlock_idx(sp, idx);
 
 	/*
-	 * The following smp_mb() A pairs with the smp_mb() B located in
-	 * __srcu_read_lock().  This pairing ensures that if an
-	 * __srcu_read_lock() increments its counter after the summation
-	 * in srcu_readers_active_idx(), then the corresponding SRCU read-side
-	 * critical section will see any changes made prior to the start
-	 * of the current SRCU grace period.
+	 * Make sure that a lock is always counted if the corresponding unlock
+	 * is counted. Needs to be a smp_mb() as the read side may contain a
+	 * read from a variable that is written to before the synchronize_srcu()
+	 * in the write side. In this case smp_mb()s A and B act like the store
+	 * buffering pattern.
 	 *
-	 * Also, if the above call to srcu_readers_seq_idx() saw the
-	 * increment of ->seq[], then the call to srcu_readers_active_idx()
-	 * must see the increment of ->c[].
+	 * This smp_mb() also pairs with smp_mb() C to prevent accesses after the
+	 * synchronize_srcu() from being executed before the grace period ends.
 	 */
 	smp_mb(); /* A */
 
 	/*
-	 * Note that srcu_readers_active_idx() can incorrectly return
-	 * zero even though there is a pre-existing reader throughout.
-	 * To see this, suppose that task A is in a very long SRCU
-	 * read-side critical section that started on CPU 0, and that
-	 * no other reader exists, so that the sum of the counters
-	 * is equal to one.  Then suppose that task B starts executing
-	 * srcu_readers_active_idx(), summing up to CPU 1, and then that
-	 * task C starts reading on CPU 0, so that its increment is not
-	 * summed, but finishes reading on CPU 2, so that its decrement
-	 * -is- summed.  Then when task B completes its sum, it will
-	 * incorrectly get zero, despite the fact that task A has been
-	 * in its SRCU read-side critical section the whole time.
+	 * If the locks are the same as the unlocks, then there must have
+	 * been no readers on this index at some time in between. This does not
+	 * mean that there are no more readers, as one could have read the
+	 * current index but not have incremented the lock counter yet.
 	 *
-	 * We therefore do a validation step should srcu_readers_active_idx()
-	 * return zero.
+	 * Possible bug: There is no guarantee that there haven't been ULONG_MAX
+	 * increments of ->lock_count[] since the unlocks were counted, meaning
+	 * that this could return true even if there are still active readers.
+	 * Since there are no memory barriers around srcu_flip(), the CPU is not
+	 * required to increment ->completed before running
+	 * srcu_readers_unlock_idx(), which means that there could be an
+	 * arbitrarily large number of critical sections that execute after
+	 * srcu_readers_unlock_idx() but use the old value of ->completed.
 	 */
-	if (srcu_readers_active_idx(sp, idx) != 0)
-		return false;
-
-	/*
-	 * The remainder of this function is the validation step.
-	 * The following smp_mb() D pairs with the smp_mb() C in
-	 * __srcu_read_unlock().  If the __srcu_read_unlock() was seen
-	 * by srcu_readers_active_idx() above, then any destructive
-	 * operation performed after the grace period will happen after
-	 * the corresponding SRCU read-side critical section.
-	 *
-	 * Note that there can be at most NR_CPUS worth of readers using
-	 * the old index, which is not enough to overflow even a 32-bit
-	 * integer.  (Yes, this does mean that systems having more than
-	 * a billion or so CPUs need to be 64-bit systems.)  Therefore,
-	 * the sum of the ->seq[] counters cannot possibly overflow.
-	 * Therefore, the only way that the return values of the two
-	 * calls to srcu_readers_seq_idx() can be equal is if there were
-	 * no increments of the corresponding rank of ->seq[] counts
-	 * in the interim.  But the missed-increment scenario laid out
-	 * above includes an increment of the ->seq[] counter by
-	 * the corresponding __srcu_read_lock().  Therefore, if this
-	 * scenario occurs, the return values from the two calls to
-	 * srcu_readers_seq_idx() will differ, and thus the validation
-	 * step below suffices.
-	 */
-	smp_mb(); /* D */
-
-	return srcu_readers_seq_idx(sp, idx) == seq;
+	return srcu_readers_lock_idx(sp, idx) == unlocks;
 }
 
 /**
@@ -266,8 +229,12 @@ static bool srcu_readers_active(struct srcu_struct *sp)
 	unsigned long sum = 0;
 
 	for_each_possible_cpu(cpu) {
-		sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
-		sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
+		struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
+
+		sum += READ_ONCE(cpuc->lock_count[0]);
+		sum += READ_ONCE(cpuc->lock_count[1]);
+		sum -= READ_ONCE(cpuc->unlock_count[0]);
+		sum -= READ_ONCE(cpuc->unlock_count[1]);
 	}
 	return sum;
 }
@@ -298,9 +265,8 @@ int __srcu_read_lock(struct srcu_struct *sp)
 	int idx;
 
 	idx = READ_ONCE(sp->completed) & 0x1;
-	__this_cpu_inc(sp->per_cpu_ref->c[idx]);
+	__this_cpu_inc(sp->per_cpu_ref->lock_count[idx]);
 	smp_mb(); /* B */  /* Avoid leaking the critical section. */
-	__this_cpu_inc(sp->per_cpu_ref->seq[idx]);
 	return idx;
 }
 EXPORT_SYMBOL_GPL(__srcu_read_lock);
@@ -314,7 +280,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
 void __srcu_read_unlock(struct srcu_struct *sp, int idx)
 {
 	smp_mb(); /* C */  /* Avoid leaking the critical section. */
-	this_cpu_dec(sp->per_cpu_ref->c[idx]);
+	this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]);
 }
 EXPORT_SYMBOL_GPL(__srcu_read_unlock);
 
@@ -349,7 +315,7 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
 
 /*
  * Increment the ->completed counter so that future SRCU readers will
- * use the other rank of the ->c[] and ->seq[] arrays.  This allows
+ * use the other rank of the ->(un)lock_count[] arrays.  This allows
  * us to wait for pre-existing readers in a starvation-free manner.
  */
 static void srcu_flip(struct srcu_struct *sp)
-- 
cgit v1.2.3


From d85b62f18d543c663cbdd6061054efeb9e66cee7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 28 Nov 2016 12:08:49 -0800
Subject: srcu: Force full grace-period ordering

If a process invokes synchronize_srcu(), is delayed just the right amount
of time, and thus does not sleep when waiting for the grace period to
complete, there is no ordering between the end of the grace period and
the code following the synchronize_srcu().  Similarly, there can be a
lack of ordering between the end of the SRCU grace period and callback
invocation.

This commit adds the necessary ordering.

Reported-by: Lance Roy <ldr709@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
[ paulmck: Further smp_mb() adjustment per email with Lance Roy. ]
---
 include/linux/rcupdate.h | 12 ++++++++++++
 kernel/rcu/srcu.c        | 10 ++++++++--
 kernel/rcu/tree.h        | 12 ------------
 3 files changed, 20 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 01f71e1d2e94..6ade6a52d9d4 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -1161,5 +1161,17 @@ do { \
 		ftrace_dump(oops_dump_mode); \
 } while (0)
 
+/*
+ * Place this after a lock-acquisition primitive to guarantee that
+ * an UNLOCK+LOCK pair acts as a full barrier.  This guarantee applies
+ * if the UNLOCK and LOCK are executed by the same CPU or if the
+ * UNLOCK and LOCK operate on the same lock variable.
+ */
+#ifdef CONFIG_PPC
+#define smp_mb__after_unlock_lock()	smp_mb()  /* Full ordering for lock. */
+#else /* #ifdef CONFIG_PPC */
+#define smp_mb__after_unlock_lock()	do { } while (0)
+#endif /* #else #ifdef CONFIG_PPC */
+
 
 #endif /* __LINUX_RCUPDATE_H */
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index c9a0015e1c2e..665bc9951523 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -358,6 +358,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
 	head->next = NULL;
 	head->func = func;
 	spin_lock_irqsave(&sp->queue_lock, flags);
+	smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */
 	rcu_batch_queue(&sp->batch_queue, head);
 	if (!sp->running) {
 		sp->running = true;
@@ -391,6 +392,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
 	head->next = NULL;
 	head->func = wakeme_after_rcu;
 	spin_lock_irq(&sp->queue_lock);
+	smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */
 	if (!sp->running) {
 		/* steal the processing owner */
 		sp->running = true;
@@ -410,8 +412,11 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
 		spin_unlock_irq(&sp->queue_lock);
 	}
 
-	if (!done)
+	if (!done) {
 		wait_for_completion(&rcu.completion);
+		smp_mb(); /* Caller's later accesses after GP. */
+	}
+
 }
 
 /**
@@ -579,7 +584,8 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount)
 /*
  * Invoke a limited number of SRCU callbacks that have passed through
  * their grace period.  If there are more to do, SRCU will reschedule
- * the workqueue.
+ * the workqueue.  Note that needed memory barriers have been executed
+ * in this task's context by srcu_readers_active_idx_check().
  */
 static void srcu_invoke_callbacks(struct srcu_struct *sp)
 {
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index fe98dd24adf8..abcc25bdcb29 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -687,18 +687,6 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
 }
 #endif /* #ifdef CONFIG_RCU_TRACE */
 
-/*
- * Place this after a lock-acquisition primitive to guarantee that
- * an UNLOCK+LOCK pair act as a full barrier.  This guarantee applies
- * if the UNLOCK and LOCK are executed by the same CPU or if the
- * UNLOCK and LOCK operate on the same lock variable.
- */
-#ifdef CONFIG_PPC
-#define smp_mb__after_unlock_lock()	smp_mb()  /* Full ordering for lock. */
-#else /* #ifdef CONFIG_PPC */
-#define smp_mb__after_unlock_lock()	do { } while (0)
-#endif /* #else #ifdef CONFIG_PPC */
-
 /*
  * Wrappers for the rcu_node::lock acquire and release.
  *
-- 
cgit v1.2.3


From 85b4685da52f6680635370c2cfb9a42f04ac9652 Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <rafal@milecki.pl>
Date: Wed, 25 Jan 2017 21:00:25 +0100
Subject: net: phy: broadcom: use auxctl reading helper in BCM54612E code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Starting with commit 5b4e29005123 ("net: phy: broadcom: add
bcm54xx_auxctl_read") we have a reading helper so use it and avoid code
duplication.
It also means we don't need MII_BCM54XX_AUXCTL_SHDWSEL_MISC define as
it's the same as MII_BCM54XX_AUXCTL_SHDWSEL_MISC just for reading needs
(same value shifted by 12 bits).

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/broadcom.c | 6 ++----
 include/linux/brcmphy.h    | 1 -
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index 4223e35490b0..25c6e6cea2dc 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -395,10 +395,8 @@ static int bcm54612e_config_aneg(struct phy_device *phydev)
 	    (phydev->interface != PHY_INTERFACE_MODE_RGMII_RXID)) {
 		u16 reg;
 
-		/* Errata: reads require filling in the write selector field */
-		bcm54xx_auxctl_write(phydev, MII_BCM54XX_AUXCTL_SHDWSEL_MISC,
-				     MII_BCM54XX_AUXCTL_MISC_RDSEL_MISC);
-		reg = phy_read(phydev, MII_BCM54XX_AUX_CTL);
+		reg = bcm54xx_auxctl_read(phydev,
+					  MII_BCM54XX_AUXCTL_SHDWSEL_MISC);
 		/* Disable RXD to RXC delay (default set) */
 		reg &= ~MII_BCM54XX_AUXCTL_MISC_RXD_RXC_SKEW;
 		/* Clear shadow selector field */
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 295fb3e73de5..34e61004b9dc 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -111,7 +111,6 @@
 #define MII_BCM54XX_AUXCTL_MISC_WREN	0x8000
 #define MII_BCM54XX_AUXCTL_MISC_RXD_RXC_SKEW	0x0100
 #define MII_BCM54XX_AUXCTL_MISC_FORCE_AMDIX	0x0200
-#define MII_BCM54XX_AUXCTL_MISC_RDSEL_MISC	0x7000
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC	0x0007
 #define MII_BCM54XX_AUXCTL_SHDWSEL_READ_SHIFT	12
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_SKEW_EN	(1 << 8)
-- 
cgit v1.2.3


From 8293c7bcdef1b866610bf21dee3eb0d181cee753 Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <rafal@milecki.pl>
Date: Wed, 25 Jan 2017 21:00:26 +0100
Subject: net: phy: broadcom: drop duplicated define for RGMII SKEW delay
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We had two defines for the same bit (both were used with the
MII_BCM54XX_AUXCTL_SHDWSEL_MISC register).

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/broadcom.c | 2 +-
 include/linux/brcmphy.h    | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index 25c6e6cea2dc..97d1c057c0a1 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -398,7 +398,7 @@ static int bcm54612e_config_aneg(struct phy_device *phydev)
 		reg = bcm54xx_auxctl_read(phydev,
 					  MII_BCM54XX_AUXCTL_SHDWSEL_MISC);
 		/* Disable RXD to RXC delay (default set) */
-		reg &= ~MII_BCM54XX_AUXCTL_MISC_RXD_RXC_SKEW;
+		reg &= ~MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_SKEW_EN;
 		/* Clear shadow selector field */
 		reg &= ~MII_BCM54XX_AUXCTL_SHDWSEL_MASK;
 		bcm54xx_auxctl_write(phydev, MII_BCM54XX_AUXCTL_SHDWSEL_MISC,
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 34e61004b9dc..f9cb73df127e 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -109,7 +109,6 @@
 #define MII_BCM54XX_AUXCTL_ACTL_SMDSP_ENA	0x0800
 
 #define MII_BCM54XX_AUXCTL_MISC_WREN	0x8000
-#define MII_BCM54XX_AUXCTL_MISC_RXD_RXC_SKEW	0x0100
 #define MII_BCM54XX_AUXCTL_MISC_FORCE_AMDIX	0x0200
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC	0x0007
 #define MII_BCM54XX_AUXCTL_SHDWSEL_READ_SHIFT	12
-- 
cgit v1.2.3


From 5e7bfa6cb0a94c673f6d565e58353366d88e2aa5 Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <rafal@milecki.pl>
Date: Wed, 25 Jan 2017 21:00:27 +0100
Subject: net: phy: bcm-phy-lib: clean up remaining AUXCTL register defines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1) Use 0x%02x format for register number. This follows some other
   defines and makes it easier to distinct register from values.
2) Put register define above values and sort the values. It makes
   reading header code easier.
3) Use 0x%04x format for all values. It's about consistency with other
   values (and most of the header) not a personal preference.
4) Separate define for reading shift value with an extre empty line.
   It's user for all AUXCTL registers in a bcm54xx_auxctl_read.

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/brcmphy.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index f9cb73df127e..cf93f1399d3e 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -104,17 +104,17 @@
 /*
  * AUXILIARY CONTROL SHADOW ACCESS REGISTERS.  (PHY REG 0x18)
  */
-#define MII_BCM54XX_AUXCTL_SHDWSEL_AUXCTL	0x0000
+#define MII_BCM54XX_AUXCTL_SHDWSEL_AUXCTL	0x00
 #define MII_BCM54XX_AUXCTL_ACTL_TX_6DB		0x0400
 #define MII_BCM54XX_AUXCTL_ACTL_SMDSP_ENA	0x0800
 
-#define MII_BCM54XX_AUXCTL_MISC_WREN	0x8000
-#define MII_BCM54XX_AUXCTL_MISC_FORCE_AMDIX	0x0200
-#define MII_BCM54XX_AUXCTL_SHDWSEL_MISC	0x0007
-#define MII_BCM54XX_AUXCTL_SHDWSEL_READ_SHIFT	12
-#define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_SKEW_EN	(1 << 8)
-#define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_WIRESPEED_EN	(1 << 4)
+#define MII_BCM54XX_AUXCTL_SHDWSEL_MISC			0x07
+#define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_WIRESPEED_EN	0x0010
+#define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_SKEW_EN	0x0100
+#define MII_BCM54XX_AUXCTL_MISC_FORCE_AMDIX		0x0200
+#define MII_BCM54XX_AUXCTL_MISC_WREN			0x8000
 
+#define MII_BCM54XX_AUXCTL_SHDWSEL_READ_SHIFT	12
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MASK	0x0007
 
 /*
-- 
cgit v1.2.3


From a264d10ff45c688293d9112fddd8d29c819e0853 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 9 Jan 2017 16:02:28 +0200
Subject: gpiolib: Convert fwnode_get_named_gpiod() to configure GPIO

Make fwnode_get_named_gpiod() consistent with the rest of
gpiod_get() like API, i.e. configure GPIO pin immediately after
request.

Besides obvious clean up it will help to configure pins based
on firmware provided resources.

Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/devres.c                     |  9 +++++++--
 drivers/gpio/gpiolib.c                    | 20 ++++++++++++++++----
 drivers/input/keyboard/gpio_keys.c        |  9 +--------
 drivers/input/keyboard/gpio_keys_polled.c | 11 ++---------
 drivers/leds/leds-gpio.c                  |  2 +-
 drivers/video/fbdev/amba-clcd-nomadik.c   | 15 +++++----------
 include/linux/gpio/consumer.h             | 19 +++++++++++++------
 7 files changed, 45 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/devres.c b/drivers/gpio/devres.c
index b760cbbb41d8..5e0b41b5e554 100644
--- a/drivers/gpio/devres.c
+++ b/drivers/gpio/devres.c
@@ -127,13 +127,18 @@ EXPORT_SYMBOL(devm_gpiod_get_index);
  * @dev:	GPIO consumer
  * @con_id:	function within the GPIO consumer
  * @child:	firmware node (child of @dev)
+ * @flags:	GPIO initialization flags
  *
  * GPIO descriptors returned from this function are automatically disposed on
  * driver detach.
+ *
+ * On successfull request the GPIO pin is configured in accordance with
+ * provided @flags.
  */
 struct gpio_desc *devm_get_gpiod_from_child(struct device *dev,
 					    const char *con_id,
-					    struct fwnode_handle *child)
+					    struct fwnode_handle *child,
+					    enum gpiod_flags flags)
 {
 	static const char * const suffixes[] = { "gpios", "gpio" };
 	char prop_name[32]; /* 32 is max size of property name */
@@ -154,7 +159,7 @@ struct gpio_desc *devm_get_gpiod_from_child(struct device *dev,
 			snprintf(prop_name, sizeof(prop_name), "%s",
 							       suffixes[i]);
 
-		desc = fwnode_get_named_gpiod(child, prop_name);
+		desc = fwnode_get_named_gpiod(child, prop_name, flags);
 		if (!IS_ERR(desc) || (PTR_ERR(desc) != -ENOENT))
 			break;
 	}
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index f4c26c7826cd..bb3d3a7edc45 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -3309,6 +3309,7 @@ EXPORT_SYMBOL_GPL(gpiod_get_index);
  * fwnode_get_named_gpiod - obtain a GPIO from firmware node
  * @fwnode:	handle of the firmware node
  * @propname:	name of the firmware property representing the GPIO
+ * @dflags:	GPIO initialization flags
  *
  * This function can be used for drivers that get their configuration
  * from firmware.
@@ -3317,12 +3318,17 @@ EXPORT_SYMBOL_GPL(gpiod_get_index);
  * underlying firmware interface and then makes sure that the GPIO
  * descriptor is requested before it is returned to the caller.
  *
+ * On successfull request the GPIO pin is configured in accordance with
+ * provided @dflags.
+ *
  * In case of error an ERR_PTR() is returned.
  */
 struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode,
-					 const char *propname)
+					 const char *propname,
+					 enum gpiod_flags dflags)
 {
 	struct gpio_desc *desc = ERR_PTR(-ENODEV);
+	unsigned long lflags = 0;
 	bool active_low = false;
 	bool single_ended = false;
 	int ret;
@@ -3355,13 +3361,19 @@ struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode,
 		return ERR_PTR(ret);
 
 	if (active_low)
-		set_bit(FLAG_ACTIVE_LOW, &desc->flags);
+		lflags |= GPIO_ACTIVE_LOW;
 
 	if (single_ended) {
 		if (active_low)
-			set_bit(FLAG_OPEN_DRAIN, &desc->flags);
+			lflags |= GPIO_OPEN_DRAIN;
 		else
-			set_bit(FLAG_OPEN_SOURCE, &desc->flags);
+			lflags |= GPIO_OPEN_SOURCE;
+	}
+
+	ret = gpiod_configure_flags(desc, propname, lflags, dflags);
+	if (ret < 0) {
+		gpiod_put(desc);
+		return ERR_PTR(ret);
 	}
 
 	return desc;
diff --git a/drivers/input/keyboard/gpio_keys.c b/drivers/input/keyboard/gpio_keys.c
index 582462d0af75..9de4b876100a 100644
--- a/drivers/input/keyboard/gpio_keys.c
+++ b/drivers/input/keyboard/gpio_keys.c
@@ -481,7 +481,7 @@ static int gpio_keys_setup_key(struct platform_device *pdev,
 	spin_lock_init(&bdata->lock);
 
 	if (child) {
-		bdata->gpiod = devm_get_gpiod_from_child(dev, NULL, child);
+		bdata->gpiod = devm_get_gpiod_from_child(dev, NULL, child, GPIOD_IN);
 		if (IS_ERR(bdata->gpiod)) {
 			error = PTR_ERR(bdata->gpiod);
 			if (error == -ENOENT) {
@@ -496,13 +496,6 @@ static int gpio_keys_setup_key(struct platform_device *pdev,
 						error);
 				return error;
 			}
-		} else {
-			error = gpiod_direction_input(bdata->gpiod);
-			if (error) {
-				dev_err(dev, "Failed to configure GPIO %d as input: %d\n",
-					desc_to_gpio(bdata->gpiod), error);
-				return error;
-			}
 		}
 	} else if (gpio_is_valid(button->gpio)) {
 		/*
diff --git a/drivers/input/keyboard/gpio_keys_polled.c b/drivers/input/keyboard/gpio_keys_polled.c
index bed4f2086158..fd7ab4dad87b 100644
--- a/drivers/input/keyboard/gpio_keys_polled.c
+++ b/drivers/input/keyboard/gpio_keys_polled.c
@@ -304,7 +304,8 @@ static int gpio_keys_polled_probe(struct platform_device *pdev)
 			}
 
 			bdata->gpiod = devm_get_gpiod_from_child(dev, NULL,
-								 child);
+								 child,
+								 GPIOD_IN);
 			if (IS_ERR(bdata->gpiod)) {
 				error = PTR_ERR(bdata->gpiod);
 				if (error != -EPROBE_DEFER)
@@ -314,14 +315,6 @@ static int gpio_keys_polled_probe(struct platform_device *pdev)
 				fwnode_handle_put(child);
 				return error;
 			}
-
-			error = gpiod_direction_input(bdata->gpiod);
-			if (error) {
-				dev_err(dev, "Failed to configure GPIO %d as input: %d\n",
-					desc_to_gpio(bdata->gpiod), error);
-				fwnode_handle_put(child);
-				return error;
-			}
 		} else if (gpio_is_valid(button->gpio)) {
 			/*
 			 * Legacy GPIO number so request the GPIO here and
diff --git a/drivers/leds/leds-gpio.c b/drivers/leds/leds-gpio.c
index d400dcaf4d29..00cc671cddcc 100644
--- a/drivers/leds/leds-gpio.c
+++ b/drivers/leds/leds-gpio.c
@@ -174,7 +174,7 @@ static struct gpio_leds_priv *gpio_leds_create(struct platform_device *pdev)
 		const char *state = NULL;
 		struct device_node *np = to_of_node(child);
 
-		led.gpiod = devm_get_gpiod_from_child(dev, NULL, child);
+		led.gpiod = devm_get_gpiod_from_child(dev, NULL, child, GPIOD_ASIS);
 		if (IS_ERR(led.gpiod)) {
 			fwnode_handle_put(child);
 			return ERR_CAST(led.gpiod);
diff --git a/drivers/video/fbdev/amba-clcd-nomadik.c b/drivers/video/fbdev/amba-clcd-nomadik.c
index 0c06fcaaa6e8..2e800d70b0e5 100644
--- a/drivers/video/fbdev/amba-clcd-nomadik.c
+++ b/drivers/video/fbdev/amba-clcd-nomadik.c
@@ -184,32 +184,27 @@ static void tpg110_init(struct device *dev, struct device_node *np,
 {
 	dev_info(dev, "TPG110 display init\n");
 
-	grestb = devm_get_gpiod_from_child(dev, "grestb", &np->fwnode);
+	/* This asserts the GRESTB signal, putting the display into reset */
+	grestb = devm_get_gpiod_from_child(dev, "grestb", &np->fwnode, GPIOD_OUT_HIGH);
 	if (IS_ERR(grestb)) {
 		dev_err(dev, "no GRESTB GPIO\n");
 		return;
 	}
-	/* This asserts the GRESTB signal, putting the display into reset */
-	gpiod_direction_output(grestb, 1);
-
-	scen = devm_get_gpiod_from_child(dev, "scen", &np->fwnode);
+	scen = devm_get_gpiod_from_child(dev, "scen", &np->fwnode, GPIOD_OUT_LOW);
 	if (IS_ERR(scen)) {
 		dev_err(dev, "no SCEN GPIO\n");
 		return;
 	}
-	gpiod_direction_output(scen, 0);
-	scl = devm_get_gpiod_from_child(dev, "scl", &np->fwnode);
+	scl = devm_get_gpiod_from_child(dev, "scl", &np->fwnode, GPIOD_OUT_LOW);
 	if (IS_ERR(scl)) {
 		dev_err(dev, "no SCL GPIO\n");
 		return;
 	}
-	gpiod_direction_output(scl, 0);
-	sda = devm_get_gpiod_from_child(dev, "sda", &np->fwnode);
+	sda = devm_get_gpiod_from_child(dev, "sda", &np->fwnode, GPIOD_OUT_LOW);
 	if (IS_ERR(sda)) {
 		dev_err(dev, "no SDA GPIO\n");
 		return;
 	}
-	gpiod_direction_output(sda, 0);
 	board->enable = tpg110_enable;
 	board->disable = tpg110_disable;
 }
diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index fb0fde686cb1..930d10049d8d 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -135,10 +135,12 @@ int desc_to_gpio(const struct gpio_desc *desc);
 struct fwnode_handle;
 
 struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode,
-					 const char *propname);
+					 const char *propname,
+					 enum gpiod_flags dflags);
 struct gpio_desc *devm_get_gpiod_from_child(struct device *dev,
 					    const char *con_id,
-					    struct fwnode_handle *child);
+					    struct fwnode_handle *child,
+					    enum gpiod_flags flags);
 #else /* CONFIG_GPIOLIB */
 
 static inline int gpiod_count(struct device *dev, const char *con_id)
@@ -411,14 +413,19 @@ static inline int desc_to_gpio(const struct gpio_desc *desc)
 /* Child properties interface */
 struct fwnode_handle;
 
-static inline struct gpio_desc *fwnode_get_named_gpiod(
-	struct fwnode_handle *fwnode, const char *propname)
+static inline
+struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode,
+					 const char *propname,
+					 enum gpiod_flags dflags)
 {
 	return ERR_PTR(-ENOSYS);
 }
 
-static inline struct gpio_desc *devm_get_gpiod_from_child(
-	struct device *dev, const char *con_id, struct fwnode_handle *child)
+static inline
+struct gpio_desc *devm_get_gpiod_from_child(struct device *dev,
+					    const char *con_id,
+					    struct fwnode_handle *child,
+					    enum gpiod_flags flags)
 {
 	return ERR_PTR(-ENOSYS);
 }
-- 
cgit v1.2.3


From b2987d7438e0ca949d81774ca8b43d370a1f9947 Mon Sep 17 00:00:00 2001
From: Alexander Stein <alexander.stein@systec-electronic.com>
Date: Thu, 12 Jan 2017 17:39:24 +0100
Subject: gpio: Pass GPIO label down to gpiod_request

Currently all users of fwnode_get_named_gpiod() have no way to
specify a label for the GPIO. So GPIOs listed in debugfs are shown
with label "?". With this change a proper label is used.

Also adjust all users so they can pass a label, properly retrieved
from device tree properties.

Signed-off-by: Alexander Stein <alexander.stein@systec-electronic.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Jacek Anaszewski <jacek.anaszewski@gmail.com>
Acked-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/devres.c                     |  5 +++--
 drivers/gpio/gpiolib.c                    |  5 +++--
 drivers/input/keyboard/gpio_keys.c        |  3 ++-
 drivers/input/keyboard/gpio_keys_polled.c |  3 ++-
 drivers/leds/leds-gpio.c                  | 13 +++++++------
 drivers/video/fbdev/amba-clcd-nomadik.c   | 12 ++++++++----
 include/linux/gpio/consumer.h             | 12 ++++++++----
 7 files changed, 33 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/devres.c b/drivers/gpio/devres.c
index 5e0b41b5e554..89969e887e5d 100644
--- a/drivers/gpio/devres.c
+++ b/drivers/gpio/devres.c
@@ -138,7 +138,8 @@ EXPORT_SYMBOL(devm_gpiod_get_index);
 struct gpio_desc *devm_get_gpiod_from_child(struct device *dev,
 					    const char *con_id,
 					    struct fwnode_handle *child,
-					    enum gpiod_flags flags)
+					    enum gpiod_flags flags,
+					    const char *label)
 {
 	static const char * const suffixes[] = { "gpios", "gpio" };
 	char prop_name[32]; /* 32 is max size of property name */
@@ -159,7 +160,7 @@ struct gpio_desc *devm_get_gpiod_from_child(struct device *dev,
 			snprintf(prop_name, sizeof(prop_name), "%s",
 							       suffixes[i]);
 
-		desc = fwnode_get_named_gpiod(child, prop_name, flags);
+		desc = fwnode_get_named_gpiod(child, prop_name, flags, label);
 		if (!IS_ERR(desc) || (PTR_ERR(desc) != -ENOENT))
 			break;
 	}
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index bb3d3a7edc45..20c6c51cbe10 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -3325,7 +3325,8 @@ EXPORT_SYMBOL_GPL(gpiod_get_index);
  */
 struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode,
 					 const char *propname,
-					 enum gpiod_flags dflags)
+					 enum gpiod_flags dflags,
+					 const char *label)
 {
 	struct gpio_desc *desc = ERR_PTR(-ENODEV);
 	unsigned long lflags = 0;
@@ -3356,7 +3357,7 @@ struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode,
 	if (IS_ERR(desc))
 		return desc;
 
-	ret = gpiod_request(desc, NULL);
+	ret = gpiod_request(desc, label);
 	if (ret)
 		return ERR_PTR(ret);
 
diff --git a/drivers/input/keyboard/gpio_keys.c b/drivers/input/keyboard/gpio_keys.c
index 9de4b876100a..2ec74d90ef78 100644
--- a/drivers/input/keyboard/gpio_keys.c
+++ b/drivers/input/keyboard/gpio_keys.c
@@ -481,7 +481,8 @@ static int gpio_keys_setup_key(struct platform_device *pdev,
 	spin_lock_init(&bdata->lock);
 
 	if (child) {
-		bdata->gpiod = devm_get_gpiod_from_child(dev, NULL, child, GPIOD_IN);
+		bdata->gpiod = devm_get_gpiod_from_child(dev, NULL, child,
+							 GPIOD_IN, desc);
 		if (IS_ERR(bdata->gpiod)) {
 			error = PTR_ERR(bdata->gpiod);
 			if (error == -ENOENT) {
diff --git a/drivers/input/keyboard/gpio_keys_polled.c b/drivers/input/keyboard/gpio_keys_polled.c
index fd7ab4dad87b..5e35c565e341 100644
--- a/drivers/input/keyboard/gpio_keys_polled.c
+++ b/drivers/input/keyboard/gpio_keys_polled.c
@@ -305,7 +305,8 @@ static int gpio_keys_polled_probe(struct platform_device *pdev)
 
 			bdata->gpiod = devm_get_gpiod_from_child(dev, NULL,
 								 child,
-								 GPIOD_IN);
+								 GPIOD_IN,
+								 button->desc);
 			if (IS_ERR(bdata->gpiod)) {
 				error = PTR_ERR(bdata->gpiod);
 				if (error != -EPROBE_DEFER)
diff --git a/drivers/leds/leds-gpio.c b/drivers/leds/leds-gpio.c
index 00cc671cddcc..6c4825d96693 100644
--- a/drivers/leds/leds-gpio.c
+++ b/drivers/leds/leds-gpio.c
@@ -174,12 +174,6 @@ static struct gpio_leds_priv *gpio_leds_create(struct platform_device *pdev)
 		const char *state = NULL;
 		struct device_node *np = to_of_node(child);
 
-		led.gpiod = devm_get_gpiod_from_child(dev, NULL, child, GPIOD_ASIS);
-		if (IS_ERR(led.gpiod)) {
-			fwnode_handle_put(child);
-			return ERR_CAST(led.gpiod);
-		}
-
 		ret = fwnode_property_read_string(child, "label", &led.name);
 		if (ret && IS_ENABLED(CONFIG_OF) && np)
 			led.name = np->name;
@@ -188,6 +182,13 @@ static struct gpio_leds_priv *gpio_leds_create(struct platform_device *pdev)
 			return ERR_PTR(-EINVAL);
 		}
 
+		led.gpiod = devm_get_gpiod_from_child(dev, NULL, child,
+						      GPIOD_ASIS, led.name);
+		if (IS_ERR(led.gpiod)) {
+			fwnode_handle_put(child);
+			return ERR_CAST(led.gpiod);
+		}
+
 		fwnode_property_read_string(child, "linux,default-trigger",
 					    &led.default_trigger);
 
diff --git a/drivers/video/fbdev/amba-clcd-nomadik.c b/drivers/video/fbdev/amba-clcd-nomadik.c
index 2e800d70b0e5..9175ad9034e1 100644
--- a/drivers/video/fbdev/amba-clcd-nomadik.c
+++ b/drivers/video/fbdev/amba-clcd-nomadik.c
@@ -185,22 +185,26 @@ static void tpg110_init(struct device *dev, struct device_node *np,
 	dev_info(dev, "TPG110 display init\n");
 
 	/* This asserts the GRESTB signal, putting the display into reset */
-	grestb = devm_get_gpiod_from_child(dev, "grestb", &np->fwnode, GPIOD_OUT_HIGH);
+	grestb = devm_get_gpiod_from_child(dev, "grestb", &np->fwnode,
+					   GPIOD_OUT_HIGH, "grestb");
 	if (IS_ERR(grestb)) {
 		dev_err(dev, "no GRESTB GPIO\n");
 		return;
 	}
-	scen = devm_get_gpiod_from_child(dev, "scen", &np->fwnode, GPIOD_OUT_LOW);
+	scen = devm_get_gpiod_from_child(dev, "scen", &np->fwnode,
+					 GPIOD_OUT_LOW, "scen");
 	if (IS_ERR(scen)) {
 		dev_err(dev, "no SCEN GPIO\n");
 		return;
 	}
-	scl = devm_get_gpiod_from_child(dev, "scl", &np->fwnode, GPIOD_OUT_LOW);
+	scl = devm_get_gpiod_from_child(dev, "scl", &np->fwnode, GPIOD_OUT_LOW,
+					"scl");
 	if (IS_ERR(scl)) {
 		dev_err(dev, "no SCL GPIO\n");
 		return;
 	}
-	sda = devm_get_gpiod_from_child(dev, "sda", &np->fwnode, GPIOD_OUT_LOW);
+	sda = devm_get_gpiod_from_child(dev, "sda", &np->fwnode, GPIOD_OUT_LOW,
+					"sda");
 	if (IS_ERR(sda)) {
 		dev_err(dev, "no SDA GPIO\n");
 		return;
diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index 930d10049d8d..80bad7ebde04 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -136,11 +136,13 @@ struct fwnode_handle;
 
 struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode,
 					 const char *propname,
-					 enum gpiod_flags dflags);
+					 enum gpiod_flags dflags,
+					 const char *label);
 struct gpio_desc *devm_get_gpiod_from_child(struct device *dev,
 					    const char *con_id,
 					    struct fwnode_handle *child,
-					    enum gpiod_flags flags);
+					    enum gpiod_flags flags,
+					    const char *label);
 #else /* CONFIG_GPIOLIB */
 
 static inline int gpiod_count(struct device *dev, const char *con_id)
@@ -416,7 +418,8 @@ struct fwnode_handle;
 static inline
 struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode,
 					 const char *propname,
-					 enum gpiod_flags dflags)
+					 enum gpiod_flags dflags,
+					 const char *label)
 {
 	return ERR_PTR(-ENOSYS);
 }
@@ -425,7 +428,8 @@ static inline
 struct gpio_desc *devm_get_gpiod_from_child(struct device *dev,
 					    const char *con_id,
 					    struct fwnode_handle *child,
-					    enum gpiod_flags flags)
+					    enum gpiod_flags flags,
+					    const char *label)
 {
 	return ERR_PTR(-ENOSYS);
 }
-- 
cgit v1.2.3


From 53d333ac93911dab22d12a78b0a6414f9afb0117 Mon Sep 17 00:00:00 2001
From: Keerthy <j-keerthy@ti.com>
Date: Tue, 17 Jan 2017 21:49:12 +0530
Subject: gpio: davinci: Remove unwanted blank line

Remove redundant blank line.

Signed-off-by: Keerthy <j-keerthy@ti.com>
Reviewed-by: Grygorii Strashko <grygorii.strashko@ti.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/platform_data/gpio-davinci.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/gpio-davinci.h b/include/linux/platform_data/gpio-davinci.h
index 44ca5303849c..18127c4aa4ba 100644
--- a/include/linux/platform_data/gpio-davinci.h
+++ b/include/linux/platform_data/gpio-davinci.h
@@ -26,7 +26,6 @@ struct davinci_gpio_platform_data {
 	u32	gpio_unbanked;
 };
 
-
 struct davinci_gpio_controller {
 	struct gpio_chip	chip;
 	struct irq_domain	*irq_domain;
-- 
cgit v1.2.3


From b5cf3fd827d2e11355c126b44ea625650ebf4d39 Mon Sep 17 00:00:00 2001
From: Keerthy <j-keerthy@ti.com>
Date: Fri, 13 Jan 2017 09:50:12 +0530
Subject: gpio: davinci: Redesign driver to accommodate ngpios in one gpio chip

The Davinci GPIO driver is implemented to work with one monolithic
Davinci GPIO platform device which may have up to Y(144) gpios.
The Davinci GPIO driver instantiates number of GPIO chips with
max 32 gpio pins per each during initialization and one IRQ domain.
So, the current GPIO's  opjects structure is:

<platform device> Davinci GPIO controller
 |- <gpio0_chip0> ------|
 ...                    |--- irq_domain (hwirq [0..143])
 |- <gpio0_chipN> ------|

Current driver creates one chip for every 32 GPIOs in a controller.
This was a limitation earlier now there is no need for that. Hence
redesigning the driver to create one gpio chip for all the ngpio
in the controller.

|- <gpio0_chip0> ------|--- irq_domain (hwirq [0..143]).

The previous discussion on this can be found here:
https://www.spinics.net/lists/linux-omap/msg132869.html

Signed-off-by: Keerthy <j-keerthy@ti.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpio-davinci.c                | 127 +++++++++++++++++------------
 include/linux/platform_data/gpio-davinci.h |  12 ++-
 2 files changed, 83 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-davinci.c b/drivers/gpio/gpio-davinci.c
index bb47de3daafa..446df4ece915 100644
--- a/drivers/gpio/gpio-davinci.c
+++ b/drivers/gpio/gpio-davinci.c
@@ -63,11 +63,13 @@ static inline int __davinci_direction(struct gpio_chip *chip,
 			unsigned offset, bool out, int value)
 {
 	struct davinci_gpio_controller *d = gpiochip_get_data(chip);
-	struct davinci_gpio_regs __iomem *g = d->regs;
+	struct davinci_gpio_regs __iomem *g;
 	unsigned long flags;
 	u32 temp;
-	u32 mask = 1 << offset;
+	int bank = offset / 32;
+	u32 mask = __gpio_mask(offset);
 
+	g = d->regs[bank];
 	spin_lock_irqsave(&d->lock, flags);
 	temp = readl_relaxed(&g->dir);
 	if (out) {
@@ -103,9 +105,12 @@ davinci_direction_out(struct gpio_chip *chip, unsigned offset, int value)
 static int davinci_gpio_get(struct gpio_chip *chip, unsigned offset)
 {
 	struct davinci_gpio_controller *d = gpiochip_get_data(chip);
-	struct davinci_gpio_regs __iomem *g = d->regs;
+	struct davinci_gpio_regs __iomem *g;
+	int bank = offset / 32;
 
-	return !!((1 << offset) & readl_relaxed(&g->in_data));
+	g = d->regs[bank];
+
+	return !!(__gpio_mask(offset) & readl_relaxed(&g->in_data));
 }
 
 /*
@@ -115,9 +120,13 @@ static void
 davinci_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
 {
 	struct davinci_gpio_controller *d = gpiochip_get_data(chip);
-	struct davinci_gpio_regs __iomem *g = d->regs;
+	struct davinci_gpio_regs __iomem *g;
+	int bank = offset / 32;
 
-	writel_relaxed((1 << offset), value ? &g->set_data : &g->clr_data);
+	g = d->regs[bank];
+
+	writel_relaxed(__gpio_mask(offset),
+		       value ? &g->set_data : &g->clr_data);
 }
 
 static struct davinci_gpio_platform_data *
@@ -165,7 +174,7 @@ static int davinci_gpio_of_xlate(struct gpio_chip *gc,
 	if (gpiospec->args[0] > pdata->ngpio)
 		return -EINVAL;
 
-	if (gc != &chips[gpiospec->args[0] / 32].chip)
+	if (gc != &chips->chip)
 		return -EINVAL;
 
 	if (flags)
@@ -177,11 +186,11 @@ static int davinci_gpio_of_xlate(struct gpio_chip *gc,
 
 static int davinci_gpio_probe(struct platform_device *pdev)
 {
-	int i, base;
+	static int ctrl_num;
+	int gpio, bank;
 	unsigned ngpio, nbank;
 	struct davinci_gpio_controller *chips;
 	struct davinci_gpio_platform_data *pdata;
-	struct davinci_gpio_regs __iomem *regs;
 	struct device *dev = &pdev->dev;
 	struct resource *res;
 	char label[MAX_LABEL_SIZE];
@@ -220,38 +229,30 @@ static int davinci_gpio_probe(struct platform_device *pdev)
 	if (IS_ERR(gpio_base))
 		return PTR_ERR(gpio_base);
 
-	for (i = 0, base = 0; base < ngpio; i++, base += 32) {
-		snprintf(label, MAX_LABEL_SIZE, "davinci_gpio.%d", i);
-		chips[i].chip.label = devm_kstrdup(dev, label, GFP_KERNEL);
-		if (!chips[i].chip.label)
+	snprintf(label, MAX_LABEL_SIZE, "davinci_gpio.%d", ctrl_num++);
+	chips->chip.label = devm_kstrdup(dev, label, GFP_KERNEL);
+		if (!chips->chip.label)
 			return -ENOMEM;
 
-		chips[i].chip.direction_input = davinci_direction_in;
-		chips[i].chip.get = davinci_gpio_get;
-		chips[i].chip.direction_output = davinci_direction_out;
-		chips[i].chip.set = davinci_gpio_set;
+	chips->chip.direction_input = davinci_direction_in;
+	chips->chip.get = davinci_gpio_get;
+	chips->chip.direction_output = davinci_direction_out;
+	chips->chip.set = davinci_gpio_set;
 
-		chips[i].chip.base = base;
-		chips[i].chip.ngpio = ngpio - base;
-		if (chips[i].chip.ngpio > 32)
-			chips[i].chip.ngpio = 32;
+	chips->chip.ngpio = ngpio;
 
 #ifdef CONFIG_OF_GPIO
-		chips[i].chip.of_gpio_n_cells = 2;
-		chips[i].chip.of_xlate = davinci_gpio_of_xlate;
-		chips[i].chip.parent = dev;
-		chips[i].chip.of_node = dev->of_node;
+	chips->chip.of_gpio_n_cells = 2;
+	chips->chip.of_xlate = davinci_gpio_of_xlate;
+	chips->chip.parent = dev;
+	chips->chip.of_node = dev->of_node;
 #endif
-		spin_lock_init(&chips[i].lock);
-
-		regs = gpio_base + offset_array[i];
-		if (!regs)
-			return -ENXIO;
-		chips[i].regs = regs;
+	spin_lock_init(&chips->lock);
 
-		gpiochip_add_data(&chips[i].chip, &chips[i]);
-	}
+	for (gpio = 0, bank = 0; gpio < ngpio; gpio += 32, bank++)
+		chips->regs[bank] = gpio_base + offset_array[bank];
 
+	gpiochip_add_data(&chips->chip, chips);
 	platform_set_drvdata(pdev, chips);
 	davinci_gpio_irq_setup(pdev);
 	return 0;
@@ -312,16 +313,19 @@ static struct irq_chip gpio_irqchip = {
 
 static void gpio_irq_handler(struct irq_desc *desc)
 {
-	unsigned int irq = irq_desc_get_irq(desc);
 	struct davinci_gpio_regs __iomem *g;
 	u32 mask = 0xffff;
+	int bank_num;
 	struct davinci_gpio_controller *d;
+	struct davinci_gpio_irq_data *irqdata;
 
-	d = (struct davinci_gpio_controller *)irq_desc_get_handler_data(desc);
-	g = (struct davinci_gpio_regs __iomem *)d->regs;
+	irqdata = (struct davinci_gpio_irq_data *)irq_desc_get_handler_data(desc);
+	bank_num = irqdata->bank_num;
+	g = irqdata->regs;
+	d = irqdata->chip;
 
 	/* we only care about one bank */
-	if (irq & 1)
+	if ((bank_num % 2) == 1)
 		mask <<= 16;
 
 	/* temporarily mask (level sensitive) parent IRQ */
@@ -329,6 +333,7 @@ static void gpio_irq_handler(struct irq_desc *desc)
 	while (1) {
 		u32		status;
 		int		bit;
+		irq_hw_number_t hw_irq;
 
 		/* ack any irqs */
 		status = readl_relaxed(&g->intstat) & mask;
@@ -341,9 +346,13 @@ static void gpio_irq_handler(struct irq_desc *desc)
 		while (status) {
 			bit = __ffs(status);
 			status &= ~BIT(bit);
+			/* Max number of gpios per controller is 144 so
+			 * hw_irq will be in [0..143]
+			 */
+			hw_irq = (bank_num / 2) * 32 + bit;
+
 			generic_handle_irq(
-				irq_find_mapping(d->irq_domain,
-						 d->chip.base + bit));
+				irq_find_mapping(d->irq_domain, hw_irq));
 		}
 	}
 	chained_irq_exit(irq_desc_get_chip(desc), desc);
@@ -355,7 +364,7 @@ static int gpio_to_irq_banked(struct gpio_chip *chip, unsigned offset)
 	struct davinci_gpio_controller *d = gpiochip_get_data(chip);
 
 	if (d->irq_domain)
-		return irq_create_mapping(d->irq_domain, d->chip.base + offset);
+		return irq_create_mapping(d->irq_domain, offset);
 	else
 		return -ENXIO;
 }
@@ -369,7 +378,7 @@ static int gpio_to_irq_unbanked(struct gpio_chip *chip, unsigned offset)
 	 * can provide direct-mapped IRQs to AINTC (up to 32 GPIOs).
 	 */
 	if (offset < d->gpio_unbanked)
-		return d->gpio_irq + offset;
+		return d->base_irq + offset;
 	else
 		return -ENODEV;
 }
@@ -382,7 +391,7 @@ static int gpio_irq_type_unbanked(struct irq_data *data, unsigned trigger)
 
 	d = (struct davinci_gpio_controller *)irq_data_get_irq_handler_data(data);
 	g = (struct davinci_gpio_regs __iomem *)d->regs;
-	mask = __gpio_mask(data->irq - d->gpio_irq);
+	mask = __gpio_mask(data->irq - d->base_irq);
 
 	if (trigger & ~(IRQ_TYPE_EDGE_FALLING | IRQ_TYPE_EDGE_RISING))
 		return -EINVAL;
@@ -401,7 +410,7 @@ davinci_gpio_irq_map(struct irq_domain *d, unsigned int irq,
 {
 	struct davinci_gpio_controller *chips =
 				(struct davinci_gpio_controller *)d->host_data;
-	struct davinci_gpio_regs __iomem *g = chips[hw / 32].regs;
+	struct davinci_gpio_regs __iomem *g = chips->regs[hw / 32];
 
 	irq_set_chip_and_handler_name(irq, &gpio_irqchip, handle_simple_irq,
 				"davinci_gpio");
@@ -459,6 +468,7 @@ static int davinci_gpio_irq_setup(struct platform_device *pdev)
 	struct irq_domain	*irq_domain = NULL;
 	const struct of_device_id *match;
 	struct irq_chip *irq_chip;
+	struct davinci_gpio_irq_data *irqdata;
 	gpio_get_irq_chip_cb_t gpio_get_irq_chip;
 
 	/*
@@ -514,10 +524,8 @@ static int davinci_gpio_irq_setup(struct platform_device *pdev)
 	 * IRQs, while the others use banked IRQs, would need some setup
 	 * tweaks to recognize hardware which can do that.
 	 */
-	for (gpio = 0, bank = 0; gpio < ngpio; bank++, gpio += 32) {
-		chips[bank].chip.to_irq = gpio_to_irq_banked;
-		chips[bank].irq_domain = irq_domain;
-	}
+	chips->chip.to_irq = gpio_to_irq_banked;
+	chips->irq_domain = irq_domain;
 
 	/*
 	 * AINTC can handle direct/unbanked IRQs for GPIOs, with the GPIO
@@ -526,9 +534,9 @@ static int davinci_gpio_irq_setup(struct platform_device *pdev)
 	 */
 	if (pdata->gpio_unbanked) {
 		/* pass "bank 0" GPIO IRQs to AINTC */
-		chips[0].chip.to_irq = gpio_to_irq_unbanked;
-		chips[0].gpio_irq = bank_irq;
-		chips[0].gpio_unbanked = pdata->gpio_unbanked;
+		chips->chip.to_irq = gpio_to_irq_unbanked;
+		chips->base_irq = bank_irq;
+		chips->gpio_unbanked = pdata->gpio_unbanked;
 		binten = GENMASK(pdata->gpio_unbanked / 16, 0);
 
 		/* AINTC handles mask/unmask; GPIO handles triggering */
@@ -538,14 +546,14 @@ static int davinci_gpio_irq_setup(struct platform_device *pdev)
 		irq_chip->irq_set_type = gpio_irq_type_unbanked;
 
 		/* default trigger: both edges */
-		g = chips[0].regs;
+		g = chips->regs[0];
 		writel_relaxed(~0, &g->set_falling);
 		writel_relaxed(~0, &g->set_rising);
 
 		/* set the direct IRQs up to use that irqchip */
 		for (gpio = 0; gpio < pdata->gpio_unbanked; gpio++, irq++) {
 			irq_set_chip(irq, irq_chip);
-			irq_set_handler_data(irq, &chips[gpio / 32]);
+			irq_set_handler_data(irq, chips);
 			irq_set_status_flags(irq, IRQ_TYPE_EDGE_BOTH);
 		}
 
@@ -561,7 +569,7 @@ static int davinci_gpio_irq_setup(struct platform_device *pdev)
 		 * There are register sets for 32 GPIOs. 2 banks of 16
 		 * GPIOs are covered by each set of registers hence divide by 2
 		 */
-		g = chips[bank / 2].regs;
+		g = chips->regs[bank / 2];
 		writel_relaxed(~0, &g->clr_falling);
 		writel_relaxed(~0, &g->clr_rising);
 
@@ -570,8 +578,19 @@ static int davinci_gpio_irq_setup(struct platform_device *pdev)
 		 * gpio irqs. Pass the irq bank's corresponding controller to
 		 * the chained irq handler.
 		 */
+		irqdata = devm_kzalloc(&pdev->dev,
+				       sizeof(struct
+					      davinci_gpio_irq_data),
+					      GFP_KERNEL);
+		if (!irqdata)
+			return -ENOMEM;
+
+		irqdata->regs = g;
+		irqdata->bank_num = bank;
+		irqdata->chip = chips;
+
 		irq_set_chained_handler_and_data(bank_irq, gpio_irq_handler,
-						 &chips[gpio / 32]);
+						 irqdata);
 
 		binten |= BIT(bank);
 	}
diff --git a/include/linux/platform_data/gpio-davinci.h b/include/linux/platform_data/gpio-davinci.h
index 18127c4aa4ba..c62a9438976d 100644
--- a/include/linux/platform_data/gpio-davinci.h
+++ b/include/linux/platform_data/gpio-davinci.h
@@ -21,19 +21,27 @@
 
 #include <asm-generic/gpio.h>
 
+#define MAX_REGS_BANKS		5
+
 struct davinci_gpio_platform_data {
 	u32	ngpio;
 	u32	gpio_unbanked;
 };
 
+struct davinci_gpio_irq_data {
+	void __iomem			*regs;
+	struct davinci_gpio_controller	*chip;
+	int				bank_num;
+};
+
 struct davinci_gpio_controller {
 	struct gpio_chip	chip;
 	struct irq_domain	*irq_domain;
 	/* Serialize access to GPIO registers */
 	spinlock_t		lock;
-	void __iomem		*regs;
+	void __iomem		*regs[MAX_REGS_BANKS];
 	int			gpio_unbanked;
-	unsigned		gpio_irq;
+	unsigned int		base_irq;
 };
 
 /*
-- 
cgit v1.2.3


From 8e11047b8f3cc0dc6df956cf01915077a574168e Mon Sep 17 00:00:00 2001
From: Keerthy <j-keerthy@ti.com>
Date: Tue, 17 Jan 2017 21:49:14 +0530
Subject: gpio: davinci: Add support for multiple GPIO controllers

Update GPIO driver to support Multiple GPIO controllers by updating
the base of subsequent GPIO chips with total of previous chips
gpio count so that gpio_add_chip gets unique numbers.

Signed-off-by: Keerthy <j-keerthy@ti.com>
Reviewed-by: Grygorii Strashko <grygorii.strashko@ti.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpio-davinci.c                | 4 +++-
 include/linux/platform_data/gpio-davinci.h | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-davinci.c b/drivers/gpio/gpio-davinci.c
index 446df4ece915..2f0b6fdbcddb 100644
--- a/drivers/gpio/gpio-davinci.c
+++ b/drivers/gpio/gpio-davinci.c
@@ -186,7 +186,7 @@ static int davinci_gpio_of_xlate(struct gpio_chip *gc,
 
 static int davinci_gpio_probe(struct platform_device *pdev)
 {
-	static int ctrl_num;
+	static int ctrl_num, bank_base;
 	int gpio, bank;
 	unsigned ngpio, nbank;
 	struct davinci_gpio_controller *chips;
@@ -240,6 +240,7 @@ static int davinci_gpio_probe(struct platform_device *pdev)
 	chips->chip.set = davinci_gpio_set;
 
 	chips->chip.ngpio = ngpio;
+	chips->chip.base = bank_base;
 
 #ifdef CONFIG_OF_GPIO
 	chips->chip.of_gpio_n_cells = 2;
@@ -248,6 +249,7 @@ static int davinci_gpio_probe(struct platform_device *pdev)
 	chips->chip.of_node = dev->of_node;
 #endif
 	spin_lock_init(&chips->lock);
+	bank_base += ngpio;
 
 	for (gpio = 0, bank = 0; gpio < ngpio; gpio += 32, bank++)
 		chips->regs[bank] = gpio_base + offset_array[bank];
diff --git a/include/linux/platform_data/gpio-davinci.h b/include/linux/platform_data/gpio-davinci.h
index c62a9438976d..90ae19ca828f 100644
--- a/include/linux/platform_data/gpio-davinci.h
+++ b/include/linux/platform_data/gpio-davinci.h
@@ -42,6 +42,7 @@ struct davinci_gpio_controller {
 	void __iomem		*regs[MAX_REGS_BANKS];
 	int			gpio_unbanked;
 	unsigned int		base_irq;
+	unsigned int		base;
 };
 
 /*
-- 
cgit v1.2.3


From 58957d2edfa19e9b8f80385ba042495058e5e60e Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Mon, 23 Jan 2017 15:34:32 +0300
Subject: pinctrl: Widen the generic pinconf argument from 16 to 24 bits

The current pinconf packed format allows only 16-bit argument limiting
the maximum value 65535. For most types this is enough. However,
debounce time can be in range of hundreths of milliseconds in case of
mechanical switches so we cannot represent the worst case using the
current format.

In order to support larger values change the packed format so that the
lower 8 bits are used as type which leaves 24 bits for the argument.
This allows representing values up to 16777215 and debounce times up to
16 seconds.

We also convert the existing users to use 32-bit integer when extracting
argument from the packed configuration value.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/bcm/pinctrl-bcm281xx.c           |  6 +++---
 drivers/pinctrl/bcm/pinctrl-iproc-gpio.c         |  2 +-
 drivers/pinctrl/bcm/pinctrl-ns2-mux.c            |  6 +++---
 drivers/pinctrl/bcm/pinctrl-nsp-gpio.c           |  6 +++---
 drivers/pinctrl/intel/pinctrl-cherryview.c       |  4 ++--
 drivers/pinctrl/meson/pinctrl-meson.c            |  2 --
 drivers/pinctrl/pinctrl-da850-pupd.c             |  2 --
 drivers/pinctrl/pinctrl-lpc18xx.c                | 10 +++++-----
 drivers/pinctrl/pinctrl-max77620.c               |  2 +-
 drivers/pinctrl/pinctrl-palmas.c                 |  2 +-
 drivers/pinctrl/pinctrl-rockchip.c               |  2 +-
 drivers/pinctrl/pinctrl-single.c                 |  2 +-
 drivers/pinctrl/sirf/pinctrl-atlas7.c            |  3 ++-
 drivers/pinctrl/sunxi/pinctrl-sunxi.c            |  2 +-
 drivers/pinctrl/uniphier/pinctrl-uniphier-core.c |  4 ++--
 drivers/pinctrl/vt8500/pinctrl-wmt.c             |  2 +-
 drivers/rtc/rtc-omap.c                           |  2 +-
 include/linux/pinctrl/pinconf-generic.h          | 19 +++++++++++--------
 18 files changed, 39 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pinctrl/bcm/pinctrl-bcm281xx.c b/drivers/pinctrl/bcm/pinctrl-bcm281xx.c
index a5331fdfc795..810a81786f62 100644
--- a/drivers/pinctrl/bcm/pinctrl-bcm281xx.c
+++ b/drivers/pinctrl/bcm/pinctrl-bcm281xx.c
@@ -1106,7 +1106,7 @@ static int bcm281xx_std_pin_update(struct pinctrl_dev *pctldev,
 	struct bcm281xx_pinctrl_data *pdata = pinctrl_dev_get_drvdata(pctldev);
 	int i;
 	enum pin_config_param param;
-	u16 arg;
+	u32 arg;
 
 	for (i = 0; i < num_configs; i++) {
 		param = pinconf_to_config_param(configs[i]);
@@ -1222,7 +1222,7 @@ static int bcm281xx_i2c_pin_update(struct pinctrl_dev *pctldev,
 	struct bcm281xx_pinctrl_data *pdata = pinctrl_dev_get_drvdata(pctldev);
 	int i, j;
 	enum pin_config_param param;
-	u16 arg;
+	u32 arg;
 
 	for (i = 0; i < num_configs; i++) {
 		param = pinconf_to_config_param(configs[i]);
@@ -1292,7 +1292,7 @@ static int bcm281xx_hdmi_pin_update(struct pinctrl_dev *pctldev,
 	struct bcm281xx_pinctrl_data *pdata = pinctrl_dev_get_drvdata(pctldev);
 	int i;
 	enum pin_config_param param;
-	u16 arg;
+	u32 arg;
 
 	for (i = 0; i < num_configs; i++) {
 		param = pinconf_to_config_param(configs[i]);
diff --git a/drivers/pinctrl/bcm/pinctrl-iproc-gpio.c b/drivers/pinctrl/bcm/pinctrl-iproc-gpio.c
index 5d1e505c3c63..3ca925dfefd1 100644
--- a/drivers/pinctrl/bcm/pinctrl-iproc-gpio.c
+++ b/drivers/pinctrl/bcm/pinctrl-iproc-gpio.c
@@ -619,7 +619,7 @@ static int iproc_pin_config_set(struct pinctrl_dev *pctldev, unsigned pin,
 {
 	struct iproc_gpio *chip = pinctrl_dev_get_drvdata(pctldev);
 	enum pin_config_param param;
-	u16 arg;
+	u32 arg;
 	unsigned i, gpio = iproc_pin_to_gpio(pin);
 	int ret = -ENOTSUPP;
 
diff --git a/drivers/pinctrl/bcm/pinctrl-ns2-mux.c b/drivers/pinctrl/bcm/pinctrl-ns2-mux.c
index 13a4c2774157..4b5cf0e0f16e 100644
--- a/drivers/pinctrl/bcm/pinctrl-ns2-mux.c
+++ b/drivers/pinctrl/bcm/pinctrl-ns2-mux.c
@@ -703,7 +703,7 @@ static int ns2_pin_get_enable(struct pinctrl_dev *pctrldev, unsigned int pin)
 }
 
 static int ns2_pin_set_slew(struct pinctrl_dev *pctrldev, unsigned int pin,
-			    u16 slew)
+			    u32 slew)
 {
 	struct ns2_pinctrl *pinctrl = pinctrl_dev_get_drvdata(pctrldev);
 	struct ns2_pin *pin_data = pctrldev->desc->pins[pin].drv_data;
@@ -793,7 +793,7 @@ static void ns2_pin_get_pull(struct pinctrl_dev *pctrldev,
 }
 
 static int ns2_pin_set_strength(struct pinctrl_dev *pctrldev, unsigned int pin,
-				u16 strength)
+				u32 strength)
 {
 	struct ns2_pinctrl *pinctrl = pinctrl_dev_get_drvdata(pctrldev);
 	struct ns2_pin *pin_data = pctrldev->desc->pins[pin].drv_data;
@@ -904,7 +904,7 @@ static int ns2_pin_config_set(struct pinctrl_dev *pctrldev, unsigned int pin,
 	struct ns2_pin *pin_data = pctrldev->desc->pins[pin].drv_data;
 	enum pin_config_param param;
 	unsigned int i;
-	u16 arg;
+	u32 arg;
 	int ret = -ENOTSUPP;
 
 	if (pin_data->pin_conf.base == -1)
diff --git a/drivers/pinctrl/bcm/pinctrl-nsp-gpio.c b/drivers/pinctrl/bcm/pinctrl-nsp-gpio.c
index c8deb8be1da7..91ea32dc1e7f 100644
--- a/drivers/pinctrl/bcm/pinctrl-nsp-gpio.c
+++ b/drivers/pinctrl/bcm/pinctrl-nsp-gpio.c
@@ -366,7 +366,7 @@ static const struct pinctrl_ops nsp_pctrl_ops = {
 	.dt_free_map = pinctrl_utils_free_map,
 };
 
-static int nsp_gpio_set_slew(struct nsp_gpio *chip, unsigned gpio, u16 slew)
+static int nsp_gpio_set_slew(struct nsp_gpio *chip, unsigned gpio, u32 slew)
 {
 	if (slew)
 		nsp_set_bit(chip, IO_CTRL, NSP_GPIO_SLEW_RATE_EN, gpio, true);
@@ -403,7 +403,7 @@ static void nsp_gpio_get_pull(struct nsp_gpio *chip, unsigned gpio,
 }
 
 static int nsp_gpio_set_strength(struct nsp_gpio *chip, unsigned gpio,
-				 u16 strength)
+				 u32 strength)
 {
 	u32 offset, shift, i;
 	u32 val;
@@ -522,7 +522,7 @@ static int nsp_pin_config_set(struct pinctrl_dev *pctldev, unsigned pin,
 {
 	struct nsp_gpio *chip = pinctrl_dev_get_drvdata(pctldev);
 	enum pin_config_param param;
-	u16 arg;
+	u32 arg;
 	unsigned int i, gpio;
 	int ret = -ENOTSUPP;
 
diff --git a/drivers/pinctrl/intel/pinctrl-cherryview.c b/drivers/pinctrl/intel/pinctrl-cherryview.c
index 5e66860a5e67..f80134e3e0b6 100644
--- a/drivers/pinctrl/intel/pinctrl-cherryview.c
+++ b/drivers/pinctrl/intel/pinctrl-cherryview.c
@@ -1059,7 +1059,7 @@ static int chv_config_get(struct pinctrl_dev *pctldev, unsigned pin,
 }
 
 static int chv_config_set_pull(struct chv_pinctrl *pctrl, unsigned pin,
-			       enum pin_config_param param, u16 arg)
+			       enum pin_config_param param, u32 arg)
 {
 	void __iomem *reg = chv_padreg(pctrl, pin, CHV_PADCTRL0);
 	unsigned long flags;
@@ -1151,7 +1151,7 @@ static int chv_config_set(struct pinctrl_dev *pctldev, unsigned pin,
 	struct chv_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev);
 	enum pin_config_param param;
 	int i, ret;
-	u16 arg;
+	u32 arg;
 
 	if (chv_pad_locked(pctrl, pin))
 		return -EBUSY;
diff --git a/drivers/pinctrl/meson/pinctrl-meson.c b/drivers/pinctrl/meson/pinctrl-meson.c
index a579126832af..8fbb571f50de 100644
--- a/drivers/pinctrl/meson/pinctrl-meson.c
+++ b/drivers/pinctrl/meson/pinctrl-meson.c
@@ -260,7 +260,6 @@ static int meson_pinconf_set(struct pinctrl_dev *pcdev, unsigned int pin,
 	enum pin_config_param param;
 	unsigned int reg, bit;
 	int i, ret;
-	u16 arg;
 
 	ret = meson_get_bank(pc, pin, &bank);
 	if (ret)
@@ -268,7 +267,6 @@ static int meson_pinconf_set(struct pinctrl_dev *pcdev, unsigned int pin,
 
 	for (i = 0; i < num_configs; i++) {
 		param = pinconf_to_config_param(configs[i]);
-		arg = pinconf_to_config_argument(configs[i]);
 
 		switch (param) {
 		case PIN_CONFIG_BIAS_DISABLE:
diff --git a/drivers/pinctrl/pinctrl-da850-pupd.c b/drivers/pinctrl/pinctrl-da850-pupd.c
index b36a90a3f3e4..44d5f5f5b07f 100644
--- a/drivers/pinctrl/pinctrl-da850-pupd.c
+++ b/drivers/pinctrl/pinctrl-da850-pupd.c
@@ -113,7 +113,6 @@ static int da850_pupd_pin_config_group_set(struct pinctrl_dev *pctldev,
 	struct da850_pupd_data *data = pinctrl_dev_get_drvdata(pctldev);
 	u32 ena, sel;
 	enum pin_config_param param;
-	u16 arg;
 	int i;
 
 	ena = readl(data->base + DA850_PUPD_ENA);
@@ -121,7 +120,6 @@ static int da850_pupd_pin_config_group_set(struct pinctrl_dev *pctldev,
 
 	for (i = 0; i < num_configs; i++) {
 		param = pinconf_to_config_param(configs[i]);
-		arg = pinconf_to_config_argument(configs[i]);
 
 		switch (param) {
 		case PIN_CONFIG_BIAS_DISABLE:
diff --git a/drivers/pinctrl/pinctrl-lpc18xx.c b/drivers/pinctrl/pinctrl-lpc18xx.c
index e053f1fa5512..d090f37ca4a1 100644
--- a/drivers/pinctrl/pinctrl-lpc18xx.c
+++ b/drivers/pinctrl/pinctrl-lpc18xx.c
@@ -904,7 +904,7 @@ static int lpc18xx_pconf_get(struct pinctrl_dev *pctldev, unsigned pin,
 
 static int lpc18xx_pconf_set_usb1(struct pinctrl_dev *pctldev,
 				  enum pin_config_param param,
-				  u16 param_val, u32 *reg)
+				  u32 param_val, u32 *reg)
 {
 	switch (param) {
 	case PIN_CONFIG_LOW_POWER_MODE:
@@ -932,7 +932,7 @@ static int lpc18xx_pconf_set_usb1(struct pinctrl_dev *pctldev,
 
 static int lpc18xx_pconf_set_i2c0(struct pinctrl_dev *pctldev,
 				  enum pin_config_param param,
-				  u16 param_val, u32 *reg,
+				  u32 param_val, u32 *reg,
 				  unsigned pin)
 {
 	u8 shift;
@@ -982,7 +982,7 @@ static int lpc18xx_pconf_set_i2c0(struct pinctrl_dev *pctldev,
 }
 
 static int lpc18xx_pconf_set_gpio_pin_int(struct pinctrl_dev *pctldev,
-					  u16 param_val, unsigned pin)
+					  u32 param_val, unsigned pin)
 {
 	struct lpc18xx_scu_data *scu = pinctrl_dev_get_drvdata(pctldev);
 	u32 val, reg_val, reg_offset = LPC18XX_SCU_PINTSEL0;
@@ -1008,7 +1008,7 @@ static int lpc18xx_pconf_set_gpio_pin_int(struct pinctrl_dev *pctldev,
 }
 
 static int lpc18xx_pconf_set_pin(struct pinctrl_dev *pctldev, unsigned param,
-				 u16 param_val, u32 *reg, unsigned pin,
+				 u32 param_val, u32 *reg, unsigned pin,
 				 struct lpc18xx_pin_caps *pin_cap)
 {
 	switch (param) {
@@ -1088,7 +1088,7 @@ static int lpc18xx_pconf_set(struct pinctrl_dev *pctldev, unsigned pin,
 	struct lpc18xx_scu_data *scu = pinctrl_dev_get_drvdata(pctldev);
 	struct lpc18xx_pin_caps *pin_cap;
 	enum pin_config_param param;
-	u16 param_val;
+	u32 param_val;
 	u32 reg;
 	int ret;
 	int i;
diff --git a/drivers/pinctrl/pinctrl-max77620.c b/drivers/pinctrl/pinctrl-max77620.c
index d9ff53e8f715..b8d2180a2bea 100644
--- a/drivers/pinctrl/pinctrl-max77620.c
+++ b/drivers/pinctrl/pinctrl-max77620.c
@@ -402,7 +402,7 @@ static int max77620_pinconf_set(struct pinctrl_dev *pctldev,
 	struct device *dev = mpci->dev;
 	struct max77620_fps_config *fps_config;
 	int param;
-	u16 param_val;
+	u32 param_val;
 	unsigned int val;
 	unsigned int pu_val;
 	unsigned int pd_val;
diff --git a/drivers/pinctrl/pinctrl-palmas.c b/drivers/pinctrl/pinctrl-palmas.c
index a30146da7ffd..4d6a5015b927 100644
--- a/drivers/pinctrl/pinctrl-palmas.c
+++ b/drivers/pinctrl/pinctrl-palmas.c
@@ -860,7 +860,7 @@ static int palmas_pinconf_set(struct pinctrl_dev *pctldev,
 {
 	struct palmas_pctrl_chip_info *pci = pinctrl_dev_get_drvdata(pctldev);
 	enum pin_config_param param;
-	u16 param_val;
+	u32 param_val;
 	const struct palmas_pingroup *g;
 	const struct palmas_pin_info *opt;
 	int ret;
diff --git a/drivers/pinctrl/pinctrl-rockchip.c b/drivers/pinctrl/pinctrl-rockchip.c
index 08765f58253c..7813599e43fa 100644
--- a/drivers/pinctrl/pinctrl-rockchip.c
+++ b/drivers/pinctrl/pinctrl-rockchip.c
@@ -1441,7 +1441,7 @@ static int rockchip_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin,
 	struct rockchip_pinctrl *info = pinctrl_dev_get_drvdata(pctldev);
 	struct rockchip_pin_bank *bank = pin_to_bank(info, pin);
 	enum pin_config_param param;
-	u16 arg;
+	u32 arg;
 	int i;
 	int rc;
 
diff --git a/drivers/pinctrl/pinctrl-single.c b/drivers/pinctrl/pinctrl-single.c
index a5a0392ab817..f71f2e813ea6 100644
--- a/drivers/pinctrl/pinctrl-single.c
+++ b/drivers/pinctrl/pinctrl-single.c
@@ -622,7 +622,7 @@ static int pcs_pinconf_set(struct pinctrl_dev *pctldev,
 	struct pcs_device *pcs = pinctrl_dev_get_drvdata(pctldev);
 	struct pcs_function *func;
 	unsigned offset = 0, shift = 0, i, data, ret;
-	u16 arg;
+	u32 arg;
 	int j;
 
 	ret = pcs_get_function(pctldev, pin, &func);
diff --git a/drivers/pinctrl/sirf/pinctrl-atlas7.c b/drivers/pinctrl/sirf/pinctrl-atlas7.c
index 7f3041697813..82b8a429743d 100644
--- a/drivers/pinctrl/sirf/pinctrl-atlas7.c
+++ b/drivers/pinctrl/sirf/pinctrl-atlas7.c
@@ -5322,7 +5322,8 @@ static int atlas7_pin_config_set(struct pinctrl_dev *pctldev,
 				unsigned pin, unsigned long *configs,
 				unsigned num_configs)
 {
-	u16 param, arg;
+	u16 param;
+	u32 arg;
 	int idx, err;
 
 	for (idx = 0; idx < num_configs; idx++) {
diff --git a/drivers/pinctrl/sunxi/pinctrl-sunxi.c b/drivers/pinctrl/sunxi/pinctrl-sunxi.c
index 0eb51e33cb1b..28bfa5f413e4 100644
--- a/drivers/pinctrl/sunxi/pinctrl-sunxi.c
+++ b/drivers/pinctrl/sunxi/pinctrl-sunxi.c
@@ -540,7 +540,7 @@ static int sunxi_pconf_group_set(struct pinctrl_dev *pctldev,
 		enum pin_config_param param;
 		unsigned long flags;
 		u32 offset, shift, mask, reg;
-		u16 arg, val;
+		u32 arg, val;
 		int ret;
 
 		param = pinconf_to_config_param(configs[i]);
diff --git a/drivers/pinctrl/uniphier/pinctrl-uniphier-core.c b/drivers/pinctrl/uniphier/pinctrl-uniphier-core.c
index 9b2ee717bccc..546f23c9040c 100644
--- a/drivers/pinctrl/uniphier/pinctrl-uniphier-core.c
+++ b/drivers/pinctrl/uniphier/pinctrl-uniphier-core.c
@@ -297,7 +297,7 @@ static int uniphier_conf_pin_config_get(struct pinctrl_dev *pctldev,
 
 static int uniphier_conf_pin_bias_set(struct pinctrl_dev *pctldev,
 				      const struct pin_desc *desc,
-				      enum pin_config_param param, u16 arg)
+				      enum pin_config_param param, u32 arg)
 {
 	struct uniphier_pinctrl_priv *priv = pinctrl_dev_get_drvdata(pctldev);
 	enum uniphier_pin_pull_dir pull_dir =
@@ -468,7 +468,7 @@ static int uniphier_conf_pin_config_set(struct pinctrl_dev *pctldev,
 	for (i = 0; i < num_configs; i++) {
 		enum pin_config_param param =
 					pinconf_to_config_param(configs[i]);
-		u16 arg = pinconf_to_config_argument(configs[i]);
+		u32 arg = pinconf_to_config_argument(configs[i]);
 
 		switch (param) {
 		case PIN_CONFIG_BIAS_DISABLE:
diff --git a/drivers/pinctrl/vt8500/pinctrl-wmt.c b/drivers/pinctrl/vt8500/pinctrl-wmt.c
index 270ca2a47a8c..c207e60b734f 100644
--- a/drivers/pinctrl/vt8500/pinctrl-wmt.c
+++ b/drivers/pinctrl/vt8500/pinctrl-wmt.c
@@ -428,7 +428,7 @@ static int wmt_pinconf_set(struct pinctrl_dev *pctldev, unsigned pin,
 {
 	struct wmt_pinctrl_data *data = pinctrl_dev_get_drvdata(pctldev);
 	enum pin_config_param param;
-	u16 arg;
+	u32 arg;
 	u32 bank = WMT_BANK_FROM_PIN(pin);
 	u32 bit = WMT_BIT_FROM_PIN(pin);
 	u32 reg_pull_en = data->banks[bank].reg_pull_en;
diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c
index 51e52446eacb..73594f38c453 100644
--- a/drivers/rtc/rtc-omap.c
+++ b/drivers/rtc/rtc-omap.c
@@ -610,7 +610,7 @@ static int rtc_pinconf_set(struct pinctrl_dev *pctldev,
 	struct omap_rtc *rtc = pinctrl_dev_get_drvdata(pctldev);
 	u32 val;
 	unsigned int param;
-	u16 param_val;
+	u32 param_val;
 	int i;
 
 	rtc->type->unlock(rtc);
diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h
index 12343caa114e..9a09107c890e 100644
--- a/include/linux/pinctrl/pinconf-generic.h
+++ b/include/linux/pinctrl/pinconf-generic.h
@@ -92,6 +92,8 @@
  * @PIN_CONFIG_END: this is the last enumerator for pin configurations, if
  *	you need to pass in custom configurations to the pin controller, use
  *	PIN_CONFIG_END+1 as the base offset.
+ * @PIN_CONFIG_MAX: this is the maximum configuration value that can be
+ *	presented using the packed format.
  */
 enum pin_config_param {
 	PIN_CONFIG_BIAS_BUS_HOLD,
@@ -112,7 +114,8 @@ enum pin_config_param {
 	PIN_CONFIG_OUTPUT,
 	PIN_CONFIG_POWER_SOURCE,
 	PIN_CONFIG_SLEW_RATE,
-	PIN_CONFIG_END = 0x7FFF,
+	PIN_CONFIG_END = 0x7F,
+	PIN_CONFIG_MAX = 0xFF,
 };
 
 #ifdef CONFIG_DEBUG_FS
@@ -130,27 +133,27 @@ struct pin_config_item {
 /*
  * Helpful configuration macro to be used in tables etc.
  */
-#define PIN_CONF_PACKED(p, a) ((a << 16) | ((unsigned long) p & 0xffffUL))
+#define PIN_CONF_PACKED(p, a) ((a << 8) | ((unsigned long) p & 0xffUL))
 
 /*
  * The following inlines stuffs a configuration parameter and data value
  * into and out of an unsigned long argument, as used by the generic pin config
- * system. We put the parameter in the lower 16 bits and the argument in the
- * upper 16 bits.
+ * system. We put the parameter in the lower 8 bits and the argument in the
+ * upper 24 bits.
  */
 
 static inline enum pin_config_param pinconf_to_config_param(unsigned long config)
 {
-	return (enum pin_config_param) (config & 0xffffUL);
+	return (enum pin_config_param) (config & 0xffUL);
 }
 
-static inline u16 pinconf_to_config_argument(unsigned long config)
+static inline u32 pinconf_to_config_argument(unsigned long config)
 {
-	return (enum pin_config_param) ((config >> 16) & 0xffffUL);
+	return (u32) ((config >> 8) & 0xffffffUL);
 }
 
 static inline unsigned long pinconf_to_config_packed(enum pin_config_param param,
-						     u16 argument)
+						     u32 argument)
 {
 	return PIN_CONF_PACKED(param, argument);
 }
-- 
cgit v1.2.3


From 15381bc7c7f52d56f87c56dd7c948ad78704b852 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Mon, 23 Jan 2017 15:34:33 +0300
Subject: pinctrl: Allow configuration of pins from gpiolib based drivers

When a GPIO driver is backed by a pinctrl driver the GPIO driver
sometimes needs to call the pinctrl driver to configure certain things,
like whether the pin is used as input or output. In addition to this
there are other configurations applicable to GPIOs such as setting
debounce time of the GPIO.

To support this we introduce a new function pinctrl_gpio_set_config()
that can be used by gpiolib based driver to pass configuration requests
to the backing pinctrl driver.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/core.c           | 29 +++++++++++++++++++++++++++++
 drivers/pinctrl/pinconf.c        | 12 ++++++++++++
 drivers/pinctrl/pinconf.h        |  9 +++++++++
 include/linux/pinctrl/consumer.h |  6 ++++++
 4 files changed, 56 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index fb38e208f32d..597d4641e348 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -688,6 +688,35 @@ int pinctrl_gpio_direction_output(unsigned gpio)
 }
 EXPORT_SYMBOL_GPL(pinctrl_gpio_direction_output);
 
+/**
+ * pinctrl_gpio_set_config() - Apply config to given GPIO pin
+ * @gpio: the GPIO pin number from the GPIO subsystem number space
+ * @config: the configuration to apply to the GPIO
+ *
+ * This function should *ONLY* be used from gpiolib-based GPIO drivers, if
+ * they need to call the underlying pin controller to change GPIO config
+ * (for example set debounce time).
+ */
+int pinctrl_gpio_set_config(unsigned gpio, unsigned long config)
+{
+	unsigned long configs[] = { config };
+	struct pinctrl_gpio_range *range;
+	struct pinctrl_dev *pctldev;
+	int ret, pin;
+
+	ret = pinctrl_get_device_gpio_range(gpio, &pctldev, &range);
+	if (ret)
+		return ret;
+
+	mutex_lock(&pctldev->mutex);
+	pin = gpio_to_pin(range, gpio);
+	ret = pinconf_set_config(pctldev, pin, configs, ARRAY_SIZE(configs));
+	mutex_unlock(&pctldev->mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pinctrl_gpio_set_config);
+
 static struct pinctrl_state *find_state(struct pinctrl *p,
 					const char *name)
 {
diff --git a/drivers/pinctrl/pinconf.c b/drivers/pinctrl/pinconf.c
index 799048f3c8d4..c1c1ccc58267 100644
--- a/drivers/pinctrl/pinconf.c
+++ b/drivers/pinctrl/pinconf.c
@@ -200,6 +200,18 @@ int pinconf_apply_setting(struct pinctrl_setting const *setting)
 	return 0;
 }
 
+int pinconf_set_config(struct pinctrl_dev *pctldev, unsigned pin,
+		       unsigned long *configs, size_t nconfigs)
+{
+	const struct pinconf_ops *ops;
+
+	ops = pctldev->desc->confops;
+	if (!ops)
+		return -ENOTSUPP;
+
+	return ops->pin_config_set(pctldev, pin, configs, nconfigs);
+}
+
 #ifdef CONFIG_DEBUG_FS
 
 static void pinconf_show_config(struct seq_file *s, struct pinctrl_dev *pctldev,
diff --git a/drivers/pinctrl/pinconf.h b/drivers/pinctrl/pinconf.h
index 55c75780b3b2..bf8aff9abf32 100644
--- a/drivers/pinctrl/pinconf.h
+++ b/drivers/pinctrl/pinconf.h
@@ -20,6 +20,9 @@ int pinconf_map_to_setting(struct pinctrl_map const *map,
 void pinconf_free_setting(struct pinctrl_setting const *setting);
 int pinconf_apply_setting(struct pinctrl_setting const *setting);
 
+int pinconf_set_config(struct pinctrl_dev *pctldev, unsigned pin,
+		       unsigned long *configs, size_t nconfigs);
+
 /*
  * You will only be interested in these if you're using PINCONF
  * so don't supply any stubs for these.
@@ -56,6 +59,12 @@ static inline int pinconf_apply_setting(struct pinctrl_setting const *setting)
 	return 0;
 }
 
+static inline int pinconf_set_config(struct pinctrl_dev *pctldev, unsigned pin,
+				     unsigned long *configs, size_t nconfigs)
+{
+	return -ENOTSUPP;
+}
+
 #endif
 
 #if defined(CONFIG_PINCONF) && defined(CONFIG_DEBUG_FS)
diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index d7e5d608faa7..a0f2aba72fa9 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -29,6 +29,7 @@ extern int pinctrl_request_gpio(unsigned gpio);
 extern void pinctrl_free_gpio(unsigned gpio);
 extern int pinctrl_gpio_direction_input(unsigned gpio);
 extern int pinctrl_gpio_direction_output(unsigned gpio);
+extern int pinctrl_gpio_set_config(unsigned gpio, unsigned long config);
 
 extern struct pinctrl * __must_check pinctrl_get(struct device *dev);
 extern void pinctrl_put(struct pinctrl *p);
@@ -80,6 +81,11 @@ static inline int pinctrl_gpio_direction_output(unsigned gpio)
 	return 0;
 }
 
+static inline int pinctrl_gpio_set_config(unsigned gpio, unsigned long config)
+{
+	return 0;
+}
+
 static inline struct pinctrl * __must_check pinctrl_get(struct device *dev)
 {
 	return NULL;
-- 
cgit v1.2.3


From 2956b5d94a76b596fa5057c2b3ca915cb27d7652 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Mon, 23 Jan 2017 15:34:34 +0300
Subject: pinctrl / gpio: Introduce .set_config() callback for GPIO chips

Currently we already have two pin configuration related callbacks
available for GPIO chips .set_single_ended() and .set_debounce(). In
future we expect to have even more, which does not scale well if we need
to add yet another callback to the GPIO chip structure for each possible
configuration parameter.

Better solution is to reuse what we already have available in the
generic pinconf.

To support this, we introduce a new .set_config() callback for GPIO
chips. The callback takes a single packed pin configuration value as
parameter. This can then be extended easily beyond what is currently
supported by just adding new types to the generic pinconf enum.

If the GPIO driver is backed up by a pinctrl driver the GPIO driver can
just assign gpiochip_generic_config() (introduced in this patch) to
.set_config and that will take care configuration requests are directed
to the pinctrl driver.

We then convert the existing drivers over .set_config() and finally
remove the .set_single_ended() and .set_debounce() callbacks.

Suggested-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 Documentation/gpio/driver.txt                 |  9 +++--
 drivers/gpio/gpio-bcm-kona.c                  | 14 ++++++-
 drivers/gpio/gpio-dln2.c                      | 12 ++++--
 drivers/gpio/gpio-dwapb.c                     | 14 ++++++-
 drivers/gpio/gpio-ep93xx.c                    | 11 ++++--
 drivers/gpio/gpio-f7188x.c                    | 19 +++++----
 drivers/gpio/gpio-lp873x.c                    | 14 +++----
 drivers/gpio/gpio-max77620.c                  | 20 +++++-----
 drivers/gpio/gpio-menz127.c                   | 34 +++++++++++-----
 drivers/gpio/gpio-merrifield.c                | 14 ++++++-
 drivers/gpio/gpio-omap.c                      | 14 ++++++-
 drivers/gpio/gpio-tc3589x.c                   | 15 ++++---
 drivers/gpio/gpio-tegra.c                     | 14 ++++++-
 drivers/gpio/gpio-tps65218.c                  | 14 +++----
 drivers/gpio/gpio-vx855.c                     | 13 ++++---
 drivers/gpio/gpio-wcove.c                     | 13 +++----
 drivers/gpio/gpio-wm831x.c                    | 21 +++++-----
 drivers/gpio/gpio-wm8994.c                    | 13 +++----
 drivers/gpio/gpiolib.c                        | 56 +++++++++++++++++----------
 drivers/pinctrl/mediatek/pinctrl-mtk-common.c | 14 ++++++-
 drivers/pinctrl/pinctrl-amd.c                 | 14 ++++++-
 drivers/pinctrl/pinctrl-sx150x.c              | 55 +++++++-------------------
 drivers/staging/greybus/gpio.c                | 15 ++++---
 drivers/usb/serial/cp210x.c                   | 13 ++++---
 include/linux/gpio/driver.h                   | 37 ++++--------------
 include/linux/pinctrl/pinconf-generic.h       | 33 +++++++---------
 26 files changed, 297 insertions(+), 218 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/gpio/driver.txt b/Documentation/gpio/driver.txt
index 747c721776ed..ad8f0c0cd13f 100644
--- a/Documentation/gpio/driver.txt
+++ b/Documentation/gpio/driver.txt
@@ -146,10 +146,11 @@ a pull-up resistor is needed on the outgoing rail to complete the circuit, and
 in the second case, a pull-down resistor is needed on the rail.
 
 Hardware that supports open drain or open source or both, can implement a
-special callback in the gpio_chip: .set_single_ended() that takes an enum flag
-telling whether to configure the line as open drain, open source or push-pull.
-This will happen in response to the GPIO_OPEN_DRAIN or GPIO_OPEN_SOURCE flag
-set in the machine file, or coming from other hardware descriptions.
+special callback in the gpio_chip: .set_config() that takes a generic
+pinconf packed value telling whether to configure the line as open drain,
+open source or push-pull. This will happen in response to the
+GPIO_OPEN_DRAIN or GPIO_OPEN_SOURCE flag set in the machine file, or coming
+from other hardware descriptions.
 
 If this state can not be configured in hardware, i.e. if the GPIO hardware does
 not support open drain/open source in hardware, the GPIO library will instead
diff --git a/drivers/gpio/gpio-bcm-kona.c b/drivers/gpio/gpio-bcm-kona.c
index 3d1cf018e8e7..41d0ac142580 100644
--- a/drivers/gpio/gpio-bcm-kona.c
+++ b/drivers/gpio/gpio-bcm-kona.c
@@ -308,6 +308,18 @@ static int bcm_kona_gpio_set_debounce(struct gpio_chip *chip, unsigned gpio,
 	return 0;
 }
 
+static int bcm_kona_gpio_set_config(struct gpio_chip *chip, unsigned gpio,
+				    unsigned long config)
+{
+	u32 debounce;
+
+	if (pinconf_to_config_param(config) != PIN_CONFIG_INPUT_DEBOUNCE)
+		return -ENOTSUPP;
+
+	debounce = pinconf_to_config_argument(config);
+	return bcm_kona_gpio_set_debounce(chip, gpio, debounce);
+}
+
 static const struct gpio_chip template_chip = {
 	.label = "bcm-kona-gpio",
 	.owner = THIS_MODULE,
@@ -318,7 +330,7 @@ static const struct gpio_chip template_chip = {
 	.get = bcm_kona_gpio_get,
 	.direction_output = bcm_kona_gpio_direction_output,
 	.set = bcm_kona_gpio_set,
-	.set_debounce = bcm_kona_gpio_set_debounce,
+	.set_config = bcm_kona_gpio_set_config,
 	.to_irq = bcm_kona_gpio_to_irq,
 	.base = 0,
 };
diff --git a/drivers/gpio/gpio-dln2.c b/drivers/gpio/gpio-dln2.c
index 5d38b08d1ee2..aecb847166f5 100644
--- a/drivers/gpio/gpio-dln2.c
+++ b/drivers/gpio/gpio-dln2.c
@@ -272,12 +272,16 @@ static int dln2_gpio_direction_output(struct gpio_chip *chip, unsigned offset,
 	return dln2_gpio_set_direction(chip, offset, DLN2_GPIO_DIRECTION_OUT);
 }
 
-static int dln2_gpio_set_debounce(struct gpio_chip *chip, unsigned offset,
-				  unsigned debounce)
+static int dln2_gpio_set_config(struct gpio_chip *chip, unsigned offset,
+				unsigned long config)
 {
 	struct dln2_gpio *dln2 = gpiochip_get_data(chip);
-	__le32 duration = cpu_to_le32(debounce);
+	__le32 duration;
 
+	if (pinconf_to_config_param(config) != PIN_CONFIG_INPUT_DEBOUNCE)
+		return -ENOTSUPP;
+
+	duration = cpu_to_le32(pinconf_to_config_argument(config));
 	return dln2_transfer_tx(dln2->pdev, DLN2_GPIO_SET_DEBOUNCE,
 				&duration, sizeof(duration));
 }
@@ -474,7 +478,7 @@ static int dln2_gpio_probe(struct platform_device *pdev)
 	dln2->gpio.get_direction = dln2_gpio_get_direction;
 	dln2->gpio.direction_input = dln2_gpio_direction_input;
 	dln2->gpio.direction_output = dln2_gpio_direction_output;
-	dln2->gpio.set_debounce = dln2_gpio_set_debounce;
+	dln2->gpio.set_config = dln2_gpio_set_config;
 
 	platform_set_drvdata(pdev, dln2);
 
diff --git a/drivers/gpio/gpio-dwapb.c b/drivers/gpio/gpio-dwapb.c
index 6193f62c0df4..9c15ee4ef4e9 100644
--- a/drivers/gpio/gpio-dwapb.c
+++ b/drivers/gpio/gpio-dwapb.c
@@ -279,6 +279,18 @@ static int dwapb_gpio_set_debounce(struct gpio_chip *gc,
 	return 0;
 }
 
+static int dwapb_gpio_set_config(struct gpio_chip *gc, unsigned offset,
+				 unsigned long config)
+{
+	u32 debounce;
+
+	if (pinconf_to_config_param(config) != PIN_CONFIG_INPUT_DEBOUNCE)
+		return -ENOTSUPP;
+
+	debounce = pinconf_to_config_argument(config);
+	return dwapb_gpio_set_debounce(gc, offset, debounce);
+}
+
 static irqreturn_t dwapb_irq_handler_mfd(int irq, void *dev_id)
 {
 	u32 worked;
@@ -426,7 +438,7 @@ static int dwapb_gpio_add_port(struct dwapb_gpio *gpio,
 
 	/* Only port A support debounce */
 	if (pp->idx == 0)
-		port->gc.set_debounce = dwapb_gpio_set_debounce;
+		port->gc.set_config = dwapb_gpio_set_config;
 
 	if (pp->irq)
 		dwapb_configure_irqs(gpio, port, pp);
diff --git a/drivers/gpio/gpio-ep93xx.c b/drivers/gpio/gpio-ep93xx.c
index d054219e18b9..45d384039e9b 100644
--- a/drivers/gpio/gpio-ep93xx.c
+++ b/drivers/gpio/gpio-ep93xx.c
@@ -291,15 +291,20 @@ static struct ep93xx_gpio_bank ep93xx_gpio_banks[] = {
 	EP93XX_GPIO_BANK("H", 0x40, 0x44, 56, false),
 };
 
-static int ep93xx_gpio_set_debounce(struct gpio_chip *chip,
-				    unsigned offset, unsigned debounce)
+static int ep93xx_gpio_set_config(struct gpio_chip *chip, unsigned offset,
+				  unsigned long config)
 {
 	int gpio = chip->base + offset;
 	int irq = gpio_to_irq(gpio);
+	u32 debounce;
+
+	if (pinconf_to_config_param(config) != PIN_CONFIG_INPUT_DEBOUNCE)
+		return -ENOTSUPP;
 
 	if (irq < 0)
 		return -EINVAL;
 
+	debounce = pinconf_to_config_argument(config);
 	ep93xx_gpio_int_debounce(irq, debounce ? true : false);
 
 	return 0;
@@ -335,7 +340,7 @@ static int ep93xx_gpio_add_bank(struct gpio_chip *gc, struct device *dev,
 	gc->base = bank->base;
 
 	if (bank->has_debounce) {
-		gc->set_debounce = ep93xx_gpio_set_debounce;
+		gc->set_config = ep93xx_gpio_set_config;
 		gc->to_irq = ep93xx_gpio_to_irq;
 	}
 
diff --git a/drivers/gpio/gpio-f7188x.c b/drivers/gpio/gpio-f7188x.c
index e8accde62aa7..56bd76c33767 100644
--- a/drivers/gpio/gpio-f7188x.c
+++ b/drivers/gpio/gpio-f7188x.c
@@ -131,9 +131,8 @@ static int f7188x_gpio_get(struct gpio_chip *chip, unsigned offset);
 static int f7188x_gpio_direction_out(struct gpio_chip *chip,
 				     unsigned offset, int value);
 static void f7188x_gpio_set(struct gpio_chip *chip, unsigned offset, int value);
-static int f7188x_gpio_set_single_ended(struct gpio_chip *gc,
-					unsigned offset,
-					enum single_ended_mode mode);
+static int f7188x_gpio_set_config(struct gpio_chip *chip, unsigned offset,
+				  unsigned long config);
 
 #define F7188X_GPIO_BANK(_base, _ngpio, _regbase)			\
 	{								\
@@ -145,7 +144,7 @@ static int f7188x_gpio_set_single_ended(struct gpio_chip *gc,
 			.get              = f7188x_gpio_get,		\
 			.direction_output = f7188x_gpio_direction_out,	\
 			.set              = f7188x_gpio_set,		\
-			.set_single_ended = f7188x_gpio_set_single_ended, \
+			.set_config	  = f7188x_gpio_set_config,	\
 			.base             = _base,			\
 			.ngpio            = _ngpio,			\
 			.can_sleep        = true,			\
@@ -326,17 +325,17 @@ static void f7188x_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
 	superio_exit(sio->addr);
 }
 
-static int f7188x_gpio_set_single_ended(struct gpio_chip *chip,
-					unsigned offset,
-					enum single_ended_mode mode)
+static int f7188x_gpio_set_config(struct gpio_chip *chip, unsigned offset,
+				  unsigned long config)
 {
 	int err;
+	enum pin_config_param param = pinconf_to_config_param(config);
 	struct f7188x_gpio_bank *bank = gpiochip_get_data(chip);
 	struct f7188x_sio *sio = bank->data->sio;
 	u8 data;
 
-	if (mode != LINE_MODE_OPEN_DRAIN &&
-	    mode != LINE_MODE_PUSH_PULL)
+	if (param != PIN_CONFIG_DRIVE_OPEN_DRAIN &&
+	    param != PIN_CONFIG_DRIVE_PUSH_PULL)
 		return -ENOTSUPP;
 
 	err = superio_enter(sio->addr);
@@ -345,7 +344,7 @@ static int f7188x_gpio_set_single_ended(struct gpio_chip *chip,
 	superio_select(sio->addr, SIO_LD_GPIO);
 
 	data = superio_inb(sio->addr, gpio_out_mode(bank->regbase));
-	if (mode == LINE_MODE_OPEN_DRAIN)
+	if (param == PIN_CONFIG_DRIVE_OPEN_DRAIN)
 		data &= ~BIT(offset);
 	else
 		data |= BIT(offset);
diff --git a/drivers/gpio/gpio-lp873x.c b/drivers/gpio/gpio-lp873x.c
index 218c706359aa..df0ad2cef0d2 100644
--- a/drivers/gpio/gpio-lp873x.c
+++ b/drivers/gpio/gpio-lp873x.c
@@ -100,21 +100,21 @@ static int lp873x_gpio_request(struct gpio_chip *gc, unsigned int offset)
 	return 0;
 }
 
-static int lp873x_gpio_set_single_ended(struct gpio_chip *gc,
-					unsigned int offset,
-					enum single_ended_mode mode)
+static int lp873x_gpio_set_config(struct gpio_chip *gc, unsigned offset,
+				  unsigned long config)
 {
 	struct lp873x_gpio *gpio = gpiochip_get_data(gc);
 
-	switch (mode) {
-	case LINE_MODE_OPEN_DRAIN:
+	switch (pinconf_to_config_param(config)) {
+	case PIN_CONFIG_DRIVE_OPEN_DRAIN:
 		return regmap_update_bits(gpio->lp873->regmap,
 					  LP873X_REG_GPO_CTRL,
 					  BIT(offset * BITS_PER_GPO +
 					  LP873X_GPO_CTRL_OD),
 					  BIT(offset * BITS_PER_GPO +
 					  LP873X_GPO_CTRL_OD));
-	case LINE_MODE_PUSH_PULL:
+
+	case PIN_CONFIG_DRIVE_PUSH_PULL:
 		return regmap_update_bits(gpio->lp873->regmap,
 					  LP873X_REG_GPO_CTRL,
 					  BIT(offset * BITS_PER_GPO +
@@ -133,7 +133,7 @@ static const struct gpio_chip template_chip = {
 	.direction_output	= lp873x_gpio_direction_output,
 	.get			= lp873x_gpio_get,
 	.set			= lp873x_gpio_set,
-	.set_single_ended	= lp873x_gpio_set_single_ended,
+	.set_config		= lp873x_gpio_set_config,
 	.base			= -1,
 	.ngpio			= 2,
 	.can_sleep		= true,
diff --git a/drivers/gpio/gpio-max77620.c b/drivers/gpio/gpio-max77620.c
index ec8de4190db9..743459d9477d 100644
--- a/drivers/gpio/gpio-max77620.c
+++ b/drivers/gpio/gpio-max77620.c
@@ -152,11 +152,10 @@ static int max77620_gpio_dir_output(struct gpio_chip *gc, unsigned int offset,
 	return ret;
 }
 
-static int max77620_gpio_set_debounce(struct gpio_chip *gc,
+static int max77620_gpio_set_debounce(struct max77620_gpio *mgpio,
 				      unsigned int offset,
 				      unsigned int debounce)
 {
-	struct max77620_gpio *mgpio = gpiochip_get_data(gc);
 	u8 val;
 	int ret;
 
@@ -202,21 +201,23 @@ static void max77620_gpio_set(struct gpio_chip *gc, unsigned int offset,
 		dev_err(mgpio->dev, "CNFG_GPIO_OUT update failed: %d\n", ret);
 }
 
-static int max77620_gpio_set_single_ended(struct gpio_chip *gc,
-					  unsigned int offset,
-					  enum single_ended_mode mode)
+static int max77620_gpio_set_config(struct gpio_chip *gc, unsigned int offset,
+				    unsigned long config)
 {
 	struct max77620_gpio *mgpio = gpiochip_get_data(gc);
 
-	switch (mode) {
-	case LINE_MODE_OPEN_DRAIN:
+	switch (pinconf_to_config_param(config)) {
+	case PIN_CONFIG_DRIVE_OPEN_DRAIN:
 		return regmap_update_bits(mgpio->rmap, GPIO_REG_ADDR(offset),
 					  MAX77620_CNFG_GPIO_DRV_MASK,
 					  MAX77620_CNFG_GPIO_DRV_OPENDRAIN);
-	case LINE_MODE_PUSH_PULL:
+	case PIN_CONFIG_DRIVE_PUSH_PULL:
 		return regmap_update_bits(mgpio->rmap, GPIO_REG_ADDR(offset),
 					  MAX77620_CNFG_GPIO_DRV_MASK,
 					  MAX77620_CNFG_GPIO_DRV_PUSHPULL);
+	case PIN_CONFIG_INPUT_DEBOUNCE:
+		return max77620_gpio_set_debounce(mgpio, offset,
+			pinconf_to_config_argument(config));
 	default:
 		break;
 	}
@@ -257,9 +258,8 @@ static int max77620_gpio_probe(struct platform_device *pdev)
 	mgpio->gpio_chip.direction_input = max77620_gpio_dir_input;
 	mgpio->gpio_chip.get = max77620_gpio_get;
 	mgpio->gpio_chip.direction_output = max77620_gpio_dir_output;
-	mgpio->gpio_chip.set_debounce = max77620_gpio_set_debounce;
 	mgpio->gpio_chip.set = max77620_gpio_set;
-	mgpio->gpio_chip.set_single_ended = max77620_gpio_set_single_ended;
+	mgpio->gpio_chip.set_config = max77620_gpio_set_config;
 	mgpio->gpio_chip.to_irq = max77620_gpio_to_irq;
 	mgpio->gpio_chip.ngpio = MAX77620_GPIO_NR;
 	mgpio->gpio_chip.can_sleep = 1;
diff --git a/drivers/gpio/gpio-menz127.c b/drivers/gpio/gpio-menz127.c
index a1210e330571..e1037582e34d 100644
--- a/drivers/gpio/gpio-menz127.c
+++ b/drivers/gpio/gpio-menz127.c
@@ -89,22 +89,18 @@ static int men_z127_debounce(struct gpio_chip *gc, unsigned gpio,
 
 static int men_z127_set_single_ended(struct gpio_chip *gc,
 				     unsigned offset,
-				     enum single_ended_mode mode)
+				     enum pin_config_param param)
 {
 	struct men_z127_gpio *priv = gpiochip_get_data(gc);
 	u32 od_en;
 
-	if (mode != LINE_MODE_OPEN_DRAIN &&
-	    mode != LINE_MODE_PUSH_PULL)
-		return -ENOTSUPP;
-
 	spin_lock(&gc->bgpio_lock);
 	od_en = readl(priv->reg_base + MEN_Z127_ODER);
 
-	if (mode == LINE_MODE_OPEN_DRAIN)
+	if (param == PIN_CONFIG_DRIVE_OPEN_DRAIN)
 		od_en |= BIT(offset);
 	else
-		/* Implicitly LINE_MODE_PUSH_PULL */
+		/* Implicitly PIN_CONFIG_DRIVE_PUSH_PULL */
 		od_en &= ~BIT(offset);
 
 	writel(od_en, priv->reg_base + MEN_Z127_ODER);
@@ -113,6 +109,27 @@ static int men_z127_set_single_ended(struct gpio_chip *gc,
 	return 0;
 }
 
+static int men_z127_set_config(struct gpio_chip *gc, unsigned offset,
+			       unsigned long config)
+{
+	enum pin_config_param param = pinconf_to_config_param(config);
+
+	switch (param) {
+	case PIN_CONFIG_DRIVE_OPEN_DRAIN:
+	case PIN_CONFIG_DRIVE_PUSH_PULL:
+		return men_z127_set_single_ended(gc, offset, param);
+
+	case PIN_CONFIG_INPUT_DEBOUNCE:
+		return men_z127_debounce(gc, offset,
+			pinconf_to_config_argument(config));
+
+	default:
+		break;
+	}
+
+	return -ENOTSUPP;
+}
+
 static int men_z127_probe(struct mcb_device *mdev,
 			  const struct mcb_device_id *id)
 {
@@ -149,8 +166,7 @@ static int men_z127_probe(struct mcb_device *mdev,
 	if (ret)
 		goto err_unmap;
 
-	men_z127_gpio->gc.set_debounce = men_z127_debounce;
-	men_z127_gpio->gc.set_single_ended = men_z127_set_single_ended;
+	men_z127_gpio->gc.set_config = men_z127_set_config;
 
 	ret = gpiochip_add_data(&men_z127_gpio->gc, men_z127_gpio);
 	if (ret) {
diff --git a/drivers/gpio/gpio-merrifield.c b/drivers/gpio/gpio-merrifield.c
index 69e0f4ace465..f40088d268c1 100644
--- a/drivers/gpio/gpio-merrifield.c
+++ b/drivers/gpio/gpio-merrifield.c
@@ -190,6 +190,18 @@ static int mrfld_gpio_set_debounce(struct gpio_chip *chip, unsigned int offset,
 	return 0;
 }
 
+static int mrfld_gpio_set_config(struct gpio_chip *chip, unsigned int offset,
+				 unsigned long config)
+{
+	u32 debounce;
+
+	if (pinconf_to_config_param(config) != PIN_CONFIG_INPUT_DEBOUNCE)
+		return -ENOTSUPP;
+
+	debounce = pinconf_to_config_argument(config);
+	return mrfld_gpio_set_debounce(chip, offset, debounce);
+}
+
 static void mrfld_irq_ack(struct irq_data *d)
 {
 	struct mrfld_gpio *priv = irq_data_get_irq_chip_data(d);
@@ -414,7 +426,7 @@ static int mrfld_gpio_probe(struct pci_dev *pdev, const struct pci_device_id *id
 	priv->chip.get = mrfld_gpio_get;
 	priv->chip.set = mrfld_gpio_set;
 	priv->chip.get_direction = mrfld_gpio_get_direction;
-	priv->chip.set_debounce = mrfld_gpio_set_debounce;
+	priv->chip.set_config = mrfld_gpio_set_config;
 	priv->chip.base = gpio_base;
 	priv->chip.ngpio = MRFLD_NGPIO;
 	priv->chip.can_sleep = false;
diff --git a/drivers/gpio/gpio-omap.c b/drivers/gpio/gpio-omap.c
index b98ede78c9d8..efc85a279d54 100644
--- a/drivers/gpio/gpio-omap.c
+++ b/drivers/gpio/gpio-omap.c
@@ -974,6 +974,18 @@ static int omap_gpio_debounce(struct gpio_chip *chip, unsigned offset,
 	return 0;
 }
 
+static int omap_gpio_set_config(struct gpio_chip *chip, unsigned offset,
+				unsigned long config)
+{
+	u32 debounce;
+
+	if (pinconf_to_config_param(config) != PIN_CONFIG_INPUT_DEBOUNCE)
+		return -ENOTSUPP;
+
+	debounce = pinconf_to_config_argument(config);
+	return omap_gpio_debounce(chip, offset, debounce);
+}
+
 static void omap_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
 {
 	struct gpio_bank *bank;
@@ -1045,7 +1057,7 @@ static int omap_gpio_chip_init(struct gpio_bank *bank, struct irq_chip *irqc)
 	bank->chip.direction_input = omap_gpio_input;
 	bank->chip.get = omap_gpio_get;
 	bank->chip.direction_output = omap_gpio_output;
-	bank->chip.set_debounce = omap_gpio_debounce;
+	bank->chip.set_config = omap_gpio_set_config;
 	bank->chip.set = omap_gpio_set;
 	if (bank->is_mpuio) {
 		bank->chip.label = "mpuio";
diff --git a/drivers/gpio/gpio-tc3589x.c b/drivers/gpio/gpio-tc3589x.c
index be97101c2c9a..433b45ef332e 100644
--- a/drivers/gpio/gpio-tc3589x.c
+++ b/drivers/gpio/gpio-tc3589x.c
@@ -100,9 +100,8 @@ static int tc3589x_gpio_get_direction(struct gpio_chip *chip,
 	return !(ret & BIT(pos));
 }
 
-static int tc3589x_gpio_set_single_ended(struct gpio_chip *chip,
-					 unsigned int offset,
-					 enum single_ended_mode mode)
+static int tc3589x_gpio_set_config(struct gpio_chip *chip, unsigned int offset,
+				   unsigned long config)
 {
 	struct tc3589x_gpio *tc3589x_gpio = gpiochip_get_data(chip);
 	struct tc3589x *tc3589x = tc3589x_gpio->tc3589x;
@@ -116,22 +115,22 @@ static int tc3589x_gpio_set_single_ended(struct gpio_chip *chip,
 	unsigned int pos = offset % 8;
 	int ret;
 
-	switch(mode) {
-	case LINE_MODE_OPEN_DRAIN:
+	switch (pinconf_to_config_param(config)) {
+	case PIN_CONFIG_DRIVE_OPEN_DRAIN:
 		/* Set open drain mode */
 		ret = tc3589x_set_bits(tc3589x, odmreg, BIT(pos), 0);
 		if (ret)
 			return ret;
 		/* Enable open drain/source mode */
 		return tc3589x_set_bits(tc3589x, odereg, BIT(pos), BIT(pos));
-	case LINE_MODE_OPEN_SOURCE:
+	case PIN_CONFIG_DRIVE_OPEN_SOURCE:
 		/* Set open source mode */
 		ret = tc3589x_set_bits(tc3589x, odmreg, BIT(pos), BIT(pos));
 		if (ret)
 			return ret;
 		/* Enable open drain/source mode */
 		return tc3589x_set_bits(tc3589x, odereg, BIT(pos), BIT(pos));
-	case LINE_MODE_PUSH_PULL:
+	case PIN_CONFIG_DRIVE_PUSH_PULL:
 		/* Disable open drain/source mode */
 		return tc3589x_set_bits(tc3589x, odereg, BIT(pos), 0);
 	default:
@@ -148,7 +147,7 @@ static const struct gpio_chip template_chip = {
 	.direction_output	= tc3589x_gpio_direction_output,
 	.direction_input	= tc3589x_gpio_direction_input,
 	.get_direction		= tc3589x_gpio_get_direction,
-	.set_single_ended	= tc3589x_gpio_set_single_ended,
+	.set_config		= tc3589x_gpio_set_config,
 	.can_sleep		= true,
 };
 
diff --git a/drivers/gpio/gpio-tegra.c b/drivers/gpio/gpio-tegra.c
index 661b0e34e067..88529d3c06c9 100644
--- a/drivers/gpio/gpio-tegra.c
+++ b/drivers/gpio/gpio-tegra.c
@@ -238,6 +238,18 @@ static int tegra_gpio_set_debounce(struct gpio_chip *chip, unsigned int offset,
 	return 0;
 }
 
+static int tegra_gpio_set_config(struct gpio_chip *chip, unsigned int offset,
+				 unsigned long config)
+{
+	u32 debounce;
+
+	if (pinconf_to_config_param(config) != PIN_CONFIG_INPUT_DEBOUNCE)
+		return -ENOTSUPP;
+
+	debounce = pinconf_to_config_argument(config);
+	return tegra_gpio_set_debounce(chip, offset, debounce);
+}
+
 static int tegra_gpio_to_irq(struct gpio_chip *chip, unsigned offset)
 {
 	struct tegra_gpio_info *tgi = gpiochip_get_data(chip);
@@ -615,7 +627,7 @@ static int tegra_gpio_probe(struct platform_device *pdev)
 	platform_set_drvdata(pdev, tgi);
 
 	if (config->debounce_supported)
-		tgi->gc.set_debounce = tegra_gpio_set_debounce;
+		tgi->gc.set_config = tegra_gpio_set_config;
 
 	tgi->bank_info = devm_kzalloc(&pdev->dev, tgi->bank_count *
 				      sizeof(*tgi->bank_info), GFP_KERNEL);
diff --git a/drivers/gpio/gpio-tps65218.c b/drivers/gpio/gpio-tps65218.c
index 46e6dcc089cb..a379bba57d31 100644
--- a/drivers/gpio/gpio-tps65218.c
+++ b/drivers/gpio/gpio-tps65218.c
@@ -139,28 +139,28 @@ static int tps65218_gpio_request(struct gpio_chip *gc, unsigned offset)
 	return 0;
 }
 
-static int tps65218_gpio_set_single_ended(struct gpio_chip *gc,
-					  unsigned offset,
-					  enum single_ended_mode mode)
+static int tps65218_gpio_set_config(struct gpio_chip *gc, unsigned offset,
+				    unsigned long config)
 {
 	struct tps65218_gpio *tps65218_gpio = gpiochip_get_data(gc);
 	struct tps65218 *tps65218 = tps65218_gpio->tps65218;
+	enum pin_config_param param = pinconf_to_config_param(config);
 
 	switch (offset) {
 	case 0:
 	case 2:
 		/* GPO1 is hardwired to be open drain */
-		if (mode == LINE_MODE_OPEN_DRAIN)
+		if (param == PIN_CONFIG_DRIVE_OPEN_DRAIN)
 			return 0;
 		return -ENOTSUPP;
 	case 1:
 		/* GPO2 is push-pull by default, can be set as open drain. */
-		if (mode == LINE_MODE_OPEN_DRAIN)
+		if (param == PIN_CONFIG_DRIVE_OPEN_DRAIN)
 			return tps65218_clear_bits(tps65218,
 						   TPS65218_REG_CONFIG1,
 						   TPS65218_CONFIG1_GPO2_BUF,
 						   TPS65218_PROTECT_L1);
-		if (mode == LINE_MODE_PUSH_PULL)
+		if (param == PIN_CONFIG_DRIVE_PUSH_PULL)
 			return tps65218_set_bits(tps65218,
 						 TPS65218_REG_CONFIG1,
 						 TPS65218_CONFIG1_GPO2_BUF,
@@ -181,7 +181,7 @@ static const struct gpio_chip template_chip = {
 	.direction_input	= tps65218_gpio_input,
 	.get			= tps65218_gpio_get,
 	.set			= tps65218_gpio_set,
-	.set_single_ended	= tps65218_gpio_set_single_ended,
+	.set_config		= tps65218_gpio_set_config,
 	.can_sleep		= true,
 	.ngpio			= 3,
 	.base			= -1,
diff --git a/drivers/gpio/gpio-vx855.c b/drivers/gpio/gpio-vx855.c
index 4e450121129b..98a6f1fcc561 100644
--- a/drivers/gpio/gpio-vx855.c
+++ b/drivers/gpio/gpio-vx855.c
@@ -186,23 +186,24 @@ static int vx855gpio_direction_output(struct gpio_chip *gpio,
 	return 0;
 }
 
-static int vx855gpio_set_single_ended(struct gpio_chip *gpio,
-				      unsigned int nr,
-				      enum single_ended_mode mode)
+static int vx855gpio_set_config(struct gpio_chip *gpio, unsigned int nr,
+				unsigned long config)
 {
+	enum pin_config_param param = pinconf_to_config_param(config);
+
 	/* The GPI cannot be single-ended */
 	if (nr < NR_VX855_GPI)
 		return -EINVAL;
 
 	/* The GPO's are push-pull */
 	if (nr < NR_VX855_GPInO) {
-		if (mode != LINE_MODE_PUSH_PULL)
+		if (param != PIN_CONFIG_DRIVE_PUSH_PULL)
 			return -ENOTSUPP;
 		return 0;
 	}
 
 	/* The GPIO's are open drain */
-	if (mode != LINE_MODE_OPEN_DRAIN)
+	if (param != PIN_CONFIG_DRIVE_OPEN_DRAIN)
 		return -ENOTSUPP;
 
 	return 0;
@@ -231,7 +232,7 @@ static void vx855gpio_gpio_setup(struct vx855_gpio *vg)
 	c->direction_output = vx855gpio_direction_output;
 	c->get = vx855gpio_get;
 	c->set = vx855gpio_set;
-	c->set_single_ended = vx855gpio_set_single_ended;
+	c->set_config = vx855gpio_set_config,
 	c->dbg_show = NULL;
 	c->base = 0;
 	c->ngpio = NR_VX855_GP;
diff --git a/drivers/gpio/gpio-wcove.c b/drivers/gpio/gpio-wcove.c
index 34baee5b1dd6..97613de5304e 100644
--- a/drivers/gpio/gpio-wcove.c
+++ b/drivers/gpio/gpio-wcove.c
@@ -202,17 +202,16 @@ static void wcove_gpio_set(struct gpio_chip *chip,
 		regmap_update_bits(wg->regmap, to_reg(gpio, CTRL_OUT), 1, 0);
 }
 
-static int wcove_gpio_set_single_ended(struct gpio_chip *chip,
-					unsigned int gpio,
-					enum single_ended_mode mode)
+static int wcove_gpio_set_config(struct gpio_chip *chip, unsigned int gpio,
+				 unsigned long config)
 {
 	struct wcove_gpio *wg = gpiochip_get_data(chip);
 
-	switch (mode) {
-	case LINE_MODE_OPEN_DRAIN:
+	switch (pinconf_to_config_param(config)) {
+	case PIN_CONFIG_DRIVE_OPEN_DRAIN:
 		return regmap_update_bits(wg->regmap, to_reg(gpio, CTRL_OUT),
 						CTLO_DRV_MASK, CTLO_DRV_OD);
-	case LINE_MODE_PUSH_PULL:
+	case PIN_CONFIG_DRIVE_PUSH_PULL:
 		return regmap_update_bits(wg->regmap, to_reg(gpio, CTRL_OUT),
 						CTLO_DRV_MASK, CTLO_DRV_CMOS);
 	default:
@@ -411,7 +410,7 @@ static int wcove_gpio_probe(struct platform_device *pdev)
 	wg->chip.get_direction = wcove_gpio_get_direction;
 	wg->chip.get = wcove_gpio_get;
 	wg->chip.set = wcove_gpio_set;
-	wg->chip.set_single_ended = wcove_gpio_set_single_ended,
+	wg->chip.set_config = wcove_gpio_set_config,
 	wg->chip.base = -1;
 	wg->chip.ngpio = WCOVE_VGPIO_NUM;
 	wg->chip.can_sleep = true;
diff --git a/drivers/gpio/gpio-wm831x.c b/drivers/gpio/gpio-wm831x.c
index 533707f943f4..00e3839b3f96 100644
--- a/drivers/gpio/gpio-wm831x.c
+++ b/drivers/gpio/gpio-wm831x.c
@@ -101,11 +101,9 @@ static int wm831x_gpio_to_irq(struct gpio_chip *chip, unsigned offset)
 				  WM831X_IRQ_GPIO_1 + offset);
 }
 
-static int wm831x_gpio_set_debounce(struct gpio_chip *chip, unsigned offset,
+static int wm831x_gpio_set_debounce(struct wm831x *wm831x, unsigned offset,
 				    unsigned debounce)
 {
-	struct wm831x_gpio *wm831x_gpio = gpiochip_get_data(chip);
-	struct wm831x *wm831x = wm831x_gpio->wm831x;
 	int reg = WM831X_GPIO1_CONTROL + offset;
 	int ret, fn;
 
@@ -132,21 +130,23 @@ static int wm831x_gpio_set_debounce(struct gpio_chip *chip, unsigned offset,
 	return wm831x_set_bits(wm831x, reg, WM831X_GPN_FN_MASK, fn);
 }
 
-static int wm831x_set_single_ended(struct gpio_chip *chip,
-				   unsigned int offset,
-				   enum single_ended_mode mode)
+static int wm831x_set_config(struct gpio_chip *chip, unsigned int offset,
+			     unsigned long config)
 {
 	struct wm831x_gpio *wm831x_gpio = gpiochip_get_data(chip);
 	struct wm831x *wm831x = wm831x_gpio->wm831x;
 	int reg = WM831X_GPIO1_CONTROL + offset;
 
-	switch (mode) {
-	case LINE_MODE_OPEN_DRAIN:
+	switch (pinconf_to_config_param(config)) {
+	case PIN_CONFIG_DRIVE_OPEN_DRAIN:
 		return wm831x_set_bits(wm831x, reg,
 				       WM831X_GPN_OD_MASK, WM831X_GPN_OD);
-	case LINE_MODE_PUSH_PULL:
+	case PIN_CONFIG_DRIVE_PUSH_PULL:
 		return wm831x_set_bits(wm831x, reg,
 				       WM831X_GPN_OD_MASK, 0);
+	case PIN_CONFIG_INPUT_DEBOUNCE:
+		return wm831x_gpio_set_debounce(wm831x, offset,
+			pinconf_to_config_argument(config));
 	default:
 		break;
 	}
@@ -255,8 +255,7 @@ static const struct gpio_chip template_chip = {
 	.direction_output	= wm831x_gpio_direction_out,
 	.set			= wm831x_gpio_set,
 	.to_irq			= wm831x_gpio_to_irq,
-	.set_debounce		= wm831x_gpio_set_debounce,
-	.set_single_ended	= wm831x_set_single_ended,
+	.set_config		= wm831x_set_config,
 	.dbg_show		= wm831x_gpio_dbg_show,
 	.can_sleep		= true,
 };
diff --git a/drivers/gpio/gpio-wm8994.c b/drivers/gpio/gpio-wm8994.c
index 68410fda6138..1e35756ac55b 100644
--- a/drivers/gpio/gpio-wm8994.c
+++ b/drivers/gpio/gpio-wm8994.c
@@ -103,19 +103,18 @@ static void wm8994_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
 	wm8994_set_bits(wm8994, WM8994_GPIO_1 + offset, WM8994_GPN_LVL, value);
 }
 
-static int wm8994_gpio_set_single_ended(struct gpio_chip *chip,
-					unsigned int offset,
-					enum single_ended_mode mode)
+static int wm8994_gpio_set_config(struct gpio_chip *chip, unsigned int offset,
+				  unsigned long config)
 {
 	struct wm8994_gpio *wm8994_gpio = gpiochip_get_data(chip);
 	struct wm8994 *wm8994 = wm8994_gpio->wm8994;
 
-	switch (mode) {
-	case LINE_MODE_OPEN_DRAIN:
+	switch (pinconf_to_config_param(config)) {
+	case PIN_CONFIG_DRIVE_OPEN_DRAIN:
 		return wm8994_set_bits(wm8994, WM8994_GPIO_1 + offset,
 				       WM8994_GPN_OP_CFG_MASK,
 				       WM8994_GPN_OP_CFG);
-	case LINE_MODE_PUSH_PULL:
+	case PIN_CONFIG_DRIVE_PUSH_PULL:
 		return wm8994_set_bits(wm8994, WM8994_GPIO_1 + offset,
 				       WM8994_GPN_OP_CFG_MASK, 0);
 	default:
@@ -257,7 +256,7 @@ static const struct gpio_chip template_chip = {
 	.get			= wm8994_gpio_get,
 	.direction_output	= wm8994_gpio_direction_out,
 	.set			= wm8994_gpio_set,
-	.set_single_ended	= wm8994_gpio_set_single_ended,
+	.set_config		= wm8994_gpio_set_config,
 	.to_irq			= wm8994_gpio_to_irq,
 	.dbg_show		= wm8994_gpio_dbg_show,
 	.can_sleep		= true,
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index f4c26c7826cd..b1659860be1a 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -1876,6 +1876,19 @@ void gpiochip_generic_free(struct gpio_chip *chip, unsigned offset)
 }
 EXPORT_SYMBOL_GPL(gpiochip_generic_free);
 
+/**
+ * gpiochip_generic_config() - apply configuration for a pin
+ * @chip: the gpiochip owning the GPIO
+ * @offset: the offset of the GPIO to apply the configuration
+ * @config: the configuration to be applied
+ */
+int gpiochip_generic_config(struct gpio_chip *chip, unsigned offset,
+			    unsigned long config)
+{
+	return pinctrl_gpio_set_config(chip->gpiodev->base + offset, config);
+}
+EXPORT_SYMBOL_GPL(gpiochip_generic_config);
+
 #ifdef CONFIG_PINCTRL
 
 /**
@@ -2264,6 +2277,14 @@ int gpiod_direction_input(struct gpio_desc *desc)
 }
 EXPORT_SYMBOL_GPL(gpiod_direction_input);
 
+static int gpio_set_drive_single_ended(struct gpio_chip *gc, unsigned offset,
+				       enum pin_config_param mode)
+{
+	unsigned long config = { PIN_CONF_PACKED(mode, 0) };
+
+	return gc->set_config ? gc->set_config(gc, offset, config) : -ENOTSUPP;
+}
+
 static int _gpiod_direction_output_raw(struct gpio_desc *desc, int value)
 {
 	struct gpio_chip *gc = desc->gdev->chip;
@@ -2280,32 +2301,25 @@ static int _gpiod_direction_output_raw(struct gpio_desc *desc, int value)
 
 	if (test_bit(FLAG_OPEN_DRAIN, &desc->flags)) {
 		/* First see if we can enable open drain in hardware */
-		if (gc->set_single_ended) {
-			ret = gc->set_single_ended(gc, gpio_chip_hwgpio(desc),
-						   LINE_MODE_OPEN_DRAIN);
-			if (!ret)
-				goto set_output_value;
-		}
+		ret = gpio_set_drive_single_ended(gc, gpio_chip_hwgpio(desc),
+						  PIN_CONFIG_DRIVE_OPEN_DRAIN);
+		if (!ret)
+			goto set_output_value;
 		/* Emulate open drain by not actively driving the line high */
 		if (val)
 			return gpiod_direction_input(desc);
 	}
 	else if (test_bit(FLAG_OPEN_SOURCE, &desc->flags)) {
-		if (gc->set_single_ended) {
-			ret = gc->set_single_ended(gc, gpio_chip_hwgpio(desc),
-						   LINE_MODE_OPEN_SOURCE);
-			if (!ret)
-				goto set_output_value;
-		}
+		ret = gpio_set_drive_single_ended(gc, gpio_chip_hwgpio(desc),
+						  PIN_CONFIG_DRIVE_OPEN_SOURCE);
+		if (!ret)
+			goto set_output_value;
 		/* Emulate open source by not actively driving the line low */
 		if (!val)
 			return gpiod_direction_input(desc);
 	} else {
-		/* Make sure to disable open drain/source hardware, if any */
-		if (gc->set_single_ended)
-			gc->set_single_ended(gc,
-					     gpio_chip_hwgpio(desc),
-					     LINE_MODE_PUSH_PULL);
+		gpio_set_drive_single_ended(gc, gpio_chip_hwgpio(desc),
+					    PIN_CONFIG_DRIVE_PUSH_PULL);
 	}
 
 set_output_value:
@@ -2376,17 +2390,19 @@ EXPORT_SYMBOL_GPL(gpiod_direction_output);
 int gpiod_set_debounce(struct gpio_desc *desc, unsigned debounce)
 {
 	struct gpio_chip	*chip;
+	unsigned long		config;
 
 	VALIDATE_DESC(desc);
 	chip = desc->gdev->chip;
-	if (!chip->set || !chip->set_debounce) {
+	if (!chip->set || !chip->set_config) {
 		gpiod_dbg(desc,
-			  "%s: missing set() or set_debounce() operations\n",
+			  "%s: missing set() or set_config() operations\n",
 			  __func__);
 		return -ENOTSUPP;
 	}
 
-	return chip->set_debounce(chip, gpio_chip_hwgpio(desc), debounce);
+	config = pinconf_to_config_packed(PIN_CONFIG_INPUT_DEBOUNCE, debounce);
+	return chip->set_config(chip, gpio_chip_hwgpio(desc), config);
 }
 EXPORT_SYMBOL_GPL(gpiod_set_debounce);
 
diff --git a/drivers/pinctrl/mediatek/pinctrl-mtk-common.c b/drivers/pinctrl/mediatek/pinctrl-mtk-common.c
index f9aef2ac03a1..3cf384f8b122 100644
--- a/drivers/pinctrl/mediatek/pinctrl-mtk-common.c
+++ b/drivers/pinctrl/mediatek/pinctrl-mtk-common.c
@@ -1054,6 +1054,18 @@ static int mtk_gpio_set_debounce(struct gpio_chip *chip, unsigned offset,
 	return 0;
 }
 
+static int mtk_gpio_set_config(struct gpio_chip *chip, unsigned offset,
+			       unsigned long config)
+{
+	u32 debounce;
+
+	if (pinconf_to_config_param(config) != PIN_CONFIG_INPUT_DEBOUNCE)
+		return -ENOTSUPP;
+
+	debounce = pinconf_to_config_argument(config);
+	return mtk_gpio_set_debounce(chip, offset, debounce);
+}
+
 static const struct gpio_chip mtk_gpio_chip = {
 	.owner			= THIS_MODULE,
 	.request		= gpiochip_generic_request,
@@ -1064,7 +1076,7 @@ static const struct gpio_chip mtk_gpio_chip = {
 	.get			= mtk_gpio_get,
 	.set			= mtk_gpio_set,
 	.to_irq			= mtk_gpio_to_irq,
-	.set_debounce		= mtk_gpio_set_debounce,
+	.set_config		= mtk_gpio_set_config,
 	.of_gpio_n_cells	= 2,
 };
 
diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c
index aea310a91821..e440665ecc35 100644
--- a/drivers/pinctrl/pinctrl-amd.c
+++ b/drivers/pinctrl/pinctrl-amd.c
@@ -164,6 +164,18 @@ static int amd_gpio_set_debounce(struct gpio_chip *gc, unsigned offset,
 	return ret;
 }
 
+static int amd_gpio_set_config(struct gpio_chip *gc, unsigned offset,
+			       unsigned long config)
+{
+	u32 debounce;
+
+	if (pinconf_to_config_param(config) != PIN_CONFIG_INPUT_DEBOUNCE)
+		return -ENOTSUPP;
+
+	debounce = pinconf_to_config_argument(config);
+	return amd_gpio_set_debounce(gc, offset, debounce);
+}
+
 #ifdef CONFIG_DEBUG_FS
 static void amd_gpio_dbg_show(struct seq_file *s, struct gpio_chip *gc)
 {
@@ -761,7 +773,7 @@ static int amd_gpio_probe(struct platform_device *pdev)
 	gpio_dev->gc.direction_output	= amd_gpio_direction_output;
 	gpio_dev->gc.get			= amd_gpio_get_value;
 	gpio_dev->gc.set			= amd_gpio_set_value;
-	gpio_dev->gc.set_debounce	= amd_gpio_set_debounce;
+	gpio_dev->gc.set_config		= amd_gpio_set_config;
 	gpio_dev->gc.dbg_show		= amd_gpio_dbg_show;
 
 	gpio_dev->gc.base			= 0;
diff --git a/drivers/pinctrl/pinctrl-sx150x.c b/drivers/pinctrl/pinctrl-sx150x.c
index 29fb7403d24e..7450f5118445 100644
--- a/drivers/pinctrl/pinctrl-sx150x.c
+++ b/drivers/pinctrl/pinctrl-sx150x.c
@@ -424,41 +424,6 @@ static int sx150x_gpio_get(struct gpio_chip *chip, unsigned int offset)
 	return !!(value & BIT(offset));
 }
 
-static int sx150x_gpio_set_single_ended(struct gpio_chip *chip,
-					unsigned int offset,
-					enum single_ended_mode mode)
-{
-	struct sx150x_pinctrl *pctl = gpiochip_get_data(chip);
-	int ret;
-
-	switch (mode) {
-	case LINE_MODE_PUSH_PULL:
-		if (pctl->data->model != SX150X_789 ||
-		    sx150x_pin_is_oscio(pctl, offset))
-			return 0;
-
-		ret = regmap_write_bits(pctl->regmap,
-					pctl->data->pri.x789.reg_drain,
-					BIT(offset), 0);
-		break;
-
-	case LINE_MODE_OPEN_DRAIN:
-		if (pctl->data->model != SX150X_789 ||
-		    sx150x_pin_is_oscio(pctl, offset))
-			return -ENOTSUPP;
-
-		ret = regmap_write_bits(pctl->regmap,
-					pctl->data->pri.x789.reg_drain,
-					BIT(offset), BIT(offset));
-		break;
-	default:
-		ret = -ENOTSUPP;
-		break;
-	}
-
-	return ret;
-}
-
 static int __sx150x_gpio_set(struct sx150x_pinctrl *pctl, unsigned int offset,
 			     int value)
 {
@@ -811,16 +776,26 @@ static int sx150x_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin,
 			break;
 
 		case PIN_CONFIG_DRIVE_OPEN_DRAIN:
-			ret = sx150x_gpio_set_single_ended(&pctl->gpio,
-						pin, LINE_MODE_OPEN_DRAIN);
+			if (pctl->data->model != SX150X_789 ||
+			    sx150x_pin_is_oscio(pctl, pin))
+				return -ENOTSUPP;
+
+			ret = regmap_write_bits(pctl->regmap,
+						pctl->data->pri.x789.reg_drain,
+						BIT(pin), BIT(pin));
 			if (ret < 0)
 				return ret;
 
 			break;
 
 		case PIN_CONFIG_DRIVE_PUSH_PULL:
-			ret = sx150x_gpio_set_single_ended(&pctl->gpio,
-						pin, LINE_MODE_PUSH_PULL);
+			if (pctl->data->model != SX150X_789 ||
+			    sx150x_pin_is_oscio(pctl, pin))
+				return 0;
+
+			ret = regmap_write_bits(pctl->regmap,
+						pctl->data->pri.x789.reg_drain,
+						BIT(pin), 0);
 			if (ret < 0)
 				return ret;
 
@@ -1178,7 +1153,7 @@ static int sx150x_probe(struct i2c_client *client,
 	pctl->gpio.direction_output = sx150x_gpio_direction_output;
 	pctl->gpio.get = sx150x_gpio_get;
 	pctl->gpio.set = sx150x_gpio_set;
-	pctl->gpio.set_single_ended = sx150x_gpio_set_single_ended;
+	pctl->gpio.set_config = gpiochip_generic_config;
 	pctl->gpio.parent = dev;
 #ifdef CONFIG_OF_GPIO
 	pctl->gpio.of_node = dev->of_node;
diff --git a/drivers/staging/greybus/gpio.c b/drivers/staging/greybus/gpio.c
index 250caa00de5e..51384bdde450 100644
--- a/drivers/staging/greybus/gpio.c
+++ b/drivers/staging/greybus/gpio.c
@@ -474,17 +474,20 @@ static void gb_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
 	gb_gpio_set_value_operation(ggc, (u8)offset, !!value);
 }
 
-static int gb_gpio_set_debounce(struct gpio_chip *chip, unsigned offset,
-					unsigned debounce)
+static int gb_gpio_set_config(struct gpio_chip *chip, unsigned offset,
+			      unsigned long config)
 {
 	struct gb_gpio_controller *ggc = gpio_chip_to_gb_gpio_controller(chip);
-	u16 usec;
+	u32 debounce;
 
+	if (pinconf_to_config_param(config) != PIN_CONFIG_INPUT_DEBOUNCE)
+		return -ENOTSUPP;
+
+	debounce = pinconf_to_config_argument(config);
 	if (debounce > U16_MAX)
 		return -EINVAL;
-	usec = (u16)debounce;
 
-	return gb_gpio_set_debounce_operation(ggc, (u8)offset, usec);
+	return gb_gpio_set_debounce_operation(ggc, (u8)offset, (u16)debounce);
 }
 
 static int gb_gpio_controller_setup(struct gb_gpio_controller *ggc)
@@ -689,7 +692,7 @@ static int gb_gpio_probe(struct gbphy_device *gbphy_dev,
 	gpio->direction_output = gb_gpio_direction_output;
 	gpio->get = gb_gpio_get;
 	gpio->set = gb_gpio_set;
-	gpio->set_debounce = gb_gpio_set_debounce;
+	gpio->set_config = gb_gpio_set_config;
 	gpio->to_irq = gb_gpio_to_irq;
 	gpio->base = -1;		/* Allocate base dynamically */
 	gpio->ngpio = ggc->line_max + 1;
diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c
index fff718352e0c..5d61d0871f2e 100644
--- a/drivers/usb/serial/cp210x.c
+++ b/drivers/usb/serial/cp210x.c
@@ -1329,17 +1329,20 @@ static int cp210x_gpio_direction_output(struct gpio_chip *gc, unsigned int gpio,
 	return 0;
 }
 
-static int cp210x_gpio_set_single_ended(struct gpio_chip *gc, unsigned int gpio,
-					enum single_ended_mode mode)
+static int cp210x_gpio_set_config(struct gpio_chip *gc, unsigned int gpio,
+				  unsigned long config)
 {
 	struct usb_serial *serial = gpiochip_get_data(gc);
 	struct cp210x_serial_private *priv = usb_get_serial_data(serial);
+	enum pin_config_param param = pinconf_to_config_param(config);
 
 	/* Succeed only if in correct mode (this can't be set at runtime) */
-	if ((mode == LINE_MODE_PUSH_PULL) && (priv->gpio_mode & BIT(gpio)))
+	if ((param == PIN_CONFIG_DRIVE_PUSH_PULL) &&
+	    (priv->gpio_mode & BIT(gpio)))
 		return 0;
 
-	if ((mode == LINE_MODE_OPEN_DRAIN) && !(priv->gpio_mode & BIT(gpio)))
+	if ((param == PIN_CONFIG_DRIVE_OPEN_DRAIN) &&
+	    !(priv->gpio_mode & BIT(gpio)))
 		return 0;
 
 	return -ENOTSUPP;
@@ -1402,7 +1405,7 @@ static int cp2105_shared_gpio_init(struct usb_serial *serial)
 	priv->gc.direction_output = cp210x_gpio_direction_output;
 	priv->gc.get = cp210x_gpio_get;
 	priv->gc.set = cp210x_gpio_set;
-	priv->gc.set_single_ended = cp210x_gpio_set_single_ended;
+	priv->gc.set_config = cp210x_gpio_set_config;
 	priv->gc.owner = THIS_MODULE;
 	priv->gc.parent = &serial->interface->dev;
 	priv->gc.base = -1;
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index c2748accea71..db2022910caf 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -8,6 +8,7 @@
 #include <linux/irqdomain.h>
 #include <linux/lockdep.h>
 #include <linux/pinctrl/pinctrl.h>
+#include <linux/pinctrl/pinconf-generic.h>
 
 struct gpio_desc;
 struct of_phandle_args;
@@ -18,18 +19,6 @@ struct module;
 
 #ifdef CONFIG_GPIOLIB
 
-/**
- * enum single_ended_mode - mode for single ended operation
- * @LINE_MODE_PUSH_PULL: normal mode for a GPIO line, drive actively high/low
- * @LINE_MODE_OPEN_DRAIN: set line to be open drain
- * @LINE_MODE_OPEN_SOURCE: set line to be open source
- */
-enum single_ended_mode {
-	LINE_MODE_PUSH_PULL,
-	LINE_MODE_OPEN_DRAIN,
-	LINE_MODE_OPEN_SOURCE,
-};
-
 /**
  * struct gpio_chip - abstract a GPIO controller
  * @label: a functional name for the GPIO device, such as a part
@@ -48,16 +37,8 @@ enum single_ended_mode {
  * @get: returns value for signal "offset", 0=low, 1=high, or negative error
  * @set: assigns output value for signal "offset"
  * @set_multiple: assigns output values for multiple signals defined by "mask"
- * @set_debounce: optional hook for setting debounce time for specified gpio in
- *	interrupt triggered gpio chips
- * @set_single_ended: optional hook for setting a line as open drain, open
- *	source, or non-single ended (restore from open drain/source to normal
- *	push-pull mode) this should be implemented if the hardware supports
- *	open drain or open source settings. The GPIOlib will otherwise try
- *	to emulate open drain/source by not actively driving lines high/low
- *	if a consumer request this. The driver may return -ENOTSUPP if e.g.
- *	it supports just open drain but not open source and is called
- *	with LINE_MODE_OPEN_SOURCE as mode argument.
+ * @set_config: optional hook for all kinds of settings. Uses the same
+ *	packed config format as generic pinconf.
  * @to_irq: optional hook supporting non-static gpio_to_irq() mappings;
  *	implementation may not sleep
  * @dbg_show: optional routine to show contents in debugfs; default code
@@ -150,13 +131,9 @@ struct gpio_chip {
 	void			(*set_multiple)(struct gpio_chip *chip,
 						unsigned long *mask,
 						unsigned long *bits);
-	int			(*set_debounce)(struct gpio_chip *chip,
-						unsigned offset,
-						unsigned debounce);
-	int			(*set_single_ended)(struct gpio_chip *chip,
-						unsigned offset,
-						enum single_ended_mode mode);
-
+	int			(*set_config)(struct gpio_chip *chip,
+					      unsigned offset,
+					      unsigned long config);
 	int			(*to_irq)(struct gpio_chip *chip,
 						unsigned offset);
 
@@ -310,6 +287,8 @@ static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip,
 
 int gpiochip_generic_request(struct gpio_chip *chip, unsigned offset);
 void gpiochip_generic_free(struct gpio_chip *chip, unsigned offset);
+int gpiochip_generic_config(struct gpio_chip *chip, unsigned offset,
+			    unsigned long config);
 
 #ifdef CONFIG_PINCTRL
 
diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h
index 9a09107c890e..7620eb127cff 100644
--- a/include/linux/pinctrl/pinconf-generic.h
+++ b/include/linux/pinctrl/pinconf-generic.h
@@ -12,12 +12,6 @@
 #ifndef __LINUX_PINCTRL_PINCONF_GENERIC_H
 #define __LINUX_PINCTRL_PINCONF_GENERIC_H
 
-/*
- * You shouldn't even be able to compile with these enums etc unless you're
- * using generic pin config. That is why this is defined out.
- */
-#ifdef CONFIG_GENERIC_PINCONF
-
 /**
  * enum pin_config_param - possible pin configuration parameters
  * @PIN_CONFIG_BIAS_BUS_HOLD: the pin will be set to weakly latch so that it
@@ -118,18 +112,6 @@ enum pin_config_param {
 	PIN_CONFIG_MAX = 0xFF,
 };
 
-#ifdef CONFIG_DEBUG_FS
-#define PCONFDUMP(a, b, c, d) { .param = a, .display = b, .format = c, \
-				.has_arg = d }
-
-struct pin_config_item {
-	const enum pin_config_param param;
-	const char * const display;
-	const char * const format;
-	bool has_arg;
-};
-#endif /* CONFIG_DEBUG_FS */
-
 /*
  * Helpful configuration macro to be used in tables etc.
  */
@@ -158,6 +140,21 @@ static inline unsigned long pinconf_to_config_packed(enum pin_config_param param
 	return PIN_CONF_PACKED(param, argument);
 }
 
+#ifdef CONFIG_GENERIC_PINCONF
+
+#ifdef CONFIG_DEBUG_FS
+#define PCONFDUMP(a, b, c, d) {					\
+	.param = a, .display = b, .format = c, .has_arg = d	\
+	}
+
+struct pin_config_item {
+	const enum pin_config_param param;
+	const char * const display;
+	const char * const format;
+	bool has_arg;
+};
+#endif /* CONFIG_DEBUG_FS */
+
 #ifdef CONFIG_OF
 
 #include <linux/device.h>
-- 
cgit v1.2.3


From 228c8c6b1f4376788e9d5ab00d50b10228eb40d3 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 26 Jan 2017 17:15:44 +0100
Subject: wireless: define cipher/AKM suites using a macro

The spec writes cipher/AKM suites as something like 00-0F-AC:9,
but the part after the colon isn't hex, it's decimal, so that
we've already had a few mistakes (in other code, or unmerged
patches) to e.g. write 0x000FAC10 instead of 0x000FAC0A.

Use a macro to avoid that problem.

Reviewed-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 46 ++++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 87d1937e4671..02768de209d6 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2324,31 +2324,33 @@ enum ieee80211_sa_query_action {
 };
 
 
+#define SUITE(oui, id)	(((oui) << 8) | (id))
+
 /* cipher suite selectors */
-#define WLAN_CIPHER_SUITE_USE_GROUP	0x000FAC00
-#define WLAN_CIPHER_SUITE_WEP40		0x000FAC01
-#define WLAN_CIPHER_SUITE_TKIP		0x000FAC02
-/* reserved: 				0x000FAC03 */
-#define WLAN_CIPHER_SUITE_CCMP		0x000FAC04
-#define WLAN_CIPHER_SUITE_WEP104	0x000FAC05
-#define WLAN_CIPHER_SUITE_AES_CMAC	0x000FAC06
-#define WLAN_CIPHER_SUITE_GCMP		0x000FAC08
-#define WLAN_CIPHER_SUITE_GCMP_256	0x000FAC09
-#define WLAN_CIPHER_SUITE_CCMP_256	0x000FAC0A
-#define WLAN_CIPHER_SUITE_BIP_GMAC_128	0x000FAC0B
-#define WLAN_CIPHER_SUITE_BIP_GMAC_256	0x000FAC0C
-#define WLAN_CIPHER_SUITE_BIP_CMAC_256	0x000FAC0D
-
-#define WLAN_CIPHER_SUITE_SMS4		0x00147201
+#define WLAN_CIPHER_SUITE_USE_GROUP	SUITE(0x000FAC, 0)
+#define WLAN_CIPHER_SUITE_WEP40		SUITE(0x000FAC, 1)
+#define WLAN_CIPHER_SUITE_TKIP		SUITE(0x000FAC, 2)
+/* reserved: 				SUITE(0x000FAC, 3) */
+#define WLAN_CIPHER_SUITE_CCMP		SUITE(0x000FAC, 4)
+#define WLAN_CIPHER_SUITE_WEP104	SUITE(0x000FAC, 5)
+#define WLAN_CIPHER_SUITE_AES_CMAC	SUITE(0x000FAC, 6)
+#define WLAN_CIPHER_SUITE_GCMP		SUITE(0x000FAC, 8)
+#define WLAN_CIPHER_SUITE_GCMP_256	SUITE(0x000FAC, 9)
+#define WLAN_CIPHER_SUITE_CCMP_256	SUITE(0x000FAC, 10)
+#define WLAN_CIPHER_SUITE_BIP_GMAC_128	SUITE(0x000FAC, 11)
+#define WLAN_CIPHER_SUITE_BIP_GMAC_256	SUITE(0x000FAC, 12)
+#define WLAN_CIPHER_SUITE_BIP_CMAC_256	SUITE(0x000FAC, 13)
+
+#define WLAN_CIPHER_SUITE_SMS4		SUITE(0x001472, 1)
 
 /* AKM suite selectors */
-#define WLAN_AKM_SUITE_8021X		0x000FAC01
-#define WLAN_AKM_SUITE_PSK		0x000FAC02
-#define WLAN_AKM_SUITE_8021X_SHA256	0x000FAC05
-#define WLAN_AKM_SUITE_PSK_SHA256	0x000FAC06
-#define WLAN_AKM_SUITE_TDLS		0x000FAC07
-#define WLAN_AKM_SUITE_SAE		0x000FAC08
-#define WLAN_AKM_SUITE_FT_OVER_SAE	0x000FAC09
+#define WLAN_AKM_SUITE_8021X		SUITE(0x000FAC, 1)
+#define WLAN_AKM_SUITE_PSK		SUITE(0x000FAC, 2)
+#define WLAN_AKM_SUITE_8021X_SHA256	SUITE(0x000FAC, 5)
+#define WLAN_AKM_SUITE_PSK_SHA256	SUITE(0x000FAC, 6)
+#define WLAN_AKM_SUITE_TDLS		SUITE(0x000FAC, 7)
+#define WLAN_AKM_SUITE_SAE		SUITE(0x000FAC, 8)
+#define WLAN_AKM_SUITE_FT_OVER_SAE	SUITE(0x000FAC, 9)
 
 #define WLAN_MAX_KEY_LEN		32
 
-- 
cgit v1.2.3


From 65e251a4634c5644efca6f7e15803f0962d8943d Mon Sep 17 00:00:00 2001
From: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Date: Tue, 3 Jan 2017 17:34:56 +0000
Subject: iommu: Drop the of_iommu_{set/get}_ops() interface

With the introduction of the new iommu_{register/get}_instance()
interface in commit e4f10ffe4c9b ("iommu: Make of_iommu_set/get_ops() DT
agnostic") (based on struct fwnode_handle as look-up token, so firmware
agnostic) to register IOMMU instances with the core IOMMU layer there is
no reason to keep the old OF based interface around any longer.

Convert all the IOMMU drivers (and OF IOMMU core code) that rely on the
of_iommu_{set/get}_ops() to the new kernel interface to register/retrieve
IOMMU instances and remove the of_iommu_{set/get}_ops() remaining glue
code in order to complete the interface rework.

Cc: Matthias Brugger <matthias.bgg@gmail.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Tested-by: Sricharan R <sricharan@codeaurora.org>
Tested-by: Yong Wu <yong.wu@mediatek.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/iommu/exynos-iommu.c |  2 +-
 drivers/iommu/msm_iommu.c    |  2 +-
 drivers/iommu/mtk_iommu.c    |  2 +-
 drivers/iommu/of_iommu.c     |  4 ++--
 include/linux/of_iommu.h     | 11 -----------
 5 files changed, 5 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c
index 57ba0d3091ea..b79e4c452b8b 100644
--- a/drivers/iommu/exynos-iommu.c
+++ b/drivers/iommu/exynos-iommu.c
@@ -628,7 +628,7 @@ static int __init exynos_sysmmu_probe(struct platform_device *pdev)
 
 	pm_runtime_enable(dev);
 
-	of_iommu_set_ops(dev->of_node, &exynos_iommu_ops);
+	iommu_register_instance(dev->fwnode, &exynos_iommu_ops);
 
 	return 0;
 }
diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
index b09692bb5b0a..9cd3cee8db4d 100644
--- a/drivers/iommu/msm_iommu.c
+++ b/drivers/iommu/msm_iommu.c
@@ -737,7 +737,7 @@ static int msm_iommu_probe(struct platform_device *pdev)
 	}
 
 	list_add(&iommu->dev_node, &qcom_iommu_devices);
-	of_iommu_set_ops(pdev->dev.of_node, &msm_iommu_ops);
+	iommu_register_instance(pdev->dev.fwnode, &msm_iommu_ops);
 
 	pr_info("device mapped at %p, irq %d with %d ctx banks\n",
 		iommu->base, iommu->irq, iommu->ncb);
diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c
index 1479c76ece9e..0596ab224ff0 100644
--- a/drivers/iommu/mtk_iommu.c
+++ b/drivers/iommu/mtk_iommu.c
@@ -655,7 +655,7 @@ static int mtk_iommu_init_fn(struct device_node *np)
 		return ret;
 	}
 
-	of_iommu_set_ops(np, &mtk_iommu_ops);
+	iommu_register_instance(&np->fwnode, &mtk_iommu_ops);
 	return 0;
 }
 
diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c
index 0f57ddc4ecc2..d7f480a52736 100644
--- a/drivers/iommu/of_iommu.c
+++ b/drivers/iommu/of_iommu.c
@@ -127,7 +127,7 @@ static const struct iommu_ops
 			   "iommu-map-mask", &iommu_spec.np, iommu_spec.args))
 		return NULL;
 
-	ops = of_iommu_get_ops(iommu_spec.np);
+	ops = iommu_get_instance(&iommu_spec.np->fwnode);
 	if (!ops || !ops->of_xlate ||
 	    iommu_fwspec_init(&pdev->dev, &iommu_spec.np->fwnode, ops) ||
 	    ops->of_xlate(&pdev->dev, &iommu_spec))
@@ -157,7 +157,7 @@ const struct iommu_ops *of_iommu_configure(struct device *dev,
 					   "#iommu-cells", idx,
 					   &iommu_spec)) {
 		np = iommu_spec.np;
-		ops = of_iommu_get_ops(np);
+		ops = iommu_get_instance(&np->fwnode);
 
 		if (!ops || !ops->of_xlate ||
 		    iommu_fwspec_init(dev, &np->fwnode, ops) ||
diff --git a/include/linux/of_iommu.h b/include/linux/of_iommu.h
index 6a7fc5051099..13394ac83c66 100644
--- a/include/linux/of_iommu.h
+++ b/include/linux/of_iommu.h
@@ -31,17 +31,6 @@ static inline const struct iommu_ops *of_iommu_configure(struct device *dev,
 
 #endif	/* CONFIG_OF_IOMMU */
 
-static inline void of_iommu_set_ops(struct device_node *np,
-				    const struct iommu_ops *ops)
-{
-	iommu_register_instance(&np->fwnode, ops);
-}
-
-static inline const struct iommu_ops *of_iommu_get_ops(struct device_node *np)
-{
-	return iommu_get_instance(&np->fwnode);
-}
-
 extern struct of_device_id __iommu_of_table;
 
 typedef int (*of_iommu_init_fn)(struct device_node *);
-- 
cgit v1.2.3


From 1141d9d08184565b71a943a50ffe712b2dfc697f Mon Sep 17 00:00:00 2001
From: Irina Tirdea <irina.tirdea@intel.com>
Date: Mon, 23 Jan 2017 12:07:42 -0600
Subject: clk: x86: Add Atom PMC platform clocks

The BayTrail and CherryTrail platforms provide platform clocks
through their Power Management Controller (PMC).

The SoC supports up to 6 clocks (PMC_PLT_CLK[0..5]) with a
frequency of either 19.2 MHz (PLL) or 25 MHz (XTAL) for BayTrail
and a frequency of 19.2 MHz (XTAL) for CherryTrail. These clocks
are available for general system use, where appropriate, and each
have Control & Frequency register fields associated with them.

Port from legacy by Pierre Bossart, integration in clock framework
by Irina Tirdea

Signed-off-by: Irina Tirdea <irina.tirdea@intel.com>
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Acked-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 drivers/clk/x86/Makefile                       |   1 +
 drivers/clk/x86/clk-pmc-atom.c                 | 371 +++++++++++++++++++++++++
 include/linux/platform_data/x86/clk-pmc-atom.h |  44 +++
 3 files changed, 416 insertions(+)
 create mode 100644 drivers/clk/x86/clk-pmc-atom.c
 create mode 100644 include/linux/platform_data/x86/clk-pmc-atom.h

(limited to 'include/linux')

diff --git a/drivers/clk/x86/Makefile b/drivers/clk/x86/Makefile
index 04781389d0fb..1367afb03858 100644
--- a/drivers/clk/x86/Makefile
+++ b/drivers/clk/x86/Makefile
@@ -1,2 +1,3 @@
 clk-x86-lpss-objs		:= clk-lpt.o
 obj-$(CONFIG_X86_INTEL_LPSS)	+= clk-x86-lpss.o
+obj-$(CONFIG_PMC_ATOM)		+= clk-pmc-atom.o
diff --git a/drivers/clk/x86/clk-pmc-atom.c b/drivers/clk/x86/clk-pmc-atom.c
new file mode 100644
index 000000000000..2b60577703ef
--- /dev/null
+++ b/drivers/clk/x86/clk-pmc-atom.c
@@ -0,0 +1,371 @@
+/*
+ * Intel Atom platform clocks driver for BayTrail and CherryTrail SoCs
+ *
+ * Copyright (C) 2016, Intel Corporation
+ * Author: Irina Tirdea <irina.tirdea@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/clk-provider.h>
+#include <linux/clkdev.h>
+#include <linux/err.h>
+#include <linux/platform_data/x86/clk-pmc-atom.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+#define PLT_CLK_NAME_BASE	"pmc_plt_clk"
+
+#define PMC_CLK_CTL_OFFSET		0x60
+#define PMC_CLK_CTL_SIZE		4
+#define PMC_CLK_NUM			6
+#define PMC_CLK_CTL_GATED_ON_D3		0x0
+#define PMC_CLK_CTL_FORCE_ON		0x1
+#define PMC_CLK_CTL_FORCE_OFF		0x2
+#define PMC_CLK_CTL_RESERVED		0x3
+#define PMC_MASK_CLK_CTL		GENMASK(1, 0)
+#define PMC_MASK_CLK_FREQ		BIT(2)
+#define PMC_CLK_FREQ_XTAL		(0 << 2)	/* 25 MHz */
+#define PMC_CLK_FREQ_PLL		(1 << 2)	/* 19.2 MHz */
+
+struct clk_plt_fixed {
+	struct clk_hw *clk;
+	struct clk_lookup *lookup;
+};
+
+struct clk_plt {
+	struct clk_hw hw;
+	void __iomem *reg;
+	struct clk_lookup *lookup;
+	/* protect access to PMC registers */
+	spinlock_t lock;
+};
+
+#define to_clk_plt(_hw) container_of(_hw, struct clk_plt, hw)
+
+struct clk_plt_data {
+	struct clk_plt_fixed **parents;
+	u8 nparents;
+	struct clk_plt *clks[PMC_CLK_NUM];
+};
+
+/* Return an index in parent table */
+static inline int plt_reg_to_parent(int reg)
+{
+	switch (reg & PMC_MASK_CLK_FREQ) {
+	default:
+	case PMC_CLK_FREQ_XTAL:
+		return 0;
+	case PMC_CLK_FREQ_PLL:
+		return 1;
+	}
+}
+
+/* Return clk index of parent */
+static inline int plt_parent_to_reg(int index)
+{
+	switch (index) {
+	default:
+	case 0:
+		return PMC_CLK_FREQ_XTAL;
+	case 1:
+		return PMC_CLK_FREQ_PLL;
+	}
+}
+
+/* Abstract status in simpler enabled/disabled value */
+static inline int plt_reg_to_enabled(int reg)
+{
+	switch (reg & PMC_MASK_CLK_CTL) {
+	case PMC_CLK_CTL_GATED_ON_D3:
+	case PMC_CLK_CTL_FORCE_ON:
+		return 1;	/* enabled */
+	case PMC_CLK_CTL_FORCE_OFF:
+	case PMC_CLK_CTL_RESERVED:
+	default:
+		return 0;	/* disabled */
+	}
+}
+
+static void plt_clk_reg_update(struct clk_plt *clk, u32 mask, u32 val)
+{
+	u32 tmp;
+	unsigned long flags;
+
+	spin_lock_irqsave(&clk->lock, flags);
+
+	tmp = readl(clk->reg);
+	tmp = (tmp & ~mask) | (val & mask);
+	writel(tmp, clk->reg);
+
+	spin_unlock_irqrestore(&clk->lock, flags);
+}
+
+static int plt_clk_set_parent(struct clk_hw *hw, u8 index)
+{
+	struct clk_plt *clk = to_clk_plt(hw);
+
+	plt_clk_reg_update(clk, PMC_MASK_CLK_FREQ, plt_parent_to_reg(index));
+
+	return 0;
+}
+
+static u8 plt_clk_get_parent(struct clk_hw *hw)
+{
+	struct clk_plt *clk = to_clk_plt(hw);
+	u32 value;
+
+	value = readl(clk->reg);
+
+	return plt_reg_to_parent(value);
+}
+
+static int plt_clk_enable(struct clk_hw *hw)
+{
+	struct clk_plt *clk = to_clk_plt(hw);
+
+	plt_clk_reg_update(clk, PMC_MASK_CLK_CTL, PMC_CLK_CTL_FORCE_ON);
+
+	return 0;
+}
+
+static void plt_clk_disable(struct clk_hw *hw)
+{
+	struct clk_plt *clk = to_clk_plt(hw);
+
+	plt_clk_reg_update(clk, PMC_MASK_CLK_CTL, PMC_CLK_CTL_FORCE_OFF);
+}
+
+static int plt_clk_is_enabled(struct clk_hw *hw)
+{
+	struct clk_plt *clk = to_clk_plt(hw);
+	u32 value;
+
+	value = readl(clk->reg);
+
+	return plt_reg_to_enabled(value);
+}
+
+static const struct clk_ops plt_clk_ops = {
+	.enable = plt_clk_enable,
+	.disable = plt_clk_disable,
+	.is_enabled = plt_clk_is_enabled,
+	.get_parent = plt_clk_get_parent,
+	.set_parent = plt_clk_set_parent,
+	.determine_rate = __clk_mux_determine_rate,
+};
+
+static struct clk_plt *plt_clk_register(struct platform_device *pdev, int id,
+					void __iomem *base,
+					const char **parent_names,
+					int num_parents)
+{
+	struct clk_plt *pclk;
+	struct clk_init_data init;
+	int ret;
+
+	pclk = devm_kzalloc(&pdev->dev, sizeof(*pclk), GFP_KERNEL);
+	if (!pclk)
+		return ERR_PTR(-ENOMEM);
+
+	init.name =  kasprintf(GFP_KERNEL, "%s_%d", PLT_CLK_NAME_BASE, id);
+	init.ops = &plt_clk_ops;
+	init.flags = 0;
+	init.parent_names = parent_names;
+	init.num_parents = num_parents;
+
+	pclk->hw.init = &init;
+	pclk->reg = base + PMC_CLK_CTL_OFFSET + id * PMC_CLK_CTL_SIZE;
+	spin_lock_init(&pclk->lock);
+
+	ret = devm_clk_hw_register(&pdev->dev, &pclk->hw);
+	if (ret) {
+		pclk = ERR_PTR(ret);
+		goto err_free_init;
+	}
+
+	pclk->lookup = clkdev_hw_create(&pclk->hw, init.name, NULL);
+	if (!pclk->lookup) {
+		pclk = ERR_PTR(-ENOMEM);
+		goto err_free_init;
+	}
+
+err_free_init:
+	kfree(init.name);
+	return pclk;
+}
+
+static void plt_clk_unregister(struct clk_plt *pclk)
+{
+	clkdev_drop(pclk->lookup);
+}
+
+static struct clk_plt_fixed *plt_clk_register_fixed_rate(struct platform_device *pdev,
+						 const char *name,
+						 const char *parent_name,
+						 unsigned long fixed_rate)
+{
+	struct clk_plt_fixed *pclk;
+
+	pclk = devm_kzalloc(&pdev->dev, sizeof(*pclk), GFP_KERNEL);
+	if (!pclk)
+		return ERR_PTR(-ENOMEM);
+
+	pclk->clk = clk_hw_register_fixed_rate(&pdev->dev, name, parent_name,
+					       0, fixed_rate);
+	if (IS_ERR(pclk->clk))
+		return ERR_CAST(pclk->clk);
+
+	pclk->lookup = clkdev_hw_create(pclk->clk, name, NULL);
+	if (!pclk->lookup) {
+		clk_hw_unregister_fixed_rate(pclk->clk);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return pclk;
+}
+
+static void plt_clk_unregister_fixed_rate(struct clk_plt_fixed *pclk)
+{
+	clkdev_drop(pclk->lookup);
+	clk_hw_unregister_fixed_rate(pclk->clk);
+}
+
+static void plt_clk_unregister_fixed_rate_loop(struct clk_plt_data *data,
+					       unsigned int i)
+{
+	while (i--)
+		plt_clk_unregister_fixed_rate(data->parents[i]);
+}
+
+static void plt_clk_free_parent_names_loop(const char **parent_names,
+					   unsigned int i)
+{
+	while (i--)
+		kfree_const(parent_names[i]);
+	kfree(parent_names);
+}
+
+static void plt_clk_unregister_loop(struct clk_plt_data *data,
+				    unsigned int i)
+{
+	while (i--)
+		plt_clk_unregister(data->clks[i]);
+}
+
+static const char **plt_clk_register_parents(struct platform_device *pdev,
+					     struct clk_plt_data *data,
+					     const struct pmc_clk *clks)
+{
+	const char **parent_names;
+	unsigned int i;
+	int err;
+	int nparents = 0;
+
+	data->nparents = 0;
+	while (clks[nparents].name)
+		nparents++;
+
+	data->parents = devm_kcalloc(&pdev->dev, nparents,
+				     sizeof(*data->parents), GFP_KERNEL);
+	if (!data->parents)
+		return ERR_PTR(-ENOMEM);
+
+	parent_names = kcalloc(nparents, sizeof(*parent_names),
+			       GFP_KERNEL);
+	if (!parent_names)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = 0; i < nparents; i++) {
+		data->parents[i] =
+			plt_clk_register_fixed_rate(pdev, clks[i].name,
+						    clks[i].parent_name,
+						    clks[i].freq);
+		if (IS_ERR(data->parents[i])) {
+			err = PTR_ERR(data->parents[i]);
+			goto err_unreg;
+		}
+		parent_names[i] = kstrdup_const(clks[i].name, GFP_KERNEL);
+	}
+
+	data->nparents = nparents;
+	return parent_names;
+
+err_unreg:
+	plt_clk_unregister_fixed_rate_loop(data, i);
+	plt_clk_free_parent_names_loop(parent_names, i);
+	return ERR_PTR(err);
+}
+
+static void plt_clk_unregister_parents(struct clk_plt_data *data)
+{
+	plt_clk_unregister_fixed_rate_loop(data, data->nparents);
+}
+
+static int plt_clk_probe(struct platform_device *pdev)
+{
+	const struct pmc_clk_data *pmc_data;
+	const char **parent_names;
+	struct clk_plt_data *data;
+	unsigned int i;
+	int err;
+
+	pmc_data = dev_get_platdata(&pdev->dev);
+	if (!pmc_data || !pmc_data->clks)
+		return -EINVAL;
+
+	data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	parent_names = plt_clk_register_parents(pdev, data, pmc_data->clks);
+	if (IS_ERR(parent_names))
+		return PTR_ERR(parent_names);
+
+	for (i = 0; i < PMC_CLK_NUM; i++) {
+		data->clks[i] = plt_clk_register(pdev, i, pmc_data->base,
+						 parent_names, data->nparents);
+		if (IS_ERR(data->clks[i])) {
+			err = PTR_ERR(data->clks[i]);
+			goto err_unreg_clk_plt;
+		}
+	}
+
+	plt_clk_free_parent_names_loop(parent_names, data->nparents);
+
+	platform_set_drvdata(pdev, data);
+	return 0;
+
+err_unreg_clk_plt:
+	plt_clk_unregister_loop(data, i);
+	plt_clk_unregister_parents(data);
+	plt_clk_free_parent_names_loop(parent_names, data->nparents);
+	return err;
+}
+
+static int plt_clk_remove(struct platform_device *pdev)
+{
+	struct clk_plt_data *data;
+
+	data = platform_get_drvdata(pdev);
+
+	plt_clk_unregister_loop(data, PMC_CLK_NUM);
+	plt_clk_unregister_parents(data);
+	return 0;
+}
+
+static struct platform_driver plt_clk_driver = {
+	.driver = {
+		.name = "clk-pmc-atom",
+	},
+	.probe = plt_clk_probe,
+	.remove = plt_clk_remove,
+};
+builtin_platform_driver(plt_clk_driver);
diff --git a/include/linux/platform_data/x86/clk-pmc-atom.h b/include/linux/platform_data/x86/clk-pmc-atom.h
new file mode 100644
index 000000000000..3ab892208343
--- /dev/null
+++ b/include/linux/platform_data/x86/clk-pmc-atom.h
@@ -0,0 +1,44 @@
+/*
+ * Intel Atom platform clocks for BayTrail and CherryTrail SoC.
+ *
+ * Copyright (C) 2016, Intel Corporation
+ * Author: Irina Tirdea <irina.tirdea@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef __PLATFORM_DATA_X86_CLK_PMC_ATOM_H
+#define __PLATFORM_DATA_X86_CLK_PMC_ATOM_H
+
+/**
+ * struct pmc_clk - PMC platform clock configuration
+ *
+ * @name:	identified, typically pmc_plt_clk_<x>, x=[0..5]
+ * @freq:	in Hz, 19.2MHz  and 25MHz (Baytrail only) supported
+ * @parent_name: one of 'xtal' or 'osc'
+ */
+struct pmc_clk {
+	const char *name;
+	unsigned long freq;
+	const char *parent_name;
+};
+
+/**
+ * struct pmc_clk_data - common PMC clock configuration
+ *
+ * @base:	PMC clock register base offset
+ * @clks:	pointer to set of registered clocks, typically 0..5
+ */
+struct pmc_clk_data {
+	void __iomem *base;
+	const struct pmc_clk *clks;
+};
+
+#endif /* __PLATFORM_DATA_X86_CLK_PMC_ATOM_H */
-- 
cgit v1.2.3


From 80a7581f38c0b2e83dc883a2125340b90b5635ec Mon Sep 17 00:00:00 2001
From: Irina Tirdea <irina.tirdea@intel.com>
Date: Mon, 23 Jan 2017 12:07:43 -0600
Subject: arch/x86/platform/atom: Move pmc_atom to drivers/platform/x86

The pmc_atom driver does not contain any architecture specific
code. It only enables the SoC Power Management Controller driver
for BayTrail and CherryTrail platforms.

Move the pmc_atom driver from arch/x86/platform/atom to
drivers/platform/x86. Also clean-up and reorder include files by
alphabetical order in pmc_atom.h

Signed-off-by: Irina Tirdea <irina.tirdea@intel.com>
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 arch/x86/Kconfig                           |   4 -
 arch/x86/include/asm/pmc_atom.h            | 158 ----------
 arch/x86/platform/atom/Makefile            |   1 -
 arch/x86/platform/atom/pmc_atom.c          | 460 -----------------------------
 drivers/acpi/acpi_lpss.c                   |   2 +-
 drivers/platform/x86/Kconfig               |   4 +
 drivers/platform/x86/Makefile              |   1 +
 drivers/platform/x86/pmc_atom.c            | 459 ++++++++++++++++++++++++++++
 include/linux/platform_data/x86/pmc_atom.h | 158 ++++++++++
 9 files changed, 623 insertions(+), 624 deletions(-)
 delete mode 100644 arch/x86/include/asm/pmc_atom.h
 delete mode 100644 arch/x86/platform/atom/pmc_atom.c
 create mode 100644 drivers/platform/x86/pmc_atom.c
 create mode 100644 include/linux/platform_data/x86/pmc_atom.h

(limited to 'include/linux')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e487493bbd47..7b4f1789f386 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2789,10 +2789,6 @@ config X86_DMA_REMAP
 	bool
 	depends on STA2X11
 
-config PMC_ATOM
-	def_bool y
-        depends on PCI
-
 source "net/Kconfig"
 
 source "drivers/Kconfig"
diff --git a/arch/x86/include/asm/pmc_atom.h b/arch/x86/include/asm/pmc_atom.h
deleted file mode 100644
index aa8744c77c6d..000000000000
--- a/arch/x86/include/asm/pmc_atom.h
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Intel Atom SOC Power Management Controller Header File
- * Copyright (c) 2014, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- */
-
-#ifndef PMC_ATOM_H
-#define PMC_ATOM_H
-
-/* ValleyView Power Control Unit PCI Device ID */
-#define	PCI_DEVICE_ID_VLV_PMC	0x0F1C
-/* CherryTrail Power Control Unit PCI Device ID */
-#define	PCI_DEVICE_ID_CHT_PMC	0x229C
-
-/* PMC Memory mapped IO registers */
-#define	PMC_BASE_ADDR_OFFSET	0x44
-#define	PMC_BASE_ADDR_MASK	0xFFFFFE00
-#define	PMC_MMIO_REG_LEN	0x100
-#define	PMC_REG_BIT_WIDTH	32
-
-/* BIOS uses FUNC_DIS to disable specific function */
-#define	PMC_FUNC_DIS		0x34
-#define	PMC_FUNC_DIS_2		0x38
-
-/* CHT specific bits in FUNC_DIS2 register */
-#define	BIT_FD_GMM		BIT(3)
-#define	BIT_FD_ISH		BIT(4)
-
-/* S0ix wake event control */
-#define	PMC_S0IX_WAKE_EN	0x3C
-
-#define	BIT_LPC_CLOCK_RUN		BIT(4)
-#define	BIT_SHARED_IRQ_GPSC		BIT(5)
-#define	BIT_ORED_DEDICATED_IRQ_GPSS	BIT(18)
-#define	BIT_ORED_DEDICATED_IRQ_GPSC	BIT(19)
-#define	BIT_SHARED_IRQ_GPSS		BIT(20)
-
-#define	PMC_WAKE_EN_SETTING	~(BIT_LPC_CLOCK_RUN | \
-				BIT_SHARED_IRQ_GPSC | \
-				BIT_ORED_DEDICATED_IRQ_GPSS | \
-				BIT_ORED_DEDICATED_IRQ_GPSC | \
-				BIT_SHARED_IRQ_GPSS)
-
-/* The timers acumulate time spent in sleep state */
-#define	PMC_S0IR_TMR		0x80
-#define	PMC_S0I1_TMR		0x84
-#define	PMC_S0I2_TMR		0x88
-#define	PMC_S0I3_TMR		0x8C
-#define	PMC_S0_TMR		0x90
-/* Sleep state counter is in units of of 32us */
-#define	PMC_TMR_SHIFT		5
-
-/* Power status of power islands */
-#define	PMC_PSS			0x98
-
-#define PMC_PSS_BIT_GBE			BIT(0)
-#define PMC_PSS_BIT_SATA		BIT(1)
-#define PMC_PSS_BIT_HDA			BIT(2)
-#define PMC_PSS_BIT_SEC			BIT(3)
-#define PMC_PSS_BIT_PCIE		BIT(4)
-#define PMC_PSS_BIT_LPSS		BIT(5)
-#define PMC_PSS_BIT_LPE			BIT(6)
-#define PMC_PSS_BIT_DFX			BIT(7)
-#define PMC_PSS_BIT_USH_CTRL		BIT(8)
-#define PMC_PSS_BIT_USH_SUS		BIT(9)
-#define PMC_PSS_BIT_USH_VCCS		BIT(10)
-#define PMC_PSS_BIT_USH_VCCA		BIT(11)
-#define PMC_PSS_BIT_OTG_CTRL		BIT(12)
-#define PMC_PSS_BIT_OTG_VCCS		BIT(13)
-#define PMC_PSS_BIT_OTG_VCCA_CLK	BIT(14)
-#define PMC_PSS_BIT_OTG_VCCA		BIT(15)
-#define PMC_PSS_BIT_USB			BIT(16)
-#define PMC_PSS_BIT_USB_SUS		BIT(17)
-
-/* CHT specific bits in PSS register */
-#define	PMC_PSS_BIT_CHT_UFS		BIT(7)
-#define	PMC_PSS_BIT_CHT_UXD		BIT(11)
-#define	PMC_PSS_BIT_CHT_UXD_FD		BIT(12)
-#define	PMC_PSS_BIT_CHT_UX_ENG		BIT(15)
-#define	PMC_PSS_BIT_CHT_USB_SUS		BIT(16)
-#define	PMC_PSS_BIT_CHT_GMM		BIT(17)
-#define	PMC_PSS_BIT_CHT_ISH		BIT(18)
-#define	PMC_PSS_BIT_CHT_DFX_MASTER	BIT(26)
-#define	PMC_PSS_BIT_CHT_DFX_CLUSTER1	BIT(27)
-#define	PMC_PSS_BIT_CHT_DFX_CLUSTER2	BIT(28)
-#define	PMC_PSS_BIT_CHT_DFX_CLUSTER3	BIT(29)
-#define	PMC_PSS_BIT_CHT_DFX_CLUSTER4	BIT(30)
-#define	PMC_PSS_BIT_CHT_DFX_CLUSTER5	BIT(31)
-
-/* These registers reflect D3 status of functions */
-#define	PMC_D3_STS_0		0xA0
-
-#define	BIT_LPSS1_F0_DMA	BIT(0)
-#define	BIT_LPSS1_F1_PWM1	BIT(1)
-#define	BIT_LPSS1_F2_PWM2	BIT(2)
-#define	BIT_LPSS1_F3_HSUART1	BIT(3)
-#define	BIT_LPSS1_F4_HSUART2	BIT(4)
-#define	BIT_LPSS1_F5_SPI	BIT(5)
-#define	BIT_LPSS1_F6_XXX	BIT(6)
-#define	BIT_LPSS1_F7_XXX	BIT(7)
-#define	BIT_SCC_EMMC		BIT(8)
-#define	BIT_SCC_SDIO		BIT(9)
-#define	BIT_SCC_SDCARD		BIT(10)
-#define	BIT_SCC_MIPI		BIT(11)
-#define	BIT_HDA			BIT(12)
-#define	BIT_LPE			BIT(13)
-#define	BIT_OTG			BIT(14)
-#define	BIT_USH			BIT(15)
-#define	BIT_GBE			BIT(16)
-#define	BIT_SATA		BIT(17)
-#define	BIT_USB_EHCI		BIT(18)
-#define	BIT_SEC			BIT(19)
-#define	BIT_PCIE_PORT0		BIT(20)
-#define	BIT_PCIE_PORT1		BIT(21)
-#define	BIT_PCIE_PORT2		BIT(22)
-#define	BIT_PCIE_PORT3		BIT(23)
-#define	BIT_LPSS2_F0_DMA	BIT(24)
-#define	BIT_LPSS2_F1_I2C1	BIT(25)
-#define	BIT_LPSS2_F2_I2C2	BIT(26)
-#define	BIT_LPSS2_F3_I2C3	BIT(27)
-#define	BIT_LPSS2_F4_I2C4	BIT(28)
-#define	BIT_LPSS2_F5_I2C5	BIT(29)
-#define	BIT_LPSS2_F6_I2C6	BIT(30)
-#define	BIT_LPSS2_F7_I2C7	BIT(31)
-
-#define	PMC_D3_STS_1		0xA4
-#define	BIT_SMB			BIT(0)
-#define	BIT_OTG_SS_PHY		BIT(1)
-#define	BIT_USH_SS_PHY		BIT(2)
-#define	BIT_DFX			BIT(3)
-
-/* CHT specific bits in PMC_D3_STS_1 register */
-#define	BIT_STS_GMM		BIT(1)
-#define	BIT_STS_ISH		BIT(2)
-
-/* PMC I/O Registers */
-#define	ACPI_BASE_ADDR_OFFSET	0x40
-#define	ACPI_BASE_ADDR_MASK	0xFFFFFE00
-#define	ACPI_MMIO_REG_LEN	0x100
-
-#define	PM1_CNT			0x4
-#define	SLEEP_TYPE_MASK		0xFFFFECFF
-#define	SLEEP_TYPE_S5		0x1C00
-#define	SLEEP_ENABLE		0x2000
-
-extern int pmc_atom_read(int offset, u32 *value);
-extern int pmc_atom_write(int offset, u32 value);
-
-#endif /* PMC_ATOM_H */
diff --git a/arch/x86/platform/atom/Makefile b/arch/x86/platform/atom/Makefile
index 40983f5b0858..57be88fa34bb 100644
--- a/arch/x86/platform/atom/Makefile
+++ b/arch/x86/platform/atom/Makefile
@@ -1,2 +1 @@
-obj-$(CONFIG_PMC_ATOM)		+= pmc_atom.o
 obj-$(CONFIG_PUNIT_ATOM_DEBUG)	+= punit_atom_debug.o
diff --git a/arch/x86/platform/atom/pmc_atom.c b/arch/x86/platform/atom/pmc_atom.c
deleted file mode 100644
index 964ff4fc61f9..000000000000
--- a/arch/x86/platform/atom/pmc_atom.c
+++ /dev/null
@@ -1,460 +0,0 @@
-/*
- * Intel Atom SOC Power Management Controller Driver
- * Copyright (c) 2014, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/device.h>
-#include <linux/debugfs.h>
-#include <linux/seq_file.h>
-#include <linux/io.h>
-
-#include <asm/pmc_atom.h>
-
-struct pmc_bit_map {
-	const char *name;
-	u32 bit_mask;
-};
-
-struct pmc_reg_map {
-	const struct pmc_bit_map *d3_sts_0;
-	const struct pmc_bit_map *d3_sts_1;
-	const struct pmc_bit_map *func_dis;
-	const struct pmc_bit_map *func_dis_2;
-	const struct pmc_bit_map *pss;
-};
-
-struct pmc_dev {
-	u32 base_addr;
-	void __iomem *regmap;
-	const struct pmc_reg_map *map;
-#ifdef CONFIG_DEBUG_FS
-	struct dentry *dbgfs_dir;
-#endif /* CONFIG_DEBUG_FS */
-	bool init;
-};
-
-static struct pmc_dev pmc_device;
-static u32 acpi_base_addr;
-
-static const struct pmc_bit_map d3_sts_0_map[] = {
-	{"LPSS1_F0_DMA",	BIT_LPSS1_F0_DMA},
-	{"LPSS1_F1_PWM1",	BIT_LPSS1_F1_PWM1},
-	{"LPSS1_F2_PWM2",	BIT_LPSS1_F2_PWM2},
-	{"LPSS1_F3_HSUART1",	BIT_LPSS1_F3_HSUART1},
-	{"LPSS1_F4_HSUART2",	BIT_LPSS1_F4_HSUART2},
-	{"LPSS1_F5_SPI",	BIT_LPSS1_F5_SPI},
-	{"LPSS1_F6_Reserved",	BIT_LPSS1_F6_XXX},
-	{"LPSS1_F7_Reserved",	BIT_LPSS1_F7_XXX},
-	{"SCC_EMMC",		BIT_SCC_EMMC},
-	{"SCC_SDIO",		BIT_SCC_SDIO},
-	{"SCC_SDCARD",		BIT_SCC_SDCARD},
-	{"SCC_MIPI",		BIT_SCC_MIPI},
-	{"HDA",			BIT_HDA},
-	{"LPE",			BIT_LPE},
-	{"OTG",			BIT_OTG},
-	{"USH",			BIT_USH},
-	{"GBE",			BIT_GBE},
-	{"SATA",		BIT_SATA},
-	{"USB_EHCI",		BIT_USB_EHCI},
-	{"SEC",			BIT_SEC},
-	{"PCIE_PORT0",		BIT_PCIE_PORT0},
-	{"PCIE_PORT1",		BIT_PCIE_PORT1},
-	{"PCIE_PORT2",		BIT_PCIE_PORT2},
-	{"PCIE_PORT3",		BIT_PCIE_PORT3},
-	{"LPSS2_F0_DMA",	BIT_LPSS2_F0_DMA},
-	{"LPSS2_F1_I2C1",	BIT_LPSS2_F1_I2C1},
-	{"LPSS2_F2_I2C2",	BIT_LPSS2_F2_I2C2},
-	{"LPSS2_F3_I2C3",	BIT_LPSS2_F3_I2C3},
-	{"LPSS2_F3_I2C4",	BIT_LPSS2_F4_I2C4},
-	{"LPSS2_F5_I2C5",	BIT_LPSS2_F5_I2C5},
-	{"LPSS2_F6_I2C6",	BIT_LPSS2_F6_I2C6},
-	{"LPSS2_F7_I2C7",	BIT_LPSS2_F7_I2C7},
-	{},
-};
-
-static struct pmc_bit_map byt_d3_sts_1_map[] = {
-	{"SMB",			BIT_SMB},
-	{"OTG_SS_PHY",		BIT_OTG_SS_PHY},
-	{"USH_SS_PHY",		BIT_USH_SS_PHY},
-	{"DFX",			BIT_DFX},
-	{},
-};
-
-static struct pmc_bit_map cht_d3_sts_1_map[] = {
-	{"SMB",			BIT_SMB},
-	{"GMM",			BIT_STS_GMM},
-	{"ISH",			BIT_STS_ISH},
-	{},
-};
-
-static struct pmc_bit_map cht_func_dis_2_map[] = {
-	{"SMB",			BIT_SMB},
-	{"GMM",			BIT_FD_GMM},
-	{"ISH",			BIT_FD_ISH},
-	{},
-};
-
-static const struct pmc_bit_map byt_pss_map[] = {
-	{"GBE",			PMC_PSS_BIT_GBE},
-	{"SATA",		PMC_PSS_BIT_SATA},
-	{"HDA",			PMC_PSS_BIT_HDA},
-	{"SEC",			PMC_PSS_BIT_SEC},
-	{"PCIE",		PMC_PSS_BIT_PCIE},
-	{"LPSS",		PMC_PSS_BIT_LPSS},
-	{"LPE",			PMC_PSS_BIT_LPE},
-	{"DFX",			PMC_PSS_BIT_DFX},
-	{"USH_CTRL",		PMC_PSS_BIT_USH_CTRL},
-	{"USH_SUS",		PMC_PSS_BIT_USH_SUS},
-	{"USH_VCCS",		PMC_PSS_BIT_USH_VCCS},
-	{"USH_VCCA",		PMC_PSS_BIT_USH_VCCA},
-	{"OTG_CTRL",		PMC_PSS_BIT_OTG_CTRL},
-	{"OTG_VCCS",		PMC_PSS_BIT_OTG_VCCS},
-	{"OTG_VCCA_CLK",	PMC_PSS_BIT_OTG_VCCA_CLK},
-	{"OTG_VCCA",		PMC_PSS_BIT_OTG_VCCA},
-	{"USB",			PMC_PSS_BIT_USB},
-	{"USB_SUS",		PMC_PSS_BIT_USB_SUS},
-	{},
-};
-
-static const struct pmc_bit_map cht_pss_map[] = {
-	{"SATA",		PMC_PSS_BIT_SATA},
-	{"HDA",			PMC_PSS_BIT_HDA},
-	{"SEC",			PMC_PSS_BIT_SEC},
-	{"PCIE",		PMC_PSS_BIT_PCIE},
-	{"LPSS",		PMC_PSS_BIT_LPSS},
-	{"LPE",			PMC_PSS_BIT_LPE},
-	{"UFS",			PMC_PSS_BIT_CHT_UFS},
-	{"UXD",			PMC_PSS_BIT_CHT_UXD},
-	{"UXD_FD",		PMC_PSS_BIT_CHT_UXD_FD},
-	{"UX_ENG",		PMC_PSS_BIT_CHT_UX_ENG},
-	{"USB_SUS",		PMC_PSS_BIT_CHT_USB_SUS},
-	{"GMM",			PMC_PSS_BIT_CHT_GMM},
-	{"ISH",			PMC_PSS_BIT_CHT_ISH},
-	{"DFX_MASTER",		PMC_PSS_BIT_CHT_DFX_MASTER},
-	{"DFX_CLUSTER1",	PMC_PSS_BIT_CHT_DFX_CLUSTER1},
-	{"DFX_CLUSTER2",	PMC_PSS_BIT_CHT_DFX_CLUSTER2},
-	{"DFX_CLUSTER3",	PMC_PSS_BIT_CHT_DFX_CLUSTER3},
-	{"DFX_CLUSTER4",	PMC_PSS_BIT_CHT_DFX_CLUSTER4},
-	{"DFX_CLUSTER5",	PMC_PSS_BIT_CHT_DFX_CLUSTER5},
-	{},
-};
-
-static const struct pmc_reg_map byt_reg_map = {
-	.d3_sts_0	= d3_sts_0_map,
-	.d3_sts_1	= byt_d3_sts_1_map,
-	.func_dis	= d3_sts_0_map,
-	.func_dis_2	= byt_d3_sts_1_map,
-	.pss		= byt_pss_map,
-};
-
-static const struct pmc_reg_map cht_reg_map = {
-	.d3_sts_0	= d3_sts_0_map,
-	.d3_sts_1	= cht_d3_sts_1_map,
-	.func_dis	= d3_sts_0_map,
-	.func_dis_2	= cht_func_dis_2_map,
-	.pss		= cht_pss_map,
-};
-
-static inline u32 pmc_reg_read(struct pmc_dev *pmc, int reg_offset)
-{
-	return readl(pmc->regmap + reg_offset);
-}
-
-static inline void pmc_reg_write(struct pmc_dev *pmc, int reg_offset, u32 val)
-{
-	writel(val, pmc->regmap + reg_offset);
-}
-
-int pmc_atom_read(int offset, u32 *value)
-{
-	struct pmc_dev *pmc = &pmc_device;
-
-	if (!pmc->init)
-		return -ENODEV;
-
-	*value = pmc_reg_read(pmc, offset);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(pmc_atom_read);
-
-int pmc_atom_write(int offset, u32 value)
-{
-	struct pmc_dev *pmc = &pmc_device;
-
-	if (!pmc->init)
-		return -ENODEV;
-
-	pmc_reg_write(pmc, offset, value);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(pmc_atom_write);
-
-static void pmc_power_off(void)
-{
-	u16	pm1_cnt_port;
-	u32	pm1_cnt_value;
-
-	pr_info("Preparing to enter system sleep state S5\n");
-
-	pm1_cnt_port = acpi_base_addr + PM1_CNT;
-
-	pm1_cnt_value = inl(pm1_cnt_port);
-	pm1_cnt_value &= SLEEP_TYPE_MASK;
-	pm1_cnt_value |= SLEEP_TYPE_S5;
-	pm1_cnt_value |= SLEEP_ENABLE;
-
-	outl(pm1_cnt_value, pm1_cnt_port);
-}
-
-static void pmc_hw_reg_setup(struct pmc_dev *pmc)
-{
-	/*
-	 * Disable PMC S0IX_WAKE_EN events coming from:
-	 * - LPC clock run
-	 * - GPIO_SUS ored dedicated IRQs
-	 * - GPIO_SCORE ored dedicated IRQs
-	 * - GPIO_SUS shared IRQ
-	 * - GPIO_SCORE shared IRQ
-	 */
-	pmc_reg_write(pmc, PMC_S0IX_WAKE_EN, (u32)PMC_WAKE_EN_SETTING);
-}
-
-#ifdef CONFIG_DEBUG_FS
-static void pmc_dev_state_print(struct seq_file *s, int reg_index,
-				u32 sts, const struct pmc_bit_map *sts_map,
-				u32 fd, const struct pmc_bit_map *fd_map)
-{
-	int offset = PMC_REG_BIT_WIDTH * reg_index;
-	int index;
-
-	for (index = 0; sts_map[index].name; index++) {
-		seq_printf(s, "Dev: %-2d - %-32s\tState: %s [%s]\n",
-			offset + index, sts_map[index].name,
-			fd_map[index].bit_mask & fd ?  "Disabled" : "Enabled ",
-			sts_map[index].bit_mask & sts ?  "D3" : "D0");
-	}
-}
-
-static int pmc_dev_state_show(struct seq_file *s, void *unused)
-{
-	struct pmc_dev *pmc = s->private;
-	const struct pmc_reg_map *m = pmc->map;
-	u32 func_dis, func_dis_2;
-	u32 d3_sts_0, d3_sts_1;
-
-	func_dis = pmc_reg_read(pmc, PMC_FUNC_DIS);
-	func_dis_2 = pmc_reg_read(pmc, PMC_FUNC_DIS_2);
-	d3_sts_0 = pmc_reg_read(pmc, PMC_D3_STS_0);
-	d3_sts_1 = pmc_reg_read(pmc, PMC_D3_STS_1);
-
-	/* Low part */
-	pmc_dev_state_print(s, 0, d3_sts_0, m->d3_sts_0, func_dis, m->func_dis);
-
-	/* High part */
-	pmc_dev_state_print(s, 1, d3_sts_1, m->d3_sts_1, func_dis_2, m->func_dis_2);
-
-	return 0;
-}
-
-static int pmc_dev_state_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, pmc_dev_state_show, inode->i_private);
-}
-
-static const struct file_operations pmc_dev_state_ops = {
-	.open		= pmc_dev_state_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static int pmc_pss_state_show(struct seq_file *s, void *unused)
-{
-	struct pmc_dev *pmc = s->private;
-	const struct pmc_bit_map *map = pmc->map->pss;
-	u32 pss = pmc_reg_read(pmc, PMC_PSS);
-	int index;
-
-	for (index = 0; map[index].name; index++) {
-		seq_printf(s, "Island: %-2d - %-32s\tState: %s\n",
-			index, map[index].name,
-			map[index].bit_mask & pss ? "Off" : "On");
-	}
-	return 0;
-}
-
-static int pmc_pss_state_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, pmc_pss_state_show, inode->i_private);
-}
-
-static const struct file_operations pmc_pss_state_ops = {
-	.open		= pmc_pss_state_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static int pmc_sleep_tmr_show(struct seq_file *s, void *unused)
-{
-	struct pmc_dev *pmc = s->private;
-	u64 s0ir_tmr, s0i1_tmr, s0i2_tmr, s0i3_tmr, s0_tmr;
-
-	s0ir_tmr = (u64)pmc_reg_read(pmc, PMC_S0IR_TMR) << PMC_TMR_SHIFT;
-	s0i1_tmr = (u64)pmc_reg_read(pmc, PMC_S0I1_TMR) << PMC_TMR_SHIFT;
-	s0i2_tmr = (u64)pmc_reg_read(pmc, PMC_S0I2_TMR) << PMC_TMR_SHIFT;
-	s0i3_tmr = (u64)pmc_reg_read(pmc, PMC_S0I3_TMR) << PMC_TMR_SHIFT;
-	s0_tmr = (u64)pmc_reg_read(pmc, PMC_S0_TMR) << PMC_TMR_SHIFT;
-
-	seq_printf(s, "S0IR Residency:\t%lldus\n", s0ir_tmr);
-	seq_printf(s, "S0I1 Residency:\t%lldus\n", s0i1_tmr);
-	seq_printf(s, "S0I2 Residency:\t%lldus\n", s0i2_tmr);
-	seq_printf(s, "S0I3 Residency:\t%lldus\n", s0i3_tmr);
-	seq_printf(s, "S0   Residency:\t%lldus\n", s0_tmr);
-	return 0;
-}
-
-static int pmc_sleep_tmr_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, pmc_sleep_tmr_show, inode->i_private);
-}
-
-static const struct file_operations pmc_sleep_tmr_ops = {
-	.open		= pmc_sleep_tmr_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static void pmc_dbgfs_unregister(struct pmc_dev *pmc)
-{
-	debugfs_remove_recursive(pmc->dbgfs_dir);
-}
-
-static int pmc_dbgfs_register(struct pmc_dev *pmc)
-{
-	struct dentry *dir, *f;
-
-	dir = debugfs_create_dir("pmc_atom", NULL);
-	if (!dir)
-		return -ENOMEM;
-
-	pmc->dbgfs_dir = dir;
-
-	f = debugfs_create_file("dev_state", S_IFREG | S_IRUGO,
-				dir, pmc, &pmc_dev_state_ops);
-	if (!f)
-		goto err;
-
-	f = debugfs_create_file("pss_state", S_IFREG | S_IRUGO,
-				dir, pmc, &pmc_pss_state_ops);
-	if (!f)
-		goto err;
-
-	f = debugfs_create_file("sleep_state", S_IFREG | S_IRUGO,
-				dir, pmc, &pmc_sleep_tmr_ops);
-	if (!f)
-		goto err;
-
-	return 0;
-err:
-	pmc_dbgfs_unregister(pmc);
-	return -ENODEV;
-}
-#else
-static int pmc_dbgfs_register(struct pmc_dev *pmc)
-{
-	return 0;
-}
-#endif /* CONFIG_DEBUG_FS */
-
-static int pmc_setup_dev(struct pci_dev *pdev, const struct pci_device_id *ent)
-{
-	struct pmc_dev *pmc = &pmc_device;
-	const struct pmc_reg_map *map = (struct pmc_reg_map *)ent->driver_data;
-	int ret;
-
-	/* Obtain ACPI base address */
-	pci_read_config_dword(pdev, ACPI_BASE_ADDR_OFFSET, &acpi_base_addr);
-	acpi_base_addr &= ACPI_BASE_ADDR_MASK;
-
-	/* Install power off function */
-	if (acpi_base_addr != 0 && pm_power_off == NULL)
-		pm_power_off = pmc_power_off;
-
-	pci_read_config_dword(pdev, PMC_BASE_ADDR_OFFSET, &pmc->base_addr);
-	pmc->base_addr &= PMC_BASE_ADDR_MASK;
-
-	pmc->regmap = ioremap_nocache(pmc->base_addr, PMC_MMIO_REG_LEN);
-	if (!pmc->regmap) {
-		dev_err(&pdev->dev, "error: ioremap failed\n");
-		return -ENOMEM;
-	}
-
-	pmc->map = map;
-
-	/* PMC hardware registers setup */
-	pmc_hw_reg_setup(pmc);
-
-	ret = pmc_dbgfs_register(pmc);
-	if (ret)
-		dev_warn(&pdev->dev, "debugfs register failed\n");
-
-	pmc->init = true;
-	return ret;
-}
-
-/*
- * Data for PCI driver interface
- *
- * used by pci_match_id() call below.
- */
-static const struct pci_device_id pmc_pci_ids[] = {
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_VLV_PMC), (kernel_ulong_t)&byt_reg_map },
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_CHT_PMC), (kernel_ulong_t)&cht_reg_map },
-	{ 0, },
-};
-
-static int __init pmc_atom_init(void)
-{
-	struct pci_dev *pdev = NULL;
-	const struct pci_device_id *ent;
-
-	/* We look for our device - PCU PMC
-	 * we assume that there is max. one device.
-	 *
-	 * We can't use plain pci_driver mechanism,
-	 * as the device is really a multiple function device,
-	 * main driver that binds to the pci_device is lpc_ich
-	 * and have to find & bind to the device this way.
-	 */
-	for_each_pci_dev(pdev) {
-		ent = pci_match_id(pmc_pci_ids, pdev);
-		if (ent)
-			return pmc_setup_dev(pdev, ent);
-	}
-	/* Device not found. */
-	return -ENODEV;
-}
-
-device_initcall(pmc_atom_init);
-
-/*
-MODULE_AUTHOR("Aubrey Li <aubrey.li@linux.intel.com>");
-MODULE_DESCRIPTION("Intel Atom SOC Power Management Controller Interface");
-MODULE_LICENSE("GPL v2");
-*/
diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c
index 8ea836c046f8..90d112a3063a 100644
--- a/drivers/acpi/acpi_lpss.c
+++ b/drivers/acpi/acpi_lpss.c
@@ -18,6 +18,7 @@
 #include <linux/mutex.h>
 #include <linux/platform_device.h>
 #include <linux/platform_data/clk-lpss.h>
+#include <linux/platform_data/x86/pmc_atom.h>
 #include <linux/pm_domain.h>
 #include <linux/pm_runtime.h>
 #include <linux/delay.h>
@@ -31,7 +32,6 @@ ACPI_MODULE_NAME("acpi_lpss");
 #include <asm/cpu_device_id.h>
 #include <asm/intel-family.h>
 #include <asm/iosf_mbi.h>
-#include <asm/pmc_atom.h>
 
 #define LPSS_ADDR(desc) ((unsigned long)&desc)
 
diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 5fe8be089b8b..659b33fc2e18 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -1077,3 +1077,7 @@ config MLX_CPLD_PLATFORM
 	  cables and fans on the wide range Mellanox IB and Ethernet systems.
 
 endif # X86_PLATFORM_DEVICES
+
+config PMC_ATOM
+       def_bool y
+       depends on PCI
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index d4111f0f8a78..49ee7ef283bb 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -74,5 +74,6 @@ obj-$(CONFIG_INTEL_TELEMETRY)	+= intel_telemetry_core.o \
 				   intel_telemetry_pltdrv.o \
 				   intel_telemetry_debugfs.o
 obj-$(CONFIG_INTEL_PMC_CORE)    += intel_pmc_core.o
+obj-$(CONFIG_PMC_ATOM)		+= pmc_atom.o
 obj-$(CONFIG_MLX_PLATFORM)	+= mlx-platform.o
 obj-$(CONFIG_MLX_CPLD_PLATFORM)	+= mlxcpld-hotplug.o
diff --git a/drivers/platform/x86/pmc_atom.c b/drivers/platform/x86/pmc_atom.c
new file mode 100644
index 000000000000..e1dfb1b3632f
--- /dev/null
+++ b/drivers/platform/x86/pmc_atom.c
@@ -0,0 +1,459 @@
+/*
+ * Intel Atom SOC Power Management Controller Driver
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/platform_data/x86/pmc_atom.h>
+#include <linux/pci.h>
+#include <linux/seq_file.h>
+
+struct pmc_bit_map {
+	const char *name;
+	u32 bit_mask;
+};
+
+struct pmc_reg_map {
+	const struct pmc_bit_map *d3_sts_0;
+	const struct pmc_bit_map *d3_sts_1;
+	const struct pmc_bit_map *func_dis;
+	const struct pmc_bit_map *func_dis_2;
+	const struct pmc_bit_map *pss;
+};
+
+struct pmc_dev {
+	u32 base_addr;
+	void __iomem *regmap;
+	const struct pmc_reg_map *map;
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *dbgfs_dir;
+#endif /* CONFIG_DEBUG_FS */
+	bool init;
+};
+
+static struct pmc_dev pmc_device;
+static u32 acpi_base_addr;
+
+static const struct pmc_bit_map d3_sts_0_map[] = {
+	{"LPSS1_F0_DMA",	BIT_LPSS1_F0_DMA},
+	{"LPSS1_F1_PWM1",	BIT_LPSS1_F1_PWM1},
+	{"LPSS1_F2_PWM2",	BIT_LPSS1_F2_PWM2},
+	{"LPSS1_F3_HSUART1",	BIT_LPSS1_F3_HSUART1},
+	{"LPSS1_F4_HSUART2",	BIT_LPSS1_F4_HSUART2},
+	{"LPSS1_F5_SPI",	BIT_LPSS1_F5_SPI},
+	{"LPSS1_F6_Reserved",	BIT_LPSS1_F6_XXX},
+	{"LPSS1_F7_Reserved",	BIT_LPSS1_F7_XXX},
+	{"SCC_EMMC",		BIT_SCC_EMMC},
+	{"SCC_SDIO",		BIT_SCC_SDIO},
+	{"SCC_SDCARD",		BIT_SCC_SDCARD},
+	{"SCC_MIPI",		BIT_SCC_MIPI},
+	{"HDA",			BIT_HDA},
+	{"LPE",			BIT_LPE},
+	{"OTG",			BIT_OTG},
+	{"USH",			BIT_USH},
+	{"GBE",			BIT_GBE},
+	{"SATA",		BIT_SATA},
+	{"USB_EHCI",		BIT_USB_EHCI},
+	{"SEC",			BIT_SEC},
+	{"PCIE_PORT0",		BIT_PCIE_PORT0},
+	{"PCIE_PORT1",		BIT_PCIE_PORT1},
+	{"PCIE_PORT2",		BIT_PCIE_PORT2},
+	{"PCIE_PORT3",		BIT_PCIE_PORT3},
+	{"LPSS2_F0_DMA",	BIT_LPSS2_F0_DMA},
+	{"LPSS2_F1_I2C1",	BIT_LPSS2_F1_I2C1},
+	{"LPSS2_F2_I2C2",	BIT_LPSS2_F2_I2C2},
+	{"LPSS2_F3_I2C3",	BIT_LPSS2_F3_I2C3},
+	{"LPSS2_F3_I2C4",	BIT_LPSS2_F4_I2C4},
+	{"LPSS2_F5_I2C5",	BIT_LPSS2_F5_I2C5},
+	{"LPSS2_F6_I2C6",	BIT_LPSS2_F6_I2C6},
+	{"LPSS2_F7_I2C7",	BIT_LPSS2_F7_I2C7},
+	{},
+};
+
+static struct pmc_bit_map byt_d3_sts_1_map[] = {
+	{"SMB",			BIT_SMB},
+	{"OTG_SS_PHY",		BIT_OTG_SS_PHY},
+	{"USH_SS_PHY",		BIT_USH_SS_PHY},
+	{"DFX",			BIT_DFX},
+	{},
+};
+
+static struct pmc_bit_map cht_d3_sts_1_map[] = {
+	{"SMB",			BIT_SMB},
+	{"GMM",			BIT_STS_GMM},
+	{"ISH",			BIT_STS_ISH},
+	{},
+};
+
+static struct pmc_bit_map cht_func_dis_2_map[] = {
+	{"SMB",			BIT_SMB},
+	{"GMM",			BIT_FD_GMM},
+	{"ISH",			BIT_FD_ISH},
+	{},
+};
+
+static const struct pmc_bit_map byt_pss_map[] = {
+	{"GBE",			PMC_PSS_BIT_GBE},
+	{"SATA",		PMC_PSS_BIT_SATA},
+	{"HDA",			PMC_PSS_BIT_HDA},
+	{"SEC",			PMC_PSS_BIT_SEC},
+	{"PCIE",		PMC_PSS_BIT_PCIE},
+	{"LPSS",		PMC_PSS_BIT_LPSS},
+	{"LPE",			PMC_PSS_BIT_LPE},
+	{"DFX",			PMC_PSS_BIT_DFX},
+	{"USH_CTRL",		PMC_PSS_BIT_USH_CTRL},
+	{"USH_SUS",		PMC_PSS_BIT_USH_SUS},
+	{"USH_VCCS",		PMC_PSS_BIT_USH_VCCS},
+	{"USH_VCCA",		PMC_PSS_BIT_USH_VCCA},
+	{"OTG_CTRL",		PMC_PSS_BIT_OTG_CTRL},
+	{"OTG_VCCS",		PMC_PSS_BIT_OTG_VCCS},
+	{"OTG_VCCA_CLK",	PMC_PSS_BIT_OTG_VCCA_CLK},
+	{"OTG_VCCA",		PMC_PSS_BIT_OTG_VCCA},
+	{"USB",			PMC_PSS_BIT_USB},
+	{"USB_SUS",		PMC_PSS_BIT_USB_SUS},
+	{},
+};
+
+static const struct pmc_bit_map cht_pss_map[] = {
+	{"SATA",		PMC_PSS_BIT_SATA},
+	{"HDA",			PMC_PSS_BIT_HDA},
+	{"SEC",			PMC_PSS_BIT_SEC},
+	{"PCIE",		PMC_PSS_BIT_PCIE},
+	{"LPSS",		PMC_PSS_BIT_LPSS},
+	{"LPE",			PMC_PSS_BIT_LPE},
+	{"UFS",			PMC_PSS_BIT_CHT_UFS},
+	{"UXD",			PMC_PSS_BIT_CHT_UXD},
+	{"UXD_FD",		PMC_PSS_BIT_CHT_UXD_FD},
+	{"UX_ENG",		PMC_PSS_BIT_CHT_UX_ENG},
+	{"USB_SUS",		PMC_PSS_BIT_CHT_USB_SUS},
+	{"GMM",			PMC_PSS_BIT_CHT_GMM},
+	{"ISH",			PMC_PSS_BIT_CHT_ISH},
+	{"DFX_MASTER",		PMC_PSS_BIT_CHT_DFX_MASTER},
+	{"DFX_CLUSTER1",	PMC_PSS_BIT_CHT_DFX_CLUSTER1},
+	{"DFX_CLUSTER2",	PMC_PSS_BIT_CHT_DFX_CLUSTER2},
+	{"DFX_CLUSTER3",	PMC_PSS_BIT_CHT_DFX_CLUSTER3},
+	{"DFX_CLUSTER4",	PMC_PSS_BIT_CHT_DFX_CLUSTER4},
+	{"DFX_CLUSTER5",	PMC_PSS_BIT_CHT_DFX_CLUSTER5},
+	{},
+};
+
+static const struct pmc_reg_map byt_reg_map = {
+	.d3_sts_0	= d3_sts_0_map,
+	.d3_sts_1	= byt_d3_sts_1_map,
+	.func_dis	= d3_sts_0_map,
+	.func_dis_2	= byt_d3_sts_1_map,
+	.pss		= byt_pss_map,
+};
+
+static const struct pmc_reg_map cht_reg_map = {
+	.d3_sts_0	= d3_sts_0_map,
+	.d3_sts_1	= cht_d3_sts_1_map,
+	.func_dis	= d3_sts_0_map,
+	.func_dis_2	= cht_func_dis_2_map,
+	.pss		= cht_pss_map,
+};
+
+static inline u32 pmc_reg_read(struct pmc_dev *pmc, int reg_offset)
+{
+	return readl(pmc->regmap + reg_offset);
+}
+
+static inline void pmc_reg_write(struct pmc_dev *pmc, int reg_offset, u32 val)
+{
+	writel(val, pmc->regmap + reg_offset);
+}
+
+int pmc_atom_read(int offset, u32 *value)
+{
+	struct pmc_dev *pmc = &pmc_device;
+
+	if (!pmc->init)
+		return -ENODEV;
+
+	*value = pmc_reg_read(pmc, offset);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pmc_atom_read);
+
+int pmc_atom_write(int offset, u32 value)
+{
+	struct pmc_dev *pmc = &pmc_device;
+
+	if (!pmc->init)
+		return -ENODEV;
+
+	pmc_reg_write(pmc, offset, value);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pmc_atom_write);
+
+static void pmc_power_off(void)
+{
+	u16	pm1_cnt_port;
+	u32	pm1_cnt_value;
+
+	pr_info("Preparing to enter system sleep state S5\n");
+
+	pm1_cnt_port = acpi_base_addr + PM1_CNT;
+
+	pm1_cnt_value = inl(pm1_cnt_port);
+	pm1_cnt_value &= SLEEP_TYPE_MASK;
+	pm1_cnt_value |= SLEEP_TYPE_S5;
+	pm1_cnt_value |= SLEEP_ENABLE;
+
+	outl(pm1_cnt_value, pm1_cnt_port);
+}
+
+static void pmc_hw_reg_setup(struct pmc_dev *pmc)
+{
+	/*
+	 * Disable PMC S0IX_WAKE_EN events coming from:
+	 * - LPC clock run
+	 * - GPIO_SUS ored dedicated IRQs
+	 * - GPIO_SCORE ored dedicated IRQs
+	 * - GPIO_SUS shared IRQ
+	 * - GPIO_SCORE shared IRQ
+	 */
+	pmc_reg_write(pmc, PMC_S0IX_WAKE_EN, (u32)PMC_WAKE_EN_SETTING);
+}
+
+#ifdef CONFIG_DEBUG_FS
+static void pmc_dev_state_print(struct seq_file *s, int reg_index,
+				u32 sts, const struct pmc_bit_map *sts_map,
+				u32 fd, const struct pmc_bit_map *fd_map)
+{
+	int offset = PMC_REG_BIT_WIDTH * reg_index;
+	int index;
+
+	for (index = 0; sts_map[index].name; index++) {
+		seq_printf(s, "Dev: %-2d - %-32s\tState: %s [%s]\n",
+			offset + index, sts_map[index].name,
+			fd_map[index].bit_mask & fd ?  "Disabled" : "Enabled ",
+			sts_map[index].bit_mask & sts ?  "D3" : "D0");
+	}
+}
+
+static int pmc_dev_state_show(struct seq_file *s, void *unused)
+{
+	struct pmc_dev *pmc = s->private;
+	const struct pmc_reg_map *m = pmc->map;
+	u32 func_dis, func_dis_2;
+	u32 d3_sts_0, d3_sts_1;
+
+	func_dis = pmc_reg_read(pmc, PMC_FUNC_DIS);
+	func_dis_2 = pmc_reg_read(pmc, PMC_FUNC_DIS_2);
+	d3_sts_0 = pmc_reg_read(pmc, PMC_D3_STS_0);
+	d3_sts_1 = pmc_reg_read(pmc, PMC_D3_STS_1);
+
+	/* Low part */
+	pmc_dev_state_print(s, 0, d3_sts_0, m->d3_sts_0, func_dis, m->func_dis);
+
+	/* High part */
+	pmc_dev_state_print(s, 1, d3_sts_1, m->d3_sts_1, func_dis_2, m->func_dis_2);
+
+	return 0;
+}
+
+static int pmc_dev_state_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, pmc_dev_state_show, inode->i_private);
+}
+
+static const struct file_operations pmc_dev_state_ops = {
+	.open		= pmc_dev_state_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int pmc_pss_state_show(struct seq_file *s, void *unused)
+{
+	struct pmc_dev *pmc = s->private;
+	const struct pmc_bit_map *map = pmc->map->pss;
+	u32 pss = pmc_reg_read(pmc, PMC_PSS);
+	int index;
+
+	for (index = 0; map[index].name; index++) {
+		seq_printf(s, "Island: %-2d - %-32s\tState: %s\n",
+			index, map[index].name,
+			map[index].bit_mask & pss ? "Off" : "On");
+	}
+	return 0;
+}
+
+static int pmc_pss_state_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, pmc_pss_state_show, inode->i_private);
+}
+
+static const struct file_operations pmc_pss_state_ops = {
+	.open		= pmc_pss_state_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int pmc_sleep_tmr_show(struct seq_file *s, void *unused)
+{
+	struct pmc_dev *pmc = s->private;
+	u64 s0ir_tmr, s0i1_tmr, s0i2_tmr, s0i3_tmr, s0_tmr;
+
+	s0ir_tmr = (u64)pmc_reg_read(pmc, PMC_S0IR_TMR) << PMC_TMR_SHIFT;
+	s0i1_tmr = (u64)pmc_reg_read(pmc, PMC_S0I1_TMR) << PMC_TMR_SHIFT;
+	s0i2_tmr = (u64)pmc_reg_read(pmc, PMC_S0I2_TMR) << PMC_TMR_SHIFT;
+	s0i3_tmr = (u64)pmc_reg_read(pmc, PMC_S0I3_TMR) << PMC_TMR_SHIFT;
+	s0_tmr = (u64)pmc_reg_read(pmc, PMC_S0_TMR) << PMC_TMR_SHIFT;
+
+	seq_printf(s, "S0IR Residency:\t%lldus\n", s0ir_tmr);
+	seq_printf(s, "S0I1 Residency:\t%lldus\n", s0i1_tmr);
+	seq_printf(s, "S0I2 Residency:\t%lldus\n", s0i2_tmr);
+	seq_printf(s, "S0I3 Residency:\t%lldus\n", s0i3_tmr);
+	seq_printf(s, "S0   Residency:\t%lldus\n", s0_tmr);
+	return 0;
+}
+
+static int pmc_sleep_tmr_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, pmc_sleep_tmr_show, inode->i_private);
+}
+
+static const struct file_operations pmc_sleep_tmr_ops = {
+	.open		= pmc_sleep_tmr_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static void pmc_dbgfs_unregister(struct pmc_dev *pmc)
+{
+	debugfs_remove_recursive(pmc->dbgfs_dir);
+}
+
+static int pmc_dbgfs_register(struct pmc_dev *pmc)
+{
+	struct dentry *dir, *f;
+
+	dir = debugfs_create_dir("pmc_atom", NULL);
+	if (!dir)
+		return -ENOMEM;
+
+	pmc->dbgfs_dir = dir;
+
+	f = debugfs_create_file("dev_state", S_IFREG | S_IRUGO,
+				dir, pmc, &pmc_dev_state_ops);
+	if (!f)
+		goto err;
+
+	f = debugfs_create_file("pss_state", S_IFREG | S_IRUGO,
+				dir, pmc, &pmc_pss_state_ops);
+	if (!f)
+		goto err;
+
+	f = debugfs_create_file("sleep_state", S_IFREG | S_IRUGO,
+				dir, pmc, &pmc_sleep_tmr_ops);
+	if (!f)
+		goto err;
+
+	return 0;
+err:
+	pmc_dbgfs_unregister(pmc);
+	return -ENODEV;
+}
+#else
+static int pmc_dbgfs_register(struct pmc_dev *pmc)
+{
+	return 0;
+}
+#endif /* CONFIG_DEBUG_FS */
+
+static int pmc_setup_dev(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+	struct pmc_dev *pmc = &pmc_device;
+	const struct pmc_reg_map *map = (struct pmc_reg_map *)ent->driver_data;
+	int ret;
+
+	/* Obtain ACPI base address */
+	pci_read_config_dword(pdev, ACPI_BASE_ADDR_OFFSET, &acpi_base_addr);
+	acpi_base_addr &= ACPI_BASE_ADDR_MASK;
+
+	/* Install power off function */
+	if (acpi_base_addr != 0 && pm_power_off == NULL)
+		pm_power_off = pmc_power_off;
+
+	pci_read_config_dword(pdev, PMC_BASE_ADDR_OFFSET, &pmc->base_addr);
+	pmc->base_addr &= PMC_BASE_ADDR_MASK;
+
+	pmc->regmap = ioremap_nocache(pmc->base_addr, PMC_MMIO_REG_LEN);
+	if (!pmc->regmap) {
+		dev_err(&pdev->dev, "error: ioremap failed\n");
+		return -ENOMEM;
+	}
+
+	pmc->map = map;
+
+	/* PMC hardware registers setup */
+	pmc_hw_reg_setup(pmc);
+
+	ret = pmc_dbgfs_register(pmc);
+	if (ret)
+		dev_warn(&pdev->dev, "debugfs register failed\n");
+
+	pmc->init = true;
+	return ret;
+}
+
+/*
+ * Data for PCI driver interface
+ *
+ * used by pci_match_id() call below.
+ */
+static const struct pci_device_id pmc_pci_ids[] = {
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_VLV_PMC), (kernel_ulong_t)&byt_reg_map },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_CHT_PMC), (kernel_ulong_t)&cht_reg_map },
+	{ 0, },
+};
+
+static int __init pmc_atom_init(void)
+{
+	struct pci_dev *pdev = NULL;
+	const struct pci_device_id *ent;
+
+	/* We look for our device - PCU PMC
+	 * we assume that there is max. one device.
+	 *
+	 * We can't use plain pci_driver mechanism,
+	 * as the device is really a multiple function device,
+	 * main driver that binds to the pci_device is lpc_ich
+	 * and have to find & bind to the device this way.
+	 */
+	for_each_pci_dev(pdev) {
+		ent = pci_match_id(pmc_pci_ids, pdev);
+		if (ent)
+			return pmc_setup_dev(pdev, ent);
+	}
+	/* Device not found. */
+	return -ENODEV;
+}
+
+device_initcall(pmc_atom_init);
+
+/*
+MODULE_AUTHOR("Aubrey Li <aubrey.li@linux.intel.com>");
+MODULE_DESCRIPTION("Intel Atom SOC Power Management Controller Interface");
+MODULE_LICENSE("GPL v2");
+*/
diff --git a/include/linux/platform_data/x86/pmc_atom.h b/include/linux/platform_data/x86/pmc_atom.h
new file mode 100644
index 000000000000..aa8744c77c6d
--- /dev/null
+++ b/include/linux/platform_data/x86/pmc_atom.h
@@ -0,0 +1,158 @@
+/*
+ * Intel Atom SOC Power Management Controller Header File
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#ifndef PMC_ATOM_H
+#define PMC_ATOM_H
+
+/* ValleyView Power Control Unit PCI Device ID */
+#define	PCI_DEVICE_ID_VLV_PMC	0x0F1C
+/* CherryTrail Power Control Unit PCI Device ID */
+#define	PCI_DEVICE_ID_CHT_PMC	0x229C
+
+/* PMC Memory mapped IO registers */
+#define	PMC_BASE_ADDR_OFFSET	0x44
+#define	PMC_BASE_ADDR_MASK	0xFFFFFE00
+#define	PMC_MMIO_REG_LEN	0x100
+#define	PMC_REG_BIT_WIDTH	32
+
+/* BIOS uses FUNC_DIS to disable specific function */
+#define	PMC_FUNC_DIS		0x34
+#define	PMC_FUNC_DIS_2		0x38
+
+/* CHT specific bits in FUNC_DIS2 register */
+#define	BIT_FD_GMM		BIT(3)
+#define	BIT_FD_ISH		BIT(4)
+
+/* S0ix wake event control */
+#define	PMC_S0IX_WAKE_EN	0x3C
+
+#define	BIT_LPC_CLOCK_RUN		BIT(4)
+#define	BIT_SHARED_IRQ_GPSC		BIT(5)
+#define	BIT_ORED_DEDICATED_IRQ_GPSS	BIT(18)
+#define	BIT_ORED_DEDICATED_IRQ_GPSC	BIT(19)
+#define	BIT_SHARED_IRQ_GPSS		BIT(20)
+
+#define	PMC_WAKE_EN_SETTING	~(BIT_LPC_CLOCK_RUN | \
+				BIT_SHARED_IRQ_GPSC | \
+				BIT_ORED_DEDICATED_IRQ_GPSS | \
+				BIT_ORED_DEDICATED_IRQ_GPSC | \
+				BIT_SHARED_IRQ_GPSS)
+
+/* The timers acumulate time spent in sleep state */
+#define	PMC_S0IR_TMR		0x80
+#define	PMC_S0I1_TMR		0x84
+#define	PMC_S0I2_TMR		0x88
+#define	PMC_S0I3_TMR		0x8C
+#define	PMC_S0_TMR		0x90
+/* Sleep state counter is in units of of 32us */
+#define	PMC_TMR_SHIFT		5
+
+/* Power status of power islands */
+#define	PMC_PSS			0x98
+
+#define PMC_PSS_BIT_GBE			BIT(0)
+#define PMC_PSS_BIT_SATA		BIT(1)
+#define PMC_PSS_BIT_HDA			BIT(2)
+#define PMC_PSS_BIT_SEC			BIT(3)
+#define PMC_PSS_BIT_PCIE		BIT(4)
+#define PMC_PSS_BIT_LPSS		BIT(5)
+#define PMC_PSS_BIT_LPE			BIT(6)
+#define PMC_PSS_BIT_DFX			BIT(7)
+#define PMC_PSS_BIT_USH_CTRL		BIT(8)
+#define PMC_PSS_BIT_USH_SUS		BIT(9)
+#define PMC_PSS_BIT_USH_VCCS		BIT(10)
+#define PMC_PSS_BIT_USH_VCCA		BIT(11)
+#define PMC_PSS_BIT_OTG_CTRL		BIT(12)
+#define PMC_PSS_BIT_OTG_VCCS		BIT(13)
+#define PMC_PSS_BIT_OTG_VCCA_CLK	BIT(14)
+#define PMC_PSS_BIT_OTG_VCCA		BIT(15)
+#define PMC_PSS_BIT_USB			BIT(16)
+#define PMC_PSS_BIT_USB_SUS		BIT(17)
+
+/* CHT specific bits in PSS register */
+#define	PMC_PSS_BIT_CHT_UFS		BIT(7)
+#define	PMC_PSS_BIT_CHT_UXD		BIT(11)
+#define	PMC_PSS_BIT_CHT_UXD_FD		BIT(12)
+#define	PMC_PSS_BIT_CHT_UX_ENG		BIT(15)
+#define	PMC_PSS_BIT_CHT_USB_SUS		BIT(16)
+#define	PMC_PSS_BIT_CHT_GMM		BIT(17)
+#define	PMC_PSS_BIT_CHT_ISH		BIT(18)
+#define	PMC_PSS_BIT_CHT_DFX_MASTER	BIT(26)
+#define	PMC_PSS_BIT_CHT_DFX_CLUSTER1	BIT(27)
+#define	PMC_PSS_BIT_CHT_DFX_CLUSTER2	BIT(28)
+#define	PMC_PSS_BIT_CHT_DFX_CLUSTER3	BIT(29)
+#define	PMC_PSS_BIT_CHT_DFX_CLUSTER4	BIT(30)
+#define	PMC_PSS_BIT_CHT_DFX_CLUSTER5	BIT(31)
+
+/* These registers reflect D3 status of functions */
+#define	PMC_D3_STS_0		0xA0
+
+#define	BIT_LPSS1_F0_DMA	BIT(0)
+#define	BIT_LPSS1_F1_PWM1	BIT(1)
+#define	BIT_LPSS1_F2_PWM2	BIT(2)
+#define	BIT_LPSS1_F3_HSUART1	BIT(3)
+#define	BIT_LPSS1_F4_HSUART2	BIT(4)
+#define	BIT_LPSS1_F5_SPI	BIT(5)
+#define	BIT_LPSS1_F6_XXX	BIT(6)
+#define	BIT_LPSS1_F7_XXX	BIT(7)
+#define	BIT_SCC_EMMC		BIT(8)
+#define	BIT_SCC_SDIO		BIT(9)
+#define	BIT_SCC_SDCARD		BIT(10)
+#define	BIT_SCC_MIPI		BIT(11)
+#define	BIT_HDA			BIT(12)
+#define	BIT_LPE			BIT(13)
+#define	BIT_OTG			BIT(14)
+#define	BIT_USH			BIT(15)
+#define	BIT_GBE			BIT(16)
+#define	BIT_SATA		BIT(17)
+#define	BIT_USB_EHCI		BIT(18)
+#define	BIT_SEC			BIT(19)
+#define	BIT_PCIE_PORT0		BIT(20)
+#define	BIT_PCIE_PORT1		BIT(21)
+#define	BIT_PCIE_PORT2		BIT(22)
+#define	BIT_PCIE_PORT3		BIT(23)
+#define	BIT_LPSS2_F0_DMA	BIT(24)
+#define	BIT_LPSS2_F1_I2C1	BIT(25)
+#define	BIT_LPSS2_F2_I2C2	BIT(26)
+#define	BIT_LPSS2_F3_I2C3	BIT(27)
+#define	BIT_LPSS2_F4_I2C4	BIT(28)
+#define	BIT_LPSS2_F5_I2C5	BIT(29)
+#define	BIT_LPSS2_F6_I2C6	BIT(30)
+#define	BIT_LPSS2_F7_I2C7	BIT(31)
+
+#define	PMC_D3_STS_1		0xA4
+#define	BIT_SMB			BIT(0)
+#define	BIT_OTG_SS_PHY		BIT(1)
+#define	BIT_USH_SS_PHY		BIT(2)
+#define	BIT_DFX			BIT(3)
+
+/* CHT specific bits in PMC_D3_STS_1 register */
+#define	BIT_STS_GMM		BIT(1)
+#define	BIT_STS_ISH		BIT(2)
+
+/* PMC I/O Registers */
+#define	ACPI_BASE_ADDR_OFFSET	0x40
+#define	ACPI_BASE_ADDR_MASK	0xFFFFFE00
+#define	ACPI_MMIO_REG_LEN	0x100
+
+#define	PM1_CNT			0x4
+#define	SLEEP_TYPE_MASK		0xFFFFECFF
+#define	SLEEP_TYPE_S5		0x1C00
+#define	SLEEP_ENABLE		0x2000
+
+extern int pmc_atom_read(int offset, u32 *value);
+extern int pmc_atom_write(int offset, u32 value);
+
+#endif /* PMC_ATOM_H */
-- 
cgit v1.2.3


From 2e6f38cef8838a5edd6051c3110ecf06f37ec544 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Mon, 23 Jan 2017 12:07:45 -0600
Subject: platform/x86: fix typo in comment

s/Acumulate/Accumulate/

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Acked-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 include/linux/platform_data/x86/pmc_atom.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/x86/pmc_atom.h b/include/linux/platform_data/x86/pmc_atom.h
index aa8744c77c6d..e4905fe69c38 100644
--- a/include/linux/platform_data/x86/pmc_atom.h
+++ b/include/linux/platform_data/x86/pmc_atom.h
@@ -50,7 +50,7 @@
 				BIT_ORED_DEDICATED_IRQ_GPSC | \
 				BIT_SHARED_IRQ_GPSS)
 
-/* The timers acumulate time spent in sleep state */
+/* The timers accumulate time spent in sleep state */
 #define	PMC_S0IR_TMR		0x80
 #define	PMC_S0I1_TMR		0x84
 #define	PMC_S0I2_TMR		0x88
-- 
cgit v1.2.3


From 4cb3e3782776f82400a5d135909dda466b813d45 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk@kernel.org>
Date: Wed, 25 Jan 2017 21:34:51 +0200
Subject: soc: samsung: pmu: Remove unused and duplicated defines

The exynos-regs-pmu.h was never a complete list of PMU registers.  It
contained a lot of holes for registers which were not used.  However, a
lot of unused defines came along with porting the code from vendor
kernel.  Few of defines were also duplicated.

Remove them so the file will be slightly smaller.

Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
---
 include/linux/soc/samsung/exynos-regs-pmu.h | 72 +++--------------------------
 1 file changed, 7 insertions(+), 65 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/soc/samsung/exynos-regs-pmu.h b/include/linux/soc/samsung/exynos-regs-pmu.h
index d30186e2b609..9793502a6a57 100644
--- a/include/linux/soc/samsung/exynos-regs-pmu.h
+++ b/include/linux/soc/samsung/exynos-regs-pmu.h
@@ -7,7 +7,13 @@
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
-*/
+ *
+ *
+ * Notice:
+ * This is not a list of all Exynos Power Management Unit SFRs.
+ * There are too many of them, not mentioning subtle differences
+ * between SoCs. For now, put here only the used registers.
+ */
 
 #ifndef __LINUX_SOC_EXYNOS_REGS_PMU_H
 #define __LINUX_SOC_EXYNOS_REGS_PMU_H __FILE__
@@ -38,7 +44,6 @@
 #define EXYNOS_CORE_PO_RESET(n)			((1 << 4) << n)
 #define EXYNOS_WAKEUP_FROM_LOWPWR		(1 << 28)
 #define EXYNOS_SWRESET				0x0400
-#define EXYNOS5440_SWRESET			0x00C4
 
 #define S5P_WAKEUP_STAT				0x0600
 #define S5P_EINT_WAKEUP_MASK			0x0604
@@ -136,12 +141,6 @@
 #define EXYNOS_COMMON_OPTION(_nr)		\
 			(EXYNOS_COMMON_CONFIGURATION(_nr) + 0x8)
 
-#define EXYNOS_CORE_LOCAL_PWR_EN		0x3
-
-#define EXYNOS_ARM_COMMON_STATUS		0x2504
-#define EXYNOS_COMMON_OPTION(_nr)		\
-			(EXYNOS_COMMON_CONFIGURATION(_nr) + 0x8)
-
 #define EXYNOS_ARM_L2_CONFIGURATION		0x2600
 #define EXYNOS_L2_CONFIGURATION(_nr)		\
 			(EXYNOS_ARM_L2_CONFIGURATION + ((_nr) * 0x80))
@@ -149,18 +148,10 @@
 			(EXYNOS_L2_CONFIGURATION(_nr) + 0x4)
 #define EXYNOS_L2_OPTION(_nr)			\
 			(EXYNOS_L2_CONFIGURATION(_nr) + 0x8)
-#define EXYNOS_L2_COMMON_PWR_EN			0x3
-
-#define EXYNOS_ARM_CORE_X_STATUS_OFFSET		0x4
-
-#define EXYNOS5_APLL_SYSCLK_CONFIGURATION	0x2A00
-#define EXYNOS5_APLL_SYSCLK_STATUS		0x2A04
 
 #define EXYNOS5_ARM_L2_OPTION			0x2608
 #define EXYNOS5_USE_RETENTION			BIT(4)
 
-#define EXYNOS5_L2RSTDISABLE_VALUE		BIT(3)
-
 #define S5P_PAD_RET_MAUDIO_OPTION		0x3028
 #define S5P_PAD_RET_MMC2_OPTION			0x30c8
 #define S5P_PAD_RET_GPIO_OPTION			0x3108
@@ -411,7 +402,6 @@
 #define EXYNOS5_SATA_MEM_SYS_PWR_REG				0x11FC
 #define EXYNOS5_PAD_RETENTION_DRAM_SYS_PWR_REG			0x1200
 #define EXYNOS5_PAD_RETENTION_MAU_SYS_PWR_REG			0x1204
-#define EXYNOS5_PAD_RETENTION_EFNAND_SYS_PWR_REG		0x1208
 #define EXYNOS5_PAD_RETENTION_GPIO_SYS_PWR_REG			0x1220
 #define EXYNOS5_PAD_RETENTION_UART_SYS_PWR_REG			0x1224
 #define EXYNOS5_PAD_RETENTION_MMCA_SYS_PWR_REG			0x1228
@@ -485,7 +475,6 @@
 #define EXYNOS5420_SWRESET_KFC_SEL				0x3
 
 /* Only for EXYNOS5420 */
-#define EXYNOS5420_ISP_ARM_OPTION				0x2488
 #define EXYNOS5420_L2RSTDISABLE_VALUE				BIT(3)
 
 #define EXYNOS5420_LPI_MASK					0x0004
@@ -494,9 +483,6 @@
 #define EXYNOS5420_ATB_KFC					BIT(13)
 #define EXYNOS5420_ATB_ISP_ARM					BIT(19)
 #define EXYNOS5420_EMULATION					BIT(31)
-#define ATB_ISP_ARM						BIT(12)
-#define ATB_KFC							BIT(13)
-#define ATB_NOC							BIT(14)
 
 #define EXYNOS5420_ARM_INTR_SPREAD_ENABLE			0x0100
 #define EXYNOS5420_ARM_INTR_SPREAD_USE_STANDBYWFI		0x0104
@@ -510,11 +496,6 @@
 #define EXYNOS5420_KFC_CORE_RESET(_nr)				\
 	((EXYNOS5420_KFC_CORE_RESET0 | EXYNOS5420_KFC_ETM_RESET0) << (_nr))
 
-#define EXYNOS5420_BB_CON1					0x0784
-#define EXYNOS5420_BB_SEL_EN					BIT(31)
-#define EXYNOS5420_BB_PMOS_EN					BIT(7)
-#define EXYNOS5420_BB_1300X					0XF
-
 #define EXYNOS5420_ARM_CORE2_SYS_PWR_REG			0x1020
 #define EXYNOS5420_DIS_IRQ_ARM_CORE2_LOCAL_SYS_PWR_REG		0x1024
 #define EXYNOS5420_DIS_IRQ_ARM_CORE2_CENTRAL_SYS_PWR_REG	0x1028
@@ -546,15 +527,6 @@
 #define EXYNOS5420_SPLL_SYSCLK_SYS_PWR_REG                      0x1178
 #define EXYNOS5420_INTRAM_MEM_SYS_PWR_REG                       0x11B8
 #define EXYNOS5420_INTROM_MEM_SYS_PWR_REG                       0x11BC
-#define EXYNOS5420_ONENANDXL_MEM_SYS_PWR			0x11C0
-#define EXYNOS5420_USBDEV_MEM_SYS_PWR				0x11CC
-#define EXYNOS5420_USBDEV1_MEM_SYS_PWR				0x11D0
-#define EXYNOS5420_SDMMC_MEM_SYS_PWR				0x11D4
-#define EXYNOS5420_CSSYS_MEM_SYS_PWR				0x11D8
-#define EXYNOS5420_SECSS_MEM_SYS_PWR				0x11DC
-#define EXYNOS5420_ROTATOR_MEM_SYS_PWR				0x11E0
-#define EXYNOS5420_INTRAM_MEM_SYS_PWR				0x11E4
-#define EXYNOS5420_INTROM_MEM_SYS_PWR				0x11E8
 #define EXYNOS5420_PAD_RETENTION_JTAG_SYS_PWR_REG		0x1208
 #define EXYNOS5420_PAD_RETENTION_DRAM_SYS_PWR_REG		0x1210
 #define EXYNOS5420_PAD_RETENTION_UART_SYS_PWR_REG		0x1214
@@ -605,13 +577,7 @@
 #define EXYNOS5420_CMU_RESET_MSC_SYS_PWR_REG			0x159C
 #define EXYNOS5420_CMU_RESET_FSYS_SYS_PWR_REG			0x15A0
 #define EXYNOS5420_SFR_AXI_CGDIS1				0x15E4
-#define EXYNOS_ARM_CORE2_CONFIGURATION				0x2100
-#define EXYNOS5420_ARM_CORE2_OPTION				0x2108
-#define EXYNOS_ARM_CORE3_CONFIGURATION				0x2180
-#define EXYNOS5420_ARM_CORE3_OPTION				0x2188
-#define EXYNOS5420_ARM_COMMON_STATUS				0x2504
 #define EXYNOS5420_ARM_COMMON_OPTION				0x2508
-#define EXYNOS5420_KFC_COMMON_STATUS				0x2584
 #define EXYNOS5420_KFC_COMMON_OPTION				0x2588
 #define EXYNOS5420_LOGIC_RESET_DURATION3			0x2D1C
 
@@ -626,33 +592,9 @@
 #define EXYNOS_PAD_RET_DRAM_OPTION				0x3008
 #define EXYNOS_PAD_RET_MAUDIO_OPTION				0x3028
 #define EXYNOS_PAD_RET_JTAG_OPTION				0x3048
-#define EXYNOS_PAD_RET_GPIO_OPTION				0x3108
-#define EXYNOS_PAD_RET_UART_OPTION				0x3128
-#define EXYNOS_PAD_RET_MMCA_OPTION				0x3148
-#define EXYNOS_PAD_RET_MMCB_OPTION				0x3168
 #define EXYNOS_PAD_RET_EBIA_OPTION				0x3188
 #define EXYNOS_PAD_RET_EBIB_OPTION				0x31A8
 
-#define EXYNOS_PS_HOLD_CONTROL					0x330C
-
-/* For SYS_PWR_REG */
-#define EXYNOS_SYS_PWR_CFG					BIT(0)
-
-#define EXYNOS5420_MFC_CONFIGURATION				0x4060
-#define EXYNOS5420_MFC_STATUS					0x4064
-#define EXYNOS5420_MFC_OPTION					0x4068
-#define EXYNOS5420_G3D_CONFIGURATION				0x4080
-#define EXYNOS5420_G3D_STATUS					0x4084
-#define EXYNOS5420_G3D_OPTION					0x4088
-#define EXYNOS5420_DISP0_CONFIGURATION				0x40A0
-#define EXYNOS5420_DISP0_STATUS					0x40A4
-#define EXYNOS5420_DISP0_OPTION					0x40A8
-#define EXYNOS5420_DISP1_CONFIGURATION				0x40C0
-#define EXYNOS5420_DISP1_STATUS					0x40C4
-#define EXYNOS5420_DISP1_OPTION					0x40C8
-#define EXYNOS5420_MAU_CONFIGURATION				0x40E0
-#define EXYNOS5420_MAU_STATUS					0x40E4
-#define EXYNOS5420_MAU_OPTION					0x40E8
 #define EXYNOS5420_FSYS2_OPTION					0x4168
 #define EXYNOS5420_PSGEN_OPTION					0x4188
 
-- 
cgit v1.2.3


From ee55ae6194a5439bde3a3b8ee0abda63c610e740 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk@kernel.org>
Date: Wed, 25 Jan 2017 21:09:44 +0200
Subject: soc: samsung: pmu: Remove duplicated define for ARM_L2_OPTION
 register

The register ARM_L2_OPTION (0x2608 in Exynos4 and Exynos5 PMU) was
defined twice.  Both names were used in the Exynos542x code.  Simplify
this.

Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
---
 arch/arm/mach-exynos/suspend.c              | 6 +++---
 drivers/soc/samsung/exynos5250-pmu.c        | 2 +-
 drivers/soc/samsung/exynos5420-pmu.c        | 4 ++--
 include/linux/soc/samsung/exynos-regs-pmu.h | 3 +--
 4 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-exynos/suspend.c b/arch/arm/mach-exynos/suspend.c
index 06332f626565..bf97de884eea 100644
--- a/arch/arm/mach-exynos/suspend.c
+++ b/arch/arm/mach-exynos/suspend.c
@@ -388,9 +388,9 @@ static void exynos5420_pm_prepare(void)
 	if (IS_ENABLED(CONFIG_EXYNOS5420_MCPM))
 		pmu_raw_writel(virt_to_phys(mcpm_entry_point), S5P_INFORM0);
 
-	tmp = pmu_raw_readl(EXYNOS5_ARM_L2_OPTION);
-	tmp &= ~EXYNOS5_USE_RETENTION;
-	pmu_raw_writel(tmp, EXYNOS5_ARM_L2_OPTION);
+	tmp = pmu_raw_readl(EXYNOS_L2_OPTION(0));
+	tmp &= ~EXYNOS_L2_USE_RETENTION;
+	pmu_raw_writel(tmp, EXYNOS_L2_OPTION(0));
 
 	tmp = pmu_raw_readl(EXYNOS5420_SFR_AXI_CGDIS1);
 	tmp |= EXYNOS5420_UFS;
diff --git a/drivers/soc/samsung/exynos5250-pmu.c b/drivers/soc/samsung/exynos5250-pmu.c
index 3fac42561964..8d94f0819f32 100644
--- a/drivers/soc/samsung/exynos5250-pmu.c
+++ b/drivers/soc/samsung/exynos5250-pmu.c
@@ -29,7 +29,7 @@ static const struct exynos_pmu_conf exynos5250_pmu_config[] = {
 	{ EXYNOS5_DIS_IRQ_ISP_ARM_CENTRAL_SYS_PWR_REG,	{ 0x0, 0x0, 0x0} },
 	{ EXYNOS5_ARM_COMMON_SYS_PWR_REG,		{ 0x0, 0x0, 0x2} },
 	{ EXYNOS5_ARM_L2_SYS_PWR_REG,			{ 0x3, 0x3, 0x3} },
-	{ EXYNOS5_ARM_L2_OPTION,			{ 0x10, 0x10, 0x0 } },
+	{ EXYNOS_L2_OPTION(0),				{ 0x10, 0x10, 0x0 } },
 	{ EXYNOS5_CMU_ACLKSTOP_SYS_PWR_REG,		{ 0x1, 0x0, 0x1} },
 	{ EXYNOS5_CMU_SCLKSTOP_SYS_PWR_REG,		{ 0x1, 0x0, 0x1} },
 	{ EXYNOS5_CMU_RESET_SYS_PWR_REG,		{ 0x1, 0x1, 0x0} },
diff --git a/drivers/soc/samsung/exynos5420-pmu.c b/drivers/soc/samsung/exynos5420-pmu.c
index 3f2c64180ef8..0a89fa79c678 100644
--- a/drivers/soc/samsung/exynos5420-pmu.c
+++ b/drivers/soc/samsung/exynos5420-pmu.c
@@ -230,11 +230,11 @@ static void exynos5420_pmu_init(void)
 	pmu_raw_writel(EXYNOS5420_USE_STANDBY_WFI_ALL, S5P_CENTRAL_SEQ_OPTION);
 
 	value  = pmu_raw_readl(EXYNOS_L2_OPTION(0));
-	value &= ~EXYNOS5_USE_RETENTION;
+	value &= ~EXYNOS_L2_USE_RETENTION;
 	pmu_raw_writel(value, EXYNOS_L2_OPTION(0));
 
 	value = pmu_raw_readl(EXYNOS_L2_OPTION(1));
-	value &= ~EXYNOS5_USE_RETENTION;
+	value &= ~EXYNOS_L2_USE_RETENTION;
 	pmu_raw_writel(value, EXYNOS_L2_OPTION(1));
 
 	/*
diff --git a/include/linux/soc/samsung/exynos-regs-pmu.h b/include/linux/soc/samsung/exynos-regs-pmu.h
index 9793502a6a57..9786c62d7159 100644
--- a/include/linux/soc/samsung/exynos-regs-pmu.h
+++ b/include/linux/soc/samsung/exynos-regs-pmu.h
@@ -149,8 +149,7 @@
 #define EXYNOS_L2_OPTION(_nr)			\
 			(EXYNOS_L2_CONFIGURATION(_nr) + 0x8)
 
-#define EXYNOS5_ARM_L2_OPTION			0x2608
-#define EXYNOS5_USE_RETENTION			BIT(4)
+#define EXYNOS_L2_USE_RETENTION			BIT(4)
 
 #define S5P_PAD_RET_MAUDIO_OPTION		0x3028
 #define S5P_PAD_RET_MMC2_OPTION			0x30c8
-- 
cgit v1.2.3


From 4f497a4310646f14ab518ab6108dbef07f4606af Mon Sep 17 00:00:00 2001
From: Corentin LABBE <clabbe.montjoie@gmail.com>
Date: Thu, 15 Dec 2016 11:15:25 +0100
Subject: PM / QoS: Remove unneeded linux/miscdevice.h include

pm_qos.h does not use any miscdevice, so this patch
remove this unnecessary inclusion.

Signed-off-by: Corentin Labbe <clabbe.montjoie@gmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm_qos.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h
index 0f65d36c2a75..d4d34791e463 100644
--- a/include/linux/pm_qos.h
+++ b/include/linux/pm_qos.h
@@ -6,7 +6,6 @@
  */
 #include <linux/plist.h>
 #include <linux/notifier.h>
-#include <linux/miscdevice.h>
 #include <linux/device.h>
 #include <linux/workqueue.h>
 
-- 
cgit v1.2.3


From 3aa26a3b2ea63c4d09420e74421370655aa1cf8f Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 2 Jan 2017 14:41:02 +0530
Subject: PM / OPP: Rename dev_pm_opp_get_suspend_opp() and return OPP rate

There is only one user of dev_pm_opp_get_suspend_opp() and that uses it
to get the OPP rate for the suspend_opp.

Rename dev_pm_opp_get_suspend_opp() as dev_pm_opp_get_suspend_opp_freq()
and return the rate directly from it.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/opp/core.c | 27 +++++++++++++--------------
 drivers/cpufreq/cpufreq-dt.c  |  7 +------
 include/linux/pm_opp.h        |  6 +++---
 3 files changed, 17 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c
index dde7dd099aa0..614d779ab6a2 100644
--- a/drivers/base/power/opp/core.c
+++ b/drivers/base/power/opp/core.c
@@ -328,32 +328,31 @@ unsigned long dev_pm_opp_get_max_transition_latency(struct device *dev)
 EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_transition_latency);
 
 /**
- * dev_pm_opp_get_suspend_opp() - Get suspend opp
+ * dev_pm_opp_get_suspend_opp_freq() - Get frequency of suspend opp in Hz
  * @dev:	device for which we do this operation
  *
- * Return: This function returns pointer to the suspend opp if it is
- * defined and available, otherwise it returns NULL.
- *
- * Locking: This function must be called under rcu_read_lock(). opp is a rcu
- * protected pointer. The reason for the same is that the opp pointer which is
- * returned will remain valid for use with opp_get_{voltage, freq} only while
- * under the locked area. The pointer returned must be used prior to unlocking
- * with rcu_read_unlock() to maintain the integrity of the pointer.
+ * Return: This function returns the frequency of the OPP marked as suspend_opp
+ * if one is available, else returns 0;
  */
-struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev)
+unsigned long dev_pm_opp_get_suspend_opp_freq(struct device *dev)
 {
 	struct opp_table *opp_table;
+	unsigned long freq = 0;
 
-	opp_rcu_lockdep_assert();
+	rcu_read_lock();
 
 	opp_table = _find_opp_table(dev);
 	if (IS_ERR(opp_table) || !opp_table->suspend_opp ||
 	    !opp_table->suspend_opp->available)
-		return NULL;
+		goto unlock;
 
-	return opp_table->suspend_opp;
+	freq = dev_pm_opp_get_freq(opp_table->suspend_opp);
+
+unlock:
+	rcu_read_unlock();
+	return freq;
 }
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_suspend_opp);
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_suspend_opp_freq);
 
 /**
  * dev_pm_opp_get_opp_count() - Get number of opps available in the opp table
diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index 269013311e79..c943787d761e 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -148,7 +148,6 @@ static int cpufreq_init(struct cpufreq_policy *policy)
 	struct private_data *priv;
 	struct device *cpu_dev;
 	struct clk *cpu_clk;
-	struct dev_pm_opp *suspend_opp;
 	unsigned int transition_latency;
 	bool fallback = false;
 	const char *name;
@@ -252,11 +251,7 @@ static int cpufreq_init(struct cpufreq_policy *policy)
 	policy->driver_data = priv;
 	policy->clk = cpu_clk;
 
-	rcu_read_lock();
-	suspend_opp = dev_pm_opp_get_suspend_opp(cpu_dev);
-	if (suspend_opp)
-		policy->suspend_freq = dev_pm_opp_get_freq(suspend_opp) / 1000;
-	rcu_read_unlock();
+	policy->suspend_freq = dev_pm_opp_get_suspend_opp_freq(cpu_dev) / 1000;
 
 	ret = cpufreq_table_validate_and_show(policy, freq_table);
 	if (ret) {
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 0edd88f93904..7483ff291a8e 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -88,7 +88,7 @@ int dev_pm_opp_get_opp_count(struct device *dev);
 unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev);
 unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev);
 unsigned long dev_pm_opp_get_max_transition_latency(struct device *dev);
-struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev);
+unsigned long dev_pm_opp_get_suspend_opp_freq(struct device *dev);
 
 struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
 					      unsigned long freq,
@@ -159,9 +159,9 @@ static inline unsigned long dev_pm_opp_get_max_transition_latency(struct device
 	return 0;
 }
 
-static inline struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev)
+static inline unsigned long dev_pm_opp_get_suspend_opp_freq(struct device *dev)
 {
-	return NULL;
+	return 0;
 }
 
 static inline struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
-- 
cgit v1.2.3


From dc2c9ad52af45ebecf9743aacb1916ebb2f1e848 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 2 Jan 2017 14:41:03 +0530
Subject: PM / OPP: Don't expose srcu_head to register notifiers

Let the OPP core provide helpers to register notifiers for any device,
instead of exposing srcu_head outside of the core.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: MyungJoo Ham <myungjoo.ham@samsung.com>
Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/opp/core.c | 66 ++++++++++++++++++++++++++++++++-----------
 drivers/devfreq/devfreq.c     | 26 ++---------------
 include/linux/pm_opp.h        | 14 ++++++---
 3 files changed, 62 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c
index 614d779ab6a2..f35effad60d1 100644
--- a/drivers/base/power/opp/core.c
+++ b/drivers/base/power/opp/core.c
@@ -1860,29 +1860,63 @@ int dev_pm_opp_disable(struct device *dev, unsigned long freq)
 EXPORT_SYMBOL_GPL(dev_pm_opp_disable);
 
 /**
- * dev_pm_opp_get_notifier() - find notifier_head of the device with opp
- * @dev:	device pointer used to lookup OPP table.
+ * dev_pm_opp_register_notifier() - Register OPP notifier for the device
+ * @dev:	Device for which notifier needs to be registered
+ * @nb:		Notifier block to be registered
  *
- * Return: pointer to  notifier head if found, otherwise -ENODEV or
- * -EINVAL based on type of error casted as pointer. value must be checked
- *  with IS_ERR to determine valid pointer or error result.
+ * Return: 0 on success or a negative error value.
+ */
+int dev_pm_opp_register_notifier(struct device *dev, struct notifier_block *nb)
+{
+	struct opp_table *opp_table;
+	int ret;
+
+	rcu_read_lock();
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table)) {
+		ret = PTR_ERR(opp_table);
+		goto unlock;
+	}
+
+	ret = srcu_notifier_chain_register(&opp_table->srcu_head, nb);
+
+unlock:
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(dev_pm_opp_register_notifier);
+
+/**
+ * dev_pm_opp_unregister_notifier() - Unregister OPP notifier for the device
+ * @dev:	Device for which notifier needs to be unregistered
+ * @nb:		Notifier block to be unregistered
  *
- * Locking: This function must be called under rcu_read_lock(). opp_table is a
- * RCU protected pointer. The reason for the same is that the opp pointer which
- * is returned will remain valid for use with opp_get_{voltage, freq} only while
- * under the locked area. The pointer returned must be used prior to unlocking
- * with rcu_read_unlock() to maintain the integrity of the pointer.
+ * Return: 0 on success or a negative error value.
  */
-struct srcu_notifier_head *dev_pm_opp_get_notifier(struct device *dev)
+int dev_pm_opp_unregister_notifier(struct device *dev,
+				   struct notifier_block *nb)
 {
-	struct opp_table *opp_table = _find_opp_table(dev);
+	struct opp_table *opp_table;
+	int ret;
 
-	if (IS_ERR(opp_table))
-		return ERR_CAST(opp_table); /* matching type */
+	rcu_read_lock();
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table)) {
+		ret = PTR_ERR(opp_table);
+		goto unlock;
+	}
+
+	ret = srcu_notifier_chain_unregister(&opp_table->srcu_head, nb);
 
-	return &opp_table->srcu_head;
+unlock:
+	rcu_read_unlock();
+
+	return ret;
 }
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_notifier);
+EXPORT_SYMBOL(dev_pm_opp_unregister_notifier);
 
 /*
  * Free OPPs either created using static entries present in DT or even the
diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c
index 47206a21bb90..a545c0fee6e1 100644
--- a/drivers/devfreq/devfreq.c
+++ b/drivers/devfreq/devfreq.c
@@ -1265,18 +1265,7 @@ EXPORT_SYMBOL(devfreq_recommended_opp);
  */
 int devfreq_register_opp_notifier(struct device *dev, struct devfreq *devfreq)
 {
-	struct srcu_notifier_head *nh;
-	int ret = 0;
-
-	rcu_read_lock();
-	nh = dev_pm_opp_get_notifier(dev);
-	if (IS_ERR(nh))
-		ret = PTR_ERR(nh);
-	rcu_read_unlock();
-	if (!ret)
-		ret = srcu_notifier_chain_register(nh, &devfreq->nb);
-
-	return ret;
+	return dev_pm_opp_register_notifier(dev, &devfreq->nb);
 }
 EXPORT_SYMBOL(devfreq_register_opp_notifier);
 
@@ -1292,18 +1281,7 @@ EXPORT_SYMBOL(devfreq_register_opp_notifier);
  */
 int devfreq_unregister_opp_notifier(struct device *dev, struct devfreq *devfreq)
 {
-	struct srcu_notifier_head *nh;
-	int ret = 0;
-
-	rcu_read_lock();
-	nh = dev_pm_opp_get_notifier(dev);
-	if (IS_ERR(nh))
-		ret = PTR_ERR(nh);
-	rcu_read_unlock();
-	if (!ret)
-		ret = srcu_notifier_chain_unregister(nh, &devfreq->nb);
-
-	return ret;
+	return dev_pm_opp_unregister_notifier(dev, &devfreq->nb);
 }
 EXPORT_SYMBOL(devfreq_unregister_opp_notifier);
 
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 7483ff291a8e..66a02deeb03f 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -108,7 +108,9 @@ int dev_pm_opp_enable(struct device *dev, unsigned long freq);
 
 int dev_pm_opp_disable(struct device *dev, unsigned long freq);
 
-struct srcu_notifier_head *dev_pm_opp_get_notifier(struct device *dev);
+int dev_pm_opp_register_notifier(struct device *dev, struct notifier_block *nb);
+int dev_pm_opp_unregister_notifier(struct device *dev, struct notifier_block *nb);
+
 int dev_pm_opp_set_supported_hw(struct device *dev, const u32 *versions,
 				unsigned int count);
 void dev_pm_opp_put_supported_hw(struct device *dev);
@@ -202,10 +204,14 @@ static inline int dev_pm_opp_disable(struct device *dev, unsigned long freq)
 	return 0;
 }
 
-static inline struct srcu_notifier_head *dev_pm_opp_get_notifier(
-							struct device *dev)
+static inline int dev_pm_opp_register_notifier(struct device *dev, struct notifier_block *nb)
 {
-	return ERR_PTR(-ENOTSUPP);
+	return -ENOTSUPP;
+}
+
+static inline int dev_pm_opp_unregister_notifier(struct device *dev, struct notifier_block *nb)
+{
+	return -ENOTSUPP;
 }
 
 static inline int dev_pm_opp_set_supported_hw(struct device *dev,
-- 
cgit v1.2.3


From 6a4bc2b4c3c9afd643e8da44dd2318a5c5f22cd0 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 26 Jan 2017 14:44:17 -0800
Subject: net: Fix ndo_setup_tc comment

Commit 16e5cc647173 ("net: rework setup_tc ndo op to consume
general tc operand") changed the ndo_setup_tc() signature, but did not
update the comments in netdevice.h, so do that now.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Acked-by: John Fastabend <john.r.fastabend@intel.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3868c32d98af..d63cacb67ea6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -964,11 +964,12 @@ struct netdev_xdp {
  *      with PF and querying it may introduce a theoretical security risk.
  * int (*ndo_set_vf_rss_query_en)(struct net_device *dev, int vf, bool setting);
  * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb);
- * int (*ndo_setup_tc)(struct net_device *dev, u8 tc)
- * 	Called to setup 'tc' number of traffic classes in the net device. This
- * 	is always called from the stack with the rtnl lock held and netif tx
- * 	queues stopped. This allows the netdevice to perform queue management
- * 	safely.
+ * int (*ndo_setup_tc)(struct net_device *dev, u32 handle,
+ *		       __be16 protocol, struct tc_to_netdev *tc);
+ *	Called to setup any 'tc' scheduler, classifier or action on @dev.
+ *	This is always called from the stack with the rtnl lock held and netif
+ *	tx queues stopped. This allows the netdevice to perform queue
+ *	management safely.
  *
  *	Fiber Channel over Ethernet (FCoE) offload functions.
  * int (*ndo_fcoe_enable)(struct net_device *dev);
-- 
cgit v1.2.3


From 07e4fead45e6e1932f0b960655ab554b6aab6a08 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Wed, 25 Jan 2017 08:06:40 -0800
Subject: blk-mq: create debugfs directory tree

In preparation for putting blk-mq debugging information in debugfs,
create a directory tree mirroring the one in sysfs:

    # tree -d /sys/kernel/debug/block
    /sys/kernel/debug/block
    |-- nvme0n1
    |   `-- mq
    |       |-- 0
    |       |   `-- cpu0
    |       |-- 1
    |       |   `-- cpu1
    |       |-- 2
    |       |   `-- cpu2
    |       `-- 3
    |           `-- cpu3
    `-- vda
        `-- mq
            `-- 0
                |-- cpu0
                |-- cpu1
                |-- cpu2
                `-- cpu3

Also add the scaffolding for the actual files that will go in here,
either under the hardware queue or software queue directories.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/Makefile         |   1 +
 block/blk-mq-debugfs.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++++
 block/blk-mq-sysfs.c   |   8 +++
 block/blk-mq.c         |   2 +
 block/blk-mq.h         |  33 +++++++++++
 include/linux/blkdev.h |   5 ++
 6 files changed, 201 insertions(+)
 create mode 100644 block/blk-mq-debugfs.c

(limited to 'include/linux')

diff --git a/block/Makefile b/block/Makefile
index 3ee0abd7205a..6cabe6bd2882 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -26,3 +26,4 @@ obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o
 obj-$(CONFIG_BLK_MQ_PCI)	+= blk-mq-pci.o
 obj-$(CONFIG_BLK_DEV_ZONED)	+= blk-zoned.o
 obj-$(CONFIG_BLK_WBT)		+= blk-wbt.o
+obj-$(CONFIG_DEBUG_FS)		+= blk-mq-debugfs.o
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
new file mode 100644
index 000000000000..01711bbf5ade
--- /dev/null
+++ b/block/blk-mq-debugfs.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright (C) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/debugfs.h>
+
+#include <linux/blk-mq.h>
+#include "blk-mq.h"
+
+struct blk_mq_debugfs_attr {
+	const char *name;
+	umode_t mode;
+	const struct file_operations *fops;
+};
+
+static struct dentry *block_debugfs_root;
+
+static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
+};
+
+static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
+};
+
+int blk_mq_debugfs_register(struct request_queue *q, const char *name)
+{
+	if (!block_debugfs_root)
+		return -ENOENT;
+
+	q->debugfs_dir = debugfs_create_dir(name, block_debugfs_root);
+	if (!q->debugfs_dir)
+		goto err;
+
+	if (blk_mq_debugfs_register_hctxs(q))
+		goto err;
+
+	return 0;
+
+err:
+	blk_mq_debugfs_unregister(q);
+	return -ENOMEM;
+}
+
+void blk_mq_debugfs_unregister(struct request_queue *q)
+{
+	debugfs_remove_recursive(q->debugfs_dir);
+	q->mq_debugfs_dir = NULL;
+	q->debugfs_dir = NULL;
+}
+
+static int blk_mq_debugfs_register_ctx(struct request_queue *q,
+				       struct blk_mq_ctx *ctx,
+				       struct dentry *hctx_dir)
+{
+	struct dentry *ctx_dir;
+	char name[20];
+	int i;
+
+	snprintf(name, sizeof(name), "cpu%u", ctx->cpu);
+	ctx_dir = debugfs_create_dir(name, hctx_dir);
+	if (!ctx_dir)
+		return -ENOMEM;
+
+	for (i = 0; i < ARRAY_SIZE(blk_mq_debugfs_ctx_attrs); i++) {
+		const struct blk_mq_debugfs_attr *attr;
+
+		attr = &blk_mq_debugfs_ctx_attrs[i];
+		if (!debugfs_create_file(attr->name, attr->mode, ctx_dir, ctx,
+					 attr->fops))
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int blk_mq_debugfs_register_hctx(struct request_queue *q,
+					struct blk_mq_hw_ctx *hctx)
+{
+	struct blk_mq_ctx *ctx;
+	struct dentry *hctx_dir;
+	char name[20];
+	int i;
+
+	snprintf(name, sizeof(name), "%u", hctx->queue_num);
+	hctx_dir = debugfs_create_dir(name, q->mq_debugfs_dir);
+	if (!hctx_dir)
+		return -ENOMEM;
+
+	for (i = 0; i < ARRAY_SIZE(blk_mq_debugfs_hctx_attrs); i++) {
+		const struct blk_mq_debugfs_attr *attr;
+
+		attr = &blk_mq_debugfs_hctx_attrs[i];
+		if (!debugfs_create_file(attr->name, attr->mode, hctx_dir, hctx,
+					 attr->fops))
+			return -ENOMEM;
+	}
+
+	hctx_for_each_ctx(hctx, ctx, i) {
+		if (blk_mq_debugfs_register_ctx(q, ctx, hctx_dir))
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+int blk_mq_debugfs_register_hctxs(struct request_queue *q)
+{
+	struct blk_mq_hw_ctx *hctx;
+	int i;
+
+	if (!q->debugfs_dir)
+		return -ENOENT;
+
+	q->mq_debugfs_dir = debugfs_create_dir("mq", q->debugfs_dir);
+	if (!q->mq_debugfs_dir)
+		goto err;
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		if (blk_mq_debugfs_register_hctx(q, hctx))
+			goto err;
+	}
+
+	return 0;
+
+err:
+	blk_mq_debugfs_unregister_hctxs(q);
+	return -ENOMEM;
+}
+
+void blk_mq_debugfs_unregister_hctxs(struct request_queue *q)
+{
+	debugfs_remove_recursive(q->mq_debugfs_dir);
+	q->mq_debugfs_dir = NULL;
+}
+
+void blk_mq_debugfs_init(void)
+{
+	block_debugfs_root = debugfs_create_dir("block", NULL);
+}
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 2caecaa98e40..91b93ca1c1e0 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -468,6 +468,8 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
 		kobject_put(&hctx->kobj);
 	}
 
+	blk_mq_debugfs_unregister(q);
+
 	kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
 	kobject_del(&q->mq_kobj);
 	kobject_put(&q->mq_kobj);
@@ -517,6 +519,8 @@ int blk_mq_register_dev(struct device *dev, struct request_queue *q)
 
 	kobject_uevent(&q->mq_kobj, KOBJ_ADD);
 
+	blk_mq_debugfs_register(q, kobject_name(&dev->kobj));
+
 	queue_for_each_hw_ctx(q, hctx, i) {
 		ret = blk_mq_register_hctx(hctx);
 		if (ret)
@@ -542,6 +546,8 @@ void blk_mq_sysfs_unregister(struct request_queue *q)
 	if (!q->mq_sysfs_init_done)
 		return;
 
+	blk_mq_debugfs_unregister_hctxs(q);
+
 	queue_for_each_hw_ctx(q, hctx, i)
 		blk_mq_unregister_hctx(hctx);
 }
@@ -554,6 +560,8 @@ int blk_mq_sysfs_register(struct request_queue *q)
 	if (!q->mq_sysfs_init_done)
 		return ret;
 
+	blk_mq_debugfs_register_hctxs(q);
+
 	queue_for_each_hw_ctx(q, hctx, i) {
 		ret = blk_mq_register_hctx(hctx);
 		if (ret)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 84d13b5cafd0..ede835c2b08b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2799,6 +2799,8 @@ void blk_mq_enable_hotplug(void)
 
 static int __init blk_mq_init(void)
 {
+	blk_mq_debugfs_init();
+
 	cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
 				blk_mq_hctx_notify_dead);
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 0c7c034d9ddd..6c24b901acd7 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -78,6 +78,39 @@ extern int blk_mq_sysfs_register(struct request_queue *q);
 extern void blk_mq_sysfs_unregister(struct request_queue *q);
 extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
 
+/*
+ * debugfs helpers
+ */
+#ifdef CONFIG_DEBUG_FS
+void blk_mq_debugfs_init(void);
+int blk_mq_debugfs_register(struct request_queue *q, const char *name);
+void blk_mq_debugfs_unregister(struct request_queue *q);
+int blk_mq_debugfs_register_hctxs(struct request_queue *q);
+void blk_mq_debugfs_unregister_hctxs(struct request_queue *q);
+#else
+static inline void blk_mq_debugfs_init(void)
+{
+}
+
+int blk_mq_debugfs_register(struct request_queue *q, const char *name);
+{
+	return 0;
+}
+
+void blk_mq_debugfs_unregister(struct request_queue *q)
+{
+}
+
+int blk_mq_debugfs_register_hctxs(struct request_queue *q)
+{
+	return 0;
+}
+
+void blk_mq_debugfs_unregister_hctxs(struct request_queue *q)
+{
+}
+#endif
+
 extern void blk_mq_rq_timed_out(struct request *req, bool reserved);
 
 void blk_mq_release(struct request_queue *q);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 25564857f5f8..0ee283f3cffe 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -571,6 +571,11 @@ struct request_queue {
 	struct list_head	tag_set_list;
 	struct bio_set		*bio_split;
 
+#ifdef CONFIG_DEBUG_FS
+	struct dentry		*debugfs_dir;
+	struct dentry		*mq_debugfs_dir;
+#endif
+
 	bool			mq_sysfs_init_done;
 };
 
-- 
cgit v1.2.3


From 24af1ccfe12adddbe17d11801e1689791a4cc282 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Wed, 25 Jan 2017 14:32:13 -0800
Subject: sbitmap: add helpers for dumping to a seq_file

This is useful debugging information that will be used in the blk-mq
debugfs directory.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Omar Sandoval <osandov@fb.com>

Changed 'weight' to 'busy'.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/sbitmap.h | 30 ++++++++++++++++
 lib/sbitmap.c           | 91 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 121 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index f017fd6e69c4..d4e0a204c118 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -258,6 +258,26 @@ static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr)
 
 unsigned int sbitmap_weight(const struct sbitmap *sb);
 
+/**
+ * sbitmap_show() - Dump &struct sbitmap information to a &struct seq_file.
+ * @sb: Bitmap to show.
+ * @m: struct seq_file to write to.
+ *
+ * This is intended for debugging. The format may change at any time.
+ */
+void sbitmap_show(struct sbitmap *sb, struct seq_file *m);
+
+/**
+ * sbitmap_bitmap_show() - Write a hex dump of a &struct sbitmap to a &struct
+ * seq_file.
+ * @sb: Bitmap to show.
+ * @m: struct seq_file to write to.
+ *
+ * This is intended for debugging. The output isn't guaranteed to be internally
+ * consistent.
+ */
+void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m);
+
 /**
  * sbitmap_queue_init_node() - Initialize a &struct sbitmap_queue on a specific
  * memory node.
@@ -370,4 +390,14 @@ static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq,
  */
 void sbitmap_queue_wake_all(struct sbitmap_queue *sbq);
 
+/**
+ * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct
+ * seq_file.
+ * @sbq: Bitmap queue to show.
+ * @m: struct seq_file to write to.
+ *
+ * This is intended for debugging. The format may change at any time.
+ */
+void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m);
+
 #endif /* __LINUX_SCALE_BITMAP_H */
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 8f5c3b268c77..55e11c4b2f3b 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -17,6 +17,7 @@
 
 #include <linux/random.h>
 #include <linux/sbitmap.h>
+#include <linux/seq_file.h>
 
 int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
 		      gfp_t flags, int node)
@@ -180,6 +181,62 @@ unsigned int sbitmap_weight(const struct sbitmap *sb)
 }
 EXPORT_SYMBOL_GPL(sbitmap_weight);
 
+void sbitmap_show(struct sbitmap *sb, struct seq_file *m)
+{
+	seq_printf(m, "depth=%u\n", sb->depth);
+	seq_printf(m, "busy=%u\n", sbitmap_weight(sb));
+	seq_printf(m, "bits_per_word=%u\n", 1U << sb->shift);
+	seq_printf(m, "map_nr=%u\n", sb->map_nr);
+}
+EXPORT_SYMBOL_GPL(sbitmap_show);
+
+static inline void emit_byte(struct seq_file *m, unsigned int offset, u8 byte)
+{
+	if ((offset & 0xf) == 0) {
+		if (offset != 0)
+			seq_putc(m, '\n');
+		seq_printf(m, "%08x:", offset);
+	}
+	if ((offset & 0x1) == 0)
+		seq_putc(m, ' ');
+	seq_printf(m, "%02x", byte);
+}
+
+void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m)
+{
+	u8 byte = 0;
+	unsigned int byte_bits = 0;
+	unsigned int offset = 0;
+	int i;
+
+	for (i = 0; i < sb->map_nr; i++) {
+		unsigned long word = READ_ONCE(sb->map[i].word);
+		unsigned int word_bits = READ_ONCE(sb->map[i].depth);
+
+		while (word_bits > 0) {
+			unsigned int bits = min(8 - byte_bits, word_bits);
+
+			byte |= (word & (BIT(bits) - 1)) << byte_bits;
+			byte_bits += bits;
+			if (byte_bits == 8) {
+				emit_byte(m, offset, byte);
+				byte = 0;
+				byte_bits = 0;
+				offset++;
+			}
+			word >>= bits;
+			word_bits -= bits;
+		}
+	}
+	if (byte_bits) {
+		emit_byte(m, offset, byte);
+		offset++;
+	}
+	if (offset)
+		seq_putc(m, '\n');
+}
+EXPORT_SYMBOL_GPL(sbitmap_bitmap_show);
+
 static unsigned int sbq_calc_wake_batch(unsigned int depth)
 {
 	unsigned int wake_batch;
@@ -377,3 +434,37 @@ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq)
 	}
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_wake_all);
+
+void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m)
+{
+	bool first;
+	int i;
+
+	sbitmap_show(&sbq->sb, m);
+
+	seq_puts(m, "alloc_hint={");
+	first = true;
+	for_each_possible_cpu(i) {
+		if (!first)
+			seq_puts(m, ", ");
+		first = false;
+		seq_printf(m, "%u", *per_cpu_ptr(sbq->alloc_hint, i));
+	}
+	seq_puts(m, "}\n");
+
+	seq_printf(m, "wake_batch=%u\n", sbq->wake_batch);
+	seq_printf(m, "wake_index=%d\n", atomic_read(&sbq->wake_index));
+
+	seq_puts(m, "ws={\n");
+	for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
+		struct sbq_wait_state *ws = &sbq->ws[i];
+
+		seq_printf(m, "\t{.wait_cnt=%d, .wait=%s},\n",
+			   atomic_read(&ws->wait_cnt),
+			   waitqueue_active(&ws->wait) ? "active" : "inactive");
+	}
+	seq_puts(m, "}\n");
+
+	seq_printf(m, "round_robin=%d\n", sbq->round_robin);
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_show);
-- 
cgit v1.2.3


From 50e1dab86aa2c10cbca2f754aae9542169403141 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Thu, 26 Jan 2017 14:42:34 -0700
Subject: blk-mq-sched: fix starvation for multiple hardware queues and shared
 tags

If we have both multiple hardware queues and shared tag map between
devices, we need to ensure that we propagate the hardware queue
restart bit higher up. This is because we can get into a situation
where we don't have any IO pending on a hardware queue, yet we fail
getting a tag to start new IO. If that happens, it's not enough to
mark the hardware queue as needing a restart, we need to bubble
that up to the higher level queue as well.

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Tested-by: Hannes Reinecke <hare@suse.com>
---
 block/blk-mq-sched.c   | 28 ++++++++++++++++++++++++++++
 block/blk-mq-sched.h   | 15 +++++++++------
 block/blk-mq.c         |  3 ++-
 block/blk-mq.h         |  1 +
 include/linux/blkdev.h |  1 +
 5 files changed, 41 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 4cee060a292d..fcc0e893d687 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -301,6 +301,34 @@ bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq)
 }
 EXPORT_SYMBOL_GPL(blk_mq_sched_bypass_insert);
 
+static void blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
+{
+	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
+		clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+		if (blk_mq_hctx_has_pending(hctx))
+			blk_mq_run_hw_queue(hctx, true);
+	}
+}
+
+void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx)
+{
+	unsigned int i;
+
+	if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
+		blk_mq_sched_restart_hctx(hctx);
+	else {
+		struct request_queue *q = hctx->queue;
+
+		if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags))
+			return;
+
+		clear_bit(QUEUE_FLAG_RESTART, &q->queue_flags);
+
+		queue_for_each_hw_ctx(q, hctx, i)
+			blk_mq_sched_restart_hctx(hctx);
+	}
+}
+
 static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
 				   struct blk_mq_hw_ctx *hctx,
 				   unsigned int hctx_idx)
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 6b465bc7014c..becbc7840364 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -19,6 +19,7 @@ bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq);
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio);
 bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio);
 bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
+void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx);
 
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
 void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
@@ -123,11 +124,6 @@ blk_mq_sched_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
 	BUG_ON(rq->internal_tag == -1);
 
 	blk_mq_put_tag(hctx, hctx->sched_tags, rq->mq_ctx, rq->internal_tag);
-
-	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
-		clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
-		blk_mq_run_hw_queue(hctx, true);
-	}
 }
 
 static inline void blk_mq_sched_started_request(struct request *rq)
@@ -160,8 +156,15 @@ static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
 
 static inline void blk_mq_sched_mark_restart(struct blk_mq_hw_ctx *hctx)
 {
-	if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+	if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
 		set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+		if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
+			struct request_queue *q = hctx->queue;
+
+			if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags))
+				set_bit(QUEUE_FLAG_RESTART, &q->queue_flags);
+		}
+	}
 }
 
 static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 711883384585..21795c6575bc 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -40,7 +40,7 @@ static LIST_HEAD(all_q_list);
 /*
  * Check if any of the ctx's have pending work in this hardware queue
  */
-static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
+bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
 {
 	return sbitmap_any_bit_set(&hctx->ctx_map) ||
 			!list_empty_careful(&hctx->dispatch) ||
@@ -345,6 +345,7 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
 	if (sched_tag != -1)
 		blk_mq_sched_completed_request(hctx, rq);
+	blk_mq_sched_restart_queues(hctx);
 	blk_queue_exit(q);
 }
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 6c24b901acd7..077a4003f1fd 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -33,6 +33,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 void blk_mq_wake_waiters(struct request_queue *q);
 bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *);
 void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
+bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx);
 
 /*
  * Internal helpers for allocating/freeing the request map
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0ee283f3cffe..883b8abe4305 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -607,6 +607,7 @@ struct request_queue {
 #define QUEUE_FLAG_FLUSH_NQ    25	/* flush not queueuable */
 #define QUEUE_FLAG_DAX         26	/* device supports DAX */
 #define QUEUE_FLAG_STATS       27	/* track rq completion times */
+#define QUEUE_FLAG_RESTART     28	/* queue needs restart at completion */
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_STACKABLE)	|	\
-- 
cgit v1.2.3


From c13660a08c8b3bb49def4374bfd414aaaa564662 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Thu, 26 Jan 2017 12:40:07 -0700
Subject: blk-mq-sched: change ->dispatch_requests() to ->dispatch_request()

When we invoke dispatch_requests(), the scheduler empties everything
into the passed in list. This isn't always a good thing, since it
means that we remove items that we could have potentially merged
with.

Change the function to dispatch single requests at the time. If
we do that, we can backoff exactly at the point where the device
can't consume more IO, and leave the rest with the scheduler for
better merging and future dispatch decision making.

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Tested-by: Hannes Reinecke <hare@suse.com>
---
 block/blk-mq-sched.c     | 23 +++++++++++++++--------
 block/blk-mq.c           |  2 +-
 block/mq-deadline.c      | 10 ++++++----
 include/linux/elevator.h |  2 +-
 4 files changed, 23 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index fcc0e893d687..c27613de80c5 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -201,15 +201,22 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 	 * leave them there for as long as we can. Mark the hw queue as
 	 * needing a restart in that case.
 	 */
-	if (list_empty(&rq_list)) {
-		if (e && e->type->ops.mq.dispatch_requests)
-			e->type->ops.mq.dispatch_requests(hctx, &rq_list);
-		else
-			blk_mq_flush_busy_ctxs(hctx, &rq_list);
-	} else
+	if (!list_empty(&rq_list)) {
 		blk_mq_sched_mark_restart(hctx);
-
-	blk_mq_dispatch_rq_list(hctx, &rq_list);
+		blk_mq_dispatch_rq_list(hctx, &rq_list);
+	} else if (!e || !e->type->ops.mq.dispatch_request) {
+		blk_mq_flush_busy_ctxs(hctx, &rq_list);
+		blk_mq_dispatch_rq_list(hctx, &rq_list);
+	} else {
+		do {
+			struct request *rq;
+
+			rq = e->type->ops.mq.dispatch_request(hctx);
+			if (!rq)
+				break;
+			list_add(&rq->queuelist, &rq_list);
+		} while (blk_mq_dispatch_rq_list(hctx, &rq_list));
+	}
 }
 
 void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 21795c6575bc..301ae29fd229 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -998,7 +998,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 	 */
 	if (!list_empty(list)) {
 		spin_lock(&hctx->lock);
-		list_splice(list, &hctx->dispatch);
+		list_splice_init(list, &hctx->dispatch);
 		spin_unlock(&hctx->lock);
 
 		/*
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index a01986d7b6fb..d93ec713fa62 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -287,14 +287,16 @@ done:
 	return rq;
 }
 
-static void dd_dispatch_requests(struct blk_mq_hw_ctx *hctx,
-				 struct list_head *rq_list)
+static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 {
 	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+	struct request *rq;
 
 	spin_lock(&dd->lock);
-	blk_mq_sched_move_to_dispatch(hctx, rq_list, __dd_dispatch_request);
+	rq = __dd_dispatch_request(hctx);
 	spin_unlock(&dd->lock);
+
+	return rq;
 }
 
 static void dd_exit_queue(struct elevator_queue *e)
@@ -517,7 +519,7 @@ static struct elv_fs_entry deadline_attrs[] = {
 static struct elevator_type mq_deadline = {
 	.ops.mq = {
 		.insert_requests	= dd_insert_requests,
-		.dispatch_requests	= dd_dispatch_requests,
+		.dispatch_request	= dd_dispatch_request,
 		.next_request		= elv_rb_latter_request,
 		.former_request		= elv_rb_former_request,
 		.bio_merge		= dd_bio_merge,
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index ecb96fd67c6d..b5825c4f06f7 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -92,7 +92,7 @@ struct elevator_mq_ops {
 	struct request *(*get_request)(struct request_queue *, unsigned int, struct blk_mq_alloc_data *);
 	void (*put_request)(struct request *);
 	void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool);
-	void (*dispatch_requests)(struct blk_mq_hw_ctx *, struct list_head *);
+	struct request *(*dispatch_request)(struct blk_mq_hw_ctx *);
 	bool (*has_work)(struct blk_mq_hw_ctx *);
 	void (*completed_request)(struct blk_mq_hw_ctx *, struct request *);
 	void (*started_request)(struct request *);
-- 
cgit v1.2.3


From d35a00b8e33dab7385f724e713ae71c8be0a49f4 Mon Sep 17 00:00:00 2001
From: Felix Jia <felix.jia@alliedtelesis.co.nz>
Date: Thu, 26 Jan 2017 16:59:17 +1300
Subject: net/ipv6: allow sysctl to change link-local address generation mode

The address generation mode for IPv6 link-local can only be configured
by netlink messages. This patch adds the ability to change the address
generation mode via sysctl.

v1 -> v2
Removed the rtnl lock and switch to use RCU lock to iterate through
the netdev list.

v2 -> v3
Removed the addrgenmode variable from the idev structure and use the
systcl storage for the flag.

Simplifed the logic for sysctl handling by removing the supported
for all operation.

Added support for more types of tunnel interfaces for link-local
address generation.

Based the patches from net-next.

v3 -> v4
Removed unnecessary whitespace changes.

Signed-off-by: Felix Jia <felix.jia@alliedtelesis.co.nz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h      |   1 +
 include/net/if_inet6.h    |   1 -
 include/uapi/linux/ipv6.h |   1 +
 net/ipv6/addrconf.c       | 104 +++++++++++++++++++++++++++++++++++++---------
 4 files changed, 86 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 671d014e6429..71be5b330d21 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -69,6 +69,7 @@ struct ipv6_devconf {
 	__s32		seg6_require_hmac;
 #endif
 	__u32		enhanced_dad;
+	__u32		addr_gen_mode;
 
 	struct ctl_table_header *sysctl_header;
 };
diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index 0fa4c324b713..f656f9051aca 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -205,7 +205,6 @@ struct inet6_dev {
 	__s32			rs_interval;	/* in jiffies */
 	__u8			rs_probes;
 
-	__u8			addr_gen_mode;
 	unsigned long		tstamp; /* ipv6InterfaceTable update timestamp */
 	struct rcu_head		rcu;
 };
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index eaf65dc82e22..8ef9e75e004e 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -182,6 +182,7 @@ enum {
 	DEVCONF_SEG6_ENABLED,
 	DEVCONF_SEG6_REQUIRE_HMAC,
 	DEVCONF_ENHANCED_DAD,
+	DEVCONF_ADDR_GEN_MODE,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index ac9bd5620f81..e35259dd17ba 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -243,6 +243,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	.seg6_require_hmac	= 0,
 #endif
 	.enhanced_dad           = 1,
+	.addr_gen_mode		= IN6_ADDR_GEN_MODE_EUI64,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -294,6 +295,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	.seg6_require_hmac	= 0,
 #endif
 	.enhanced_dad           = 1,
+	.addr_gen_mode		= IN6_ADDR_GEN_MODE_EUI64,
 };
 
 /* Check if a valid qdisc is available */
@@ -386,9 +388,9 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
 	memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf));
 
 	if (ndev->cnf.stable_secret.initialized)
-		ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
+		ndev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
 	else
-		ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64;
+		ndev->cnf.addr_gen_mode = ipv6_devconf_dflt.addr_gen_mode;
 
 	ndev->cnf.mtu6 = dev->mtu;
 	ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl);
@@ -2387,8 +2389,8 @@ static void manage_tempaddrs(struct inet6_dev *idev,
 
 static bool is_addr_mode_generate_stable(struct inet6_dev *idev)
 {
-	return idev->addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY ||
-	       idev->addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM;
+	return idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY ||
+	       idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM;
 }
 
 int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
@@ -3152,7 +3154,7 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
 
 	ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0);
 
-	switch (idev->addr_gen_mode) {
+	switch (idev->cnf.addr_gen_mode) {
 	case IN6_ADDR_GEN_MODE_RANDOM:
 		ipv6_gen_mode_random_init(idev);
 		/* fallthrough */
@@ -3204,8 +3206,8 @@ static void addrconf_dev_config(struct net_device *dev)
 
 	/* this device type has no EUI support */
 	if (dev->type == ARPHRD_NONE &&
-	    idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64)
-		idev->addr_gen_mode = IN6_ADDR_GEN_MODE_RANDOM;
+	    idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64)
+		idev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_RANDOM;
 
 	addrconf_addr_gen(idev, false);
 }
@@ -4982,6 +4984,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_SEG6_REQUIRE_HMAC] = cnf->seg6_require_hmac;
 #endif
 	array[DEVCONF_ENHANCED_DAD] = cnf->enhanced_dad;
+	array[DEVCONF_ADDR_GEN_MODE] = cnf->addr_gen_mode;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -5093,7 +5096,7 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev,
 	if (!nla)
 		goto nla_put_failure;
 
-	if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->addr_gen_mode))
+	if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->cnf.addr_gen_mode))
 		goto nla_put_failure;
 
 	read_lock_bh(&idev->lock);
@@ -5211,6 +5214,26 @@ static int inet6_validate_link_af(const struct net_device *dev,
 	return nla_parse_nested(tb, IFLA_INET6_MAX, nla, inet6_af_policy);
 }
 
+static int check_addr_gen_mode(int mode)
+{
+	if (mode != IN6_ADDR_GEN_MODE_EUI64 &&
+	    mode != IN6_ADDR_GEN_MODE_NONE &&
+	    mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
+	    mode != IN6_ADDR_GEN_MODE_RANDOM)
+		return -EINVAL;
+	return 1;
+}
+
+static int check_stable_privacy(struct inet6_dev *idev, struct net *net,
+				int mode)
+{
+	if (mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
+	    !idev->cnf.stable_secret.initialized &&
+	    !net->ipv6.devconf_dflt->stable_secret.initialized)
+		return -EINVAL;
+	return 1;
+}
+
 static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla)
 {
 	int err = -EINVAL;
@@ -5232,18 +5255,11 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla)
 	if (tb[IFLA_INET6_ADDR_GEN_MODE]) {
 		u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]);
 
-		if (mode != IN6_ADDR_GEN_MODE_EUI64 &&
-		    mode != IN6_ADDR_GEN_MODE_NONE &&
-		    mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
-		    mode != IN6_ADDR_GEN_MODE_RANDOM)
-			return -EINVAL;
-
-		if (mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
-		    !idev->cnf.stable_secret.initialized &&
-		    !dev_net(dev)->ipv6.devconf_dflt->stable_secret.initialized)
+		if (check_addr_gen_mode(mode) < 0 ||
+		    check_stable_privacy(idev, dev_net(dev), mode) < 0)
 			return -EINVAL;
 
-		idev->addr_gen_mode = mode;
+		idev->cnf.addr_gen_mode = mode;
 		err = 0;
 	}
 
@@ -5652,6 +5668,47 @@ int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write,
 	return ret;
 }
 
+static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write,
+					 void __user *buffer, size_t *lenp,
+					 loff_t *ppos)
+{
+	int ret = 0;
+	int new_val;
+	struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1;
+	struct net *net = (struct net *)ctl->extra2;
+
+	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+
+	if (write) {
+		new_val = *((int *)ctl->data);
+
+		if (check_addr_gen_mode(new_val) < 0)
+			return -EINVAL;
+
+		/* request for default */
+		if (&net->ipv6.devconf_dflt->addr_gen_mode == ctl->data) {
+			ipv6_devconf_dflt.addr_gen_mode = new_val;
+
+		/* request for individual net device */
+		} else {
+			if (!idev)
+				return ret;
+
+			if (check_stable_privacy(idev, net, new_val) < 0)
+				return -EINVAL;
+
+			if (idev->cnf.addr_gen_mode != new_val) {
+				idev->cnf.addr_gen_mode = new_val;
+				rtnl_lock();
+				addrconf_dev_config(idev->dev);
+				rtnl_unlock();
+			}
+		}
+	}
+
+	return ret;
+}
+
 static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write,
 					 void __user *buffer, size_t *lenp,
 					 loff_t *ppos)
@@ -5702,14 +5759,14 @@ static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write,
 			struct inet6_dev *idev = __in6_dev_get(dev);
 
 			if (idev) {
-				idev->addr_gen_mode =
+				idev->cnf.addr_gen_mode =
 					IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
 			}
 		}
 	} else {
 		struct inet6_dev *idev = ctl->extra1;
 
-		idev->addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
+		idev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
 	}
 
 out:
@@ -6096,6 +6153,13 @@ static const struct ctl_table addrconf_sysctl[] = {
 		.mode           = 0644,
 		.proc_handler   = proc_dointvec,
 	},
+	{
+		.procname		= "addr_gen_mode",
+		.data			= &ipv6_devconf.addr_gen_mode,
+		.maxlen			= sizeof(int),
+		.mode			= 0644,
+		.proc_handler	= addrconf_sysctl_addr_gen_mode,
+	},
 	{
 		/* sentinel */
 	}
-- 
cgit v1.2.3


From f73f44eb00cb136990cfb7d40e436c13d7669ec8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 27 Jan 2017 08:30:47 -0700
Subject: block: add a op_is_flush helper

This centralizes the checks for bios that needs to be go into the flush
state machine.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c             |  8 ++++----
 block/blk-mq-sched.c         |  5 ++---
 block/blk-mq.c               |  4 ++--
 drivers/md/bcache/request.c  |  2 +-
 drivers/md/dm-cache-target.c | 13 +++----------
 drivers/md/dm-thin.c         | 13 +++++--------
 include/linux/blk_types.h    |  9 +++++++++
 7 files changed, 26 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index a61f1407f4f6..b830e14117dd 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1035,7 +1035,7 @@ static bool blk_rq_should_init_elevator(struct bio *bio)
 	 * Flush requests do not use the elevator so skip initialization.
 	 * This allows a request to share the flush and elevator data.
 	 */
-	if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA))
+	if (op_is_flush(bio->bi_opf))
 		return false;
 
 	return true;
@@ -1641,7 +1641,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
 		return BLK_QC_T_NONE;
 	}
 
-	if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) {
+	if (op_is_flush(bio->bi_opf)) {
 		spin_lock_irq(q->queue_lock);
 		where = ELEVATOR_INSERT_FLUSH;
 		goto get_rq;
@@ -2145,7 +2145,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 	 */
 	BUG_ON(blk_queued_rq(rq));
 
-	if (rq->cmd_flags & (REQ_PREFLUSH | REQ_FUA))
+	if (op_is_flush(rq->cmd_flags))
 		where = ELEVATOR_INSERT_FLUSH;
 
 	add_acct_request(q, rq, where);
@@ -3256,7 +3256,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 		/*
 		 * rq is already accounted, so use raw insert
 		 */
-		if (rq->cmd_flags & (REQ_PREFLUSH | REQ_FUA))
+		if (op_is_flush(rq->cmd_flags))
 			__elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH);
 		else
 			__elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index c27613de80c5..4139b07ab33b 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -111,7 +111,6 @@ struct request *blk_mq_sched_get_request(struct request_queue *q,
 	struct blk_mq_hw_ctx *hctx;
 	struct blk_mq_ctx *ctx;
 	struct request *rq;
-	const bool is_flush = op & (REQ_PREFLUSH | REQ_FUA);
 
 	blk_queue_enter_live(q);
 	ctx = blk_mq_get_ctx(q);
@@ -126,7 +125,7 @@ struct request *blk_mq_sched_get_request(struct request_queue *q,
 		 * Flush requests are special and go directly to the
 		 * dispatch list.
 		 */
-		if (!is_flush && e->type->ops.mq.get_request) {
+		if (!op_is_flush(op) && e->type->ops.mq.get_request) {
 			rq = e->type->ops.mq.get_request(q, op, data);
 			if (rq)
 				rq->rq_flags |= RQF_QUEUED;
@@ -139,7 +138,7 @@ struct request *blk_mq_sched_get_request(struct request_queue *q,
 	}
 
 	if (rq) {
-		if (!is_flush) {
+		if (!op_is_flush(op)) {
 			rq->elv.icq = NULL;
 			if (e && e->type->icq_cache)
 				blk_mq_sched_assign_ioc(q, rq, bio);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 301ae29fd229..da2123dd681e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1406,7 +1406,7 @@ insert:
 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 {
 	const int is_sync = op_is_sync(bio->bi_opf);
-	const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
+	const int is_flush_fua = op_is_flush(bio->bi_opf);
 	struct blk_mq_alloc_data data = { .flags = 0 };
 	struct request *rq;
 	unsigned int request_count = 0, srcu_idx;
@@ -1527,7 +1527,7 @@ done:
 static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 {
 	const int is_sync = op_is_sync(bio->bi_opf);
-	const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
+	const int is_flush_fua = op_is_flush(bio->bi_opf);
 	struct blk_plug *plug;
 	unsigned int request_count = 0;
 	struct blk_mq_alloc_data data = { .flags = 0 };
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 76d20875503c..01035e718c1c 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -666,7 +666,7 @@ static inline struct search *search_alloc(struct bio *bio,
 	s->iop.write_prio	= 0;
 	s->iop.error		= 0;
 	s->iop.flags		= 0;
-	s->iop.flush_journal	= (bio->bi_opf & (REQ_PREFLUSH|REQ_FUA)) != 0;
+	s->iop.flush_journal	= op_is_flush(bio->bi_opf);
 	s->iop.wq		= bcache_wq;
 
 	return s;
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index e04c61e0839e..5b9cf56de8ef 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -787,8 +787,7 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
 	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
 
 	spin_lock_irqsave(&cache->lock, flags);
-	if (cache->need_tick_bio &&
-	    !(bio->bi_opf & (REQ_FUA | REQ_PREFLUSH)) &&
+	if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) &&
 	    bio_op(bio) != REQ_OP_DISCARD) {
 		pb->tick = true;
 		cache->need_tick_bio = false;
@@ -828,11 +827,6 @@ static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
 	return to_oblock(block_nr);
 }
 
-static int bio_triggers_commit(struct cache *cache, struct bio *bio)
-{
-	return bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
-}
-
 /*
  * You must increment the deferred set whilst the prison cell is held.  To
  * encourage this, we ask for 'cell' to be passed in.
@@ -884,7 +878,7 @@ static void issue(struct cache *cache, struct bio *bio)
 {
 	unsigned long flags;
 
-	if (!bio_triggers_commit(cache, bio)) {
+	if (!op_is_flush(bio->bi_opf)) {
 		accounted_request(cache, bio);
 		return;
 	}
@@ -1069,8 +1063,7 @@ static void dec_io_migrations(struct cache *cache)
 
 static bool discard_or_flush(struct bio *bio)
 {
-	return bio_op(bio) == REQ_OP_DISCARD ||
-	       bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
+	return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf);
 }
 
 static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index d1c05c12a9db..110982db4b48 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -699,7 +699,7 @@ static void remap_to_origin(struct thin_c *tc, struct bio *bio)
 
 static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
 {
-	return (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) &&
+	return op_is_flush(bio->bi_opf) &&
 		dm_thin_changed_this_transaction(tc->td);
 }
 
@@ -870,8 +870,7 @@ static void __inc_remap_and_issue_cell(void *context,
 	struct bio *bio;
 
 	while ((bio = bio_list_pop(&cell->bios))) {
-		if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) ||
-		    bio_op(bio) == REQ_OP_DISCARD)
+		if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD)
 			bio_list_add(&info->defer_bios, bio);
 		else {
 			inc_all_io_entry(info->tc->pool, bio);
@@ -1716,9 +1715,8 @@ static void __remap_and_issue_shared_cell(void *context,
 	struct bio *bio;
 
 	while ((bio = bio_list_pop(&cell->bios))) {
-		if ((bio_data_dir(bio) == WRITE) ||
-		    (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) ||
-		     bio_op(bio) == REQ_OP_DISCARD))
+		if (bio_data_dir(bio) == WRITE || op_is_flush(bio->bi_opf) ||
+		    bio_op(bio) == REQ_OP_DISCARD)
 			bio_list_add(&info->defer_bios, bio);
 		else {
 			struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));;
@@ -2635,8 +2633,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
 		return DM_MAPIO_SUBMITTED;
 	}
 
-	if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) ||
-	    bio_op(bio) == REQ_OP_DISCARD) {
+	if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD) {
 		thin_defer_bio_with_throttle(tc, bio);
 		return DM_MAPIO_SUBMITTED;
 	}
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 0e5b1cd5113c..37c9a43c5e78 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -220,6 +220,15 @@ static inline bool op_is_write(unsigned int op)
 	return (op & 1);
 }
 
+/*
+ * Check if the bio or request is one that needs special treatment in the
+ * flush state machine.
+ */
+static inline bool op_is_flush(unsigned int op)
+{
+	return op & (REQ_FUA | REQ_PREFLUSH);
+}
+
 /*
  * Reads are always treated as synchronous, as are requests with the FUA or
  * PREFLUSH flag.  Other operations may be marked as synchronous using the
-- 
cgit v1.2.3


From 0fc9ae107669760c2a8658cb5b5876dbe525e08d Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <rafal@milecki.pl>
Date: Fri, 27 Jan 2017 14:07:01 +0100
Subject: net: phy: broadcom: add support for BCM54210E
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It's Broadcom PHY simply described as single-port
RGMII 10/100/1000BASE-T PHY. It requires disabling delay skew and GTXCLK
bits.

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/broadcom.c | 34 +++++++++++++++++++++++++++++++++-
 include/linux/brcmphy.h    |  1 +
 2 files changed, 34 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index 97d1c057c0a1..794b9ec81ba5 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -30,6 +30,22 @@ MODULE_DESCRIPTION("Broadcom PHY driver");
 MODULE_AUTHOR("Maciej W. Rozycki");
 MODULE_LICENSE("GPL");
 
+static int bcm54210e_config_init(struct phy_device *phydev)
+{
+	int val;
+
+	val = bcm54xx_auxctl_read(phydev, MII_BCM54XX_AUXCTL_SHDWSEL_MISC);
+	val &= ~MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_SKEW_EN;
+	val |= MII_BCM54XX_AUXCTL_MISC_WREN;
+	bcm54xx_auxctl_write(phydev, MII_BCM54XX_AUXCTL_SHDWSEL_MISC, val);
+
+	val = bcm_phy_read_shadow(phydev, BCM54810_SHD_CLK_CTL);
+	val &= ~BCM54810_SHD_CLK_CTL_GTXCLK_EN;
+	bcm_phy_write_shadow(phydev, BCM54810_SHD_CLK_CTL, val);
+
+	return 0;
+}
+
 static int bcm54810_config(struct phy_device *phydev)
 {
 	int rc, val;
@@ -230,7 +246,11 @@ static int bcm54xx_config_init(struct phy_device *phydev)
 	    (phydev->dev_flags & PHY_BRCM_AUTO_PWRDWN_ENABLE))
 		bcm54xx_adjust_rxrefclk(phydev);
 
-	if (BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54810) {
+	if (BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54210E) {
+		err = bcm54210e_config_init(phydev);
+		if (err)
+			return err;
+	} else if (BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54810) {
 		err = bcm54810_config(phydev);
 		if (err)
 			return err;
@@ -541,6 +561,17 @@ static struct phy_driver broadcom_drivers[] = {
 	.read_status	= genphy_read_status,
 	.ack_interrupt	= bcm_phy_ack_intr,
 	.config_intr	= bcm_phy_config_intr,
+}, {
+	.phy_id		= PHY_ID_BCM54210E,
+	.phy_id_mask	= 0xfffffff0,
+	.name		= "Broadcom BCM54210E",
+	.features	= PHY_GBIT_FEATURES,
+	.flags		= PHY_HAS_MAGICANEG | PHY_HAS_INTERRUPT,
+	.config_init	= bcm54xx_config_init,
+	.config_aneg	= genphy_config_aneg,
+	.read_status	= genphy_read_status,
+	.ack_interrupt	= bcm_phy_ack_intr,
+	.config_intr	= bcm_phy_config_intr,
 }, {
 	.phy_id		= PHY_ID_BCM5461,
 	.phy_id_mask	= 0xfffffff0,
@@ -680,6 +711,7 @@ module_phy_driver(broadcom_drivers);
 static struct mdio_device_id __maybe_unused broadcom_tbl[] = {
 	{ PHY_ID_BCM5411, 0xfffffff0 },
 	{ PHY_ID_BCM5421, 0xfffffff0 },
+	{ PHY_ID_BCM54210E, 0xfffffff0 },
 	{ PHY_ID_BCM5461, 0xfffffff0 },
 	{ PHY_ID_BCM54612E, 0xfffffff0 },
 	{ PHY_ID_BCM54616S, 0xfffffff0 },
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index cf93f1399d3e..5881d1fdc1fb 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -17,6 +17,7 @@
 #define PHY_ID_BCM5482			0x0143bcb0
 #define PHY_ID_BCM5411			0x00206070
 #define PHY_ID_BCM5421			0x002060e0
+#define PHY_ID_BCM54210E		0x600d84a0
 #define PHY_ID_BCM5464			0x002060b0
 #define PHY_ID_BCM5461			0x002060c0
 #define PHY_ID_BCM54612E		0x03625e60
-- 
cgit v1.2.3


From 2b368b234ec43abbf46f2983478c438d36e58134 Mon Sep 17 00:00:00 2001
From: Tobias Klauser <tklauser@distanz.ch>
Date: Fri, 27 Jan 2017 15:26:32 +0100
Subject: net: wan: Remove unused stats member from struct frad_local

The stats member of struct frad_locl is used neither by the dlci nor the
sdla driver, so it might as well be removed.

Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_frad.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_frad.h b/include/linux/if_frad.h
index 4316aa173dde..46df7e565d6f 100644
--- a/include/linux/if_frad.h
+++ b/include/linux/if_frad.h
@@ -66,8 +66,6 @@ struct dlci_local
 
 struct frad_local
 {
-   struct net_device_stats stats;
-
    /* devices which this FRAD is slaved to */
    struct net_device     *master[CONFIG_DLCI_MAX];
    short             dlci[CONFIG_DLCI_MAX];
-- 
cgit v1.2.3


From f617f27653c4d9f5b2aa43d567ac0405df889944 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Fri, 27 Jan 2017 15:02:26 +0000
Subject: net: implement netif_cond_dbg macro

For reporting things that may or may not be serious, depending on some
 condition, netif_cond_dbg will check the condition and print the report
 at either dbg (if the condition is true) or the specified level.

Suggested-by: Jon Cooper <jcooper@solarflare.com>
Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d63cacb67ea6..9511e5a1cfc6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4347,6 +4347,15 @@ do {								\
 })
 #endif
 
+/* if @cond then downgrade to debug, else print at @level */
+#define netif_cond_dbg(priv, type, netdev, cond, level, fmt, args...)     \
+	do {                                                              \
+		if (cond)                                                 \
+			netif_dbg(priv, type, netdev, fmt, ##args);       \
+		else                                                      \
+			netif_ ## level(priv, type, netdev, fmt, ##args); \
+	} while (0)
+
 #if defined(VERBOSE_DEBUG)
 #define netif_vdbg	netif_dbg
 #else
-- 
cgit v1.2.3


From b18b6a9cef7f30e9a8b7738d5fc8d568cf660855 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nicolas.pitre@linaro.org>
Date: Sat, 21 Jan 2017 00:09:08 -0500
Subject: timers: Omit POSIX timer stuff from task_struct when disabled

When CONFIG_POSIX_TIMERS is disabled, it is preferable to remove related
structures from struct task_struct and struct signal_struct as they
won't contain anything useful and shouldn't be relied upon by mistake.
Code still referencing those structures is also disabled here.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 fs/proc/base.c            |  4 ++--
 include/linux/init_task.h | 40 +++++++++++++++++++++++++---------------
 include/linux/sched.h     | 13 ++++++++++---
 kernel/fork.c             | 10 +++++++++-
 kernel/sched/rt.c         |  4 ++++
 kernel/sched/stats.h      | 32 ++++++++++++++++++++------------
 6 files changed, 70 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8e7e61b28f31..03deeac1e835 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2179,7 +2179,7 @@ static const struct file_operations proc_map_files_operations = {
 	.llseek		= generic_file_llseek,
 };
 
-#ifdef CONFIG_CHECKPOINT_RESTORE
+#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
 struct timers_private {
 	struct pid *pid;
 	struct task_struct *task;
@@ -2936,7 +2936,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 	REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
 	REG("setgroups",  S_IRUGO|S_IWUSR, proc_setgroups_operations),
 #endif
-#ifdef CONFIG_CHECKPOINT_RESTORE
+#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
 	REG("timers",	  S_IRUGO, proc_timers_operations),
 #endif
 	REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations),
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 325f649d77ff..3a85d61f7614 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -42,6 +42,27 @@ extern struct fs_struct init_fs;
 #define INIT_PREV_CPUTIME(x)
 #endif
 
+#ifdef CONFIG_POSIX_TIMERS
+#define INIT_POSIX_TIMERS(s)						\
+	.posix_timers = LIST_HEAD_INIT(s.posix_timers),
+#define INIT_CPU_TIMERS(s)						\
+	.cpu_timers = {							\
+		LIST_HEAD_INIT(s.cpu_timers[0]),			\
+		LIST_HEAD_INIT(s.cpu_timers[1]),			\
+		LIST_HEAD_INIT(s.cpu_timers[2]),								\
+	},
+#define INIT_CPUTIMER(s)						\
+	.cputimer	= { 						\
+		.cputime_atomic	= INIT_CPUTIME_ATOMIC,			\
+		.running	= false,				\
+		.checking_timer = false,				\
+	},
+#else
+#define INIT_POSIX_TIMERS(s)
+#define INIT_CPU_TIMERS(s)
+#define INIT_CPUTIMER(s)
+#endif
+
 #define INIT_SIGNALS(sig) {						\
 	.nr_threads	= 1,						\
 	.thread_head	= LIST_HEAD_INIT(init_task.thread_node),	\
@@ -49,14 +70,10 @@ extern struct fs_struct init_fs;
 	.shared_pending	= { 						\
 		.list = LIST_HEAD_INIT(sig.shared_pending.list),	\
 		.signal =  {{0}}},					\
-	.posix_timers	 = LIST_HEAD_INIT(sig.posix_timers),		\
-	.cpu_timers	= INIT_CPU_TIMERS(sig.cpu_timers),		\
+	INIT_POSIX_TIMERS(sig)						\
+	INIT_CPU_TIMERS(sig)						\
 	.rlim		= INIT_RLIMITS,					\
-	.cputimer	= { 						\
-		.cputime_atomic	= INIT_CPUTIME_ATOMIC,			\
-		.running	= false,				\
-		.checking_timer = false,				\
-	},								\
+	INIT_CPUTIMER(sig)						\
 	INIT_PREV_CPUTIME(sig)						\
 	.cred_guard_mutex =						\
 		 __MUTEX_INITIALIZER(sig.cred_guard_mutex),		\
@@ -247,7 +264,7 @@ extern struct task_group root_task_group;
 	.blocked	= {{0}},					\
 	.alloc_lock	= __SPIN_LOCK_UNLOCKED(tsk.alloc_lock),		\
 	.journal_info	= NULL,						\
-	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),		\
+	INIT_CPU_TIMERS(tsk)						\
 	.pi_lock	= __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),	\
 	.timer_slack_ns = 50000, /* 50 usec default slack */		\
 	.pids = {							\
@@ -274,13 +291,6 @@ extern struct task_group root_task_group;
 }
 
 
-#define INIT_CPU_TIMERS(cpu_timers)					\
-{									\
-	LIST_HEAD_INIT(cpu_timers[0]),					\
-	LIST_HEAD_INIT(cpu_timers[1]),					\
-	LIST_HEAD_INIT(cpu_timers[2]),					\
-}
-
 /* Attach to the init_task data structure for proper alignment */
 #define __init_task_data __attribute__((__section__(".data..init_task")))
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4d1905245c7a..e8f6af59726d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -734,13 +734,14 @@ struct signal_struct {
 	unsigned int		is_child_subreaper:1;
 	unsigned int		has_child_subreaper:1;
 
+#ifdef CONFIG_POSIX_TIMERS
+
 	/* POSIX.1b Interval Timers */
 	int			posix_timer_id;
 	struct list_head	posix_timers;
 
 	/* ITIMER_REAL timer for the process */
 	struct hrtimer real_timer;
-	struct pid *leader_pid;
 	ktime_t it_real_incr;
 
 	/*
@@ -759,12 +760,16 @@ struct signal_struct {
 	/* Earliest-expiration cache. */
 	struct task_cputime cputime_expires;
 
+	struct list_head cpu_timers[3];
+
+#endif
+
+	struct pid *leader_pid;
+
 #ifdef CONFIG_NO_HZ_FULL
 	atomic_t tick_dep_mask;
 #endif
 
-	struct list_head cpu_timers[3];
-
 	struct pid *tty_old_pgrp;
 
 	/* boolean value for session group leader */
@@ -1681,8 +1686,10 @@ struct task_struct {
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
 	unsigned long min_flt, maj_flt;
 
+#ifdef CONFIG_POSIX_TIMERS
 	struct task_cputime cputime_expires;
 	struct list_head cpu_timers[3];
+#endif
 
 /* process credentials */
 	const struct cred __rcu *ptracer_cred; /* Tracer's credentials at attach */
diff --git a/kernel/fork.c b/kernel/fork.c
index 11c5c8ab827c..105c6676d93b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1304,6 +1304,7 @@ void __cleanup_sighand(struct sighand_struct *sighand)
 	}
 }
 
+#ifdef CONFIG_POSIX_TIMERS
 /*
  * Initialize POSIX timer handling for a thread group.
  */
@@ -1322,6 +1323,9 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
 	INIT_LIST_HEAD(&sig->cpu_timers[1]);
 	INIT_LIST_HEAD(&sig->cpu_timers[2]);
 }
+#else
+static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { }
+#endif
 
 static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 {
@@ -1346,11 +1350,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	init_waitqueue_head(&sig->wait_chldexit);
 	sig->curr_target = tsk;
 	init_sigpending(&sig->shared_pending);
-	INIT_LIST_HEAD(&sig->posix_timers);
 	seqlock_init(&sig->stats_lock);
 	prev_cputime_init(&sig->prev_cputime);
 
 #ifdef CONFIG_POSIX_TIMERS
+	INIT_LIST_HEAD(&sig->posix_timers);
 	hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	sig->real_timer.function = it_real_fn;
 #endif
@@ -1425,6 +1429,7 @@ static void rt_mutex_init_task(struct task_struct *p)
 #endif
 }
 
+#ifdef CONFIG_POSIX_TIMERS
 /*
  * Initialize POSIX timer handling for a single task.
  */
@@ -1437,6 +1442,9 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
 	INIT_LIST_HEAD(&tsk->cpu_timers[1]);
 	INIT_LIST_HEAD(&tsk->cpu_timers[2]);
 }
+#else
+static inline void posix_cpu_timers_init(struct task_struct *tsk) { }
+#endif
 
 static inline void
 init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 2516b8df6dbb..a688a8206727 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2246,6 +2246,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
 	}
 }
 
+#ifdef CONFIG_POSIX_TIMERS
 static void watchdog(struct rq *rq, struct task_struct *p)
 {
 	unsigned long soft, hard;
@@ -2267,6 +2268,9 @@ static void watchdog(struct rq *rq, struct task_struct *p)
 			p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
 	}
 }
+#else
+static inline void watchdog(struct rq *rq, struct task_struct *p) { }
+#endif
 
 static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 {
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 34659a853505..c69a9870ab79 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -172,18 +172,19 @@ sched_info_switch(struct rq *rq,
  */
 
 /**
- * cputimer_running - return true if cputimer is running
+ * get_running_cputimer - return &tsk->signal->cputimer if cputimer is running
  *
  * @tsk:	Pointer to target task.
  */
-static inline bool cputimer_running(struct task_struct *tsk)
-
+#ifdef CONFIG_POSIX_TIMERS
+static inline
+struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk)
 {
 	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 
 	/* Check if cputimer isn't running. This is accessed without locking. */
 	if (!READ_ONCE(cputimer->running))
-		return false;
+		return NULL;
 
 	/*
 	 * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime
@@ -200,10 +201,17 @@ static inline bool cputimer_running(struct task_struct *tsk)
 	 * clock delta is behind the expiring timer value.
 	 */
 	if (unlikely(!tsk->sighand))
-		return false;
+		return NULL;
 
-	return true;
+	return cputimer;
+}
+#else
+static inline
+struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk)
+{
+	return NULL;
 }
+#endif
 
 /**
  * account_group_user_time - Maintain utime for a thread group.
@@ -218,9 +226,9 @@ static inline bool cputimer_running(struct task_struct *tsk)
 static inline void account_group_user_time(struct task_struct *tsk,
 					   cputime_t cputime)
 {
-	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+	struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);
 
-	if (!cputimer_running(tsk))
+	if (!cputimer)
 		return;
 
 	atomic64_add(cputime, &cputimer->cputime_atomic.utime);
@@ -239,9 +247,9 @@ static inline void account_group_user_time(struct task_struct *tsk,
 static inline void account_group_system_time(struct task_struct *tsk,
 					     cputime_t cputime)
 {
-	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+	struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);
 
-	if (!cputimer_running(tsk))
+	if (!cputimer)
 		return;
 
 	atomic64_add(cputime, &cputimer->cputime_atomic.stime);
@@ -260,9 +268,9 @@ static inline void account_group_system_time(struct task_struct *tsk,
 static inline void account_group_exec_runtime(struct task_struct *tsk,
 					      unsigned long long ns)
 {
-	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+	struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);
 
-	if (!cputimer_running(tsk))
+	if (!cputimer)
 		return;
 
 	atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
-- 
cgit v1.2.3


From 5ea708d15a928f7a479987704203616d3274c03b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 3 Jan 2017 14:52:44 +0300
Subject: block: simplify blk_init_allocated_queue

Return an errno value instead of the passed in queue so that the callers
don't have to keep track of two queues, and move the assignment of the
request_fn and lock to the caller as passing them as argument doesn't
simplify anything.  While we're at it also remove two pointless NULL
assignments, given that the request structure is zeroed on allocation.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       | 38 +++++++++++++++-----------------------
 drivers/md/dm-rq.c     |  3 ++-
 include/linux/blkdev.h |  3 +--
 3 files changed, 18 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 4cad535592c3..09819d24d385 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -823,15 +823,19 @@ EXPORT_SYMBOL(blk_init_queue);
 struct request_queue *
 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 {
-	struct request_queue *uninit_q, *q;
+	struct request_queue *q;
 
-	uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id);
-	if (!uninit_q)
+	q = blk_alloc_queue_node(GFP_KERNEL, node_id);
+	if (!q)
 		return NULL;
 
-	q = blk_init_allocated_queue(uninit_q, rfn, lock);
-	if (!q)
-		blk_cleanup_queue(uninit_q);
+	q->request_fn = rfn;
+	if (lock)
+		q->queue_lock = lock;
+	if (blk_init_allocated_queue(q) < 0) {
+		blk_cleanup_queue(q);
+		return NULL;
+	}
 
 	return q;
 }
@@ -839,30 +843,19 @@ EXPORT_SYMBOL(blk_init_queue_node);
 
 static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio);
 
-struct request_queue *
-blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
-			 spinlock_t *lock)
-{
-	if (!q)
-		return NULL;
 
+int blk_init_allocated_queue(struct request_queue *q)
+{
 	q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, 0);
 	if (!q->fq)
-		return NULL;
+		return -ENOMEM;
 
 	if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
 		goto fail;
 
 	INIT_WORK(&q->timeout_work, blk_timeout_work);
-	q->request_fn		= rfn;
-	q->prep_rq_fn		= NULL;
-	q->unprep_rq_fn		= NULL;
 	q->queue_flags		|= QUEUE_FLAG_DEFAULT;
 
-	/* Override internal queue lock with supplied lock pointer */
-	if (lock)
-		q->queue_lock		= lock;
-
 	/*
 	 * This also sets hw/phys segments, boundary and size
 	 */
@@ -880,13 +873,12 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
 	}
 
 	mutex_unlock(&q->sysfs_lock);
-
-	return q;
+	return 0;
 
 fail:
 	blk_free_flush_queue(q->fq);
 	wbt_exit(q);
-	return NULL;
+	return -ENOMEM;
 }
 EXPORT_SYMBOL(blk_init_allocated_queue);
 
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 9d7275fb541a..93f6e9f1ebe2 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -823,7 +823,8 @@ static void dm_old_request_fn(struct request_queue *q)
 int dm_old_init_request_queue(struct mapped_device *md)
 {
 	/* Fully initialize the queue */
-	if (!blk_init_allocated_queue(md->queue, dm_old_request_fn, NULL))
+	md->queue->request_fn = dm_old_request_fn;
+	if (blk_init_allocated_queue(md->queue) < 0)
 		return -EINVAL;
 
 	/* disable dm_old_request_fn's merge heuristic by default */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 05675b1dfd20..6b1efc5760ea 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1137,8 +1137,7 @@ extern void blk_unprep_request(struct request *);
 extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn,
 					spinlock_t *lock, int node_id);
 extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *);
-extern struct request_queue *blk_init_allocated_queue(struct request_queue *,
-						      request_fn_proc *, spinlock_t *);
+extern int blk_init_allocated_queue(struct request_queue *);
 extern void blk_cleanup_queue(struct request_queue *);
 extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
 extern void blk_queue_bounce_limit(struct request_queue *, u64);
-- 
cgit v1.2.3


From 6d247d7f71d1fa4b66a5f4da7b1daa21510d529b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 27 Jan 2017 09:51:45 -0700
Subject: block: allow specifying size for extra command data

This mirrors the blk-mq capabilities to allocate extra drivers-specific
data behind struct request by setting a cmd_size field, as well as having
a constructor / destructor for it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       | 59 ++++++++++++++++++++++++++++++++++++++++----------
 block/blk-flush.c      |  5 ++---
 block/blk-sysfs.c      |  7 ++++--
 include/linux/blkdev.h |  7 ++++++
 4 files changed, 61 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 09819d24d385..0a485ad802c9 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -606,17 +606,41 @@ void blk_cleanup_queue(struct request_queue *q)
 EXPORT_SYMBOL(blk_cleanup_queue);
 
 /* Allocate memory local to the request queue */
-static void *alloc_request_struct(gfp_t gfp_mask, void *data)
+static void *alloc_request_simple(gfp_t gfp_mask, void *data)
 {
-	int nid = (int)(long)data;
-	return kmem_cache_alloc_node(request_cachep, gfp_mask, nid);
+	struct request_queue *q = data;
+
+	return kmem_cache_alloc_node(request_cachep, gfp_mask, q->node);
 }
 
-static void free_request_struct(void *element, void *unused)
+static void free_request_simple(void *element, void *data)
 {
 	kmem_cache_free(request_cachep, element);
 }
 
+static void *alloc_request_size(gfp_t gfp_mask, void *data)
+{
+	struct request_queue *q = data;
+	struct request *rq;
+
+	rq = kmalloc_node(sizeof(struct request) + q->cmd_size, gfp_mask,
+			q->node);
+	if (rq && q->init_rq_fn && q->init_rq_fn(q, rq, gfp_mask) < 0) {
+		kfree(rq);
+		rq = NULL;
+	}
+	return rq;
+}
+
+static void free_request_size(void *element, void *data)
+{
+	struct request_queue *q = data;
+
+	if (q->exit_rq_fn)
+		q->exit_rq_fn(q, element);
+	kfree(element);
+}
+
 int blk_init_rl(struct request_list *rl, struct request_queue *q,
 		gfp_t gfp_mask)
 {
@@ -629,10 +653,15 @@ int blk_init_rl(struct request_list *rl, struct request_queue *q,
 	init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
 	init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
 
-	rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, alloc_request_struct,
-					  free_request_struct,
-					  (void *)(long)q->node, gfp_mask,
-					  q->node);
+	if (q->cmd_size) {
+		rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ,
+				alloc_request_size, free_request_size,
+				q, gfp_mask, q->node);
+	} else {
+		rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ,
+				alloc_request_simple, free_request_simple,
+				q, gfp_mask, q->node);
+	}
 	if (!rl->rq_pool)
 		return -ENOMEM;
 
@@ -846,12 +875,15 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio);
 
 int blk_init_allocated_queue(struct request_queue *q)
 {
-	q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, 0);
+	q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, q->cmd_size);
 	if (!q->fq)
 		return -ENOMEM;
 
+	if (q->init_rq_fn && q->init_rq_fn(q, q->fq->flush_rq, GFP_KERNEL))
+		goto out_free_flush_queue;
+
 	if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
-		goto fail;
+		goto out_exit_flush_rq;
 
 	INIT_WORK(&q->timeout_work, blk_timeout_work);
 	q->queue_flags		|= QUEUE_FLAG_DEFAULT;
@@ -869,13 +901,16 @@ int blk_init_allocated_queue(struct request_queue *q)
 	/* init elevator */
 	if (elevator_init(q, NULL)) {
 		mutex_unlock(&q->sysfs_lock);
-		goto fail;
+		goto out_exit_flush_rq;
 	}
 
 	mutex_unlock(&q->sysfs_lock);
 	return 0;
 
-fail:
+out_exit_flush_rq:
+	if (q->exit_rq_fn)
+		q->exit_rq_fn(q, q->fq->flush_rq);
+out_free_flush_queue:
 	blk_free_flush_queue(q->fq);
 	wbt_exit(q);
 	return -ENOMEM;
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 4427896641ac..0a0358e48b76 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -547,11 +547,10 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
 	if (!fq)
 		goto fail;
 
-	if (q->mq_ops) {
+	if (q->mq_ops)
 		spin_lock_init(&fq->mq_flush_lock);
-		rq_sz = round_up(rq_sz + cmd_size, cache_line_size());
-	}
 
+	rq_sz = round_up(rq_sz + cmd_size, cache_line_size());
 	fq->flush_rq = kzalloc_node(rq_sz, GFP_KERNEL, node);
 	if (!fq->flush_rq)
 		goto fail_rq;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 1dbce057592d..894f77342fd4 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -814,10 +814,13 @@ static void blk_release_queue(struct kobject *kobj)
 	if (q->queue_tags)
 		__blk_queue_free_tags(q);
 
-	if (!q->mq_ops)
+	if (!q->mq_ops) {
+		if (q->exit_rq_fn)
+			q->exit_rq_fn(q, q->fq->flush_rq);
 		blk_free_flush_queue(q->fq);
-	else
+	} else {
 		blk_mq_release(q);
+	}
 
 	blk_trace_shutdown(q);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6b1efc5760ea..461b7cf6af1d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -273,6 +273,8 @@ typedef void (softirq_done_fn)(struct request *);
 typedef int (dma_drain_needed_fn)(struct request *);
 typedef int (lld_busy_fn) (struct request_queue *q);
 typedef int (bsg_job_fn) (struct bsg_job *);
+typedef int (init_rq_fn)(struct request_queue *, struct request *, gfp_t);
+typedef void (exit_rq_fn)(struct request_queue *, struct request *);
 
 enum blk_eh_timer_return {
 	BLK_EH_NOT_HANDLED,
@@ -408,6 +410,8 @@ struct request_queue {
 	rq_timed_out_fn		*rq_timed_out_fn;
 	dma_drain_needed_fn	*dma_drain_needed;
 	lld_busy_fn		*lld_busy_fn;
+	init_rq_fn		*init_rq_fn;
+	exit_rq_fn		*exit_rq_fn;
 
 	const struct blk_mq_ops	*mq_ops;
 
@@ -577,6 +581,9 @@ struct request_queue {
 #endif
 
 	bool			mq_sysfs_init_done;
+
+	size_t			cmd_size;
+	void			*rq_alloc_data;
 };
 
 #define QUEUE_FLAG_QUEUED	1	/* uses generic tag queueing */
-- 
cgit v1.2.3


From 48b77ad6084481ef9330a5d2bee289966da0975b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 27 Jan 2017 09:35:54 +0100
Subject: block: cleanup tracing

A couple tweaks to the tracing code:

 - trace the request size for all requests
 - trace request sector and nr_sectors only for fs requests, enforced by
   helpers
 - drop SCSI CDB tracing - we have SCSI tracing for this and are going
   to me the CDB out of the generic struct request soon.

With this the tracing code stops to know about BLOCK_PC requests entirely,
it's just FS vs passthrough requests now, where the latter includes any
driver-private requests.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blktrace_api.h | 14 +++++++-------
 include/trace/events/block.h | 27 +++++++++++----------------
 kernel/trace/blktrace.c      | 43 ++++++-------------------------------------
 3 files changed, 24 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index e417f080219a..a143a67a6f33 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -110,16 +110,16 @@ struct compat_blk_user_trace_setup {
 
 #endif
 
-#if defined(CONFIG_EVENT_TRACING) && defined(CONFIG_BLOCK)
+extern void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes);
 
-static inline int blk_cmd_buf_len(struct request *rq)
+static inline sector_t blk_rq_trace_sector(struct request *rq)
 {
-	return (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? rq->cmd_len * 3 : 1;
+	return (rq->cmd_type != REQ_TYPE_FS) ? 0 : blk_rq_pos(rq);
 }
 
-extern void blk_dump_cmd(char *buf, struct request *rq);
-extern void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes);
-
-#endif /* CONFIG_EVENT_TRACING && CONFIG_BLOCK */
+static inline unsigned int blk_rq_trace_nr_sectors(struct request *rq)
+{
+	return (rq->cmd_type != REQ_TYPE_FS) ? 0 : blk_rq_sectors(rq);
+}
 
 #endif
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 3e02e3a25413..a88ed13446ff 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -73,19 +73,17 @@ DECLARE_EVENT_CLASS(block_rq_with_error,
 		__field(  unsigned int,	nr_sector		)
 		__field(  int,		errors			)
 		__array(  char,		rwbs,	RWBS_LEN	)
-		__dynamic_array( char,	cmd,	blk_cmd_buf_len(rq)	)
+		__dynamic_array( char,	cmd,	1		)
 	),
 
 	TP_fast_assign(
 		__entry->dev	   = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
-		__entry->sector    = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
-					0 : blk_rq_pos(rq);
-		__entry->nr_sector = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
-					0 : blk_rq_sectors(rq);
+		__entry->sector    = blk_rq_trace_sector(rq);
+		__entry->nr_sector = blk_rq_trace_nr_sectors(rq);
 		__entry->errors    = rq->errors;
 
 		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
-		blk_dump_cmd(__get_str(cmd), rq);
+		__get_str(cmd)[0] = '\0';
 	),
 
 	TP_printk("%d,%d %s (%s) %llu + %u [%d]",
@@ -153,7 +151,7 @@ TRACE_EVENT(block_rq_complete,
 		__field(  unsigned int,	nr_sector		)
 		__field(  int,		errors			)
 		__array(  char,		rwbs,	RWBS_LEN	)
-		__dynamic_array( char,	cmd,	blk_cmd_buf_len(rq)	)
+		__dynamic_array( char,	cmd,	1		)
 	),
 
 	TP_fast_assign(
@@ -163,7 +161,7 @@ TRACE_EVENT(block_rq_complete,
 		__entry->errors    = rq->errors;
 
 		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, nr_bytes);
-		blk_dump_cmd(__get_str(cmd), rq);
+		__get_str(cmd)[0] = '\0';
 	),
 
 	TP_printk("%d,%d %s (%s) %llu + %u [%d]",
@@ -186,20 +184,17 @@ DECLARE_EVENT_CLASS(block_rq,
 		__field(  unsigned int,	bytes			)
 		__array(  char,		rwbs,	RWBS_LEN	)
 		__array(  char,         comm,   TASK_COMM_LEN   )
-		__dynamic_array( char,	cmd,	blk_cmd_buf_len(rq)	)
+		__dynamic_array( char,	cmd,	1		)
 	),
 
 	TP_fast_assign(
 		__entry->dev	   = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
-		__entry->sector    = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
-					0 : blk_rq_pos(rq);
-		__entry->nr_sector = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
-					0 : blk_rq_sectors(rq);
-		__entry->bytes     = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
-					blk_rq_bytes(rq) : 0;
+		__entry->sector    = blk_rq_trace_sector(rq);
+		__entry->nr_sector = blk_rq_trace_nr_sectors(rq);
+		__entry->bytes     = blk_rq_bytes(rq);
 
 		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
-		blk_dump_cmd(__get_str(cmd), rq);
+		__get_str(cmd)[0] = '\0';
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
 
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 95cecbf67f5c..512b34703ac3 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -712,15 +712,13 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
 	if (likely(!bt))
 		return;
 
-	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
+	if (rq->cmd_type != REQ_TYPE_FS)
 		what |= BLK_TC_ACT(BLK_TC_PC);
-		__blk_add_trace(bt, 0, nr_bytes, req_op(rq), rq->cmd_flags,
-				what, rq->errors, rq->cmd_len, rq->cmd);
-	} else  {
+	else
 		what |= BLK_TC_ACT(BLK_TC_FS);
-		__blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, req_op(rq),
-				rq->cmd_flags, what, rq->errors, 0, NULL);
-	}
+
+	__blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
+			rq->cmd_flags, what, rq->errors, 0, NULL);
 }
 
 static void blk_add_trace_rq_abort(void *ignore,
@@ -972,11 +970,7 @@ void blk_add_driver_data(struct request_queue *q,
 	if (likely(!bt))
 		return;
 
-	if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
-		__blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 0,
-				BLK_TA_DRV_DATA, rq->errors, len, data);
-	else
-		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, 0,
+	__blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
 				BLK_TA_DRV_DATA, rq->errors, len, data);
 }
 EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -1752,31 +1746,6 @@ void blk_trace_remove_sysfs(struct device *dev)
 
 #ifdef CONFIG_EVENT_TRACING
 
-void blk_dump_cmd(char *buf, struct request *rq)
-{
-	int i, end;
-	int len = rq->cmd_len;
-	unsigned char *cmd = rq->cmd;
-
-	if (rq->cmd_type != REQ_TYPE_BLOCK_PC) {
-		buf[0] = '\0';
-		return;
-	}
-
-	for (end = len - 1; end >= 0; end--)
-		if (cmd[end])
-			break;
-	end++;
-
-	for (i = 0; i < len; i++) {
-		buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]);
-		if (i == end && end != len - 1) {
-			sprintf(buf, " ..");
-			break;
-		}
-	}
-}
-
 void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes)
 {
 	int i = 0;
-- 
cgit v1.2.3


From eb8db831be80692bf4bda3dfc55001daf64ec299 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 22 Jan 2017 18:32:46 +0100
Subject: dm: always defer request allocation to the owner of the request_queue

DM already calls blk_mq_alloc_request on the request_queue of the
underlying device if it is a blk-mq device.  But now that we allow drivers
to allocate additional data and initialize it ahead of time we need to do
the same for all drivers.   Doing so and using the new cmd_size
infrastructure in the block layer greatly simplifies the dm-rq and mpath
code, and should also make arbitrary combinations of SQ and MQ devices
with SQ or MQ device mapper tables easily possible as a further step.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/dm-core.h          |   1 -
 drivers/md/dm-mpath.c         | 132 ++++------------------
 drivers/md/dm-rq.c            | 251 ++++++++++--------------------------------
 drivers/md/dm-rq.h            |   2 +-
 drivers/md/dm-target.c        |   7 --
 drivers/md/dm.c               |  30 ++---
 drivers/md/dm.h               |   3 +-
 include/linux/device-mapper.h |   3 -
 8 files changed, 85 insertions(+), 344 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 40ceba1fe8be..136fda3ff9e5 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -92,7 +92,6 @@ struct mapped_device {
 	 * io objects are allocated from here.
 	 */
 	mempool_t *io_pool;
-	mempool_t *rq_pool;
 
 	struct bio_set *bs;
 
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 6400cffb986d..784f2374c3a4 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -92,12 +92,6 @@ struct multipath {
 
 	unsigned queue_mode;
 
-	/*
-	 * We must use a mempool of dm_mpath_io structs so that we
-	 * can resubmit bios on error.
-	 */
-	mempool_t *mpio_pool;
-
 	struct mutex work_mutex;
 	struct work_struct trigger_event;
 
@@ -115,8 +109,6 @@ struct dm_mpath_io {
 
 typedef int (*action_fn) (struct pgpath *pgpath);
 
-static struct kmem_cache *_mpio_cache;
-
 static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
 static void trigger_event(struct work_struct *work);
 static void activate_path(struct work_struct *work);
@@ -209,7 +201,6 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
 		init_waitqueue_head(&m->pg_init_wait);
 		mutex_init(&m->work_mutex);
 
-		m->mpio_pool = NULL;
 		m->queue_mode = DM_TYPE_NONE;
 
 		m->ti = ti;
@@ -229,16 +220,7 @@ static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m)
 			m->queue_mode = DM_TYPE_MQ_REQUEST_BASED;
 		else
 			m->queue_mode = DM_TYPE_REQUEST_BASED;
-	}
-
-	if (m->queue_mode == DM_TYPE_REQUEST_BASED) {
-		unsigned min_ios = dm_get_reserved_rq_based_ios();
-
-		m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
-		if (!m->mpio_pool)
-			return -ENOMEM;
-	}
-	else if (m->queue_mode == DM_TYPE_BIO_BASED) {
+	} else if (m->queue_mode == DM_TYPE_BIO_BASED) {
 		INIT_WORK(&m->process_queued_bios, process_queued_bios);
 		/*
 		 * bio-based doesn't support any direct scsi_dh management;
@@ -263,7 +245,6 @@ static void free_multipath(struct multipath *m)
 
 	kfree(m->hw_handler_name);
 	kfree(m->hw_handler_params);
-	mempool_destroy(m->mpio_pool);
 	kfree(m);
 }
 
@@ -272,38 +253,6 @@ static struct dm_mpath_io *get_mpio(union map_info *info)
 	return info->ptr;
 }
 
-static struct dm_mpath_io *set_mpio(struct multipath *m, union map_info *info)
-{
-	struct dm_mpath_io *mpio;
-
-	if (!m->mpio_pool) {
-		/* Use blk-mq pdu memory requested via per_io_data_size */
-		mpio = get_mpio(info);
-		memset(mpio, 0, sizeof(*mpio));
-		return mpio;
-	}
-
-	mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
-	if (!mpio)
-		return NULL;
-
-	memset(mpio, 0, sizeof(*mpio));
-	info->ptr = mpio;
-
-	return mpio;
-}
-
-static void clear_request_fn_mpio(struct multipath *m, union map_info *info)
-{
-	/* Only needed for non blk-mq (.request_fn) multipath */
-	if (m->mpio_pool) {
-		struct dm_mpath_io *mpio = info->ptr;
-
-		info->ptr = NULL;
-		mempool_free(mpio, m->mpio_pool);
-	}
-}
-
 static size_t multipath_per_bio_data_size(void)
 {
 	return sizeof(struct dm_mpath_io) + sizeof(struct dm_bio_details);
@@ -530,16 +479,17 @@ static bool must_push_back_bio(struct multipath *m)
 /*
  * Map cloned requests (request-based multipath)
  */
-static int __multipath_map(struct dm_target *ti, struct request *clone,
-			   union map_info *map_context,
-			   struct request *rq, struct request **__clone)
+static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
+				   union map_info *map_context,
+				   struct request **__clone)
 {
 	struct multipath *m = ti->private;
 	int r = DM_MAPIO_REQUEUE;
-	size_t nr_bytes = clone ? blk_rq_bytes(clone) : blk_rq_bytes(rq);
+	size_t nr_bytes = blk_rq_bytes(rq);
 	struct pgpath *pgpath;
 	struct block_device *bdev;
-	struct dm_mpath_io *mpio;
+	struct dm_mpath_io *mpio = get_mpio(map_context);
+	struct request *clone;
 
 	/* Do we need to select a new pgpath? */
 	pgpath = lockless_dereference(m->current_pgpath);
@@ -556,42 +506,23 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
 		return r;
 	}
 
-	mpio = set_mpio(m, map_context);
-	if (!mpio)
-		/* ENOMEM, requeue */
-		return r;
-
+	memset(mpio, 0, sizeof(*mpio));
 	mpio->pgpath = pgpath;
 	mpio->nr_bytes = nr_bytes;
 
 	bdev = pgpath->path.dev->bdev;
 
-	if (clone) {
-		/*
-		 * Old request-based interface: allocated clone is passed in.
-		 * Used by: .request_fn stacked on .request_fn path(s).
-		 */
-		clone->q = bdev_get_queue(bdev);
-		clone->rq_disk = bdev->bd_disk;
-		clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
-	} else {
-		/*
-		 * blk-mq request-based interface; used by both:
-		 * .request_fn stacked on blk-mq path(s) and
-		 * blk-mq stacked on blk-mq path(s).
-		 */
-		clone = blk_mq_alloc_request(bdev_get_queue(bdev),
-					     rq_data_dir(rq), BLK_MQ_REQ_NOWAIT);
-		if (IS_ERR(clone)) {
-			/* EBUSY, ENODEV or EWOULDBLOCK: requeue */
-			clear_request_fn_mpio(m, map_context);
-			return r;
-		}
-		clone->bio = clone->biotail = NULL;
-		clone->rq_disk = bdev->bd_disk;
-		clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
-		*__clone = clone;
+	clone = blk_get_request(bdev_get_queue(bdev),
+			rq->cmd_flags | REQ_NOMERGE,
+			GFP_ATOMIC);
+	if (IS_ERR(clone)) {
+		/* EBUSY, ENODEV or EWOULDBLOCK: requeue */
+		return r;
 	}
+	clone->bio = clone->biotail = NULL;
+	clone->rq_disk = bdev->bd_disk;
+	clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
+	*__clone = clone;
 
 	if (pgpath->pg->ps.type->start_io)
 		pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
@@ -600,22 +531,9 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
 	return DM_MAPIO_REMAPPED;
 }
 
-static int multipath_map(struct dm_target *ti, struct request *clone,
-			 union map_info *map_context)
-{
-	return __multipath_map(ti, clone, map_context, NULL, NULL);
-}
-
-static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
-				   union map_info *map_context,
-				   struct request **clone)
-{
-	return __multipath_map(ti, NULL, map_context, rq, clone);
-}
-
 static void multipath_release_clone(struct request *clone)
 {
-	blk_mq_free_request(clone);
+	blk_put_request(clone);
 }
 
 /*
@@ -1187,7 +1105,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	ti->num_write_same_bios = 1;
 	if (m->queue_mode == DM_TYPE_BIO_BASED)
 		ti->per_io_data_size = multipath_per_bio_data_size();
-	else if (m->queue_mode == DM_TYPE_MQ_REQUEST_BASED)
+	else
 		ti->per_io_data_size = sizeof(struct dm_mpath_io);
 
 	return 0;
@@ -1610,7 +1528,6 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
 		if (ps->type->end_io)
 			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
 	}
-	clear_request_fn_mpio(m, map_context);
 
 	return r;
 }
@@ -2060,7 +1977,6 @@ static struct target_type multipath_target = {
 	.module = THIS_MODULE,
 	.ctr = multipath_ctr,
 	.dtr = multipath_dtr,
-	.map_rq = multipath_map,
 	.clone_and_map_rq = multipath_clone_and_map,
 	.release_clone_rq = multipath_release_clone,
 	.rq_end_io = multipath_end_io,
@@ -2080,11 +1996,6 @@ static int __init dm_multipath_init(void)
 {
 	int r;
 
-	/* allocate a slab for the dm_mpath_ios */
-	_mpio_cache = KMEM_CACHE(dm_mpath_io, 0);
-	if (!_mpio_cache)
-		return -ENOMEM;
-
 	r = dm_register_target(&multipath_target);
 	if (r < 0) {
 		DMERR("request-based register failed %d", r);
@@ -2120,8 +2031,6 @@ bad_alloc_kmpath_handlerd:
 bad_alloc_kmultipathd:
 	dm_unregister_target(&multipath_target);
 bad_register_target:
-	kmem_cache_destroy(_mpio_cache);
-
 	return r;
 }
 
@@ -2131,7 +2040,6 @@ static void __exit dm_multipath_exit(void)
 	destroy_workqueue(kmultipathd);
 
 	dm_unregister_target(&multipath_target);
-	kmem_cache_destroy(_mpio_cache);
 }
 
 module_init(dm_multipath_init);
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 3f12916f2424..8d0683474767 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -109,28 +109,6 @@ void dm_stop_queue(struct request_queue *q)
 		dm_mq_stop_queue(q);
 }
 
-static struct dm_rq_target_io *alloc_old_rq_tio(struct mapped_device *md,
-						gfp_t gfp_mask)
-{
-	return mempool_alloc(md->io_pool, gfp_mask);
-}
-
-static void free_old_rq_tio(struct dm_rq_target_io *tio)
-{
-	mempool_free(tio, tio->md->io_pool);
-}
-
-static struct request *alloc_old_clone_request(struct mapped_device *md,
-					       gfp_t gfp_mask)
-{
-	return mempool_alloc(md->rq_pool, gfp_mask);
-}
-
-static void free_old_clone_request(struct mapped_device *md, struct request *rq)
-{
-	mempool_free(rq, md->rq_pool);
-}
-
 /*
  * Partial completion handling for request-based dm
  */
@@ -185,7 +163,7 @@ static void end_clone_bio(struct bio *clone)
 
 static struct dm_rq_target_io *tio_from_request(struct request *rq)
 {
-	return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
+	return blk_mq_rq_to_pdu(rq);
 }
 
 static void rq_end_stats(struct mapped_device *md, struct request *orig)
@@ -233,31 +211,6 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
 	dm_put(md);
 }
 
-static void free_rq_clone(struct request *clone)
-{
-	struct dm_rq_target_io *tio = clone->end_io_data;
-	struct mapped_device *md = tio->md;
-
-	blk_rq_unprep_clone(clone);
-
-	/*
-	 * It is possible for a clone_old_rq() allocated clone to
-	 * get passed in -- it may not yet have a request_queue.
-	 * This is known to occur if the error target replaces
-	 * a multipath target that has a request_fn queue stacked
-	 * on blk-mq queue(s).
-	 */
-	if (clone->q && clone->q->mq_ops)
-		/* stacked on blk-mq queue(s) */
-		tio->ti->type->release_clone_rq(clone);
-	else if (!md->queue->mq_ops)
-		/* request_fn queue stacked on request_fn queue(s) */
-		free_old_clone_request(md, clone);
-
-	if (!md->queue->mq_ops)
-		free_old_rq_tio(tio);
-}
-
 /*
  * Complete the clone and the original request.
  * Must be called without clone's queue lock held,
@@ -270,7 +223,9 @@ static void dm_end_request(struct request *clone, int error)
 	struct mapped_device *md = tio->md;
 	struct request *rq = tio->orig;
 
-	free_rq_clone(clone);
+	blk_rq_unprep_clone(clone);
+	tio->ti->type->release_clone_rq(clone);
+
 	rq_end_stats(md, rq);
 	if (!rq->q->mq_ops)
 		blk_end_request_all(rq, error);
@@ -279,22 +234,6 @@ static void dm_end_request(struct request *clone, int error)
 	rq_completed(md, rw, true);
 }
 
-static void dm_unprep_request(struct request *rq)
-{
-	struct dm_rq_target_io *tio = tio_from_request(rq);
-	struct request *clone = tio->clone;
-
-	if (!rq->q->mq_ops) {
-		rq->special = NULL;
-		rq->rq_flags &= ~RQF_DONTPREP;
-	}
-
-	if (clone)
-		free_rq_clone(clone);
-	else if (!tio->md->queue->mq_ops)
-		free_old_rq_tio(tio);
-}
-
 /*
  * Requeue the original request of a clone.
  */
@@ -333,7 +272,10 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_
 	int rw = rq_data_dir(rq);
 
 	rq_end_stats(md, rq);
-	dm_unprep_request(rq);
+	if (tio->clone) {
+		blk_rq_unprep_clone(tio->clone);
+		tio->ti->type->release_clone_rq(tio->clone);
+	}
 
 	if (!rq->q->mq_ops)
 		dm_old_requeue_request(rq);
@@ -388,14 +330,11 @@ static void dm_softirq_done(struct request *rq)
 	if (!clone) {
 		rq_end_stats(tio->md, rq);
 		rw = rq_data_dir(rq);
-		if (!rq->q->mq_ops) {
+		if (!rq->q->mq_ops)
 			blk_end_request_all(rq, tio->error);
-			rq_completed(tio->md, rw, false);
-			free_old_rq_tio(tio);
-		} else {
+		else
 			blk_mq_end_request(rq, tio->error);
-			rq_completed(tio->md, rw, false);
-		}
+		rq_completed(tio->md, rw, false);
 		return;
 	}
 
@@ -439,16 +378,6 @@ static void end_clone_request(struct request *clone, int error)
 {
 	struct dm_rq_target_io *tio = clone->end_io_data;
 
-	if (!clone->q->mq_ops) {
-		/*
-		 * For just cleaning up the information of the queue in which
-		 * the clone was dispatched.
-		 * The clone is *NOT* freed actually here because it is alloced
-		 * from dm own mempool (RQF_ALLOCED isn't set).
-		 */
-		__blk_put_request(clone->q, clone);
-	}
-
 	/*
 	 * Actual request completion is done in a softirq context which doesn't
 	 * hold the clone's queue lock.  Otherwise, deadlock could occur because:
@@ -506,28 +435,6 @@ static int setup_clone(struct request *clone, struct request *rq,
 	return 0;
 }
 
-static struct request *clone_old_rq(struct request *rq, struct mapped_device *md,
-				    struct dm_rq_target_io *tio, gfp_t gfp_mask)
-{
-	/*
-	 * Create clone for use with .request_fn request_queue
-	 */
-	struct request *clone;
-
-	clone = alloc_old_clone_request(md, gfp_mask);
-	if (!clone)
-		return NULL;
-
-	blk_rq_init(NULL, clone);
-	if (setup_clone(clone, rq, tio, gfp_mask)) {
-		/* -ENOMEM */
-		free_old_clone_request(md, clone);
-		return NULL;
-	}
-
-	return clone;
-}
-
 static void map_tio_request(struct kthread_work *work);
 
 static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
@@ -549,60 +456,6 @@ static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
 		kthread_init_work(&tio->work, map_tio_request);
 }
 
-static struct dm_rq_target_io *dm_old_prep_tio(struct request *rq,
-					       struct mapped_device *md,
-					       gfp_t gfp_mask)
-{
-	struct dm_rq_target_io *tio;
-	int srcu_idx;
-	struct dm_table *table;
-
-	tio = alloc_old_rq_tio(md, gfp_mask);
-	if (!tio)
-		return NULL;
-
-	init_tio(tio, rq, md);
-
-	table = dm_get_live_table(md, &srcu_idx);
-	/*
-	 * Must clone a request if this .request_fn DM device
-	 * is stacked on .request_fn device(s).
-	 */
-	if (!dm_table_all_blk_mq_devices(table)) {
-		if (!clone_old_rq(rq, md, tio, gfp_mask)) {
-			dm_put_live_table(md, srcu_idx);
-			free_old_rq_tio(tio);
-			return NULL;
-		}
-	}
-	dm_put_live_table(md, srcu_idx);
-
-	return tio;
-}
-
-/*
- * Called with the queue lock held.
- */
-static int dm_old_prep_fn(struct request_queue *q, struct request *rq)
-{
-	struct mapped_device *md = q->queuedata;
-	struct dm_rq_target_io *tio;
-
-	if (unlikely(rq->special)) {
-		DMWARN("Already has something in rq->special.");
-		return BLKPREP_KILL;
-	}
-
-	tio = dm_old_prep_tio(rq, md, GFP_ATOMIC);
-	if (!tio)
-		return BLKPREP_DEFER;
-
-	rq->special = tio;
-	rq->rq_flags |= RQF_DONTPREP;
-
-	return BLKPREP_OK;
-}
-
 /*
  * Returns:
  * DM_MAPIO_*       : the request has been processed as indicated
@@ -617,31 +470,18 @@ static int map_request(struct dm_rq_target_io *tio)
 	struct request *rq = tio->orig;
 	struct request *clone = NULL;
 
-	if (tio->clone) {
-		clone = tio->clone;
-		r = ti->type->map_rq(ti, clone, &tio->info);
-		if (r == DM_MAPIO_DELAY_REQUEUE)
-			return DM_MAPIO_REQUEUE; /* .request_fn requeue is always immediate */
-	} else {
-		r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
-		if (r < 0) {
-			/* The target wants to complete the I/O */
-			dm_kill_unmapped_request(rq, r);
-			return r;
-		}
-		if (r == DM_MAPIO_REMAPPED &&
-		    setup_clone(clone, rq, tio, GFP_ATOMIC)) {
-			/* -ENOMEM */
-			ti->type->release_clone_rq(clone);
-			return DM_MAPIO_REQUEUE;
-		}
-	}
-
+	r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
 	switch (r) {
 	case DM_MAPIO_SUBMITTED:
 		/* The target has taken the I/O to submit by itself later */
 		break;
 	case DM_MAPIO_REMAPPED:
+		if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
+			/* -ENOMEM */
+			ti->type->release_clone_rq(clone);
+			return DM_MAPIO_REQUEUE;
+		}
+
 		/* The target has remapped the I/O so dispatch it */
 		trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
 				     blk_rq_pos(rq));
@@ -700,6 +540,29 @@ static void dm_start_request(struct mapped_device *md, struct request *orig)
 	dm_get(md);
 }
 
+static int __dm_rq_init_rq(struct mapped_device *md, struct request *rq)
+{
+	struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
+
+	/*
+	 * Must initialize md member of tio, otherwise it won't
+	 * be available in dm_mq_queue_rq.
+	 */
+	tio->md = md;
+
+	if (md->init_tio_pdu) {
+		/* target-specific per-io data is immediately after the tio */
+		tio->info.ptr = tio + 1;
+	}
+
+	return 0;
+}
+
+static int dm_rq_init_rq(struct request_queue *q, struct request *rq, gfp_t gfp)
+{
+	return __dm_rq_init_rq(q->rq_alloc_data, rq);
+}
+
 static void map_tio_request(struct kthread_work *work)
 {
 	struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work);
@@ -794,6 +657,7 @@ static void dm_old_request_fn(struct request_queue *q)
 		dm_start_request(md, rq);
 
 		tio = tio_from_request(rq);
+		init_tio(tio, rq, md);
 		/* Establish tio->ti before queuing work (map_tio_request) */
 		tio->ti = ti;
 		kthread_queue_work(&md->kworker, &tio->work);
@@ -804,10 +668,22 @@ static void dm_old_request_fn(struct request_queue *q)
 /*
  * Fully initialize a .request_fn request-based queue.
  */
-int dm_old_init_request_queue(struct mapped_device *md)
+int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t)
 {
+	struct dm_target *immutable_tgt;
+
 	/* Fully initialize the queue */
+	md->queue->cmd_size = sizeof(struct dm_rq_target_io);
+	md->queue->rq_alloc_data = md;
 	md->queue->request_fn = dm_old_request_fn;
+	md->queue->init_rq_fn = dm_rq_init_rq;
+
+	immutable_tgt = dm_table_get_immutable_target(t);
+	if (immutable_tgt && immutable_tgt->per_io_data_size) {
+		/* any target-specific per-io data is immediately after the tio */
+		md->queue->cmd_size += immutable_tgt->per_io_data_size;
+		md->init_tio_pdu = true;
+	}
 	if (blk_init_allocated_queue(md->queue) < 0)
 		return -EINVAL;
 
@@ -816,7 +692,6 @@ int dm_old_init_request_queue(struct mapped_device *md)
 
 	dm_init_normal_md_queue(md);
 	blk_queue_softirq_done(md->queue, dm_softirq_done);
-	blk_queue_prep_rq(md->queue, dm_old_prep_fn);
 
 	/* Initialize the request-based DM worker thread */
 	kthread_init_worker(&md->kworker);
@@ -837,21 +712,7 @@ static int dm_mq_init_request(void *data, struct request *rq,
 		       unsigned int hctx_idx, unsigned int request_idx,
 		       unsigned int numa_node)
 {
-	struct mapped_device *md = data;
-	struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
-
-	/*
-	 * Must initialize md member of tio, otherwise it won't
-	 * be available in dm_mq_queue_rq.
-	 */
-	tio->md = md;
-
-	if (md->init_tio_pdu) {
-		/* target-specific per-io data is immediately after the tio */
-		tio->info.ptr = tio + 1;
-	}
-
-	return 0;
+	return __dm_rq_init_rq(data, rq);
 }
 
 static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
diff --git a/drivers/md/dm-rq.h b/drivers/md/dm-rq.h
index 4da06cae7bad..f0020d21b95f 100644
--- a/drivers/md/dm-rq.h
+++ b/drivers/md/dm-rq.h
@@ -48,7 +48,7 @@ struct dm_rq_clone_bio_info {
 bool dm_use_blk_mq_default(void);
 bool dm_use_blk_mq(struct mapped_device *md);
 
-int dm_old_init_request_queue(struct mapped_device *md);
+int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t);
 int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t);
 void dm_mq_cleanup_mapped_device(struct mapped_device *md);
 
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 710ae28fd618..43d3445b121d 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -131,12 +131,6 @@ static int io_err_map(struct dm_target *tt, struct bio *bio)
 	return -EIO;
 }
 
-static int io_err_map_rq(struct dm_target *ti, struct request *clone,
-			 union map_info *map_context)
-{
-	return -EIO;
-}
-
 static int io_err_clone_and_map_rq(struct dm_target *ti, struct request *rq,
 				   union map_info *map_context,
 				   struct request **clone)
@@ -161,7 +155,6 @@ static struct target_type error_target = {
 	.ctr  = io_err_ctr,
 	.dtr  = io_err_dtr,
 	.map  = io_err_map,
-	.map_rq = io_err_map_rq,
 	.clone_and_map_rq = io_err_clone_and_map_rq,
 	.release_clone_rq = io_err_release_clone_rq,
 	.direct_access = io_err_direct_access,
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 3086da5664f3..ff4a29a97ad3 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -91,7 +91,6 @@ static int dm_numa_node = DM_NUMA_NODE;
  */
 struct dm_md_mempools {
 	mempool_t *io_pool;
-	mempool_t *rq_pool;
 	struct bio_set *bs;
 };
 
@@ -1419,7 +1418,6 @@ static void cleanup_mapped_device(struct mapped_device *md)
 	if (md->kworker_task)
 		kthread_stop(md->kworker_task);
 	mempool_destroy(md->io_pool);
-	mempool_destroy(md->rq_pool);
 	if (md->bs)
 		bioset_free(md->bs);
 
@@ -1595,12 +1593,10 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 		goto out;
 	}
 
-	BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
+	BUG_ON(!p || md->io_pool || md->bs);
 
 	md->io_pool = p->io_pool;
 	p->io_pool = NULL;
-	md->rq_pool = p->rq_pool;
-	p->rq_pool = NULL;
 	md->bs = p->bs;
 	p->bs = NULL;
 
@@ -1777,7 +1773,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
 
 	switch (type) {
 	case DM_TYPE_REQUEST_BASED:
-		r = dm_old_init_request_queue(md);
+		r = dm_old_init_request_queue(md, t);
 		if (r) {
 			DMERR("Cannot initialize queue for request-based mapped device");
 			return r;
@@ -2493,7 +2489,6 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t
 					    unsigned integrity, unsigned per_io_data_size)
 {
 	struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
-	struct kmem_cache *cachep = NULL;
 	unsigned int pool_size = 0;
 	unsigned int front_pad;
 
@@ -2503,20 +2498,16 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t
 	switch (type) {
 	case DM_TYPE_BIO_BASED:
 	case DM_TYPE_DAX_BIO_BASED:
-		cachep = _io_cache;
 		pool_size = dm_get_reserved_bio_based_ios();
 		front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
+	
+		pools->io_pool = mempool_create_slab_pool(pool_size, _io_cache);
+		if (!pools->io_pool)
+			goto out;
 		break;
 	case DM_TYPE_REQUEST_BASED:
-		cachep = _rq_tio_cache;
-		pool_size = dm_get_reserved_rq_based_ios();
-		pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
-		if (!pools->rq_pool)
-			goto out;
-		/* fall through to setup remaining rq-based pools */
 	case DM_TYPE_MQ_REQUEST_BASED:
-		if (!pool_size)
-			pool_size = dm_get_reserved_rq_based_ios();
+		pool_size = dm_get_reserved_rq_based_ios();
 		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
 		/* per_io_data_size is used for blk-mq pdu at queue allocation */
 		break;
@@ -2524,12 +2515,6 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t
 		BUG();
 	}
 
-	if (cachep) {
-		pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
-		if (!pools->io_pool)
-			goto out;
-	}
-
 	pools->bs = bioset_create_nobvec(pool_size, front_pad);
 	if (!pools->bs)
 		goto out;
@@ -2551,7 +2536,6 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
 		return;
 
 	mempool_destroy(pools->io_pool);
-	mempool_destroy(pools->rq_pool);
 
 	if (pools->bs)
 		bioset_free(pools->bs);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index f0aad08b9654..f298b01f7ab3 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -95,8 +95,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
 /*
  * To check whether the target type is request-based or not (bio-based).
  */
-#define dm_target_request_based(t) (((t)->type->map_rq != NULL) || \
-				    ((t)->type->clone_and_map_rq != NULL))
+#define dm_target_request_based(t) ((t)->type->clone_and_map_rq != NULL)
 
 /*
  * To check whether the target type is a hybrid (capable of being
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index ef7962e84444..a7e6903866fd 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -55,8 +55,6 @@ typedef void (*dm_dtr_fn) (struct dm_target *ti);
  * = 2: The target wants to push back the io
  */
 typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio);
-typedef int (*dm_map_request_fn) (struct dm_target *ti, struct request *clone,
-				  union map_info *map_context);
 typedef int (*dm_clone_and_map_request_fn) (struct dm_target *ti,
 					    struct request *rq,
 					    union map_info *map_context,
@@ -163,7 +161,6 @@ struct target_type {
 	dm_ctr_fn ctr;
 	dm_dtr_fn dtr;
 	dm_map_fn map;
-	dm_map_request_fn map_rq;
 	dm_clone_and_map_request_fn clone_and_map_rq;
 	dm_release_clone_request_fn release_clone_rq;
 	dm_endio_fn end_io;
-- 
cgit v1.2.3


From 8ae94eb65be9425af4d57a4f4cfebfdf03081e93 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 3 Jan 2017 15:25:02 +0300
Subject: block/bsg: move queue creation into bsg_setup_queue

Simply the boilerplate code needed for bsg nodes a bit.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bsg-lib.c                     | 21 +++++++++++----------
 drivers/scsi/scsi_transport_fc.c    | 36 ++++++++----------------------------
 drivers/scsi/scsi_transport_iscsi.c | 15 ++++-----------
 include/linux/bsg-lib.h             |  5 ++---
 4 files changed, 25 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 9d652a992316..c74acf426840 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -177,7 +177,7 @@ failjob_rls_job:
  *
  * Drivers/subsys should pass this to the queue init function.
  */
-void bsg_request_fn(struct request_queue *q)
+static void bsg_request_fn(struct request_queue *q)
 	__releases(q->queue_lock)
 	__acquires(q->queue_lock)
 {
@@ -214,24 +214,24 @@ void bsg_request_fn(struct request_queue *q)
 	put_device(dev);
 	spin_lock_irq(q->queue_lock);
 }
-EXPORT_SYMBOL_GPL(bsg_request_fn);
 
 /**
  * bsg_setup_queue - Create and add the bsg hooks so we can receive requests
  * @dev: device to attach bsg device to
- * @q: request queue setup by caller
  * @name: device to give bsg device
  * @job_fn: bsg job handler
  * @dd_job_size: size of LLD data needed for each job
- *
- * The caller should have setup the reuqest queue with bsg_request_fn
- * as the request_fn.
  */
-int bsg_setup_queue(struct device *dev, struct request_queue *q,
-		    char *name, bsg_job_fn *job_fn, int dd_job_size)
+struct request_queue *bsg_setup_queue(struct device *dev, char *name,
+		bsg_job_fn *job_fn, int dd_job_size)
 {
+	struct request_queue *q;
 	int ret;
 
+	q = blk_init_queue(bsg_request_fn, NULL);
+	if (!q)
+		return ERR_PTR(-ENOMEM);
+
 	q->queuedata = dev;
 	q->bsg_job_size = dd_job_size;
 	q->bsg_job_fn = job_fn;
@@ -243,9 +243,10 @@ int bsg_setup_queue(struct device *dev, struct request_queue *q,
 	if (ret) {
 		printk(KERN_ERR "%s: bsg interface failed to "
 		       "initialize - register queue\n", dev->kobj.name);
-		return ret;
+		blk_cleanup_queue(q);
+		return ERR_PTR(ret);
 	}
 
-	return 0;
+	return q;
 }
 EXPORT_SYMBOL_GPL(bsg_setup_queue);
diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index afcedec58d17..13dcb9ba823c 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -3765,7 +3765,6 @@ fc_bsg_hostadd(struct Scsi_Host *shost, struct fc_host_attrs *fc_host)
 	struct device *dev = &shost->shost_gendev;
 	struct fc_internal *i = to_fc_internal(shost->transportt);
 	struct request_queue *q;
-	int err;
 	char bsg_name[20];
 
 	fc_host->rqst_q = NULL;
@@ -3776,24 +3775,14 @@ fc_bsg_hostadd(struct Scsi_Host *shost, struct fc_host_attrs *fc_host)
 	snprintf(bsg_name, sizeof(bsg_name),
 		 "fc_host%d", shost->host_no);
 
-	q = blk_init_queue(bsg_request_fn, NULL);
-	if (!q) {
-		dev_err(dev,
-			"fc_host%d: bsg interface failed to initialize - no request queue\n",
-			shost->host_no);
-		return -ENOMEM;
-	}
-
-	__scsi_init_queue(shost, q);
-	err = bsg_setup_queue(dev, q, bsg_name, fc_bsg_dispatch,
-				 i->f->dd_bsg_size);
-	if (err) {
+	q = bsg_setup_queue(dev, bsg_name, fc_bsg_dispatch, i->f->dd_bsg_size);
+	if (IS_ERR(q)) {
 		dev_err(dev,
 			"fc_host%d: bsg interface failed to initialize - setup queue\n",
 			shost->host_no);
-		blk_cleanup_queue(q);
-		return err;
+		return PTR_ERR(q);
 	}
+	__scsi_init_queue(shost, q);
 	blk_queue_rq_timed_out(q, fc_bsg_job_timeout);
 	blk_queue_rq_timeout(q, FC_DEFAULT_BSG_TIMEOUT);
 	fc_host->rqst_q = q;
@@ -3825,27 +3814,18 @@ fc_bsg_rportadd(struct Scsi_Host *shost, struct fc_rport *rport)
 	struct device *dev = &rport->dev;
 	struct fc_internal *i = to_fc_internal(shost->transportt);
 	struct request_queue *q;
-	int err;
 
 	rport->rqst_q = NULL;
 
 	if (!i->f->bsg_request)
 		return -ENOTSUPP;
 
-	q = blk_init_queue(bsg_request_fn, NULL);
-	if (!q) {
-		dev_err(dev, "bsg interface failed to initialize - no request queue\n");
-		return -ENOMEM;
-	}
-
-	__scsi_init_queue(shost, q);
-	err = bsg_setup_queue(dev, q, NULL, fc_bsg_dispatch, i->f->dd_bsg_size);
-	if (err) {
+	q = bsg_setup_queue(dev, NULL, fc_bsg_dispatch, i->f->dd_bsg_size);
+	if (IS_ERR(q)) {
 		dev_err(dev, "failed to setup bsg queue\n");
-		blk_cleanup_queue(q);
-		return err;
+		return PTR_ERR(q);
 	}
-
+	__scsi_init_queue(shost, q);
 	blk_queue_prep_rq(q, fc_bsg_rport_prep);
 	blk_queue_rq_timed_out(q, fc_bsg_job_timeout);
 	blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 04ebe6e65b83..568c9f26a561 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -1537,25 +1537,18 @@ iscsi_bsg_host_add(struct Scsi_Host *shost, struct iscsi_cls_host *ihost)
 	struct iscsi_internal *i = to_iscsi_internal(shost->transportt);
 	struct request_queue *q;
 	char bsg_name[20];
-	int ret;
 
 	if (!i->iscsi_transport->bsg_request)
 		return -ENOTSUPP;
 
 	snprintf(bsg_name, sizeof(bsg_name), "iscsi_host%d", shost->host_no);
-
-	q = blk_init_queue(bsg_request_fn, NULL);
-	if (!q)
-		return -ENOMEM;
-
-	__scsi_init_queue(shost, q);
-	ret = bsg_setup_queue(dev, q, bsg_name, iscsi_bsg_host_dispatch, 0);
-	if (ret) {
+	q = bsg_setup_queue(dev, bsg_name, iscsi_bsg_host_dispatch, 0);
+	if (IS_ERR(q)) {
 		shost_printk(KERN_ERR, shost, "bsg interface failed to "
 			     "initialize - no request queue\n");
-		blk_cleanup_queue(q);
-		return ret;
+		return PTR_ERR(q);
 	}
+	__scsi_init_queue(shost, q);
 
 	ihost->bsg_q = q;
 	return 0;
diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h
index 657a718c27d2..e34dde2da0ef 100644
--- a/include/linux/bsg-lib.h
+++ b/include/linux/bsg-lib.h
@@ -66,9 +66,8 @@ struct bsg_job {
 
 void bsg_job_done(struct bsg_job *job, int result,
 		  unsigned int reply_payload_rcv_len);
-int bsg_setup_queue(struct device *dev, struct request_queue *q, char *name,
-		    bsg_job_fn *job_fn, int dd_job_size);
-void bsg_request_fn(struct request_queue *q);
+struct request_queue *bsg_setup_queue(struct device *dev, char *name,
+		bsg_job_fn *job_fn, int dd_job_size);
 void bsg_job_put(struct bsg_job *job);
 int __must_check bsg_job_get(struct bsg_job *job);
 
-- 
cgit v1.2.3


From 82ed4db499b8598f16f8871261bff088d6b0597f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 27 Jan 2017 09:46:29 +0100
Subject: block: split scsi_request out of struct request

And require all drivers that want to support BLOCK_PC to allocate it
as the first thing of their private data.  To support this the legacy
IDE and BSG code is switched to set cmd_size on their queues to let
the block layer allocate the additional space.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c                         | 31 -----------
 block/blk-exec.c                         | 17 ------
 block/blk-mq.c                           | 10 ----
 block/bsg-lib.c                          | 34 ++++++++----
 block/bsg.c                              | 47 ++++++++---------
 block/scsi_ioctl.c                       | 76 ++++++++++++++------------
 drivers/ata/libata-scsi.c                |  2 +-
 drivers/block/cciss.c                    | 28 +++++-----
 drivers/block/pktcdvd.c                  |  6 +--
 drivers/block/virtio_blk.c               | 11 ++--
 drivers/cdrom/cdrom.c                    | 32 +++++------
 drivers/ide/ide-atapi.c                  | 43 ++++++++-------
 drivers/ide/ide-cd.c                     | 91 +++++++++++++++-----------------
 drivers/ide/ide-cd_ioctl.c               |  1 +
 drivers/ide/ide-cd_verbose.c             |  6 +--
 drivers/ide/ide-devsets.c                |  9 ++--
 drivers/ide/ide-disk.c                   |  1 +
 drivers/ide/ide-eh.c                     |  2 +-
 drivers/ide/ide-floppy.c                 |  4 +-
 drivers/ide/ide-io.c                     |  3 +-
 drivers/ide/ide-ioctls.c                 |  6 ++-
 drivers/ide/ide-park.c                   | 12 +++--
 drivers/ide/ide-pm.c                     |  2 +
 drivers/ide/ide-probe.c                  | 36 +++++++++++--
 drivers/ide/ide-tape.c                   | 32 +++++------
 drivers/ide/ide-taskfile.c               |  1 +
 drivers/ide/sis5513.c                    |  2 +-
 drivers/message/fusion/mptsas.c          |  8 +--
 drivers/scsi/libfc/fc_lport.c            |  2 +-
 drivers/scsi/libsas/sas_expander.c       |  8 +--
 drivers/scsi/libsas/sas_host_smp.c       | 38 ++++++-------
 drivers/scsi/mpt3sas/mpt3sas_transport.c |  8 +--
 drivers/scsi/osd/osd_initiator.c         | 19 +++----
 drivers/scsi/osst.c                      | 15 +++---
 drivers/scsi/qla2xxx/qla_bsg.c           |  2 +-
 drivers/scsi/qla2xxx/qla_isr.c           |  6 ++-
 drivers/scsi/qla2xxx/qla_mr.c            |  2 +-
 drivers/scsi/scsi_error.c                | 22 ++++----
 drivers/scsi/scsi_lib.c                  | 51 ++++++++++--------
 drivers/scsi/scsi_transport_sas.c        |  5 ++
 drivers/scsi/sd.c                        |  4 +-
 drivers/scsi/sg.c                        | 30 ++++++-----
 drivers/scsi/st.c                        | 22 ++++----
 drivers/target/target_core_pscsi.c       | 11 ++--
 fs/nfsd/blocklayout.c                    | 17 +++---
 include/linux/blkdev.h                   | 13 -----
 include/linux/ide.h                      |  8 ++-
 include/scsi/scsi_cmnd.h                 |  2 +
 include/scsi/scsi_request.h              | 30 +++++++++++
 49 files changed, 458 insertions(+), 410 deletions(-)
 create mode 100644 include/scsi/scsi_request.h

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 0a485ad802c9..f3bc083d4c14 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -132,8 +132,6 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	rq->__sector = (sector_t) -1;
 	INIT_HLIST_NODE(&rq->hash);
 	RB_CLEAR_NODE(&rq->rb_node);
-	rq->cmd = rq->__cmd;
-	rq->cmd_len = BLK_MAX_CDB;
 	rq->tag = -1;
 	rq->internal_tag = -1;
 	rq->start_time = jiffies;
@@ -160,8 +158,6 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 
 void blk_dump_rq_flags(struct request *rq, char *msg)
 {
-	int bit;
-
 	printk(KERN_INFO "%s: dev %s: type=%x, flags=%llx\n", msg,
 		rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
 		(unsigned long long) rq->cmd_flags);
@@ -171,13 +167,6 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
 	       blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
 	printk(KERN_INFO "  bio %p, biotail %p, len %u\n",
 	       rq->bio, rq->biotail, blk_rq_bytes(rq));
-
-	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
-		printk(KERN_INFO "  cdb: ");
-		for (bit = 0; bit < BLK_MAX_CDB; bit++)
-			printk("%02x ", rq->cmd[bit]);
-		printk("\n");
-	}
 }
 EXPORT_SYMBOL(blk_dump_rq_flags);
 
@@ -1315,18 +1304,6 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(blk_get_request);
 
-/**
- * blk_rq_set_block_pc - initialize a request to type BLOCK_PC
- * @rq:		request to be initialized
- *
- */
-void blk_rq_set_block_pc(struct request *rq)
-{
-	rq->cmd_type = REQ_TYPE_BLOCK_PC;
-	memset(rq->__cmd, 0, sizeof(rq->__cmd));
-}
-EXPORT_SYMBOL(blk_rq_set_block_pc);
-
 /**
  * blk_requeue_request - put a request back on queue
  * @q:		request queue where request should be inserted
@@ -2459,14 +2436,6 @@ void blk_start_request(struct request *req)
 		wbt_issue(req->q->rq_wb, &req->issue_stat);
 	}
 
-	/*
-	 * We are now handing the request to the hardware, initialize
-	 * resid_len to full count and add the timeout handler.
-	 */
-	req->resid_len = blk_rq_bytes(req);
-	if (unlikely(blk_bidi_rq(req)))
-		req->next_rq->resid_len = blk_rq_bytes(req->next_rq);
-
 	BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
 	blk_add_timer(req);
 }
diff --git a/block/blk-exec.c b/block/blk-exec.c
index ed1f10165268..ed51800f4b44 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -11,11 +11,6 @@
 #include "blk.h"
 #include "blk-mq-sched.h"
 
-/*
- * for max sense size
- */
-#include <scsi/scsi_cmnd.h>
-
 /**
  * blk_end_sync_rq - executes a completion event on a request
  * @rq: request to complete
@@ -101,16 +96,9 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
 		   struct request *rq, int at_head)
 {
 	DECLARE_COMPLETION_ONSTACK(wait);
-	char sense[SCSI_SENSE_BUFFERSIZE];
 	int err = 0;
 	unsigned long hang_check;
 
-	if (!rq->sense) {
-		memset(sense, 0, sizeof(sense));
-		rq->sense = sense;
-		rq->sense_len = 0;
-	}
-
 	rq->end_io_data = &wait;
 	blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
 
@@ -124,11 +112,6 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
 	if (rq->errors)
 		err = -EIO;
 
-	if (rq->sense == sense)	{
-		rq->sense = NULL;
-		rq->sense_len = 0;
-	}
-
 	return err;
 }
 EXPORT_SYMBOL(blk_execute_rq);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 60dac10228fe..3d76e860f126 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -199,13 +199,7 @@ void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 	rq->special = NULL;
 	/* tag was already set */
 	rq->errors = 0;
-
-	rq->cmd = rq->__cmd;
-
 	rq->extra_len = 0;
-	rq->sense_len = 0;
-	rq->resid_len = 0;
-	rq->sense = NULL;
 
 	INIT_LIST_HEAD(&rq->timeout_list);
 	rq->timeout = 0;
@@ -487,10 +481,6 @@ void blk_mq_start_request(struct request *rq)
 
 	trace_block_rq_issue(q, rq);
 
-	rq->resid_len = blk_rq_bytes(rq);
-	if (unlikely(blk_bidi_rq(rq)))
-		rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
-
 	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
 		blk_stat_set_issue_time(&rq->issue_stat);
 		rq->rq_flags |= RQF_STATS;
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index c74acf426840..cd15f9dbb147 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -71,22 +71,24 @@ void bsg_job_done(struct bsg_job *job, int result,
 {
 	struct request *req = job->req;
 	struct request *rsp = req->next_rq;
+	struct scsi_request *rq = scsi_req(req);
 	int err;
 
 	err = job->req->errors = result;
 	if (err < 0)
 		/* we're only returning the result field in the reply */
-		job->req->sense_len = sizeof(u32);
+		rq->sense_len = sizeof(u32);
 	else
-		job->req->sense_len = job->reply_len;
+		rq->sense_len = job->reply_len;
 	/* we assume all request payload was transferred, residual == 0 */
-	req->resid_len = 0;
+	rq->resid_len = 0;
 
 	if (rsp) {
-		WARN_ON(reply_payload_rcv_len > rsp->resid_len);
+		WARN_ON(reply_payload_rcv_len > scsi_req(rsp)->resid_len);
 
 		/* set reply (bidi) residual */
-		rsp->resid_len -= min(reply_payload_rcv_len, rsp->resid_len);
+		scsi_req(rsp)->resid_len -=
+			min(reply_payload_rcv_len, scsi_req(rsp)->resid_len);
 	}
 	blk_complete_request(req);
 }
@@ -113,6 +115,7 @@ static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req)
 	if (!buf->sg_list)
 		return -ENOMEM;
 	sg_init_table(buf->sg_list, req->nr_phys_segments);
+	scsi_req(req)->resid_len = blk_rq_bytes(req);
 	buf->sg_cnt = blk_rq_map_sg(req->q, req, buf->sg_list);
 	buf->payload_len = blk_rq_bytes(req);
 	return 0;
@@ -127,6 +130,7 @@ static int bsg_create_job(struct device *dev, struct request *req)
 {
 	struct request *rsp = req->next_rq;
 	struct request_queue *q = req->q;
+	struct scsi_request *rq = scsi_req(req);
 	struct bsg_job *job;
 	int ret;
 
@@ -140,9 +144,9 @@ static int bsg_create_job(struct device *dev, struct request *req)
 	job->req = req;
 	if (q->bsg_job_size)
 		job->dd_data = (void *)&job[1];
-	job->request = req->cmd;
-	job->request_len = req->cmd_len;
-	job->reply = req->sense;
+	job->request = rq->cmd;
+	job->request_len = rq->cmd_len;
+	job->reply = rq->sense;
 	job->reply_len = SCSI_SENSE_BUFFERSIZE;	/* Size of sense buffer
 						 * allocated */
 	if (req->bio) {
@@ -228,9 +232,15 @@ struct request_queue *bsg_setup_queue(struct device *dev, char *name,
 	struct request_queue *q;
 	int ret;
 
-	q = blk_init_queue(bsg_request_fn, NULL);
+	q = blk_alloc_queue(GFP_KERNEL);
 	if (!q)
 		return ERR_PTR(-ENOMEM);
+	q->cmd_size = sizeof(struct scsi_request);
+	q->request_fn = bsg_request_fn;
+
+	ret = blk_init_allocated_queue(q);
+	if (ret)
+		goto out_cleanup_queue;
 
 	q->queuedata = dev;
 	q->bsg_job_size = dd_job_size;
@@ -243,10 +253,12 @@ struct request_queue *bsg_setup_queue(struct device *dev, char *name,
 	if (ret) {
 		printk(KERN_ERR "%s: bsg interface failed to "
 		       "initialize - register queue\n", dev->kobj.name);
-		blk_cleanup_queue(q);
-		return ERR_PTR(ret);
+		goto out_cleanup_queue;
 	}
 
 	return q;
+out_cleanup_queue:
+	blk_cleanup_queue(q);
+	return ERR_PTR(ret);
 }
 EXPORT_SYMBOL_GPL(bsg_setup_queue);
diff --git a/block/bsg.c b/block/bsg.c
index a57046de2f07..e34c3320956c 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -85,7 +85,6 @@ struct bsg_command {
 	struct bio *bidi_bio;
 	int err;
 	struct sg_io_v4 hdr;
-	char sense[SCSI_SENSE_BUFFERSIZE];
 };
 
 static void bsg_free_command(struct bsg_command *bc)
@@ -140,18 +139,20 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
 				struct sg_io_v4 *hdr, struct bsg_device *bd,
 				fmode_t has_write_perm)
 {
+	struct scsi_request *req = scsi_req(rq);
+
 	if (hdr->request_len > BLK_MAX_CDB) {
-		rq->cmd = kzalloc(hdr->request_len, GFP_KERNEL);
-		if (!rq->cmd)
+		req->cmd = kzalloc(hdr->request_len, GFP_KERNEL);
+		if (!req->cmd)
 			return -ENOMEM;
 	}
 
-	if (copy_from_user(rq->cmd, (void __user *)(unsigned long)hdr->request,
+	if (copy_from_user(req->cmd, (void __user *)(unsigned long)hdr->request,
 			   hdr->request_len))
 		return -EFAULT;
 
 	if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) {
-		if (blk_verify_command(rq->cmd, has_write_perm))
+		if (blk_verify_command(req->cmd, has_write_perm))
 			return -EPERM;
 	} else if (!capable(CAP_SYS_RAWIO))
 		return -EPERM;
@@ -159,7 +160,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
 	/*
 	 * fill in request structure
 	 */
-	rq->cmd_len = hdr->request_len;
+	req->cmd_len = hdr->request_len;
 
 	rq->timeout = msecs_to_jiffies(hdr->timeout);
 	if (!rq->timeout)
@@ -205,8 +206,7 @@ bsg_validate_sgv4_hdr(struct sg_io_v4 *hdr, int *rw)
  * map sg_io_v4 to a request.
  */
 static struct request *
-bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm,
-	    u8 *sense)
+bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm)
 {
 	struct request_queue *q = bd->queue;
 	struct request *rq, *next_rq = NULL;
@@ -236,7 +236,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm,
 	rq = blk_get_request(q, rw, GFP_KERNEL);
 	if (IS_ERR(rq))
 		return rq;
-	blk_rq_set_block_pc(rq);
+	scsi_req_init(rq);
 
 	ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, has_write_perm);
 	if (ret)
@@ -280,13 +280,9 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm,
 			goto out;
 	}
 
-	rq->sense = sense;
-	rq->sense_len = 0;
-
 	return rq;
 out:
-	if (rq->cmd != rq->__cmd)
-		kfree(rq->cmd);
+	scsi_req_free_cmd(scsi_req(rq));
 	blk_put_request(rq);
 	if (next_rq) {
 		blk_rq_unmap_user(next_rq->bio);
@@ -393,6 +389,7 @@ static struct bsg_command *bsg_get_done_cmd(struct bsg_device *bd)
 static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
 				    struct bio *bio, struct bio *bidi_bio)
 {
+	struct scsi_request *req = scsi_req(rq);
 	int ret = 0;
 
 	dprintk("rq %p bio %p 0x%x\n", rq, bio, rq->errors);
@@ -407,12 +404,12 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
 		hdr->info |= SG_INFO_CHECK;
 	hdr->response_len = 0;
 
-	if (rq->sense_len && hdr->response) {
+	if (req->sense_len && hdr->response) {
 		int len = min_t(unsigned int, hdr->max_response_len,
-					rq->sense_len);
+					req->sense_len);
 
 		ret = copy_to_user((void __user *)(unsigned long)hdr->response,
-				   rq->sense, len);
+				   req->sense, len);
 		if (!ret)
 			hdr->response_len = len;
 		else
@@ -420,14 +417,14 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
 	}
 
 	if (rq->next_rq) {
-		hdr->dout_resid = rq->resid_len;
-		hdr->din_resid = rq->next_rq->resid_len;
+		hdr->dout_resid = req->resid_len;
+		hdr->din_resid = scsi_req(rq->next_rq)->resid_len;
 		blk_rq_unmap_user(bidi_bio);
 		blk_put_request(rq->next_rq);
 	} else if (rq_data_dir(rq) == READ)
-		hdr->din_resid = rq->resid_len;
+		hdr->din_resid = req->resid_len;
 	else
-		hdr->dout_resid = rq->resid_len;
+		hdr->dout_resid = req->resid_len;
 
 	/*
 	 * If the request generated a negative error number, return it
@@ -439,8 +436,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
 		ret = rq->errors;
 
 	blk_rq_unmap_user(bio);
-	if (rq->cmd != rq->__cmd)
-		kfree(rq->cmd);
+	scsi_req_free_cmd(req);
 	blk_put_request(rq);
 
 	return ret;
@@ -625,7 +621,7 @@ static int __bsg_write(struct bsg_device *bd, const char __user *buf,
 		/*
 		 * get a request, fill in the blanks, and add to request queue
 		 */
-		rq = bsg_map_hdr(bd, &bc->hdr, has_write_perm, bc->sense);
+		rq = bsg_map_hdr(bd, &bc->hdr, has_write_perm);
 		if (IS_ERR(rq)) {
 			ret = PTR_ERR(rq);
 			rq = NULL;
@@ -911,12 +907,11 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		struct bio *bio, *bidi_bio = NULL;
 		struct sg_io_v4 hdr;
 		int at_head;
-		u8 sense[SCSI_SENSE_BUFFERSIZE];
 
 		if (copy_from_user(&hdr, uarg, sizeof(hdr)))
 			return -EFAULT;
 
-		rq = bsg_map_hdr(bd, &hdr, file->f_mode & FMODE_WRITE, sense);
+		rq = bsg_map_hdr(bd, &hdr, file->f_mode & FMODE_WRITE);
 		if (IS_ERR(rq))
 			return PTR_ERR(rq);
 
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index c2b64923ab66..7edf44f25e08 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -230,15 +230,17 @@ EXPORT_SYMBOL(blk_verify_command);
 static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq,
 			     struct sg_io_hdr *hdr, fmode_t mode)
 {
-	if (copy_from_user(rq->cmd, hdr->cmdp, hdr->cmd_len))
+	struct scsi_request *req = scsi_req(rq);
+
+	if (copy_from_user(req->cmd, hdr->cmdp, hdr->cmd_len))
 		return -EFAULT;
-	if (blk_verify_command(rq->cmd, mode & FMODE_WRITE))
+	if (blk_verify_command(req->cmd, mode & FMODE_WRITE))
 		return -EPERM;
 
 	/*
 	 * fill in request structure
 	 */
-	rq->cmd_len = hdr->cmd_len;
+	req->cmd_len = hdr->cmd_len;
 
 	rq->timeout = msecs_to_jiffies(hdr->timeout);
 	if (!rq->timeout)
@@ -254,6 +256,7 @@ static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq,
 static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr,
 				 struct bio *bio)
 {
+	struct scsi_request *req = scsi_req(rq);
 	int r, ret = 0;
 
 	/*
@@ -267,13 +270,13 @@ static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr,
 	hdr->info = 0;
 	if (hdr->masked_status || hdr->host_status || hdr->driver_status)
 		hdr->info |= SG_INFO_CHECK;
-	hdr->resid = rq->resid_len;
+	hdr->resid = req->resid_len;
 	hdr->sb_len_wr = 0;
 
-	if (rq->sense_len && hdr->sbp) {
-		int len = min((unsigned int) hdr->mx_sb_len, rq->sense_len);
+	if (req->sense_len && hdr->sbp) {
+		int len = min((unsigned int) hdr->mx_sb_len, req->sense_len);
 
-		if (!copy_to_user(hdr->sbp, rq->sense, len))
+		if (!copy_to_user(hdr->sbp, req->sense, len))
 			hdr->sb_len_wr = len;
 		else
 			ret = -EFAULT;
@@ -294,7 +297,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 	int writing = 0;
 	int at_head = 0;
 	struct request *rq;
-	char sense[SCSI_SENSE_BUFFERSIZE];
+	struct scsi_request *req;
 	struct bio *bio;
 
 	if (hdr->interface_id != 'S')
@@ -321,11 +324,12 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 	rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL);
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);
-	blk_rq_set_block_pc(rq);
+	req = scsi_req(rq);
+	scsi_req_init(rq);
 
 	if (hdr->cmd_len > BLK_MAX_CDB) {
-		rq->cmd = kzalloc(hdr->cmd_len, GFP_KERNEL);
-		if (!rq->cmd)
+		req->cmd = kzalloc(hdr->cmd_len, GFP_KERNEL);
+		if (!req->cmd)
 			goto out_put_request;
 	}
 
@@ -357,9 +361,6 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 		goto out_free_cdb;
 
 	bio = rq->bio;
-	memset(sense, 0, sizeof(sense));
-	rq->sense = sense;
-	rq->sense_len = 0;
 	rq->retries = 0;
 
 	start_time = jiffies;
@@ -375,8 +376,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 	ret = blk_complete_sghdr_rq(rq, hdr, bio);
 
 out_free_cdb:
-	if (rq->cmd != rq->__cmd)
-		kfree(rq->cmd);
+	scsi_req_free_cmd(req);
 out_put_request:
 	blk_put_request(rq);
 	return ret;
@@ -420,9 +420,10 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 		struct scsi_ioctl_command __user *sic)
 {
 	struct request *rq;
+	struct scsi_request *req;
 	int err;
 	unsigned int in_len, out_len, bytes, opcode, cmdlen;
-	char *buffer = NULL, sense[SCSI_SENSE_BUFFERSIZE];
+	char *buffer = NULL;
 
 	if (!sic)
 		return -EINVAL;
@@ -452,7 +453,8 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 		err = PTR_ERR(rq);
 		goto error_free_buffer;
 	}
-	blk_rq_set_block_pc(rq);
+	req = scsi_req(rq);
+	scsi_req_init(rq);
 
 	cmdlen = COMMAND_SIZE(opcode);
 
@@ -460,14 +462,14 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 	 * get command and data to send to device, if any
 	 */
 	err = -EFAULT;
-	rq->cmd_len = cmdlen;
-	if (copy_from_user(rq->cmd, sic->data, cmdlen))
+	req->cmd_len = cmdlen;
+	if (copy_from_user(req->cmd, sic->data, cmdlen))
 		goto error;
 
 	if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len))
 		goto error;
 
-	err = blk_verify_command(rq->cmd, mode & FMODE_WRITE);
+	err = blk_verify_command(req->cmd, mode & FMODE_WRITE);
 	if (err)
 		goto error;
 
@@ -503,18 +505,14 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 		goto error;
 	}
 
-	memset(sense, 0, sizeof(sense));
-	rq->sense = sense;
-	rq->sense_len = 0;
-
 	blk_execute_rq(q, disk, rq, 0);
 
 	err = rq->errors & 0xff;	/* only 8 bit SCSI status */
 	if (err) {
-		if (rq->sense_len && rq->sense) {
-			bytes = (OMAX_SB_LEN > rq->sense_len) ?
-				rq->sense_len : OMAX_SB_LEN;
-			if (copy_to_user(sic->data, rq->sense, bytes))
+		if (req->sense_len && req->sense) {
+			bytes = (OMAX_SB_LEN > req->sense_len) ?
+				req->sense_len : OMAX_SB_LEN;
+			if (copy_to_user(sic->data, req->sense, bytes))
 				err = -EFAULT;
 		}
 	} else {
@@ -542,11 +540,11 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
 	rq = blk_get_request(q, WRITE, __GFP_RECLAIM);
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);
-	blk_rq_set_block_pc(rq);
+	scsi_req_init(rq);
 	rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
-	rq->cmd[0] = cmd;
-	rq->cmd[4] = data;
-	rq->cmd_len = 6;
+	scsi_req(rq)->cmd[0] = cmd;
+	scsi_req(rq)->cmd[4] = data;
+	scsi_req(rq)->cmd_len = 6;
 	err = blk_execute_rq(q, bd_disk, rq, 0);
 	blk_put_request(rq);
 
@@ -743,6 +741,18 @@ int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode,
 }
 EXPORT_SYMBOL(scsi_cmd_blk_ioctl);
 
+void scsi_req_init(struct request *rq)
+{
+	struct scsi_request *req = scsi_req(rq);
+
+	rq->cmd_type = REQ_TYPE_BLOCK_PC;
+	memset(req->__cmd, 0, sizeof(req->__cmd));
+	req->cmd = req->__cmd;
+	req->cmd_len = BLK_MAX_CDB;
+	req->sense_len = 0;
+}
+EXPORT_SYMBOL(scsi_req_init);
+
 static int __init blk_scsi_ioctl_init(void)
 {
 	blk_set_cmd_filter_defaults(&blk_default_cmd_filter);
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 1f863e757ee4..6abd73975f87 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -1271,7 +1271,7 @@ static int atapi_drain_needed(struct request *rq)
 	if (!blk_rq_bytes(rq) || op_is_write(req_op(rq)))
 		return 0;
 
-	return atapi_cmd_type(rq->cmd[0]) == ATAPI_MISC;
+	return atapi_cmd_type(scsi_req(rq)->cmd[0]) == ATAPI_MISC;
 }
 
 static int ata_scsi_dev_config(struct scsi_device *sdev,
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index e5c5b8eb14a9..b93bb73b49d2 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -52,6 +52,7 @@
 #include <scsi/scsi.h>
 #include <scsi/sg.h>
 #include <scsi/scsi_ioctl.h>
+#include <scsi/scsi_request.h>
 #include <linux/cdrom.h>
 #include <linux/scatterlist.h>
 #include <linux/kthread.h>
@@ -1854,7 +1855,7 @@ static void cciss_softirq_done(struct request *rq)
 
 	/* set the residual count for pc requests */
 	if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
-		rq->resid_len = c->err_info->ResidualCnt;
+		scsi_req(rq)->resid_len = c->err_info->ResidualCnt;
 
 	blk_end_request_all(rq, (rq->errors == 0) ? 0 : -EIO);
 
@@ -1941,9 +1942,16 @@ static void cciss_get_serial_no(ctlr_info_t *h, int logvol,
 static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
 				int drv_index)
 {
-	disk->queue = blk_init_queue(do_cciss_request, &h->lock);
+	disk->queue = blk_alloc_queue(GFP_KERNEL);
 	if (!disk->queue)
 		goto init_queue_failure;
+
+	disk->queue->cmd_size = sizeof(struct scsi_request);
+	disk->queue->request_fn = do_cciss_request;
+	disk->queue->queue_lock = &h->lock;
+	if (blk_init_allocated_queue(disk->queue) < 0)
+		goto cleanup_queue;
+
 	sprintf(disk->disk_name, "cciss/c%dd%d", h->ctlr, drv_index);
 	disk->major = h->major;
 	disk->first_minor = drv_index << NWD_SHIFT;
@@ -3111,15 +3119,7 @@ static inline int evaluate_target_status(ctlr_info_t *h,
 		return error_value;
 	}
 
-	/* SG_IO or similar, copy sense data back */
-	if (cmd->rq->sense) {
-		if (cmd->rq->sense_len > cmd->err_info->SenseLen)
-			cmd->rq->sense_len = cmd->err_info->SenseLen;
-		memcpy(cmd->rq->sense, cmd->err_info->SenseInfo,
-			cmd->rq->sense_len);
-	} else
-		cmd->rq->sense_len = 0;
-
+	scsi_req(cmd->rq)->sense_len = cmd->err_info->SenseLen;
 	return error_value;
 }
 
@@ -3150,7 +3150,6 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 			dev_warn(&h->pdev->dev, "cmd %p has"
 			       " completed with data underrun "
 			       "reported\n", cmd);
-			cmd->rq->resid_len = cmd->err_info->ResidualCnt;
 		}
 		break;
 	case CMD_DATA_OVERRUN:
@@ -3426,8 +3425,9 @@ static void do_cciss_request(struct request_queue *q)
 			c->Request.CDB[14] = c->Request.CDB[15] = 0;
 		}
 	} else if (creq->cmd_type == REQ_TYPE_BLOCK_PC) {
-		c->Request.CDBLen = creq->cmd_len;
-		memcpy(c->Request.CDB, creq->cmd, BLK_MAX_CDB);
+		c->Request.CDBLen = scsi_req(creq)->cmd_len;
+		memcpy(c->Request.CDB, scsi_req(creq)->cmd, BLK_MAX_CDB);
+		scsi_req(creq)->sense = c->err_info->SenseInfo;
 	} else {
 		dev_warn(&h->pdev->dev, "bad request type %d\n",
 			creq->cmd_type);
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 1b94c1ca5c5f..918a92ccd0c0 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -707,7 +707,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
 			     WRITE : READ, __GFP_RECLAIM);
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);
-	blk_rq_set_block_pc(rq);
+	scsi_req_init(rq);
 
 	if (cgc->buflen) {
 		ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen,
@@ -716,8 +716,8 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
 			goto out;
 	}
 
-	rq->cmd_len = COMMAND_SIZE(cgc->cmd[0]);
-	memcpy(rq->cmd, cgc->cmd, CDROM_PACKET_SIZE);
+	scsi_req(rq)->cmd_len = COMMAND_SIZE(cgc->cmd[0]);
+	memcpy(scsi_req(rq)->cmd, cgc->cmd, CDROM_PACKET_SIZE);
 
 	rq->timeout = 60*HZ;
 	if (cgc->quiet)
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 10332c24f961..3027d2efc5d8 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -52,6 +52,7 @@ struct virtio_blk {
 };
 
 struct virtblk_req {
+	struct scsi_request sreq;	/* for SCSI passthrough */
 	struct request *req;
 	struct virtio_blk_outhdr out_hdr;
 	struct virtio_scsi_inhdr in_hdr;
@@ -91,7 +92,7 @@ static int __virtblk_add_req(struct virtqueue *vq,
 	 * inhdr with additional status information.
 	 */
 	if (type == cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_SCSI_CMD)) {
-		sg_init_one(&cmd, vbr->req->cmd, vbr->req->cmd_len);
+		sg_init_one(&cmd, vbr->sreq.cmd, vbr->sreq.cmd_len);
 		sgs[num_out++] = &cmd;
 	}
 
@@ -103,7 +104,6 @@ static int __virtblk_add_req(struct virtqueue *vq,
 	}
 
 	if (type == cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_SCSI_CMD)) {
-		memcpy(vbr->sense, vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
 		sg_init_one(&sense, vbr->sense, SCSI_SENSE_BUFFERSIZE);
 		sgs[num_out + num_in++] = &sense;
 		sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr));
@@ -123,8 +123,10 @@ static inline void virtblk_request_done(struct request *req)
 	int error = virtblk_result(vbr);
 
 	if (req->cmd_type == REQ_TYPE_BLOCK_PC) {
-		req->resid_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.residual);
-		req->sense_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.sense_len);
+		scsi_req(req)->resid_len =
+			virtio32_to_cpu(vblk->vdev, vbr->in_hdr.residual);
+		vbr->sreq.sense_len =
+			virtio32_to_cpu(vblk->vdev, vbr->in_hdr.sense_len);
 		req->errors = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors);
 	} else if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
 		req->errors = (error != 0);
@@ -538,6 +540,7 @@ static int virtblk_init_request(void *data, struct request *rq,
 	struct virtio_blk *vblk = data;
 	struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);
 
+	vbr->sreq.sense = vbr->sense;
 	sg_init_table(vbr->sg, vblk->sg_elems);
 	return 0;
 }
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 59cca72647a6..36f5237a8a69 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -281,8 +281,8 @@
 #include <linux/fcntl.h>
 #include <linux/blkdev.h>
 #include <linux/times.h>
-
 #include <linux/uaccess.h>
+#include <scsi/scsi_request.h>
 
 /* used to tell the module to turn on full debugging messages */
 static bool debug;
@@ -2172,6 +2172,7 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
 {
 	struct request_queue *q = cdi->disk->queue;
 	struct request *rq;
+	struct scsi_request *req;
 	struct bio *bio;
 	unsigned int len;
 	int nr, ret = 0;
@@ -2195,7 +2196,8 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
 			ret = PTR_ERR(rq);
 			break;
 		}
-		blk_rq_set_block_pc(rq);
+		req = scsi_req(rq);
+		scsi_req_init(rq);
 
 		ret = blk_rq_map_user(q, rq, NULL, ubuf, len, GFP_KERNEL);
 		if (ret) {
@@ -2203,23 +2205,23 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
 			break;
 		}
 
-		rq->cmd[0] = GPCMD_READ_CD;
-		rq->cmd[1] = 1 << 2;
-		rq->cmd[2] = (lba >> 24) & 0xff;
-		rq->cmd[3] = (lba >> 16) & 0xff;
-		rq->cmd[4] = (lba >>  8) & 0xff;
-		rq->cmd[5] = lba & 0xff;
-		rq->cmd[6] = (nr >> 16) & 0xff;
-		rq->cmd[7] = (nr >>  8) & 0xff;
-		rq->cmd[8] = nr & 0xff;
-		rq->cmd[9] = 0xf8;
-
-		rq->cmd_len = 12;
+		req->cmd[0] = GPCMD_READ_CD;
+		req->cmd[1] = 1 << 2;
+		req->cmd[2] = (lba >> 24) & 0xff;
+		req->cmd[3] = (lba >> 16) & 0xff;
+		req->cmd[4] = (lba >>  8) & 0xff;
+		req->cmd[5] = lba & 0xff;
+		req->cmd[6] = (nr >> 16) & 0xff;
+		req->cmd[7] = (nr >>  8) & 0xff;
+		req->cmd[8] = nr & 0xff;
+		req->cmd[9] = 0xf8;
+
+		req->cmd_len = 12;
 		rq->timeout = 60 * HZ;
 		bio = rq->bio;
 
 		if (blk_execute_rq(q, cdi->disk, rq, 0)) {
-			struct request_sense *s = rq->sense;
+			struct request_sense *s = req->sense;
 			ret = -EIO;
 			cdi->last_sense = s->sense_key;
 		}
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index f90ea221f7f2..7c826ecbd276 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -93,6 +93,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
 	int error;
 
 	rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
+	scsi_req_init(rq);
 	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	rq->special = (char *)pc;
 
@@ -103,9 +104,9 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
 			goto put_req;
 	}
 
-	memcpy(rq->cmd, pc->c, 12);
+	memcpy(scsi_req(rq)->cmd, pc->c, 12);
 	if (drive->media == ide_tape)
-		rq->cmd[13] = REQ_IDETAPE_PC1;
+		scsi_req(rq)->cmd[13] = REQ_IDETAPE_PC1;
 	error = blk_execute_rq(drive->queue, disk, rq, 0);
 put_req:
 	blk_put_request(rq);
@@ -171,7 +172,8 @@ EXPORT_SYMBOL_GPL(ide_create_request_sense_cmd);
 void ide_prep_sense(ide_drive_t *drive, struct request *rq)
 {
 	struct request_sense *sense = &drive->sense_data;
-	struct request *sense_rq = &drive->sense_rq;
+	struct request *sense_rq = drive->sense_rq;
+	struct scsi_request *req = scsi_req(sense_rq);
 	unsigned int cmd_len, sense_len;
 	int err;
 
@@ -197,6 +199,7 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
 	memset(sense, 0, sizeof(*sense));
 
 	blk_rq_init(rq->q, sense_rq);
+	scsi_req_init(sense_rq);
 
 	err = blk_rq_map_kern(drive->queue, sense_rq, sense, sense_len,
 			      GFP_NOIO);
@@ -208,13 +211,13 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
 	}
 
 	sense_rq->rq_disk = rq->rq_disk;
-	sense_rq->cmd[0] = GPCMD_REQUEST_SENSE;
-	sense_rq->cmd[4] = cmd_len;
 	sense_rq->cmd_type = REQ_TYPE_ATA_SENSE;
 	sense_rq->rq_flags |= RQF_PREEMPT;
 
+	req->cmd[0] = GPCMD_REQUEST_SENSE;
+	req->cmd[4] = cmd_len;
 	if (drive->media == ide_tape)
-		sense_rq->cmd[13] = REQ_IDETAPE_PC1;
+		req->cmd[13] = REQ_IDETAPE_PC1;
 
 	drive->sense_rq_armed = true;
 }
@@ -229,12 +232,12 @@ int ide_queue_sense_rq(ide_drive_t *drive, void *special)
 		return -ENOMEM;
 	}
 
-	drive->sense_rq.special = special;
+	drive->sense_rq->special = special;
 	drive->sense_rq_armed = false;
 
 	drive->hwif->rq = NULL;
 
-	elv_add_request(drive->queue, &drive->sense_rq, ELEVATOR_INSERT_FRONT);
+	elv_add_request(drive->queue, drive->sense_rq, ELEVATOR_INSERT_FRONT);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(ide_queue_sense_rq);
@@ -247,14 +250,14 @@ EXPORT_SYMBOL_GPL(ide_queue_sense_rq);
 void ide_retry_pc(ide_drive_t *drive)
 {
 	struct request *failed_rq = drive->hwif->rq;
-	struct request *sense_rq = &drive->sense_rq;
+	struct request *sense_rq = drive->sense_rq;
 	struct ide_atapi_pc *pc = &drive->request_sense_pc;
 
 	(void)ide_read_error(drive);
 
 	/* init pc from sense_rq */
 	ide_init_pc(pc);
-	memcpy(pc->c, sense_rq->cmd, 12);
+	memcpy(pc->c, scsi_req(sense_rq)->cmd, 12);
 
 	if (drive->media == ide_tape)
 		drive->atapi_flags |= IDE_AFLAG_IGNORE_DSC;
@@ -286,7 +289,7 @@ int ide_cd_expiry(ide_drive_t *drive)
 	 * commands/drives support that. Let ide_timer_expiry keep polling us
 	 * for these.
 	 */
-	switch (rq->cmd[0]) {
+	switch (scsi_req(rq)->cmd[0]) {
 	case GPCMD_BLANK:
 	case GPCMD_FORMAT_UNIT:
 	case GPCMD_RESERVE_RZONE_TRACK:
@@ -297,7 +300,7 @@ int ide_cd_expiry(ide_drive_t *drive)
 	default:
 		if (!(rq->rq_flags & RQF_QUIET))
 			printk(KERN_INFO PFX "cmd 0x%x timed out\n",
-					 rq->cmd[0]);
+					 scsi_req(rq)->cmd[0]);
 		wait = 0;
 		break;
 	}
@@ -420,7 +423,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 						     ? "write" : "read");
 			pc->flags |= PC_FLAG_DMA_ERROR;
 		} else
-			rq->resid_len = 0;
+			scsi_req(rq)->resid_len = 0;
 		debug_log("%s: DMA finished\n", drive->name);
 	}
 
@@ -436,7 +439,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		local_irq_enable_in_hardirq();
 
 		if (drive->media == ide_tape &&
-		    (stat & ATA_ERR) && rq->cmd[0] == REQUEST_SENSE)
+		    (stat & ATA_ERR) && scsi_req(rq)->cmd[0] == REQUEST_SENSE)
 			stat &= ~ATA_ERR;
 
 		if ((stat & ATA_ERR) || (pc->flags & PC_FLAG_DMA_ERROR)) {
@@ -446,7 +449,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 			if (drive->media != ide_tape)
 				pc->rq->errors++;
 
-			if (rq->cmd[0] == REQUEST_SENSE) {
+			if (scsi_req(rq)->cmd[0] == REQUEST_SENSE) {
 				printk(KERN_ERR PFX "%s: I/O error in request "
 						"sense command\n", drive->name);
 				return ide_do_reset(drive);
@@ -512,7 +515,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 	ide_pio_bytes(drive, cmd, write, done);
 
 	/* Update transferred byte count */
-	rq->resid_len -= done;
+	scsi_req(rq)->resid_len -= done;
 
 	bcount -= done;
 
@@ -520,7 +523,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		ide_pad_transfer(drive, write, bcount);
 
 	debug_log("[cmd %x] transferred %d bytes, padded %d bytes, resid: %u\n",
-		  rq->cmd[0], done, bcount, rq->resid_len);
+		  rq->cmd[0], done, bcount, scsi_req(rq)->resid_len);
 
 	/* And set the interrupt handler again */
 	ide_set_handler(drive, ide_pc_intr, timeout);
@@ -603,7 +606,7 @@ static ide_startstop_t ide_transfer_pc(ide_drive_t *drive)
 
 	if (dev_is_idecd(drive)) {
 		/* ATAPI commands get padded out to 12 bytes minimum */
-		cmd_len = COMMAND_SIZE(rq->cmd[0]);
+		cmd_len = COMMAND_SIZE(scsi_req(rq)->cmd[0]);
 		if (cmd_len < ATAPI_MIN_CDB_BYTES)
 			cmd_len = ATAPI_MIN_CDB_BYTES;
 
@@ -650,7 +653,7 @@ static ide_startstop_t ide_transfer_pc(ide_drive_t *drive)
 
 	/* Send the actual packet */
 	if ((drive->atapi_flags & IDE_AFLAG_ZIP_DRIVE) == 0)
-		hwif->tp_ops->output_data(drive, NULL, rq->cmd, cmd_len);
+		hwif->tp_ops->output_data(drive, NULL, scsi_req(rq)->cmd, cmd_len);
 
 	/* Begin DMA, if necessary */
 	if (dev_is_idecd(drive)) {
@@ -695,7 +698,7 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd)
 							     bytes, 63 * 1024));
 
 		/* We haven't transferred any data yet */
-		rq->resid_len = bcount;
+		scsi_req(rq)->resid_len = bcount;
 
 		if (pc->flags & PC_FLAG_DMA_ERROR) {
 			pc->flags &= ~PC_FLAG_DMA_ERROR;
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 9cbd217bc0c9..6eb98725e194 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -121,7 +121,7 @@ static int cdrom_log_sense(ide_drive_t *drive, struct request *rq)
 		 * don't log START_STOP unit with LoEj set, since we cannot
 		 * reliably check if drive can auto-close
 		 */
-		if (rq->cmd[0] == GPCMD_START_STOP_UNIT && sense->asc == 0x24)
+		if (scsi_req(rq)->cmd[0] == GPCMD_START_STOP_UNIT && sense->asc == 0x24)
 			break;
 		log = 1;
 		break;
@@ -163,7 +163,7 @@ static void cdrom_analyze_sense_data(ide_drive_t *drive,
 	 * toc has not been recorded yet, it will fail with 05/24/00 (which is a
 	 * confusing error)
 	 */
-	if (failed_command && failed_command->cmd[0] == GPCMD_READ_TOC_PMA_ATIP)
+	if (failed_command && scsi_req(failed_command)->cmd[0] == GPCMD_READ_TOC_PMA_ATIP)
 		if (sense->sense_key == 0x05 && sense->asc == 0x24)
 			return;
 
@@ -219,15 +219,12 @@ static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq)
 	void *sense = bio_data(rq->bio);
 
 	if (failed) {
-		if (failed->sense) {
-			/*
-			 * Sense is always read into drive->sense_data.
-			 * Copy back if the failed request has its
-			 * sense pointer set.
-			 */
-			memcpy(failed->sense, sense, 18);
-			failed->sense_len = rq->sense_len;
-		}
+		/*
+		 * Sense is always read into drive->sense_data, copy back to the
+		 * original request.
+		 */
+		memcpy(scsi_req(failed)->sense, sense, 18);
+		scsi_req(failed)->sense_len = scsi_req(rq)->sense_len;
 		cdrom_analyze_sense_data(drive, failed);
 
 		if (ide_end_rq(drive, failed, -EIO, blk_rq_bytes(failed)))
@@ -338,7 +335,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 		 *
 		 * cdrom_log_sense() knows this!
 		 */
-		if (rq->cmd[0] == GPCMD_START_STOP_UNIT)
+		if (scsi_req(rq)->cmd[0] == GPCMD_START_STOP_UNIT)
 			break;
 		/* fall-through */
 	case DATA_PROTECT:
@@ -414,7 +411,7 @@ static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct ide_cmd *cmd)
 	 * Some of the trailing request sense fields are optional,
 	 * and some drives don't send them.  Sigh.
 	 */
-	if (rq->cmd[0] == GPCMD_REQUEST_SENSE &&
+	if (scsi_req(rq)->cmd[0] == GPCMD_REQUEST_SENSE &&
 	    cmd->nleft > 0 && cmd->nleft <= 5)
 		cmd->nleft = 0;
 }
@@ -425,12 +422,8 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
 		    req_flags_t rq_flags)
 {
 	struct cdrom_info *info = drive->driver_data;
-	struct request_sense local_sense;
 	int retries = 10;
-	req_flags_t flags = 0;
-
-	if (!sense)
-		sense = &local_sense;
+	bool failed;
 
 	ide_debug_log(IDE_DBG_PC, "cmd[0]: 0x%x, write: 0x%x, timeout: %d, "
 				  "rq_flags: 0x%x",
@@ -440,12 +433,12 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
 	do {
 		struct request *rq;
 		int error;
+		bool delay = false;
 
 		rq = blk_get_request(drive->queue, write, __GFP_RECLAIM);
-
-		memcpy(rq->cmd, cmd, BLK_MAX_CDB);
+		scsi_req_init(rq);
+		memcpy(scsi_req(rq)->cmd, cmd, BLK_MAX_CDB);
 		rq->cmd_type = REQ_TYPE_ATA_PC;
-		rq->sense = sense;
 		rq->rq_flags |= rq_flags;
 		rq->timeout = timeout;
 		if (buffer) {
@@ -460,21 +453,21 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
 		error = blk_execute_rq(drive->queue, info->disk, rq, 0);
 
 		if (buffer)
-			*bufflen = rq->resid_len;
-
-		flags = rq->rq_flags;
-		blk_put_request(rq);
+			*bufflen = scsi_req(rq)->resid_len;
+		if (sense)
+			memcpy(sense, scsi_req(rq)->sense, sizeof(*sense));
 
 		/*
 		 * FIXME: we should probably abort/retry or something in case of
 		 * failure.
 		 */
-		if (flags & RQF_FAILED) {
+		failed = (rq->rq_flags & RQF_FAILED) != 0;
+		if (failed) {
 			/*
 			 * The request failed.  Retry if it was due to a unit
 			 * attention status (usually means media was changed).
 			 */
-			struct request_sense *reqbuf = sense;
+			struct request_sense *reqbuf = scsi_req(rq)->sense;
 
 			if (reqbuf->sense_key == UNIT_ATTENTION)
 				cdrom_saw_media_change(drive);
@@ -485,19 +478,20 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
 				 * a disk.  Retry, but wait a little to give
 				 * the drive time to complete the load.
 				 */
-				ssleep(2);
+				delay = true;
 			} else {
 				/* otherwise, don't retry */
 				retries = 0;
 			}
 			--retries;
 		}
-
-		/* end of retry loop */
-	} while ((flags & RQF_FAILED) && retries >= 0);
+		blk_put_request(rq);
+		if (delay)
+			ssleep(2);
+	} while (failed && retries >= 0);
 
 	/* return an error if the command failed */
-	return (flags & RQF_FAILED) ? -EIO : 0;
+	return failed ? -EIO : 0;
 }
 
 /*
@@ -636,7 +630,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 		len -= blen;
 
 		if (sense && write == 0)
-			rq->sense_len += blen;
+			scsi_req(rq)->sense_len += blen;
 	}
 
 	/* pad, if necessary */
@@ -664,7 +658,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 
 out_end:
 	if (rq->cmd_type == REQ_TYPE_BLOCK_PC && rc == 0) {
-		rq->resid_len = 0;
+		scsi_req(rq)->resid_len = 0;
 		blk_end_request_all(rq, 0);
 		hwif->rq = NULL;
 	} else {
@@ -685,9 +679,9 @@ out_end:
 
 		/* make sure it's fully ended */
 		if (rq->cmd_type != REQ_TYPE_FS) {
-			rq->resid_len -= cmd->nbytes - cmd->nleft;
+			scsi_req(rq)->resid_len -= cmd->nbytes - cmd->nleft;
 			if (uptodate == 0 && (cmd->tf_flags & IDE_TFLAG_WRITE))
-				rq->resid_len += cmd->last_xfer_len;
+				scsi_req(rq)->resid_len += cmd->last_xfer_len;
 		}
 
 		ide_complete_rq(drive, uptodate ? 0 : -EIO, blk_rq_bytes(rq));
@@ -1312,28 +1306,29 @@ static int ide_cdrom_prep_fs(struct request_queue *q, struct request *rq)
 	int hard_sect = queue_logical_block_size(q);
 	long block = (long)blk_rq_pos(rq) / (hard_sect >> 9);
 	unsigned long blocks = blk_rq_sectors(rq) / (hard_sect >> 9);
+	struct scsi_request *req = scsi_req(rq);
 
-	memset(rq->cmd, 0, BLK_MAX_CDB);
+	memset(req->cmd, 0, BLK_MAX_CDB);
 
 	if (rq_data_dir(rq) == READ)
-		rq->cmd[0] = GPCMD_READ_10;
+		req->cmd[0] = GPCMD_READ_10;
 	else
-		rq->cmd[0] = GPCMD_WRITE_10;
+		req->cmd[0] = GPCMD_WRITE_10;
 
 	/*
 	 * fill in lba
 	 */
-	rq->cmd[2] = (block >> 24) & 0xff;
-	rq->cmd[3] = (block >> 16) & 0xff;
-	rq->cmd[4] = (block >>  8) & 0xff;
-	rq->cmd[5] = block & 0xff;
+	req->cmd[2] = (block >> 24) & 0xff;
+	req->cmd[3] = (block >> 16) & 0xff;
+	req->cmd[4] = (block >>  8) & 0xff;
+	req->cmd[5] = block & 0xff;
 
 	/*
 	 * and transfer length
 	 */
-	rq->cmd[7] = (blocks >> 8) & 0xff;
-	rq->cmd[8] = blocks & 0xff;
-	rq->cmd_len = 10;
+	req->cmd[7] = (blocks >> 8) & 0xff;
+	req->cmd[8] = blocks & 0xff;
+	req->cmd_len = 10;
 	return BLKPREP_OK;
 }
 
@@ -1343,7 +1338,7 @@ static int ide_cdrom_prep_fs(struct request_queue *q, struct request *rq)
  */
 static int ide_cdrom_prep_pc(struct request *rq)
 {
-	u8 *c = rq->cmd;
+	u8 *c = scsi_req(rq)->cmd;
 
 	/* transform 6-byte read/write commands to the 10-byte version */
 	if (c[0] == READ_6 || c[0] == WRITE_6) {
@@ -1354,7 +1349,7 @@ static int ide_cdrom_prep_pc(struct request *rq)
 		c[2] = 0;
 		c[1] &= 0xe0;
 		c[0] += (READ_10 - READ_6);
-		rq->cmd_len = 10;
+		scsi_req(rq)->cmd_len = 10;
 		return BLKPREP_OK;
 	}
 
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c
index f085e3a2e1d6..da0aa0153fb1 100644
--- a/drivers/ide/ide-cd_ioctl.c
+++ b/drivers/ide/ide-cd_ioctl.c
@@ -304,6 +304,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi)
 	int ret;
 
 	rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
+	scsi_req_init(rq);
 	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	rq->rq_flags = RQF_QUIET;
 	ret = blk_execute_rq(drive->queue, cd->disk, rq, 0);
diff --git a/drivers/ide/ide-cd_verbose.c b/drivers/ide/ide-cd_verbose.c
index f079ca2f260b..58a6feb74c02 100644
--- a/drivers/ide/ide-cd_verbose.c
+++ b/drivers/ide/ide-cd_verbose.c
@@ -315,12 +315,12 @@ void ide_cd_log_error(const char *name, struct request *failed_command,
 		while (hi > lo) {
 			mid = (lo + hi) / 2;
 			if (packet_command_texts[mid].packet_command ==
-			    failed_command->cmd[0]) {
+			    scsi_req(failed_command)->cmd[0]) {
 				s = packet_command_texts[mid].text;
 				break;
 			}
 			if (packet_command_texts[mid].packet_command >
-			    failed_command->cmd[0])
+			    scsi_req(failed_command)->cmd[0])
 				hi = mid;
 			else
 				lo = mid + 1;
@@ -329,7 +329,7 @@ void ide_cd_log_error(const char *name, struct request *failed_command,
 		printk(KERN_ERR "  The failed \"%s\" packet command "
 				"was: \n  \"", s);
 		for (i = 0; i < BLK_MAX_CDB; i++)
-			printk(KERN_CONT "%02x ", failed_command->cmd[i]);
+			printk(KERN_CONT "%02x ", scsi_req(failed_command)->cmd[i]);
 		printk(KERN_CONT "\"\n");
 	}
 
diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c
index 0dd43b4fcec6..fd56c9dd9bc7 100644
--- a/drivers/ide/ide-devsets.c
+++ b/drivers/ide/ide-devsets.c
@@ -166,10 +166,11 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting,
 		return setting->set(drive, arg);
 
 	rq = blk_get_request(q, READ, __GFP_RECLAIM);
+	scsi_req_init(rq);
 	rq->cmd_type = REQ_TYPE_DRV_PRIV;
-	rq->cmd_len = 5;
-	rq->cmd[0] = REQ_DEVSET_EXEC;
-	*(int *)&rq->cmd[1] = arg;
+	scsi_req(rq)->cmd_len = 5;
+	scsi_req(rq)->cmd[0] = REQ_DEVSET_EXEC;
+	*(int *)&scsi_req(rq)->cmd[1] = arg;
 	rq->special = setting->set;
 
 	if (blk_execute_rq(q, NULL, rq, 0))
@@ -183,7 +184,7 @@ ide_startstop_t ide_do_devset(ide_drive_t *drive, struct request *rq)
 {
 	int err, (*setfunc)(ide_drive_t *, int) = rq->special;
 
-	err = setfunc(drive, *(int *)&rq->cmd[1]);
+	err = setfunc(drive, *(int *)&scsi_req(rq)->cmd[1]);
 	if (err)
 		rq->errors = err;
 	ide_complete_rq(drive, err, blk_rq_bytes(rq));
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 5ceace542b77..3437c5b28599 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -478,6 +478,7 @@ static int set_multcount(ide_drive_t *drive, int arg)
 		return -EBUSY;
 
 	rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
+	scsi_req_init(rq);
 	rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
 
 	drive->mult_req = arg;
diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c
index d6da011299f5..35e5b892f9d7 100644
--- a/drivers/ide/ide-eh.c
+++ b/drivers/ide/ide-eh.c
@@ -148,7 +148,7 @@ static inline void ide_complete_drive_reset(ide_drive_t *drive, int err)
 	struct request *rq = drive->hwif->rq;
 
 	if (rq && rq->cmd_type == REQ_TYPE_DRV_PRIV &&
-	    rq->cmd[0] == REQ_DRIVE_RESET) {
+	    scsi_req(rq)->cmd[0] == REQ_DRIVE_RESET) {
 		if (err <= 0 && rq->errors == 0)
 			rq->errors = -EIO;
 		ide_complete_rq(drive, err ? err : 0, blk_rq_bytes(rq));
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index f079d8d1d856..3bd678ad3519 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -203,7 +203,7 @@ static void idefloppy_create_rw_cmd(ide_drive_t *drive,
 	put_unaligned(cpu_to_be16(blocks), (unsigned short *)&pc->c[7]);
 	put_unaligned(cpu_to_be32(block), (unsigned int *) &pc->c[2]);
 
-	memcpy(rq->cmd, pc->c, 12);
+	memcpy(scsi_req(rq)->cmd, pc->c, 12);
 
 	pc->rq = rq;
 	if (cmd == WRITE)
@@ -216,7 +216,7 @@ static void idefloppy_blockpc_cmd(struct ide_disk_obj *floppy,
 		struct ide_atapi_pc *pc, struct request *rq)
 {
 	ide_init_pc(pc);
-	memcpy(pc->c, rq->cmd, sizeof(pc->c));
+	memcpy(pc->c, scsi_req(rq)->cmd, sizeof(pc->c));
 	pc->rq = rq;
 	if (blk_rq_bytes(rq)) {
 		pc->flags |= PC_FLAG_DMA_OK;
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 201e43fcbc94..3378503afed4 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -279,7 +279,7 @@ static ide_startstop_t execute_drive_cmd (ide_drive_t *drive,
 
 static ide_startstop_t ide_special_rq(ide_drive_t *drive, struct request *rq)
 {
-	u8 cmd = rq->cmd[0];
+	u8 cmd = scsi_req(rq)->cmd[0];
 
 	switch (cmd) {
 	case REQ_PARK_HEADS:
@@ -545,6 +545,7 @@ repeat:
 			goto plug_device;
 		}
 
+		scsi_req(rq)->resid_len = blk_rq_bytes(rq);
 		hwif->rq = rq;
 
 		spin_unlock_irq(&hwif->lock);
diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c
index d05db2469209..a5d22c6e956a 100644
--- a/drivers/ide/ide-ioctls.c
+++ b/drivers/ide/ide-ioctls.c
@@ -126,6 +126,7 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg)
 		struct request *rq;
 
 		rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
+		scsi_req_init(rq);
 		rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
 		err = blk_execute_rq(drive->queue, NULL, rq, 0);
 		blk_put_request(rq);
@@ -222,9 +223,10 @@ static int generic_drive_reset(ide_drive_t *drive)
 	int ret = 0;
 
 	rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
+	scsi_req_init(rq);
 	rq->cmd_type = REQ_TYPE_DRV_PRIV;
-	rq->cmd_len = 1;
-	rq->cmd[0] = REQ_DRIVE_RESET;
+	scsi_req(rq)->cmd_len = 1;
+	scsi_req(rq)->cmd[0] = REQ_DRIVE_RESET;
 	if (blk_execute_rq(drive->queue, NULL, rq, 1))
 		ret = rq->errors;
 	blk_put_request(rq);
diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
index 2d7dca56dd24..c37604a75097 100644
--- a/drivers/ide/ide-park.c
+++ b/drivers/ide/ide-park.c
@@ -32,8 +32,9 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 	spin_unlock_irq(&hwif->lock);
 
 	rq = blk_get_request(q, READ, __GFP_RECLAIM);
-	rq->cmd[0] = REQ_PARK_HEADS;
-	rq->cmd_len = 1;
+	scsi_req_init(rq);
+	scsi_req(rq)->cmd[0] = REQ_PARK_HEADS;
+	scsi_req(rq)->cmd_len = 1;
 	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	rq->special = &timeout;
 	rc = blk_execute_rq(q, NULL, rq, 1);
@@ -46,11 +47,12 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 	 * timeout has expired, so power management will be reenabled.
 	 */
 	rq = blk_get_request(q, READ, GFP_NOWAIT);
+	scsi_req_init(rq);
 	if (IS_ERR(rq))
 		goto out;
 
-	rq->cmd[0] = REQ_UNPARK_HEADS;
-	rq->cmd_len = 1;
+	scsi_req(rq)->cmd[0] = REQ_UNPARK_HEADS;
+	scsi_req(rq)->cmd_len = 1;
 	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	elv_add_request(q, rq, ELEVATOR_INSERT_FRONT);
 
@@ -64,7 +66,7 @@ ide_startstop_t ide_do_park_unpark(ide_drive_t *drive, struct request *rq)
 	struct ide_taskfile *tf = &cmd.tf;
 
 	memset(&cmd, 0, sizeof(cmd));
-	if (rq->cmd[0] == REQ_PARK_HEADS) {
+	if (scsi_req(rq)->cmd[0] == REQ_PARK_HEADS) {
 		drive->sleep = *(unsigned long *)rq->special;
 		drive->dev_flags |= IDE_DFLAG_SLEEPING;
 		tf->command = ATA_CMD_IDLEIMMEDIATE;
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index a015acdffb39..f6767aba7cfb 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -19,6 +19,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 
 	memset(&rqpm, 0, sizeof(rqpm));
 	rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
+	scsi_req_init(rq);
 	rq->cmd_type = REQ_TYPE_ATA_PM_SUSPEND;
 	rq->special = &rqpm;
 	rqpm.pm_step = IDE_PM_START_SUSPEND;
@@ -89,6 +90,7 @@ int generic_ide_resume(struct device *dev)
 
 	memset(&rqpm, 0, sizeof(rqpm));
 	rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
+	scsi_req_init(rq);
 	rq->cmd_type = REQ_TYPE_ATA_PM_RESUME;
 	rq->rq_flags |= RQF_PREEMPT;
 	rq->special = &rqpm;
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index 330e319419e6..a74ae8df4bb8 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -741,6 +741,14 @@ static void ide_port_tune_devices(ide_hwif_t *hwif)
 	}
 }
 
+static int ide_init_rq(struct request_queue *q, struct request *rq, gfp_t gfp)
+{
+	struct ide_request *req = blk_mq_rq_to_pdu(rq);
+
+	req->sreq.sense = req->sense;
+	return 0;
+}
+
 /*
  * init request queue
  */
@@ -758,11 +766,18 @@ static int ide_init_queue(ide_drive_t *drive)
 	 *	limits and LBA48 we could raise it but as yet
 	 *	do not.
 	 */
-
-	q = blk_init_queue_node(do_ide_request, NULL, hwif_to_node(hwif));
+	q = blk_alloc_queue_node(GFP_KERNEL, hwif_to_node(hwif));
 	if (!q)
 		return 1;
 
+	q->request_fn = do_ide_request;
+	q->init_rq_fn = ide_init_rq;
+	q->cmd_size = sizeof(struct ide_request);
+	if (blk_init_allocated_queue(q) < 0) {
+		blk_cleanup_queue(q);
+		return 1;
+	}
+
 	q->queuedata = drive;
 	blk_queue_segment_boundary(q, 0xffff);
 
@@ -1131,10 +1146,12 @@ static void ide_port_init_devices_data(ide_hwif_t *hwif)
 	ide_port_for_each_dev(i, drive, hwif) {
 		u8 j = (hwif->index * MAX_DRIVES) + i;
 		u16 *saved_id = drive->id;
+		struct request *saved_sense_rq = drive->sense_rq;
 
 		memset(drive, 0, sizeof(*drive));
 		memset(saved_id, 0, SECTOR_SIZE);
 		drive->id = saved_id;
+		drive->sense_rq = saved_sense_rq;
 
 		drive->media			= ide_disk;
 		drive->select			= (i << 4) | ATA_DEVICE_OBS;
@@ -1241,6 +1258,7 @@ static void ide_port_free_devices(ide_hwif_t *hwif)
 	int i;
 
 	ide_port_for_each_dev(i, drive, hwif) {
+		kfree(drive->sense_rq);
 		kfree(drive->id);
 		kfree(drive);
 	}
@@ -1248,11 +1266,10 @@ static void ide_port_free_devices(ide_hwif_t *hwif)
 
 static int ide_port_alloc_devices(ide_hwif_t *hwif, int node)
 {
+	ide_drive_t *drive;
 	int i;
 
 	for (i = 0; i < MAX_DRIVES; i++) {
-		ide_drive_t *drive;
-
 		drive = kzalloc_node(sizeof(*drive), GFP_KERNEL, node);
 		if (drive == NULL)
 			goto out_nomem;
@@ -1267,12 +1284,21 @@ static int ide_port_alloc_devices(ide_hwif_t *hwif, int node)
 		 */
 		drive->id = kzalloc_node(SECTOR_SIZE, GFP_KERNEL, node);
 		if (drive->id == NULL)
-			goto out_nomem;
+			goto out_free_drive;
+
+		drive->sense_rq = kmalloc(sizeof(struct request) +
+				sizeof(struct ide_request), GFP_KERNEL);
+		if (!drive->sense_rq)
+			goto out_free_id;
 
 		hwif->devices[i] = drive;
 	}
 	return 0;
 
+out_free_id:
+	kfree(drive->id);
+out_free_drive:
+	kfree(drive);
 out_nomem:
 	ide_port_free_devices(hwif);
 	return -ENOMEM;
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 9ecf4e35adcd..f6bc1e2bb035 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -282,7 +282,7 @@ static void idetape_analyze_error(ide_drive_t *drive)
 
 	/* correct remaining bytes to transfer */
 	if (pc->flags & PC_FLAG_DMA_ERROR)
-		rq->resid_len = tape->blk_size * get_unaligned_be32(&sense[3]);
+		scsi_req(rq)->resid_len = tape->blk_size * get_unaligned_be32(&sense[3]);
 
 	/*
 	 * If error was the result of a zero-length read or write command,
@@ -316,7 +316,7 @@ static void idetape_analyze_error(ide_drive_t *drive)
 			pc->flags |= PC_FLAG_ABORT;
 		}
 		if (!(pc->flags & PC_FLAG_ABORT) &&
-		    (blk_rq_bytes(rq) - rq->resid_len))
+		    (blk_rq_bytes(rq) - scsi_req(rq)->resid_len))
 			pc->retries = IDETAPE_MAX_PC_RETRIES + 1;
 	}
 }
@@ -348,7 +348,7 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc)
 					"itself - Aborting request!\n");
 	} else if (pc->c[0] == READ_6 || pc->c[0] == WRITE_6) {
 		unsigned int blocks =
-			(blk_rq_bytes(rq) - rq->resid_len) / tape->blk_size;
+			(blk_rq_bytes(rq) - scsi_req(rq)->resid_len) / tape->blk_size;
 
 		tape->avg_size += blocks * tape->blk_size;
 
@@ -560,7 +560,7 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape,
 		pc->flags |= PC_FLAG_WRITING;
 	}
 
-	memcpy(rq->cmd, pc->c, 12);
+	memcpy(scsi_req(rq)->cmd, pc->c, 12);
 }
 
 static ide_startstop_t idetape_do_request(ide_drive_t *drive,
@@ -570,10 +570,11 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 	idetape_tape_t *tape = drive->driver_data;
 	struct ide_atapi_pc *pc = NULL;
 	struct ide_cmd cmd;
+	struct scsi_request *req = scsi_req(rq);
 	u8 stat;
 
 	ide_debug_log(IDE_DBG_RQ, "cmd: 0x%x, sector: %llu, nr_sectors: %u",
-		      rq->cmd[0], (unsigned long long)blk_rq_pos(rq),
+		      req->cmd[0], (unsigned long long)blk_rq_pos(rq),
 		      blk_rq_sectors(rq));
 
 	BUG_ON(!(rq->cmd_type == REQ_TYPE_DRV_PRIV ||
@@ -592,7 +593,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 	stat = hwif->tp_ops->read_status(hwif);
 
 	if ((drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) == 0 &&
-	    (rq->cmd[13] & REQ_IDETAPE_PC2) == 0)
+	    (req->cmd[13] & REQ_IDETAPE_PC2) == 0)
 		drive->atapi_flags |= IDE_AFLAG_IGNORE_DSC;
 
 	if (drive->dev_flags & IDE_DFLAG_POST_RESET) {
@@ -609,7 +610,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 		} else if (time_after(jiffies, tape->dsc_timeout)) {
 			printk(KERN_ERR "ide-tape: %s: DSC timeout\n",
 				tape->name);
-			if (rq->cmd[13] & REQ_IDETAPE_PC2) {
+			if (req->cmd[13] & REQ_IDETAPE_PC2) {
 				idetape_media_access_finished(drive);
 				return ide_stopped;
 			} else {
@@ -626,23 +627,23 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 		tape->postponed_rq = false;
 	}
 
-	if (rq->cmd[13] & REQ_IDETAPE_READ) {
+	if (req->cmd[13] & REQ_IDETAPE_READ) {
 		pc = &tape->queued_pc;
 		ide_tape_create_rw_cmd(tape, pc, rq, READ_6);
 		goto out;
 	}
-	if (rq->cmd[13] & REQ_IDETAPE_WRITE) {
+	if (req->cmd[13] & REQ_IDETAPE_WRITE) {
 		pc = &tape->queued_pc;
 		ide_tape_create_rw_cmd(tape, pc, rq, WRITE_6);
 		goto out;
 	}
-	if (rq->cmd[13] & REQ_IDETAPE_PC1) {
+	if (req->cmd[13] & REQ_IDETAPE_PC1) {
 		pc = (struct ide_atapi_pc *)rq->special;
-		rq->cmd[13] &= ~(REQ_IDETAPE_PC1);
-		rq->cmd[13] |= REQ_IDETAPE_PC2;
+		req->cmd[13] &= ~(REQ_IDETAPE_PC1);
+		req->cmd[13] |= REQ_IDETAPE_PC2;
 		goto out;
 	}
-	if (rq->cmd[13] & REQ_IDETAPE_PC2) {
+	if (req->cmd[13] & REQ_IDETAPE_PC2) {
 		idetape_media_access_finished(drive);
 		return ide_stopped;
 	}
@@ -853,8 +854,9 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
 	BUG_ON(size < 0 || size % tape->blk_size);
 
 	rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
+	scsi_req_init(rq);
 	rq->cmd_type = REQ_TYPE_DRV_PRIV;
-	rq->cmd[13] = cmd;
+	scsi_req(rq)->cmd[13] = cmd;
 	rq->rq_disk = tape->disk;
 	rq->__sector = tape->first_frame;
 
@@ -868,7 +870,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
 	blk_execute_rq(drive->queue, tape->disk, rq, 0);
 
 	/* calculate the number of transferred bytes and update buffer state */
-	size -= rq->resid_len;
+	size -= scsi_req(rq)->resid_len;
 	tape->cur = tape->buf;
 	if (cmd == REQ_IDETAPE_READ)
 		tape->valid = size;
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index a716693417a3..a393e13b8007 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -431,6 +431,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
 	int rw = !(cmd->tf_flags & IDE_TFLAG_WRITE) ? READ : WRITE;
 
 	rq = blk_get_request(drive->queue, rw, __GFP_RECLAIM);
+	scsi_req_init(rq);
 	rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
 
 	/*
diff --git a/drivers/ide/sis5513.c b/drivers/ide/sis5513.c
index 247853ea1368..c3062b53056f 100644
--- a/drivers/ide/sis5513.c
+++ b/drivers/ide/sis5513.c
@@ -54,7 +54,7 @@
 #define DRV_NAME "sis5513"
 
 /* registers layout and init values are chipset family dependent */
-
+#undef ATA_16
 #define ATA_16		0x01
 #define ATA_33		0x02
 #define ATA_66		0x03
diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c
index 7ee1667acde4..b8c4b2ba7519 100644
--- a/drivers/message/fusion/mptsas.c
+++ b/drivers/message/fusion/mptsas.c
@@ -2320,10 +2320,10 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 		SmpPassthroughReply_t *smprep;
 
 		smprep = (SmpPassthroughReply_t *)ioc->sas_mgmt.reply;
-		memcpy(req->sense, smprep, sizeof(*smprep));
-		req->sense_len = sizeof(*smprep);
-		req->resid_len = 0;
-		rsp->resid_len -= smprep->ResponseDataLength;
+		memcpy(scsi_req(req)->sense, smprep, sizeof(*smprep));
+		scsi_req(req)->sense_len = sizeof(*smprep);
+		scsi_req(req)->resid_len = 0;
+		scsi_req(rsp)->resid_len -= smprep->ResponseDataLength;
 	} else {
 		printk(MYIOC_s_ERR_FMT
 		    "%s: smp passthru reply failed to be returned\n",
diff --git a/drivers/scsi/libfc/fc_lport.c b/drivers/scsi/libfc/fc_lport.c
index 919736a74ffa..aa76f36abe03 100644
--- a/drivers/scsi/libfc/fc_lport.c
+++ b/drivers/scsi/libfc/fc_lport.c
@@ -2095,7 +2095,7 @@ int fc_lport_bsg_request(struct bsg_job *job)
 
 	bsg_reply->reply_payload_rcv_len = 0;
 	if (rsp)
-		rsp->resid_len = job->reply_payload.payload_len;
+		scsi_req(rsp)->resid_len = job->reply_payload.payload_len;
 
 	mutex_lock(&lport->lp_mutex);
 
diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c
index 022bb6e10d98..570b2cb2da43 100644
--- a/drivers/scsi/libsas/sas_expander.c
+++ b/drivers/scsi/libsas/sas_expander.c
@@ -2174,12 +2174,12 @@ int sas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 			       bio_data(rsp->bio), blk_rq_bytes(rsp));
 	if (ret > 0) {
 		/* positive number is the untransferred residual */
-		rsp->resid_len = ret;
-		req->resid_len = 0;
+		scsi_req(rsp)->resid_len = ret;
+		scsi_req(req)->resid_len = 0;
 		ret = 0;
 	} else if (ret == 0) {
-		rsp->resid_len = 0;
-		req->resid_len = 0;
+		scsi_req(rsp)->resid_len = 0;
+		scsi_req(req)->resid_len = 0;
 	}
 
 	return ret;
diff --git a/drivers/scsi/libsas/sas_host_smp.c b/drivers/scsi/libsas/sas_host_smp.c
index d24792575169..45cbbc44f4d7 100644
--- a/drivers/scsi/libsas/sas_host_smp.c
+++ b/drivers/scsi/libsas/sas_host_smp.c
@@ -274,15 +274,15 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req,
 
 	switch (req_data[1]) {
 	case SMP_REPORT_GENERAL:
-		req->resid_len -= 8;
-		rsp->resid_len -= 32;
+		scsi_req(req)->resid_len -= 8;
+		scsi_req(rsp)->resid_len -= 32;
 		resp_data[2] = SMP_RESP_FUNC_ACC;
 		resp_data[9] = sas_ha->num_phys;
 		break;
 
 	case SMP_REPORT_MANUF_INFO:
-		req->resid_len -= 8;
-		rsp->resid_len -= 64;
+		scsi_req(req)->resid_len -= 8;
+		scsi_req(rsp)->resid_len -= 64;
 		resp_data[2] = SMP_RESP_FUNC_ACC;
 		memcpy(resp_data + 12, shost->hostt->name,
 		       SAS_EXPANDER_VENDOR_ID_LEN);
@@ -295,13 +295,13 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req,
 		break;
 
 	case SMP_DISCOVER:
-		req->resid_len -= 16;
-		if ((int)req->resid_len < 0) {
-			req->resid_len = 0;
+		scsi_req(req)->resid_len -= 16;
+		if ((int)scsi_req(req)->resid_len < 0) {
+			scsi_req(req)->resid_len = 0;
 			error = -EINVAL;
 			goto out;
 		}
-		rsp->resid_len -= 56;
+		scsi_req(rsp)->resid_len -= 56;
 		sas_host_smp_discover(sas_ha, resp_data, req_data[9]);
 		break;
 
@@ -311,13 +311,13 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req,
 		break;
 
 	case SMP_REPORT_PHY_SATA:
-		req->resid_len -= 16;
-		if ((int)req->resid_len < 0) {
-			req->resid_len = 0;
+		scsi_req(req)->resid_len -= 16;
+		if ((int)scsi_req(req)->resid_len < 0) {
+			scsi_req(req)->resid_len = 0;
 			error = -EINVAL;
 			goto out;
 		}
-		rsp->resid_len -= 60;
+		scsi_req(rsp)->resid_len -= 60;
 		sas_report_phy_sata(sas_ha, resp_data, req_data[9]);
 		break;
 
@@ -331,15 +331,15 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req,
 		int to_write = req_data[4];
 
 		if (blk_rq_bytes(req) < base_frame_size + to_write * 4 ||
-		    req->resid_len < base_frame_size + to_write * 4) {
+		    scsi_req(req)->resid_len < base_frame_size + to_write * 4) {
 			resp_data[2] = SMP_RESP_INV_FRM_LEN;
 			break;
 		}
 
 		to_write = sas_host_smp_write_gpio(sas_ha, resp_data, req_data[2],
 						   req_data[3], to_write, &req_data[8]);
-		req->resid_len -= base_frame_size + to_write * 4;
-		rsp->resid_len -= 8;
+		scsi_req(req)->resid_len -= base_frame_size + to_write * 4;
+		scsi_req(rsp)->resid_len -= 8;
 		break;
 	}
 
@@ -348,13 +348,13 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req,
 		break;
 
 	case SMP_PHY_CONTROL:
-		req->resid_len -= 44;
-		if ((int)req->resid_len < 0) {
-			req->resid_len = 0;
+		scsi_req(req)->resid_len -= 44;
+		if ((int)scsi_req(req)->resid_len < 0) {
+			scsi_req(req)->resid_len = 0;
 			error = -EINVAL;
 			goto out;
 		}
-		rsp->resid_len -= 8;
+		scsi_req(rsp)->resid_len -= 8;
 		sas_phy_control(sas_ha, req_data[9], req_data[10],
 				req_data[32] >> 4, req_data[33] >> 4,
 				resp_data);
diff --git a/drivers/scsi/mpt3sas/mpt3sas_transport.c b/drivers/scsi/mpt3sas/mpt3sas_transport.c
index 7f1d5785bc30..e7a7a704a315 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_transport.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_transport.c
@@ -2057,10 +2057,10 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 		    ioc->name, __func__,
 		    le16_to_cpu(mpi_reply->ResponseDataLength)));
 
-		memcpy(req->sense, mpi_reply, sizeof(*mpi_reply));
-		req->sense_len = sizeof(*mpi_reply);
-		req->resid_len = 0;
-		rsp->resid_len -=
+		memcpy(scsi_req(req)->sense, mpi_reply, sizeof(*mpi_reply));
+		scsi_req(req)->sense_len = sizeof(*mpi_reply);
+		scsi_req(req)->resid_len = 0;
+		scsi_req(rsp)->resid_len -=
 		    le16_to_cpu(mpi_reply->ResponseDataLength);
 
 		/* check if the resp needs to be copied from the allocated
diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
index ef99f62831fb..fcb040eebf47 100644
--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -48,6 +48,7 @@
 #include <scsi/osd_sense.h>
 
 #include <scsi/scsi_device.h>
+#include <scsi/scsi_request.h>
 
 #include "osd_debug.h"
 
@@ -477,11 +478,13 @@ static void _set_error_resid(struct osd_request *or, struct request *req,
 {
 	or->async_error = error;
 	or->req_errors = req->errors ? : error;
-	or->sense_len = req->sense_len;
+	or->sense_len = scsi_req(req)->sense_len;
+	if (or->sense_len)
+		memcpy(or->sense, scsi_req(req)->sense, or->sense_len);
 	if (or->out.req)
-		or->out.residual = or->out.req->resid_len;
+		or->out.residual = scsi_req(or->out.req)->resid_len;
 	if (or->in.req)
-		or->in.residual = or->in.req->resid_len;
+		or->in.residual = scsi_req(or->in.req)->resid_len;
 }
 
 int osd_execute_request(struct osd_request *or)
@@ -1565,7 +1568,7 @@ static struct request *_make_request(struct request_queue *q, bool has_write,
 	req = blk_get_request(q, has_write ? WRITE : READ, flags);
 	if (IS_ERR(req))
 		return req;
-	blk_rq_set_block_pc(req);
+	scsi_req_init(req);
 
 	for_each_bio(bio) {
 		struct bio *bounce_bio = bio;
@@ -1599,8 +1602,6 @@ static int _init_blk_request(struct osd_request *or,
 
 	req->timeout = or->timeout;
 	req->retries = or->retries;
-	req->sense = or->sense;
-	req->sense_len = 0;
 
 	if (has_out) {
 		or->out.req = req;
@@ -1612,7 +1613,7 @@ static int _init_blk_request(struct osd_request *or,
 				ret = PTR_ERR(req);
 				goto out;
 			}
-			blk_rq_set_block_pc(req);
+			scsi_req_init(req);
 			or->in.req = or->request->next_rq = req;
 		}
 	} else if (has_in)
@@ -1699,8 +1700,8 @@ int osd_finalize_request(struct osd_request *or,
 
 	osd_sec_sign_cdb(&or->cdb, cap_key);
 
-	or->request->cmd = or->cdb.buff;
-	or->request->cmd_len = _osd_req_cdb_len(or);
+	scsi_req(or->request)->cmd = or->cdb.buff;
+	scsi_req(or->request)->cmd_len = _osd_req_cdb_len(or);
 
 	return 0;
 }
diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c
index e8196c55b633..d314aa5c71cf 100644
--- a/drivers/scsi/osst.c
+++ b/drivers/scsi/osst.c
@@ -322,6 +322,7 @@ static int osst_chk_result(struct osst_tape * STp, struct osst_request * SRpnt)
 /* Wakeup from interrupt */
 static void osst_end_async(struct request *req, int update)
 {
+	struct scsi_request *rq = scsi_req(req);
 	struct osst_request *SRpnt = req->end_io_data;
 	struct osst_tape *STp = SRpnt->stp;
 	struct rq_map_data *mdata = &SRpnt->stp->buffer->map_data;
@@ -330,6 +331,8 @@ static void osst_end_async(struct request *req, int update)
 #if DEBUG
 	STp->write_pending = 0;
 #endif
+	if (rq->sense_len)
+		memcpy(SRpnt->sense, rq->sense, SCSI_SENSE_BUFFERSIZE);
 	if (SRpnt->waiting)
 		complete(SRpnt->waiting);
 
@@ -357,6 +360,7 @@ static int osst_execute(struct osst_request *SRpnt, const unsigned char *cmd,
 			int use_sg, int timeout, int retries)
 {
 	struct request *req;
+	struct scsi_request *rq;
 	struct page **pages = NULL;
 	struct rq_map_data *mdata = &SRpnt->stp->buffer->map_data;
 
@@ -367,7 +371,8 @@ static int osst_execute(struct osst_request *SRpnt, const unsigned char *cmd,
 	if (IS_ERR(req))
 		return DRIVER_ERROR << 24;
 
-	blk_rq_set_block_pc(req);
+	rq = scsi_req(req);
+	scsi_req_init(req);
 	req->rq_flags |= RQF_QUIET;
 
 	SRpnt->bio = NULL;
@@ -404,11 +409,9 @@ static int osst_execute(struct osst_request *SRpnt, const unsigned char *cmd,
 			goto free_req;
 	}
 
-	req->cmd_len = cmd_len;
-	memset(req->cmd, 0, BLK_MAX_CDB); /* ATAPI hates garbage after CDB */
-	memcpy(req->cmd, cmd, req->cmd_len);
-	req->sense = SRpnt->sense;
-	req->sense_len = 0;
+	rq->cmd_len = cmd_len;
+	memset(rq->cmd, 0, BLK_MAX_CDB); /* ATAPI hates garbage after CDB */
+	memcpy(rq->cmd, cmd, rq->cmd_len);
 	req->timeout = timeout;
 	req->retries = retries;
 	req->end_io_data = SRpnt;
diff --git a/drivers/scsi/qla2xxx/qla_bsg.c b/drivers/scsi/qla2xxx/qla_bsg.c
index 1bf8061ff803..40ca75bbcb9d 100644
--- a/drivers/scsi/qla2xxx/qla_bsg.c
+++ b/drivers/scsi/qla2xxx/qla_bsg.c
@@ -921,7 +921,7 @@ qla2x00_process_loopback(struct bsg_job *bsg_job)
 
 	bsg_job->reply_len = sizeof(struct fc_bsg_reply) +
 	    sizeof(response) + sizeof(uint8_t);
-	fw_sts_ptr = ((uint8_t *)bsg_job->req->sense) +
+	fw_sts_ptr = ((uint8_t *)scsi_req(bsg_job->req)->sense) +
 	    sizeof(struct fc_bsg_reply);
 	memcpy(fw_sts_ptr, response, sizeof(response));
 	fw_sts_ptr += sizeof(response);
diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
index dc88a09f9043..67e157d81da8 100644
--- a/drivers/scsi/qla2xxx/qla_isr.c
+++ b/drivers/scsi/qla2xxx/qla_isr.c
@@ -1468,7 +1468,8 @@ qla24xx_els_ct_entry(scsi_qla_host_t *vha, struct req_que *req,
 			    type, sp->handle, comp_status, fw_status[1], fw_status[2],
 			    le16_to_cpu(((struct els_sts_entry_24xx *)
 				pkt)->total_byte_count));
-			fw_sts_ptr = ((uint8_t*)bsg_job->req->sense) + sizeof(struct fc_bsg_reply);
+			fw_sts_ptr = ((uint8_t*)scsi_req(bsg_job->req)->sense) +
+				sizeof(struct fc_bsg_reply);
 			memcpy( fw_sts_ptr, fw_status, sizeof(fw_status));
 		}
 		else {
@@ -1482,7 +1483,8 @@ qla24xx_els_ct_entry(scsi_qla_host_t *vha, struct req_que *req,
 				    pkt)->error_subcode_2));
 			res = DID_ERROR << 16;
 			bsg_reply->reply_payload_rcv_len = 0;
-			fw_sts_ptr = ((uint8_t*)bsg_job->req->sense) + sizeof(struct fc_bsg_reply);
+			fw_sts_ptr = ((uint8_t*)scsi_req(bsg_job->req)->sense) +
+					sizeof(struct fc_bsg_reply);
 			memcpy( fw_sts_ptr, fw_status, sizeof(fw_status));
 		}
 		ql_dump_buffer(ql_dbg_user + ql_dbg_buffer, vha, 0x5056,
diff --git a/drivers/scsi/qla2xxx/qla_mr.c b/drivers/scsi/qla2xxx/qla_mr.c
index 02f1de18bc2b..96c33e292eba 100644
--- a/drivers/scsi/qla2xxx/qla_mr.c
+++ b/drivers/scsi/qla2xxx/qla_mr.c
@@ -2244,7 +2244,7 @@ qlafx00_ioctl_iosb_entry(scsi_qla_host_t *vha, struct req_que *req,
 		memcpy(fstatus.reserved_3,
 		    pkt->reserved_2, 20 * sizeof(uint8_t));
 
-		fw_sts_ptr = ((uint8_t *)bsg_job->req->sense) +
+		fw_sts_ptr = ((uint8_t *)scsi_req(bsg_job->req)->sense) +
 		    sizeof(struct fc_bsg_reply);
 
 		memcpy(fw_sts_ptr, (uint8_t *)&fstatus,
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 7c084600827b..4b40f746d534 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -1968,6 +1968,7 @@ static void eh_lock_door_done(struct request *req, int uptodate)
 static void scsi_eh_lock_door(struct scsi_device *sdev)
 {
 	struct request *req;
+	struct scsi_request *rq;
 
 	/*
 	 * blk_get_request with GFP_KERNEL (__GFP_RECLAIM) sleeps until a
@@ -1976,17 +1977,16 @@ static void scsi_eh_lock_door(struct scsi_device *sdev)
 	req = blk_get_request(sdev->request_queue, READ, GFP_KERNEL);
 	if (IS_ERR(req))
 		return;
+	rq = scsi_req(req);
+	scsi_req_init(req);
 
-	blk_rq_set_block_pc(req);
-
-	req->cmd[0] = ALLOW_MEDIUM_REMOVAL;
-	req->cmd[1] = 0;
-	req->cmd[2] = 0;
-	req->cmd[3] = 0;
-	req->cmd[4] = SCSI_REMOVAL_PREVENT;
-	req->cmd[5] = 0;
-
-	req->cmd_len = COMMAND_SIZE(req->cmd[0]);
+	rq->cmd[0] = ALLOW_MEDIUM_REMOVAL;
+	rq->cmd[1] = 0;
+	rq->cmd[2] = 0;
+	rq->cmd[3] = 0;
+	rq->cmd[4] = SCSI_REMOVAL_PREVENT;
+	rq->cmd[5] = 0;
+	rq->cmd_len = COMMAND_SIZE(rq->cmd[0]);
 
 	req->rq_flags |= RQF_QUIET;
 	req->timeout = 10 * HZ;
@@ -2355,7 +2355,7 @@ scsi_ioctl_reset(struct scsi_device *dev, int __user *arg)
 	scmd = (struct scsi_cmnd *)(rq + 1);
 	scsi_init_command(dev, scmd);
 	scmd->request = rq;
-	scmd->cmnd = rq->cmd;
+	scmd->cmnd = scsi_req(rq)->cmd;
 
 	scmd->scsi_done		= scsi_reset_provider_done_command;
 	memset(&scmd->sdb, 0, sizeof(scmd->sdb));
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 81ff5ad23b83..8188e5c71f75 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -220,21 +220,21 @@ static int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 {
 	struct request *req;
 	int write = (data_direction == DMA_TO_DEVICE);
+	struct scsi_request *rq;
 	int ret = DRIVER_ERROR << 24;
 
 	req = blk_get_request(sdev->request_queue, write, __GFP_RECLAIM);
 	if (IS_ERR(req))
 		return ret;
-	blk_rq_set_block_pc(req);
+	rq = scsi_req(req);
+	scsi_req_init(req);
 
 	if (bufflen &&	blk_rq_map_kern(sdev->request_queue, req,
 					buffer, bufflen, __GFP_RECLAIM))
 		goto out;
 
-	req->cmd_len = COMMAND_SIZE(cmd[0]);
-	memcpy(req->cmd, cmd, req->cmd_len);
-	req->sense = sense;
-	req->sense_len = 0;
+	rq->cmd_len = COMMAND_SIZE(cmd[0]);
+	memcpy(rq->cmd, cmd, rq->cmd_len);
 	req->retries = retries;
 	req->timeout = timeout;
 	req->cmd_flags |= flags;
@@ -251,11 +251,13 @@ static int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 	 * is invalid.  Prevent the garbage from being misinterpreted
 	 * and prevent security leaks by zeroing out the excess data.
 	 */
-	if (unlikely(req->resid_len > 0 && req->resid_len <= bufflen))
-		memset(buffer + (bufflen - req->resid_len), 0, req->resid_len);
+	if (unlikely(rq->resid_len > 0 && rq->resid_len <= bufflen))
+		memset(buffer + (bufflen - rq->resid_len), 0, rq->resid_len);
 
 	if (resid)
-		*resid = req->resid_len;
+		*resid = rq->resid_len;
+	if (sense && rq->sense_len)
+		memcpy(sense, rq->sense, SCSI_SENSE_BUFFERSIZE);
 	ret = req->errors;
  out:
 	blk_put_request(req);
@@ -806,16 +808,13 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 
 	if (req->cmd_type == REQ_TYPE_BLOCK_PC) { /* SG_IO ioctl from block level */
 		if (result) {
-			if (sense_valid && req->sense) {
+			if (sense_valid) {
 				/*
 				 * SG_IO wants current and deferred errors
 				 */
-				int len = 8 + cmd->sense_buffer[7];
-
-				if (len > SCSI_SENSE_BUFFERSIZE)
-					len = SCSI_SENSE_BUFFERSIZE;
-				memcpy(req->sense, cmd->sense_buffer,  len);
-				req->sense_len = len;
+				scsi_req(req)->sense_len =
+					min(8 + cmd->sense_buffer[7],
+					    SCSI_SENSE_BUFFERSIZE);
 			}
 			if (!sense_deferred)
 				error = __scsi_error_from_host_byte(cmd, result);
@@ -825,14 +824,14 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 		 */
 		req->errors = cmd->result;
 
-		req->resid_len = scsi_get_resid(cmd);
+		scsi_req(req)->resid_len = scsi_get_resid(cmd);
 
 		if (scsi_bidi_cmnd(cmd)) {
 			/*
 			 * Bidi commands Must be complete as a whole,
 			 * both sides at once.
 			 */
-			req->next_rq->resid_len = scsi_in(cmd)->resid;
+			scsi_req(req->next_rq)->resid_len = scsi_in(cmd)->resid;
 			if (scsi_end_request(req, 0, blk_rq_bytes(req),
 					blk_rq_bytes(req->next_rq)))
 				BUG();
@@ -1165,7 +1164,10 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
 	void *prot = cmd->prot_sdb;
 	unsigned long flags;
 
-	memset(cmd, 0, sizeof(*cmd));
+	/* zero out the cmd, except for the embedded scsi_request */
+	memset((char *)cmd + sizeof(cmd->req), 0,
+		sizeof(*cmd) - sizeof(cmd->req));
+
 	cmd->device = dev;
 	cmd->sense_buffer = buf;
 	cmd->prot_sdb = prot;
@@ -1197,7 +1199,8 @@ static int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req)
 		memset(&cmd->sdb, 0, sizeof(cmd->sdb));
 	}
 
-	cmd->cmd_len = req->cmd_len;
+	cmd->cmd_len = scsi_req(req)->cmd_len;
+	cmd->cmnd = scsi_req(req)->cmd;
 	cmd->transfersize = blk_rq_bytes(req);
 	cmd->allowed = req->retries;
 	return BLKPREP_OK;
@@ -1217,6 +1220,7 @@ static int scsi_setup_fs_cmnd(struct scsi_device *sdev, struct request *req)
 			return ret;
 	}
 
+	cmd->cmnd = scsi_req(req)->cmd = scsi_req(req)->__cmd;
 	memset(cmd->cmnd, 0, BLK_MAX_CDB);
 	return scsi_cmd_to_driver(cmd)->init_command(cmd);
 }
@@ -1355,7 +1359,6 @@ static int scsi_prep_fn(struct request_queue *q, struct request *req)
 
 	cmd->tag = req->tag;
 	cmd->request = req;
-	cmd->cmnd = req->cmd;
 	cmd->prot_op = SCSI_PROT_NORMAL;
 
 	ret = scsi_setup_cmnd(sdev, req);
@@ -1874,7 +1877,9 @@ static int scsi_mq_prep_fn(struct request *req)
 	unsigned char *sense_buf = cmd->sense_buffer;
 	struct scatterlist *sg;
 
-	memset(cmd, 0, sizeof(struct scsi_cmnd));
+	/* zero out the cmd, except for the embedded scsi_request */
+	memset((char *)cmd + sizeof(cmd->req), 0,
+		sizeof(*cmd) - sizeof(cmd->req));
 
 	req->special = cmd;
 
@@ -1884,7 +1889,6 @@ static int scsi_mq_prep_fn(struct request *req)
 
 	cmd->tag = req->tag;
 
-	cmd->cmnd = req->cmd;
 	cmd->prot_op = SCSI_PROT_NORMAL;
 
 	INIT_LIST_HEAD(&cmd->list);
@@ -1959,7 +1963,6 @@ static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (!scsi_host_queue_ready(q, shost, sdev))
 		goto out_dec_target_busy;
 
-
 	if (!(req->rq_flags & RQF_DONTPREP)) {
 		ret = prep_to_mq(scsi_mq_prep_fn(req));
 		if (ret != BLK_MQ_RQ_QUEUE_OK)
@@ -2036,6 +2039,7 @@ static int scsi_init_request(void *data, struct request *rq,
 		scsi_alloc_sense_buffer(shost, GFP_KERNEL, numa_node);
 	if (!cmd->sense_buffer)
 		return -ENOMEM;
+	cmd->req.sense = cmd->sense_buffer;
 	return 0;
 }
 
@@ -2125,6 +2129,7 @@ static int scsi_init_rq(struct request_queue *q, struct request *rq, gfp_t gfp)
 	cmd->sense_buffer = scsi_alloc_sense_buffer(shost, gfp, NUMA_NO_NODE);
 	if (!cmd->sense_buffer)
 		goto fail;
+	cmd->req.sense = cmd->sense_buffer;
 
 	if (scsi_host_get_prot(shost) >= SHOST_DIX_TYPE0_PROTECTION) {
 		cmd->prot_sdb = kmem_cache_zalloc(scsi_sdb_cache, gfp);
diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c
index 60b651bfaa01..126a5ee00987 100644
--- a/drivers/scsi/scsi_transport_sas.c
+++ b/drivers/scsi/scsi_transport_sas.c
@@ -33,6 +33,7 @@
 #include <linux/bsg.h>
 
 #include <scsi/scsi.h>
+#include <scsi/scsi_request.h>
 #include <scsi/scsi_device.h>
 #include <scsi/scsi_host.h>
 #include <scsi/scsi_transport.h>
@@ -177,6 +178,10 @@ static void sas_smp_request(struct request_queue *q, struct Scsi_Host *shost,
 	while ((req = blk_fetch_request(q)) != NULL) {
 		spin_unlock_irq(q->queue_lock);
 
+		scsi_req(req)->resid_len = blk_rq_bytes(req);
+		if (req->next_rq)
+			scsi_req(req->next_rq)->resid_len =
+				blk_rq_bytes(req->next_rq);
 		handler = to_sas_internal(shost->transportt)->f->smp_handler;
 		ret = handler(shost, rphy, req);
 		req->errors = ret;
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 0b09638fa39b..c779986edb43 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -781,7 +781,7 @@ static int sd_setup_discard_cmnd(struct scsi_cmnd *cmd)
 	rq->special_vec.bv_len = len;
 
 	rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
-	rq->resid_len = len;
+	scsi_req(rq)->resid_len = len;
 
 	ret = scsi_init_io(cmd);
 out:
@@ -1164,7 +1164,7 @@ static void sd_uninit_command(struct scsi_cmnd *SCpnt)
 	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
 		__free_page(rq->special_vec.bv_page);
 
-	if (SCpnt->cmnd != rq->cmd) {
+	if (SCpnt->cmnd != scsi_req(rq)->cmd) {
 		mempool_free(SCpnt->cmnd, sd_cdb_pool);
 		SCpnt->cmnd = NULL;
 		SCpnt->cmd_len = 0;
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index dbe5b4b95df0..226a8def7a8b 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -781,9 +781,7 @@ sg_common_write(Sg_fd * sfp, Sg_request * srp,
 	}
 	if (atomic_read(&sdp->detaching)) {
 		if (srp->bio) {
-			if (srp->rq->cmd != srp->rq->__cmd)
-				kfree(srp->rq->cmd);
-
+			scsi_req_free_cmd(scsi_req(srp->rq));
 			blk_end_request_all(srp->rq, -EIO);
 			srp->rq = NULL;
 		}
@@ -1279,6 +1277,7 @@ static void
 sg_rq_end_io(struct request *rq, int uptodate)
 {
 	struct sg_request *srp = rq->end_io_data;
+	struct scsi_request *req = scsi_req(rq);
 	Sg_device *sdp;
 	Sg_fd *sfp;
 	unsigned long iflags;
@@ -1297,9 +1296,9 @@ sg_rq_end_io(struct request *rq, int uptodate)
 	if (unlikely(atomic_read(&sdp->detaching)))
 		pr_info("%s: device detaching\n", __func__);
 
-	sense = rq->sense;
+	sense = req->sense;
 	result = rq->errors;
-	resid = rq->resid_len;
+	resid = req->resid_len;
 
 	SCSI_LOG_TIMEOUT(4, sg_printk(KERN_INFO, sdp,
 				      "sg_cmd_done: pack_id=%d, res=0x%x\n",
@@ -1333,6 +1332,10 @@ sg_rq_end_io(struct request *rq, int uptodate)
 			sdp->device->changed = 1;
 		}
 	}
+
+	if (req->sense_len)
+		memcpy(srp->sense_b, req->sense, SCSI_SENSE_BUFFERSIZE);
+
 	/* Rely on write phase to clean out srp status values, so no "else" */
 
 	/*
@@ -1342,8 +1345,7 @@ sg_rq_end_io(struct request *rq, int uptodate)
 	 * blk_rq_unmap_user() can be called from user context.
 	 */
 	srp->rq = NULL;
-	if (rq->cmd != rq->__cmd)
-		kfree(rq->cmd);
+	scsi_req_free_cmd(scsi_req(rq));
 	__blk_put_request(rq->q, rq);
 
 	write_lock_irqsave(&sfp->rq_list_lock, iflags);
@@ -1658,6 +1660,7 @@ sg_start_req(Sg_request *srp, unsigned char *cmd)
 {
 	int res;
 	struct request *rq;
+	struct scsi_request *req;
 	Sg_fd *sfp = srp->parentfp;
 	sg_io_hdr_t *hp = &srp->header;
 	int dxfer_len = (int) hp->dxfer_len;
@@ -1700,17 +1703,17 @@ sg_start_req(Sg_request *srp, unsigned char *cmd)
 		kfree(long_cmdp);
 		return PTR_ERR(rq);
 	}
+	req = scsi_req(rq);
 
-	blk_rq_set_block_pc(rq);
+	scsi_req_init(rq);
 
 	if (hp->cmd_len > BLK_MAX_CDB)
-		rq->cmd = long_cmdp;
-	memcpy(rq->cmd, cmd, hp->cmd_len);
-	rq->cmd_len = hp->cmd_len;
+		req->cmd = long_cmdp;
+	memcpy(req->cmd, cmd, hp->cmd_len);
+	req->cmd_len = hp->cmd_len;
 
 	srp->rq = rq;
 	rq->end_io_data = srp;
-	rq->sense = srp->sense_b;
 	rq->retries = SG_DEFAULT_RETRIES;
 
 	if ((dxfer_len <= 0) || (dxfer_dir == SG_DXFER_NONE))
@@ -1786,8 +1789,7 @@ sg_finish_rem_req(Sg_request *srp)
 		ret = blk_rq_unmap_user(srp->bio);
 
 	if (srp->rq) {
-		if (srp->rq->cmd != srp->rq->__cmd)
-			kfree(srp->rq->cmd);
+		scsi_req_free_cmd(scsi_req(srp->rq));
 		blk_put_request(srp->rq);
 	}
 
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 5f35b863e1a7..4af900100a24 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -475,7 +475,7 @@ static void st_do_stats(struct scsi_tape *STp, struct request *req)
 	ktime_t now;
 
 	now = ktime_get();
-	if (req->cmd[0] == WRITE_6) {
+	if (scsi_req(req)->cmd[0] == WRITE_6) {
 		now = ktime_sub(now, STp->stats->write_time);
 		atomic64_add(ktime_to_ns(now), &STp->stats->tot_write_time);
 		atomic64_add(ktime_to_ns(now), &STp->stats->tot_io_time);
@@ -489,7 +489,7 @@ static void st_do_stats(struct scsi_tape *STp, struct request *req)
 		} else
 			atomic64_add(atomic_read(&STp->stats->last_write_size),
 				&STp->stats->write_byte_cnt);
-	} else if (req->cmd[0] == READ_6) {
+	} else if (scsi_req(req)->cmd[0] == READ_6) {
 		now = ktime_sub(now, STp->stats->read_time);
 		atomic64_add(ktime_to_ns(now), &STp->stats->tot_read_time);
 		atomic64_add(ktime_to_ns(now), &STp->stats->tot_io_time);
@@ -514,15 +514,18 @@ static void st_do_stats(struct scsi_tape *STp, struct request *req)
 static void st_scsi_execute_end(struct request *req, int uptodate)
 {
 	struct st_request *SRpnt = req->end_io_data;
+	struct scsi_request *rq = scsi_req(req);
 	struct scsi_tape *STp = SRpnt->stp;
 	struct bio *tmp;
 
 	STp->buffer->cmdstat.midlevel_result = SRpnt->result = req->errors;
-	STp->buffer->cmdstat.residual = req->resid_len;
+	STp->buffer->cmdstat.residual = rq->resid_len;
 
 	st_do_stats(STp, req);
 
 	tmp = SRpnt->bio;
+	if (rq->sense_len)
+		memcpy(SRpnt->sense, rq->sense, SCSI_SENSE_BUFFERSIZE);
 	if (SRpnt->waiting)
 		complete(SRpnt->waiting);
 
@@ -535,6 +538,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd,
 			   int timeout, int retries)
 {
 	struct request *req;
+	struct scsi_request *rq;
 	struct rq_map_data *mdata = &SRpnt->stp->buffer->map_data;
 	int err = 0;
 	int write = (data_direction == DMA_TO_DEVICE);
@@ -544,8 +548,8 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd,
 			      GFP_KERNEL);
 	if (IS_ERR(req))
 		return DRIVER_ERROR << 24;
-
-	blk_rq_set_block_pc(req);
+	rq = scsi_req(req);
+	scsi_req_init(req);
 	req->rq_flags |= RQF_QUIET;
 
 	mdata->null_mapped = 1;
@@ -571,11 +575,9 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd,
 	}
 
 	SRpnt->bio = req->bio;
-	req->cmd_len = COMMAND_SIZE(cmd[0]);
-	memset(req->cmd, 0, BLK_MAX_CDB);
-	memcpy(req->cmd, cmd, req->cmd_len);
-	req->sense = SRpnt->sense;
-	req->sense_len = 0;
+	rq->cmd_len = COMMAND_SIZE(cmd[0]);
+	memset(rq->cmd, 0, BLK_MAX_CDB);
+	memcpy(rq->cmd, cmd, rq->cmd_len);
 	req->timeout = timeout;
 	req->retries = retries;
 	req->end_io_data = SRpnt;
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index 04d7aa7390d0..e52f4e1a4675 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -1013,7 +1013,7 @@ pscsi_execute_cmd(struct se_cmd *cmd)
 		goto fail;
 	}
 
-	blk_rq_set_block_pc(req);
+	scsi_req_init(req);
 
 	if (sgl) {
 		ret = pscsi_map_sg(cmd, sgl, sgl_nents, req);
@@ -1023,10 +1023,8 @@ pscsi_execute_cmd(struct se_cmd *cmd)
 
 	req->end_io = pscsi_req_done;
 	req->end_io_data = cmd;
-	req->cmd_len = scsi_command_size(pt->pscsi_cdb);
-	req->cmd = &pt->pscsi_cdb[0];
-	req->sense = &pt->pscsi_sense[0];
-	req->sense_len = 0;
+	scsi_req(req)->cmd_len = scsi_command_size(pt->pscsi_cdb);
+	scsi_req(req)->cmd = &pt->pscsi_cdb[0];
 	if (pdv->pdv_sd->type == TYPE_DISK)
 		req->timeout = PS_TIMEOUT_DISK;
 	else
@@ -1075,7 +1073,7 @@ static void pscsi_req_done(struct request *req, int uptodate)
 	struct pscsi_plugin_task *pt = cmd->priv;
 
 	pt->pscsi_result = req->errors;
-	pt->pscsi_resid = req->resid_len;
+	pt->pscsi_resid = scsi_req(req)->resid_len;
 
 	cmd->scsi_status = status_byte(pt->pscsi_result) << 1;
 	if (cmd->scsi_status) {
@@ -1096,6 +1094,7 @@ static void pscsi_req_done(struct request *req, int uptodate)
 		break;
 	}
 
+	memcpy(pt->pscsi_sense, scsi_req(req)->sense, TRANSPORT_SENSE_BUFFER);
 	__blk_put_request(req->q, req);
 	kfree(pt);
 }
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 0780ff864539..930d98caab5b 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -10,6 +10,7 @@
 #include <linux/nfsd/debug.h>
 #include <scsi/scsi_proto.h>
 #include <scsi/scsi_common.h>
+#include <scsi/scsi_request.h>
 
 #include "blocklayoutxdr.h"
 #include "pnfs.h"
@@ -213,6 +214,7 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev,
 {
 	struct request_queue *q = bdev->bd_disk->queue;
 	struct request *rq;
+	struct scsi_request *req;
 	size_t bufflen = 252, len, id_len;
 	u8 *buf, *d, type, assoc;
 	int error;
@@ -226,18 +228,19 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev,
 		error = -ENOMEM;
 		goto out_free_buf;
 	}
-	blk_rq_set_block_pc(rq);
+	req = scsi_req(rq);
+	scsi_req_init(rq);
 
 	error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL);
 	if (error)
 		goto out_put_request;
 
-	rq->cmd[0] = INQUIRY;
-	rq->cmd[1] = 1;
-	rq->cmd[2] = 0x83;
-	rq->cmd[3] = bufflen >> 8;
-	rq->cmd[4] = bufflen & 0xff;
-	rq->cmd_len = COMMAND_SIZE(INQUIRY);
+	req->cmd[0] = INQUIRY;
+	req->cmd[1] = 1;
+	req->cmd[2] = 0x83;
+	req->cmd[3] = bufflen >> 8;
+	req->cmd[4] = bufflen & 0xff;
+	req->cmd_len = COMMAND_SIZE(INQUIRY);
 
 	error = blk_execute_rq(rq->q, NULL, rq, 1);
 	if (error) {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 461b7cf6af1d..e4c5f284fe2d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -128,8 +128,6 @@ typedef __u32 __bitwise req_flags_t;
 #define RQF_NOMERGE_FLAGS \
 	(RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
 
-#define BLK_MAX_CDB	16
-
 /*
  * Try to put the fields that are referenced together in the same cacheline.
  *
@@ -227,17 +225,7 @@ struct request {
 
 	int errors;
 
-	/*
-	 * when request is used as a packet command carrier
-	 */
-	unsigned char __cmd[BLK_MAX_CDB];
-	unsigned char *cmd;
-	unsigned short cmd_len;
-
 	unsigned int extra_len;	/* length of alignment and padding */
-	unsigned int sense_len;
-	unsigned int resid_len;	/* residual count */
-	void *sense;
 
 	unsigned long deadline;
 	struct list_head timeout_list;
@@ -925,7 +913,6 @@ extern void blk_rq_init(struct request_queue *q, struct request *rq);
 extern void blk_put_request(struct request *);
 extern void __blk_put_request(struct request_queue *, struct request *);
 extern struct request *blk_get_request(struct request_queue *, int, gfp_t);
-extern void blk_rq_set_block_pc(struct request *);
 extern void blk_requeue_request(struct request_queue *, struct request *);
 extern int blk_lld_busy(struct request_queue *q);
 extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
diff --git a/include/linux/ide.h b/include/linux/ide.h
index a633898f36ac..086fbe172817 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -20,6 +20,7 @@
 #include <linux/mutex.h>
 /* for request_sense */
 #include <linux/cdrom.h>
+#include <scsi/scsi_cmnd.h>
 #include <asm/byteorder.h>
 #include <asm/io.h>
 
@@ -52,6 +53,11 @@ enum ata_cmd_type_bits {
 	((rq)->cmd_type == REQ_TYPE_ATA_PM_SUSPEND || \
 	 (rq)->cmd_type == REQ_TYPE_ATA_PM_RESUME)
 
+struct ide_request {
+	struct scsi_request sreq;
+	u8 sense[SCSI_SENSE_BUFFERSIZE];
+};
+
 /* Error codes returned in rq->errors to the higher part of the driver. */
 enum {
 	IDE_DRV_ERROR_GENERAL	= 101,
@@ -579,7 +585,7 @@ struct ide_drive_s {
 
 	/* current sense rq and buffer */
 	bool sense_rq_armed;
-	struct request sense_rq;
+	struct request *sense_rq;
 	struct request_sense sense_data;
 };
 
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index 9fc1aecfc813..f708f1acfc50 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -8,6 +8,7 @@
 #include <linux/timer.h>
 #include <linux/scatterlist.h>
 #include <scsi/scsi_device.h>
+#include <scsi/scsi_request.h>
 
 struct Scsi_Host;
 struct scsi_driver;
@@ -57,6 +58,7 @@ struct scsi_pointer {
 #define SCMD_TAGGED		(1 << 0)
 
 struct scsi_cmnd {
+	struct scsi_request req;
 	struct scsi_device *device;
 	struct list_head list;  /* scsi_cmnd participates in queue lists */
 	struct list_head eh_entry; /* entry for the host eh_cmd_q */
diff --git a/include/scsi/scsi_request.h b/include/scsi/scsi_request.h
new file mode 100644
index 000000000000..ba0aeb980f7e
--- /dev/null
+++ b/include/scsi/scsi_request.h
@@ -0,0 +1,30 @@
+#ifndef _SCSI_SCSI_REQUEST_H
+#define _SCSI_SCSI_REQUEST_H
+
+#include <linux/blk-mq.h>
+
+#define BLK_MAX_CDB	16
+
+struct scsi_request {
+	unsigned char	__cmd[BLK_MAX_CDB];
+	unsigned char	*cmd;
+	unsigned short	cmd_len;
+	unsigned int	sense_len;
+	unsigned int	resid_len;	/* residual count */
+	void		*sense;
+};
+
+static inline struct scsi_request *scsi_req(struct request *rq)
+{
+	return blk_mq_rq_to_pdu(rq);
+}
+
+static inline void scsi_req_free_cmd(struct scsi_request *req)
+{
+	if (req->cmd != req->__cmd)
+		kfree(req->cmd);
+}
+
+void scsi_req_init(struct request *);
+
+#endif /* _SCSI_SCSI_REQUEST_H */
-- 
cgit v1.2.3


From 15f2e88ddd4bc9b2c6b6236162993b5caa80abb9 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <mawilcox@microsoft.com>
Date: Fri, 16 Dec 2016 14:46:09 -0500
Subject: radix tree: Add some implicit includes

We were using spinlock_t and INIT_LIST_HEAD without including spinlock.h
or list.h.  They were being implicitly included through some other header
file, but that's fragile.

Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
---
 include/linux/radix-tree.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 52bda854593b..13d8d741ca34 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -22,11 +22,13 @@
 #define _LINUX_RADIX_TREE_H
 
 #include <linux/bitops.h>
-#include <linux/preempt.h>
-#include <linux/types.h>
 #include <linux/bug.h>
 #include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/preempt.h>
 #include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
 
 /*
  * The bottom two bits of the slot determine how the remaining bits in the
-- 
cgit v1.2.3


From 35534c869c62f59203c1822769bbef14e894a9e9 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <mawilcox@microsoft.com>
Date: Mon, 19 Dec 2016 17:43:19 -0500
Subject: radix tree: constify some pointers

If we're just getting the value of a tag, or looking up an entry,
we won't modify the radix tree, so we can declare these functions as
taking a const pointer.  Mostly for documentation purposes, though it
might help code generation.

Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
---
 include/linux/radix-tree.h | 22 +++++++++---------
 lib/radix-tree.c           | 57 +++++++++++++++++++++++++---------------------
 2 files changed, 42 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 13d8d741ca34..32683c7c2e0d 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -125,7 +125,7 @@ do {									\
 	(root)->rnode = NULL;						\
 } while (0)
 
-static inline bool radix_tree_empty(struct radix_tree_root *root)
+static inline bool radix_tree_empty(const struct radix_tree_root *root)
 {
 	return root->rnode == NULL;
 }
@@ -294,10 +294,10 @@ static inline int radix_tree_insert(struct radix_tree_root *root,
 {
 	return __radix_tree_insert(root, index, 0, entry);
 }
-void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
+void *__radix_tree_lookup(const struct radix_tree_root *, unsigned long index,
 			  struct radix_tree_node **nodep, void ***slotp);
-void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
-void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
+void *radix_tree_lookup(const struct radix_tree_root *, unsigned long);
+void **radix_tree_lookup_slot(const struct radix_tree_root *, unsigned long);
 typedef void (*radix_tree_update_node_t)(struct radix_tree_node *, void *);
 void __radix_tree_replace(struct radix_tree_root *root,
 			  struct radix_tree_node *node,
@@ -316,10 +316,10 @@ void *radix_tree_delete(struct radix_tree_root *, unsigned long);
 void radix_tree_clear_tags(struct radix_tree_root *root,
 			   struct radix_tree_node *node,
 			   void **slot);
-unsigned int radix_tree_gang_lookup(struct radix_tree_root *root,
+unsigned int radix_tree_gang_lookup(const struct radix_tree_root *,
 			void **results, unsigned long first_index,
 			unsigned int max_items);
-unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
+unsigned int radix_tree_gang_lookup_slot(const struct radix_tree_root *,
 			void ***results, unsigned long *indices,
 			unsigned long first_index, unsigned int max_items);
 int radix_tree_preload(gfp_t gfp_mask);
@@ -330,19 +330,19 @@ void *radix_tree_tag_set(struct radix_tree_root *root,
 			unsigned long index, unsigned int tag);
 void *radix_tree_tag_clear(struct radix_tree_root *root,
 			unsigned long index, unsigned int tag);
-int radix_tree_tag_get(struct radix_tree_root *root,
+int radix_tree_tag_get(const struct radix_tree_root *,
 			unsigned long index, unsigned int tag);
 void radix_tree_iter_tag_set(struct radix_tree_root *root,
 		const struct radix_tree_iter *iter, unsigned int tag);
 unsigned int
-radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
+radix_tree_gang_lookup_tag(const struct radix_tree_root *, void **results,
 		unsigned long first_index, unsigned int max_items,
 		unsigned int tag);
 unsigned int
-radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
+radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *, void ***results,
 		unsigned long first_index, unsigned int max_items,
 		unsigned int tag);
-int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
+int radix_tree_tagged(const struct radix_tree_root *root, unsigned int tag);
 
 static inline void radix_tree_preload_end(void)
 {
@@ -395,7 +395,7 @@ radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start)
  * Also it fills @iter with data about chunk: position in the tree (index),
  * its end (next_index), and constructs a bit mask for tagged iterating (tags).
  */
-void **radix_tree_next_chunk(struct radix_tree_root *root,
+void **radix_tree_next_chunk(const struct radix_tree_root *,
 			     struct radix_tree_iter *iter, unsigned flags);
 
 /**
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 84812a9fb16f..6f86fbac0e36 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -83,26 +83,28 @@ static inline void *node_to_entry(void *ptr)
 
 #ifdef CONFIG_RADIX_TREE_MULTIORDER
 /* Sibling slots point directly to another slot in the same node */
-static inline bool is_sibling_entry(struct radix_tree_node *parent, void *node)
+static inline
+bool is_sibling_entry(const struct radix_tree_node *parent, void *node)
 {
 	void **ptr = node;
 	return (parent->slots <= ptr) &&
 			(ptr < parent->slots + RADIX_TREE_MAP_SIZE);
 }
 #else
-static inline bool is_sibling_entry(struct radix_tree_node *parent, void *node)
+static inline
+bool is_sibling_entry(const struct radix_tree_node *parent, void *node)
 {
 	return false;
 }
 #endif
 
-static inline unsigned long get_slot_offset(struct radix_tree_node *parent,
-						 void **slot)
+static inline
+unsigned long get_slot_offset(const struct radix_tree_node *parent, void **slot)
 {
 	return slot - parent->slots;
 }
 
-static unsigned int radix_tree_descend(struct radix_tree_node *parent,
+static unsigned int radix_tree_descend(const struct radix_tree_node *parent,
 			struct radix_tree_node **nodep, unsigned long index)
 {
 	unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK;
@@ -122,7 +124,7 @@ static unsigned int radix_tree_descend(struct radix_tree_node *parent,
 	return offset;
 }
 
-static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
+static inline gfp_t root_gfp_mask(const struct radix_tree_root *root)
 {
 	return root->gfp_mask & __GFP_BITS_MASK;
 }
@@ -139,13 +141,13 @@ static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
 	__clear_bit(offset, node->tags[tag]);
 }
 
-static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
+static inline int tag_get(const struct radix_tree_node *node, unsigned int tag,
 		int offset)
 {
 	return test_bit(offset, node->tags[tag]);
 }
 
-static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag)
+static inline void root_tag_set(struct radix_tree_root *root, unsigned tag)
 {
 	root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT));
 }
@@ -160,12 +162,12 @@ static inline void root_tag_clear_all(struct radix_tree_root *root)
 	root->gfp_mask &= __GFP_BITS_MASK;
 }
 
-static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
+static inline int root_tag_get(const struct radix_tree_root *root, unsigned tag)
 {
 	return (__force int)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
 }
 
-static inline unsigned root_tags_get(struct radix_tree_root *root)
+static inline unsigned root_tags_get(const struct radix_tree_root *root)
 {
 	return (__force unsigned)root->gfp_mask >> __GFP_BITS_SHIFT;
 }
@@ -174,7 +176,8 @@ static inline unsigned root_tags_get(struct radix_tree_root *root)
  * Returns 1 if any slot in the node has this tag set.
  * Otherwise returns 0.
  */
-static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
+static inline int any_tag_set(const struct radix_tree_node *node,
+							unsigned int tag)
 {
 	unsigned idx;
 	for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
@@ -232,7 +235,7 @@ static inline unsigned long shift_maxindex(unsigned int shift)
 	return (RADIX_TREE_MAP_SIZE << shift) - 1;
 }
 
-static inline unsigned long node_maxindex(struct radix_tree_node *node)
+static inline unsigned long node_maxindex(const struct radix_tree_node *node)
 {
 	return shift_maxindex(node->shift);
 }
@@ -510,7 +513,7 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
 	return __radix_tree_preload(gfp_mask, nr_nodes);
 }
 
-static unsigned radix_tree_load_root(struct radix_tree_root *root,
+static unsigned radix_tree_load_root(const struct radix_tree_root *root,
 		struct radix_tree_node **nodep, unsigned long *maxindex)
 {
 	struct radix_tree_node *node = rcu_dereference_raw(root->rnode);
@@ -908,8 +911,9 @@ EXPORT_SYMBOL(__radix_tree_insert);
  *	allocated and @root->rnode is used as a direct slot instead of
  *	pointing to a node, in which case *@nodep will be NULL.
  */
-void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
-			  struct radix_tree_node **nodep, void ***slotp)
+void *__radix_tree_lookup(const struct radix_tree_root *root,
+			  unsigned long index, struct radix_tree_node **nodep,
+			  void ***slotp)
 {
 	struct radix_tree_node *node, *parent;
 	unsigned long maxindex;
@@ -952,7 +956,8 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
  *	exclusive from other writers. Any dereference of the slot must be done
  *	using radix_tree_deref_slot.
  */
-void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
+void **radix_tree_lookup_slot(const struct radix_tree_root *root,
+				unsigned long index)
 {
 	void **slot;
 
@@ -974,7 +979,7 @@ EXPORT_SYMBOL(radix_tree_lookup_slot);
  *	them safely). No RCU barriers are required to access or modify the
  *	returned item, however.
  */
-void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
+void *radix_tree_lookup(const struct radix_tree_root *root, unsigned long index)
 {
 	return __radix_tree_lookup(root, index, NULL, NULL);
 }
@@ -1404,7 +1409,7 @@ EXPORT_SYMBOL(radix_tree_tag_clear);
  * the RCU lock is held, unless tag modification and node deletion are excluded
  * from concurrency.
  */
-int radix_tree_tag_get(struct radix_tree_root *root,
+int radix_tree_tag_get(const struct radix_tree_root *root,
 			unsigned long index, unsigned int tag)
 {
 	struct radix_tree_node *node, *parent;
@@ -1568,7 +1573,7 @@ EXPORT_SYMBOL(radix_tree_iter_resume);
  * @flags:	RADIX_TREE_ITER_* flags and tag index
  * Returns:	pointer to chunk first slot, or NULL if iteration is over
  */
-void **radix_tree_next_chunk(struct radix_tree_root *root,
+void **radix_tree_next_chunk(const struct radix_tree_root *root,
 			     struct radix_tree_iter *iter, unsigned flags)
 {
 	unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
@@ -1679,7 +1684,7 @@ EXPORT_SYMBOL(radix_tree_next_chunk);
  *	stored in 'results'.
  */
 unsigned int
-radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
+radix_tree_gang_lookup(const struct radix_tree_root *root, void **results,
 			unsigned long first_index, unsigned int max_items)
 {
 	struct radix_tree_iter iter;
@@ -1724,7 +1729,7 @@ EXPORT_SYMBOL(radix_tree_gang_lookup);
  *	protection, radix_tree_deref_slot may fail requiring a retry.
  */
 unsigned int
-radix_tree_gang_lookup_slot(struct radix_tree_root *root,
+radix_tree_gang_lookup_slot(const struct radix_tree_root *root,
 			void ***results, unsigned long *indices,
 			unsigned long first_index, unsigned int max_items)
 {
@@ -1761,7 +1766,7 @@ EXPORT_SYMBOL(radix_tree_gang_lookup_slot);
  *	returns the number of items which were placed at *@results.
  */
 unsigned int
-radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
+radix_tree_gang_lookup_tag(const struct radix_tree_root *root, void **results,
 		unsigned long first_index, unsigned int max_items,
 		unsigned int tag)
 {
@@ -1802,9 +1807,9 @@ EXPORT_SYMBOL(radix_tree_gang_lookup_tag);
  *	returns the number of slots which were placed at *@results.
  */
 unsigned int
-radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
-		unsigned long first_index, unsigned int max_items,
-		unsigned int tag)
+radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *root,
+		void ***results, unsigned long first_index,
+		unsigned int max_items, unsigned int tag)
 {
 	struct radix_tree_iter iter;
 	void **slot;
@@ -1921,7 +1926,7 @@ void radix_tree_clear_tags(struct radix_tree_root *root,
  *	@root:		radix tree root
  *	@tag:		tag to test
  */
-int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
+int radix_tree_tagged(const struct radix_tree_root *root, unsigned int tag)
 {
 	return root_tag_get(root, tag);
 }
-- 
cgit v1.2.3


From d732248fdb5c5434f2ab0c258ce25a7e2ff2521a Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Tue, 24 Jan 2017 14:41:41 +0100
Subject: iio: cros_ec: Add cros_ec barometer driver

Handle the barometer sensor presented by the ChromeOS EC Sensor hub.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Signed-off-by: Enric Balletbo Serra <enric.balletbo@collabora.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 drivers/iio/pressure/Kconfig          |  10 ++
 drivers/iio/pressure/Makefile         |   1 +
 drivers/iio/pressure/cros_ec_baro.c   | 220 ++++++++++++++++++++++++++++++++++
 drivers/platform/chrome/cros_ec_dev.c |   3 +
 include/linux/mfd/cros_ec_commands.h  |   3 +-
 5 files changed, 236 insertions(+), 1 deletion(-)
 create mode 100644 drivers/iio/pressure/cros_ec_baro.c

(limited to 'include/linux')

diff --git a/drivers/iio/pressure/Kconfig b/drivers/iio/pressure/Kconfig
index bd8d96b96771..5d16b252ab6b 100644
--- a/drivers/iio/pressure/Kconfig
+++ b/drivers/iio/pressure/Kconfig
@@ -42,6 +42,16 @@ config BMP280_SPI
 	depends on SPI_MASTER
 	select REGMAP
 
+config IIO_CROS_EC_BARO
+	tristate "ChromeOS EC Barometer Sensor"
+	depends on IIO_CROS_EC_SENSORS_CORE
+	help
+	  Say yes here to build support for the Barometer sensor when
+	  presented by the ChromeOS EC Sensor hub.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called cros_ec_baro.
+
 config HID_SENSOR_PRESS
 	depends on HID_SENSOR_HUB
 	select IIO_BUFFER
diff --git a/drivers/iio/pressure/Makefile b/drivers/iio/pressure/Makefile
index de3dbc81dc5a..838642789389 100644
--- a/drivers/iio/pressure/Makefile
+++ b/drivers/iio/pressure/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_BMP280) += bmp280.o
 bmp280-objs := bmp280-core.o bmp280-regmap.o
 obj-$(CONFIG_BMP280_I2C) += bmp280-i2c.o
 obj-$(CONFIG_BMP280_SPI) += bmp280-spi.o
+obj-$(CONFIG_IIO_CROS_EC_BARO) += cros_ec_baro.o
 obj-$(CONFIG_HID_SENSOR_PRESS)   += hid-sensor-press.o
 obj-$(CONFIG_HP03) += hp03.o
 obj-$(CONFIG_MPL115) += mpl115.o
diff --git a/drivers/iio/pressure/cros_ec_baro.c b/drivers/iio/pressure/cros_ec_baro.c
new file mode 100644
index 000000000000..48b2a30f57ae
--- /dev/null
+++ b/drivers/iio/pressure/cros_ec_baro.c
@@ -0,0 +1,220 @@
+/*
+ * cros_ec_baro - Driver for barometer sensor behind CrosEC.
+ *
+ * Copyright (C) 2017 Google, Inc
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/iio/buffer.h>
+#include <linux/iio/iio.h>
+#include <linux/iio/kfifo_buf.h>
+#include <linux/iio/trigger.h>
+#include <linux/iio/triggered_buffer.h>
+#include <linux/iio/trigger_consumer.h>
+#include <linux/kernel.h>
+#include <linux/mfd/cros_ec.h>
+#include <linux/mfd/cros_ec_commands.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/platform_device.h>
+
+#include "../common/cros_ec_sensors/cros_ec_sensors_core.h"
+
+/*
+ * One channel for pressure, the other for timestamp.
+ */
+#define CROS_EC_BARO_MAX_CHANNELS (1 + 1)
+
+/* State data for ec_sensors iio driver. */
+struct cros_ec_baro_state {
+	/* Shared by all sensors */
+	struct cros_ec_sensors_core_state core;
+
+	struct iio_chan_spec channels[CROS_EC_BARO_MAX_CHANNELS];
+};
+
+static int cros_ec_baro_read(struct iio_dev *indio_dev,
+			     struct iio_chan_spec const *chan,
+			     int *val, int *val2, long mask)
+{
+	struct cros_ec_baro_state *st = iio_priv(indio_dev);
+	u16 data = 0;
+	int ret = IIO_VAL_INT;
+	int idx = chan->scan_index;
+
+	mutex_lock(&st->core.cmd_lock);
+
+	switch (mask) {
+	case IIO_CHAN_INFO_RAW:
+		if (cros_ec_sensors_read_cmd(indio_dev, 1 << idx,
+					     (s16 *)&data) < 0)
+			ret = -EIO;
+		*val = data;
+		break;
+	case IIO_CHAN_INFO_SCALE:
+		st->core.param.cmd = MOTIONSENSE_CMD_SENSOR_RANGE;
+		st->core.param.sensor_range.data = EC_MOTION_SENSE_NO_VALUE;
+
+		if (cros_ec_motion_send_host_cmd(&st->core, 0)) {
+			ret = -EIO;
+			break;
+		}
+		*val = st->core.resp->sensor_range.ret;
+
+		/* scale * in_pressure_raw --> kPa */
+		*val2 = 10 << CROS_EC_SENSOR_BITS;
+		ret = IIO_VAL_FRACTIONAL;
+		break;
+	default:
+		ret = cros_ec_sensors_core_read(&st->core, chan, val, val2,
+						mask);
+		break;
+	}
+
+	mutex_unlock(&st->core.cmd_lock);
+
+	return ret;
+}
+
+static int cros_ec_baro_write(struct iio_dev *indio_dev,
+			      struct iio_chan_spec const *chan,
+			      int val, int val2, long mask)
+{
+	struct cros_ec_baro_state *st = iio_priv(indio_dev);
+	int ret = 0;
+
+	mutex_lock(&st->core.cmd_lock);
+
+	switch (mask) {
+	case IIO_CHAN_INFO_SCALE:
+		st->core.param.cmd = MOTIONSENSE_CMD_SENSOR_RANGE;
+		st->core.param.sensor_range.data = val;
+
+		/* Always roundup, so caller gets at least what it asks for. */
+		st->core.param.sensor_range.roundup = 1;
+
+		if (cros_ec_motion_send_host_cmd(&st->core, 0))
+			ret = -EIO;
+		break;
+	default:
+		ret = cros_ec_sensors_core_write(&st->core, chan, val, val2,
+						 mask);
+		break;
+	}
+
+	mutex_unlock(&st->core.cmd_lock);
+
+	return ret;
+}
+
+static const struct iio_info cros_ec_baro_info = {
+	.read_raw = &cros_ec_baro_read,
+	.write_raw = &cros_ec_baro_write,
+	.driver_module = THIS_MODULE,
+};
+
+static int cros_ec_baro_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct cros_ec_dev *ec_dev = dev_get_drvdata(dev->parent);
+	struct cros_ec_device *ec_device;
+	struct iio_dev *indio_dev;
+	struct cros_ec_baro_state *state;
+	struct iio_chan_spec *channel;
+	int ret;
+
+	if (!ec_dev || !ec_dev->ec_dev) {
+		dev_warn(dev, "No CROS EC device found.\n");
+		return -EINVAL;
+	}
+	ec_device = ec_dev->ec_dev;
+
+	indio_dev = devm_iio_device_alloc(dev, sizeof(*state));
+	if (!indio_dev)
+		return -ENOMEM;
+
+	ret = cros_ec_sensors_core_init(pdev, indio_dev, true);
+	if (ret)
+		return ret;
+
+	indio_dev->info = &cros_ec_baro_info;
+	state = iio_priv(indio_dev);
+	state->core.type = state->core.resp->info.type;
+	state->core.loc = state->core.resp->info.location;
+	channel = state->channels;
+	/* Common part */
+	channel->info_mask_separate = BIT(IIO_CHAN_INFO_RAW);
+	channel->info_mask_shared_by_all =
+		BIT(IIO_CHAN_INFO_SCALE) |
+		BIT(IIO_CHAN_INFO_SAMP_FREQ) |
+		BIT(IIO_CHAN_INFO_FREQUENCY);
+	channel->scan_type.realbits = CROS_EC_SENSOR_BITS;
+	channel->scan_type.storagebits = CROS_EC_SENSOR_BITS;
+	channel->scan_type.shift = 0;
+	channel->scan_index = 0;
+	channel->ext_info = cros_ec_sensors_ext_info;
+	channel->scan_type.sign = 'u';
+
+	state->core.calib[0] = 0;
+
+	/* Sensor specific */
+	switch (state->core.type) {
+	case MOTIONSENSE_TYPE_BARO:
+		channel->type = IIO_PRESSURE;
+		break;
+	default:
+		dev_warn(dev, "Unknown motion sensor\n");
+		return -EINVAL;
+	}
+
+	/* Timestamp */
+	channel++;
+	channel->type = IIO_TIMESTAMP;
+	channel->channel = -1;
+	channel->scan_index = 1;
+	channel->scan_type.sign = 's';
+	channel->scan_type.realbits = 64;
+	channel->scan_type.storagebits = 64;
+
+	indio_dev->channels = state->channels;
+	indio_dev->num_channels = CROS_EC_BARO_MAX_CHANNELS;
+
+	state->core.read_ec_sensors_data = cros_ec_sensors_read_cmd;
+
+	ret = devm_iio_triggered_buffer_setup(dev, indio_dev, NULL,
+					      cros_ec_sensors_capture, NULL);
+	if (ret)
+		return ret;
+
+	return devm_iio_device_register(dev, indio_dev);
+}
+
+static const struct platform_device_id cros_ec_baro_ids[] = {
+	{
+		.name = "cros-ec-baro",
+	},
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(platform, cros_ec_baro_ids);
+
+static struct platform_driver cros_ec_baro_platform_driver = {
+	.driver = {
+		.name	= "cros-ec-baro",
+	},
+	.probe		= cros_ec_baro_probe,
+	.id_table	= cros_ec_baro_ids,
+};
+module_platform_driver(cros_ec_baro_platform_driver);
+
+MODULE_DESCRIPTION("ChromeOS EC barometer sensor driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/platform/chrome/cros_ec_dev.c b/drivers/platform/chrome/cros_ec_dev.c
index 47268ecedc4d..6f09da4dadb8 100644
--- a/drivers/platform/chrome/cros_ec_dev.c
+++ b/drivers/platform/chrome/cros_ec_dev.c
@@ -328,6 +328,9 @@ static void cros_ec_sensors_register(struct cros_ec_dev *ec)
 		case MOTIONSENSE_TYPE_ACCEL:
 			sensor_cells[id].name = "cros-ec-accel";
 			break;
+		case MOTIONSENSE_TYPE_BARO:
+			sensor_cells[id].name = "cros-ec-baro";
+			break;
 		case MOTIONSENSE_TYPE_GYRO:
 			sensor_cells[id].name = "cros-ec-gyro";
 			break;
diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index 1683003603f3..098c3501ad2c 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -1441,7 +1441,8 @@ enum motionsensor_type {
 	MOTIONSENSE_TYPE_PROX = 3,
 	MOTIONSENSE_TYPE_LIGHT = 4,
 	MOTIONSENSE_TYPE_ACTIVITY = 5,
-	MOTIONSENSE_TYPE_MAX
+	MOTIONSENSE_TYPE_BARO = 6,
+	MOTIONSENSE_TYPE_MAX,
 };
 
 /* List of motion sensor locations. */
-- 
cgit v1.2.3


From cefae80249f6bf3fdbc7c3a00bc43deb4c1bccf0 Mon Sep 17 00:00:00 2001
From: Luis Oliveira <Luis.Oliveira@synopsys.com>
Date: Thu, 26 Jan 2017 17:45:32 +0000
Subject: i2c: core: helper function to detect slave mode

This function has the purpose of mode detection by checking the
device nodes for a reg matching with the I2C_OWN_SLAVE_ADDREESS flag.
Currently only checks using OF functions (ACPI slave not supported yet).

Signed-off-by: Luis Oliveira <lolivei@synopsys.com>
Suggested-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 drivers/i2c/i2c-core.c | 33 +++++++++++++++++++++++++++++++++
 include/linux/i2c.h    |  1 +
 2 files changed, 34 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index 583e95042a21..408de741fec5 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -3690,6 +3690,39 @@ int i2c_slave_unregister(struct i2c_client *client)
 	return ret;
 }
 EXPORT_SYMBOL_GPL(i2c_slave_unregister);
+
+/**
+ * i2c_detect_slave_mode - detect operation mode
+ * @dev: The device owning the bus
+ *
+ * This checks the device nodes for an I2C slave by checking the address
+ * used in the reg property. If the address match the I2C_OWN_SLAVE_ADDRESS
+ * flag this means the device is configured to act as a I2C slave and it will
+ * be listening at that address.
+ *
+ * Returns true if an I2C own slave address is detected, otherwise returns
+ * false.
+ */
+bool i2c_detect_slave_mode(struct device *dev)
+{
+	if (IS_BUILTIN(CONFIG_OF) && dev->of_node) {
+		struct device_node *child;
+		u32 reg;
+
+		for_each_child_of_node(dev->of_node, child) {
+			of_property_read_u32(child, "reg", &reg);
+			if (reg & I2C_OWN_SLAVE_ADDRESS) {
+				of_node_put(child);
+				return true;
+			}
+		}
+	} else if (IS_BUILTIN(CONFIG_ACPI) && ACPI_HANDLE(dev)) {
+		dev_dbg(dev, "ACPI slave is not supported yet\n");
+	}
+	return false;
+}
+EXPORT_SYMBOL_GPL(i2c_detect_slave_mode);
+
 #endif
 
 MODULE_AUTHOR("Simon G. Vogl <simon@tk.uni-linz.ac.at>");
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 4b45ec46161f..f0ba4bac7452 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -282,6 +282,7 @@ enum i2c_slave_event {
 
 extern int i2c_slave_register(struct i2c_client *client, i2c_slave_cb_t slave_cb);
 extern int i2c_slave_unregister(struct i2c_client *client);
+extern bool i2c_detect_slave_mode(struct device *dev);
 
 static inline int i2c_slave_event(struct i2c_client *client,
 				  enum i2c_slave_event event, u8 *val)
-- 
cgit v1.2.3


From 0a595ebaaa6b53a2226d3fee2a2fd616ea5ba378 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Wed, 14 Dec 2016 10:12:56 -0800
Subject: f2fs: support IO alignment for DATA and NODE writes

This patch implements IO alignment by filling dummy blocks in DATA and NODE
write bios. If we can guarantee, for example, 32KB or 64KB for such the IOs,
we can eliminate underlying dummy page problem which FTL conducts in order to
close MLC or TLC partial written pages.

Note that,
 - it requires "-o mode=lfs".
 - IO size should be power of 2, not exceed BIO_MAX_PAGES, 256.
 - read IO is still 4KB.
 - do checkpoint at fsync, if dummy NODE page was written.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c          | 55 +++++++++++++++++++++++++++++++++++++++++++++++--
 fs/f2fs/f2fs.h          |  4 +++-
 fs/f2fs/segment.c       |  9 ++++++--
 fs/f2fs/segment.h       |  3 +++
 fs/f2fs/super.c         | 13 +++++++++++-
 include/linux/f2fs_fs.h |  6 ++++++
 6 files changed, 84 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index a06b2d187aec..12d235f2c771 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -93,6 +93,17 @@ static void f2fs_write_end_io(struct bio *bio)
 		struct page *page = bvec->bv_page;
 		enum count_type type = WB_DATA_TYPE(page);
 
+		if (IS_DUMMY_WRITTEN_PAGE(page)) {
+			set_page_private(page, (unsigned long)NULL);
+			ClearPagePrivate(page);
+			unlock_page(page);
+			mempool_free(page, sbi->write_io_dummy);
+
+			if (unlikely(bio->bi_error))
+				f2fs_stop_checkpoint(sbi, true);
+			continue;
+		}
+
 		fscrypt_pullback_bio_page(&page, true);
 
 		if (unlikely(bio->bi_error)) {
@@ -171,10 +182,42 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi,
 				struct bio *bio, enum page_type type)
 {
 	if (!is_read_io(bio_op(bio))) {
+		unsigned int start;
+
 		if (f2fs_sb_mounted_blkzoned(sbi->sb) &&
 			current->plug && (type == DATA || type == NODE))
 			blk_finish_plug(current->plug);
+
+		if (type != DATA && type != NODE)
+			goto submit_io;
+
+		start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS;
+		start %= F2FS_IO_SIZE(sbi);
+
+		if (start == 0)
+			goto submit_io;
+
+		/* fill dummy pages */
+		for (; start < F2FS_IO_SIZE(sbi); start++) {
+			struct page *page =
+				mempool_alloc(sbi->write_io_dummy,
+					GFP_NOIO | __GFP_ZERO | __GFP_NOFAIL);
+			f2fs_bug_on(sbi, !page);
+
+			SetPagePrivate(page);
+			set_page_private(page, (unsigned long)DUMMY_WRITTEN_PAGE);
+			lock_page(page);
+			if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
+				f2fs_bug_on(sbi, 1);
+		}
+		/*
+		 * In the NODE case, we lose next block address chain. So, we
+		 * need to do checkpoint in f2fs_sync_file.
+		 */
+		if (type == NODE)
+			set_sbi_flag(sbi, SBI_NEED_CP);
 	}
+submit_io:
 	if (is_read_io(bio_op(bio)))
 		trace_f2fs_submit_read_bio(sbi->sb, type, bio);
 	else
@@ -319,13 +362,14 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
 	return 0;
 }
 
-void f2fs_submit_page_mbio(struct f2fs_io_info *fio)
+int f2fs_submit_page_mbio(struct f2fs_io_info *fio)
 {
 	struct f2fs_sb_info *sbi = fio->sbi;
 	enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
 	struct f2fs_bio_info *io;
 	bool is_read = is_read_io(fio->op);
 	struct page *bio_page;
+	int err = 0;
 
 	io = is_read ? &sbi->read_io : &sbi->write_io[btype];
 
@@ -346,6 +390,12 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio)
 		__submit_merged_bio(io);
 alloc_new:
 	if (io->bio == NULL) {
+		if ((fio->type == DATA || fio->type == NODE) &&
+				fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) {
+			err = -EAGAIN;
+			dec_page_count(sbi, WB_DATA_TYPE(bio_page));
+			goto out_fail;
+		}
 		io->bio = __bio_alloc(sbi, fio->new_blkaddr,
 						BIO_MAX_PAGES, is_read);
 		io->fio = *fio;
@@ -359,9 +409,10 @@ alloc_new:
 
 	io->last_block_in_bio = fio->new_blkaddr;
 	f2fs_trace_ios(fio, 0);
-
+out_fail:
 	up_write(&io->io_rwsem);
 	trace_f2fs_submit_page_mbio(fio->page, fio);
+	return err;
 }
 
 static void __set_data_blkaddr(struct dnode_of_data *dn)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 2da8c3aa0ce5..0d7eaddef4a0 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -792,6 +792,8 @@ struct f2fs_sb_info {
 	struct f2fs_bio_info read_io;			/* for read bios */
 	struct f2fs_bio_info write_io[NR_PAGE_TYPE];	/* for write bios */
 	struct mutex wio_mutex[NODE + 1];	/* bio ordering for NODE/DATA */
+	int write_io_size_bits;			/* Write IO size bits */
+	mempool_t *write_io_dummy;		/* Dummy pages */
 
 	/* for checkpoint */
 	struct f2fs_checkpoint *ckpt;		/* raw checkpoint pointer */
@@ -2174,7 +2176,7 @@ void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *,
 				struct page *, nid_t, enum page_type, int);
 void f2fs_flush_merged_bios(struct f2fs_sb_info *);
 int f2fs_submit_page_bio(struct f2fs_io_info *);
-void f2fs_submit_page_mbio(struct f2fs_io_info *);
+int f2fs_submit_page_mbio(struct f2fs_io_info *);
 struct block_device *f2fs_target_device(struct f2fs_sb_info *,
 				block_t, struct bio *);
 int f2fs_target_device_index(struct f2fs_sb_info *, block_t);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index b6bb6490a640..2e8d12e9cae9 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1604,15 +1604,20 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
 static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
 {
 	int type = __get_segment_type(fio->page, fio->type);
+	int err;
 
 	if (fio->type == NODE || fio->type == DATA)
 		mutex_lock(&fio->sbi->wio_mutex[fio->type]);
-
+reallocate:
 	allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
 					&fio->new_blkaddr, sum, type);
 
 	/* writeout dirty page into bdev */
-	f2fs_submit_page_mbio(fio);
+	err = f2fs_submit_page_mbio(fio);
+	if (err == -EAGAIN) {
+		fio->old_blkaddr = fio->new_blkaddr;
+		goto reallocate;
+	}
 
 	if (fio->type == NODE || fio->type == DATA)
 		mutex_unlock(&fio->sbi->wio_mutex[fio->type]);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 9d44ce83acb2..08f1455c812c 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -186,9 +186,12 @@ struct segment_allocation {
  * the page is atomically written, and it is in inmem_pages list.
  */
 #define ATOMIC_WRITTEN_PAGE		((unsigned long)-1)
+#define DUMMY_WRITTEN_PAGE		((unsigned long)-2)
 
 #define IS_ATOMIC_WRITTEN_PAGE(page)			\
 		(page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE)
+#define IS_DUMMY_WRITTEN_PAGE(page)			\
+		(page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE)
 
 struct inmem_pages {
 	struct list_head list;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 46fd30d8af77..00fc36e49368 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1763,6 +1763,8 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
 				FDEV(i).total_segments,
 				FDEV(i).start_blk, FDEV(i).end_blk);
 	}
+	f2fs_msg(sbi->sb, KERN_INFO,
+			"IO Block Size: %8d KB", F2FS_IO_SIZE_KB(sbi));
 	return 0;
 }
 
@@ -1880,12 +1882,19 @@ try_onemore:
 	if (err)
 		goto free_options;
 
+	if (F2FS_IO_SIZE(sbi) > 1) {
+		sbi->write_io_dummy =
+			mempool_create_page_pool(F2FS_IO_SIZE(sbi) - 1, 0);
+		if (!sbi->write_io_dummy)
+			goto free_options;
+	}
+
 	/* get an inode for meta space */
 	sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi));
 	if (IS_ERR(sbi->meta_inode)) {
 		f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode");
 		err = PTR_ERR(sbi->meta_inode);
-		goto free_options;
+		goto free_io_dummy;
 	}
 
 	err = get_valid_checkpoint(sbi);
@@ -2103,6 +2112,8 @@ free_devices:
 free_meta_inode:
 	make_bad_inode(sbi->meta_inode);
 	iput(sbi->meta_inode);
+free_io_dummy:
+	mempool_destroy(sbi->write_io_dummy);
 free_options:
 	destroy_percpu_info(sbi);
 	kfree(options);
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index cea41a124a80..f0748524ca8c 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -36,6 +36,12 @@
 #define F2FS_NODE_INO(sbi)	(sbi->node_ino_num)
 #define F2FS_META_INO(sbi)	(sbi->meta_ino_num)
 
+#define F2FS_IO_SIZE(sbi)	(1 << (sbi)->write_io_size_bits) /* Blocks */
+#define F2FS_IO_SIZE_KB(sbi)	(1 << ((sbi)->write_io_size_bits + 2)) /* KB */
+#define F2FS_IO_SIZE_BYTES(sbi)	(1 << ((sbi)->write_io_size_bits + 12)) /* B */
+#define F2FS_IO_SIZE_BITS(sbi)	((sbi)->write_io_size_bits) /* power of 2 */
+#define F2FS_IO_SIZE_MASK(sbi)	(F2FS_IO_SIZE(sbi) - 1)
+
 /* This flag is used by node and meta inodes, and by recovery */
 #define GFP_F2FS_ZERO		(GFP_NOFS | __GFP_ZERO)
 #define GFP_F2FS_HIGH_ZERO	(GFP_NOFS | __GFP_ZERO | __GFP_HIGHMEM)
-- 
cgit v1.2.3


From 0cb8eb30d425d2d2ae28ab630596c44a158784c4 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Sun, 29 Jan 2017 14:42:52 +0100
Subject: leds: class: Add new optional brightness_hw_changed attribute

Some LEDs may have their brightness level changed autonomously
(outside of kernel control) by hardware / firmware. This commit
adds support for an optional brightness_hw_changed attribute to
signal such changes to userspace (if a driver can detect them):

What:		/sys/class/leds/<led>/brightness_hw_changed
Date:		January 2017
KernelVersion:	4.11
Description:
		Last hardware set brightness level for this LED. Some LEDs
		may be changed autonomously by hardware/firmware. Only LEDs
		where this happens and the driver can detect this, will
		have this file.

		This file supports poll() to detect when the hardware
		changes the brightness.

		Reading this file will return the last brightness level set
		by the hardware, this may be different from the current
		brightness.

Drivers which want to support this, simply add LED_BRIGHT_HW_CHANGED to
their flags field and call led_classdev_notify_brightness_hw_changed()
with the hardware set brightness when they detect a hardware / firmware
triggered brightness change.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Jacek Anaszewski <jacek.anaszewski@gmail.com>
---
 Documentation/ABI/testing/sysfs-class-led | 17 +++++++
 Documentation/leds/leds-class.txt         | 15 ++++++
 drivers/leds/Kconfig                      |  9 ++++
 drivers/leds/led-class.c                  | 76 +++++++++++++++++++++++++++++++
 include/linux/leds.h                      | 15 ++++++
 5 files changed, 132 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-class-led b/Documentation/ABI/testing/sysfs-class-led
index 491cdeedc195..5f67f7ab277b 100644
--- a/Documentation/ABI/testing/sysfs-class-led
+++ b/Documentation/ABI/testing/sysfs-class-led
@@ -23,6 +23,23 @@ Description:
 		If the LED does not support different brightness levels, this
 		should be 1.
 
+What:		/sys/class/leds/<led>/brightness_hw_changed
+Date:		January 2017
+KernelVersion:	4.11
+Description:
+		Last hardware set brightness level for this LED. Some LEDs
+		may be changed autonomously by hardware/firmware. Only LEDs
+		where this happens and the driver can detect this, will have
+		this file.
+
+		This file supports poll() to detect when the hardware changes
+		the brightness.
+
+		Reading this file will return the last brightness level set
+		by the hardware, this may be different from the current
+		brightness. Reading this file when no hw brightness change
+		event has happened will return an ENODATA error.
+
 What:		/sys/class/leds/<led>/trigger
 Date:		March 2006
 KernelVersion:	2.6.17
diff --git a/Documentation/leds/leds-class.txt b/Documentation/leds/leds-class.txt
index f1f7ec9f5cc5..836cb16d6f09 100644
--- a/Documentation/leds/leds-class.txt
+++ b/Documentation/leds/leds-class.txt
@@ -65,6 +65,21 @@ LED subsystem core exposes following API for setting brightness:
 		blinking, returns -EBUSY if software blink fallback is enabled.
 
 
+LED registration API
+====================
+
+A driver wanting to register a LED classdev for use by other drivers /
+userspace needs to allocate and fill a led_classdev struct and then call
+[devm_]led_classdev_register. If the non devm version is used the driver
+must call led_classdev_unregister from its remove function before
+free-ing the led_classdev struct.
+
+If the driver can detect hardware initiated brightness changes and thus
+wants to have a brightness_hw_changed attribute then the LED_BRIGHT_HW_CHANGED
+flag must be set in flags before registering. Calling
+led_classdev_notify_brightness_hw_changed on a classdev not registered with
+the LED_BRIGHT_HW_CHANGED flag is a bug and will trigger a WARN_ON.
+
 Hardware accelerated blink of LEDs
 ==================================
 
diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig
index c621cbbb5768..275f467956ee 100644
--- a/drivers/leds/Kconfig
+++ b/drivers/leds/Kconfig
@@ -29,6 +29,15 @@ config LEDS_CLASS_FLASH
 	  for the flash related features of a LED device. It can be built
 	  as a module.
 
+config LEDS_BRIGHTNESS_HW_CHANGED
+	bool "LED Class brightness_hw_changed attribute support"
+	depends on LEDS_CLASS
+	help
+	  This option enables support for the brightness_hw_changed attribute
+	  for led sysfs class devices under /sys/class/leds.
+
+	  See Documentation/ABI/testing/sysfs-class-led for details.
+
 comment "LED drivers"
 
 config LEDS_88PM860X
diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c
index 326ee6e925a2..f2b0a80a62b4 100644
--- a/drivers/leds/led-class.c
+++ b/drivers/leds/led-class.c
@@ -103,6 +103,68 @@ static const struct attribute_group *led_groups[] = {
 	NULL,
 };
 
+#ifdef CONFIG_LEDS_BRIGHTNESS_HW_CHANGED
+static ssize_t brightness_hw_changed_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+
+	if (led_cdev->brightness_hw_changed == -1)
+		return -ENODATA;
+
+	return sprintf(buf, "%u\n", led_cdev->brightness_hw_changed);
+}
+
+static DEVICE_ATTR_RO(brightness_hw_changed);
+
+static int led_add_brightness_hw_changed(struct led_classdev *led_cdev)
+{
+	struct device *dev = led_cdev->dev;
+	int ret;
+
+	ret = device_create_file(dev, &dev_attr_brightness_hw_changed);
+	if (ret) {
+		dev_err(dev, "Error creating brightness_hw_changed\n");
+		return ret;
+	}
+
+	led_cdev->brightness_hw_changed_kn =
+		sysfs_get_dirent(dev->kobj.sd, "brightness_hw_changed");
+	if (!led_cdev->brightness_hw_changed_kn) {
+		dev_err(dev, "Error getting brightness_hw_changed kn\n");
+		device_remove_file(dev, &dev_attr_brightness_hw_changed);
+		return -ENXIO;
+	}
+
+	return 0;
+}
+
+static void led_remove_brightness_hw_changed(struct led_classdev *led_cdev)
+{
+	sysfs_put(led_cdev->brightness_hw_changed_kn);
+	device_remove_file(led_cdev->dev, &dev_attr_brightness_hw_changed);
+}
+
+void led_classdev_notify_brightness_hw_changed(struct led_classdev *led_cdev,
+					       enum led_brightness brightness)
+{
+	if (WARN_ON(!led_cdev->brightness_hw_changed_kn))
+		return;
+
+	led_cdev->brightness_hw_changed = brightness;
+	sysfs_notify_dirent(led_cdev->brightness_hw_changed_kn);
+}
+EXPORT_SYMBOL_GPL(led_classdev_notify_brightness_hw_changed);
+#else
+static int led_add_brightness_hw_changed(struct led_classdev *led_cdev)
+{
+	return 0;
+}
+static void led_remove_brightness_hw_changed(struct led_classdev *led_cdev)
+{
+}
+#endif
+
 /**
  * led_classdev_suspend - suspend an led_classdev.
  * @led_cdev: the led_classdev to suspend.
@@ -204,9 +266,20 @@ int led_classdev_register(struct device *parent, struct led_classdev *led_cdev)
 		dev_warn(parent, "Led %s renamed to %s due to name collision",
 				led_cdev->name, dev_name(led_cdev->dev));
 
+	if (led_cdev->flags & LED_BRIGHT_HW_CHANGED) {
+		ret = led_add_brightness_hw_changed(led_cdev);
+		if (ret) {
+			device_unregister(led_cdev->dev);
+			return ret;
+		}
+	}
+
 	led_cdev->work_flags = 0;
 #ifdef CONFIG_LEDS_TRIGGERS
 	init_rwsem(&led_cdev->trigger_lock);
+#endif
+#ifdef CONFIG_LEDS_BRIGHTNESS_HW_CHANGED
+	led_cdev->brightness_hw_changed = -1;
 #endif
 	mutex_init(&led_cdev->led_access);
 	/* add to the list of leds */
@@ -256,6 +329,9 @@ void led_classdev_unregister(struct led_classdev *led_cdev)
 
 	flush_work(&led_cdev->set_brightness_work);
 
+	if (led_cdev->flags & LED_BRIGHT_HW_CHANGED)
+		led_remove_brightness_hw_changed(led_cdev);
+
 	device_unregister(led_cdev->dev);
 
 	down_write(&leds_list_lock);
diff --git a/include/linux/leds.h b/include/linux/leds.h
index bb50d0151e75..38c0bd7ca107 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -13,6 +13,7 @@
 #define __LINUX_LEDS_H_INCLUDED
 
 #include <linux/device.h>
+#include <linux/kernfs.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
 #include <linux/rwsem.h>
@@ -47,6 +48,7 @@ struct led_classdev {
 #define LED_DEV_CAP_FLASH	(1 << 18)
 #define LED_HW_PLUGGABLE	(1 << 19)
 #define LED_PANIC_INDICATOR	(1 << 20)
+#define LED_BRIGHT_HW_CHANGED	(1 << 21)
 
 	/* set_brightness_work / blink_timer flags, atomic, private. */
 	unsigned long		work_flags;
@@ -111,6 +113,11 @@ struct led_classdev {
 	bool			activated;
 #endif
 
+#ifdef CONFIG_LEDS_BRIGHTNESS_HW_CHANGED
+	int			 brightness_hw_changed;
+	struct kernfs_node	*brightness_hw_changed_kn;
+#endif
+
 	/* Ensures consistent access to the LED Flash Class device */
 	struct mutex		led_access;
 };
@@ -423,4 +430,12 @@ static inline void ledtrig_cpu(enum cpu_led_event evt)
 }
 #endif
 
+#ifdef CONFIG_LEDS_BRIGHTNESS_HW_CHANGED
+extern void led_classdev_notify_brightness_hw_changed(
+	struct led_classdev *led_cdev, enum led_brightness brightness);
+#else
+static inline void led_classdev_notify_brightness_hw_changed(
+	struct led_classdev *led_cdev, enum led_brightness brightness) { }
+#endif
+
 #endif		/* __LINUX_LEDS_H_INCLUDED */
-- 
cgit v1.2.3


From 40be0dda0725886b623d67868db3219a2e74683b Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <rafal@milecki.pl>
Date: Sat, 28 Jan 2017 15:15:42 +0100
Subject: net: add devm version of alloc_etherdev_mqs function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds devm_alloc_etherdev_mqs function and devm_alloc_etherdev
macro. These can be used for simpler netdev allocation without having to
care about calling free_netdev.

Thanks to this change drivers, their error paths and removal paths may
get simpler by a bit.

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/etherdevice.h |  5 +++++
 net/ethernet/eth.c          | 28 ++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 42add77ae47d..c62b709b1ce0 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -54,6 +54,11 @@ struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
 #define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1)
 #define alloc_etherdev_mq(sizeof_priv, count) alloc_etherdev_mqs(sizeof_priv, count, count)
 
+struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv,
+					   unsigned int txqs,
+					   unsigned int rxqs);
+#define devm_alloc_etherdev(dev, sizeof_priv) devm_alloc_etherdev_mqs(dev, sizeof_priv, 1, 1)
+
 struct sk_buff **eth_gro_receive(struct sk_buff **head,
 				 struct sk_buff *skb);
 int eth_gro_complete(struct sk_buff *skb, int nhoff);
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 8c5a479681ca..efdaaab735fc 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -392,6 +392,34 @@ struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
 }
 EXPORT_SYMBOL(alloc_etherdev_mqs);
 
+static void devm_free_netdev(struct device *dev, void *res)
+{
+	free_netdev(*(struct net_device **)res);
+}
+
+struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv,
+					   unsigned int txqs, unsigned int rxqs)
+{
+	struct net_device **dr;
+	struct net_device *netdev;
+
+	dr = devres_alloc(devm_free_netdev, sizeof(*dr), GFP_KERNEL);
+	if (!dr)
+		return NULL;
+
+	netdev = alloc_etherdev_mqs(sizeof_priv, txqs, rxqs);
+	if (!netdev) {
+		devres_free(dr);
+		return NULL;
+	}
+
+	*dr = netdev;
+	devres_add(dev, dr);
+
+	return netdev;
+}
+EXPORT_SYMBOL(devm_alloc_etherdev_mqs);
+
 ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len)
 {
 	return scnprintf(buf, PAGE_SIZE, "%*phC\n", len, addr);
-- 
cgit v1.2.3


From f067a982cefa8df5642212bb0c7e25974831f781 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 23 Jan 2017 10:11:42 +0530
Subject: PM / OPP: Add 'struct kref' to OPP table

Add kref to struct opp_table for easier accounting of the OPP table.

Note that the new routine dev_pm_opp_get_opp_table() takes the reference
from under the opp_table_lock, which guarantees that the OPP table
doesn't get freed unless dev_pm_opp_put_opp_table() is called for the
OPP table.

Two separate release mechanisms are added: locked and unlocked. In
unlocked version the routines aren't required to take/drop
opp_table_lock as the callers have already done that. This is required
to avoid breaking git bisect, otherwise we may get lockdeps between
commits. Once all the users of OPP table are updated the unlocked
version shall be removed.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/opp/core.c | 51 +++++++++++++++++++++++++++++++++++++++++--
 drivers/base/power/opp/opp.h  |  3 +++
 include/linux/pm_opp.h        | 10 +++++++++
 3 files changed, 62 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c
index dcebd5efb6a1..ccc0d8913fd0 100644
--- a/drivers/base/power/opp/core.c
+++ b/drivers/base/power/opp/core.c
@@ -855,6 +855,7 @@ static struct opp_table *_allocate_opp_table(struct device *dev)
 	srcu_init_notifier_head(&opp_table->srcu_head);
 	INIT_LIST_HEAD(&opp_table->opp_list);
 	mutex_init(&opp_table->lock);
+	kref_init(&opp_table->kref);
 
 	/* Secure the device table modification */
 	list_add_rcu(&opp_table->node, &opp_tables);
@@ -894,8 +895,36 @@ static void _kfree_device_rcu(struct rcu_head *head)
 	kfree_rcu(opp_table, rcu_head);
 }
 
-static void _free_opp_table(struct opp_table *opp_table)
+void _get_opp_table_kref(struct opp_table *opp_table)
 {
+	kref_get(&opp_table->kref);
+}
+
+struct opp_table *dev_pm_opp_get_opp_table(struct device *dev)
+{
+	struct opp_table *opp_table;
+
+	/* Hold our table modification lock here */
+	mutex_lock(&opp_table_lock);
+
+	opp_table = _find_opp_table(dev);
+	if (!IS_ERR(opp_table)) {
+		_get_opp_table_kref(opp_table);
+		goto unlock;
+	}
+
+	opp_table = _allocate_opp_table(dev);
+
+unlock:
+	mutex_unlock(&opp_table_lock);
+
+	return opp_table;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_opp_table);
+
+static void _opp_table_kref_release_unlocked(struct kref *kref)
+{
+	struct opp_table *opp_table = container_of(kref, struct opp_table, kref);
 	struct opp_device *opp_dev;
 
 	/* Release clk */
@@ -916,6 +945,24 @@ static void _free_opp_table(struct opp_table *opp_table)
 		  _kfree_device_rcu);
 }
 
+static void dev_pm_opp_put_opp_table_unlocked(struct opp_table *opp_table)
+{
+	kref_put(&opp_table->kref, _opp_table_kref_release_unlocked);
+}
+
+static void _opp_table_kref_release(struct kref *kref)
+{
+	_opp_table_kref_release_unlocked(kref);
+	mutex_unlock(&opp_table_lock);
+}
+
+void dev_pm_opp_put_opp_table(struct opp_table *opp_table)
+{
+	kref_put_mutex(&opp_table->kref, _opp_table_kref_release,
+		       &opp_table_lock);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_put_opp_table);
+
 /**
  * _remove_opp_table() - Removes a OPP table
  * @opp_table: OPP table to be removed.
@@ -939,7 +986,7 @@ static void _remove_opp_table(struct opp_table *opp_table)
 	if (opp_table->set_opp)
 		return;
 
-	_free_opp_table(opp_table);
+	dev_pm_opp_put_opp_table_unlocked(opp_table);
 }
 
 void _opp_free(struct dev_pm_opp *opp)
diff --git a/drivers/base/power/opp/opp.h b/drivers/base/power/opp/opp.h
index 105243b06373..aae4d8f480ef 100644
--- a/drivers/base/power/opp/opp.h
+++ b/drivers/base/power/opp/opp.h
@@ -131,6 +131,7 @@ enum opp_table_access {
  * @rcu_head:	RCU callback head used for deferred freeing
  * @dev_list:	list of devices that share these OPPs
  * @opp_list:	table of opps
+ * @kref:	for reference count of the table.
  * @lock:	mutex protecting the opp_list.
  * @np:		struct device_node pointer for opp's DT node.
  * @clock_latency_ns_max: Max clock latency in nanoseconds.
@@ -164,6 +165,7 @@ struct opp_table {
 	struct rcu_head rcu_head;
 	struct list_head dev_list;
 	struct list_head opp_list;
+	struct kref kref;
 	struct mutex lock;
 
 	struct device_node *np;
@@ -192,6 +194,7 @@ struct opp_table {
 };
 
 /* Routines internal to opp core */
+void _get_opp_table_kref(struct opp_table *opp_table);
 struct opp_table *_find_opp_table(struct device *dev);
 struct opp_table *_add_opp_table(struct device *dev);
 struct opp_device *_add_opp_dev(const struct device *dev, struct opp_table *opp_table);
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 66a02deeb03f..d867c6b25f9a 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -78,6 +78,9 @@ struct dev_pm_set_opp_data {
 
 #if defined(CONFIG_PM_OPP)
 
+struct opp_table *dev_pm_opp_get_opp_table(struct device *dev);
+void dev_pm_opp_put_opp_table(struct opp_table *opp_table);
+
 unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp);
 
 unsigned long dev_pm_opp_get_freq(struct dev_pm_opp *opp);
@@ -126,6 +129,13 @@ int dev_pm_opp_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask)
 void dev_pm_opp_remove_table(struct device *dev);
 void dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask);
 #else
+static inline struct opp_table *dev_pm_opp_get_opp_table(struct device *dev)
+{
+	return ERR_PTR(-ENOTSUPP);
+}
+
+static inline void dev_pm_opp_put_opp_table(struct opp_table *opp_table) {}
+
 static inline unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp)
 {
 	return 0;
-- 
cgit v1.2.3


From fa30184d192ec78d443cf6d3abc37d9eb3b9253e Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 23 Jan 2017 10:11:43 +0530
Subject: PM / OPP: Return opp_table from dev_pm_opp_set_*() routines

Now that we have proper kernel reference infrastructure in place for OPP
tables, use it to guarantee that the OPP table isn't freed while being
used by the callers of dev_pm_opp_set_*() APIs.

Make them all return the pointer to the OPP table after taking its
reference and put the reference back with dev_pm_opp_put_*() APIs.

Now that the OPP table wouldn't get freed while these routines are
executing after dev_pm_opp_get_opp_table() is called, there is no need
to take opp_table_lock. Drop them as well.

Remove the rcu specific comments from these routines as they aren't
relevant anymore.

Note that prototypes of dev_pm_opp_{set|put}_regulators() were already
updated by another patch.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/opp/core.c | 243 +++++++++---------------------------------
 drivers/cpufreq/sti-cpufreq.c |  13 +--
 include/linux/pm_opp.h        |  35 +++---
 3 files changed, 74 insertions(+), 217 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c
index ccc0d8913fd0..1af349ab630e 100644
--- a/drivers/base/power/opp/core.c
+++ b/drivers/base/power/opp/core.c
@@ -974,18 +974,6 @@ static void _remove_opp_table(struct opp_table *opp_table)
 	if (!list_empty(&opp_table->opp_list))
 		return;
 
-	if (opp_table->supported_hw)
-		return;
-
-	if (opp_table->prop_name)
-		return;
-
-	if (opp_table->regulators)
-		return;
-
-	if (opp_table->set_opp)
-		return;
-
 	dev_pm_opp_put_opp_table_unlocked(opp_table);
 }
 
@@ -1277,27 +1265,16 @@ free_opp:
  * specify the hierarchy of versions it supports. OPP layer will then enable
  * OPPs, which are available for those versions, based on its 'opp-supported-hw'
  * property.
- *
- * Locking: The internal opp_table and opp structures are RCU protected.
- * Hence this function internally uses RCU updater strategy with mutex locks
- * to keep the integrity of the internal data structures. Callers should ensure
- * that this function is *NOT* called under RCU protection or in contexts where
- * mutex cannot be locked.
  */
-int dev_pm_opp_set_supported_hw(struct device *dev, const u32 *versions,
-				unsigned int count)
+struct opp_table *dev_pm_opp_set_supported_hw(struct device *dev,
+			const u32 *versions, unsigned int count)
 {
 	struct opp_table *opp_table;
-	int ret = 0;
-
-	/* Hold our table modification lock here */
-	mutex_lock(&opp_table_lock);
+	int ret;
 
-	opp_table = _add_opp_table(dev);
-	if (!opp_table) {
-		ret = -ENOMEM;
-		goto unlock;
-	}
+	opp_table = dev_pm_opp_get_opp_table(dev);
+	if (!opp_table)
+		return ERR_PTR(-ENOMEM);
 
 	/* Make sure there are no concurrent readers while updating opp_table */
 	WARN_ON(!list_empty(&opp_table->opp_list));
@@ -1318,65 +1295,40 @@ int dev_pm_opp_set_supported_hw(struct device *dev, const u32 *versions,
 	}
 
 	opp_table->supported_hw_count = count;
-	mutex_unlock(&opp_table_lock);
-	return 0;
+
+	return opp_table;
 
 err:
-	_remove_opp_table(opp_table);
-unlock:
-	mutex_unlock(&opp_table_lock);
+	dev_pm_opp_put_opp_table(opp_table);
 
-	return ret;
+	return ERR_PTR(ret);
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_set_supported_hw);
 
 /**
  * dev_pm_opp_put_supported_hw() - Releases resources blocked for supported hw
- * @dev: Device for which supported-hw has to be put.
+ * @opp_table: OPP table returned by dev_pm_opp_set_supported_hw().
  *
  * This is required only for the V2 bindings, and is called for a matching
  * dev_pm_opp_set_supported_hw(). Until this is called, the opp_table structure
  * will not be freed.
- *
- * Locking: The internal opp_table and opp structures are RCU protected.
- * Hence this function internally uses RCU updater strategy with mutex locks
- * to keep the integrity of the internal data structures. Callers should ensure
- * that this function is *NOT* called under RCU protection or in contexts where
- * mutex cannot be locked.
  */
-void dev_pm_opp_put_supported_hw(struct device *dev)
+void dev_pm_opp_put_supported_hw(struct opp_table *opp_table)
 {
-	struct opp_table *opp_table;
-
-	/* Hold our table modification lock here */
-	mutex_lock(&opp_table_lock);
-
-	/* Check for existing table for 'dev' first */
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table)) {
-		dev_err(dev, "Failed to find opp_table: %ld\n",
-			PTR_ERR(opp_table));
-		goto unlock;
-	}
-
 	/* Make sure there are no concurrent readers while updating opp_table */
 	WARN_ON(!list_empty(&opp_table->opp_list));
 
 	if (!opp_table->supported_hw) {
-		dev_err(dev, "%s: Doesn't have supported hardware list\n",
-			__func__);
-		goto unlock;
+		pr_err("%s: Doesn't have supported hardware list\n",
+		       __func__);
+		return;
 	}
 
 	kfree(opp_table->supported_hw);
 	opp_table->supported_hw = NULL;
 	opp_table->supported_hw_count = 0;
 
-	/* Try freeing opp_table if this was the last blocking resource */
-	_remove_opp_table(opp_table);
-
-unlock:
-	mutex_unlock(&opp_table_lock);
+	dev_pm_opp_put_opp_table(opp_table);
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_put_supported_hw);
 
@@ -1389,26 +1341,15 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_put_supported_hw);
  * specify the extn to be used for certain property names. The properties to
  * which the extension will apply are opp-microvolt and opp-microamp. OPP core
  * should postfix the property name with -<name> while looking for them.
- *
- * Locking: The internal opp_table and opp structures are RCU protected.
- * Hence this function internally uses RCU updater strategy with mutex locks
- * to keep the integrity of the internal data structures. Callers should ensure
- * that this function is *NOT* called under RCU protection or in contexts where
- * mutex cannot be locked.
  */
-int dev_pm_opp_set_prop_name(struct device *dev, const char *name)
+struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name)
 {
 	struct opp_table *opp_table;
-	int ret = 0;
-
-	/* Hold our table modification lock here */
-	mutex_lock(&opp_table_lock);
+	int ret;
 
-	opp_table = _add_opp_table(dev);
-	if (!opp_table) {
-		ret = -ENOMEM;
-		goto unlock;
-	}
+	opp_table = dev_pm_opp_get_opp_table(dev);
+	if (!opp_table)
+		return ERR_PTR(-ENOMEM);
 
 	/* Make sure there are no concurrent readers while updating opp_table */
 	WARN_ON(!list_empty(&opp_table->opp_list));
@@ -1427,63 +1368,37 @@ int dev_pm_opp_set_prop_name(struct device *dev, const char *name)
 		goto err;
 	}
 
-	mutex_unlock(&opp_table_lock);
-	return 0;
+	return opp_table;
 
 err:
-	_remove_opp_table(opp_table);
-unlock:
-	mutex_unlock(&opp_table_lock);
+	dev_pm_opp_put_opp_table(opp_table);
 
-	return ret;
+	return ERR_PTR(ret);
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_set_prop_name);
 
 /**
  * dev_pm_opp_put_prop_name() - Releases resources blocked for prop-name
- * @dev: Device for which the prop-name has to be put.
+ * @opp_table: OPP table returned by dev_pm_opp_set_prop_name().
  *
  * This is required only for the V2 bindings, and is called for a matching
  * dev_pm_opp_set_prop_name(). Until this is called, the opp_table structure
  * will not be freed.
- *
- * Locking: The internal opp_table and opp structures are RCU protected.
- * Hence this function internally uses RCU updater strategy with mutex locks
- * to keep the integrity of the internal data structures. Callers should ensure
- * that this function is *NOT* called under RCU protection or in contexts where
- * mutex cannot be locked.
  */
-void dev_pm_opp_put_prop_name(struct device *dev)
+void dev_pm_opp_put_prop_name(struct opp_table *opp_table)
 {
-	struct opp_table *opp_table;
-
-	/* Hold our table modification lock here */
-	mutex_lock(&opp_table_lock);
-
-	/* Check for existing table for 'dev' first */
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table)) {
-		dev_err(dev, "Failed to find opp_table: %ld\n",
-			PTR_ERR(opp_table));
-		goto unlock;
-	}
-
 	/* Make sure there are no concurrent readers while updating opp_table */
 	WARN_ON(!list_empty(&opp_table->opp_list));
 
 	if (!opp_table->prop_name) {
-		dev_err(dev, "%s: Doesn't have a prop-name\n", __func__);
-		goto unlock;
+		pr_err("%s: Doesn't have a prop-name\n", __func__);
+		return;
 	}
 
 	kfree(opp_table->prop_name);
 	opp_table->prop_name = NULL;
 
-	/* Try freeing opp_table if this was the last blocking resource */
-	_remove_opp_table(opp_table);
-
-unlock:
-	mutex_unlock(&opp_table_lock);
+	dev_pm_opp_put_opp_table(opp_table);
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name);
 
@@ -1530,12 +1445,6 @@ static void _free_set_opp_data(struct opp_table *opp_table)
  * well.
  *
  * This must be called before any OPPs are initialized for the device.
- *
- * Locking: The internal opp_table and opp structures are RCU protected.
- * Hence this function internally uses RCU updater strategy with mutex locks
- * to keep the integrity of the internal data structures. Callers should ensure
- * that this function is *NOT* called under RCU protection or in contexts where
- * mutex cannot be locked.
  */
 struct opp_table *dev_pm_opp_set_regulators(struct device *dev,
 					    const char * const names[],
@@ -1545,13 +1454,9 @@ struct opp_table *dev_pm_opp_set_regulators(struct device *dev,
 	struct regulator *reg;
 	int ret, i;
 
-	mutex_lock(&opp_table_lock);
-
-	opp_table = _add_opp_table(dev);
-	if (!opp_table) {
-		ret = -ENOMEM;
-		goto unlock;
-	}
+	opp_table = dev_pm_opp_get_opp_table(dev);
+	if (!opp_table)
+		return ERR_PTR(-ENOMEM);
 
 	/* This should be called before OPPs are initialized */
 	if (WARN_ON(!list_empty(&opp_table->opp_list))) {
@@ -1593,7 +1498,6 @@ struct opp_table *dev_pm_opp_set_regulators(struct device *dev,
 	if (ret)
 		goto free_regulators;
 
-	mutex_unlock(&opp_table_lock);
 	return opp_table;
 
 free_regulators:
@@ -1604,9 +1508,7 @@ free_regulators:
 	opp_table->regulators = NULL;
 	opp_table->regulator_count = 0;
 err:
-	_remove_opp_table(opp_table);
-unlock:
-	mutex_unlock(&opp_table_lock);
+	dev_pm_opp_put_opp_table(opp_table);
 
 	return ERR_PTR(ret);
 }
@@ -1615,22 +1517,14 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulators);
 /**
  * dev_pm_opp_put_regulators() - Releases resources blocked for regulator
  * @opp_table: OPP table returned from dev_pm_opp_set_regulators().
- *
- * Locking: The internal opp_table and opp structures are RCU protected.
- * Hence this function internally uses RCU updater strategy with mutex locks
- * to keep the integrity of the internal data structures. Callers should ensure
- * that this function is *NOT* called under RCU protection or in contexts where
- * mutex cannot be locked.
  */
 void dev_pm_opp_put_regulators(struct opp_table *opp_table)
 {
 	int i;
 
-	mutex_lock(&opp_table_lock);
-
 	if (!opp_table->regulators) {
 		pr_err("%s: Doesn't have regulators set\n", __func__);
-		goto unlock;
+		return;
 	}
 
 	/* Make sure there are no concurrent readers while updating opp_table */
@@ -1645,11 +1539,7 @@ void dev_pm_opp_put_regulators(struct opp_table *opp_table)
 	opp_table->regulators = NULL;
 	opp_table->regulator_count = 0;
 
-	/* Try freeing opp_table if this was the last blocking resource */
-	_remove_opp_table(opp_table);
-
-unlock:
-	mutex_unlock(&opp_table_lock);
+	dev_pm_opp_put_opp_table(opp_table);
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_put_regulators);
 
@@ -1662,29 +1552,19 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_put_regulators);
  * regulators per device), instead of the generic OPP set rate helper.
  *
  * This must be called before any OPPs are initialized for the device.
- *
- * Locking: The internal opp_table and opp structures are RCU protected.
- * Hence this function internally uses RCU updater strategy with mutex locks
- * to keep the integrity of the internal data structures. Callers should ensure
- * that this function is *NOT* called under RCU protection or in contexts where
- * mutex cannot be locked.
  */
-int dev_pm_opp_register_set_opp_helper(struct device *dev,
+struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev,
 			int (*set_opp)(struct dev_pm_set_opp_data *data))
 {
 	struct opp_table *opp_table;
 	int ret;
 
 	if (!set_opp)
-		return -EINVAL;
-
-	mutex_lock(&opp_table_lock);
+		return ERR_PTR(-EINVAL);
 
-	opp_table = _add_opp_table(dev);
-	if (!opp_table) {
-		ret = -ENOMEM;
-		goto unlock;
-	}
+	opp_table = dev_pm_opp_get_opp_table(dev);
+	if (!opp_table)
+		return ERR_PTR(-ENOMEM);
 
 	/* This should be called before OPPs are initialized */
 	if (WARN_ON(!list_empty(&opp_table->opp_list))) {
@@ -1700,47 +1580,28 @@ int dev_pm_opp_register_set_opp_helper(struct device *dev,
 
 	opp_table->set_opp = set_opp;
 
-	mutex_unlock(&opp_table_lock);
-	return 0;
+	return opp_table;
 
 err:
-	_remove_opp_table(opp_table);
-unlock:
-	mutex_unlock(&opp_table_lock);
+	dev_pm_opp_put_opp_table(opp_table);
 
-	return ret;
+	return ERR_PTR(ret);
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_register_set_opp_helper);
 
 /**
  * dev_pm_opp_register_put_opp_helper() - Releases resources blocked for
  *					   set_opp helper
- * @dev: Device for which custom set_opp helper has to be cleared.
+ * @opp_table: OPP table returned from dev_pm_opp_register_set_opp_helper().
  *
- * Locking: The internal opp_table and opp structures are RCU protected.
- * Hence this function internally uses RCU updater strategy with mutex locks
- * to keep the integrity of the internal data structures. Callers should ensure
- * that this function is *NOT* called under RCU protection or in contexts where
- * mutex cannot be locked.
+ * Release resources blocked for platform specific set_opp helper.
  */
-void dev_pm_opp_register_put_opp_helper(struct device *dev)
+void dev_pm_opp_register_put_opp_helper(struct opp_table *opp_table)
 {
-	struct opp_table *opp_table;
-
-	mutex_lock(&opp_table_lock);
-
-	/* Check for existing table for 'dev' first */
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table)) {
-		dev_err(dev, "Failed to find opp_table: %ld\n",
-			PTR_ERR(opp_table));
-		goto unlock;
-	}
-
 	if (!opp_table->set_opp) {
-		dev_err(dev, "%s: Doesn't have custom set_opp helper set\n",
-			__func__);
-		goto unlock;
+		pr_err("%s: Doesn't have custom set_opp helper set\n",
+		       __func__);
+		return;
 	}
 
 	/* Make sure there are no concurrent readers while updating opp_table */
@@ -1748,11 +1609,7 @@ void dev_pm_opp_register_put_opp_helper(struct device *dev)
 
 	opp_table->set_opp = NULL;
 
-	/* Try freeing opp_table if this was the last blocking resource */
-	_remove_opp_table(opp_table);
-
-unlock:
-	mutex_unlock(&opp_table_lock);
+	dev_pm_opp_put_opp_table(opp_table);
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_register_put_opp_helper);
 
diff --git a/drivers/cpufreq/sti-cpufreq.c b/drivers/cpufreq/sti-cpufreq.c
index b366e6d830ea..a7db9011d5fe 100644
--- a/drivers/cpufreq/sti-cpufreq.c
+++ b/drivers/cpufreq/sti-cpufreq.c
@@ -160,6 +160,7 @@ static int sti_cpufreq_set_opp_info(void)
 	int pcode, substrate, major, minor;
 	int ret;
 	char name[MAX_PCODE_NAME_LEN];
+	struct opp_table *opp_table;
 
 	reg_fields = sti_cpufreq_match();
 	if (!reg_fields) {
@@ -211,20 +212,20 @@ use_defaults:
 
 	snprintf(name, MAX_PCODE_NAME_LEN, "pcode%d", pcode);
 
-	ret = dev_pm_opp_set_prop_name(dev, name);
-	if (ret) {
+	opp_table = dev_pm_opp_set_prop_name(dev, name);
+	if (IS_ERR(opp_table)) {
 		dev_err(dev, "Failed to set prop name\n");
-		return ret;
+		return PTR_ERR(opp_table);
 	}
 
 	version[0] = BIT(major);
 	version[1] = BIT(minor);
 	version[2] = BIT(substrate);
 
-	ret = dev_pm_opp_set_supported_hw(dev, version, VERSION_ELEMENTS);
-	if (ret) {
+	opp_table = dev_pm_opp_set_supported_hw(dev, version, VERSION_ELEMENTS);
+	if (IS_ERR(opp_table)) {
 		dev_err(dev, "Failed to set supported hardware\n");
-		return ret;
+		return PTR_ERR(opp_table);
 	}
 
 	dev_dbg(dev, "pcode: %d major: %d minor: %d substrate: %d\n",
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index d867c6b25f9a..99787cbcaab2 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -114,15 +114,14 @@ int dev_pm_opp_disable(struct device *dev, unsigned long freq);
 int dev_pm_opp_register_notifier(struct device *dev, struct notifier_block *nb);
 int dev_pm_opp_unregister_notifier(struct device *dev, struct notifier_block *nb);
 
-int dev_pm_opp_set_supported_hw(struct device *dev, const u32 *versions,
-				unsigned int count);
-void dev_pm_opp_put_supported_hw(struct device *dev);
-int dev_pm_opp_set_prop_name(struct device *dev, const char *name);
-void dev_pm_opp_put_prop_name(struct device *dev);
+struct opp_table *dev_pm_opp_set_supported_hw(struct device *dev, const u32 *versions, unsigned int count);
+void dev_pm_opp_put_supported_hw(struct opp_table *opp_table);
+struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name);
+void dev_pm_opp_put_prop_name(struct opp_table *opp_table);
 struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[], unsigned int count);
 void dev_pm_opp_put_regulators(struct opp_table *opp_table);
-int dev_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
-void dev_pm_opp_register_put_opp_helper(struct device *dev);
+struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
+void dev_pm_opp_register_put_opp_helper(struct opp_table *opp_table);
 int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq);
 int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, const struct cpumask *cpumask);
 int dev_pm_opp_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask);
@@ -224,29 +223,29 @@ static inline int dev_pm_opp_unregister_notifier(struct device *dev, struct noti
 	return -ENOTSUPP;
 }
 
-static inline int dev_pm_opp_set_supported_hw(struct device *dev,
-					      const u32 *versions,
-					      unsigned int count)
+static inline struct opp_table *dev_pm_opp_set_supported_hw(struct device *dev,
+							    const u32 *versions,
+							    unsigned int count)
 {
-	return -ENOTSUPP;
+	return ERR_PTR(-ENOTSUPP);
 }
 
-static inline void dev_pm_opp_put_supported_hw(struct device *dev) {}
+static inline void dev_pm_opp_put_supported_hw(struct opp_table *opp_table) {}
 
-static inline int dev_pm_opp_register_set_opp_helper(struct device *dev,
+static inline struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev,
 			int (*set_opp)(struct dev_pm_set_opp_data *data))
 {
-	return -ENOTSUPP;
+	return ERR_PTR(-ENOTSUPP);
 }
 
-static inline void dev_pm_opp_register_put_opp_helper(struct device *dev) {}
+static inline void dev_pm_opp_register_put_opp_helper(struct opp_table *opp_table) {}
 
-static inline int dev_pm_opp_set_prop_name(struct device *dev, const char *name)
+static inline struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name)
 {
-	return -ENOTSUPP;
+	return ERR_PTR(-ENOTSUPP);
 }
 
-static inline void dev_pm_opp_put_prop_name(struct device *dev) {}
+static inline void dev_pm_opp_put_prop_name(struct opp_table *opp_table) {}
 
 static inline struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[], unsigned int count)
 {
-- 
cgit v1.2.3


From 7034764a1e4a6edbb60914e89aad8384e3fe5d17 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 23 Jan 2017 10:11:46 +0530
Subject: PM / OPP: Add 'struct kref' to struct dev_pm_opp

Add kref to struct dev_pm_opp for easier accounting of the OPPs.

Note that the OPPs are freed under the opp_table->lock mutex only.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/opp/core.c | 27 ++++++++++++---------------
 drivers/base/power/opp/opp.h  |  3 +++
 include/linux/pm_opp.h        |  3 +++
 3 files changed, 18 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c
index ab9499e3ba02..9870ee54d708 100644
--- a/drivers/base/power/opp/core.c
+++ b/drivers/base/power/opp/core.c
@@ -949,20 +949,10 @@ static void _kfree_opp_rcu(struct rcu_head *head)
 	kfree_rcu(opp, rcu_head);
 }
 
-/**
- * _opp_remove()  - Remove an OPP from a table definition
- * @opp_table:	points back to the opp_table struct this opp belongs to
- * @opp:	pointer to the OPP to remove
- *
- * This function removes an opp definition from the opp table.
- *
- * Locking: The internal opp_table and opp structures are RCU protected.
- * It is assumed that the caller holds required mutex for an RCU updater
- * strategy.
- */
-static void _opp_remove(struct opp_table *opp_table, struct dev_pm_opp *opp)
+static void _opp_kref_release(struct kref *kref)
 {
-	mutex_lock(&opp_table->lock);
+	struct dev_pm_opp *opp = container_of(kref, struct dev_pm_opp, kref);
+	struct opp_table *opp_table = opp->opp_table;
 
 	/*
 	 * Notify the changes in the availability of the operable
@@ -977,6 +967,12 @@ static void _opp_remove(struct opp_table *opp_table, struct dev_pm_opp *opp)
 	dev_pm_opp_put_opp_table(opp_table);
 }
 
+void dev_pm_opp_put(struct dev_pm_opp *opp)
+{
+	kref_put_mutex(&opp->kref, _opp_kref_release, &opp->opp_table->lock);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_put);
+
 /**
  * dev_pm_opp_remove()  - Remove an OPP from OPP table
  * @dev:	device for which we do this operation
@@ -1020,7 +1016,7 @@ void dev_pm_opp_remove(struct device *dev, unsigned long freq)
 		goto unlock;
 	}
 
-	_opp_remove(opp_table, opp);
+	dev_pm_opp_put(opp);
 unlock:
 	mutex_unlock(&opp_table_lock);
 }
@@ -1124,6 +1120,7 @@ int _opp_add(struct device *dev, struct dev_pm_opp *new_opp,
 	mutex_unlock(&opp_table->lock);
 
 	new_opp->opp_table = opp_table;
+	kref_init(&new_opp->kref);
 
 	/* Get a reference to the OPP table */
 	_get_opp_table_kref(opp_table);
@@ -1819,7 +1816,7 @@ void _dev_pm_opp_remove_table(struct opp_table *opp_table, struct device *dev,
 		/* Free static OPPs */
 		list_for_each_entry_safe(opp, tmp, &opp_table->opp_list, node) {
 			if (remove_all || !opp->dynamic)
-				_opp_remove(opp_table, opp);
+				dev_pm_opp_put(opp);
 		}
 	} else {
 		_remove_opp_dev(_find_opp_dev(dev, opp_table), opp_table);
diff --git a/drivers/base/power/opp/opp.h b/drivers/base/power/opp/opp.h
index a01724363347..586f36f94e28 100644
--- a/drivers/base/power/opp/opp.h
+++ b/drivers/base/power/opp/opp.h
@@ -16,6 +16,7 @@
 
 #include <linux/device.h>
 #include <linux/kernel.h>
+#include <linux/kref.h>
 #include <linux/list.h>
 #include <linux/limits.h>
 #include <linux/pm_opp.h>
@@ -56,6 +57,7 @@ extern struct list_head opp_tables;
  *		are protected by the opp_table_lock for integrity.
  *		IMPORTANT: the opp nodes should be maintained in increasing
  *		order.
+ * @kref:	for reference count of the OPP.
  * @available:	true/false - marks if this OPP as available or not
  * @dynamic:	not-created from static DT entries.
  * @turbo:	true if turbo (boost) OPP
@@ -73,6 +75,7 @@ extern struct list_head opp_tables;
  */
 struct dev_pm_opp {
 	struct list_head node;
+	struct kref kref;
 
 	bool available;
 	bool dynamic;
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 99787cbcaab2..731d548657aa 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -102,6 +102,7 @@ struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev,
 
 struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev,
 					     unsigned long *freq);
+void dev_pm_opp_put(struct dev_pm_opp *opp);
 
 int dev_pm_opp_add(struct device *dev, unsigned long freq,
 		   unsigned long u_volt);
@@ -193,6 +194,8 @@ static inline struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev,
 	return ERR_PTR(-ENOTSUPP);
 }
 
+static inline void dev_pm_opp_put(struct dev_pm_opp *opp) {}
+
 static inline int dev_pm_opp_add(struct device *dev, unsigned long freq,
 					unsigned long u_volt)
 {
-- 
cgit v1.2.3


From b526a314263ea217b8fa9758dca5dc245fd49997 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Sun, 22 Jan 2017 17:14:08 +0100
Subject: pwm: Try to load modules during pwm_get()

Add a module name string to the pwm_lookup struct and if specified try
to load the module using request_module() if pwmchip_find_by_name() is
unable to find the PWM chip.

This is a last resort to work around drivers that can't - and can't be
made to - deal with deferred probe.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
[thierry.reding@gmail.com: rename new macro, reword commit message]
[thierry.reding@gmail.com: add comment explaining use-case]
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 drivers/pwm/core.c  | 14 ++++++++++++++
 include/linux/pwm.h | 23 +++++++++++++++--------
 2 files changed, 29 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index 799c4fb4cc2f..a0860b30bd93 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -765,6 +765,7 @@ struct pwm_device *pwm_get(struct device *dev, const char *con_id)
 	unsigned int best = 0;
 	struct pwm_lookup *p, *chosen = NULL;
 	unsigned int match;
+	int err;
 
 	/* look up via DT first */
 	if (IS_ENABLED(CONFIG_OF) && dev && dev->of_node)
@@ -825,6 +826,19 @@ struct pwm_device *pwm_get(struct device *dev, const char *con_id)
 		return ERR_PTR(-ENODEV);
 
 	chip = pwmchip_find_by_name(chosen->provider);
+
+	/*
+	 * If the lookup entry specifies a module, load the module and retry
+	 * the PWM chip lookup. This can be used to work around driver load
+	 * ordering issues if driver's can't be made to properly support the
+	 * deferred probe mechanism.
+	 */
+	if (!chip && chosen->module) {
+		err = request_module(chosen->module);
+		if (err == 0)
+			chip = pwmchip_find_by_name(chosen->provider);
+	}
+
 	if (!chip)
 		return ERR_PTR(-EPROBE_DEFER);
 
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index eae215ef1b2c..08fad7c6a471 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -603,18 +603,25 @@ struct pwm_lookup {
 	const char *con_id;
 	unsigned int period;
 	enum pwm_polarity polarity;
+	const char *module; /* optional, may be NULL */
 };
 
-#define PWM_LOOKUP(_provider, _index, _dev_id, _con_id, _period, _polarity) \
-	{						\
-		.provider = _provider,			\
-		.index = _index,			\
-		.dev_id = _dev_id,			\
-		.con_id = _con_id,			\
-		.period = _period,			\
-		.polarity = _polarity			\
+#define PWM_LOOKUP_WITH_MODULE(_provider, _index, _dev_id, _con_id,	\
+			       _period, _polarity, _module)		\
+	{								\
+		.provider = _provider,					\
+		.index = _index,					\
+		.dev_id = _dev_id,					\
+		.con_id = _con_id,					\
+		.period = _period,					\
+		.polarity = _polarity,					\
+		.module = _module,					\
 	}
 
+#define PWM_LOOKUP(_provider, _index, _dev_id, _con_id, _period, _polarity) \
+	PWM_LOOKUP_WITH_MODULE(_provider, _index, _dev_id, _con_id, _period, \
+			       _polarity, NULL)
+
 #if IS_ENABLED(CONFIG_PWM)
 void pwm_add_table(struct pwm_lookup *table, size_t num);
 void pwm_remove_table(struct pwm_lookup *table, size_t num);
-- 
cgit v1.2.3


From 61babe943938bb41fcd7e2b1dec7d1537ea3892f Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 21 Nov 2016 18:32:08 -0800
Subject: mtd: nand: fix nand.h kernel-doc warnings

Fix kernel-doc warnings in <linux/mtd/nand.h>:

..//include/linux/mtd/nand.h:658: warning: No description found for parameter 'tCEH_min'
..//include/linux/mtd/nand.h:877: warning: No description found for parameter 'data_interface'

Fixes: eee64b700e26 ("mtd: nand: Introduce nand_data_interface")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
---
 include/linux/mtd/nand.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index c5f3a012ae62..f67915c7726f 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -615,7 +615,7 @@ struct nand_buffers {
  * @tALS_min: ALE setup time
  * @tAR_min: ALE to RE# delay
  * @tCEA_max: CE# access time
- * @tCEH_min:
+ * @tCEH_min: CE# high hold time
  * @tCH_min:  CE# hold time
  * @tCHZ_max: CE# high to output hi-Z
  * @tCLH_min: CLE hold time
@@ -801,6 +801,7 @@ nand_get_sdr_timings(const struct nand_data_interface *conf)
  *			supported, 0 otherwise.
  * @jedec_params:	[INTERN] holds the JEDEC parameter page when JEDEC is
  *			supported, 0 otherwise.
+ * @data_interface:	[INTERN] NAND interface timing information
  * @read_retries:	[INTERN] the number of read retry modes supported
  * @onfi_set_features:	[REPLACEABLE] set the features for ONFI nand
  * @onfi_get_features:	[REPLACEABLE] get the features for ONFI nand
-- 
cgit v1.2.3


From 4404d7d821c33ac8105f1d52deb60f736d7c6a06 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Sun, 18 Dec 2016 12:34:55 +0100
Subject: mtd: nand: fsmc: remove stale non-DT probe path

The FSMC driver has an execution path and a header file in
<linux/mtd/fsmc.h> that serves to support passing in platform
data through board files, albeit no upstream users of this
mechanism exist.

The header file also contains function headers for functions that
do not exist in the kernel.

Delete this and move the platform data struct, parsing and
handling into the driver, assume we are using OF and make the
driver depend on OF, remove the ifdefs making that optional.

Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Stefan Roese <sr@denx.de>
Cc: Vipin Kumar <vipin.kumar@st.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Stefan Roese <sr@denx.de>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
---
 drivers/mtd/nand/Kconfig     |   1 +
 drivers/mtd/nand/fsmc_nand.c | 153 ++++++++++++++++++++++++++++++++++++------
 include/linux/mtd/fsmc.h     | 156 -------------------------------------------
 3 files changed, 133 insertions(+), 177 deletions(-)
 delete mode 100644 include/linux/mtd/fsmc.h

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig
index 9ce5dcb4abd0..c6585454f102 100644
--- a/drivers/mtd/nand/Kconfig
+++ b/drivers/mtd/nand/Kconfig
@@ -535,6 +535,7 @@ config MTD_NAND_JZ4780
 
 config MTD_NAND_FSMC
 	tristate "Support for NAND on ST Micros FSMC"
+	depends on OF
 	depends on PLAT_SPEAR || ARCH_NOMADIK || ARCH_U8500 || MACH_U300
 	help
 	  Enables support for NAND Flash chips on the ST Microelectronics
diff --git a/drivers/mtd/nand/fsmc_nand.c b/drivers/mtd/nand/fsmc_nand.c
index 4924b43977ef..bda1e4667138 100644
--- a/drivers/mtd/nand/fsmc_nand.c
+++ b/drivers/mtd/nand/fsmc_nand.c
@@ -35,10 +35,133 @@
 #include <linux/mtd/partitions.h>
 #include <linux/io.h>
 #include <linux/slab.h>
-#include <linux/mtd/fsmc.h>
 #include <linux/amba/bus.h>
 #include <mtd/mtd-abi.h>
 
+#define FSMC_NAND_BW8		1
+#define FSMC_NAND_BW16		2
+
+#define FSMC_MAX_NOR_BANKS	4
+#define FSMC_MAX_NAND_BANKS	4
+
+#define FSMC_FLASH_WIDTH8	1
+#define FSMC_FLASH_WIDTH16	2
+
+/* fsmc controller registers for NOR flash */
+#define CTRL			0x0
+	/* ctrl register definitions */
+	#define BANK_ENABLE		(1 << 0)
+	#define MUXED			(1 << 1)
+	#define NOR_DEV			(2 << 2)
+	#define WIDTH_8			(0 << 4)
+	#define WIDTH_16		(1 << 4)
+	#define RSTPWRDWN		(1 << 6)
+	#define WPROT			(1 << 7)
+	#define WRT_ENABLE		(1 << 12)
+	#define WAIT_ENB		(1 << 13)
+
+#define CTRL_TIM		0x4
+	/* ctrl_tim register definitions */
+
+#define FSMC_NOR_BANK_SZ	0x8
+#define FSMC_NOR_REG_SIZE	0x40
+
+#define FSMC_NOR_REG(base, bank, reg)		(base + \
+						FSMC_NOR_BANK_SZ * (bank) + \
+						reg)
+
+/* fsmc controller registers for NAND flash */
+#define PC			0x00
+	/* pc register definitions */
+	#define FSMC_RESET		(1 << 0)
+	#define FSMC_WAITON		(1 << 1)
+	#define FSMC_ENABLE		(1 << 2)
+	#define FSMC_DEVTYPE_NAND	(1 << 3)
+	#define FSMC_DEVWID_8		(0 << 4)
+	#define FSMC_DEVWID_16		(1 << 4)
+	#define FSMC_ECCEN		(1 << 6)
+	#define FSMC_ECCPLEN_512	(0 << 7)
+	#define FSMC_ECCPLEN_256	(1 << 7)
+	#define FSMC_TCLR_1		(1)
+	#define FSMC_TCLR_SHIFT		(9)
+	#define FSMC_TCLR_MASK		(0xF)
+	#define FSMC_TAR_1		(1)
+	#define FSMC_TAR_SHIFT		(13)
+	#define FSMC_TAR_MASK		(0xF)
+#define STS			0x04
+	/* sts register definitions */
+	#define FSMC_CODE_RDY		(1 << 15)
+#define COMM			0x08
+	/* comm register definitions */
+	#define FSMC_TSET_0		0
+	#define FSMC_TSET_SHIFT		0
+	#define FSMC_TSET_MASK		0xFF
+	#define FSMC_TWAIT_6		6
+	#define FSMC_TWAIT_SHIFT	8
+	#define FSMC_TWAIT_MASK		0xFF
+	#define FSMC_THOLD_4		4
+	#define FSMC_THOLD_SHIFT	16
+	#define FSMC_THOLD_MASK		0xFF
+	#define FSMC_THIZ_1		1
+	#define FSMC_THIZ_SHIFT		24
+	#define FSMC_THIZ_MASK		0xFF
+#define ATTRIB			0x0C
+#define IOATA			0x10
+#define ECC1			0x14
+#define ECC2			0x18
+#define ECC3			0x1C
+#define FSMC_NAND_BANK_SZ	0x20
+
+#define FSMC_NAND_REG(base, bank, reg)		(base + FSMC_NOR_REG_SIZE + \
+						(FSMC_NAND_BANK_SZ * (bank)) + \
+						reg)
+
+#define FSMC_BUSY_WAIT_TIMEOUT	(1 * HZ)
+
+struct fsmc_nand_timings {
+	uint8_t tclr;
+	uint8_t tar;
+	uint8_t thiz;
+	uint8_t thold;
+	uint8_t twait;
+	uint8_t tset;
+};
+
+enum access_mode {
+	USE_DMA_ACCESS = 1,
+	USE_WORD_ACCESS,
+};
+
+/**
+ * fsmc_nand_platform_data - platform specific NAND controller config
+ * @nand_timings: timing setup for the physical NAND interface
+ * @partitions: partition table for the platform, use a default fallback
+ * if this is NULL
+ * @nr_partitions: the number of partitions in the previous entry
+ * @options: different options for the driver
+ * @width: bus width
+ * @bank: default bank
+ * @select_bank: callback to select a certain bank, this is
+ * platform-specific. If the controller only supports one bank
+ * this may be set to NULL
+ */
+struct fsmc_nand_platform_data {
+	struct fsmc_nand_timings *nand_timings;
+	struct mtd_partition	*partitions;
+	unsigned int		nr_partitions;
+	unsigned int		options;
+	unsigned int		width;
+	unsigned int		bank;
+
+	enum access_mode	mode;
+
+	void			(*select_bank)(uint32_t bank, uint32_t busw);
+
+	/* priv structures for dma accesses */
+	void			*read_dma_priv;
+	void			*write_dma_priv;
+};
+
 static int fsmc_ecc1_ooblayout_ecc(struct mtd_info *mtd, int section,
 				   struct mtd_oob_region *oobregion)
 {
@@ -714,7 +837,6 @@ static bool filter(struct dma_chan *chan, void *slave)
 	return true;
 }
 
-#ifdef CONFIG_OF
 static int fsmc_nand_probe_config_dt(struct platform_device *pdev,
 				     struct device_node *np)
 {
@@ -757,13 +879,6 @@ static int fsmc_nand_probe_config_dt(struct platform_device *pdev,
 	}
 	return 0;
 }
-#else
-static int fsmc_nand_probe_config_dt(struct platform_device *pdev,
-				     struct device_node *np)
-{
-	return -ENOSYS;
-}
-#endif
 
 /*
  * fsmc_nand_probe - Probe function
@@ -782,19 +897,15 @@ static int __init fsmc_nand_probe(struct platform_device *pdev)
 	u32 pid;
 	int i;
 
-	if (np) {
-		pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
-		pdev->dev.platform_data = pdata;
-		ret = fsmc_nand_probe_config_dt(pdev, np);
-		if (ret) {
-			dev_err(&pdev->dev, "no platform data\n");
-			return -ENODEV;
-		}
-	}
+	pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
+	if (!pdata)
+		return -ENOMEM;
 
-	if (!pdata) {
-		dev_err(&pdev->dev, "platform data is NULL\n");
-		return -EINVAL;
+	pdev->dev.platform_data = pdata;
+	ret = fsmc_nand_probe_config_dt(pdev, np);
+	if (ret) {
+		dev_err(&pdev->dev, "no platform data\n");
+		return -ENODEV;
 	}
 
 	/* Allocate memory for the device structure (and zero it) */
diff --git a/include/linux/mtd/fsmc.h b/include/linux/mtd/fsmc.h
deleted file mode 100644
index ad3c3488073c..000000000000
--- a/include/linux/mtd/fsmc.h
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * incude/mtd/fsmc.h
- *
- * ST Microelectronics
- * Flexible Static Memory Controller (FSMC)
- * platform data interface and header file
- *
- * Copyright © 2010 ST Microelectronics
- * Vipin Kumar <vipin.kumar@st.com>
- *
- * This file is licensed under the terms of the GNU General Public
- * License version 2. This program is licensed "as is" without any
- * warranty of any kind, whether express or implied.
- */
-
-#ifndef __MTD_FSMC_H
-#define __MTD_FSMC_H
-
-#include <linux/io.h>
-#include <linux/platform_device.h>
-#include <linux/mtd/physmap.h>
-#include <linux/types.h>
-#include <linux/mtd/partitions.h>
-#include <asm/param.h>
-
-#define FSMC_NAND_BW8		1
-#define FSMC_NAND_BW16		2
-
-#define FSMC_MAX_NOR_BANKS	4
-#define FSMC_MAX_NAND_BANKS	4
-
-#define FSMC_FLASH_WIDTH8	1
-#define FSMC_FLASH_WIDTH16	2
-
-/* fsmc controller registers for NOR flash */
-#define CTRL			0x0
-	/* ctrl register definitions */
-	#define BANK_ENABLE		(1 << 0)
-	#define MUXED			(1 << 1)
-	#define NOR_DEV			(2 << 2)
-	#define WIDTH_8			(0 << 4)
-	#define WIDTH_16		(1 << 4)
-	#define RSTPWRDWN		(1 << 6)
-	#define WPROT			(1 << 7)
-	#define WRT_ENABLE		(1 << 12)
-	#define WAIT_ENB		(1 << 13)
-
-#define CTRL_TIM		0x4
-	/* ctrl_tim register definitions */
-
-#define FSMC_NOR_BANK_SZ	0x8
-#define FSMC_NOR_REG_SIZE	0x40
-
-#define FSMC_NOR_REG(base, bank, reg)		(base + \
-						FSMC_NOR_BANK_SZ * (bank) + \
-						reg)
-
-/* fsmc controller registers for NAND flash */
-#define PC			0x00
-	/* pc register definitions */
-	#define FSMC_RESET		(1 << 0)
-	#define FSMC_WAITON		(1 << 1)
-	#define FSMC_ENABLE		(1 << 2)
-	#define FSMC_DEVTYPE_NAND	(1 << 3)
-	#define FSMC_DEVWID_8		(0 << 4)
-	#define FSMC_DEVWID_16		(1 << 4)
-	#define FSMC_ECCEN		(1 << 6)
-	#define FSMC_ECCPLEN_512	(0 << 7)
-	#define FSMC_ECCPLEN_256	(1 << 7)
-	#define FSMC_TCLR_1		(1)
-	#define FSMC_TCLR_SHIFT		(9)
-	#define FSMC_TCLR_MASK		(0xF)
-	#define FSMC_TAR_1		(1)
-	#define FSMC_TAR_SHIFT		(13)
-	#define FSMC_TAR_MASK		(0xF)
-#define STS			0x04
-	/* sts register definitions */
-	#define FSMC_CODE_RDY		(1 << 15)
-#define COMM			0x08
-	/* comm register definitions */
-	#define FSMC_TSET_0		0
-	#define FSMC_TSET_SHIFT		0
-	#define FSMC_TSET_MASK		0xFF
-	#define FSMC_TWAIT_6		6
-	#define FSMC_TWAIT_SHIFT	8
-	#define FSMC_TWAIT_MASK		0xFF
-	#define FSMC_THOLD_4		4
-	#define FSMC_THOLD_SHIFT	16
-	#define FSMC_THOLD_MASK		0xFF
-	#define FSMC_THIZ_1		1
-	#define FSMC_THIZ_SHIFT		24
-	#define FSMC_THIZ_MASK		0xFF
-#define ATTRIB			0x0C
-#define IOATA			0x10
-#define ECC1			0x14
-#define ECC2			0x18
-#define ECC3			0x1C
-#define FSMC_NAND_BANK_SZ	0x20
-
-#define FSMC_NAND_REG(base, bank, reg)		(base + FSMC_NOR_REG_SIZE + \
-						(FSMC_NAND_BANK_SZ * (bank)) + \
-						reg)
-
-#define FSMC_BUSY_WAIT_TIMEOUT	(1 * HZ)
-
-struct fsmc_nand_timings {
-	uint8_t tclr;
-	uint8_t tar;
-	uint8_t thiz;
-	uint8_t thold;
-	uint8_t twait;
-	uint8_t tset;
-};
-
-enum access_mode {
-	USE_DMA_ACCESS = 1,
-	USE_WORD_ACCESS,
-};
-
-/**
- * fsmc_nand_platform_data - platform specific NAND controller config
- * @nand_timings: timing setup for the physical NAND interface
- * @partitions: partition table for the platform, use a default fallback
- * if this is NULL
- * @nr_partitions: the number of partitions in the previous entry
- * @options: different options for the driver
- * @width: bus width
- * @bank: default bank
- * @select_bank: callback to select a certain bank, this is
- * platform-specific. If the controller only supports one bank
- * this may be set to NULL
- */
-struct fsmc_nand_platform_data {
-	struct fsmc_nand_timings *nand_timings;
-	struct mtd_partition	*partitions;
-	unsigned int		nr_partitions;
-	unsigned int		options;
-	unsigned int		width;
-	unsigned int		bank;
-
-	enum access_mode	mode;
-
-	void			(*select_bank)(uint32_t bank, uint32_t busw);
-
-	/* priv structures for dma accesses */
-	void			*read_dma_priv;
-	void			*write_dma_priv;
-};
-
-extern int __init fsmc_nor_init(struct platform_device *pdev,
-		unsigned long base, uint32_t bank, uint32_t width);
-extern void __init fsmc_init_board_info(struct platform_device *pdev,
-		struct mtd_partition *partitions, unsigned int nr_partitions,
-		unsigned int width);
-
-#endif /* __MTD_FSMC_H */
-- 
cgit v1.2.3


From 058fe1c0440e68a1ba3c2270ae43e9f0298b27d8 Mon Sep 17 00:00:00 2001
From: David Carrillo-Cisneros <davidcc@google.com>
Date: Wed, 18 Jan 2017 11:24:53 -0800
Subject: perf/core: Make cgroup switch visit only cpuctxs with cgroup events

This patch follows from a conversation in CQM/CMT's last series about
speeding up the context switch for cgroup events:

  https://patchwork.kernel.org/patch/9478617/

This is a low-hanging fruit optimization. It replaces the iteration over
the "pmus" list in cgroup switch by an iteration over a new list that
contains only cpuctxs with at least one cgroup event.

This is necessary because the number of PMUs have increased over the years
e.g modern x86 server systems have well above 50 PMUs.

The iteration over the full PMU list is unneccessary and can be costly in
heavy cache contention scenarios.

Below are some instrumentation measurements with 10, 50 and 90 percentiles
of the total cost of context switch before and after this optimization for
a simple array read/write microbenchark.

  Contention
    Level    Nr events      Before (us)            After (us)       Median
  L2    L3     types      (10%, 50%, 90%)       (10%, 50%, 90%     Speedup
  --------------------------------------------------------------------------
  Low   Low       1       (1.72, 2.42, 5.85)    (1.35, 1.64, 5.46)     29%
  High  Low       1       (2.08, 4.56, 19.8)    (1720, 2.20, 13.7)     51%
  High  High      1       (2.86, 10.4, 12.7)    (2.54, 4.32, 12.1)     58%

  Low   Low       2       (1.98, 3.20, 6.89)    (1.68, 2.41, 8.89)     24%
  High  Low       2       (2.48, 5.28, 22.4)    (2150, 3.69, 14.6)     30%
  High  High      2       (3.32, 8.09, 13.9)    (2.80, 5.15, 13.7)     36%

where:

  1 event type  = cycles
  2 event types = cycles,intel_cqm/llc_occupancy/

   Contention L2 Low: workset  <  L2 cache size.
                 High:  "     >>  L2   "     " .
   Contention L3 Low: workset of task on all sockets  <  L3 cache size.
                 High:   "     "   "   "   "    "    >>  L3   "     " .

   Median Speedup is (50%ile Before - 50%ile After) /  50%ile Before

Unsurprisingly, the benefits of this optimization decrease with the number
of cpuctxs with a cgroup events, yet, is never detrimental.

Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: David Carrillo-Cisneros <davidcc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Cc: Vince Weaver <vince@deater.net>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/20170118192454.58008-2-davidcc@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h |  1 +
 kernel/events/core.c       | 98 +++++++++++++++++++++-------------------------
 2 files changed, 46 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 78ed8105e64d..dfa725723f28 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -788,6 +788,7 @@ struct perf_cpu_context {
 	struct pmu			*unique_pmu;
 #ifdef CONFIG_CGROUP_PERF
 	struct perf_cgroup		*cgrp;
+	struct list_head		cgrp_cpuctx_entry;
 #endif
 
 	struct list_head		sched_cb_entry;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e5aaa806702d..928a818d912e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -678,6 +678,8 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 	info->timestamp = ctx->timestamp;
 }
 
+static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
+
 #define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
 #define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */
 
@@ -690,61 +692,46 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 static void perf_cgroup_switch(struct task_struct *task, int mode)
 {
 	struct perf_cpu_context *cpuctx;
-	struct pmu *pmu;
+	struct list_head *list;
 	unsigned long flags;
 
 	/*
-	 * disable interrupts to avoid geting nr_cgroup
-	 * changes via __perf_event_disable(). Also
-	 * avoids preemption.
+	 * Disable interrupts and preemption to avoid this CPU's
+	 * cgrp_cpuctx_entry to change under us.
 	 */
 	local_irq_save(flags);
 
-	/*
-	 * we reschedule only in the presence of cgroup
-	 * constrained events.
-	 */
+	list = this_cpu_ptr(&cgrp_cpuctx_list);
+	list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
+		WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
 
-	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-		if (cpuctx->unique_pmu != pmu)
-			continue; /* ensure we process each cpuctx once */
+		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+		perf_pmu_disable(cpuctx->ctx.pmu);
 
-		/*
-		 * perf_cgroup_events says at least one
-		 * context on this CPU has cgroup events.
-		 *
-		 * ctx->nr_cgroups reports the number of cgroup
-		 * events for a context.
-		 */
-		if (cpuctx->ctx.nr_cgroups > 0) {
-			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-			perf_pmu_disable(cpuctx->ctx.pmu);
-
-			if (mode & PERF_CGROUP_SWOUT) {
-				cpu_ctx_sched_out(cpuctx, EVENT_ALL);
-				/*
-				 * must not be done before ctxswout due
-				 * to event_filter_match() in event_sched_out()
-				 */
-				cpuctx->cgrp = NULL;
-			}
+		if (mode & PERF_CGROUP_SWOUT) {
+			cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+			/*
+			 * must not be done before ctxswout due
+			 * to event_filter_match() in event_sched_out()
+			 */
+			cpuctx->cgrp = NULL;
+		}
 
-			if (mode & PERF_CGROUP_SWIN) {
-				WARN_ON_ONCE(cpuctx->cgrp);
-				/*
-				 * set cgrp before ctxsw in to allow
-				 * event_filter_match() to not have to pass
-				 * task around
-				 * we pass the cpuctx->ctx to perf_cgroup_from_task()
-				 * because cgorup events are only per-cpu
-				 */
-				cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx);
-				cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
-			}
-			perf_pmu_enable(cpuctx->ctx.pmu);
-			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+		if (mode & PERF_CGROUP_SWIN) {
+			WARN_ON_ONCE(cpuctx->cgrp);
+			/*
+			 * set cgrp before ctxsw in to allow
+			 * event_filter_match() to not have to pass
+			 * task around
+			 * we pass the cpuctx->ctx to perf_cgroup_from_task()
+			 * because cgorup events are only per-cpu
+			 */
+			cpuctx->cgrp = perf_cgroup_from_task(task,
+							     &cpuctx->ctx);
+			cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
 		}
+		perf_pmu_enable(cpuctx->ctx.pmu);
+		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 	}
 
 	local_irq_restore(flags);
@@ -889,6 +876,7 @@ list_update_cgroup_event(struct perf_event *event,
 			 struct perf_event_context *ctx, bool add)
 {
 	struct perf_cpu_context *cpuctx;
+	struct list_head *cpuctx_entry;
 
 	if (!is_cgroup_event(event))
 		return;
@@ -902,15 +890,16 @@ list_update_cgroup_event(struct perf_event *event,
 	 * this will always be called from the right CPU.
 	 */
 	cpuctx = __get_cpu_context(ctx);
-
-	/*
-	 * cpuctx->cgrp is NULL until a cgroup event is sched in or
-	 * ctx->nr_cgroup == 0 .
-	 */
-	if (add && perf_cgroup_from_task(current, ctx) == event->cgrp)
-		cpuctx->cgrp = event->cgrp;
-	else if (!add)
+	cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
+	/* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/
+	if (add) {
+		list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
+		if (perf_cgroup_from_task(current, ctx) == event->cgrp)
+			cpuctx->cgrp = event->cgrp;
+	} else {
+		list_del(cpuctx_entry);
 		cpuctx->cgrp = NULL;
+	}
 }
 
 #else /* !CONFIG_CGROUP_PERF */
@@ -10709,6 +10698,9 @@ static void __init perf_event_init_all_cpus(void)
 		INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
 		raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
 
+#ifdef CONFIG_CGROUP_PERF
+		INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
+#endif
 		INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
 	}
 }
-- 
cgit v1.2.3


From 1fd7e416995401ec082fc0fe6090a223969beda5 Mon Sep 17 00:00:00 2001
From: David Carrillo-Cisneros <davidcc@google.com>
Date: Wed, 18 Jan 2017 11:24:54 -0800
Subject: perf/core: Remove perf_cpu_context::unique_pmu

cpuctx->unique_pmu was originally introduced as a way to identify cpuctxs
with shared pmus in order to avoid visiting the same cpuctx more than once
in a for_each_pmu loop.

cpuctx->unique_pmu == cpuctx->pmu in non-software task contexts since they
have only one pmu per cpuctx. Since perf_pmu_sched_task() is only called in
hw contexts, this patch replaces cpuctx->unique_pmu by cpuctx->pmu in it.

The change above, together with the previous patch in this series, removed
the remaining uses of cpuctx->unique_pmu, so we remove it altogether.

Signed-off-by: David Carrillo-Cisneros <davidcc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Cc: Vince Weaver <vince@deater.net>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/20170118192454.58008-3-davidcc@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h |  1 -
 kernel/events/core.c       | 31 +------------------------------
 2 files changed, 1 insertion(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index dfa725723f28..5c58e93c130c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -785,7 +785,6 @@ struct perf_cpu_context {
 	ktime_t				hrtimer_interval;
 	unsigned int			hrtimer_active;
 
-	struct pmu			*unique_pmu;
 #ifdef CONFIG_CGROUP_PERF
 	struct perf_cgroup		*cgrp;
 	struct list_head		cgrp_cpuctx_entry;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 928a818d912e..d92c7ad7715d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2932,7 +2932,7 @@ static void perf_pmu_sched_task(struct task_struct *prev,
 		return;
 
 	list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
-		pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */
+		pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
 
 		if (WARN_ON_ONCE(!pmu->sched_task))
 			continue;
@@ -8636,37 +8636,10 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
 	return NULL;
 }
 
-static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
-{
-	int cpu;
-
-	for_each_possible_cpu(cpu) {
-		struct perf_cpu_context *cpuctx;
-
-		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
-
-		if (cpuctx->unique_pmu == old_pmu)
-			cpuctx->unique_pmu = pmu;
-	}
-}
-
 static void free_pmu_context(struct pmu *pmu)
 {
-	struct pmu *i;
-
 	mutex_lock(&pmus_lock);
-	/*
-	 * Like a real lame refcount.
-	 */
-	list_for_each_entry(i, &pmus, entry) {
-		if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
-			update_pmu_context(i, pmu);
-			goto out;
-		}
-	}
-
 	free_percpu(pmu->pmu_cpu_context);
-out:
 	mutex_unlock(&pmus_lock);
 }
 
@@ -8870,8 +8843,6 @@ skip_type:
 		cpuctx->ctx.pmu = pmu;
 
 		__perf_mux_hrtimer_init(cpuctx, cpu);
-
-		cpuctx->unique_pmu = pmu;
 	}
 
 got_cpu_context:
-- 
cgit v1.2.3


From 5c34153704898ed7ec0f8c0dceb651cbe4b713fd Mon Sep 17 00:00:00 2001
From: Vijaya Kumar K <Vijaya.Kumar@cavium.com>
Date: Thu, 26 Jan 2017 19:50:49 +0530
Subject: irqchip/gic-v3: Add missing system register definitions

Define register definitions for ICH_VMCR_EL2, ICC_CTLR_EL1 and
ICH_VTR_EL2, ICC_BPR0_EL1, ICC_BPR1_EL1 registers.

Signed-off-by: Vijaya Kumar K <Vijaya.Kumar@cavium.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/linux/irqchip/arm-gic-v3.h | 43 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 41 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index e808f8ae6f14..7f6d904a62c6 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -352,8 +352,30 @@
 /*
  * CPU interface registers
  */
-#define ICC_CTLR_EL1_EOImode_drop_dir	(0U << 1)
-#define ICC_CTLR_EL1_EOImode_drop	(1U << 1)
+#define ICC_CTLR_EL1_EOImode_SHIFT	(1)
+#define ICC_CTLR_EL1_EOImode_drop_dir	(0U << ICC_CTLR_EL1_EOImode_SHIFT)
+#define ICC_CTLR_EL1_EOImode_drop	(1U << ICC_CTLR_EL1_EOImode_SHIFT)
+#define ICC_CTLR_EL1_EOImode_MASK	(1 << ICC_CTLR_EL1_EOImode_SHIFT)
+#define ICC_CTLR_EL1_CBPR_SHIFT		0
+#define ICC_CTLR_EL1_CBPR_MASK		(1 << ICC_CTLR_EL1_CBPR_SHIFT)
+#define ICC_CTLR_EL1_PRI_BITS_SHIFT	8
+#define ICC_CTLR_EL1_PRI_BITS_MASK	(0x7 << ICC_CTLR_EL1_PRI_BITS_SHIFT)
+#define ICC_CTLR_EL1_ID_BITS_SHIFT	11
+#define ICC_CTLR_EL1_ID_BITS_MASK	(0x7 << ICC_CTLR_EL1_ID_BITS_SHIFT)
+#define ICC_CTLR_EL1_SEIS_SHIFT		14
+#define ICC_CTLR_EL1_SEIS_MASK		(0x1 << ICC_CTLR_EL1_SEIS_SHIFT)
+#define ICC_CTLR_EL1_A3V_SHIFT		15
+#define ICC_CTLR_EL1_A3V_MASK		(0x1 << ICC_CTLR_EL1_A3V_SHIFT)
+#define ICC_PMR_EL1_SHIFT		0
+#define ICC_PMR_EL1_MASK		(0xff << ICC_PMR_EL1_SHIFT)
+#define ICC_BPR0_EL1_SHIFT		0
+#define ICC_BPR0_EL1_MASK		(0x7 << ICC_BPR0_EL1_SHIFT)
+#define ICC_BPR1_EL1_SHIFT		0
+#define ICC_BPR1_EL1_MASK		(0x7 << ICC_BPR1_EL1_SHIFT)
+#define ICC_IGRPEN0_EL1_SHIFT		0
+#define ICC_IGRPEN0_EL1_MASK		(1 << ICC_IGRPEN0_EL1_SHIFT)
+#define ICC_IGRPEN1_EL1_SHIFT		0
+#define ICC_IGRPEN1_EL1_MASK		(1 << ICC_IGRPEN1_EL1_SHIFT)
 #define ICC_SRE_EL1_SRE			(1U << 0)
 
 /*
@@ -384,12 +406,29 @@
 
 #define ICH_VMCR_CTLR_SHIFT		0
 #define ICH_VMCR_CTLR_MASK		(0x21f << ICH_VMCR_CTLR_SHIFT)
+#define ICH_VMCR_CBPR_SHIFT		4
+#define ICH_VMCR_CBPR_MASK		(1 << ICH_VMCR_CBPR_SHIFT)
+#define ICH_VMCR_EOIM_SHIFT		9
+#define ICH_VMCR_EOIM_MASK		(1 << ICH_VMCR_EOIM_SHIFT)
 #define ICH_VMCR_BPR1_SHIFT		18
 #define ICH_VMCR_BPR1_MASK		(7 << ICH_VMCR_BPR1_SHIFT)
 #define ICH_VMCR_BPR0_SHIFT		21
 #define ICH_VMCR_BPR0_MASK		(7 << ICH_VMCR_BPR0_SHIFT)
 #define ICH_VMCR_PMR_SHIFT		24
 #define ICH_VMCR_PMR_MASK		(0xffUL << ICH_VMCR_PMR_SHIFT)
+#define ICH_VMCR_ENG0_SHIFT		0
+#define ICH_VMCR_ENG0_MASK		(1 << ICH_VMCR_ENG0_SHIFT)
+#define ICH_VMCR_ENG1_SHIFT		1
+#define ICH_VMCR_ENG1_MASK		(1 << ICH_VMCR_ENG1_SHIFT)
+
+#define ICH_VTR_PRI_BITS_SHIFT		29
+#define ICH_VTR_PRI_BITS_MASK		(7 << ICH_VTR_PRI_BITS_SHIFT)
+#define ICH_VTR_ID_BITS_SHIFT		23
+#define ICH_VTR_ID_BITS_MASK		(7 << ICH_VTR_ID_BITS_SHIFT)
+#define ICH_VTR_SEIS_SHIFT		22
+#define ICH_VTR_SEIS_MASK		(1 << ICH_VTR_SEIS_SHIFT)
+#define ICH_VTR_A3V_SHIFT		21
+#define ICH_VTR_A3V_MASK		(1 << ICH_VTR_A3V_SHIFT)
 
 #define ICC_IAR1_EL1_SPURIOUS		0x3ff
 
-- 
cgit v1.2.3


From 5fb247d79c04240dce86c842976cde1edde7f7ed Mon Sep 17 00:00:00 2001
From: Vijaya Kumar K <Vijaya.Kumar@cavium.com>
Date: Thu, 26 Jan 2017 19:50:50 +0530
Subject: KVM: arm/arm64: vgic: Introduce VENG0 and VENG1 fields to vmcr struct

ICC_VMCR_EL2 supports virtual access to ICC_IGRPEN1_EL1.Enable
and ICC_IGRPEN0_EL1.Enable fields. Add grpen0 and grpen1 member
variables to struct vmcr to support read and write of these fields.

Also refactor vgic_set_vmcr and vgic_get_vmcr() code.
Drop ICH_VMCR_CTLR_SHIFT and ICH_VMCR_CTLR_MASK macros and instead
use ICH_VMCR_EOI* and ICH_VMCR_CBPR* macros.

Signed-off-by: Vijaya Kumar K <Vijaya.Kumar@cavium.com>
Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/linux/irqchip/arm-gic-v3.h |  2 --
 virt/kvm/arm/vgic/vgic-mmio-v2.c   | 16 ----------------
 virt/kvm/arm/vgic/vgic-mmio.c      | 16 ++++++++++++++++
 virt/kvm/arm/vgic/vgic-v3.c        | 20 ++++++++++++++++++--
 virt/kvm/arm/vgic/vgic.h           |  5 +++++
 5 files changed, 39 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index 7f6d904a62c6..170e00a40826 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -404,8 +404,6 @@
 #define ICH_HCR_EN			(1 << 0)
 #define ICH_HCR_UIE			(1 << 1)
 
-#define ICH_VMCR_CTLR_SHIFT		0
-#define ICH_VMCR_CTLR_MASK		(0x21f << ICH_VMCR_CTLR_SHIFT)
 #define ICH_VMCR_CBPR_SHIFT		4
 #define ICH_VMCR_CBPR_MASK		(1 << ICH_VMCR_CBPR_SHIFT)
 #define ICH_VMCR_EOIM_SHIFT		9
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c
index fa68dd41756e..a3ad7ff95c9b 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v2.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c
@@ -213,22 +213,6 @@ static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu,
 	}
 }
 
-static void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
-{
-	if (kvm_vgic_global_state.type == VGIC_V2)
-		vgic_v2_set_vmcr(vcpu, vmcr);
-	else
-		vgic_v3_set_vmcr(vcpu, vmcr);
-}
-
-static void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
-{
-	if (kvm_vgic_global_state.type == VGIC_V2)
-		vgic_v2_get_vmcr(vcpu, vmcr);
-	else
-		vgic_v3_get_vmcr(vcpu, vmcr);
-}
-
 #define GICC_ARCH_VERSION_V2	0x2
 
 /* These are for userland accesses only, there is no guest-facing emulation. */
diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
index 746c8afca718..1d1886e7d640 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.c
+++ b/virt/kvm/arm/vgic/vgic-mmio.c
@@ -385,6 +385,22 @@ vgic_find_mmio_region(const struct vgic_register_region *region, int nr_regions,
 		       sizeof(region[0]), match_region);
 }
 
+void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
+{
+	if (kvm_vgic_global_state.type == VGIC_V2)
+		vgic_v2_set_vmcr(vcpu, vmcr);
+	else
+		vgic_v3_set_vmcr(vcpu, vmcr);
+}
+
+void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
+{
+	if (kvm_vgic_global_state.type == VGIC_V2)
+		vgic_v2_get_vmcr(vcpu, vmcr);
+	else
+		vgic_v3_get_vmcr(vcpu, vmcr);
+}
+
 /*
  * kvm_mmio_read_buf() returns a value in a format where it can be converted
  * to a byte array and be directly observed as the guest wanted it to appear
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
index 679ba935698f..42ff9c9826d3 100644
--- a/virt/kvm/arm/vgic/vgic-v3.c
+++ b/virt/kvm/arm/vgic/vgic-v3.c
@@ -175,10 +175,18 @@ void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
 {
 	u32 vmcr;
 
-	vmcr  = (vmcrp->ctlr << ICH_VMCR_CTLR_SHIFT) & ICH_VMCR_CTLR_MASK;
+	/*
+	 * Ignore the FIQen bit, because GIC emulation always implies
+	 * SRE=1 which means the vFIQEn bit is also RES1.
+	 */
+	vmcr = ((vmcrp->ctlr >> ICC_CTLR_EL1_EOImode_SHIFT) <<
+		 ICH_VMCR_EOIM_SHIFT) & ICH_VMCR_EOIM_MASK;
+	vmcr |= (vmcrp->ctlr << ICH_VMCR_CBPR_SHIFT) & ICH_VMCR_CBPR_MASK;
 	vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK;
 	vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK;
 	vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK;
+	vmcr |= (vmcrp->grpen0 << ICH_VMCR_ENG0_SHIFT) & ICH_VMCR_ENG0_MASK;
+	vmcr |= (vmcrp->grpen1 << ICH_VMCR_ENG1_SHIFT) & ICH_VMCR_ENG1_MASK;
 
 	vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr = vmcr;
 }
@@ -187,10 +195,18 @@ void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
 {
 	u32 vmcr = vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr;
 
-	vmcrp->ctlr = (vmcr & ICH_VMCR_CTLR_MASK) >> ICH_VMCR_CTLR_SHIFT;
+	/*
+	 * Ignore the FIQen bit, because GIC emulation always implies
+	 * SRE=1 which means the vFIQEn bit is also RES1.
+	 */
+	vmcrp->ctlr = ((vmcr >> ICH_VMCR_EOIM_SHIFT) <<
+			ICC_CTLR_EL1_EOImode_SHIFT) & ICC_CTLR_EL1_EOImode_MASK;
+	vmcrp->ctlr |= (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT;
 	vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT;
 	vmcrp->bpr  = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT;
 	vmcrp->pmr  = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
+	vmcrp->grpen0 = (vmcr & ICH_VMCR_ENG0_MASK) >> ICH_VMCR_ENG0_SHIFT;
+	vmcrp->grpen1 = (vmcr & ICH_VMCR_ENG1_MASK) >> ICH_VMCR_ENG1_SHIFT;
 }
 
 #define INITIAL_PENDBASER_VALUE						  \
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index 4505fd4919fd..ecfe1a6c841a 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -65,6 +65,9 @@ struct vgic_vmcr {
 	u32	abpr;
 	u32	bpr;
 	u32	pmr;
+	/* Below member variable are valid only for GICv3 */
+	u32	grpen0;
+	u32	grpen1;
 };
 
 struct vgic_reg_attr {
@@ -137,6 +140,8 @@ int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
 int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
 			 int offset, u32 *val);
 int kvm_register_vgic_device(unsigned long type);
+void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
 int vgic_lazy_init(struct kvm *kvm);
 int vgic_init(struct kvm *kvm);
 
-- 
cgit v1.2.3


From 9d93dc1c96ec446bef9c34a189ea24556f1af89a Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Wed, 25 Jan 2017 13:47:43 +0000
Subject: arm/arm64: KVM: Get rid of KVM_MEMSLOT_INCOHERENT

KVM_MEMSLOT_INCOHERENT is not used anymore, as we've killed its
only use in the arm/arm64 MMU code. Let's remove the last artifacts.

Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 arch/arm/kvm/mmu.c       | 9 ---------
 include/linux/kvm_host.h | 1 -
 2 files changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 5cc35080cbf7..962616fd4ddd 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -1876,15 +1876,6 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
 int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 			    unsigned long npages)
 {
-	/*
-	 * Readonly memslots are not incoherent with the caches by definition,
-	 * but in practice, they are used mostly to emulate ROMs or NOR flashes
-	 * that the guest may consider devices and hence map as uncached.
-	 * To prevent incoherency issues in these cases, tag all readonly
-	 * regions as incoherent.
-	 */
-	if (slot->flags & KVM_MEM_READONLY)
-		slot->flags |= KVM_MEMSLOT_INCOHERENT;
 	return 0;
 }
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1c5190dab2c1..cda457bcedc1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -45,7 +45,6 @@
  * include/linux/kvm_h.
  */
 #define KVM_MEMSLOT_INVALID	(1UL << 16)
-#define KVM_MEMSLOT_INCOHERENT	(1UL << 17)
 
 /* Two fragments for cross MMIO pages. */
 #define KVM_MAX_MMIO_FRAGMENTS	2
-- 
cgit v1.2.3


From a92def1becf33e91fc460c7ae575aa9210ba8f40 Mon Sep 17 00:00:00 2001
From: Sean Young <sean@mess.org>
Date: Mon, 19 Dec 2016 18:48:29 -0200
Subject: [media] ir-rx51: port to rc-core

This driver was written using lirc since rc-core did not support
transmitter-only hardware at that time. Now that it does, port
this driver.

Compile tested only.

Signed-off-by: Sean Young <sean@mess.org>
Cc: Timo Kokkonen <timo.t.kokkonen@iki.fi>
Cc: Ivaylo Dimitrov <ivo.g.dimitrov.75@gmail.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 arch/arm/mach-omap2/pdata-quirks.c          |   8 +-
 drivers/media/rc/Kconfig                    |   2 +-
 drivers/media/rc/ir-rx51.c                  | 332 ++++++++++------------------
 include/linux/platform_data/media/ir-rx51.h |   6 +-
 4 files changed, 126 insertions(+), 222 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap2/pdata-quirks.c b/arch/arm/mach-omap2/pdata-quirks.c
index 477910a48448..9f060748df5d 100644
--- a/arch/arm/mach-omap2/pdata-quirks.c
+++ b/arch/arm/mach-omap2/pdata-quirks.c
@@ -484,15 +484,15 @@ static struct pwm_omap_dmtimer_pdata pwm_dmtimer_pdata = {
 };
 #endif
 
-static struct lirc_rx51_platform_data __maybe_unused rx51_lirc_data = {
+static struct ir_rx51_platform_data __maybe_unused rx51_ir_data = {
 	.set_max_mpu_wakeup_lat = omap_pm_set_max_mpu_wakeup_lat,
 };
 
-static struct platform_device __maybe_unused rx51_lirc_device = {
-	.name           = "lirc_rx51",
+static struct platform_device __maybe_unused rx51_ir_device = {
+	.name           = "ir_rx51",
 	.id             = -1,
 	.dev            = {
-		.platform_data = &rx51_lirc_data,
+		.platform_data = &rx51_ir_data,
 	},
 };
 
diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig
index 3351e25d6176..d0ddbd3e22ca 100644
--- a/drivers/media/rc/Kconfig
+++ b/drivers/media/rc/Kconfig
@@ -345,7 +345,7 @@ config IR_TTUSBIR
 
 config IR_RX51
 	tristate "Nokia N900 IR transmitter diode"
-	depends on OMAP_DM_TIMER && PWM_OMAP_DMTIMER && ARCH_OMAP2PLUS && LIRC
+	depends on (OMAP_DM_TIMER && PWM_OMAP_DMTIMER && ARCH_OMAP2PLUS || COMPILE_TEST) && RC_CORE
 	---help---
 	   Say Y or M here if you want to enable support for the IR
 	   transmitter diode built in the Nokia N900 (RX51) device.
diff --git a/drivers/media/rc/ir-rx51.c b/drivers/media/rc/ir-rx51.c
index e6efa8c267a0..49265f02e772 100644
--- a/drivers/media/rc/ir-rx51.c
+++ b/drivers/media/rc/ir-rx51.c
@@ -15,32 +15,23 @@
  */
 #include <linux/clk.h>
 #include <linux/module.h>
-#include <linux/interrupt.h>
-#include <linux/uaccess.h>
 #include <linux/platform_device.h>
-#include <linux/sched.h>
 #include <linux/wait.h>
 #include <linux/pwm.h>
 #include <linux/of.h>
 #include <linux/hrtimer.h>
 
-#include <media/lirc.h>
-#include <media/lirc_dev.h>
+#include <media/rc-core.h>
 #include <linux/platform_data/media/ir-rx51.h>
 
-#define LIRC_RX51_DRIVER_FEATURES (LIRC_CAN_SET_SEND_DUTY_CYCLE |	\
-				   LIRC_CAN_SET_SEND_CARRIER |		\
-				   LIRC_CAN_SEND_PULSE)
-
-#define DRIVER_NAME "lirc_rx51"
-
 #define WBUF_LEN 256
 
-struct lirc_rx51 {
+struct ir_rx51 {
+	struct rc_dev *rcdev;
 	struct pwm_device *pwm;
 	struct hrtimer timer;
 	struct device	     *dev;
-	struct lirc_rx51_platform_data *pdata;
+	struct ir_rx51_platform_data *pdata;
 	wait_queue_head_t     wqueue;
 
 	unsigned int	freq;		/* carrier frequency */
@@ -50,38 +41,37 @@ struct lirc_rx51 {
 	unsigned long	device_is_open;
 };
 
-static inline void lirc_rx51_on(struct lirc_rx51 *lirc_rx51)
+static inline void ir_rx51_on(struct ir_rx51 *ir_rx51)
 {
-	pwm_enable(lirc_rx51->pwm);
+	pwm_enable(ir_rx51->pwm);
 }
 
-static inline void lirc_rx51_off(struct lirc_rx51 *lirc_rx51)
+static inline void ir_rx51_off(struct ir_rx51 *ir_rx51)
 {
-	pwm_disable(lirc_rx51->pwm);
+	pwm_disable(ir_rx51->pwm);
 }
 
-static int init_timing_params(struct lirc_rx51 *lirc_rx51)
+static int init_timing_params(struct ir_rx51 *ir_rx51)
 {
-	struct pwm_device *pwm = lirc_rx51->pwm;
-	int duty, period = DIV_ROUND_CLOSEST(NSEC_PER_SEC, lirc_rx51->freq);
+	struct pwm_device *pwm = ir_rx51->pwm;
+	int duty, period = DIV_ROUND_CLOSEST(NSEC_PER_SEC, ir_rx51->freq);
 
-	duty = DIV_ROUND_CLOSEST(lirc_rx51->duty_cycle * period, 100);
+	duty = DIV_ROUND_CLOSEST(ir_rx51->duty_cycle * period, 100);
 
 	pwm_config(pwm, duty, period);
 
 	return 0;
 }
 
-static enum hrtimer_restart lirc_rx51_timer_cb(struct hrtimer *timer)
+static enum hrtimer_restart ir_rx51_timer_cb(struct hrtimer *timer)
 {
-	struct lirc_rx51 *lirc_rx51 =
-			container_of(timer, struct lirc_rx51, timer);
+	struct ir_rx51 *ir_rx51 = container_of(timer, struct ir_rx51, timer);
 	ktime_t now;
 
-	if (lirc_rx51->wbuf_index < 0) {
-		dev_err_ratelimited(lirc_rx51->dev,
-				"BUG wbuf_index has value of %i\n",
-				lirc_rx51->wbuf_index);
+	if (ir_rx51->wbuf_index < 0) {
+		dev_err_ratelimited(ir_rx51->dev,
+				    "BUG wbuf_index has value of %i\n",
+				    ir_rx51->wbuf_index);
 		goto end;
 	}
 
@@ -92,20 +82,20 @@ static enum hrtimer_restart lirc_rx51_timer_cb(struct hrtimer *timer)
 	do {
 		u64 ns;
 
-		if (lirc_rx51->wbuf_index >= WBUF_LEN)
+		if (ir_rx51->wbuf_index >= WBUF_LEN)
 			goto end;
-		if (lirc_rx51->wbuf[lirc_rx51->wbuf_index] == -1)
+		if (ir_rx51->wbuf[ir_rx51->wbuf_index] == -1)
 			goto end;
 
-		if (lirc_rx51->wbuf_index % 2)
-			lirc_rx51_off(lirc_rx51);
+		if (ir_rx51->wbuf_index % 2)
+			ir_rx51_off(ir_rx51);
 		else
-			lirc_rx51_on(lirc_rx51);
+			ir_rx51_on(ir_rx51);
 
-		ns = 1000 * lirc_rx51->wbuf[lirc_rx51->wbuf_index];
+		ns = US_TO_NS(ir_rx51->wbuf[ir_rx51->wbuf_index]);
 		hrtimer_add_expires_ns(timer, ns);
 
-		lirc_rx51->wbuf_index++;
+		ir_rx51->wbuf_index++;
 
 		now = timer->base->get_time();
 
@@ -114,203 +104,112 @@ static enum hrtimer_restart lirc_rx51_timer_cb(struct hrtimer *timer)
 	return HRTIMER_RESTART;
 end:
 	/* Stop TX here */
-	lirc_rx51_off(lirc_rx51);
-	lirc_rx51->wbuf_index = -1;
+	ir_rx51_off(ir_rx51);
+	ir_rx51->wbuf_index = -1;
 
-	wake_up_interruptible(&lirc_rx51->wqueue);
+	wake_up_interruptible(&ir_rx51->wqueue);
 
 	return HRTIMER_NORESTART;
 }
 
-static ssize_t lirc_rx51_write(struct file *file, const char *buf,
-			  size_t n, loff_t *ppos)
+static int ir_rx51_tx(struct rc_dev *dev, unsigned int *buffer,
+		      unsigned int count)
 {
-	int count, i;
-	struct lirc_rx51 *lirc_rx51 = file->private_data;
+	struct ir_rx51 *ir_rx51 = dev->priv;
 
-	if (n % sizeof(int))
+	if (count > WBUF_LEN)
 		return -EINVAL;
 
-	count = n / sizeof(int);
-	if ((count > WBUF_LEN) || (count % 2 == 0))
-		return -EINVAL;
+	memcpy(ir_rx51->wbuf, buffer, count * sizeof(unsigned int));
 
 	/* Wait any pending transfers to finish */
-	wait_event_interruptible(lirc_rx51->wqueue, lirc_rx51->wbuf_index < 0);
-
-	if (copy_from_user(lirc_rx51->wbuf, buf, n))
-		return -EFAULT;
-
-	/* Sanity check the input pulses */
-	for (i = 0; i < count; i++)
-		if (lirc_rx51->wbuf[i] < 0)
-			return -EINVAL;
+	wait_event_interruptible(ir_rx51->wqueue, ir_rx51->wbuf_index < 0);
 
-	init_timing_params(lirc_rx51);
+	init_timing_params(ir_rx51);
 	if (count < WBUF_LEN)
-		lirc_rx51->wbuf[count] = -1; /* Insert termination mark */
+		ir_rx51->wbuf[count] = -1; /* Insert termination mark */
 
 	/*
 	 * Adjust latency requirements so the device doesn't go in too
 	 * deep sleep states
 	 */
-	lirc_rx51->pdata->set_max_mpu_wakeup_lat(lirc_rx51->dev, 50);
+	ir_rx51->pdata->set_max_mpu_wakeup_lat(ir_rx51->dev, 50);
 
-	lirc_rx51_on(lirc_rx51);
-	lirc_rx51->wbuf_index = 1;
-	hrtimer_start(&lirc_rx51->timer,
-		      ns_to_ktime(1000 * lirc_rx51->wbuf[0]),
+	ir_rx51_on(ir_rx51);
+	ir_rx51->wbuf_index = 1;
+	hrtimer_start(&ir_rx51->timer,
+		      ns_to_ktime(US_TO_NS(ir_rx51->wbuf[0])),
 		      HRTIMER_MODE_REL);
 	/*
 	 * Don't return back to the userspace until the transfer has
 	 * finished
 	 */
-	wait_event_interruptible(lirc_rx51->wqueue, lirc_rx51->wbuf_index < 0);
+	wait_event_interruptible(ir_rx51->wqueue, ir_rx51->wbuf_index < 0);
 
 	/* We can sleep again */
-	lirc_rx51->pdata->set_max_mpu_wakeup_lat(lirc_rx51->dev, -1);
+	ir_rx51->pdata->set_max_mpu_wakeup_lat(ir_rx51->dev, -1);
 
-	return n;
+	return count;
 }
 
-static long lirc_rx51_ioctl(struct file *filep,
-			unsigned int cmd, unsigned long arg)
+static int ir_rx51_open(struct rc_dev *dev)
 {
-	int result;
-	unsigned long value;
-	unsigned int ivalue;
-	struct lirc_rx51 *lirc_rx51 = filep->private_data;
-
-	switch (cmd) {
-	case LIRC_GET_SEND_MODE:
-		result = put_user(LIRC_MODE_PULSE, (unsigned long *)arg);
-		if (result)
-			return result;
-		break;
-
-	case LIRC_SET_SEND_MODE:
-		result = get_user(value, (unsigned long *)arg);
-		if (result)
-			return result;
-
-		/* only LIRC_MODE_PULSE supported */
-		if (value != LIRC_MODE_PULSE)
-			return -ENOSYS;
-		break;
-
-	case LIRC_GET_REC_MODE:
-		result = put_user(0, (unsigned long *) arg);
-		if (result)
-			return result;
-		break;
-
-	case LIRC_GET_LENGTH:
-		return -ENOSYS;
-		break;
-
-	case LIRC_SET_SEND_DUTY_CYCLE:
-		result = get_user(ivalue, (unsigned int *) arg);
-		if (result)
-			return result;
-
-		if (ivalue <= 0 || ivalue > 100) {
-			dev_err(lirc_rx51->dev, ": invalid duty cycle %d\n",
-				ivalue);
-			return -EINVAL;
-		}
-
-		lirc_rx51->duty_cycle = ivalue;
-		break;
-
-	case LIRC_SET_SEND_CARRIER:
-		result = get_user(ivalue, (unsigned int *) arg);
-		if (result)
-			return result;
-
-		if (ivalue > 500000 || ivalue < 20000) {
-			dev_err(lirc_rx51->dev, ": invalid carrier freq %d\n",
-				ivalue);
-			return -EINVAL;
-		}
-
-		lirc_rx51->freq = ivalue;
-		break;
-
-	case LIRC_GET_FEATURES:
-		result = put_user(LIRC_RX51_DRIVER_FEATURES,
-				  (unsigned long *) arg);
-		if (result)
-			return result;
-		break;
-
-	default:
-		return -ENOIOCTLCMD;
-	}
-
-	return 0;
-}
+	struct ir_rx51 *ir_rx51 = dev->priv;
 
-static int lirc_rx51_open(struct inode *inode, struct file *file)
-{
-	struct lirc_rx51 *lirc_rx51 = lirc_get_pdata(file);
-	BUG_ON(!lirc_rx51);
-
-	file->private_data = lirc_rx51;
-
-	if (test_and_set_bit(1, &lirc_rx51->device_is_open))
+	if (test_and_set_bit(1, &ir_rx51->device_is_open))
 		return -EBUSY;
 
-	lirc_rx51->pwm = pwm_get(lirc_rx51->dev, NULL);
-	if (IS_ERR(lirc_rx51->pwm)) {
-		int res = PTR_ERR(lirc_rx51->pwm);
+	ir_rx51->pwm = pwm_get(ir_rx51->dev, NULL);
+	if (IS_ERR(ir_rx51->pwm)) {
+		int res = PTR_ERR(ir_rx51->pwm);
 
-		dev_err(lirc_rx51->dev, "pwm_get failed: %d\n", res);
+		dev_err(ir_rx51->dev, "pwm_get failed: %d\n", res);
 		return res;
 	}
 
 	return 0;
 }
 
-static int lirc_rx51_release(struct inode *inode, struct file *file)
+static void ir_rx51_release(struct rc_dev *dev)
 {
-	struct lirc_rx51 *lirc_rx51 = file->private_data;
-
-	hrtimer_cancel(&lirc_rx51->timer);
-	lirc_rx51_off(lirc_rx51);
-	pwm_put(lirc_rx51->pwm);
+	struct ir_rx51 *ir_rx51 = dev->priv;
 
-	clear_bit(1, &lirc_rx51->device_is_open);
+	hrtimer_cancel(&ir_rx51->timer);
+	ir_rx51_off(ir_rx51);
+	pwm_put(ir_rx51->pwm);
 
-	return 0;
+	clear_bit(1, &ir_rx51->device_is_open);
 }
 
-static struct lirc_rx51 lirc_rx51 = {
+static struct ir_rx51 ir_rx51 = {
 	.duty_cycle	= 50,
 	.wbuf_index	= -1,
 };
 
-static const struct file_operations lirc_fops = {
-	.owner		= THIS_MODULE,
-	.write		= lirc_rx51_write,
-	.unlocked_ioctl	= lirc_rx51_ioctl,
-	.read		= lirc_dev_fop_read,
-	.poll		= lirc_dev_fop_poll,
-	.open		= lirc_rx51_open,
-	.release	= lirc_rx51_release,
-};
+static int ir_rx51_set_duty_cycle(struct rc_dev *dev, u32 duty)
+{
+	struct ir_rx51 *ir_rx51 = dev->priv;
 
-static struct lirc_driver lirc_rx51_driver = {
-	.name		= DRIVER_NAME,
-	.minor		= -1,
-	.code_length	= 1,
-	.data		= &lirc_rx51,
-	.fops		= &lirc_fops,
-	.owner		= THIS_MODULE,
-};
+	ir_rx51->duty_cycle = duty;
+
+	return 0;
+}
+
+static int ir_rx51_set_tx_carrier(struct rc_dev *dev, u32 carrier)
+{
+	struct ir_rx51 *ir_rx51 = dev->priv;
+
+	if (carrier > 500000 || carrier < 20000)
+		return -EINVAL;
+
+	ir_rx51->freq = carrier;
+
+	return 0;
+}
 
 #ifdef CONFIG_PM
 
-static int lirc_rx51_suspend(struct platform_device *dev, pm_message_t state)
+static int ir_rx51_suspend(struct platform_device *dev, pm_message_t state)
 {
 	/*
 	 * In case the device is still open, do not suspend. Normally
@@ -320,34 +219,34 @@ static int lirc_rx51_suspend(struct platform_device *dev, pm_message_t state)
 	 * were in a middle of a transmit. Thus, we defer any suspend
 	 * actions until transmit has completed.
 	 */
-	if (test_and_set_bit(1, &lirc_rx51.device_is_open))
+	if (test_and_set_bit(1, &ir_rx51.device_is_open))
 		return -EAGAIN;
 
-	clear_bit(1, &lirc_rx51.device_is_open);
+	clear_bit(1, &ir_rx51.device_is_open);
 
 	return 0;
 }
 
-static int lirc_rx51_resume(struct platform_device *dev)
+static int ir_rx51_resume(struct platform_device *dev)
 {
 	return 0;
 }
 
 #else
 
-#define lirc_rx51_suspend	NULL
-#define lirc_rx51_resume	NULL
+#define ir_rx51_suspend	NULL
+#define ir_rx51_resume	NULL
 
 #endif /* CONFIG_PM */
 
-static int lirc_rx51_probe(struct platform_device *dev)
+static int ir_rx51_probe(struct platform_device *dev)
 {
 	struct pwm_device *pwm;
+	struct rc_dev *rcdev;
 
-	lirc_rx51_driver.features = LIRC_RX51_DRIVER_FEATURES;
-	lirc_rx51.pdata = dev->dev.platform_data;
+	ir_rx51.pdata = dev->dev.platform_data;
 
-	if (!lirc_rx51.pdata) {
+	if (!ir_rx51.pdata) {
 		dev_err(&dev->dev, "Platform Data is missing\n");
 		return -ENXIO;
 	}
@@ -362,51 +261,56 @@ static int lirc_rx51_probe(struct platform_device *dev)
 	}
 
 	/* Use default, in case userspace does not set the carrier */
-	lirc_rx51.freq = DIV_ROUND_CLOSEST(pwm_get_period(pwm), NSEC_PER_SEC);
+	ir_rx51.freq = DIV_ROUND_CLOSEST(pwm_get_period(pwm), NSEC_PER_SEC);
 	pwm_put(pwm);
 
-	hrtimer_init(&lirc_rx51.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	lirc_rx51.timer.function = lirc_rx51_timer_cb;
+	hrtimer_init(&ir_rx51.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	ir_rx51.timer.function = ir_rx51_timer_cb;
 
-	lirc_rx51.dev = &dev->dev;
-	lirc_rx51_driver.dev = &dev->dev;
-	lirc_rx51_driver.minor = lirc_register_driver(&lirc_rx51_driver);
-	init_waitqueue_head(&lirc_rx51.wqueue);
+	ir_rx51.dev = &dev->dev;
 
-	if (lirc_rx51_driver.minor < 0) {
-		dev_err(lirc_rx51.dev, ": lirc_register_driver failed: %d\n",
-		       lirc_rx51_driver.minor);
-		return lirc_rx51_driver.minor;
-	}
+	rcdev = devm_rc_allocate_device(&dev->dev, RC_DRIVER_IR_RAW_TX);
+	if (!rcdev)
+		return -ENOMEM;
 
-	return 0;
+	rcdev->priv = &ir_rx51;
+	rcdev->open = ir_rx51_open;
+	rcdev->close = ir_rx51_release;
+	rcdev->tx_ir = ir_rx51_tx;
+	rcdev->s_tx_duty_cycle = ir_rx51_set_duty_cycle;
+	rcdev->s_tx_carrier = ir_rx51_set_tx_carrier;
+	rcdev->driver_name = KBUILD_MODNAME;
+
+	ir_rx51.rcdev = rcdev;
+
+	return devm_rc_register_device(&dev->dev, ir_rx51.rcdev);
 }
 
-static int lirc_rx51_remove(struct platform_device *dev)
+static int ir_rx51_remove(struct platform_device *dev)
 {
-	return lirc_unregister_driver(lirc_rx51_driver.minor);
+	return 0;
 }
 
-static const struct of_device_id lirc_rx51_match[] = {
+static const struct of_device_id ir_rx51_match[] = {
 	{
 		.compatible = "nokia,n900-ir",
 	},
 	{},
 };
-MODULE_DEVICE_TABLE(of, lirc_rx51_match);
+MODULE_DEVICE_TABLE(of, ir_rx51_match);
 
-struct platform_driver lirc_rx51_platform_driver = {
-	.probe		= lirc_rx51_probe,
-	.remove		= lirc_rx51_remove,
-	.suspend	= lirc_rx51_suspend,
-	.resume		= lirc_rx51_resume,
+static struct platform_driver ir_rx51_platform_driver = {
+	.probe		= ir_rx51_probe,
+	.remove		= ir_rx51_remove,
+	.suspend	= ir_rx51_suspend,
+	.resume		= ir_rx51_resume,
 	.driver		= {
-		.name	= DRIVER_NAME,
-		.of_match_table = of_match_ptr(lirc_rx51_match),
+		.name	= KBUILD_MODNAME,
+		.of_match_table = of_match_ptr(ir_rx51_match),
 	},
 };
-module_platform_driver(lirc_rx51_platform_driver);
+module_platform_driver(ir_rx51_platform_driver);
 
-MODULE_DESCRIPTION("LIRC TX driver for Nokia RX51");
+MODULE_DESCRIPTION("IR TX driver for Nokia RX51");
 MODULE_AUTHOR("Nokia Corporation");
 MODULE_LICENSE("GPL");
diff --git a/include/linux/platform_data/media/ir-rx51.h b/include/linux/platform_data/media/ir-rx51.h
index 812d87307877..2c94ab568bfa 100644
--- a/include/linux/platform_data/media/ir-rx51.h
+++ b/include/linux/platform_data/media/ir-rx51.h
@@ -1,7 +1,7 @@
-#ifndef _LIRC_RX51_H
-#define _LIRC_RX51_H
+#ifndef _IR_RX51_H
+#define _IR_RX51_H
 
-struct lirc_rx51_platform_data {
+struct ir_rx51_platform_data {
 	int(*set_max_mpu_wakeup_lat)(struct device *dev, long t);
 };
 
-- 
cgit v1.2.3


From ddeaa6379d50a530f9f57b3d12f7940e079b052c Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sat, 15 Oct 2016 13:59:03 -0700
Subject: sunrpc & nfs: Add and use dprintk_cont macros

Allow line continuations to work properly with KERN_CONT.

Signed-off-by: Joe Perches <joe@perches.com>
[Anna: Add fallback dprintk_cont() for when CONFIG_SUNRPC_DEBUG=n]
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/write.c               |  6 ++---
 include/linux/sunrpc/debug.h | 58 ++++++++++++++++++++++++++++++--------------
 2 files changed, 43 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b00d53d13d47..ad4219a41f25 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1787,7 +1787,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
 		if (status < 0) {
 			nfs_context_set_write_error(req->wb_context, status);
 			nfs_inode_remove_request(req);
-			dprintk(", error = %d\n", status);
+			dprintk_cont(", error = %d\n", status);
 			goto next;
 		}
 
@@ -1796,11 +1796,11 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
 		if (!nfs_write_verifier_cmp(&req->wb_verf, &data->verf.verifier)) {
 			/* We have a match */
 			nfs_inode_remove_request(req);
-			dprintk(" OK\n");
+			dprintk_cont(" OK\n");
 			goto next;
 		}
 		/* We have a mismatch. Write the page again */
-		dprintk(" mismatch\n");
+		dprintk_cont(" mismatch\n");
 		nfs_mark_request_dirty(req);
 		set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags);
 	next:
diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h
index 59a7889e15db..8da0f37f3bdc 100644
--- a/include/linux/sunrpc/debug.h
+++ b/include/linux/sunrpc/debug.h
@@ -20,33 +20,55 @@ extern unsigned int		nfsd_debug;
 extern unsigned int		nlm_debug;
 #endif
 
-#define dprintk(args...)	dfprintk(FACILITY, ## args)
-#define dprintk_rcu(args...)	dfprintk_rcu(FACILITY, ## args)
+#define dprintk(fmt, ...)						\
+	dfprintk(FACILITY, fmt, ##__VA_ARGS__)
+#define dprintk_cont(fmt, ...)						\
+	dfprintk_cont(FACILITY, fmt, ##__VA_ARGS__)
+#define dprintk_rcu(fmt, ...)						\
+	dfprintk_rcu(FACILITY, fmt, ##__VA_ARGS__)
+#define dprintk_rcu_cont(fmt, ...)					\
+	dfprintk_rcu_cont(FACILITY, fmt, ##__VA_ARGS__)
 
 #undef ifdebug
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define ifdebug(fac)		if (unlikely(rpc_debug & RPCDBG_##fac))
 
-# define dfprintk(fac, args...)	\
-	do { \
-		ifdebug(fac) \
-			printk(KERN_DEFAULT args); \
-	} while (0)
-
-# define dfprintk_rcu(fac, args...)	\
-	do { \
-		ifdebug(fac) { \
-			rcu_read_lock(); \
-			printk(KERN_DEFAULT args); \
-			rcu_read_unlock(); \
-		} \
-	} while (0)
+# define dfprintk(fac, fmt, ...)					\
+do {									\
+	ifdebug(fac)							\
+		printk(KERN_DEFAULT fmt, ##__VA_ARGS__);		\
+} while (0)
+
+# define dfprintk_cont(fac, fmt, ...)					\
+do {									\
+	ifdebug(fac)							\
+		printk(KERN_CONT fmt, ##__VA_ARGS__);			\
+} while (0)
+
+# define dfprintk_rcu(fac, fmt, ...)					\
+do {									\
+	ifdebug(fac) {							\
+		rcu_read_lock();					\
+		printk(KERN_DEFAULT fmt, ##__VA_ARGS__);		\
+		rcu_read_unlock();					\
+	}								\
+} while (0)
+
+# define dfprintk_rcu_cont(fac, fmt, ...)				\
+do {									\
+	ifdebug(fac) {							\
+		rcu_read_lock();					\
+		printk(KERN_CONT fmt, ##__VA_ARGS__);			\
+		rcu_read_unlock();					\
+	}								\
+} while (0)
 
 # define RPC_IFDEBUG(x)		x
 #else
 # define ifdebug(fac)		if (0)
-# define dfprintk(fac, args...)	do {} while (0)
-# define dfprintk_rcu(fac, args...)	do {} while (0)
+# define dfprintk(fac, fmt, ...)	do {} while (0)
+# define dfprintk_cont(fac, fmt, ...)	do {} while (0)
+# define dfprintk_rcu(fac, fmt, ...)	do {} while (0)
 # define RPC_IFDEBUG(x)
 #endif
 
-- 
cgit v1.2.3


From 297e1cf29eac13d3e5bb896d18c64a50b6bb48eb Mon Sep 17 00:00:00 2001
From: Ariel Levkovich <lariel@mellanox.com>
Date: Sun, 29 Jan 2017 18:56:17 +0200
Subject: net/mlx4_en: Adding support of turning off link autonegotiation via
 ethtool

This feature will allow the user to disable auto negotiation
on the port for mlx4 devices while setting the speed is limited
to 1GbE speeds.
Other speeds will not be accepted in autoneg off mode.

This functionality is permitted providing that the firmware
is compatible with this feature.
The above is determined by querying a new dedicated capability
bit in the device.

Signed-off-by: Ariel Levkovich <lariel@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 24 +++++++++++++++++++-----
 include/linux/mlx4/device.h                     |  7 ++++++-
 2 files changed, 25 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
index 785757f17687..ca730d4abbb4 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
@@ -902,6 +902,7 @@ mlx4_en_set_link_ksettings(struct net_device *dev,
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	struct mlx4_ptys_reg ptys_reg;
 	__be32 proto_admin;
+	u8 cur_autoneg;
 	int ret;
 
 	u32 ptys_adv = ethtool2ptys_link_modes(
@@ -931,10 +932,21 @@ mlx4_en_set_link_ksettings(struct net_device *dev,
 		return 0;
 	}
 
-	proto_admin = link_ksettings->base.autoneg == AUTONEG_ENABLE ?
-		cpu_to_be32(ptys_adv) :
-		speed_set_ptys_admin(priv, speed,
-				     ptys_reg.eth_proto_cap);
+	cur_autoneg = ptys_reg.flags & MLX4_PTYS_AN_DISABLE_ADMIN ?
+				AUTONEG_DISABLE : AUTONEG_ENABLE;
+
+	if (link_ksettings->base.autoneg == AUTONEG_DISABLE) {
+		proto_admin = speed_set_ptys_admin(priv, speed,
+						   ptys_reg.eth_proto_cap);
+		if ((be32_to_cpu(proto_admin) &
+		     (MLX4_PROT_MASK(MLX4_1000BASE_CX_SGMII) |
+		      MLX4_PROT_MASK(MLX4_1000BASE_KX))) &&
+		    (ptys_reg.flags & MLX4_PTYS_AN_DISABLE_CAP))
+			ptys_reg.flags |= MLX4_PTYS_AN_DISABLE_ADMIN;
+	} else {
+		proto_admin = cpu_to_be32(ptys_adv);
+		ptys_reg.flags &= ~MLX4_PTYS_AN_DISABLE_ADMIN;
+	}
 
 	proto_admin &= ptys_reg.eth_proto_cap;
 	if (!proto_admin) {
@@ -942,7 +954,9 @@ mlx4_en_set_link_ksettings(struct net_device *dev,
 		return -EINVAL; /* nothing to change due to bad input */
 	}
 
-	if (proto_admin == ptys_reg.eth_proto_admin)
+	if ((proto_admin == ptys_reg.eth_proto_admin) &&
+	    ((ptys_reg.flags & MLX4_PTYS_AN_DISABLE_CAP) &&
+	     (link_ksettings->base.autoneg == cur_autoneg)))
 		return 0; /* Nothing to change */
 
 	en_dbg(DRV, priv, "mlx4_ACCESS_PTYS_REG SET: ptys_reg.eth_proto_admin = 0x%x\n",
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 6533c16e27ad..c3ac945b2759 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -1539,8 +1539,13 @@ enum mlx4_ptys_proto {
 	MLX4_PTYS_EN = 1<<2,
 };
 
+enum mlx4_ptys_flags {
+	MLX4_PTYS_AN_DISABLE_CAP   = 1 << 5,
+	MLX4_PTYS_AN_DISABLE_ADMIN = 1 << 6,
+};
+
 struct mlx4_ptys_reg {
-	u8 resrvd1;
+	u8 flags;
 	u8 local_port;
 	u8 resrvd2;
 	u8 proto_mask;
-- 
cgit v1.2.3


From 40fb4fc1e18bc641a0d0887ac21943fd194c1fa9 Mon Sep 17 00:00:00 2001
From: Shaker Daibes <shakerd@mellanox.com>
Date: Sun, 29 Jan 2017 18:56:18 +0200
Subject: net/mlx4_en: Pass user MTU value to Firmware at set port command

When starting the port, driver will inform Firmware about the actual MTU
which does not include implicit headers, such as FCS or VLAN tags.

Signed-off-by: Shaker Daibes <shakerd@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c |  8 +++
 drivers/net/ethernet/mellanox/mlx4/fw.c        |  2 +-
 drivers/net/ethernet/mellanox/mlx4/mlx4.h      |  7 ++-
 drivers/net/ethernet/mellanox/mlx4/port.c      | 71 ++++++++++++++++++++++++--
 include/linux/mlx4/device.h                    |  1 +
 5 files changed, 82 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index cfb4a9d67b45..60a021c34881 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -1695,6 +1695,14 @@ int mlx4_en_start_port(struct net_device *dev)
 		       priv->port, err);
 		goto tx_err;
 	}
+
+	err = mlx4_SET_PORT_user_mtu(mdev->dev, priv->port, dev->mtu);
+	if (err) {
+		en_err(priv, "Failed to pass user MTU(%d) to Firmware for port %d, with error %d\n",
+		       dev->mtu, priv->port, err);
+		goto tx_err;
+	}
+
 	/* Set default qp number */
 	err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port, priv->base_qpn, 0);
 	if (err) {
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index de39a5db4a6f..3fe885ce1902 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -2983,7 +2983,7 @@ static int mlx4_SET_PORT_phv_bit(struct mlx4_dev *dev, u8 port, u8 phv_bit)
 		return PTR_ERR(mailbox);
 	context = mailbox->buf;
 
-	context->v_ignore_fcs |=  SET_PORT_GEN_PHV_VALID;
+	context->flags2 |=  SET_PORT_GEN_PHV_VALID;
 	if (phv_bit)
 		context->phv_en |=  SET_PORT_GEN_PHV_EN;
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index 88ee7d8a5923..1132d76ddfdf 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -487,6 +487,7 @@ struct mlx4_slave_state {
 	bool vst_qinq_supported;
 	u8 function;
 	dma_addr_t vhcr_dma;
+	u16 user_mtu[MLX4_MAX_PORTS + 1];
 	u16 mtu[MLX4_MAX_PORTS + 1];
 	__be32 ib_cap_mask[MLX4_MAX_PORTS + 1];
 	struct mlx4_slave_eqe eq[MLX4_MFUNC_MAX_EQES];
@@ -590,6 +591,7 @@ struct mlx4_mfunc_master_ctx {
 	struct mlx4_master_qp0_state qp0_state[MLX4_MAX_PORTS + 1];
 	int			init_port_ref[MLX4_MAX_PORTS + 1];
 	u16			max_mtu[MLX4_MAX_PORTS + 1];
+	u16			max_user_mtu[MLX4_MAX_PORTS + 1];
 	u8			pptx;
 	u8			pprx;
 	int			disable_mcast_ref[MLX4_MAX_PORTS + 1];
@@ -787,7 +789,7 @@ enum {
 
 struct mlx4_set_port_general_context {
 	u16 reserved1;
-	u8 v_ignore_fcs;
+	u8 flags2;
 	u8 flags;
 	union {
 		u8 ignore_fcs;
@@ -803,7 +805,8 @@ struct mlx4_set_port_general_context {
 	u16 reserved4;
 	u32 reserved5;
 	u8 phv_en;
-	u8 reserved6[3];
+	u8 reserved6[5];
+	__be16 user_mtu;
 };
 
 struct mlx4_set_port_rqp_calc_context {
diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c b/drivers/net/ethernet/mellanox/mlx4/port.c
index b656dd5772e5..a7b0cdcb358a 100644
--- a/drivers/net/ethernet/mellanox/mlx4/port.c
+++ b/drivers/net/ethernet/mellanox/mlx4/port.c
@@ -50,7 +50,8 @@
 #define MLX4_STATS_ERROR_COUNTERS_MASK		0x1ffc30ULL
 #define MLX4_STATS_PORT_COUNTERS_MASK		0x1fe00000ULL
 
-#define MLX4_FLAG_V_IGNORE_FCS_MASK		0x2
+#define MLX4_FLAG2_V_IGNORE_FCS_MASK		BIT(1)
+#define MLX4_FLAG2_V_USER_MTU_MASK		BIT(5)
 #define MLX4_IGNORE_FCS_MASK			0x1
 #define MLX4_TC_MAX_NUMBER			8
 
@@ -1239,6 +1240,38 @@ void mlx4_reset_roce_gids(struct mlx4_dev *dev, int slave)
 	return;
 }
 
+static void
+mlx4_en_set_port_user_mtu(struct mlx4_dev *dev, int slave, int port,
+			  struct mlx4_set_port_general_context *gen_context)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_mfunc_master_ctx *master = &priv->mfunc.master;
+	struct mlx4_slave_state *slave_st = &master->slave_state[slave];
+	u16 user_mtu, prev_user_mtu;
+
+	/* User Mtu is configured as the max USER_MTU among all
+	 * the functions on the port.
+	 */
+	user_mtu = be16_to_cpu(gen_context->user_mtu);
+	user_mtu = min_t(int, user_mtu, dev->caps.eth_mtu_cap[port]);
+	prev_user_mtu = slave_st->user_mtu[port];
+	slave_st->user_mtu[port] = user_mtu;
+	if (user_mtu > master->max_user_mtu[port])
+		master->max_user_mtu[port] = user_mtu;
+	if (user_mtu < prev_user_mtu &&
+	    prev_user_mtu == master->max_user_mtu[port]) {
+		int i;
+
+		slave_st->user_mtu[port] = user_mtu;
+		master->max_user_mtu[port] = user_mtu;
+		for (i = 0; i < dev->num_slaves; i++)
+			master->max_user_mtu[port] =
+				max_t(u16, master->max_user_mtu[port],
+				      master->slave_state[i].user_mtu[port]);
+	}
+	gen_context->user_mtu = cpu_to_be16(master->max_user_mtu[port]);
+}
+
 static int mlx4_common_set_port(struct mlx4_dev *dev, int slave, u32 in_mod,
 				u8 op_mod, struct mlx4_cmd_mailbox *inbox)
 {
@@ -1269,7 +1302,9 @@ static int mlx4_common_set_port(struct mlx4_dev *dev, int slave, u32 in_mod,
 	is_eth = op_mod;
 	port_info = &priv->port[port];
 
-	/* Slaves cannot perform SET_PORT operations except changing MTU */
+	/* Slaves cannot perform SET_PORT operations,
+	 * except for changing MTU and USER_MTU.
+	 */
 	if (is_eth) {
 		if (slave != dev->caps.function &&
 		    in_modifier != MLX4_SET_PORT_GENERAL &&
@@ -1316,8 +1351,12 @@ static int mlx4_common_set_port(struct mlx4_dev *dev, int slave, u32 in_mod,
 					    master->slave_state[i].mtu[port]);
 				}
 			}
-
 			gen_context->mtu = cpu_to_be16(master->max_mtu[port]);
+
+			if (gen_context->flags2 & MLX4_FLAG2_V_USER_MTU_MASK)
+				mlx4_en_set_port_user_mtu(dev, slave, port,
+							  gen_context);
+
 			/* Slave cannot change Global Pause configuration */
 			if (slave != mlx4_master_func_num(dev) &&
 			    ((gen_context->pptx != master->pptx) ||
@@ -1608,6 +1647,30 @@ int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn,
 }
 EXPORT_SYMBOL(mlx4_SET_PORT_qpn_calc);
 
+int mlx4_SET_PORT_user_mtu(struct mlx4_dev *dev, u8 port, u16 user_mtu)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_port_general_context *context;
+	u32 in_mod;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	context = mailbox->buf;
+	context->flags2 |= MLX4_FLAG2_V_USER_MTU_MASK;
+	context->user_mtu = cpu_to_be16(user_mtu);
+
+	in_mod = MLX4_SET_PORT_GENERAL << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE,
+		       MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+		       MLX4_CMD_WRAPPED);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_SET_PORT_user_mtu);
+
 int mlx4_SET_PORT_fcs_check(struct mlx4_dev *dev, u8 port, u8 ignore_fcs_value)
 {
 	struct mlx4_cmd_mailbox *mailbox;
@@ -1619,7 +1682,7 @@ int mlx4_SET_PORT_fcs_check(struct mlx4_dev *dev, u8 port, u8 ignore_fcs_value)
 	if (IS_ERR(mailbox))
 		return PTR_ERR(mailbox);
 	context = mailbox->buf;
-	context->v_ignore_fcs |= MLX4_FLAG_V_IGNORE_FCS_MASK;
+	context->flags2 |= MLX4_FLAG2_V_IGNORE_FCS_MASK;
 	if (ignore_fcs_value)
 		context->ignore_fcs |= MLX4_IGNORE_FCS_MASK;
 	else
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index c3ac945b2759..7e66e4f62858 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -1374,6 +1374,7 @@ int mlx4_get_base_qpn(struct mlx4_dev *dev, u8 port);
 int __mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac);
 int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu,
 			  u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx);
+int mlx4_SET_PORT_user_mtu(struct mlx4_dev *dev, u8 port, u16 user_mtu);
 int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn,
 			   u8 promisc);
 int mlx4_SET_PORT_BEACON(struct mlx4_dev *dev, u8 port, u16 time);
-- 
cgit v1.2.3


From 9e9fc6f00a54f7064dc681ac187be6498d566a4f Mon Sep 17 00:00:00 2001
From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>
Date: Wed, 25 Jan 2017 14:06:27 +0530
Subject: cpuidle:powernv: Add helper function to populate powernv idle states.

In the current code for powernv_add_idle_states, there is a lot of code
duplication while initializing an idle state in powernv_states table.

Add an inline helper function to populate the powernv_states[] table
for a given idle state. Invoke this for populating the "Nap",
"Fastsleep" and the stop states in powernv_add_idle_states.

Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Acked-by: Balbir Singh <bsingharora@gmail.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/cpuidle/cpuidle-powernv.c | 89 +++++++++++++++++++++++----------------
 include/linux/cpuidle.h           |  1 +
 2 files changed, 54 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c
index 0835a37a5f3a..6871b7f34dc8 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -20,6 +20,10 @@
 #include <asm/opal.h>
 #include <asm/runlatch.h>
 
+/*
+ * Expose only those Hardware idle states via the cpuidle framework
+ * that have latency value below POWERNV_THRESHOLD_LATENCY_NS.
+ */
 #define POWERNV_THRESHOLD_LATENCY_NS 200000
 
 static struct cpuidle_driver powernv_idle_driver = {
@@ -167,6 +171,24 @@ static int powernv_cpuidle_driver_init(void)
 	return 0;
 }
 
+static inline void add_powernv_state(int index, const char *name,
+				     unsigned int flags,
+				     int (*idle_fn)(struct cpuidle_device *,
+						    struct cpuidle_driver *,
+						    int),
+				     unsigned int target_residency,
+				     unsigned int exit_latency,
+				     u64 psscr_val)
+{
+	strlcpy(powernv_states[index].name, name, CPUIDLE_NAME_LEN);
+	strlcpy(powernv_states[index].desc, name, CPUIDLE_NAME_LEN);
+	powernv_states[index].flags = flags;
+	powernv_states[index].target_residency = target_residency;
+	powernv_states[index].exit_latency = exit_latency;
+	powernv_states[index].enter = idle_fn;
+	stop_psscr_table[index] = psscr_val;
+}
+
 static int powernv_add_idle_states(void)
 {
 	struct device_node *power_mgt;
@@ -236,6 +258,7 @@ static int powernv_add_idle_states(void)
 		"ibm,cpu-idle-state-residency-ns", residency_ns, dt_idle_states);
 
 	for (i = 0; i < dt_idle_states; i++) {
+		unsigned int exit_latency, target_residency;
 		/*
 		 * If an idle state has exit latency beyond
 		 * POWERNV_THRESHOLD_LATENCY_NS then don't use it
@@ -243,28 +266,33 @@ static int powernv_add_idle_states(void)
 		 */
 		if (latency_ns[i] > POWERNV_THRESHOLD_LATENCY_NS)
 			continue;
+		/*
+		 * Firmware passes residency and latency values in ns.
+		 * cpuidle expects it in us.
+		 */
+		exit_latency = latency_ns[i] / 1000;
+		if (!rc)
+			target_residency = residency_ns[i] / 1000;
+		else
+			target_residency = 0;
 
 		/*
-		 * Cpuidle accepts exit_latency and target_residency in us.
-		 * Use default target_residency values if f/w does not expose it.
+		 * For nap and fastsleep, use default target_residency
+		 * values if f/w does not expose it.
 		 */
 		if (flags[i] & OPAL_PM_NAP_ENABLED) {
+			if (!rc)
+				target_residency = 100;
 			/* Add NAP state */
-			strcpy(powernv_states[nr_idle_states].name, "Nap");
-			strcpy(powernv_states[nr_idle_states].desc, "Nap");
-			powernv_states[nr_idle_states].flags = 0;
-			powernv_states[nr_idle_states].target_residency = 100;
-			powernv_states[nr_idle_states].enter = nap_loop;
+			add_powernv_state(nr_idle_states, "Nap",
+					  CPUIDLE_FLAG_NONE, nap_loop,
+					  target_residency, exit_latency, 0);
 		} else if ((flags[i] & OPAL_PM_STOP_INST_FAST) &&
 				!(flags[i] & OPAL_PM_TIMEBASE_STOP)) {
-			strncpy(powernv_states[nr_idle_states].name,
-				names[i], CPUIDLE_NAME_LEN);
-			strncpy(powernv_states[nr_idle_states].desc,
-				names[i], CPUIDLE_NAME_LEN);
-			powernv_states[nr_idle_states].flags = 0;
-
-			powernv_states[nr_idle_states].enter = stop_loop;
-			stop_psscr_table[nr_idle_states] = psscr_val[i];
+			add_powernv_state(nr_idle_states, names[i],
+					  CPUIDLE_FLAG_NONE, stop_loop,
+					  target_residency, exit_latency,
+					  psscr_val[i]);
 		}
 
 		/*
@@ -274,32 +302,21 @@ static int powernv_add_idle_states(void)
 #ifdef CONFIG_TICK_ONESHOT
 		if (flags[i] & OPAL_PM_SLEEP_ENABLED ||
 			flags[i] & OPAL_PM_SLEEP_ENABLED_ER1) {
+			if (!rc)
+				target_residency = 300000;
 			/* Add FASTSLEEP state */
-			strcpy(powernv_states[nr_idle_states].name, "FastSleep");
-			strcpy(powernv_states[nr_idle_states].desc, "FastSleep");
-			powernv_states[nr_idle_states].flags = CPUIDLE_FLAG_TIMER_STOP;
-			powernv_states[nr_idle_states].target_residency = 300000;
-			powernv_states[nr_idle_states].enter = fastsleep_loop;
+			add_powernv_state(nr_idle_states, "FastSleep",
+					  CPUIDLE_FLAG_TIMER_STOP,
+					  fastsleep_loop,
+					  target_residency, exit_latency, 0);
 		} else if ((flags[i] & OPAL_PM_STOP_INST_DEEP) &&
 				(flags[i] & OPAL_PM_TIMEBASE_STOP)) {
-			strncpy(powernv_states[nr_idle_states].name,
-				names[i], CPUIDLE_NAME_LEN);
-			strncpy(powernv_states[nr_idle_states].desc,
-				names[i], CPUIDLE_NAME_LEN);
-
-			powernv_states[nr_idle_states].flags = CPUIDLE_FLAG_TIMER_STOP;
-			powernv_states[nr_idle_states].enter = stop_loop;
-			stop_psscr_table[nr_idle_states] = psscr_val[i];
+			add_powernv_state(nr_idle_states, names[i],
+					  CPUIDLE_FLAG_TIMER_STOP, stop_loop,
+					  target_residency, exit_latency,
+					  psscr_val[i]);
 		}
 #endif
-		powernv_states[nr_idle_states].exit_latency =
-				((unsigned int)latency_ns[i]) / 1000;
-
-		if (!rc) {
-			powernv_states[nr_idle_states].target_residency =
-				((unsigned int)residency_ns[i]) / 1000;
-		}
-
 		nr_idle_states++;
 	}
 out:
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index da346f2817a8..fc1e5d7fc1c7 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -62,6 +62,7 @@ struct cpuidle_state {
 };
 
 /* Idle State Flags */
+#define CPUIDLE_FLAG_NONE       (0x00)
 #define CPUIDLE_FLAG_COUPLED	(0x02) /* state applies to multiple cpus */
 #define CPUIDLE_FLAG_TIMER_STOP (0x04)  /* timer is stopped on this state */
 
-- 
cgit v1.2.3


From 608d792192d7136d354bc1b44585c441164dc54e Mon Sep 17 00:00:00 2001
From: Sarangdhar Joshi <spjoshi@codeaurora.org>
Date: Mon, 23 Jan 2017 17:53:18 -0800
Subject: remoteproc: Add RPROC_DELETED state

Add new state RPROC_DELETED to handle synchronization
between rproc_del() and other operations on rproc. This
state represents the rproc device that has been "deleted".

CC: Loic Pallardy <loic.pallardy@st.com>
CC: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Sarangdhar Joshi <spjoshi@codeaurora.org>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 drivers/remoteproc/remoteproc_sysfs.c | 1 +
 include/linux/remoteproc.h            | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/remoteproc/remoteproc_sysfs.c b/drivers/remoteproc/remoteproc_sysfs.c
index bc5b0e00efb1..47be411400e5 100644
--- a/drivers/remoteproc/remoteproc_sysfs.c
+++ b/drivers/remoteproc/remoteproc_sysfs.c
@@ -73,6 +73,7 @@ static const char * const rproc_state_string[] = {
 	[RPROC_SUSPENDED]	= "suspended",
 	[RPROC_RUNNING]		= "running",
 	[RPROC_CRASHED]		= "crashed",
+	[RPROC_DELETED]		= "deleted",
 	[RPROC_LAST]		= "invalid",
 };
 
diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index 8265d351c9f0..e691f64fc7be 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -346,6 +346,7 @@ struct rproc_ops {
  *			a message.
  * @RPROC_RUNNING:	device is up and running
  * @RPROC_CRASHED:	device has crashed; need to start recovery
+ * @RPROC_DELETED:	device is deleted
  * @RPROC_LAST:		just keep this one at the end
  *
  * Please note that the values of these states are used as indices
@@ -359,7 +360,8 @@ enum rproc_state {
 	RPROC_SUSPENDED	= 1,
 	RPROC_RUNNING	= 2,
 	RPROC_CRASHED	= 3,
-	RPROC_LAST	= 4,
+	RPROC_DELETED	= 4,
+	RPROC_LAST	= 5,
 };
 
 /**
-- 
cgit v1.2.3


From 2099c77d4af7d1582db6d4437014cf18fe62e74b Mon Sep 17 00:00:00 2001
From: Sarangdhar Joshi <spjoshi@codeaurora.org>
Date: Mon, 23 Jan 2017 17:53:19 -0800
Subject: remoteproc: Drop firmware_loading_complete

firmware_loading_complete is used to synchronize operations
on rproc while asynchronous firmware loading is in progress.
However, rproc_boot() no longer waits on
firmware_loading_complete. Hence drop this completion
variable altogether and handle the race between rproc_del()
and rproc_boot() using new state RPROC_DELETED.

The request_firmware_nowait() will hold the reference to
rproc device by using a get_device()/put_device(), so the
rproc struct will remain valid even when we return from
rproc_del() before the asynchronous call to
rproc_fw_config_virtio() completes.

CC: Loic Pallardy <loic.pallardy@st.com>
CC: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Sarangdhar Joshi <spjoshi@codeaurora.org>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 drivers/remoteproc/remoteproc_core.c | 22 +++++++++++-----------
 include/linux/remoteproc.h           |  2 --
 2 files changed, 11 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index 44f119a671fe..a7ee05006cca 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -977,17 +977,12 @@ static void rproc_fw_config_virtio(const struct firmware *fw, void *context)
 		rproc_boot(rproc);
 
 	release_firmware(fw);
-	/* allow rproc_del() contexts, if any, to proceed */
-	complete_all(&rproc->firmware_loading_complete);
 }
 
 static int rproc_add_virtio_devices(struct rproc *rproc)
 {
 	int ret;
 
-	/* rproc_del() calls must wait until async loader completes */
-	init_completion(&rproc->firmware_loading_complete);
-
 	/*
 	 * We must retrieve early virtio configuration info from
 	 * the firmware (e.g. whether to register a virtio device,
@@ -999,10 +994,8 @@ static int rproc_add_virtio_devices(struct rproc *rproc)
 	ret = request_firmware_nowait(THIS_MODULE, FW_ACTION_HOTPLUG,
 				      rproc->firmware, &rproc->dev, GFP_KERNEL,
 				      rproc, rproc_fw_config_virtio);
-	if (ret < 0) {
+	if (ret < 0)
 		dev_err(&rproc->dev, "request_firmware_nowait err: %d\n", ret);
-		complete_all(&rproc->firmware_loading_complete);
-	}
 
 	return ret;
 }
@@ -1099,6 +1092,12 @@ static int __rproc_boot(struct rproc *rproc)
 		return ret;
 	}
 
+	if (rproc->state == RPROC_DELETED) {
+		ret = -ENODEV;
+		dev_err(dev, "can't boot deleted rproc %s\n", rproc->name);
+		goto unlock_mutex;
+	}
+
 	/* skip the boot process if rproc is already powered up */
 	if (atomic_inc_return(&rproc->power) > 1) {
 		ret = 0;
@@ -1481,14 +1480,15 @@ int rproc_del(struct rproc *rproc)
 	if (!rproc)
 		return -EINVAL;
 
-	/* if rproc is just being registered, wait */
-	wait_for_completion(&rproc->firmware_loading_complete);
-
 	/* if rproc is marked always-on, rproc_add() booted it */
 	/* TODO: make sure this works with rproc->power > 1 */
 	if (rproc->auto_boot)
 		rproc_shutdown(rproc);
 
+	mutex_lock(&rproc->lock);
+	rproc->state = RPROC_DELETED;
+	mutex_unlock(&rproc->lock);
+
 	rproc_delete_debug_dir(rproc);
 
 	/* the rproc is downref'ed as soon as it's removed from the klist */
diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index e691f64fc7be..81da49564ff4 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -399,7 +399,6 @@ enum rproc_crash_type {
  * @num_traces: number of trace buffers
  * @carveouts: list of physically contiguous memory allocations
  * @mappings: list of iommu mappings we initiated, needed on shutdown
- * @firmware_loading_complete: marks e/o asynchronous firmware loading
  * @bootaddr: address of first instruction to boot rproc with (optional)
  * @rvdevs: list of remote virtio devices
  * @subdevs: list of subdevices, to following the running state
@@ -431,7 +430,6 @@ struct rproc {
 	int num_traces;
 	struct list_head carveouts;
 	struct list_head mappings;
-	struct completion firmware_loading_complete;
 	u32 bootaddr;
 	struct list_head rvdevs;
 	struct list_head subdevs;
-- 
cgit v1.2.3


From 8ff6daa17b6a64e59bbabaa116b9bd854fa4da1f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 27 Jan 2017 23:20:26 -0800
Subject: iomap: constify struct iomap_ops

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/dax.c              |  6 +++---
 fs/ext2/ext2.h        |  2 +-
 fs/ext2/inode.c       |  4 ++--
 fs/ext4/ext4.h        |  2 +-
 fs/ext4/inode.c       |  2 +-
 fs/internal.h         |  2 +-
 fs/iomap.c            | 18 +++++++++---------
 fs/xfs/xfs_iomap.c    |  4 ++--
 fs/xfs/xfs_iomap.h    |  4 ++--
 include/linux/dax.h   |  8 ++++----
 include/linux/iomap.h | 14 +++++++-------
 11 files changed, 33 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dax.c b/fs/dax.c
index 3af2da5e64ce..78b9651576c6 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1074,7 +1074,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
  */
 ssize_t
 dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
-		struct iomap_ops *ops)
+		const struct iomap_ops *ops)
 {
 	struct address_space *mapping = iocb->ki_filp->f_mapping;
 	struct inode *inode = mapping->host;
@@ -1118,7 +1118,7 @@ static int dax_fault_return(int error)
  * necessary locking for the page fault to proceed successfully.
  */
 int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-			struct iomap_ops *ops)
+			const struct iomap_ops *ops)
 {
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
@@ -1317,7 +1317,7 @@ static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
 }
 
 int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
-		pmd_t *pmd, unsigned int flags, struct iomap_ops *ops)
+		pmd_t *pmd, unsigned int flags, const struct iomap_ops *ops)
 {
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	unsigned long pmd_addr = address & PMD_MASK;
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 37e2be784ac7..5e64de9c5093 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -814,7 +814,7 @@ extern const struct file_operations ext2_file_operations;
 /* inode.c */
 extern const struct address_space_operations ext2_aops;
 extern const struct address_space_operations ext2_nobh_aops;
-extern struct iomap_ops ext2_iomap_ops;
+extern const struct iomap_ops ext2_iomap_ops;
 
 /* namei.c */
 extern const struct inode_operations ext2_dir_inode_operations;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index f073bfca694b..128cce540645 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -842,13 +842,13 @@ ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length,
 	return 0;
 }
 
-struct iomap_ops ext2_iomap_ops = {
+const struct iomap_ops ext2_iomap_ops = {
 	.iomap_begin		= ext2_iomap_begin,
 	.iomap_end		= ext2_iomap_end,
 };
 #else
 /* Define empty ops for !CONFIG_FS_DAX case to avoid ugly ifdefs */
-struct iomap_ops ext2_iomap_ops;
+const struct iomap_ops ext2_iomap_ops;
 #endif /* CONFIG_FS_DAX */
 
 int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2163c1e69f2a..ce70403c4707 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3253,7 +3253,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
 	}
 }
 
-extern struct iomap_ops ext4_iomap_ops;
+extern const struct iomap_ops ext4_iomap_ops;
 
 #endif	/* __KERNEL__ */
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 88d57af1b516..96c2e12cc5d6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3420,7 +3420,7 @@ orphan_del:
 	return ret;
 }
 
-struct iomap_ops ext4_iomap_ops = {
+const struct iomap_ops ext4_iomap_ops = {
 	.iomap_begin		= ext4_iomap_begin,
 	.iomap_end		= ext4_iomap_end,
 };
diff --git a/fs/internal.h b/fs/internal.h
index b63cf3af2dc2..11c6d89dce9c 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -182,7 +182,7 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
 		void *data, struct iomap *iomap);
 
 loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
-		unsigned flags, struct iomap_ops *ops, void *data,
+		unsigned flags, const struct iomap_ops *ops, void *data,
 		iomap_actor_t actor);
 
 /* direct-io.c: */
diff --git a/fs/iomap.c b/fs/iomap.c
index 354a123f170e..7f08ca03d95d 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -41,7 +41,7 @@
  */
 loff_t
 iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
-		struct iomap_ops *ops, void *data, iomap_actor_t actor)
+		const struct iomap_ops *ops, void *data, iomap_actor_t actor)
 {
 	struct iomap iomap = { 0 };
 	loff_t written = 0, ret;
@@ -232,7 +232,7 @@ again:
 
 ssize_t
 iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
-		struct iomap_ops *ops)
+		const struct iomap_ops *ops)
 {
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	loff_t pos = iocb->ki_pos, ret = 0, written = 0;
@@ -315,7 +315,7 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 
 int
 iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
-		struct iomap_ops *ops)
+		const struct iomap_ops *ops)
 {
 	loff_t ret;
 
@@ -395,7 +395,7 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
 
 int
 iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
-		struct iomap_ops *ops)
+		const struct iomap_ops *ops)
 {
 	loff_t ret;
 
@@ -415,7 +415,7 @@ EXPORT_SYMBOL_GPL(iomap_zero_range);
 
 int
 iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
-		struct iomap_ops *ops)
+		const struct iomap_ops *ops)
 {
 	unsigned blocksize = (1 << inode->i_blkbits);
 	unsigned off = pos & (blocksize - 1);
@@ -443,7 +443,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
 }
 
 int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
-		struct iomap_ops *ops)
+		const struct iomap_ops *ops)
 {
 	struct page *page = vmf->page;
 	struct inode *inode = file_inode(vma->vm_file);
@@ -542,7 +542,7 @@ iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 }
 
 int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
-		loff_t start, loff_t len, struct iomap_ops *ops)
+		loff_t start, loff_t len, const struct iomap_ops *ops)
 {
 	struct fiemap_ctx ctx;
 	loff_t ret;
@@ -836,8 +836,8 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
 }
 
 ssize_t
-iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops,
-		iomap_dio_end_io_t end_io)
+iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
+		const struct iomap_ops *ops, iomap_dio_end_io_t end_io)
 {
 	struct address_space *mapping = iocb->ki_filp->f_mapping;
 	struct inode *inode = file_inode(iocb->ki_filp);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 1aa3abd67b36..25ed98324b27 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1144,7 +1144,7 @@ xfs_file_iomap_end(
 	return 0;
 }
 
-struct iomap_ops xfs_iomap_ops = {
+const struct iomap_ops xfs_iomap_ops = {
 	.iomap_begin		= xfs_file_iomap_begin,
 	.iomap_end		= xfs_file_iomap_end,
 };
@@ -1190,6 +1190,6 @@ out_unlock:
 	return error;
 }
 
-struct iomap_ops xfs_xattr_iomap_ops = {
+const struct iomap_ops xfs_xattr_iomap_ops = {
 	.iomap_begin		= xfs_xattr_iomap_begin,
 };
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 6d45cf01fcff..705224b66b6a 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -33,7 +33,7 @@ void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
 		struct xfs_bmbt_irec *);
 xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize);
 
-extern struct iomap_ops xfs_iomap_ops;
-extern struct iomap_ops xfs_xattr_iomap_ops;
+extern const struct iomap_ops xfs_iomap_ops;
+extern const struct iomap_ops xfs_xattr_iomap_ops;
 
 #endif /* __XFS_IOMAP_H__*/
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 24ad71173995..2983e52efd07 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -37,9 +37,9 @@ static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
 }
 
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
-		struct iomap_ops *ops);
+		const struct iomap_ops *ops);
 int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-			struct iomap_ops *ops);
+			const struct iomap_ops *ops);
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index);
 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
@@ -72,7 +72,7 @@ static inline unsigned int dax_radix_order(void *entry)
 	return 0;
 }
 int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
-		pmd_t *pmd, unsigned int flags, struct iomap_ops *ops);
+		pmd_t *pmd, unsigned int flags, const struct iomap_ops *ops);
 #else
 static inline unsigned int dax_radix_order(void *entry)
 {
@@ -80,7 +80,7 @@ static inline unsigned int dax_radix_order(void *entry)
 }
 static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma,
 		unsigned long address, pmd_t *pmd, unsigned int flags,
-		struct iomap_ops *ops)
+		const struct iomap_ops *ops)
 {
 	return VM_FAULT_FALLBACK;
 }
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index a4c94b86401e..891459caa278 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -72,17 +72,17 @@ struct iomap_ops {
 };
 
 ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
-		struct iomap_ops *ops);
+		const struct iomap_ops *ops);
 int iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
-		struct iomap_ops *ops);
+		const struct iomap_ops *ops);
 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
-		bool *did_zero, struct iomap_ops *ops);
+		bool *did_zero, const struct iomap_ops *ops);
 int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
-		struct iomap_ops *ops);
+		const struct iomap_ops *ops);
 int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
-		struct iomap_ops *ops);
+		const struct iomap_ops *ops);
 int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
-		loff_t start, loff_t len, struct iomap_ops *ops);
+		loff_t start, loff_t len, const struct iomap_ops *ops);
 
 /*
  * Flags for direct I/O ->end_io:
@@ -92,6 +92,6 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 typedef int (iomap_dio_end_io_t)(struct kiocb *iocb, ssize_t ret,
 		unsigned flags);
 ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
-		struct iomap_ops *ops, iomap_dio_end_io_t end_io);
+		const struct iomap_ops *ops, iomap_dio_end_io_t end_io);
 
 #endif /* LINUX_IOMAP_H */
-- 
cgit v1.2.3


From 77d65d6f3d60cebb2dc24cf05408255a21bb6409 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Fri, 27 Jan 2017 17:42:01 +0100
Subject: dmaengine: Provide a wrapper for memcpy operations

Almost all ->device_prep_dma_xx() methods have a wrapper defined in
dmaengine.h. Add one for  ->device_prep_dma_memcpy().

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 include/linux/dmaengine.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index feee6ec6a13b..533680860865 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -894,6 +894,17 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memset(
 						    len, flags);
 }
 
+static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memcpy(
+		struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
+		size_t len, unsigned long flags)
+{
+	if (!chan || !chan->device || !chan->device->device_prep_dma_memcpy)
+		return NULL;
+
+	return chan->device->device_prep_dma_memcpy(chan, dest, src,
+						    len, flags);
+}
+
 static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_sg(
 		struct dma_chan *chan,
 		struct scatterlist *dst_sg, unsigned int dst_nents,
-- 
cgit v1.2.3


From bcf23c79c4e46130701370af4383b61a3cba755c Mon Sep 17 00:00:00 2001
From: Chanwoo Choi <cw00.choi@samsung.com>
Date: Tue, 31 Jan 2017 15:38:16 +0900
Subject: PM / devfreq: Fix available_governor sysfs

The devfreq using passive governor is not able to change the governor.
So, the user can not change the governor through 'available_governor' sysfs
entry. Also, the devfreq which don't use the passive governor is not able to
change to 'passive' governor on the fly.

Fixes: 996133119f57 ("PM / devfreq: Add new passive governor")
Cc: stable@vger.kernel.org
Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
---
 drivers/devfreq/devfreq.c          | 31 +++++++++++++++++++++++++++----
 drivers/devfreq/governor_passive.c |  1 +
 include/linux/devfreq.h            |  3 +++
 3 files changed, 31 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c
index 8e5938c9c7d6..ade279d29f1e 100644
--- a/drivers/devfreq/devfreq.c
+++ b/drivers/devfreq/devfreq.c
@@ -940,6 +940,9 @@ static ssize_t governor_store(struct device *dev, struct device_attribute *attr,
 	if (df->governor == governor) {
 		ret = 0;
 		goto out;
+	} else if (df->governor->immutable || governor->immutable) {
+		ret = -EINVAL;
+		goto out;
 	}
 
 	if (df->governor) {
@@ -969,13 +972,33 @@ static ssize_t available_governors_show(struct device *d,
 					struct device_attribute *attr,
 					char *buf)
 {
-	struct devfreq_governor *tmp_governor;
+	struct devfreq *df = to_devfreq(d);
 	ssize_t count = 0;
 
 	mutex_lock(&devfreq_list_lock);
-	list_for_each_entry(tmp_governor, &devfreq_governor_list, node)
-		count += scnprintf(&buf[count], (PAGE_SIZE - count - 2),
-				   "%s ", tmp_governor->name);
+
+	/*
+	 * The devfreq with immutable governor (e.g., passive) shows
+	 * only own governor.
+	 */
+	if (df->governor->immutable) {
+		count = scnprintf(&buf[count], DEVFREQ_NAME_LEN,
+				   "%s ", df->governor_name);
+	/*
+	 * The devfreq device shows the registered governor except for
+	 * immutable governors such as passive governor .
+	 */
+	} else {
+		struct devfreq_governor *governor;
+
+		list_for_each_entry(governor, &devfreq_governor_list, node) {
+			if (governor->immutable)
+				continue;
+			count += scnprintf(&buf[count], (PAGE_SIZE - count - 2),
+					   "%s ", governor->name);
+		}
+	}
+
 	mutex_unlock(&devfreq_list_lock);
 
 	/* Truncate the trailing space */
diff --git a/drivers/devfreq/governor_passive.c b/drivers/devfreq/governor_passive.c
index 9ef46e2592c4..93795b32dc09 100644
--- a/drivers/devfreq/governor_passive.c
+++ b/drivers/devfreq/governor_passive.c
@@ -179,6 +179,7 @@ static int devfreq_passive_event_handler(struct devfreq *devfreq,
 
 static struct devfreq_governor devfreq_passive = {
 	.name = "passive",
+	.immutable = 1,
 	.get_target_freq = devfreq_passive_get_target_freq,
 	.event_handler = devfreq_passive_event_handler,
 };
diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h
index 2de4e2eea180..e0acb0e5243b 100644
--- a/include/linux/devfreq.h
+++ b/include/linux/devfreq.h
@@ -104,6 +104,8 @@ struct devfreq_dev_profile {
  * struct devfreq_governor - Devfreq policy governor
  * @node:		list node - contains registered devfreq governors
  * @name:		Governor's name
+ * @immutable:		Immutable flag for governor. If the value is 1,
+ *			this govenror is never changeable to other governor.
  * @get_target_freq:	Returns desired operating frequency for the device.
  *			Basically, get_target_freq will run
  *			devfreq_dev_profile.get_dev_status() to get the
@@ -121,6 +123,7 @@ struct devfreq_governor {
 	struct list_head node;
 
 	const char name[DEVFREQ_NAME_LEN];
+	const unsigned int immutable;
 	int (*get_target_freq)(struct devfreq *this, unsigned long *freq);
 	int (*event_handler)(struct devfreq *devfreq,
 				unsigned int event, void *data);
-- 
cgit v1.2.3


From a1656454131880980bc3a5313c8bf66ef5990c91 Mon Sep 17 00:00:00 2001
From: Alex Ng <alexng@messages.microsoft.com>
Date: Sat, 28 Jan 2017 12:37:17 -0700
Subject: Drivers: hv: vmbus: Use all supported IC versions to negotiate

Previously, we were assuming that each IC protocol version was tied to a
specific host version. For example, some Windows 10 preview hosts only
support v3 TimeSync even though driver assumes v4 is supported by all
Windows 10 hosts.

The guest will stop trying to negotiate even though older supported
versions may still be offered by the host.

Make IC version negotiation more robust by going through all versions
that are supported by the guest.

Fixes: 3da0401b4d0e ("Drivers: hv: utils: Fix the mapping between host
version and protocol to use")

Reported-by: Rolf Neugebauer <rolf.neugebauer@docker.com>
Signed-off-by: Alex Ng <alexng@messages.microsoft.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/channel_mgmt.c | 80 +++++++++++++++++++++++++++-------------
 drivers/hv/hv_fcopy.c     | 20 +++++++---
 drivers/hv/hv_kvp.c       | 41 +++++++++------------
 drivers/hv/hv_snapshot.c  | 18 +++++++--
 drivers/hv/hv_util.c      | 94 +++++++++++++++++++++++++----------------------
 include/linux/hyperv.h    |  7 ++--
 6 files changed, 154 insertions(+), 106 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 49d77be90ca4..de90a9900fee 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -204,33 +204,34 @@ static u16 hv_get_dev_type(const struct vmbus_channel *channel)
  * @buf: Raw buffer channel data
  *
  * @icmsghdrp is of type &struct icmsg_hdr.
- * @negop is of type &struct icmsg_negotiate.
  * Set up and fill in default negotiate response message.
  *
- * The fw_version specifies the  framework version that
- * we can support and srv_version specifies the service
- * version we can support.
+ * The fw_version and fw_vercnt specifies the framework version that
+ * we can support.
+ *
+ * The srv_version and srv_vercnt specifies the service
+ * versions we can support.
+ *
+ * Versions are given in decreasing order.
+ *
+ * nego_fw_version and nego_srv_version store the selected protocol versions.
  *
  * Mainly used by Hyper-V drivers.
  */
 bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp,
-				struct icmsg_negotiate *negop, u8 *buf,
-				int fw_version, int srv_version)
+				u8 *buf, const int *fw_version, int fw_vercnt,
+				const int *srv_version, int srv_vercnt,
+				int *nego_fw_version, int *nego_srv_version)
 {
 	int icframe_major, icframe_minor;
 	int icmsg_major, icmsg_minor;
 	int fw_major, fw_minor;
 	int srv_major, srv_minor;
-	int i;
+	int i, j;
 	bool found_match = false;
+	struct icmsg_negotiate *negop;
 
 	icmsghdrp->icmsgsize = 0x10;
-	fw_major = (fw_version >> 16);
-	fw_minor = (fw_version & 0xFFFF);
-
-	srv_major = (srv_version >> 16);
-	srv_minor = (srv_version & 0xFFFF);
-
 	negop = (struct icmsg_negotiate *)&buf[
 		sizeof(struct vmbuspipe_hdr) +
 		sizeof(struct icmsg_hdr)];
@@ -246,13 +247,22 @@ bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp,
 	 * support.
 	 */
 
-	for (i = 0; i < negop->icframe_vercnt; i++) {
-		if ((negop->icversion_data[i].major == fw_major) &&
-		   (negop->icversion_data[i].minor == fw_minor)) {
-			icframe_major = negop->icversion_data[i].major;
-			icframe_minor = negop->icversion_data[i].minor;
-			found_match = true;
+	for (i = 0; i < fw_vercnt; i++) {
+		fw_major = (fw_version[i] >> 16);
+		fw_minor = (fw_version[i] & 0xFFFF);
+
+		for (j = 0; j < negop->icframe_vercnt; j++) {
+			if ((negop->icversion_data[j].major == fw_major) &&
+			    (negop->icversion_data[j].minor == fw_minor)) {
+				icframe_major = negop->icversion_data[j].major;
+				icframe_minor = negop->icversion_data[j].minor;
+				found_match = true;
+				break;
+			}
 		}
+
+		if (found_match)
+			break;
 	}
 
 	if (!found_match)
@@ -260,14 +270,26 @@ bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp,
 
 	found_match = false;
 
-	for (i = negop->icframe_vercnt;
-		 (i < negop->icframe_vercnt + negop->icmsg_vercnt); i++) {
-		if ((negop->icversion_data[i].major == srv_major) &&
-		   (negop->icversion_data[i].minor == srv_minor)) {
-			icmsg_major = negop->icversion_data[i].major;
-			icmsg_minor = negop->icversion_data[i].minor;
-			found_match = true;
+	for (i = 0; i < srv_vercnt; i++) {
+		srv_major = (srv_version[i] >> 16);
+		srv_minor = (srv_version[i] & 0xFFFF);
+
+		for (j = negop->icframe_vercnt;
+			(j < negop->icframe_vercnt + negop->icmsg_vercnt);
+			j++) {
+
+			if ((negop->icversion_data[j].major == srv_major) &&
+				(negop->icversion_data[j].minor == srv_minor)) {
+
+				icmsg_major = negop->icversion_data[j].major;
+				icmsg_minor = negop->icversion_data[j].minor;
+				found_match = true;
+				break;
+			}
 		}
+
+		if (found_match)
+			break;
 	}
 
 	/*
@@ -284,6 +306,12 @@ fw_error:
 		negop->icmsg_vercnt = 1;
 	}
 
+	if (nego_fw_version)
+		*nego_fw_version = (icframe_major << 16) | icframe_minor;
+
+	if (nego_srv_version)
+		*nego_srv_version = (icmsg_major << 16) | icmsg_minor;
+
 	negop->icversion_data[0].major = icframe_major;
 	negop->icversion_data[0].minor = icframe_minor;
 	negop->icversion_data[1].major = icmsg_major;
diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c
index e47d8c9db03a..0a315e6aa589 100644
--- a/drivers/hv/hv_fcopy.c
+++ b/drivers/hv/hv_fcopy.c
@@ -31,6 +31,16 @@
 #define WIN8_SRV_MINOR		1
 #define WIN8_SRV_VERSION	(WIN8_SRV_MAJOR << 16 | WIN8_SRV_MINOR)
 
+#define FCOPY_VER_COUNT 1
+static const int fcopy_versions[] = {
+	WIN8_SRV_VERSION
+};
+
+#define FW_VER_COUNT 1
+static const int fw_versions[] = {
+	UTIL_FW_VERSION
+};
+
 /*
  * Global state maintained for transaction that is being processed.
  * For a class of integration services, including the "file copy service",
@@ -228,8 +238,6 @@ void hv_fcopy_onchannelcallback(void *context)
 	u64 requestid;
 	struct hv_fcopy_hdr *fcopy_msg;
 	struct icmsg_hdr *icmsghdr;
-	struct icmsg_negotiate *negop = NULL;
-	int util_fw_version;
 	int fcopy_srv_version;
 
 	if (fcopy_transaction.state > HVUTIL_READY)
@@ -243,10 +251,10 @@ void hv_fcopy_onchannelcallback(void *context)
 	icmsghdr = (struct icmsg_hdr *)&recv_buffer[
 			sizeof(struct vmbuspipe_hdr)];
 	if (icmsghdr->icmsgtype == ICMSGTYPE_NEGOTIATE) {
-		util_fw_version = UTIL_FW_VERSION;
-		fcopy_srv_version = WIN8_SRV_VERSION;
-		vmbus_prep_negotiate_resp(icmsghdr, negop, recv_buffer,
-				util_fw_version, fcopy_srv_version);
+		vmbus_prep_negotiate_resp(icmsghdr, recv_buffer,
+				fw_versions, FW_VER_COUNT,
+				fcopy_versions, FCOPY_VER_COUNT,
+				NULL, &fcopy_srv_version);
 	} else {
 		fcopy_msg = (struct hv_fcopy_hdr *)&recv_buffer[
 				sizeof(struct vmbuspipe_hdr) +
diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c
index 3abfc5983c97..2cc670442f6c 100644
--- a/drivers/hv/hv_kvp.c
+++ b/drivers/hv/hv_kvp.c
@@ -46,6 +46,19 @@
 #define WIN8_SRV_MINOR   0
 #define WIN8_SRV_VERSION     (WIN8_SRV_MAJOR << 16 | WIN8_SRV_MINOR)
 
+#define KVP_VER_COUNT 3
+static const int kvp_versions[] = {
+	WIN8_SRV_VERSION,
+	WIN7_SRV_VERSION,
+	WS2008_SRV_VERSION
+};
+
+#define FW_VER_COUNT 2
+static const int fw_versions[] = {
+	UTIL_FW_VERSION,
+	UTIL_WS2K8_FW_VERSION
+};
+
 /*
  * Global state maintained for transaction that is being processed. For a class
  * of integration services, including the "KVP service", the specified protocol
@@ -610,8 +623,6 @@ void hv_kvp_onchannelcallback(void *context)
 	struct hv_kvp_msg *kvp_msg;
 
 	struct icmsg_hdr *icmsghdrp;
-	struct icmsg_negotiate *negop = NULL;
-	int util_fw_version;
 	int kvp_srv_version;
 	static enum {NEGO_NOT_STARTED,
 		     NEGO_IN_PROGRESS,
@@ -640,28 +651,10 @@ void hv_kvp_onchannelcallback(void *context)
 			sizeof(struct vmbuspipe_hdr)];
 
 		if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
-			/*
-			 * Based on the host, select appropriate
-			 * framework and service versions we will
-			 * negotiate.
-			 */
-			switch (vmbus_proto_version) {
-			case (VERSION_WS2008):
-				util_fw_version = UTIL_WS2K8_FW_VERSION;
-				kvp_srv_version = WS2008_SRV_VERSION;
-				break;
-			case (VERSION_WIN7):
-				util_fw_version = UTIL_FW_VERSION;
-				kvp_srv_version = WIN7_SRV_VERSION;
-				break;
-			default:
-				util_fw_version = UTIL_FW_VERSION;
-				kvp_srv_version = WIN8_SRV_VERSION;
-			}
-			vmbus_prep_negotiate_resp(icmsghdrp, negop,
-				 recv_buffer, util_fw_version,
-				 kvp_srv_version);
-
+			vmbus_prep_negotiate_resp(icmsghdrp,
+				 recv_buffer, fw_versions, FW_VER_COUNT,
+				 kvp_versions, KVP_VER_COUNT,
+				 NULL, &kvp_srv_version);
 		} else {
 			kvp_msg = (struct hv_kvp_msg *)&recv_buffer[
 				sizeof(struct vmbuspipe_hdr) +
diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c
index 4e543dbb731a..d14f10b924a0 100644
--- a/drivers/hv/hv_snapshot.c
+++ b/drivers/hv/hv_snapshot.c
@@ -31,6 +31,16 @@
 #define VSS_MINOR  0
 #define VSS_VERSION    (VSS_MAJOR << 16 | VSS_MINOR)
 
+#define VSS_VER_COUNT 1
+static const int vss_versions[] = {
+	VSS_VERSION
+};
+
+#define FW_VER_COUNT 1
+static const int fw_versions[] = {
+	UTIL_FW_VERSION
+};
+
 /*
  * Timeout values are based on expecations from host
  */
@@ -297,7 +307,6 @@ void hv_vss_onchannelcallback(void *context)
 
 
 	struct icmsg_hdr *icmsghdrp;
-	struct icmsg_negotiate *negop = NULL;
 
 	if (vss_transaction.state > HVUTIL_READY)
 		return;
@@ -310,9 +319,10 @@ void hv_vss_onchannelcallback(void *context)
 			sizeof(struct vmbuspipe_hdr)];
 
 		if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
-			vmbus_prep_negotiate_resp(icmsghdrp, negop,
-				 recv_buffer, UTIL_FW_VERSION,
-				 VSS_VERSION);
+			vmbus_prep_negotiate_resp(icmsghdrp,
+				 recv_buffer, fw_versions, FW_VER_COUNT,
+				 vss_versions, VSS_VER_COUNT,
+				 NULL, NULL);
 		} else {
 			vss_msg = (struct hv_vss_msg *)&recv_buffer[
 				sizeof(struct vmbuspipe_hdr) +
diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c
index 51528bf92777..891537ae579f 100644
--- a/drivers/hv/hv_util.c
+++ b/drivers/hv/hv_util.c
@@ -58,7 +58,31 @@
 static int sd_srv_version;
 static int ts_srv_version;
 static int hb_srv_version;
-static int util_fw_version;
+
+#define SD_VER_COUNT 2
+static const int sd_versions[] = {
+	SD_VERSION,
+	SD_VERSION_1
+};
+
+#define TS_VER_COUNT 3
+static const int ts_versions[] = {
+	TS_VERSION,
+	TS_VERSION_3,
+	TS_VERSION_1
+};
+
+#define HB_VER_COUNT 2
+static const int hb_versions[] = {
+	HB_VERSION,
+	HB_VERSION_1
+};
+
+#define FW_VER_COUNT 2
+static const int fw_versions[] = {
+	UTIL_FW_VERSION,
+	UTIL_WS2K8_FW_VERSION
+};
 
 static void shutdown_onchannelcallback(void *context);
 static struct hv_util_service util_shutdown = {
@@ -119,7 +143,6 @@ static void shutdown_onchannelcallback(void *context)
 	struct shutdown_msg_data *shutdown_msg;
 
 	struct icmsg_hdr *icmsghdrp;
-	struct icmsg_negotiate *negop = NULL;
 
 	vmbus_recvpacket(channel, shut_txf_buf,
 			 PAGE_SIZE, &recvlen, &requestid);
@@ -129,9 +152,14 @@ static void shutdown_onchannelcallback(void *context)
 			sizeof(struct vmbuspipe_hdr)];
 
 		if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
-			vmbus_prep_negotiate_resp(icmsghdrp, negop,
-					shut_txf_buf, util_fw_version,
-					sd_srv_version);
+			if (vmbus_prep_negotiate_resp(icmsghdrp, shut_txf_buf,
+					fw_versions, FW_VER_COUNT,
+					sd_versions, SD_VER_COUNT,
+					NULL, &sd_srv_version)) {
+				pr_info("Shutdown IC version %d.%d\n",
+					sd_srv_version >> 16,
+					sd_srv_version & 0xFFFF);
+			}
 		} else {
 			shutdown_msg =
 				(struct shutdown_msg_data *)&shut_txf_buf[
@@ -254,7 +282,6 @@ static void timesync_onchannelcallback(void *context)
 	struct ictimesync_data *timedatap;
 	struct ictimesync_ref_data *refdata;
 	u8 *time_txf_buf = util_timesynch.recv_buffer;
-	struct icmsg_negotiate *negop = NULL;
 
 	vmbus_recvpacket(channel, time_txf_buf,
 			 PAGE_SIZE, &recvlen, &requestid);
@@ -264,12 +291,14 @@ static void timesync_onchannelcallback(void *context)
 				sizeof(struct vmbuspipe_hdr)];
 
 		if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
-			vmbus_prep_negotiate_resp(icmsghdrp, negop,
-						time_txf_buf,
-						util_fw_version,
-						ts_srv_version);
-			pr_info("Using TimeSync version %d.%d\n",
-				ts_srv_version >> 16, ts_srv_version & 0xFFFF);
+			if (vmbus_prep_negotiate_resp(icmsghdrp, time_txf_buf,
+						fw_versions, FW_VER_COUNT,
+						ts_versions, TS_VER_COUNT,
+						NULL, &ts_srv_version)) {
+				pr_info("TimeSync version %d.%d\n",
+					ts_srv_version >> 16,
+					ts_srv_version & 0xFFFF);
+			}
 		} else {
 			if (ts_srv_version > TS_VERSION_3) {
 				refdata = (struct ictimesync_ref_data *)
@@ -313,7 +342,6 @@ static void heartbeat_onchannelcallback(void *context)
 	struct icmsg_hdr *icmsghdrp;
 	struct heartbeat_msg_data *heartbeat_msg;
 	u8 *hbeat_txf_buf = util_heartbeat.recv_buffer;
-	struct icmsg_negotiate *negop = NULL;
 
 	while (1) {
 
@@ -327,9 +355,16 @@ static void heartbeat_onchannelcallback(void *context)
 				sizeof(struct vmbuspipe_hdr)];
 
 		if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
-			vmbus_prep_negotiate_resp(icmsghdrp, negop,
-				hbeat_txf_buf, util_fw_version,
-				hb_srv_version);
+			if (vmbus_prep_negotiate_resp(icmsghdrp,
+					hbeat_txf_buf,
+					fw_versions, FW_VER_COUNT,
+					hb_versions, HB_VER_COUNT,
+					NULL, &hb_srv_version)) {
+
+				pr_info("Heartbeat version %d.%d\n",
+					hb_srv_version >> 16,
+					hb_srv_version & 0xFFFF);
+			}
 		} else {
 			heartbeat_msg =
 				(struct heartbeat_msg_data *)&hbeat_txf_buf[
@@ -379,33 +414,6 @@ static int util_probe(struct hv_device *dev,
 
 	hv_set_drvdata(dev, srv);
 
-	/*
-	 * Based on the host; initialize the framework and
-	 * service version numbers we will negotiate.
-	 */
-	switch (vmbus_proto_version) {
-	case (VERSION_WS2008):
-		util_fw_version = UTIL_WS2K8_FW_VERSION;
-		sd_srv_version = SD_VERSION_1;
-		ts_srv_version = TS_VERSION_1;
-		hb_srv_version = HB_VERSION_1;
-		break;
-	case VERSION_WIN7:
-	case VERSION_WIN8:
-	case VERSION_WIN8_1:
-		util_fw_version = UTIL_FW_VERSION;
-		sd_srv_version = SD_VERSION;
-		ts_srv_version = TS_VERSION_3;
-		hb_srv_version = HB_VERSION;
-		break;
-	case VERSION_WIN10:
-	default:
-		util_fw_version = UTIL_FW_VERSION;
-		sd_srv_version = SD_VERSION;
-		ts_srv_version = TS_VERSION;
-		hb_srv_version = HB_VERSION;
-	}
-
 	ret = vmbus_open(dev->channel, 4 * PAGE_SIZE, 4 * PAGE_SIZE, NULL, 0,
 			srv->util_cb, dev->channel);
 	if (ret)
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 7ea20bd7cdd1..85b26f06e172 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1445,9 +1445,10 @@ struct hyperv_service_callback {
 };
 
 #define MAX_SRV_VER	0x7ffffff
-extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *,
-					struct icmsg_negotiate *, u8 *, int,
-					int);
+extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, u8 *buf,
+				const int *fw_version, int fw_vercnt,
+				const int *srv_version, int srv_vercnt,
+				int *nego_fw_version, int *nego_srv_version);
 
 void hv_event_tasklet_disable(struct vmbus_channel *channel);
 void hv_event_tasklet_enable(struct vmbus_channel *channel);
-- 
cgit v1.2.3


From aaa59306b0b7e0ca4ba92cc04c5db101cbb1c096 Mon Sep 17 00:00:00 2001
From: CQ Tang <cq.tang@intel.com>
Date: Mon, 30 Jan 2017 09:39:52 -0800
Subject: iommu/vt-d: Fix some macros that are incorrectly specified in
 intel-iommu

Some of the macros are incorrect with wrong bit-shifts resulting in picking
the incorrect invalidation granularity. Incorrect Source-ID in extended
devtlb invalidation caused device side errors.

To: Joerg Roedel <joro@8bytes.org>
To: David Woodhouse <dwmw2@infradead.org>
Cc: iommu@lists.linux-foundation.org
Cc: linux-kernel@vger.kernel.org
Cc: stable@vger.kernel.org
Cc: CQ Tang <cq.tang@intel.com>
Cc: Ashok Raj <ashok.raj@intel.com>

Fixes: 2f26e0a9 ("iommu/vt-d: Add basic SVM PASID support")
Signed-off-by: CQ Tang <cq.tang@intel.com>
Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Tested-by: CQ Tang <cq.tang@intel.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/intel-iommu.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index d49e26c6cdc7..23e129ef6726 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -153,8 +153,8 @@ static inline void dmar_writeq(void __iomem *addr, u64 val)
 #define DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60)
 #define DMA_TLB_DSI_FLUSH (((u64)2) << 60)
 #define DMA_TLB_PSI_FLUSH (((u64)3) << 60)
-#define DMA_TLB_IIRG(type) ((type >> 60) & 7)
-#define DMA_TLB_IAIG(val) (((val) >> 57) & 7)
+#define DMA_TLB_IIRG(type) ((type >> 60) & 3)
+#define DMA_TLB_IAIG(val) (((val) >> 57) & 3)
 #define DMA_TLB_READ_DRAIN (((u64)1) << 49)
 #define DMA_TLB_WRITE_DRAIN (((u64)1) << 48)
 #define DMA_TLB_DID(id)	(((u64)((id) & 0xffff)) << 32)
@@ -164,9 +164,9 @@ static inline void dmar_writeq(void __iomem *addr, u64 val)
 
 /* INVALID_DESC */
 #define DMA_CCMD_INVL_GRANU_OFFSET  61
-#define DMA_ID_TLB_GLOBAL_FLUSH	(((u64)1) << 3)
-#define DMA_ID_TLB_DSI_FLUSH	(((u64)2) << 3)
-#define DMA_ID_TLB_PSI_FLUSH	(((u64)3) << 3)
+#define DMA_ID_TLB_GLOBAL_FLUSH	(((u64)1) << 4)
+#define DMA_ID_TLB_DSI_FLUSH	(((u64)2) << 4)
+#define DMA_ID_TLB_PSI_FLUSH	(((u64)3) << 4)
 #define DMA_ID_TLB_READ_DRAIN	(((u64)1) << 7)
 #define DMA_ID_TLB_WRITE_DRAIN	(((u64)1) << 6)
 #define DMA_ID_TLB_DID(id)	(((u64)((id & 0xffff) << 16)))
@@ -316,8 +316,8 @@ enum {
 #define QI_DEV_EIOTLB_SIZE	(((u64)1) << 11)
 #define QI_DEV_EIOTLB_GLOB(g)	((u64)g)
 #define QI_DEV_EIOTLB_PASID(p)	(((u64)p) << 32)
-#define QI_DEV_EIOTLB_SID(sid)	((u64)((sid) & 0xffff) << 32)
-#define QI_DEV_EIOTLB_QDEP(qd)	(((qd) & 0x1f) << 16)
+#define QI_DEV_EIOTLB_SID(sid)	((u64)((sid) & 0xffff) << 16)
+#define QI_DEV_EIOTLB_QDEP(qd)	((u64)((qd) & 0x1f) << 4)
 #define QI_DEV_EIOTLB_MAX_INVS	32
 
 #define QI_PGRP_IDX(idx)	(((u64)(idx)) << 55)
-- 
cgit v1.2.3


From ade69e2432b795c76653e1dfa09c684549826a50 Mon Sep 17 00:00:00 2001
From: Matias Bjørling <matias@cnexlabs.com>
Date: Tue, 31 Jan 2017 13:17:09 +0100
Subject: lightnvm: merge gennvm with core
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For the first iteration of Open-Channel SSDs, it was anticipated that
there could be various media managers on top of an open-channel SSD,
such to allow vendors to plug in their own host-side FTLs, without the
media manager in between.

Now that an Open-Channel SSD is exposed as a traditional block device,
there is no longer a need for this. Therefore lets merge the gennvm code
with core and simplify the stack.

Signed-off-by: Matias Bjørling <matias@cnexlabs.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/Kconfig     |   9 -
 drivers/lightnvm/Makefile    |   3 +-
 drivers/lightnvm/core.c      | 786 ++++++++++++++++++++++++++++++++-----------
 drivers/lightnvm/gennvm.c    | 657 ------------------------------------
 drivers/lightnvm/gennvm.h    |  62 ----
 drivers/lightnvm/sysblk.c    | 733 ----------------------------------------
 drivers/nvme/host/lightnvm.c |   7 +-
 include/linux/lightnvm.h     |  97 +-----
 8 files changed, 606 insertions(+), 1748 deletions(-)
 delete mode 100644 drivers/lightnvm/gennvm.c
 delete mode 100644 drivers/lightnvm/gennvm.h
 delete mode 100644 drivers/lightnvm/sysblk.c

(limited to 'include/linux')

diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig
index 2f5d5f4a4c75..052714106b7b 100644
--- a/drivers/lightnvm/Kconfig
+++ b/drivers/lightnvm/Kconfig
@@ -26,15 +26,6 @@ config NVM_DEBUG
 
 	It is required to create/remove targets without IOCTLs.
 
-config NVM_GENNVM
-	tristate "General Non-Volatile Memory Manager for Open-Channel SSDs"
-	---help---
-	Non-volatile memory media manager for Open-Channel SSDs that implements
-	physical media metadata management and block provisioning API.
-
-	This is the standard media manager for using Open-Channel SSDs, and
-	required for targets to be instantiated.
-
 config NVM_RRPC
 	tristate "Round-robin Hybrid Open-Channel SSD target"
 	---help---
diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile
index a7a0a22cf1a5..b2a39e2d2895 100644
--- a/drivers/lightnvm/Makefile
+++ b/drivers/lightnvm/Makefile
@@ -2,6 +2,5 @@
 # Makefile for Open-Channel SSDs.
 #
 
-obj-$(CONFIG_NVM)		:= core.o sysblk.o
-obj-$(CONFIG_NVM_GENNVM) 	+= gennvm.o
+obj-$(CONFIG_NVM)		:= core.o
 obj-$(CONFIG_NVM_RRPC)		+= rrpc.o
diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 02240a0b39c9..e9a495650dd0 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -29,10 +29,492 @@
 
 static LIST_HEAD(nvm_tgt_types);
 static DECLARE_RWSEM(nvm_tgtt_lock);
-static LIST_HEAD(nvm_mgrs);
 static LIST_HEAD(nvm_devices);
 static DECLARE_RWSEM(nvm_lock);
 
+/* Map between virtual and physical channel and lun */
+struct nvm_ch_map {
+	int ch_off;
+	int nr_luns;
+	int *lun_offs;
+};
+
+struct nvm_dev_map {
+	struct nvm_ch_map *chnls;
+	int nr_chnls;
+};
+
+struct nvm_area {
+	struct list_head list;
+	sector_t begin;
+	sector_t end;	/* end is excluded */
+};
+
+enum {
+	TRANS_TGT_TO_DEV =	0x0,
+	TRANS_DEV_TO_TGT =	0x1,
+};
+
+static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name)
+{
+	struct nvm_target *tgt;
+
+	list_for_each_entry(tgt, &dev->targets, list)
+		if (!strcmp(name, tgt->disk->disk_name))
+			return tgt;
+
+	return NULL;
+}
+
+static int nvm_reserve_luns(struct nvm_dev *dev, int lun_begin, int lun_end)
+{
+	int i;
+
+	for (i = lun_begin; i <= lun_end; i++) {
+		if (test_and_set_bit(i, dev->lun_map)) {
+			pr_err("nvm: lun %d already allocated\n", i);
+			goto err;
+		}
+	}
+
+	return 0;
+err:
+	while (--i > lun_begin)
+		clear_bit(i, dev->lun_map);
+
+	return -EBUSY;
+}
+
+static void nvm_release_luns_err(struct nvm_dev *dev, int lun_begin,
+				 int lun_end)
+{
+	int i;
+
+	for (i = lun_begin; i <= lun_end; i++)
+		WARN_ON(!test_and_clear_bit(i, dev->lun_map));
+}
+
+static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev)
+{
+	struct nvm_dev *dev = tgt_dev->parent;
+	struct nvm_dev_map *dev_map = tgt_dev->map;
+	int i, j;
+
+	for (i = 0; i < dev_map->nr_chnls; i++) {
+		struct nvm_ch_map *ch_map = &dev_map->chnls[i];
+		int *lun_offs = ch_map->lun_offs;
+		int ch = i + ch_map->ch_off;
+
+		for (j = 0; j < ch_map->nr_luns; j++) {
+			int lun = j + lun_offs[j];
+			int lunid = (ch * dev->geo.luns_per_chnl) + lun;
+
+			WARN_ON(!test_and_clear_bit(lunid, dev->lun_map));
+		}
+
+		kfree(ch_map->lun_offs);
+	}
+
+	kfree(dev_map->chnls);
+	kfree(dev_map);
+
+	kfree(tgt_dev->luns);
+	kfree(tgt_dev);
+}
+
+static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
+					      int lun_begin, int lun_end)
+{
+	struct nvm_tgt_dev *tgt_dev = NULL;
+	struct nvm_dev_map *dev_rmap = dev->rmap;
+	struct nvm_dev_map *dev_map;
+	struct ppa_addr *luns;
+	int nr_luns = lun_end - lun_begin + 1;
+	int luns_left = nr_luns;
+	int nr_chnls = nr_luns / dev->geo.luns_per_chnl;
+	int nr_chnls_mod = nr_luns % dev->geo.luns_per_chnl;
+	int bch = lun_begin / dev->geo.luns_per_chnl;
+	int blun = lun_begin % dev->geo.luns_per_chnl;
+	int lunid = 0;
+	int lun_balanced = 1;
+	int prev_nr_luns;
+	int i, j;
+
+	nr_chnls = nr_luns / dev->geo.luns_per_chnl;
+	nr_chnls = (nr_chnls_mod == 0) ? nr_chnls : nr_chnls + 1;
+
+	dev_map = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL);
+	if (!dev_map)
+		goto err_dev;
+
+	dev_map->chnls = kcalloc(nr_chnls, sizeof(struct nvm_ch_map),
+								GFP_KERNEL);
+	if (!dev_map->chnls)
+		goto err_chnls;
+
+	luns = kcalloc(nr_luns, sizeof(struct ppa_addr), GFP_KERNEL);
+	if (!luns)
+		goto err_luns;
+
+	prev_nr_luns = (luns_left > dev->geo.luns_per_chnl) ?
+					dev->geo.luns_per_chnl : luns_left;
+	for (i = 0; i < nr_chnls; i++) {
+		struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[i + bch];
+		int *lun_roffs = ch_rmap->lun_offs;
+		struct nvm_ch_map *ch_map = &dev_map->chnls[i];
+		int *lun_offs;
+		int luns_in_chnl = (luns_left > dev->geo.luns_per_chnl) ?
+					dev->geo.luns_per_chnl : luns_left;
+
+		if (lun_balanced && prev_nr_luns != luns_in_chnl)
+			lun_balanced = 0;
+
+		ch_map->ch_off = ch_rmap->ch_off = bch;
+		ch_map->nr_luns = luns_in_chnl;
+
+		lun_offs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL);
+		if (!lun_offs)
+			goto err_ch;
+
+		for (j = 0; j < luns_in_chnl; j++) {
+			luns[lunid].ppa = 0;
+			luns[lunid].g.ch = i;
+			luns[lunid++].g.lun = j;
+
+			lun_offs[j] = blun;
+			lun_roffs[j + blun] = blun;
+		}
+
+		ch_map->lun_offs = lun_offs;
+
+		/* when starting a new channel, lun offset is reset */
+		blun = 0;
+		luns_left -= luns_in_chnl;
+	}
+
+	dev_map->nr_chnls = nr_chnls;
+
+	tgt_dev = kmalloc(sizeof(struct nvm_tgt_dev), GFP_KERNEL);
+	if (!tgt_dev)
+		goto err_ch;
+
+	memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo));
+	/* Target device only owns a portion of the physical device */
+	tgt_dev->geo.nr_chnls = nr_chnls;
+	tgt_dev->geo.nr_luns = nr_luns;
+	tgt_dev->geo.luns_per_chnl = (lun_balanced) ? prev_nr_luns : -1;
+	tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun;
+	tgt_dev->q = dev->q;
+	tgt_dev->map = dev_map;
+	tgt_dev->luns = luns;
+	memcpy(&tgt_dev->identity, &dev->identity, sizeof(struct nvm_id));
+
+	tgt_dev->parent = dev;
+
+	return tgt_dev;
+err_ch:
+	while (--i > 0)
+		kfree(dev_map->chnls[i].lun_offs);
+	kfree(luns);
+err_luns:
+	kfree(dev_map->chnls);
+err_chnls:
+	kfree(dev_map);
+err_dev:
+	return tgt_dev;
+}
+
+static const struct block_device_operations nvm_fops = {
+	.owner		= THIS_MODULE,
+};
+
+static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
+{
+	struct nvm_ioctl_create_simple *s = &create->conf.s;
+	struct request_queue *tqueue;
+	struct gendisk *tdisk;
+	struct nvm_tgt_type *tt;
+	struct nvm_target *t;
+	struct nvm_tgt_dev *tgt_dev;
+	void *targetdata;
+
+	tt = nvm_find_target_type(create->tgttype, 1);
+	if (!tt) {
+		pr_err("nvm: target type %s not found\n", create->tgttype);
+		return -EINVAL;
+	}
+
+	mutex_lock(&dev->mlock);
+	t = nvm_find_target(dev, create->tgtname);
+	if (t) {
+		pr_err("nvm: target name already exists.\n");
+		mutex_unlock(&dev->mlock);
+		return -EINVAL;
+	}
+	mutex_unlock(&dev->mlock);
+
+	if (nvm_reserve_luns(dev, s->lun_begin, s->lun_end))
+		return -ENOMEM;
+
+	t = kmalloc(sizeof(struct nvm_target), GFP_KERNEL);
+	if (!t)
+		goto err_reserve;
+
+	tgt_dev = nvm_create_tgt_dev(dev, s->lun_begin, s->lun_end);
+	if (!tgt_dev) {
+		pr_err("nvm: could not create target device\n");
+		goto err_t;
+	}
+
+	tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node);
+	if (!tqueue)
+		goto err_dev;
+	blk_queue_make_request(tqueue, tt->make_rq);
+
+	tdisk = alloc_disk(0);
+	if (!tdisk)
+		goto err_queue;
+
+	sprintf(tdisk->disk_name, "%s", create->tgtname);
+	tdisk->flags = GENHD_FL_EXT_DEVT;
+	tdisk->major = 0;
+	tdisk->first_minor = 0;
+	tdisk->fops = &nvm_fops;
+	tdisk->queue = tqueue;
+
+	targetdata = tt->init(tgt_dev, tdisk);
+	if (IS_ERR(targetdata))
+		goto err_init;
+
+	tdisk->private_data = targetdata;
+	tqueue->queuedata = targetdata;
+
+	blk_queue_max_hw_sectors(tqueue, 8 * dev->ops->max_phys_sect);
+
+	set_capacity(tdisk, tt->capacity(targetdata));
+	add_disk(tdisk);
+
+	t->type = tt;
+	t->disk = tdisk;
+	t->dev = tgt_dev;
+
+	mutex_lock(&dev->mlock);
+	list_add_tail(&t->list, &dev->targets);
+	mutex_unlock(&dev->mlock);
+
+	return 0;
+err_init:
+	put_disk(tdisk);
+err_queue:
+	blk_cleanup_queue(tqueue);
+err_dev:
+	kfree(tgt_dev);
+err_t:
+	kfree(t);
+err_reserve:
+	nvm_release_luns_err(dev, s->lun_begin, s->lun_end);
+	return -ENOMEM;
+}
+
+static void __nvm_remove_target(struct nvm_target *t)
+{
+	struct nvm_tgt_type *tt = t->type;
+	struct gendisk *tdisk = t->disk;
+	struct request_queue *q = tdisk->queue;
+
+	del_gendisk(tdisk);
+	blk_cleanup_queue(q);
+
+	if (tt->exit)
+		tt->exit(tdisk->private_data);
+
+	nvm_remove_tgt_dev(t->dev);
+	put_disk(tdisk);
+
+	list_del(&t->list);
+	kfree(t);
+}
+
+/**
+ * nvm_remove_tgt - Removes a target from the media manager
+ * @dev:	device
+ * @remove:	ioctl structure with target name to remove.
+ *
+ * Returns:
+ * 0: on success
+ * 1: on not found
+ * <0: on error
+ */
+static int nvm_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove)
+{
+	struct nvm_target *t;
+
+	mutex_lock(&dev->mlock);
+	t = nvm_find_target(dev, remove->tgtname);
+	if (!t) {
+		mutex_unlock(&dev->mlock);
+		return 1;
+	}
+	__nvm_remove_target(t);
+	mutex_unlock(&dev->mlock);
+
+	return 0;
+}
+
+static int nvm_register_map(struct nvm_dev *dev)
+{
+	struct nvm_dev_map *rmap;
+	int i, j;
+
+	rmap = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL);
+	if (!rmap)
+		goto err_rmap;
+
+	rmap->chnls = kcalloc(dev->geo.nr_chnls, sizeof(struct nvm_ch_map),
+								GFP_KERNEL);
+	if (!rmap->chnls)
+		goto err_chnls;
+
+	for (i = 0; i < dev->geo.nr_chnls; i++) {
+		struct nvm_ch_map *ch_rmap;
+		int *lun_roffs;
+		int luns_in_chnl = dev->geo.luns_per_chnl;
+
+		ch_rmap = &rmap->chnls[i];
+
+		ch_rmap->ch_off = -1;
+		ch_rmap->nr_luns = luns_in_chnl;
+
+		lun_roffs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL);
+		if (!lun_roffs)
+			goto err_ch;
+
+		for (j = 0; j < luns_in_chnl; j++)
+			lun_roffs[j] = -1;
+
+		ch_rmap->lun_offs = lun_roffs;
+	}
+
+	dev->rmap = rmap;
+
+	return 0;
+err_ch:
+	while (--i >= 0)
+		kfree(rmap->chnls[i].lun_offs);
+err_chnls:
+	kfree(rmap);
+err_rmap:
+	return -ENOMEM;
+}
+
+static int nvm_map_to_dev(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p)
+{
+	struct nvm_dev_map *dev_map = tgt_dev->map;
+	struct nvm_ch_map *ch_map = &dev_map->chnls[p->g.ch];
+	int lun_off = ch_map->lun_offs[p->g.lun];
+	struct nvm_dev *dev = tgt_dev->parent;
+	struct nvm_dev_map *dev_rmap = dev->rmap;
+	struct nvm_ch_map *ch_rmap;
+	int lun_roff;
+
+	p->g.ch += ch_map->ch_off;
+	p->g.lun += lun_off;
+
+	ch_rmap = &dev_rmap->chnls[p->g.ch];
+	lun_roff = ch_rmap->lun_offs[p->g.lun];
+
+	if (unlikely(ch_rmap->ch_off < 0 || lun_roff < 0)) {
+		pr_err("nvm: corrupted device partition table\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int nvm_map_to_tgt(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p)
+{
+	struct nvm_dev *dev = tgt_dev->parent;
+	struct nvm_dev_map *dev_rmap = dev->rmap;
+	struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[p->g.ch];
+	int lun_roff = ch_rmap->lun_offs[p->g.lun];
+
+	p->g.ch -= ch_rmap->ch_off;
+	p->g.lun -= lun_roff;
+
+	return 0;
+}
+
+static int nvm_trans_rq(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd,
+			int flag)
+{
+	int i;
+	int ret;
+
+	if (rqd->nr_ppas == 1) {
+		if (flag == TRANS_TGT_TO_DEV)
+			return nvm_map_to_dev(tgt_dev, &rqd->ppa_addr);
+		else
+			return nvm_map_to_tgt(tgt_dev, &rqd->ppa_addr);
+	}
+
+	for (i = 0; i < rqd->nr_ppas; i++) {
+		if (flag == TRANS_TGT_TO_DEV)
+			ret = nvm_map_to_dev(tgt_dev, &rqd->ppa_list[i]);
+		else
+			ret = nvm_map_to_tgt(tgt_dev, &rqd->ppa_list[i]);
+
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+static struct ppa_addr nvm_trans_ppa(struct nvm_tgt_dev *tgt_dev,
+				     struct ppa_addr p, int dir)
+{
+	struct ppa_addr ppa = p;
+
+	if (dir == TRANS_TGT_TO_DEV)
+		nvm_map_to_dev(tgt_dev, &ppa);
+	else
+		nvm_map_to_tgt(tgt_dev, &ppa);
+
+	return ppa;
+}
+
+void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries,
+		     int len)
+{
+	struct nvm_geo *geo = &dev->geo;
+	struct nvm_dev_map *dev_rmap = dev->rmap;
+	u64 i;
+
+	for (i = 0; i < len; i++) {
+		struct nvm_ch_map *ch_rmap;
+		int *lun_roffs;
+		struct ppa_addr gaddr;
+		u64 pba = le64_to_cpu(entries[i]);
+		int off;
+		u64 diff;
+
+		if (!pba)
+			continue;
+
+		gaddr = linear_to_generic_addr(geo, pba);
+		ch_rmap = &dev_rmap->chnls[gaddr.g.ch];
+		lun_roffs = ch_rmap->lun_offs;
+
+		off = gaddr.g.ch * geo->luns_per_chnl + gaddr.g.lun;
+
+		diff = ((ch_rmap->ch_off * geo->luns_per_chnl) +
+				(lun_roffs[gaddr.g.lun])) * geo->sec_per_lun;
+
+		entries[i] -= cpu_to_le64(diff);
+	}
+}
+EXPORT_SYMBOL(nvm_part_to_tgt);
+
 struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock)
 {
 	struct nvm_tgt_type *tmp, *tt = NULL;
@@ -92,78 +574,6 @@ void nvm_dev_dma_free(struct nvm_dev *dev, void *addr, dma_addr_t dma_handler)
 }
 EXPORT_SYMBOL(nvm_dev_dma_free);
 
-static struct nvmm_type *nvm_find_mgr_type(const char *name)
-{
-	struct nvmm_type *mt;
-
-	list_for_each_entry(mt, &nvm_mgrs, list)
-		if (!strcmp(name, mt->name))
-			return mt;
-
-	return NULL;
-}
-
-static struct nvmm_type *nvm_init_mgr(struct nvm_dev *dev)
-{
-	struct nvmm_type *mt;
-	int ret;
-
-	lockdep_assert_held(&nvm_lock);
-
-	list_for_each_entry(mt, &nvm_mgrs, list) {
-		if (strncmp(dev->sb.mmtype, mt->name, NVM_MMTYPE_LEN))
-			continue;
-
-		ret = mt->register_mgr(dev);
-		if (ret < 0) {
-			pr_err("nvm: media mgr failed to init (%d) on dev %s\n",
-								ret, dev->name);
-			return NULL; /* initialization failed */
-		} else if (ret > 0)
-			return mt;
-	}
-
-	return NULL;
-}
-
-int nvm_register_mgr(struct nvmm_type *mt)
-{
-	struct nvm_dev *dev;
-	int ret = 0;
-
-	down_write(&nvm_lock);
-	if (nvm_find_mgr_type(mt->name)) {
-		ret = -EEXIST;
-		goto finish;
-	} else {
-		list_add(&mt->list, &nvm_mgrs);
-	}
-
-	/* try to register media mgr if any device have none configured */
-	list_for_each_entry(dev, &nvm_devices, devices) {
-		if (dev->mt)
-			continue;
-
-		dev->mt = nvm_init_mgr(dev);
-	}
-finish:
-	up_write(&nvm_lock);
-
-	return ret;
-}
-EXPORT_SYMBOL(nvm_register_mgr);
-
-void nvm_unregister_mgr(struct nvmm_type *mt)
-{
-	if (!mt)
-		return;
-
-	down_write(&nvm_lock);
-	list_del(&mt->list);
-	up_write(&nvm_lock);
-}
-EXPORT_SYMBOL(nvm_unregister_mgr);
-
 static struct nvm_dev *nvm_find_nvm_dev(const char *name)
 {
 	struct nvm_dev *dev;
@@ -183,13 +593,13 @@ static void nvm_tgt_generic_to_addr_mode(struct nvm_tgt_dev *tgt_dev,
 
 	if (rqd->nr_ppas > 1) {
 		for (i = 0; i < rqd->nr_ppas; i++) {
-			rqd->ppa_list[i] = dev->mt->trans_ppa(tgt_dev,
+			rqd->ppa_list[i] = nvm_trans_ppa(tgt_dev,
 					rqd->ppa_list[i], TRANS_TGT_TO_DEV);
 			rqd->ppa_list[i] = generic_to_dev_addr(dev,
 							rqd->ppa_list[i]);
 		}
 	} else {
-		rqd->ppa_addr = dev->mt->trans_ppa(tgt_dev, rqd->ppa_addr,
+		rqd->ppa_addr = nvm_trans_ppa(tgt_dev, rqd->ppa_addr,
 						TRANS_TGT_TO_DEV);
 		rqd->ppa_addr = generic_to_dev_addr(dev, rqd->ppa_addr);
 	}
@@ -242,7 +652,7 @@ int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
 	ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type);
 	nvm_free_rqd_ppalist(dev, &rqd);
 	if (ret) {
-		pr_err("nvm: sysblk failed bb mark\n");
+		pr_err("nvm: failed bb mark\n");
 		return -EINVAL;
 	}
 
@@ -262,15 +672,23 @@ int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
 {
 	struct nvm_dev *dev = tgt_dev->parent;
 
-	return dev->mt->submit_io(tgt_dev, rqd);
+	if (!dev->ops->submit_io)
+		return -ENODEV;
+
+	/* Convert address space */
+	nvm_generic_to_addr_mode(dev, rqd);
+
+	rqd->dev = tgt_dev;
+	return dev->ops->submit_io(dev, rqd);
 }
 EXPORT_SYMBOL(nvm_submit_io);
 
 int nvm_erase_blk(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p, int flags)
 {
-	struct nvm_dev *dev = tgt_dev->parent;
+	/* Convert address space */
+	nvm_map_to_dev(tgt_dev, p);
 
-	return dev->mt->erase_blk(tgt_dev, p, flags);
+	return nvm_erase_ppa(tgt_dev->parent, p, 1, flags);
 }
 EXPORT_SYMBOL(nvm_erase_blk);
 
@@ -289,16 +707,65 @@ EXPORT_SYMBOL(nvm_get_l2p_tbl);
 int nvm_get_area(struct nvm_tgt_dev *tgt_dev, sector_t *lba, sector_t len)
 {
 	struct nvm_dev *dev = tgt_dev->parent;
+	struct nvm_geo *geo = &dev->geo;
+	struct nvm_area *area, *prev, *next;
+	sector_t begin = 0;
+	sector_t max_sectors = (geo->sec_size * dev->total_secs) >> 9;
+
+	if (len > max_sectors)
+		return -EINVAL;
+
+	area = kmalloc(sizeof(struct nvm_area), GFP_KERNEL);
+	if (!area)
+		return -ENOMEM;
+
+	prev = NULL;
+
+	spin_lock(&dev->lock);
+	list_for_each_entry(next, &dev->area_list, list) {
+		if (begin + len > next->begin) {
+			begin = next->end;
+			prev = next;
+			continue;
+		}
+		break;
+	}
+
+	if ((begin + len) > max_sectors) {
+		spin_unlock(&dev->lock);
+		kfree(area);
+		return -EINVAL;
+	}
 
-	return dev->mt->get_area(dev, lba, len);
+	area->begin = *lba = begin;
+	area->end = begin + len;
+
+	if (prev) /* insert into sorted order */
+		list_add(&area->list, &prev->list);
+	else
+		list_add(&area->list, &dev->area_list);
+	spin_unlock(&dev->lock);
+
+	return 0;
 }
 EXPORT_SYMBOL(nvm_get_area);
 
-void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t lba)
+void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin)
 {
 	struct nvm_dev *dev = tgt_dev->parent;
+	struct nvm_area *area;
 
-	dev->mt->put_area(dev, lba);
+	spin_lock(&dev->lock);
+	list_for_each_entry(area, &dev->area_list, list) {
+		if (area->begin != begin)
+			continue;
+
+		list_del(&area->list);
+		spin_unlock(&dev->lock);
+		kfree(area);
+		return;
+	}
+	spin_unlock(&dev->lock);
 }
 EXPORT_SYMBOL(nvm_put_area);
 
@@ -409,8 +876,15 @@ EXPORT_SYMBOL(nvm_erase_ppa);
 
 void nvm_end_io(struct nvm_rq *rqd, int error)
 {
+	struct nvm_tgt_dev *tgt_dev = rqd->dev;
+	struct nvm_tgt_instance *ins = rqd->ins;
+
+	/* Convert address space */
+	if (tgt_dev)
+		nvm_trans_rq(tgt_dev, rqd, TRANS_DEV_TO_TGT);
+
 	rqd->error = error;
-	rqd->end_io(rqd);
+	ins->tt->end_io(rqd);
 }
 EXPORT_SYMBOL(nvm_end_io);
 
@@ -570,10 +1044,9 @@ EXPORT_SYMBOL(nvm_get_bb_tbl);
 int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr ppa,
 		       u8 *blks)
 {
-	struct nvm_dev *dev = tgt_dev->parent;
+	ppa = nvm_trans_ppa(tgt_dev, ppa, TRANS_TGT_TO_DEV);
 
-	ppa = dev->mt->trans_ppa(tgt_dev, ppa, TRANS_TGT_TO_DEV);
-	return nvm_get_bb_tbl(dev, ppa, blks);
+	return nvm_get_bb_tbl(tgt_dev->parent, ppa, blks);
 }
 EXPORT_SYMBOL(nvm_get_tgt_bb_tbl);
 
@@ -691,36 +1164,31 @@ static int nvm_core_init(struct nvm_dev *dev)
 		goto err_fmtype;
 	}
 
+	INIT_LIST_HEAD(&dev->area_list);
+	INIT_LIST_HEAD(&dev->targets);
 	mutex_init(&dev->mlock);
 	spin_lock_init(&dev->lock);
 
-	blk_queue_logical_block_size(dev->q, geo->sec_size);
+	ret = nvm_register_map(dev);
+	if (ret)
+		goto err_fmtype;
 
+	blk_queue_logical_block_size(dev->q, geo->sec_size);
 	return 0;
 err_fmtype:
 	kfree(dev->lun_map);
 	return ret;
 }
 
-static void nvm_free_mgr(struct nvm_dev *dev)
-{
-	if (!dev->mt)
-		return;
-
-	dev->mt->unregister_mgr(dev);
-	dev->mt = NULL;
-}
-
 void nvm_free(struct nvm_dev *dev)
 {
 	if (!dev)
 		return;
 
-	nvm_free_mgr(dev);
-
 	if (dev->dma_pool)
 		dev->ops->destroy_dma_pool(dev->dma_pool);
 
+	kfree(dev->rmap);
 	kfree(dev->lptbl);
 	kfree(dev->lun_map);
 	kfree(dev);
@@ -731,9 +1199,6 @@ static int nvm_init(struct nvm_dev *dev)
 	struct nvm_geo *geo = &dev->geo;
 	int ret = -EINVAL;
 
-	if (!dev->q || !dev->ops)
-		return ret;
-
 	if (dev->ops->identity(dev, &dev->identity)) {
 		pr_err("nvm: device could not be identified\n");
 		goto err;
@@ -779,49 +1244,50 @@ int nvm_register(struct nvm_dev *dev)
 {
 	int ret;
 
-	ret = nvm_init(dev);
-	if (ret)
-		goto err_init;
+	if (!dev->q || !dev->ops)
+		return -EINVAL;
 
 	if (dev->ops->max_phys_sect > 256) {
 		pr_info("nvm: max sectors supported is 256.\n");
-		ret = -EINVAL;
-		goto err_init;
+		return -EINVAL;
 	}
 
 	if (dev->ops->max_phys_sect > 1) {
 		dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist");
 		if (!dev->dma_pool) {
 			pr_err("nvm: could not create dma pool\n");
-			ret = -ENOMEM;
-			goto err_init;
+			return -ENOMEM;
 		}
 	}
 
-	if (dev->identity.cap & NVM_ID_DCAP_BBLKMGMT) {
-		ret = nvm_get_sysblock(dev, &dev->sb);
-		if (!ret)
-			pr_err("nvm: device not initialized.\n");
-		else if (ret < 0)
-			pr_err("nvm: err (%d) on device initialization\n", ret);
-	}
+	ret = nvm_init(dev);
+	if (ret)
+		goto err_init;
 
 	/* register device with a supported media manager */
 	down_write(&nvm_lock);
-	if (ret > 0)
-		dev->mt = nvm_init_mgr(dev);
 	list_add(&dev->devices, &nvm_devices);
 	up_write(&nvm_lock);
 
 	return 0;
 err_init:
-	kfree(dev->lun_map);
+	dev->ops->destroy_dma_pool(dev->dma_pool);
 	return ret;
 }
 EXPORT_SYMBOL(nvm_register);
 
 void nvm_unregister(struct nvm_dev *dev)
 {
+	struct nvm_target *t, *tmp;
+
+	mutex_lock(&dev->mlock);
+	list_for_each_entry_safe(t, tmp, &dev->targets, list) {
+		if (t->dev->parent != dev)
+			continue;
+		__nvm_remove_target(t);
+	}
+	mutex_unlock(&dev->mlock);
+
 	down_write(&nvm_lock);
 	list_del(&dev->devices);
 	up_write(&nvm_lock);
@@ -844,11 +1310,6 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create)
 		return -EINVAL;
 	}
 
-	if (!dev->mt) {
-		pr_info("nvm: device has no media manager registered.\n");
-		return -ENODEV;
-	}
-
 	if (create->conf.type != NVM_CONFIG_TYPE_SIMPLE) {
 		pr_err("nvm: config type not valid\n");
 		return -EINVAL;
@@ -861,7 +1322,7 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create)
 		return -EINVAL;
 	}
 
-	return dev->mt->create_tgt(dev, create);
+	return nvm_create_tgt(dev, create);
 }
 
 static long nvm_ioctl_info(struct file *file, void __user *arg)
@@ -923,16 +1384,14 @@ static long nvm_ioctl_get_devices(struct file *file, void __user *arg)
 		struct nvm_ioctl_device_info *info = &devices->info[i];
 
 		sprintf(info->devname, "%s", dev->name);
-		if (dev->mt) {
-			info->bmversion[0] = dev->mt->version[0];
-			info->bmversion[1] = dev->mt->version[1];
-			info->bmversion[2] = dev->mt->version[2];
-			sprintf(info->bmname, "%s", dev->mt->name);
-		} else {
-			sprintf(info->bmname, "none");
-		}
 
+		/* kept for compatibility */
+		info->bmversion[0] = 1;
+		info->bmversion[1] = 0;
+		info->bmversion[2] = 0;
+		sprintf(info->bmname, "%s", "gennvm");
 		i++;
+
 		if (i > 31) {
 			pr_err("nvm: max 31 devices can be reported.\n");
 			break;
@@ -994,7 +1453,7 @@ static long nvm_ioctl_dev_remove(struct file *file, void __user *arg)
 	}
 
 	list_for_each_entry(dev, &nvm_devices, devices) {
-		ret = dev->mt->remove_tgt(dev, &remove);
+		ret = nvm_remove_tgt(dev, &remove);
 		if (!ret)
 			break;
 	}
@@ -1002,47 +1461,7 @@ static long nvm_ioctl_dev_remove(struct file *file, void __user *arg)
 	return ret;
 }
 
-static void nvm_setup_nvm_sb_info(struct nvm_sb_info *info)
-{
-	info->seqnr = 1;
-	info->erase_cnt = 0;
-	info->version = 1;
-}
-
-static long __nvm_ioctl_dev_init(struct nvm_ioctl_dev_init *init)
-{
-	struct nvm_dev *dev;
-	struct nvm_sb_info info;
-	int ret;
-
-	down_write(&nvm_lock);
-	dev = nvm_find_nvm_dev(init->dev);
-	up_write(&nvm_lock);
-	if (!dev) {
-		pr_err("nvm: device not found\n");
-		return -EINVAL;
-	}
-
-	nvm_setup_nvm_sb_info(&info);
-
-	strncpy(info.mmtype, init->mmtype, NVM_MMTYPE_LEN);
-	info.fs_ppa.ppa = -1;
-
-	if (dev->identity.cap & NVM_ID_DCAP_BBLKMGMT) {
-		ret = nvm_init_sysblock(dev, &info);
-		if (ret)
-			return ret;
-	}
-
-	memcpy(&dev->sb, &info, sizeof(struct nvm_sb_info));
-
-	down_write(&nvm_lock);
-	dev->mt = nvm_init_mgr(dev);
-	up_write(&nvm_lock);
-
-	return 0;
-}
-
+/* kept for compatibility reasons */
 static long nvm_ioctl_dev_init(struct file *file, void __user *arg)
 {
 	struct nvm_ioctl_dev_init init;
@@ -1058,15 +1477,13 @@ static long nvm_ioctl_dev_init(struct file *file, void __user *arg)
 		return -EINVAL;
 	}
 
-	init.dev[DISK_NAME_LEN - 1] = '\0';
-
-	return __nvm_ioctl_dev_init(&init);
+	return 0;
 }
 
+/* Kept for compatibility reasons */
 static long nvm_ioctl_dev_factory(struct file *file, void __user *arg)
 {
 	struct nvm_ioctl_dev_factory fact;
-	struct nvm_dev *dev;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -1079,19 +1496,6 @@ static long nvm_ioctl_dev_factory(struct file *file, void __user *arg)
 	if (fact.flags & ~(NVM_FACTORY_NR_BITS - 1))
 		return -EINVAL;
 
-	down_write(&nvm_lock);
-	dev = nvm_find_nvm_dev(fact.dev);
-	up_write(&nvm_lock);
-	if (!dev) {
-		pr_err("nvm: device not found\n");
-		return -EINVAL;
-	}
-
-	nvm_free_mgr(dev);
-
-	if (dev->identity.cap & NVM_ID_DCAP_BBLKMGMT)
-		return nvm_dev_factory(dev, fact.flags);
-
 	return 0;
 }
 
diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
deleted file mode 100644
index ca7880082d80..000000000000
--- a/drivers/lightnvm/gennvm.c
+++ /dev/null
@@ -1,657 +0,0 @@
-/*
- * Copyright (C) 2015 Matias Bjorling <m@bjorling.me>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; see the file COPYING.  If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
- * USA.
- *
- * Implementation of a general nvm manager for Open-Channel SSDs.
- */
-
-#include "gennvm.h"
-
-static struct nvm_target *gen_find_target(struct gen_dev *gn, const char *name)
-{
-	struct nvm_target *tgt;
-
-	list_for_each_entry(tgt, &gn->targets, list)
-		if (!strcmp(name, tgt->disk->disk_name))
-			return tgt;
-
-	return NULL;
-}
-
-static const struct block_device_operations gen_fops = {
-	.owner		= THIS_MODULE,
-};
-
-static int gen_reserve_luns(struct nvm_dev *dev, struct nvm_target *t,
-			    int lun_begin, int lun_end)
-{
-	int i;
-
-	for (i = lun_begin; i <= lun_end; i++) {
-		if (test_and_set_bit(i, dev->lun_map)) {
-			pr_err("nvm: lun %d already allocated\n", i);
-			goto err;
-		}
-	}
-
-	return 0;
-
-err:
-	while (--i > lun_begin)
-		clear_bit(i, dev->lun_map);
-
-	return -EBUSY;
-}
-
-static void gen_release_luns_err(struct nvm_dev *dev, int lun_begin,
-				 int lun_end)
-{
-	int i;
-
-	for (i = lun_begin; i <= lun_end; i++)
-		WARN_ON(!test_and_clear_bit(i, dev->lun_map));
-}
-
-static void gen_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev)
-{
-	struct nvm_dev *dev = tgt_dev->parent;
-	struct gen_dev_map *dev_map = tgt_dev->map;
-	int i, j;
-
-	for (i = 0; i < dev_map->nr_chnls; i++) {
-		struct gen_ch_map *ch_map = &dev_map->chnls[i];
-		int *lun_offs = ch_map->lun_offs;
-		int ch = i + ch_map->ch_off;
-
-		for (j = 0; j < ch_map->nr_luns; j++) {
-			int lun = j + lun_offs[j];
-			int lunid = (ch * dev->geo.luns_per_chnl) + lun;
-
-			WARN_ON(!test_and_clear_bit(lunid, dev->lun_map));
-		}
-
-		kfree(ch_map->lun_offs);
-	}
-
-	kfree(dev_map->chnls);
-	kfree(dev_map);
-	kfree(tgt_dev->luns);
-	kfree(tgt_dev);
-}
-
-static struct nvm_tgt_dev *gen_create_tgt_dev(struct nvm_dev *dev,
-					      int lun_begin, int lun_end)
-{
-	struct nvm_tgt_dev *tgt_dev = NULL;
-	struct gen_dev_map *dev_rmap = dev->rmap;
-	struct gen_dev_map *dev_map;
-	struct ppa_addr *luns;
-	int nr_luns = lun_end - lun_begin + 1;
-	int luns_left = nr_luns;
-	int nr_chnls = nr_luns / dev->geo.luns_per_chnl;
-	int nr_chnls_mod = nr_luns % dev->geo.luns_per_chnl;
-	int bch = lun_begin / dev->geo.luns_per_chnl;
-	int blun = lun_begin % dev->geo.luns_per_chnl;
-	int lunid = 0;
-	int lun_balanced = 1;
-	int prev_nr_luns;
-	int i, j;
-
-	nr_chnls = nr_luns / dev->geo.luns_per_chnl;
-	nr_chnls = (nr_chnls_mod == 0) ? nr_chnls : nr_chnls + 1;
-
-	dev_map = kmalloc(sizeof(struct gen_dev_map), GFP_KERNEL);
-	if (!dev_map)
-		goto err_dev;
-
-	dev_map->chnls = kcalloc(nr_chnls, sizeof(struct gen_ch_map),
-								GFP_KERNEL);
-	if (!dev_map->chnls)
-		goto err_chnls;
-
-	luns = kcalloc(nr_luns, sizeof(struct ppa_addr), GFP_KERNEL);
-	if (!luns)
-		goto err_luns;
-
-	prev_nr_luns = (luns_left > dev->geo.luns_per_chnl) ?
-					dev->geo.luns_per_chnl : luns_left;
-	for (i = 0; i < nr_chnls; i++) {
-		struct gen_ch_map *ch_rmap = &dev_rmap->chnls[i + bch];
-		int *lun_roffs = ch_rmap->lun_offs;
-		struct gen_ch_map *ch_map = &dev_map->chnls[i];
-		int *lun_offs;
-		int luns_in_chnl = (luns_left > dev->geo.luns_per_chnl) ?
-					dev->geo.luns_per_chnl : luns_left;
-
-		if (lun_balanced && prev_nr_luns != luns_in_chnl)
-			lun_balanced = 0;
-
-		ch_map->ch_off = ch_rmap->ch_off = bch;
-		ch_map->nr_luns = luns_in_chnl;
-
-		lun_offs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL);
-		if (!lun_offs)
-			goto err_ch;
-
-		for (j = 0; j < luns_in_chnl; j++) {
-			luns[lunid].ppa = 0;
-			luns[lunid].g.ch = i;
-			luns[lunid++].g.lun = j;
-
-			lun_offs[j] = blun;
-			lun_roffs[j + blun] = blun;
-		}
-
-		ch_map->lun_offs = lun_offs;
-
-		/* when starting a new channel, lun offset is reset */
-		blun = 0;
-		luns_left -= luns_in_chnl;
-	}
-
-	dev_map->nr_chnls = nr_chnls;
-
-	tgt_dev = kmalloc(sizeof(struct nvm_tgt_dev), GFP_KERNEL);
-	if (!tgt_dev)
-		goto err_ch;
-
-	memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo));
-	/* Target device only owns a portion of the physical device */
-	tgt_dev->geo.nr_chnls = nr_chnls;
-	tgt_dev->geo.nr_luns = nr_luns;
-	tgt_dev->geo.luns_per_chnl = (lun_balanced) ? prev_nr_luns : -1;
-	tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun;
-	tgt_dev->q = dev->q;
-	tgt_dev->map = dev_map;
-	tgt_dev->luns = luns;
-	memcpy(&tgt_dev->identity, &dev->identity, sizeof(struct nvm_id));
-
-	tgt_dev->parent = dev;
-
-	return tgt_dev;
-err_ch:
-	while (--i > 0)
-		kfree(dev_map->chnls[i].lun_offs);
-	kfree(luns);
-err_luns:
-	kfree(dev_map->chnls);
-err_chnls:
-	kfree(dev_map);
-err_dev:
-	return tgt_dev;
-}
-
-static int gen_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
-{
-	struct gen_dev *gn = dev->mp;
-	struct nvm_ioctl_create_simple *s = &create->conf.s;
-	struct request_queue *tqueue;
-	struct gendisk *tdisk;
-	struct nvm_tgt_type *tt;
-	struct nvm_target *t;
-	struct nvm_tgt_dev *tgt_dev;
-	void *targetdata;
-
-	tt = nvm_find_target_type(create->tgttype, 1);
-	if (!tt) {
-		pr_err("nvm: target type %s not found\n", create->tgttype);
-		return -EINVAL;
-	}
-
-	mutex_lock(&gn->lock);
-	t = gen_find_target(gn, create->tgtname);
-	if (t) {
-		pr_err("nvm: target name already exists.\n");
-		mutex_unlock(&gn->lock);
-		return -EINVAL;
-	}
-	mutex_unlock(&gn->lock);
-
-	t = kmalloc(sizeof(struct nvm_target), GFP_KERNEL);
-	if (!t)
-		return -ENOMEM;
-
-	if (gen_reserve_luns(dev, t, s->lun_begin, s->lun_end))
-		goto err_t;
-
-	tgt_dev = gen_create_tgt_dev(dev, s->lun_begin, s->lun_end);
-	if (!tgt_dev) {
-		pr_err("nvm: could not create target device\n");
-		goto err_reserve;
-	}
-
-	tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node);
-	if (!tqueue)
-		goto err_dev;
-	blk_queue_make_request(tqueue, tt->make_rq);
-
-	tdisk = alloc_disk(0);
-	if (!tdisk)
-		goto err_queue;
-
-	sprintf(tdisk->disk_name, "%s", create->tgtname);
-	tdisk->flags = GENHD_FL_EXT_DEVT;
-	tdisk->major = 0;
-	tdisk->first_minor = 0;
-	tdisk->fops = &gen_fops;
-	tdisk->queue = tqueue;
-
-	targetdata = tt->init(tgt_dev, tdisk);
-	if (IS_ERR(targetdata))
-		goto err_init;
-
-	tdisk->private_data = targetdata;
-	tqueue->queuedata = targetdata;
-
-	blk_queue_max_hw_sectors(tqueue, 8 * dev->ops->max_phys_sect);
-
-	set_capacity(tdisk, tt->capacity(targetdata));
-	add_disk(tdisk);
-
-	t->type = tt;
-	t->disk = tdisk;
-	t->dev = tgt_dev;
-
-	mutex_lock(&gn->lock);
-	list_add_tail(&t->list, &gn->targets);
-	mutex_unlock(&gn->lock);
-
-	return 0;
-err_init:
-	put_disk(tdisk);
-err_queue:
-	blk_cleanup_queue(tqueue);
-err_dev:
-	kfree(tgt_dev);
-err_reserve:
-	gen_release_luns_err(dev, s->lun_begin, s->lun_end);
-err_t:
-	kfree(t);
-	return -ENOMEM;
-}
-
-static void __gen_remove_target(struct nvm_target *t)
-{
-	struct nvm_tgt_type *tt = t->type;
-	struct gendisk *tdisk = t->disk;
-	struct request_queue *q = tdisk->queue;
-
-	del_gendisk(tdisk);
-	blk_cleanup_queue(q);
-
-	if (tt->exit)
-		tt->exit(tdisk->private_data);
-
-	gen_remove_tgt_dev(t->dev);
-	put_disk(tdisk);
-
-	list_del(&t->list);
-	kfree(t);
-}
-
-/**
- * gen_remove_tgt - Removes a target from the media manager
- * @dev:	device
- * @remove:	ioctl structure with target name to remove.
- *
- * Returns:
- * 0: on success
- * 1: on not found
- * <0: on error
- */
-static int gen_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove)
-{
-	struct gen_dev *gn = dev->mp;
-	struct nvm_target *t;
-
-	if (!gn)
-		return 1;
-
-	mutex_lock(&gn->lock);
-	t = gen_find_target(gn, remove->tgtname);
-	if (!t) {
-		mutex_unlock(&gn->lock);
-		return 1;
-	}
-	__gen_remove_target(t);
-	mutex_unlock(&gn->lock);
-
-	return 0;
-}
-
-static int gen_get_area(struct nvm_dev *dev, sector_t *lba, sector_t len)
-{
-	struct nvm_geo *geo = &dev->geo;
-	struct gen_dev *gn = dev->mp;
-	struct gen_area *area, *prev, *next;
-	sector_t begin = 0;
-	sector_t max_sectors = (geo->sec_size * dev->total_secs) >> 9;
-
-	if (len > max_sectors)
-		return -EINVAL;
-
-	area = kmalloc(sizeof(struct gen_area), GFP_KERNEL);
-	if (!area)
-		return -ENOMEM;
-
-	prev = NULL;
-
-	spin_lock(&dev->lock);
-	list_for_each_entry(next, &gn->area_list, list) {
-		if (begin + len > next->begin) {
-			begin = next->end;
-			prev = next;
-			continue;
-		}
-		break;
-	}
-
-	if ((begin + len) > max_sectors) {
-		spin_unlock(&dev->lock);
-		kfree(area);
-		return -EINVAL;
-	}
-
-	area->begin = *lba = begin;
-	area->end = begin + len;
-
-	if (prev) /* insert into sorted order */
-		list_add(&area->list, &prev->list);
-	else
-		list_add(&area->list, &gn->area_list);
-	spin_unlock(&dev->lock);
-
-	return 0;
-}
-
-static void gen_put_area(struct nvm_dev *dev, sector_t begin)
-{
-	struct gen_dev *gn = dev->mp;
-	struct gen_area *area;
-
-	spin_lock(&dev->lock);
-	list_for_each_entry(area, &gn->area_list, list) {
-		if (area->begin != begin)
-			continue;
-
-		list_del(&area->list);
-		spin_unlock(&dev->lock);
-		kfree(area);
-		return;
-	}
-	spin_unlock(&dev->lock);
-}
-
-static void gen_free(struct nvm_dev *dev)
-{
-	kfree(dev->mp);
-	kfree(dev->rmap);
-	dev->mp = NULL;
-}
-
-static int gen_register(struct nvm_dev *dev)
-{
-	struct gen_dev *gn;
-	struct gen_dev_map *dev_rmap;
-	int i, j;
-
-	if (!try_module_get(THIS_MODULE))
-		return -ENODEV;
-
-	gn = kzalloc(sizeof(struct gen_dev), GFP_KERNEL);
-	if (!gn)
-		goto err_gn;
-
-	dev_rmap = kmalloc(sizeof(struct gen_dev_map), GFP_KERNEL);
-	if (!dev_rmap)
-		goto err_rmap;
-
-	dev_rmap->chnls = kcalloc(dev->geo.nr_chnls, sizeof(struct gen_ch_map),
-								GFP_KERNEL);
-	if (!dev_rmap->chnls)
-		goto err_chnls;
-
-	for (i = 0; i < dev->geo.nr_chnls; i++) {
-		struct gen_ch_map *ch_rmap;
-		int *lun_roffs;
-		int luns_in_chnl = dev->geo.luns_per_chnl;
-
-		ch_rmap = &dev_rmap->chnls[i];
-
-		ch_rmap->ch_off = -1;
-		ch_rmap->nr_luns = luns_in_chnl;
-
-		lun_roffs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL);
-		if (!lun_roffs)
-			goto err_ch;
-
-		for (j = 0; j < luns_in_chnl; j++)
-			lun_roffs[j] = -1;
-
-		ch_rmap->lun_offs = lun_roffs;
-	}
-
-	gn->dev = dev;
-	gn->nr_luns = dev->geo.nr_luns;
-	INIT_LIST_HEAD(&gn->area_list);
-	mutex_init(&gn->lock);
-	INIT_LIST_HEAD(&gn->targets);
-	dev->mp = gn;
-	dev->rmap = dev_rmap;
-
-	return 1;
-err_ch:
-	while (--i >= 0)
-		kfree(dev_rmap->chnls[i].lun_offs);
-err_chnls:
-	kfree(dev_rmap);
-err_rmap:
-	gen_free(dev);
-err_gn:
-	module_put(THIS_MODULE);
-	return -ENOMEM;
-}
-
-static void gen_unregister(struct nvm_dev *dev)
-{
-	struct gen_dev *gn = dev->mp;
-	struct nvm_target *t, *tmp;
-
-	mutex_lock(&gn->lock);
-	list_for_each_entry_safe(t, tmp, &gn->targets, list) {
-		if (t->dev->parent != dev)
-			continue;
-		__gen_remove_target(t);
-	}
-	mutex_unlock(&gn->lock);
-
-	gen_free(dev);
-	module_put(THIS_MODULE);
-}
-
-static int gen_map_to_dev(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p)
-{
-	struct gen_dev_map *dev_map = tgt_dev->map;
-	struct gen_ch_map *ch_map = &dev_map->chnls[p->g.ch];
-	int lun_off = ch_map->lun_offs[p->g.lun];
-	struct nvm_dev *dev = tgt_dev->parent;
-	struct gen_dev_map *dev_rmap = dev->rmap;
-	struct gen_ch_map *ch_rmap;
-	int lun_roff;
-
-	p->g.ch += ch_map->ch_off;
-	p->g.lun += lun_off;
-
-	ch_rmap = &dev_rmap->chnls[p->g.ch];
-	lun_roff = ch_rmap->lun_offs[p->g.lun];
-
-	if (unlikely(ch_rmap->ch_off < 0 || lun_roff < 0)) {
-		pr_err("nvm: corrupted device partition table\n");
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static int gen_map_to_tgt(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p)
-{
-	struct nvm_dev *dev = tgt_dev->parent;
-	struct gen_dev_map *dev_rmap = dev->rmap;
-	struct gen_ch_map *ch_rmap = &dev_rmap->chnls[p->g.ch];
-	int lun_roff = ch_rmap->lun_offs[p->g.lun];
-
-	p->g.ch -= ch_rmap->ch_off;
-	p->g.lun -= lun_roff;
-
-	return 0;
-}
-
-static int gen_trans_rq(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd,
-			int flag)
-{
-	gen_trans_fn *f;
-	int i;
-	int ret = 0;
-
-	f = (flag == TRANS_TGT_TO_DEV) ? gen_map_to_dev : gen_map_to_tgt;
-
-	if (rqd->nr_ppas == 1)
-		return f(tgt_dev, &rqd->ppa_addr);
-
-	for (i = 0; i < rqd->nr_ppas; i++) {
-		ret = f(tgt_dev, &rqd->ppa_list[i]);
-		if (ret)
-			goto out;
-	}
-
-out:
-	return ret;
-}
-
-static void gen_end_io(struct nvm_rq *rqd)
-{
-	struct nvm_tgt_dev *tgt_dev = rqd->dev;
-	struct nvm_tgt_instance *ins = rqd->ins;
-
-	/* Convert address space */
-	if (tgt_dev)
-		gen_trans_rq(tgt_dev, rqd, TRANS_DEV_TO_TGT);
-
-	ins->tt->end_io(rqd);
-}
-
-static int gen_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
-{
-	struct nvm_dev *dev = tgt_dev->parent;
-
-	if (!dev->ops->submit_io)
-		return -ENODEV;
-
-	/* Convert address space */
-	gen_trans_rq(tgt_dev, rqd, TRANS_TGT_TO_DEV);
-	nvm_generic_to_addr_mode(dev, rqd);
-
-	rqd->dev = tgt_dev;
-	rqd->end_io = gen_end_io;
-	return dev->ops->submit_io(dev, rqd);
-}
-
-static int gen_erase_blk(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p,
-			 int flags)
-{
-	/* Convert address space */
-	gen_map_to_dev(tgt_dev, p);
-
-	return nvm_erase_ppa(tgt_dev->parent, p, 1, flags);
-}
-
-static struct ppa_addr gen_trans_ppa(struct nvm_tgt_dev *tgt_dev,
-				     struct ppa_addr p, int direction)
-{
-	gen_trans_fn *f;
-	struct ppa_addr ppa = p;
-
-	f = (direction == TRANS_TGT_TO_DEV) ? gen_map_to_dev : gen_map_to_tgt;
-	f(tgt_dev, &ppa);
-
-	return ppa;
-}
-
-static void gen_part_to_tgt(struct nvm_dev *dev, sector_t *entries,
-			       int len)
-{
-	struct nvm_geo *geo = &dev->geo;
-	struct gen_dev_map *dev_rmap = dev->rmap;
-	u64 i;
-
-	for (i = 0; i < len; i++) {
-		struct gen_ch_map *ch_rmap;
-		int *lun_roffs;
-		struct ppa_addr gaddr;
-		u64 pba = le64_to_cpu(entries[i]);
-		int off;
-		u64 diff;
-
-		if (!pba)
-			continue;
-
-		gaddr = linear_to_generic_addr(geo, pba);
-		ch_rmap = &dev_rmap->chnls[gaddr.g.ch];
-		lun_roffs = ch_rmap->lun_offs;
-
-		off = gaddr.g.ch * geo->luns_per_chnl + gaddr.g.lun;
-
-		diff = ((ch_rmap->ch_off * geo->luns_per_chnl) +
-				(lun_roffs[gaddr.g.lun])) * geo->sec_per_lun;
-
-		entries[i] -= cpu_to_le64(diff);
-	}
-}
-
-static struct nvmm_type gen = {
-	.name			= "gennvm",
-	.version		= {0, 1, 0},
-
-	.register_mgr		= gen_register,
-	.unregister_mgr		= gen_unregister,
-
-	.create_tgt		= gen_create_tgt,
-	.remove_tgt		= gen_remove_tgt,
-
-	.submit_io		= gen_submit_io,
-	.erase_blk		= gen_erase_blk,
-
-	.get_area		= gen_get_area,
-	.put_area		= gen_put_area,
-
-	.trans_ppa		= gen_trans_ppa,
-	.part_to_tgt		= gen_part_to_tgt,
-};
-
-static int __init gen_module_init(void)
-{
-	return nvm_register_mgr(&gen);
-}
-
-static void gen_module_exit(void)
-{
-	nvm_unregister_mgr(&gen);
-}
-
-module_init(gen_module_init);
-module_exit(gen_module_exit);
-MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("General media manager for Open-Channel SSDs");
diff --git a/drivers/lightnvm/gennvm.h b/drivers/lightnvm/gennvm.h
deleted file mode 100644
index 6a4b3f368848..000000000000
--- a/drivers/lightnvm/gennvm.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright: Matias Bjorling <mb@bjorling.me>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- */
-
-#ifndef GENNVM_H_
-#define GENNVM_H_
-
-#include <linux/module.h>
-#include <linux/vmalloc.h>
-
-#include <linux/lightnvm.h>
-
-struct gen_dev {
-	struct nvm_dev *dev;
-
-	int nr_luns;
-	struct list_head area_list;
-
-	struct mutex lock;
-	struct list_head targets;
-};
-
-/* Map between virtual and physical channel and lun */
-struct gen_ch_map {
-	int ch_off;
-	int nr_luns;
-	int *lun_offs;
-};
-
-struct gen_dev_map {
-	struct gen_ch_map *chnls;
-	int nr_chnls;
-};
-
-struct gen_area {
-	struct list_head list;
-	sector_t begin;
-	sector_t end;	/* end is excluded */
-};
-
-static inline void *ch_map_to_lun_offs(struct gen_ch_map *ch_map)
-{
-	return ch_map + 1;
-}
-
-typedef int (gen_trans_fn)(struct nvm_tgt_dev *, struct ppa_addr *);
-
-#define gen_for_each_lun(bm, lun, i) \
-		for ((i) = 0, lun = &(bm)->luns[0]; \
-			(i) < (bm)->nr_luns; (i)++, lun = &(bm)->luns[(i)])
-
-#endif /* GENNVM_H_ */
diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c
deleted file mode 100644
index 12002bf4efc2..000000000000
--- a/drivers/lightnvm/sysblk.c
+++ /dev/null
@@ -1,733 +0,0 @@
-/*
- * Copyright (C) 2015 Matias Bjorling. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; see the file COPYING.  If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
- * USA.
- *
- */
-
-#include <linux/lightnvm.h>
-
-#define MAX_SYSBLKS 3	/* remember to update mapping scheme on change */
-#define MAX_BLKS_PR_SYSBLK 2 /* 2 blks with 256 pages and 3000 erases
-			      * enables ~1.5M updates per sysblk unit
-			      */
-
-struct sysblk_scan {
-	/* A row is a collection of flash blocks for a system block. */
-	int nr_rows;
-	int row;
-	int act_blk[MAX_SYSBLKS];
-
-	int nr_ppas;
-	struct ppa_addr ppas[MAX_SYSBLKS * MAX_BLKS_PR_SYSBLK];/* all sysblks */
-};
-
-static inline int scan_ppa_idx(int row, int blkid)
-{
-	return (row * MAX_BLKS_PR_SYSBLK) + blkid;
-}
-
-static void nvm_sysblk_to_cpu(struct nvm_sb_info *info,
-			      struct nvm_system_block *sb)
-{
-	info->seqnr = be32_to_cpu(sb->seqnr);
-	info->erase_cnt = be32_to_cpu(sb->erase_cnt);
-	info->version = be16_to_cpu(sb->version);
-	strncpy(info->mmtype, sb->mmtype, NVM_MMTYPE_LEN);
-	info->fs_ppa.ppa = be64_to_cpu(sb->fs_ppa);
-}
-
-static void nvm_cpu_to_sysblk(struct nvm_system_block *sb,
-			      struct nvm_sb_info *info)
-{
-	sb->magic = cpu_to_be32(NVM_SYSBLK_MAGIC);
-	sb->seqnr = cpu_to_be32(info->seqnr);
-	sb->erase_cnt = cpu_to_be32(info->erase_cnt);
-	sb->version = cpu_to_be16(info->version);
-	strncpy(sb->mmtype, info->mmtype, NVM_MMTYPE_LEN);
-	sb->fs_ppa = cpu_to_be64(info->fs_ppa.ppa);
-}
-
-static int nvm_setup_sysblks(struct nvm_dev *dev, struct ppa_addr *sysblk_ppas)
-{
-	struct nvm_geo *geo = &dev->geo;
-	int nr_rows = min_t(int, MAX_SYSBLKS, geo->nr_chnls);
-	int i;
-
-	for (i = 0; i < nr_rows; i++)
-		sysblk_ppas[i].ppa = 0;
-
-	/* if possible, place sysblk at first channel, middle channel and last
-	 * channel of the device. If not, create only one or two sys blocks
-	 */
-	switch (geo->nr_chnls) {
-	case 2:
-		sysblk_ppas[1].g.ch = 1;
-		/* fall-through */
-	case 1:
-		sysblk_ppas[0].g.ch = 0;
-		break;
-	default:
-		sysblk_ppas[0].g.ch = 0;
-		sysblk_ppas[1].g.ch = geo->nr_chnls / 2;
-		sysblk_ppas[2].g.ch = geo->nr_chnls - 1;
-		break;
-	}
-
-	return nr_rows;
-}
-
-static void nvm_setup_sysblk_scan(struct nvm_dev *dev, struct sysblk_scan *s,
-						struct ppa_addr *sysblk_ppas)
-{
-	memset(s, 0, sizeof(struct sysblk_scan));
-	s->nr_rows = nvm_setup_sysblks(dev, sysblk_ppas);
-}
-
-static int sysblk_get_free_blks(struct nvm_dev *dev, struct ppa_addr ppa,
-					u8 *blks, int nr_blks,
-					struct sysblk_scan *s)
-{
-	struct ppa_addr *sppa;
-	int i, blkid = 0;
-
-	nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks);
-	if (nr_blks < 0)
-		return nr_blks;
-
-	for (i = 0; i < nr_blks; i++) {
-		if (blks[i] == NVM_BLK_T_HOST)
-			return -EEXIST;
-
-		if (blks[i] != NVM_BLK_T_FREE)
-			continue;
-
-		sppa = &s->ppas[scan_ppa_idx(s->row, blkid)];
-		sppa->g.ch = ppa.g.ch;
-		sppa->g.lun = ppa.g.lun;
-		sppa->g.blk = i;
-		s->nr_ppas++;
-		blkid++;
-
-		pr_debug("nvm: use (%u %u %u) as sysblk\n",
-					sppa->g.ch, sppa->g.lun, sppa->g.blk);
-		if (blkid > MAX_BLKS_PR_SYSBLK - 1)
-			return 0;
-	}
-
-	pr_err("nvm: sysblk failed get sysblk\n");
-	return -EINVAL;
-}
-
-static int sysblk_get_host_blks(struct nvm_dev *dev, struct ppa_addr ppa,
-					u8 *blks, int nr_blks,
-					struct sysblk_scan *s)
-{
-	int i, nr_sysblk = 0;
-
-	nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks);
-	if (nr_blks < 0)
-		return nr_blks;
-
-	for (i = 0; i < nr_blks; i++) {
-		if (blks[i] != NVM_BLK_T_HOST)
-			continue;
-
-		if (s->nr_ppas == MAX_BLKS_PR_SYSBLK * MAX_SYSBLKS) {
-			pr_err("nvm: too many host blks\n");
-			return -EINVAL;
-		}
-
-		ppa.g.blk = i;
-
-		s->ppas[scan_ppa_idx(s->row, nr_sysblk)] = ppa;
-		s->nr_ppas++;
-		nr_sysblk++;
-	}
-
-	return 0;
-}
-
-static int nvm_get_all_sysblks(struct nvm_dev *dev, struct sysblk_scan *s,
-				struct ppa_addr *ppas, int get_free)
-{
-	struct nvm_geo *geo = &dev->geo;
-	int i, nr_blks, ret = 0;
-	u8 *blks;
-
-	s->nr_ppas = 0;
-	nr_blks = geo->blks_per_lun * geo->plane_mode;
-
-	blks = kmalloc(nr_blks, GFP_KERNEL);
-	if (!blks)
-		return -ENOMEM;
-
-	for (i = 0; i < s->nr_rows; i++) {
-		s->row = i;
-
-		ret = nvm_get_bb_tbl(dev, ppas[i], blks);
-		if (ret) {
-			pr_err("nvm: failed bb tbl for ppa (%u %u)\n",
-							ppas[i].g.ch,
-							ppas[i].g.blk);
-			goto err_get;
-		}
-
-		if (get_free)
-			ret = sysblk_get_free_blks(dev, ppas[i], blks, nr_blks,
-									s);
-		else
-			ret = sysblk_get_host_blks(dev, ppas[i], blks, nr_blks,
-									s);
-
-		if (ret)
-			goto err_get;
-	}
-
-err_get:
-	kfree(blks);
-	return ret;
-}
-
-/*
- * scans a block for latest sysblk.
- * Returns:
- *	0 - newer sysblk not found. PPA is updated to latest page.
- *	1 - newer sysblk found and stored in *cur. PPA is updated to
- *	    next valid page.
- *	<0- error.
- */
-static int nvm_scan_block(struct nvm_dev *dev, struct ppa_addr *ppa,
-						struct nvm_system_block *sblk)
-{
-	struct nvm_geo *geo = &dev->geo;
-	struct nvm_system_block *cur;
-	int pg, ret, found = 0;
-
-	/* the full buffer for a flash page is allocated. Only the first of it
-	 * contains the system block information
-	 */
-	cur = kmalloc(geo->pfpg_size, GFP_KERNEL);
-	if (!cur)
-		return -ENOMEM;
-
-	/* perform linear scan through the block */
-	for (pg = 0; pg < dev->lps_per_blk; pg++) {
-		ppa->g.pg = ppa_to_slc(dev, pg);
-
-		ret = nvm_submit_ppa(dev, ppa, 1, NVM_OP_PREAD, NVM_IO_SLC_MODE,
-							cur, geo->pfpg_size);
-		if (ret) {
-			if (ret == NVM_RSP_ERR_EMPTYPAGE) {
-				pr_debug("nvm: sysblk scan empty ppa (%u %u %u %u)\n",
-							ppa->g.ch,
-							ppa->g.lun,
-							ppa->g.blk,
-							ppa->g.pg);
-				break;
-			}
-			pr_err("nvm: read failed (%x) for ppa (%u %u %u %u)",
-							ret,
-							ppa->g.ch,
-							ppa->g.lun,
-							ppa->g.blk,
-							ppa->g.pg);
-			break; /* if we can't read a page, continue to the
-				* next blk
-				*/
-		}
-
-		if (be32_to_cpu(cur->magic) != NVM_SYSBLK_MAGIC) {
-			pr_debug("nvm: scan break for ppa (%u %u %u %u)\n",
-							ppa->g.ch,
-							ppa->g.lun,
-							ppa->g.blk,
-							ppa->g.pg);
-			break; /* last valid page already found */
-		}
-
-		if (be32_to_cpu(cur->seqnr) < be32_to_cpu(sblk->seqnr))
-			continue;
-
-		memcpy(sblk, cur, sizeof(struct nvm_system_block));
-		found = 1;
-	}
-
-	kfree(cur);
-
-	return found;
-}
-
-static int nvm_sysblk_set_bb_tbl(struct nvm_dev *dev, struct sysblk_scan *s,
-								int type)
-{
-	return nvm_set_bb_tbl(dev, s->ppas, s->nr_ppas, type);
-}
-
-static int nvm_write_and_verify(struct nvm_dev *dev, struct nvm_sb_info *info,
-							struct sysblk_scan *s)
-{
-	struct nvm_geo *geo = &dev->geo;
-	struct nvm_system_block nvmsb;
-	void *buf;
-	int i, sect, ret = 0;
-	struct ppa_addr *ppas;
-
-	nvm_cpu_to_sysblk(&nvmsb, info);
-
-	buf = kzalloc(geo->pfpg_size, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-	memcpy(buf, &nvmsb, sizeof(struct nvm_system_block));
-
-	ppas = kcalloc(geo->sec_per_pg, sizeof(struct ppa_addr), GFP_KERNEL);
-	if (!ppas) {
-		ret = -ENOMEM;
-		goto err;
-	}
-
-	/* Write and verify */
-	for (i = 0; i < s->nr_rows; i++) {
-		ppas[0] = s->ppas[scan_ppa_idx(i, s->act_blk[i])];
-
-		pr_debug("nvm: writing sysblk to ppa (%u %u %u %u)\n",
-							ppas[0].g.ch,
-							ppas[0].g.lun,
-							ppas[0].g.blk,
-							ppas[0].g.pg);
-
-		/* Expand to all sectors within a flash page */
-		if (geo->sec_per_pg > 1) {
-			for (sect = 1; sect < geo->sec_per_pg; sect++) {
-				ppas[sect].ppa = ppas[0].ppa;
-				ppas[sect].g.sec = sect;
-			}
-		}
-
-		ret = nvm_submit_ppa(dev, ppas, geo->sec_per_pg, NVM_OP_PWRITE,
-					NVM_IO_SLC_MODE, buf, geo->pfpg_size);
-		if (ret) {
-			pr_err("nvm: sysblk failed program (%u %u %u)\n",
-							ppas[0].g.ch,
-							ppas[0].g.lun,
-							ppas[0].g.blk);
-			break;
-		}
-
-		ret = nvm_submit_ppa(dev, ppas, geo->sec_per_pg, NVM_OP_PREAD,
-					NVM_IO_SLC_MODE, buf, geo->pfpg_size);
-		if (ret) {
-			pr_err("nvm: sysblk failed read (%u %u %u)\n",
-							ppas[0].g.ch,
-							ppas[0].g.lun,
-							ppas[0].g.blk);
-			break;
-		}
-
-		if (memcmp(buf, &nvmsb, sizeof(struct nvm_system_block))) {
-			pr_err("nvm: sysblk failed verify (%u %u %u)\n",
-							ppas[0].g.ch,
-							ppas[0].g.lun,
-							ppas[0].g.blk);
-			ret = -EINVAL;
-			break;
-		}
-	}
-
-	kfree(ppas);
-err:
-	kfree(buf);
-
-	return ret;
-}
-
-static int nvm_prepare_new_sysblks(struct nvm_dev *dev, struct sysblk_scan *s)
-{
-	int i, ret;
-	unsigned long nxt_blk;
-	struct ppa_addr *ppa;
-
-	for (i = 0; i < s->nr_rows; i++) {
-		nxt_blk = (s->act_blk[i] + 1) % MAX_BLKS_PR_SYSBLK;
-		ppa = &s->ppas[scan_ppa_idx(i, nxt_blk)];
-		ppa->g.pg = ppa_to_slc(dev, 0);
-
-		ret = nvm_erase_ppa(dev, ppa, 1, 0);
-		if (ret)
-			return ret;
-
-		s->act_blk[i] = nxt_blk;
-	}
-
-	return 0;
-}
-
-int nvm_get_sysblock(struct nvm_dev *dev, struct nvm_sb_info *info)
-{
-	struct ppa_addr sysblk_ppas[MAX_SYSBLKS];
-	struct sysblk_scan s;
-	struct nvm_system_block *cur;
-	int i, j, found = 0;
-	int ret = -ENOMEM;
-
-	/*
-	 * 1. setup sysblk locations
-	 * 2. get bad block list
-	 * 3. filter on host-specific (type 3)
-	 * 4. iterate through all and find the highest seq nr.
-	 * 5. return superblock information
-	 */
-
-	if (!dev->ops->get_bb_tbl)
-		return -EINVAL;
-
-	nvm_setup_sysblk_scan(dev, &s, sysblk_ppas);
-
-	mutex_lock(&dev->mlock);
-	ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, 0);
-	if (ret)
-		goto err_sysblk;
-
-	/* no sysblocks initialized */
-	if (!s.nr_ppas)
-		goto err_sysblk;
-
-	cur = kzalloc(sizeof(struct nvm_system_block), GFP_KERNEL);
-	if (!cur)
-		goto err_sysblk;
-
-	/* find the latest block across all sysblocks */
-	for (i = 0; i < s.nr_rows; i++) {
-		for (j = 0; j < MAX_BLKS_PR_SYSBLK; j++) {
-			struct ppa_addr ppa = s.ppas[scan_ppa_idx(i, j)];
-
-			ret = nvm_scan_block(dev, &ppa, cur);
-			if (ret > 0)
-				found = 1;
-			else if (ret < 0)
-				break;
-		}
-	}
-
-	nvm_sysblk_to_cpu(info, cur);
-
-	kfree(cur);
-err_sysblk:
-	mutex_unlock(&dev->mlock);
-
-	if (found)
-		return 1;
-	return ret;
-}
-
-int nvm_update_sysblock(struct nvm_dev *dev, struct nvm_sb_info *new)
-{
-	/* 1. for each latest superblock
-	 * 2. if room
-	 *    a. write new flash page entry with the updated information
-	 * 3. if no room
-	 *    a. find next available block on lun (linear search)
-	 *       if none, continue to next lun
-	 *       if none at all, report error. also report that it wasn't
-	 *       possible to write to all superblocks.
-	 *    c. write data to block.
-	 */
-	struct ppa_addr sysblk_ppas[MAX_SYSBLKS];
-	struct sysblk_scan s;
-	struct nvm_system_block *cur;
-	int i, j, ppaidx, found = 0;
-	int ret = -ENOMEM;
-
-	if (!dev->ops->get_bb_tbl)
-		return -EINVAL;
-
-	nvm_setup_sysblk_scan(dev, &s, sysblk_ppas);
-
-	mutex_lock(&dev->mlock);
-	ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, 0);
-	if (ret)
-		goto err_sysblk;
-
-	cur = kzalloc(sizeof(struct nvm_system_block), GFP_KERNEL);
-	if (!cur)
-		goto err_sysblk;
-
-	/* Get the latest sysblk for each sysblk row */
-	for (i = 0; i < s.nr_rows; i++) {
-		found = 0;
-		for (j = 0; j < MAX_BLKS_PR_SYSBLK; j++) {
-			ppaidx = scan_ppa_idx(i, j);
-			ret = nvm_scan_block(dev, &s.ppas[ppaidx], cur);
-			if (ret > 0) {
-				s.act_blk[i] = j;
-				found = 1;
-			} else if (ret < 0)
-				break;
-		}
-	}
-
-	if (!found) {
-		pr_err("nvm: no valid sysblks found to update\n");
-		ret = -EINVAL;
-		goto err_cur;
-	}
-
-	/*
-	 * All sysblocks found. Check that they have same page id in their flash
-	 * blocks
-	 */
-	for (i = 1; i < s.nr_rows; i++) {
-		struct ppa_addr l = s.ppas[scan_ppa_idx(0, s.act_blk[0])];
-		struct ppa_addr r = s.ppas[scan_ppa_idx(i, s.act_blk[i])];
-
-		if (l.g.pg != r.g.pg) {
-			pr_err("nvm: sysblks not on same page. Previous update failed.\n");
-			ret = -EINVAL;
-			goto err_cur;
-		}
-	}
-
-	/*
-	 * Check that there haven't been another update to the seqnr since we
-	 * began
-	 */
-	if ((new->seqnr - 1) != be32_to_cpu(cur->seqnr)) {
-		pr_err("nvm: seq is not sequential\n");
-		ret = -EINVAL;
-		goto err_cur;
-	}
-
-	/*
-	 * When all pages in a block has been written, a new block is selected
-	 * and writing is performed on the new block.
-	 */
-	if (s.ppas[scan_ppa_idx(0, s.act_blk[0])].g.pg ==
-						dev->lps_per_blk - 1) {
-		ret = nvm_prepare_new_sysblks(dev, &s);
-		if (ret)
-			goto err_cur;
-	}
-
-	ret = nvm_write_and_verify(dev, new, &s);
-err_cur:
-	kfree(cur);
-err_sysblk:
-	mutex_unlock(&dev->mlock);
-
-	return ret;
-}
-
-int nvm_init_sysblock(struct nvm_dev *dev, struct nvm_sb_info *info)
-{
-	struct nvm_geo *geo = &dev->geo;
-	struct ppa_addr sysblk_ppas[MAX_SYSBLKS];
-	struct sysblk_scan s;
-	int ret;
-
-	/*
-	 * 1. select master blocks and select first available blks
-	 * 2. get bad block list
-	 * 3. mark MAX_SYSBLKS block as host-based device allocated.
-	 * 4. write and verify data to block
-	 */
-
-	if (!dev->ops->get_bb_tbl || !dev->ops->set_bb_tbl)
-		return -EINVAL;
-
-	if (!(geo->mccap & NVM_ID_CAP_SLC) || !dev->lps_per_blk) {
-		pr_err("nvm: memory does not support SLC access\n");
-		return -EINVAL;
-	}
-
-	/* Index all sysblocks and mark them as host-driven */
-	nvm_setup_sysblk_scan(dev, &s, sysblk_ppas);
-
-	mutex_lock(&dev->mlock);
-	ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, 1);
-	if (ret)
-		goto err_mark;
-
-	ret = nvm_sysblk_set_bb_tbl(dev, &s, NVM_BLK_T_HOST);
-	if (ret)
-		goto err_mark;
-
-	/* Write to the first block of each row */
-	ret = nvm_write_and_verify(dev, info, &s);
-err_mark:
-	mutex_unlock(&dev->mlock);
-	return ret;
-}
-
-static int factory_nblks(int nblks)
-{
-	/* Round up to nearest BITS_PER_LONG */
-	return (nblks + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
-}
-
-static unsigned int factory_blk_offset(struct nvm_geo *geo, struct ppa_addr ppa)
-{
-	int nblks = factory_nblks(geo->blks_per_lun);
-
-	return ((ppa.g.ch * geo->luns_per_chnl * nblks) + (ppa.g.lun * nblks)) /
-								BITS_PER_LONG;
-}
-
-static int nvm_factory_blks(struct nvm_dev *dev, struct ppa_addr ppa,
-					u8 *blks, int nr_blks,
-					unsigned long *blk_bitmap, int flags)
-{
-	int i, lunoff;
-
-	nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks);
-	if (nr_blks < 0)
-		return nr_blks;
-
-	lunoff = factory_blk_offset(&dev->geo, ppa);
-
-	/* non-set bits correspond to the block must be erased */
-	for (i = 0; i < nr_blks; i++) {
-		switch (blks[i]) {
-		case NVM_BLK_T_FREE:
-			if (flags & NVM_FACTORY_ERASE_ONLY_USER)
-				set_bit(i, &blk_bitmap[lunoff]);
-			break;
-		case NVM_BLK_T_HOST:
-			if (!(flags & NVM_FACTORY_RESET_HOST_BLKS))
-				set_bit(i, &blk_bitmap[lunoff]);
-			break;
-		case NVM_BLK_T_GRWN_BAD:
-			if (!(flags & NVM_FACTORY_RESET_GRWN_BBLKS))
-				set_bit(i, &blk_bitmap[lunoff]);
-			break;
-		default:
-			set_bit(i, &blk_bitmap[lunoff]);
-			break;
-		}
-	}
-
-	return 0;
-}
-
-static int nvm_fact_get_blks(struct nvm_dev *dev, struct ppa_addr *erase_list,
-					int max_ppas, unsigned long *blk_bitmap)
-{
-	struct nvm_geo *geo = &dev->geo;
-	struct ppa_addr ppa;
-	int ch, lun, blkid, idx, done = 0, ppa_cnt = 0;
-	unsigned long *offset;
-
-	while (!done) {
-		done = 1;
-		nvm_for_each_lun_ppa(geo, ppa, ch, lun) {
-			idx = factory_blk_offset(geo, ppa);
-			offset = &blk_bitmap[idx];
-
-			blkid = find_first_zero_bit(offset, geo->blks_per_lun);
-			if (blkid >= geo->blks_per_lun)
-				continue;
-			set_bit(blkid, offset);
-
-			ppa.g.blk = blkid;
-			pr_debug("nvm: erase ppa (%u %u %u)\n",
-							ppa.g.ch,
-							ppa.g.lun,
-							ppa.g.blk);
-
-			erase_list[ppa_cnt] = ppa;
-			ppa_cnt++;
-			done = 0;
-
-			if (ppa_cnt == max_ppas)
-				return ppa_cnt;
-		}
-	}
-
-	return ppa_cnt;
-}
-
-static int nvm_fact_select_blks(struct nvm_dev *dev, unsigned long *blk_bitmap,
-								int flags)
-{
-	struct nvm_geo *geo = &dev->geo;
-	struct ppa_addr ppa;
-	int ch, lun, nr_blks, ret = 0;
-	u8 *blks;
-
-	nr_blks = geo->blks_per_lun * geo->plane_mode;
-	blks = kmalloc(nr_blks, GFP_KERNEL);
-	if (!blks)
-		return -ENOMEM;
-
-	nvm_for_each_lun_ppa(geo, ppa, ch, lun) {
-		ret = nvm_get_bb_tbl(dev, ppa, blks);
-		if (ret)
-			pr_err("nvm: failed bb tbl for ch%u lun%u\n",
-							ppa.g.ch, ppa.g.blk);
-
-		ret = nvm_factory_blks(dev, ppa, blks, nr_blks, blk_bitmap,
-									flags);
-		if (ret)
-			break;
-	}
-
-	kfree(blks);
-	return ret;
-}
-
-int nvm_dev_factory(struct nvm_dev *dev, int flags)
-{
-	struct nvm_geo *geo = &dev->geo;
-	struct ppa_addr *ppas;
-	int ppa_cnt, ret = -ENOMEM;
-	int max_ppas = dev->ops->max_phys_sect / geo->nr_planes;
-	struct ppa_addr sysblk_ppas[MAX_SYSBLKS];
-	struct sysblk_scan s;
-	unsigned long *blk_bitmap;
-
-	blk_bitmap = kzalloc(factory_nblks(geo->blks_per_lun) * geo->nr_luns,
-								GFP_KERNEL);
-	if (!blk_bitmap)
-		return ret;
-
-	ppas = kcalloc(max_ppas, sizeof(struct ppa_addr), GFP_KERNEL);
-	if (!ppas)
-		goto err_blks;
-
-	/* create list of blks to be erased */
-	ret = nvm_fact_select_blks(dev, blk_bitmap, flags);
-	if (ret)
-		goto err_ppas;
-
-	/* continue to erase until list of blks until empty */
-	while ((ppa_cnt =
-			nvm_fact_get_blks(dev, ppas, max_ppas, blk_bitmap)) > 0)
-		nvm_erase_ppa(dev, ppas, ppa_cnt, 0);
-
-	/* mark host reserved blocks free */
-	if (flags & NVM_FACTORY_RESET_HOST_BLKS) {
-		nvm_setup_sysblk_scan(dev, &s, sysblk_ppas);
-		mutex_lock(&dev->mlock);
-		ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, 0);
-		if (!ret)
-			ret = nvm_sysblk_set_bb_tbl(dev, &s, NVM_BLK_T_FREE);
-		mutex_unlock(&dev->mlock);
-	}
-err_ppas:
-	kfree(ppas);
-err_blks:
-	kfree(blk_bitmap);
-	return ret;
-}
-EXPORT_SYMBOL(nvm_dev_factory);
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 588d4a34c083..ad54ec9c986f 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -372,7 +372,7 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb,
 		}
 
 		/* Transform physical address to target address space */
-		nvmdev->mt->part_to_tgt(nvmdev, entries, cmd_nlb);
+		nvm_part_to_tgt(nvmdev, entries, cmd_nlb);
 
 		if (update_l2p(cmd_slba, cmd_nlb, entries, priv)) {
 			ret = -EINTR;
@@ -633,10 +633,9 @@ static ssize_t nvm_dev_attr_show(struct device *dev,
 		return scnprintf(page, PAGE_SIZE, "%u\n", id->cap);
 	} else if (strcmp(attr->name, "device_mode") == 0) {
 		return scnprintf(page, PAGE_SIZE, "%u\n", id->dom);
+	/* kept for compatibility */
 	} else if (strcmp(attr->name, "media_manager") == 0) {
-		if (!ndev->mt)
-			return scnprintf(page, PAGE_SIZE, "%s\n", "none");
-		return scnprintf(page, PAGE_SIZE, "%s\n", ndev->mt->name);
+		return scnprintf(page, PAGE_SIZE, "%s\n", "gennvm");
 	} else if (strcmp(attr->name, "ppa_format") == 0) {
 		return scnprintf(page, PAGE_SIZE,
 			"0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 7c273bbc5351..84309fe27472 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -80,8 +80,6 @@ struct nvm_dev_ops {
 	unsigned int		max_phys_sect;
 };
 
-
-
 #ifdef CONFIG_NVM
 
 #include <linux/blkdev.h>
@@ -272,15 +270,6 @@ enum {
 	NVM_BLK_ST_BAD =	0x8,	/* Bad block */
 };
 
-/* system block cpu representation */
-struct nvm_sb_info {
-	unsigned long		seqnr;
-	unsigned long		erase_cnt;
-	unsigned int		version;
-	char			mmtype[NVM_MMTYPE_LEN];
-	struct ppa_addr		fs_ppa;
-};
-
 /* Device generic information */
 struct nvm_geo {
 	int nr_chnls;
@@ -308,6 +297,7 @@ struct nvm_geo {
 	int sec_per_lun;
 };
 
+/* sub-device structure */
 struct nvm_tgt_dev {
 	/* Device information */
 	struct nvm_geo geo;
@@ -329,17 +319,10 @@ struct nvm_dev {
 
 	struct list_head devices;
 
-	/* Media manager */
-	struct nvmm_type *mt;
-	void *mp;
-
-	/* System blocks */
-	struct nvm_sb_info sb;
-
 	/* Device information */
 	struct nvm_geo geo;
 
-	/* lower page table */
+	  /* lower page table */
 	int lps_per_blk;
 	int *lptbl;
 
@@ -359,6 +342,10 @@ struct nvm_dev {
 
 	struct mutex mlock;
 	spinlock_t lock;
+
+	/* target management */
+	struct list_head area_list;
+	struct list_head targets;
 };
 
 static inline struct ppa_addr linear_to_generic_addr(struct nvm_geo *geo,
@@ -452,11 +439,6 @@ static inline int ppa_cmp_blk(struct ppa_addr ppa1, struct ppa_addr ppa2)
 					(ppa1.g.blk == ppa2.g.blk));
 }
 
-static inline int ppa_to_slc(struct nvm_dev *dev, int slc_pg)
-{
-	return dev->lptbl[slc_pg];
-}
-
 typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *);
 typedef sector_t (nvm_tgt_capacity_fn)(void *);
 typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *);
@@ -487,49 +469,6 @@ extern void nvm_unregister_tgt_type(struct nvm_tgt_type *);
 extern void *nvm_dev_dma_alloc(struct nvm_dev *, gfp_t, dma_addr_t *);
 extern void nvm_dev_dma_free(struct nvm_dev *, void *, dma_addr_t);
 
-typedef int (nvmm_register_fn)(struct nvm_dev *);
-typedef void (nvmm_unregister_fn)(struct nvm_dev *);
-
-typedef int (nvmm_create_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_create *);
-typedef int (nvmm_remove_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_remove *);
-typedef int (nvmm_submit_io_fn)(struct nvm_tgt_dev *, struct nvm_rq *);
-typedef int (nvmm_erase_blk_fn)(struct nvm_tgt_dev *, struct ppa_addr *, int);
-typedef int (nvmm_get_area_fn)(struct nvm_dev *, sector_t *, sector_t);
-typedef void (nvmm_put_area_fn)(struct nvm_dev *, sector_t);
-typedef struct ppa_addr (nvmm_trans_ppa_fn)(struct nvm_tgt_dev *,
-					    struct ppa_addr, int);
-typedef void (nvmm_part_to_tgt_fn)(struct nvm_dev *, sector_t*, int);
-
-enum {
-	TRANS_TGT_TO_DEV =	0x0,
-	TRANS_DEV_TO_TGT =	0x1,
-};
-
-struct nvmm_type {
-	const char *name;
-	unsigned int version[3];
-
-	nvmm_register_fn *register_mgr;
-	nvmm_unregister_fn *unregister_mgr;
-
-	nvmm_create_tgt_fn *create_tgt;
-	nvmm_remove_tgt_fn *remove_tgt;
-
-	nvmm_submit_io_fn *submit_io;
-	nvmm_erase_blk_fn *erase_blk;
-
-	nvmm_get_area_fn *get_area;
-	nvmm_put_area_fn *put_area;
-
-	nvmm_trans_ppa_fn *trans_ppa;
-	nvmm_part_to_tgt_fn *part_to_tgt;
-
-	struct list_head list;
-};
-
-extern int nvm_register_mgr(struct nvmm_type *);
-extern void nvm_unregister_mgr(struct nvmm_type *);
-
 extern struct nvm_dev *nvm_alloc_dev(int);
 extern int nvm_register(struct nvm_dev *);
 extern void nvm_unregister(struct nvm_dev *);
@@ -559,31 +498,9 @@ extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int);
 extern int nvm_get_bb_tbl(struct nvm_dev *, struct ppa_addr, u8 *);
 extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *);
 
-/* sysblk.c */
-#define NVM_SYSBLK_MAGIC 0x4E564D53 /* "NVMS" */
-
-/* system block on disk representation */
-struct nvm_system_block {
-	__be32			magic;		/* magic signature */
-	__be32			seqnr;		/* sequence number */
-	__be32			erase_cnt;	/* erase count */
-	__be16			version;	/* version number */
-	u8			mmtype[NVM_MMTYPE_LEN]; /* media manager name */
-	__be64			fs_ppa;		/* PPA for media manager
-						 * superblock */
-};
-
-extern int nvm_get_sysblock(struct nvm_dev *, struct nvm_sb_info *);
-extern int nvm_update_sysblock(struct nvm_dev *, struct nvm_sb_info *);
-extern int nvm_init_sysblock(struct nvm_dev *, struct nvm_sb_info *);
-
 extern int nvm_dev_factory(struct nvm_dev *, int flags);
 
-#define nvm_for_each_lun_ppa(geo, ppa, chid, lunid)			\
-	for ((chid) = 0, (ppa).ppa = 0; (chid) < (geo)->nr_chnls;	\
-					(chid)++, (ppa).g.ch = (chid))	\
-		for ((lunid) = 0; (lunid) < (geo)->luns_per_chnl;	\
-					(lunid)++, (ppa).g.lun = (lunid))
+extern void nvm_part_to_tgt(struct nvm_dev *, sector_t *, int);
 
 #else /* CONFIG_NVM */
 struct nvm_dev_ops;
-- 
cgit v1.2.3


From 10995c3dc9d7f47b92ff3e74b4bd191ddb7991ff Mon Sep 17 00:00:00 2001
From: Matias Bjørling <matias@cnexlabs.com>
Date: Tue, 31 Jan 2017 13:17:10 +0100
Subject: lightnvm: collapse nvm_erase_ppa and nvm_erase_blk
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After gennvm and core have been merged, there are no more callers to
nvm_erase_ppa. Therefore collapse the device specific and target
specific erase functions.

Signed-off-by: Matias Bjørling <matias@cnexlabs.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c  | 57 ++++++++++++++++++++++--------------------------
 include/linux/lightnvm.h |  1 -
 2 files changed, 26 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index e9a495650dd0..a4e2e3b01ae4 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -683,12 +683,34 @@ int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
 }
 EXPORT_SYMBOL(nvm_submit_io);
 
-int nvm_erase_blk(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p, int flags)
+int nvm_erase_blk(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, int flags)
 {
-	/* Convert address space */
-	nvm_map_to_dev(tgt_dev, p);
+	struct nvm_dev *dev = tgt_dev->parent;
+	struct nvm_rq rqd;
+	int ret;
+
+	if (!dev->ops->erase_block)
+		return 0;
+
+	ret = nvm_map_to_dev(tgt_dev, ppas);
+	if (ret)
+		return ret;
+
+	memset(&rqd, 0, sizeof(struct nvm_rq));
 
-	return nvm_erase_ppa(tgt_dev->parent, p, 1, flags);
+	ret = nvm_set_rqd_ppalist(dev, &rqd, ppas, 1, 1);
+	if (ret)
+		return ret;
+
+	nvm_generic_to_addr_mode(dev, &rqd);
+
+	rqd.flags = flags;
+
+	ret = dev->ops->erase_block(dev, &rqd);
+
+	nvm_free_rqd_ppalist(dev, &rqd);
+
+	return ret;
 }
 EXPORT_SYMBOL(nvm_erase_blk);
 
@@ -847,33 +869,6 @@ void nvm_free_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd)
 }
 EXPORT_SYMBOL(nvm_free_rqd_ppalist);
 
-int nvm_erase_ppa(struct nvm_dev *dev, struct ppa_addr *ppas, int nr_ppas,
-								int flags)
-{
-	struct nvm_rq rqd;
-	int ret;
-
-	if (!dev->ops->erase_block)
-		return 0;
-
-	memset(&rqd, 0, sizeof(struct nvm_rq));
-
-	ret = nvm_set_rqd_ppalist(dev, &rqd, ppas, nr_ppas, 1);
-	if (ret)
-		return ret;
-
-	nvm_generic_to_addr_mode(dev, &rqd);
-
-	rqd.flags = flags;
-
-	ret = dev->ops->erase_block(dev, &rqd);
-
-	nvm_free_rqd_ppalist(dev, &rqd);
-
-	return ret;
-}
-EXPORT_SYMBOL(nvm_erase_ppa);
-
 void nvm_end_io(struct nvm_rq *rqd, int error)
 {
 	struct nvm_tgt_dev *tgt_dev = rqd->dev;
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 84309fe27472..f2007b2c4979 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -483,7 +483,6 @@ extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *);
 extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *,
 					const struct ppa_addr *, int, int);
 extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *);
-extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int, int);
 extern int nvm_erase_blk(struct nvm_tgt_dev *, struct ppa_addr *, int);
 extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *,
 			   void *);
-- 
cgit v1.2.3


From 583b7058b2e8071f59646c8fb027f6c3597417ac Mon Sep 17 00:00:00 2001
From: Matias Bjørling <matias@cnexlabs.com>
Date: Tue, 31 Jan 2017 13:17:11 +0100
Subject: lightnvm: remove nvm_submit_ppa* functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The nvm_submit_ppa* functions are no longer needed after gennvm and core
have been merged.

Signed-off-by: Matias Bjørling <matias@cnexlabs.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c  | 109 -----------------------------------------------
 include/linux/lightnvm.h |   4 --
 2 files changed, 113 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index a4e2e3b01ae4..ffdf048d58d1 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -883,115 +883,6 @@ void nvm_end_io(struct nvm_rq *rqd, int error)
 }
 EXPORT_SYMBOL(nvm_end_io);
 
-static void nvm_end_io_sync(struct nvm_rq *rqd)
-{
-	struct completion *waiting = rqd->wait;
-
-	rqd->wait = NULL;
-
-	complete(waiting);
-}
-
-static int __nvm_submit_ppa(struct nvm_dev *dev, struct nvm_rq *rqd, int opcode,
-						int flags, void *buf, int len)
-{
-	DECLARE_COMPLETION_ONSTACK(wait);
-	struct bio *bio;
-	int ret;
-	unsigned long hang_check;
-
-	bio = bio_map_kern(dev->q, buf, len, GFP_KERNEL);
-	if (IS_ERR_OR_NULL(bio))
-		return -ENOMEM;
-
-	nvm_generic_to_addr_mode(dev, rqd);
-
-	rqd->dev = NULL;
-	rqd->opcode = opcode;
-	rqd->flags = flags;
-	rqd->bio = bio;
-	rqd->wait = &wait;
-	rqd->end_io = nvm_end_io_sync;
-
-	ret = dev->ops->submit_io(dev, rqd);
-	if (ret) {
-		bio_put(bio);
-		return ret;
-	}
-
-	/* Prevent hang_check timer from firing at us during very long I/O */
-	hang_check = sysctl_hung_task_timeout_secs;
-	if (hang_check)
-		while (!wait_for_completion_io_timeout(&wait,
-							hang_check * (HZ/2)))
-			;
-	else
-		wait_for_completion_io(&wait);
-
-	return rqd->error;
-}
-
-/**
- * nvm_submit_ppa_list - submit user-defined ppa list to device. The user must
- *			 take to free ppa list if necessary.
- * @dev:	device
- * @ppa_list:	user created ppa_list
- * @nr_ppas:	length of ppa_list
- * @opcode:	device opcode
- * @flags:	device flags
- * @buf:	data buffer
- * @len:	data buffer length
- */
-int nvm_submit_ppa_list(struct nvm_dev *dev, struct ppa_addr *ppa_list,
-			int nr_ppas, int opcode, int flags, void *buf, int len)
-{
-	struct nvm_rq rqd;
-
-	if (dev->ops->max_phys_sect < nr_ppas)
-		return -EINVAL;
-
-	memset(&rqd, 0, sizeof(struct nvm_rq));
-
-	rqd.nr_ppas = nr_ppas;
-	if (nr_ppas > 1)
-		rqd.ppa_list = ppa_list;
-	else
-		rqd.ppa_addr = ppa_list[0];
-
-	return __nvm_submit_ppa(dev, &rqd, opcode, flags, buf, len);
-}
-EXPORT_SYMBOL(nvm_submit_ppa_list);
-
-/**
- * nvm_submit_ppa - submit PPAs to device. PPAs will automatically be unfolded
- *		    as single, dual, quad plane PPAs depending on device type.
- * @dev:	device
- * @ppa:	user created ppa_list
- * @nr_ppas:	length of ppa_list
- * @opcode:	device opcode
- * @flags:	device flags
- * @buf:	data buffer
- * @len:	data buffer length
- */
-int nvm_submit_ppa(struct nvm_dev *dev, struct ppa_addr *ppa, int nr_ppas,
-				int opcode, int flags, void *buf, int len)
-{
-	struct nvm_rq rqd;
-	int ret;
-
-	memset(&rqd, 0, sizeof(struct nvm_rq));
-	ret = nvm_set_rqd_ppalist(dev, &rqd, ppa, nr_ppas, 1);
-	if (ret)
-		return ret;
-
-	ret = __nvm_submit_ppa(dev, &rqd, opcode, flags, buf, len);
-
-	nvm_free_rqd_ppalist(dev, &rqd);
-
-	return ret;
-}
-EXPORT_SYMBOL(nvm_submit_ppa);
-
 /*
  * folds a bad block list from its plane representation to its virtual
  * block representation. The fold is done in place and reduced size is
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index f2007b2c4979..abb3d55c107f 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -489,10 +489,6 @@ extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *,
 extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t);
 extern void nvm_put_area(struct nvm_tgt_dev *, sector_t);
 extern void nvm_end_io(struct nvm_rq *, int);
-extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int,
-								void *, int);
-extern int nvm_submit_ppa_list(struct nvm_dev *, struct ppa_addr *, int, int,
-							int, void *, int);
 extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int);
 extern int nvm_get_bb_tbl(struct nvm_dev *, struct ppa_addr, u8 *);
 extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *);
-- 
cgit v1.2.3


From 8f4fe008fb256649bd0e16c96a6eafa3bd916ac3 Mon Sep 17 00:00:00 2001
From: Matias Bjørling <matias@cnexlabs.com>
Date: Tue, 31 Jan 2017 13:17:12 +0100
Subject: lightnvm: remove nvm_get_bb_tbl and nvm_set_bb_tbl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since the merge of gennvm and core, there is no longer a need for the
device specific bad block functions.

Signed-off-by: Matias Bjørling <matias@cnexlabs.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c  | 40 ++++------------------------------------
 include/linux/lightnvm.h |  2 --
 2 files changed, 4 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index ffdf048d58d1..18d48732fe74 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -605,33 +605,6 @@ static void nvm_tgt_generic_to_addr_mode(struct nvm_tgt_dev *tgt_dev,
 	}
 }
 
-int nvm_set_bb_tbl(struct nvm_dev *dev, struct ppa_addr *ppas, int nr_ppas,
-								int type)
-{
-	struct nvm_rq rqd;
-	int ret;
-
-	if (nr_ppas > dev->ops->max_phys_sect) {
-		pr_err("nvm: unable to update all sysblocks atomically\n");
-		return -EINVAL;
-	}
-
-	memset(&rqd, 0, sizeof(struct nvm_rq));
-
-	nvm_set_rqd_ppalist(dev, &rqd, ppas, nr_ppas, 1);
-	nvm_generic_to_addr_mode(dev, &rqd);
-
-	ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type);
-	nvm_free_rqd_ppalist(dev, &rqd);
-	if (ret) {
-		pr_err("nvm: sysblk failed bb mark\n");
-		return -EINVAL;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL(nvm_set_bb_tbl);
-
 int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
 		       int nr_ppas, int type)
 {
@@ -919,20 +892,15 @@ int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks)
 }
 EXPORT_SYMBOL(nvm_bb_tbl_fold);
 
-int nvm_get_bb_tbl(struct nvm_dev *dev, struct ppa_addr ppa, u8 *blks)
-{
-	ppa = generic_to_dev_addr(dev, ppa);
-
-	return dev->ops->get_bb_tbl(dev, ppa, blks);
-}
-EXPORT_SYMBOL(nvm_get_bb_tbl);
-
 int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr ppa,
 		       u8 *blks)
 {
+	struct nvm_dev *dev = tgt_dev->parent;
+
 	ppa = nvm_trans_ppa(tgt_dev, ppa, TRANS_TGT_TO_DEV);
+	ppa = generic_to_dev_addr(dev, ppa);
 
-	return nvm_get_bb_tbl(tgt_dev->parent, ppa, blks);
+	return dev->ops->get_bb_tbl(dev, ppa, blks);
 }
 EXPORT_SYMBOL(nvm_get_tgt_bb_tbl);
 
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index abb3d55c107f..cad1e1cb0635 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -473,7 +473,6 @@ extern struct nvm_dev *nvm_alloc_dev(int);
 extern int nvm_register(struct nvm_dev *);
 extern void nvm_unregister(struct nvm_dev *);
 
-extern int nvm_set_bb_tbl(struct nvm_dev *, struct ppa_addr *, int, int);
 extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *,
 			      int, int);
 extern int nvm_max_phys_sects(struct nvm_tgt_dev *);
@@ -490,7 +489,6 @@ extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t);
 extern void nvm_put_area(struct nvm_tgt_dev *, sector_t);
 extern void nvm_end_io(struct nvm_rq *, int);
 extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int);
-extern int nvm_get_bb_tbl(struct nvm_dev *, struct ppa_addr, u8 *);
 extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *);
 
 extern int nvm_dev_factory(struct nvm_dev *, int flags);
-- 
cgit v1.2.3


From dab8ee9e8a30620a5b5f22d6c0b3749217093803 Mon Sep 17 00:00:00 2001
From: Matias Bjørling <matias@cnexlabs.com>
Date: Tue, 31 Jan 2017 13:17:14 +0100
Subject: lightnvm: cleanup nvm transformation functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Going from target specific ppa addresses to device was accomplished by
first converting target to generic ppa addresses and generic to device
addresses. The conversion was either open-coded or used the built-in
nvm_trans_* and nvm_map_* functions for conversion. Simplify the
interface and cleanup the calls to provide clean functions that now
either take a list of ppas or a nvm_rq, and is exposed through:

 void nvm_ppa_* - target to/from device with a list of PPAs,
 void nvm_rq_* - target to/from device with a nvm_rq.

Signed-off-by: Matias Bjørling <matias@cnexlabs.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c  | 115 ++++++++++++++---------------------------------
 include/linux/lightnvm.h |  14 +++---
 2 files changed, 40 insertions(+), 89 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index a1a7a5a303bd..0842c85d3b84 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -50,11 +50,6 @@ struct nvm_area {
 	sector_t end;	/* end is excluded */
 };
 
-enum {
-	TRANS_TGT_TO_DEV =	0x0,
-	TRANS_DEV_TO_TGT =	0x1,
-};
-
 static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name)
 {
 	struct nvm_target *tgt;
@@ -428,38 +423,46 @@ static void nvm_map_to_tgt(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p)
 	p->g.lun -= lun_roff;
 }
 
-static void nvm_trans_rq(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd,
-			int flag)
+static void nvm_ppa_tgt_to_dev(struct nvm_tgt_dev *tgt_dev,
+				struct ppa_addr *ppa_list, int nr_ppas)
 {
 	int i;
 
-	if (rqd->nr_ppas == 1) {
-		if (flag == TRANS_TGT_TO_DEV)
-			nvm_map_to_dev(tgt_dev, &rqd->ppa_addr);
-		else
-			nvm_map_to_tgt(tgt_dev, &rqd->ppa_addr);
-		return;
+	for (i = 0; i < nr_ppas; i++) {
+		nvm_map_to_dev(tgt_dev, &ppa_list[i]);
+		ppa_list[i] = generic_to_dev_addr(tgt_dev, ppa_list[i]);
 	}
+}
 
-	for (i = 0; i < rqd->nr_ppas; i++) {
-		if (flag == TRANS_TGT_TO_DEV)
-			nvm_map_to_dev(tgt_dev, &rqd->ppa_list[i]);
-		else
-			nvm_map_to_tgt(tgt_dev, &rqd->ppa_list[i]);
+static void nvm_ppa_dev_to_tgt(struct nvm_tgt_dev *tgt_dev,
+				struct ppa_addr *ppa_list, int nr_ppas)
+{
+	int i;
+
+	for (i = 0; i < nr_ppas; i++) {
+		ppa_list[i] = dev_to_generic_addr(tgt_dev, ppa_list[i]);
+		nvm_map_to_tgt(tgt_dev, &ppa_list[i]);
 	}
 }
 
-static struct ppa_addr nvm_trans_ppa(struct nvm_tgt_dev *tgt_dev,
-				     struct ppa_addr p, int dir)
+static void nvm_rq_tgt_to_dev(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
 {
-	struct ppa_addr ppa = p;
+	if (rqd->nr_ppas == 1) {
+		nvm_ppa_tgt_to_dev(tgt_dev, &rqd->ppa_addr, 1);
+		return;
+	}
 
-	if (dir == TRANS_TGT_TO_DEV)
-		nvm_map_to_dev(tgt_dev, &ppa);
-	else
-		nvm_map_to_tgt(tgt_dev, &ppa);
+	nvm_ppa_tgt_to_dev(tgt_dev, rqd->ppa_list, rqd->nr_ppas);
+}
+
+static void nvm_rq_dev_to_tgt(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
+{
+	if (rqd->nr_ppas == 1) {
+		nvm_ppa_dev_to_tgt(tgt_dev, &rqd->ppa_addr, 1);
+		return;
+	}
 
-	return ppa;
+	nvm_ppa_dev_to_tgt(tgt_dev, rqd->ppa_list, rqd->nr_ppas);
 }
 
 void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries,
@@ -564,26 +567,6 @@ static struct nvm_dev *nvm_find_nvm_dev(const char *name)
 	return NULL;
 }
 
-static void nvm_tgt_generic_to_addr_mode(struct nvm_tgt_dev *tgt_dev,
-					 struct nvm_rq *rqd)
-{
-	struct nvm_dev *dev = tgt_dev->parent;
-	int i;
-
-	if (rqd->nr_ppas > 1) {
-		for (i = 0; i < rqd->nr_ppas; i++) {
-			rqd->ppa_list[i] = nvm_trans_ppa(tgt_dev,
-					rqd->ppa_list[i], TRANS_TGT_TO_DEV);
-			rqd->ppa_list[i] = generic_to_dev_addr(dev,
-							rqd->ppa_list[i]);
-		}
-	} else {
-		rqd->ppa_addr = nvm_trans_ppa(tgt_dev, rqd->ppa_addr,
-						TRANS_TGT_TO_DEV);
-		rqd->ppa_addr = generic_to_dev_addr(dev, rqd->ppa_addr);
-	}
-}
-
 int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
 		       int nr_ppas, int type)
 {
@@ -599,7 +582,7 @@ int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
 	memset(&rqd, 0, sizeof(struct nvm_rq));
 
 	nvm_set_rqd_ppalist(dev, &rqd, ppas, nr_ppas, 1);
-	nvm_tgt_generic_to_addr_mode(tgt_dev, &rqd);
+	nvm_rq_tgt_to_dev(tgt_dev, &rqd);
 
 	ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type);
 	nvm_free_rqd_ppalist(dev, &rqd);
@@ -627,8 +610,7 @@ int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
 	if (!dev->ops->submit_io)
 		return -ENODEV;
 
-	/* Convert address space */
-	nvm_generic_to_addr_mode(dev, rqd);
+	nvm_rq_tgt_to_dev(tgt_dev, rqd);
 
 	rqd->dev = tgt_dev;
 	return dev->ops->submit_io(dev, rqd);
@@ -652,7 +634,7 @@ int nvm_erase_blk(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, int flags)
 	if (ret)
 		return ret;
 
-	nvm_generic_to_addr_mode(dev, &rqd);
+	nvm_rq_tgt_to_dev(tgt_dev, &rqd);
 
 	rqd.flags = flags;
 
@@ -741,34 +723,6 @@ void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin)
 }
 EXPORT_SYMBOL(nvm_put_area);
 
-void nvm_addr_to_generic_mode(struct nvm_dev *dev, struct nvm_rq *rqd)
-{
-	int i;
-
-	if (rqd->nr_ppas > 1) {
-		for (i = 0; i < rqd->nr_ppas; i++)
-			rqd->ppa_list[i] = dev_to_generic_addr(dev,
-							rqd->ppa_list[i]);
-	} else {
-		rqd->ppa_addr = dev_to_generic_addr(dev, rqd->ppa_addr);
-	}
-}
-EXPORT_SYMBOL(nvm_addr_to_generic_mode);
-
-void nvm_generic_to_addr_mode(struct nvm_dev *dev, struct nvm_rq *rqd)
-{
-	int i;
-
-	if (rqd->nr_ppas > 1) {
-		for (i = 0; i < rqd->nr_ppas; i++)
-			rqd->ppa_list[i] = generic_to_dev_addr(dev,
-							rqd->ppa_list[i]);
-	} else {
-		rqd->ppa_addr = generic_to_dev_addr(dev, rqd->ppa_addr);
-	}
-}
-EXPORT_SYMBOL(nvm_generic_to_addr_mode);
-
 int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd,
 			const struct ppa_addr *ppas, int nr_ppas, int vblk)
 {
@@ -826,7 +780,7 @@ void nvm_end_io(struct nvm_rq *rqd, int error)
 
 	/* Convert address space */
 	if (tgt_dev)
-		nvm_trans_rq(tgt_dev, rqd, TRANS_DEV_TO_TGT);
+		nvm_rq_dev_to_tgt(tgt_dev, rqd);
 
 	rqd->error = error;
 	ins->tt->end_io(rqd);
@@ -874,8 +828,7 @@ int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr ppa,
 {
 	struct nvm_dev *dev = tgt_dev->parent;
 
-	ppa = nvm_trans_ppa(tgt_dev, ppa, TRANS_TGT_TO_DEV);
-	ppa = generic_to_dev_addr(dev, ppa);
+	nvm_ppa_tgt_to_dev(tgt_dev, &ppa, 1);
 
 	return dev->ops->get_bb_tbl(dev, ppa, blks);
 }
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index cad1e1cb0635..faa0fbfe339a 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -378,10 +378,10 @@ static inline struct ppa_addr linear_to_generic_addr(struct nvm_geo *geo,
 	return l;
 }
 
-static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev,
-						struct ppa_addr r)
+static inline struct ppa_addr generic_to_dev_addr(struct nvm_tgt_dev *tgt_dev,
+						  struct ppa_addr r)
 {
-	struct nvm_geo *geo = &dev->geo;
+	struct nvm_geo *geo = &tgt_dev->geo;
 	struct ppa_addr l;
 
 	l.ppa = ((u64)r.g.blk) << geo->ppaf.blk_offset;
@@ -394,10 +394,10 @@ static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev,
 	return l;
 }
 
-static inline struct ppa_addr dev_to_generic_addr(struct nvm_dev *dev,
-						struct ppa_addr r)
+static inline struct ppa_addr dev_to_generic_addr(struct nvm_tgt_dev *tgt_dev,
+						  struct ppa_addr r)
 {
-	struct nvm_geo *geo = &dev->geo;
+	struct nvm_geo *geo = &tgt_dev->geo;
 	struct ppa_addr l;
 
 	l.ppa = 0;
@@ -477,8 +477,6 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *,
 			      int, int);
 extern int nvm_max_phys_sects(struct nvm_tgt_dev *);
 extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *);
-extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *);
-extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *);
 extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *,
 					const struct ppa_addr *, int, int);
 extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *);
-- 
cgit v1.2.3


From 19bd6fe73ca812964963aa30827cff9aae64a715 Mon Sep 17 00:00:00 2001
From: Matias Bjørling <matias@cnexlabs.com>
Date: Tue, 31 Jan 2017 13:17:15 +0100
Subject: lightnvm: reduce number of nvm_id groups to one
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The number of configuration groups has been limited to one in current
code, even if there is support for up to four. With the introduction
of the open-channel SSD 1.3 specification, only a single
group is exposed onwards. Reflect this in the nvm_id structure.

Signed-off-by: Matias Bjørling <matias@cnexlabs.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/null_blk.c     |  3 +-
 drivers/lightnvm/core.c      | 12 ++----
 drivers/nvme/host/lightnvm.c | 87 +++++++++++++++++++++-----------------------
 include/linux/lightnvm.h     |  3 +-
 4 files changed, 47 insertions(+), 58 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index c0e14e54909b..e666095278ab 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -460,7 +460,6 @@ static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id)
 
 	id->ver_id = 0x1;
 	id->vmnt = 0;
-	id->cgrps = 1;
 	id->cap = 0x2;
 	id->dom = 0x1;
 
@@ -479,7 +478,7 @@ static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id)
 
 	sector_div(size, bs); /* convert size to pages */
 	size >>= 8; /* concert size to pgs pr blk */
-	grp = &id->groups[0];
+	grp = &id->grp;
 	grp->mtype = 0;
 	grp->fmtype = 0;
 	grp->num_ch = 1;
diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 0842c85d3b84..80cd7677762d 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -884,7 +884,7 @@ static int nvm_init_mlc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp)
 static int nvm_core_init(struct nvm_dev *dev)
 {
 	struct nvm_id *id = &dev->identity;
-	struct nvm_id_group *grp = &id->groups[0];
+	struct nvm_id_group *grp = &id->grp;
 	struct nvm_geo *geo = &dev->geo;
 	int ret;
 
@@ -988,20 +988,14 @@ static int nvm_init(struct nvm_dev *dev)
 		goto err;
 	}
 
-	pr_debug("nvm: ver:%x nvm_vendor:%x groups:%u\n",
-			dev->identity.ver_id, dev->identity.vmnt,
-							dev->identity.cgrps);
+	pr_debug("nvm: ver:%x nvm_vendor:%x\n",
+			dev->identity.ver_id, dev->identity.vmnt);
 
 	if (dev->identity.ver_id != 1) {
 		pr_err("nvm: device not supported by kernel.");
 		goto err;
 	}
 
-	if (dev->identity.cgrps != 1) {
-		pr_err("nvm: only one group configuration supported.");
-		goto err;
-	}
-
 	ret = nvm_core_init(dev);
 	if (ret) {
 		pr_err("nvm: could not initialize core structures.\n");
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index ad54ec9c986f..733992a25d6a 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -248,50 +248,48 @@ static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id)
 {
 	struct nvme_nvm_id_group *src;
 	struct nvm_id_group *dst;
-	int i, end;
-
-	end = min_t(u32, 4, nvm_id->cgrps);
-
-	for (i = 0; i < end; i++) {
-		src = &nvme_nvm_id->groups[i];
-		dst = &nvm_id->groups[i];
-
-		dst->mtype = src->mtype;
-		dst->fmtype = src->fmtype;
-		dst->num_ch = src->num_ch;
-		dst->num_lun = src->num_lun;
-		dst->num_pln = src->num_pln;
-
-		dst->num_pg = le16_to_cpu(src->num_pg);
-		dst->num_blk = le16_to_cpu(src->num_blk);
-		dst->fpg_sz = le16_to_cpu(src->fpg_sz);
-		dst->csecs = le16_to_cpu(src->csecs);
-		dst->sos = le16_to_cpu(src->sos);
-
-		dst->trdt = le32_to_cpu(src->trdt);
-		dst->trdm = le32_to_cpu(src->trdm);
-		dst->tprt = le32_to_cpu(src->tprt);
-		dst->tprm = le32_to_cpu(src->tprm);
-		dst->tbet = le32_to_cpu(src->tbet);
-		dst->tbem = le32_to_cpu(src->tbem);
-		dst->mpos = le32_to_cpu(src->mpos);
-		dst->mccap = le32_to_cpu(src->mccap);
-
-		dst->cpar = le16_to_cpu(src->cpar);
-
-		if (dst->fmtype == NVM_ID_FMTYPE_MLC) {
-			memcpy(dst->lptbl.id, src->lptbl.id, 8);
-			dst->lptbl.mlc.num_pairs =
-					le16_to_cpu(src->lptbl.mlc.num_pairs);
-
-			if (dst->lptbl.mlc.num_pairs > NVME_NVM_LP_MLC_PAIRS) {
-				pr_err("nvm: number of MLC pairs not supported\n");
-				return -EINVAL;
-			}
-
-			memcpy(dst->lptbl.mlc.pairs, src->lptbl.mlc.pairs,
-						dst->lptbl.mlc.num_pairs);
+
+	if (nvme_nvm_id->cgrps != 1)
+		return -EINVAL;
+
+	src = &nvme_nvm_id->groups[0];
+	dst = &nvm_id->grp;
+
+	dst->mtype = src->mtype;
+	dst->fmtype = src->fmtype;
+	dst->num_ch = src->num_ch;
+	dst->num_lun = src->num_lun;
+	dst->num_pln = src->num_pln;
+
+	dst->num_pg = le16_to_cpu(src->num_pg);
+	dst->num_blk = le16_to_cpu(src->num_blk);
+	dst->fpg_sz = le16_to_cpu(src->fpg_sz);
+	dst->csecs = le16_to_cpu(src->csecs);
+	dst->sos = le16_to_cpu(src->sos);
+
+	dst->trdt = le32_to_cpu(src->trdt);
+	dst->trdm = le32_to_cpu(src->trdm);
+	dst->tprt = le32_to_cpu(src->tprt);
+	dst->tprm = le32_to_cpu(src->tprm);
+	dst->tbet = le32_to_cpu(src->tbet);
+	dst->tbem = le32_to_cpu(src->tbem);
+	dst->mpos = le32_to_cpu(src->mpos);
+	dst->mccap = le32_to_cpu(src->mccap);
+
+	dst->cpar = le16_to_cpu(src->cpar);
+
+	if (dst->fmtype == NVM_ID_FMTYPE_MLC) {
+		memcpy(dst->lptbl.id, src->lptbl.id, 8);
+		dst->lptbl.mlc.num_pairs =
+				le16_to_cpu(src->lptbl.mlc.num_pairs);
+
+		if (dst->lptbl.mlc.num_pairs > NVME_NVM_LP_MLC_PAIRS) {
+			pr_err("nvm: number of MLC pairs not supported\n");
+			return -EINVAL;
 		}
+
+		memcpy(dst->lptbl.mlc.pairs, src->lptbl.mlc.pairs,
+					dst->lptbl.mlc.num_pairs);
 	}
 
 	return 0;
@@ -321,7 +319,6 @@ static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id)
 
 	nvm_id->ver_id = nvme_nvm_id->ver_id;
 	nvm_id->vmnt = nvme_nvm_id->vmnt;
-	nvm_id->cgrps = nvme_nvm_id->cgrps;
 	nvm_id->cap = le32_to_cpu(nvme_nvm_id->cap);
 	nvm_id->dom = le32_to_cpu(nvme_nvm_id->dom);
 	memcpy(&nvm_id->ppaf, &nvme_nvm_id->ppaf,
@@ -622,7 +619,7 @@ static ssize_t nvm_dev_attr_show(struct device *dev,
 		return 0;
 
 	id = &ndev->identity;
-	grp = &id->groups[0];
+	grp = &id->grp;
 	attr = &dattr->attr;
 
 	if (strcmp(attr->name, "version") == 0) {
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index faa0fbfe339a..ce0b2dac84ac 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -200,11 +200,10 @@ struct nvm_addr_format {
 struct nvm_id {
 	u8	ver_id;
 	u8	vmnt;
-	u8	cgrps;
 	u32	cap;
 	u32	dom;
 	struct nvm_addr_format ppaf;
-	struct nvm_id_group groups[4];
+	struct nvm_id_group grp;
 } __packed;
 
 struct nvm_target {
-- 
cgit v1.2.3


From 06894efea706b3cd4ce31e341ec51b4c62c34a86 Mon Sep 17 00:00:00 2001
From: Matias Bjørling <matias@cnexlabs.com>
Date: Tue, 31 Jan 2017 13:17:17 +0100
Subject: lightnvm: use end_io callback instead of instance
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the lightnvm core had the "gennvm" layer between the device and the
target, there was a need for the core to be able to figure out which
target it should send an end_io callback to. Leading to a "double"
end_io, first for the media manager instance, and then for the target
instance. Now that core and gennvm is merged, there is no longer a need
for this, and a single end_io callback will do.

Signed-off-by: Matias Bjørling <matias@cnexlabs.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/null_blk.c     |  3 ++-
 drivers/lightnvm/core.c      |  7 +++----
 drivers/lightnvm/rrpc.c      |  7 +++----
 drivers/lightnvm/rrpc.h      |  3 ---
 drivers/nvme/host/lightnvm.c |  3 ++-
 include/linux/lightnvm.h     | 10 +++-------
 6 files changed, 13 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index e666095278ab..a67b7ea1e3bf 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -420,7 +420,8 @@ static void null_lnvm_end_io(struct request *rq, int error)
 {
 	struct nvm_rq *rqd = rq->end_io_data;
 
-	nvm_end_io(rqd, error);
+	rqd->error = error;
+	nvm_end_io(rqd);
 
 	blk_put_request(rq);
 }
diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 80cd7677762d..4f4db991c4a6 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -773,17 +773,16 @@ void nvm_free_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd)
 }
 EXPORT_SYMBOL(nvm_free_rqd_ppalist);
 
-void nvm_end_io(struct nvm_rq *rqd, int error)
+void nvm_end_io(struct nvm_rq *rqd)
 {
 	struct nvm_tgt_dev *tgt_dev = rqd->dev;
-	struct nvm_tgt_instance *ins = rqd->ins;
 
 	/* Convert address space */
 	if (tgt_dev)
 		nvm_rq_dev_to_tgt(tgt_dev, rqd);
 
-	rqd->error = error;
-	ins->tt->end_io(rqd);
+	if (rqd->end_io)
+		rqd->end_io(rqd);
 }
 EXPORT_SYMBOL(nvm_end_io);
 
diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index 9fb7de395915..e00b1d7b976f 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -779,7 +779,7 @@ static void rrpc_end_io_write(struct rrpc *rrpc, struct rrpc_rq *rrqd,
 
 static void rrpc_end_io(struct nvm_rq *rqd)
 {
-	struct rrpc *rrpc = container_of(rqd->ins, struct rrpc, instance);
+	struct rrpc *rrpc = rqd->private;
 	struct nvm_tgt_dev *dev = rrpc->dev;
 	struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd);
 	uint8_t npages = rqd->nr_ppas;
@@ -972,8 +972,9 @@ static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio,
 
 	bio_get(bio);
 	rqd->bio = bio;
-	rqd->ins = &rrpc->instance;
+	rqd->private = rrpc;
 	rqd->nr_ppas = nr_pages;
+	rqd->end_io = rrpc_end_io;
 	rrq->flags = flags;
 
 	err = nvm_submit_io(dev, rqd);
@@ -1532,7 +1533,6 @@ static void *rrpc_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk)
 	if (!rrpc)
 		return ERR_PTR(-ENOMEM);
 
-	rrpc->instance.tt = &tt_rrpc;
 	rrpc->dev = dev;
 	rrpc->disk = tdisk;
 
@@ -1611,7 +1611,6 @@ static struct nvm_tgt_type tt_rrpc = {
 
 	.make_rq	= rrpc_make_rq,
 	.capacity	= rrpc_capacity,
-	.end_io		= rrpc_end_io,
 
 	.init		= rrpc_init,
 	.exit		= rrpc_exit,
diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h
index 94e4d73116b2..fdb6ff902903 100644
--- a/drivers/lightnvm/rrpc.h
+++ b/drivers/lightnvm/rrpc.h
@@ -102,9 +102,6 @@ struct rrpc_lun {
 };
 
 struct rrpc {
-	/* instance must be kept in top to resolve rrpc in unprep */
-	struct nvm_tgt_instance instance;
-
 	struct nvm_tgt_dev *dev;
 	struct gendisk *disk;
 
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 3b6cd9bdba7e..21cac8523bd8 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -484,7 +484,8 @@ static void nvme_nvm_end_io(struct request *rq, int error)
 	struct nvm_rq *rqd = rq->end_io_data;
 
 	rqd->ppa_status = nvme_req(rq)->result.u64;
-	nvm_end_io(rqd, error);
+	rqd->error = error;
+	nvm_end_io(rqd);
 
 	kfree(nvme_req(rq)->cmd);
 	blk_mq_free_request(rq);
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index ce0b2dac84ac..17cd454f0d87 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -213,10 +213,6 @@ struct nvm_target {
 	struct gendisk *disk;
 };
 
-struct nvm_tgt_instance {
-	struct nvm_tgt_type *tt;
-};
-
 #define ADDR_EMPTY (~0ULL)
 
 #define NVM_VERSION_MAJOR 1
@@ -227,7 +223,6 @@ struct nvm_rq;
 typedef void (nvm_end_io_fn)(struct nvm_rq *);
 
 struct nvm_rq {
-	struct nvm_tgt_instance *ins;
 	struct nvm_tgt_dev *dev;
 
 	struct bio *bio;
@@ -251,6 +246,8 @@ struct nvm_rq {
 
 	u64 ppa_status; /* ppa media status */
 	int error;
+
+	void *private;
 };
 
 static inline struct nvm_rq *nvm_rq_from_pdu(void *pdu)
@@ -450,7 +447,6 @@ struct nvm_tgt_type {
 	/* target entry points */
 	nvm_tgt_make_rq_fn *make_rq;
 	nvm_tgt_capacity_fn *capacity;
-	nvm_end_io_fn *end_io;
 
 	/* module-specific init/teardown */
 	nvm_tgt_init_fn *init;
@@ -484,7 +480,7 @@ extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *,
 			   void *);
 extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t);
 extern void nvm_put_area(struct nvm_tgt_dev *, sector_t);
-extern void nvm_end_io(struct nvm_rq *, int);
+extern void nvm_end_io(struct nvm_rq *);
 extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int);
 extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *);
 
-- 
cgit v1.2.3


From 38ea2f7656f815e7330868cbec7bada0fd7933a8 Mon Sep 17 00:00:00 2001
From: Javier González <jg@lightnvm.io>
Date: Tue, 31 Jan 2017 13:17:18 +0100
Subject: lightnvm: Add CRC read error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Let the host differentiate between a read error and a CRC check error on
the device side.

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <matias@cnexlabs.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/lightnvm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 17cd454f0d87..bc282d26017a 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -107,6 +107,7 @@ enum {
 	NVM_RSP_ERR_FAILWRITE	= 0x40ff,
 	NVM_RSP_ERR_EMPTYPAGE	= 0x42ff,
 	NVM_RSP_ERR_FAILECC	= 0x4281,
+	NVM_RSP_ERR_FAILCRC	= 0x4004,
 	NVM_RSP_WARN_HIGHECC	= 0x4700,
 
 	/* Device opcodes */
-- 
cgit v1.2.3


From 9a69b0ed6257ae5e71c99bf21ce53f98c558476a Mon Sep 17 00:00:00 2001
From: Javier González <jg@lightnvm.io>
Date: Tue, 31 Jan 2017 13:17:20 +0100
Subject: lightnvm: allow targets to use sysfs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In order to register through the sysfs interface, a driver needs to know
its kobject. On a disk structure, this happens when the partition
information is added (device_add_disk), which for lightnvm takes place
after the target has been initialized. This means that on target
initialization, the kboject has not been created yet.

This patch adds a target function to let targets initialize their own
kboject as a child of the disk kobject.

Signed-off-by: Javier González <javier@cnexlabs.com>
Added exit typedef and passed gendisk instead of void pointer for exit.
Signed-off-by: Matias Bjørling <matias@cnexlabs.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c  | 9 +++++++++
 include/linux/lightnvm.h | 6 ++++++
 2 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index b2cd3d6f2a31..9bfe0352d093 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -289,6 +289,9 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
 	set_capacity(tdisk, tt->capacity(targetdata));
 	add_disk(tdisk);
 
+	if (tt->sysfs_init && tt->sysfs_init(tdisk))
+		goto err_sysfs;
+
 	t->type = tt;
 	t->disk = tdisk;
 	t->dev = tgt_dev;
@@ -298,6 +301,9 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
 	mutex_unlock(&dev->mlock);
 
 	return 0;
+err_sysfs:
+	if (tt->exit)
+		tt->exit(targetdata);
 err_init:
 	put_disk(tdisk);
 err_queue:
@@ -320,6 +326,9 @@ static void __nvm_remove_target(struct nvm_target *t)
 	del_gendisk(tdisk);
 	blk_cleanup_queue(q);
 
+	if (tt->sysfs_exit)
+		tt->sysfs_exit(tdisk);
+
 	if (tt->exit)
 		tt->exit(tdisk->private_data);
 
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index bc282d26017a..ca45e4a088a9 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -440,6 +440,8 @@ typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *);
 typedef sector_t (nvm_tgt_capacity_fn)(void *);
 typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *);
 typedef void (nvm_tgt_exit_fn)(void *);
+typedef int (nvm_tgt_sysfs_init_fn)(struct gendisk *);
+typedef void (nvm_tgt_sysfs_exit_fn)(struct gendisk *);
 
 struct nvm_tgt_type {
 	const char *name;
@@ -453,6 +455,10 @@ struct nvm_tgt_type {
 	nvm_tgt_init_fn *init;
 	nvm_tgt_exit_fn *exit;
 
+	/* sysfs */
+	nvm_tgt_sysfs_init_fn *sysfs_init;
+	nvm_tgt_sysfs_exit_fn *sysfs_exit;
+
 	/* For internal use */
 	struct list_head list;
 };
-- 
cgit v1.2.3


From 2b477c00f3bd87c3286f5940cb4174d8b01ee0d5 Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@suse.com>
Date: Thu, 22 Dec 2016 12:38:06 -0500
Subject: svcrpc: free contexts immediately on PROC_DESTROY

We currently handle a client PROC_DESTROY request by turning it
CACHE_NEGATIVE, setting the expired time to now, and then waiting for
cache_clean to clean it up later.  Since we forgot to set the cache's
nextcheck value, that could take up to 30 minutes.  Also, though there's
probably no real bug in this case, setting CACHE_NEGATIVE directly like
this probably isn't a great idea in general.

So let's just remove the entry from the cache directly, and move this
bit of cache manipulation to a helper function.

Signed-off-by: Neil Brown <neilb@suse.com>
Reported-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/cache.h      |  1 +
 net/sunrpc/auth_gss/svcauth_gss.c |  4 ++--
 net/sunrpc/cache.c                | 12 ++++++++++++
 3 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index 62a60eeacb0a..9dcf2c8fe1c5 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -227,6 +227,7 @@ extern void sunrpc_destroy_cache_detail(struct cache_detail *cd);
 extern int sunrpc_cache_register_pipefs(struct dentry *parent, const char *,
 					umode_t, struct cache_detail *);
 extern void sunrpc_cache_unregister_pipefs(struct cache_detail *);
+extern void sunrpc_cache_unhash(struct cache_detail *, struct cache_head *);
 
 /* Must store cache_detail in seq_file->private if using next three functions */
 extern void *cache_seq_start(struct seq_file *file, loff_t *pos);
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 153082598522..a54a7a3d28f5 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1489,8 +1489,8 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
 	case RPC_GSS_PROC_DESTROY:
 		if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
 			goto auth_err;
-		rsci->h.expiry_time = seconds_since_boot();
-		set_bit(CACHE_NEGATIVE, &rsci->h.flags);
+		/* Delete the entry from the cache_list and call cache_put */
+		sunrpc_cache_unhash(sn->rsc_cache, &rsci->h);
 		if (resv->iov_len + 4 > PAGE_SIZE)
 			goto drop;
 		svc_putnl(resv, RPC_SUCCESS);
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 8147e8d56eb2..502b9fe9817b 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -1855,3 +1855,15 @@ void sunrpc_cache_unregister_pipefs(struct cache_detail *cd)
 }
 EXPORT_SYMBOL_GPL(sunrpc_cache_unregister_pipefs);
 
+void sunrpc_cache_unhash(struct cache_detail *cd, struct cache_head *h)
+{
+	write_lock(&cd->hash_lock);
+	if (!hlist_unhashed(&h->cache_list)){
+		hlist_del_init(&h->cache_list);
+		cd->entries--;
+		write_unlock(&cd->hash_lock);
+		cache_put(h, cd);
+	} else
+		write_unlock(&cd->hash_lock);
+}
+EXPORT_SYMBOL_GPL(sunrpc_cache_unhash);
-- 
cgit v1.2.3


From aef01aad89e457e34a60ff6e8fd69ff6740cf201 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Fri, 11 Nov 2016 12:43:12 -0800
Subject: Input: matrix-keypad - switch to using generic device properties

Instead of being OF-specific, let's switch to using generic device
properties, which will make this code usable on ACPI, device tree and
legacy boards that use property sets.

As part of the change let's rename matrix_keypad_parse_of_params() to
matrix_keypad_parse_properties().

Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/keyboard/bcm-keypad.c      |   2 +-
 drivers/input/keyboard/cros_ec_keyb.c    |   3 +-
 drivers/input/keyboard/lpc32xx-keys.c    |   2 +-
 drivers/input/keyboard/omap4-keypad.c    |   4 +-
 drivers/input/keyboard/pmic8xxx-keypad.c |   2 +-
 drivers/input/keyboard/pxa27x_keypad.c   |   2 +-
 drivers/input/keyboard/st-keyscan.c      |   4 +-
 drivers/input/keyboard/stmpe-keypad.c    |   2 +-
 drivers/input/keyboard/tca8418_keypad.c  |   2 +-
 drivers/input/keyboard/twl4030_keypad.c  |   4 +-
 drivers/input/matrix-keymap.c            | 109 ++++++++++++++++---------------
 include/linux/input/matrix_keypad.h      |  21 +-----
 12 files changed, 73 insertions(+), 84 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/keyboard/bcm-keypad.c b/drivers/input/keyboard/bcm-keypad.c
index 2b4e63d81e6d..e1cf63ee148f 100644
--- a/drivers/input/keyboard/bcm-keypad.c
+++ b/drivers/input/keyboard/bcm-keypad.c
@@ -213,7 +213,7 @@ static int bcm_kp_matrix_key_parse_dt(struct bcm_kp *kp)
 	/* Initialize the KPCR Keypad Configuration Register */
 	kp->kpcr = KPCR_STATUSFILTERENABLE | KPCR_COLFILTERENABLE;
 
-	error = matrix_keypad_parse_of_params(dev, &kp->n_rows, &kp->n_cols);
+	error = matrix_keypad_parse_properties(dev, &kp->n_rows, &kp->n_cols);
 	if (error) {
 		dev_err(dev, "failed to parse kp params\n");
 		return error;
diff --git a/drivers/input/keyboard/cros_ec_keyb.c b/drivers/input/keyboard/cros_ec_keyb.c
index 87d071ae21da..780977dcf92d 100644
--- a/drivers/input/keyboard/cros_ec_keyb.c
+++ b/drivers/input/keyboard/cros_ec_keyb.c
@@ -229,7 +229,8 @@ static int cros_ec_keyb_probe(struct platform_device *pdev)
 	ckdev = devm_kzalloc(dev, sizeof(*ckdev), GFP_KERNEL);
 	if (!ckdev)
 		return -ENOMEM;
-	err = matrix_keypad_parse_of_params(dev, &ckdev->rows, &ckdev->cols);
+
+	err = matrix_keypad_parse_properties(dev, &ckdev->rows, &ckdev->cols);
 	if (err)
 		return err;
 
diff --git a/drivers/input/keyboard/lpc32xx-keys.c b/drivers/input/keyboard/lpc32xx-keys.c
index 632523d4f5dc..1dd57ac0e7a2 100644
--- a/drivers/input/keyboard/lpc32xx-keys.c
+++ b/drivers/input/keyboard/lpc32xx-keys.c
@@ -145,7 +145,7 @@ static int lpc32xx_parse_dt(struct device *dev,
 	u32 rows = 0, columns = 0;
 	int err;
 
-	err = matrix_keypad_parse_of_params(dev, &rows, &columns);
+	err = matrix_keypad_parse_properties(dev, &rows, &columns);
 	if (err)
 		return err;
 	if (rows != columns) {
diff --git a/drivers/input/keyboard/omap4-keypad.c b/drivers/input/keyboard/omap4-keypad.c
index 9ecb16701b13..ebc67ba41fe2 100644
--- a/drivers/input/keyboard/omap4-keypad.c
+++ b/drivers/input/keyboard/omap4-keypad.c
@@ -223,8 +223,8 @@ static int omap4_keypad_parse_dt(struct device *dev,
 	struct device_node *np = dev->of_node;
 	int err;
 
-	err = matrix_keypad_parse_of_params(dev, &keypad_data->rows,
-					    &keypad_data->cols);
+	err = matrix_keypad_parse_properties(dev, &keypad_data->rows,
+					     &keypad_data->cols);
 	if (err)
 		return err;
 
diff --git a/drivers/input/keyboard/pmic8xxx-keypad.c b/drivers/input/keyboard/pmic8xxx-keypad.c
index 5c68e3f096bc..97c5424f49b9 100644
--- a/drivers/input/keyboard/pmic8xxx-keypad.c
+++ b/drivers/input/keyboard/pmic8xxx-keypad.c
@@ -515,7 +515,7 @@ static int pmic8xxx_kp_probe(struct platform_device *pdev)
 	int rc;
 	unsigned int ctrl_val;
 
-	rc = matrix_keypad_parse_of_params(&pdev->dev, &rows, &cols);
+	rc = matrix_keypad_parse_properties(&pdev->dev, &rows, &cols);
 	if (rc)
 		return rc;
 
diff --git a/drivers/input/keyboard/pxa27x_keypad.c b/drivers/input/keyboard/pxa27x_keypad.c
index e24443376e75..3841fa30db33 100644
--- a/drivers/input/keyboard/pxa27x_keypad.c
+++ b/drivers/input/keyboard/pxa27x_keypad.c
@@ -126,7 +126,7 @@ static int pxa27x_keypad_matrix_key_parse_dt(struct pxa27x_keypad *keypad,
 	u32 rows, cols;
 	int error;
 
-	error = matrix_keypad_parse_of_params(dev, &rows, &cols);
+	error = matrix_keypad_parse_properties(dev, &rows, &cols);
 	if (error)
 		return error;
 
diff --git a/drivers/input/keyboard/st-keyscan.c b/drivers/input/keyboard/st-keyscan.c
index de7be4f03d91..babcfb165e4f 100644
--- a/drivers/input/keyboard/st-keyscan.c
+++ b/drivers/input/keyboard/st-keyscan.c
@@ -106,8 +106,8 @@ static int keypad_matrix_key_parse_dt(struct st_keyscan *keypad_data)
 	struct device_node *np = dev->of_node;
 	int error;
 
-	error = matrix_keypad_parse_of_params(dev, &keypad_data->n_rows,
-					      &keypad_data->n_cols);
+	error = matrix_keypad_parse_properties(dev, &keypad_data->n_rows,
+					       &keypad_data->n_cols);
 	if (error) {
 		dev_err(dev, "failed to parse keypad params\n");
 		return error;
diff --git a/drivers/input/keyboard/stmpe-keypad.c b/drivers/input/keyboard/stmpe-keypad.c
index fe6e3f22eed7..8c6c0b9109c7 100644
--- a/drivers/input/keyboard/stmpe-keypad.c
+++ b/drivers/input/keyboard/stmpe-keypad.c
@@ -354,7 +354,7 @@ static int stmpe_keypad_probe(struct platform_device *pdev)
 	input->id.bustype = BUS_I2C;
 	input->dev.parent = &pdev->dev;
 
-	error = matrix_keypad_parse_of_params(&pdev->dev, &rows, &cols);
+	error = matrix_keypad_parse_properties(&pdev->dev, &rows, &cols);
 	if (error)
 		return error;
 
diff --git a/drivers/input/keyboard/tca8418_keypad.c b/drivers/input/keyboard/tca8418_keypad.c
index 9f6308fac0b4..ccff9d1b7135 100644
--- a/drivers/input/keyboard/tca8418_keypad.c
+++ b/drivers/input/keyboard/tca8418_keypad.c
@@ -295,7 +295,7 @@ static int tca8418_keypad_probe(struct i2c_client *client,
 		struct device_node *np = dev->of_node;
 		int err;
 
-		err = matrix_keypad_parse_of_params(dev, &rows, &cols);
+		err = matrix_keypad_parse_properties(dev, &rows, &cols);
 		if (err)
 			return err;
 		rep = of_property_read_bool(np, "keypad,autorepeat");
diff --git a/drivers/input/keyboard/twl4030_keypad.c b/drivers/input/keyboard/twl4030_keypad.c
index 29396ca69416..39e72b3219d8 100644
--- a/drivers/input/keyboard/twl4030_keypad.c
+++ b/drivers/input/keyboard/twl4030_keypad.c
@@ -374,8 +374,8 @@ static int twl4030_kp_probe(struct platform_device *pdev)
 		kp->autorepeat = pdata->rep;
 		keymap_data = pdata->keymap_data;
 	} else {
-		error = matrix_keypad_parse_of_params(&pdev->dev, &kp->n_rows,
-						      &kp->n_cols);
+		error = matrix_keypad_parse_properties(&pdev->dev, &kp->n_rows,
+						       &kp->n_cols);
 		if (error)
 			return error;
 
diff --git a/drivers/input/matrix-keymap.c b/drivers/input/matrix-keymap.c
index 08b61f506db6..8ccefc15c7a4 100644
--- a/drivers/input/matrix-keymap.c
+++ b/drivers/input/matrix-keymap.c
@@ -14,18 +14,18 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
  */
 
 #include <linux/device.h>
+#include <linux/export.h>
 #include <linux/gfp.h>
-#include <linux/kernel.h>
-#include <linux/types.h>
 #include <linux/input.h>
-#include <linux/of.h>
-#include <linux/export.h>
-#include <linux/module.h>
 #include <linux/input/matrix_keypad.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/property.h>
+#include <linux/slab.h>
+#include <linux/types.h>
 
 static bool matrix_keypad_map_key(struct input_dev *input_dev,
 				  unsigned int rows, unsigned int cols,
@@ -49,18 +49,22 @@ static bool matrix_keypad_map_key(struct input_dev *input_dev,
 	return true;
 }
 
-#ifdef CONFIG_OF
-int matrix_keypad_parse_of_params(struct device *dev,
-				  unsigned int *rows, unsigned int *cols)
+/**
+ * matrix_keypad_parse_properties() - Read properties of matrix keypad
+ *
+ * @dev: Device containing properties
+ * @rows: Returns number of matrix rows
+ * @cols: Returns number of matrix columns
+ * @return 0 if OK, <0 on error
+ */
+int matrix_keypad_parse_properties(struct device *dev,
+				   unsigned int *rows, unsigned int *cols)
 {
-	struct device_node *np = dev->of_node;
+	*rows = *cols = 0;
+
+	device_property_read_u32(dev, "keypad,num-rows", rows);
+	device_property_read_u32(dev, "keypad,num-columns", cols);
 
-	if (!np) {
-		dev_err(dev, "missing DT data");
-		return -EINVAL;
-	}
-	of_property_read_u32(np, "keypad,num-rows", rows);
-	of_property_read_u32(np, "keypad,num-columns", cols);
 	if (!*rows || !*cols) {
 		dev_err(dev, "number of keypad rows/columns not specified\n");
 		return -EINVAL;
@@ -68,62 +72,61 @@ int matrix_keypad_parse_of_params(struct device *dev,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(matrix_keypad_parse_of_params);
+EXPORT_SYMBOL_GPL(matrix_keypad_parse_properties);
 
-static int matrix_keypad_parse_of_keymap(const char *propname,
-					 unsigned int rows, unsigned int cols,
-					 struct input_dev *input_dev)
+static int matrix_keypad_parse_keymap(const char *propname,
+				      unsigned int rows, unsigned int cols,
+				      struct input_dev *input_dev)
 {
 	struct device *dev = input_dev->dev.parent;
-	struct device_node *np = dev->of_node;
 	unsigned int row_shift = get_count_order(cols);
 	unsigned int max_keys = rows << row_shift;
-	unsigned int proplen, i, size;
-	const __be32 *prop;
-
-	if (!np)
-		return -ENOENT;
+	u32 *keys;
+	int i;
+	int size;
+	int retval;
 
 	if (!propname)
 		propname = "linux,keymap";
 
-	prop = of_get_property(np, propname, &proplen);
-	if (!prop) {
-		dev_err(dev, "OF: %s property not defined in %s\n",
-			propname, np->full_name);
-		return -ENOENT;
+	size = device_property_read_u32_array(dev, propname, NULL, 0);
+	if (size <= 0) {
+		dev_err(dev, "missing or malformed property %s: %d\n",
+			propname, size);
+		return size < 0 ? size : -EINVAL;
 	}
 
-	if (proplen % sizeof(u32)) {
-		dev_err(dev, "OF: Malformed keycode property %s in %s\n",
-			propname, np->full_name);
+	if (size > max_keys) {
+		dev_err(dev, "%s size overflow (%d vs max %u)\n",
+			propname, size, max_keys);
 		return -EINVAL;
 	}
 
-	size = proplen / sizeof(u32);
-	if (size > max_keys) {
-		dev_err(dev, "OF: %s size overflow\n", propname);
-		return -EINVAL;
+	keys = kmalloc_array(size, sizeof(u32), GFP_KERNEL);
+	if (!keys)
+		return -ENOMEM;
+
+	retval = device_property_read_u32_array(dev, propname, keys, size);
+	if (retval) {
+		dev_err(dev, "failed to read %s property: %d\n",
+			propname, retval);
+		goto out;
 	}
 
 	for (i = 0; i < size; i++) {
-		unsigned int key = be32_to_cpup(prop + i);
-
 		if (!matrix_keypad_map_key(input_dev, rows, cols,
-					   row_shift, key))
-			return -EINVAL;
+					   row_shift, keys[i])) {
+			retval = -EINVAL;
+			goto out;
+		}
 	}
 
-	return 0;
-}
-#else
-static int matrix_keypad_parse_of_keymap(const char *propname,
-					 unsigned int rows, unsigned int cols,
-					 struct input_dev *input_dev)
-{
-	return -ENOSYS;
+	retval = 0;
+
+out:
+	kfree(keys);
+	return retval;
 }
-#endif
 
 /**
  * matrix_keypad_build_keymap - convert platform keymap into matrix keymap
@@ -192,8 +195,8 @@ int matrix_keypad_build_keymap(const struct matrix_keymap_data *keymap_data,
 				return -EINVAL;
 		}
 	} else {
-		error = matrix_keypad_parse_of_keymap(keymap_name, rows, cols,
-						      input_dev);
+		error = matrix_keypad_parse_keymap(keymap_name, rows, cols,
+						   input_dev);
 		if (error)
 			return error;
 	}
diff --git a/include/linux/input/matrix_keypad.h b/include/linux/input/matrix_keypad.h
index 27e06acc509a..37b04a0fdea4 100644
--- a/include/linux/input/matrix_keypad.h
+++ b/include/linux/input/matrix_keypad.h
@@ -80,24 +80,9 @@ int matrix_keypad_build_keymap(const struct matrix_keymap_data *keymap_data,
 			       unsigned int rows, unsigned int cols,
 			       unsigned short *keymap,
 			       struct input_dev *input_dev);
+int matrix_keypad_parse_properties(struct device *dev,
+				   unsigned int *rows, unsigned int *cols);
 
-#ifdef CONFIG_OF
-/**
- * matrix_keypad_parse_of_params() - Read parameters from matrix-keypad node
- *
- * @dev: Device containing of_node
- * @rows: Returns number of matrix rows
- * @cols: Returns number of matrix columns
- * @return 0 if OK, <0 on error
- */
-int matrix_keypad_parse_of_params(struct device *dev,
-				  unsigned int *rows, unsigned int *cols);
-#else
-static inline int matrix_keypad_parse_of_params(struct device *dev,
-				  unsigned int *rows, unsigned int *cols)
-{
-	return -ENOSYS;
-}
-#endif /* CONFIG_OF */
+#define matrix_keypad_parse_of_params matrix_keypad_parse_properties
 
 #endif /* _MATRIX_KEYPAD_H */
-- 
cgit v1.2.3


From 8b1a315b35a961198336b944635c6936321b4a77 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Tue, 17 Jan 2017 14:18:50 -0800
Subject: Input: tca8418 - switch to using generic device properties

Let's drop legacy platform data support (there are no users in mainline)
and switch to using generic device properties, which will make the driver
simpler (non-OF boards can use property sets to describe hardware).

Reviewed-by: Maxime Ripard <maxime.ripard@free-electrons.com>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/keyboard/tca8418_keypad.c | 90 ++++++++++-----------------------
 include/linux/input/tca8418_keypad.h    | 44 ----------------
 2 files changed, 26 insertions(+), 108 deletions(-)
 delete mode 100644 include/linux/input/tca8418_keypad.h

(limited to 'include/linux')

diff --git a/drivers/input/keyboard/tca8418_keypad.c b/drivers/input/keyboard/tca8418_keypad.c
index ccff9d1b7135..44dd7689c571 100644
--- a/drivers/input/keyboard/tca8418_keypad.c
+++ b/drivers/input/keyboard/tca8418_keypad.c
@@ -24,18 +24,17 @@
  * alternative licensing inquiries.
  */
 
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/init.h>
 #include <linux/delay.h>
-#include <linux/slab.h>
-#include <linux/interrupt.h>
-#include <linux/workqueue.h>
-#include <linux/gpio.h>
 #include <linux/i2c.h>
+#include <linux/init.h>
 #include <linux/input.h>
-#include <linux/input/tca8418_keypad.h>
+#include <linux/input/matrix_keypad.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
 #include <linux/of.h>
+#include <linux/property.h>
+#include <linux/slab.h>
+#include <linux/types.h>
 
 /* TCA8418 hardware limits */
 #define TCA8418_MAX_ROWS	8
@@ -264,43 +263,25 @@ static int tca8418_configure(struct tca8418_keypad *keypad_data,
 }
 
 static int tca8418_keypad_probe(struct i2c_client *client,
-					  const struct i2c_device_id *id)
+				const struct i2c_device_id *id)
 {
 	struct device *dev = &client->dev;
-	const struct tca8418_keypad_platform_data *pdata =
-						dev_get_platdata(dev);
 	struct tca8418_keypad *keypad_data;
 	struct input_dev *input;
-	const struct matrix_keymap_data *keymap_data = NULL;
 	u32 rows = 0, cols = 0;
-	bool rep = false;
-	bool irq_is_gpio = false;
-	int irq;
 	int error, row_shift, max_keys;
-	unsigned long trigger = 0;
 
-	/* Copy the platform data */
-	if (pdata) {
-		if (!pdata->keymap_data) {
-			dev_err(dev, "no keymap data defined\n");
-			return -EINVAL;
-		}
-		keymap_data = pdata->keymap_data;
-		rows = pdata->rows;
-		cols = pdata->cols;
-		rep  = pdata->rep;
-		irq_is_gpio = pdata->irq_is_gpio;
-		trigger = IRQF_TRIGGER_FALLING;
-	} else {
-		struct device_node *np = dev->of_node;
-		int err;
-
-		err = matrix_keypad_parse_properties(dev, &rows, &cols);
-		if (err)
-			return err;
-		rep = of_property_read_bool(np, "keypad,autorepeat");
+	/* Check i2c driver capabilities */
+	if (!i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_BYTE)) {
+		dev_err(dev, "%s adapter not supported\n",
+			dev_driver_string(&client->adapter->dev));
+		return -ENODEV;
 	}
 
+	error = matrix_keypad_parse_properties(dev, &rows, &cols);
+	if (error)
+		return error;
+
 	if (!rows || rows > TCA8418_MAX_ROWS) {
 		dev_err(dev, "invalid rows\n");
 		return -EINVAL;
@@ -311,13 +292,6 @@ static int tca8418_keypad_probe(struct i2c_client *client,
 		return -EINVAL;
 	}
 
-	/* Check i2c driver capabilities */
-	if (!i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_BYTE)) {
-		dev_err(dev, "%s adapter not supported\n",
-			dev_driver_string(&client->adapter->dev));
-		return -ENODEV;
-	}
-
 	row_shift = get_count_order(cols);
 	max_keys = rows << row_shift;
 
@@ -347,23 +321,20 @@ static int tca8418_keypad_probe(struct i2c_client *client,
 	input->id.product = 0x001;
 	input->id.version = 0x0001;
 
-	error = matrix_keypad_build_keymap(keymap_data, NULL, rows, cols,
-					   NULL, input);
+	error = matrix_keypad_build_keymap(NULL, NULL, rows, cols, NULL, input);
 	if (error) {
 		dev_err(dev, "Failed to build keymap\n");
 		return error;
 	}
 
-	if (rep)
+	if (device_property_read_bool(dev, "keypad,autorepeat"))
 		__set_bit(EV_REP, input->evbit);
-	input_set_capability(input, EV_MSC, MSC_SCAN);
 
-	irq = client->irq;
-	if (irq_is_gpio)
-		irq = gpio_to_irq(irq);
+	input_set_capability(input, EV_MSC, MSC_SCAN);
 
-	error = devm_request_threaded_irq(dev, irq, NULL, tca8418_irq_handler,
-					  trigger | IRQF_SHARED | IRQF_ONESHOT,
+	error = devm_request_threaded_irq(dev, client->irq,
+					  NULL, tca8418_irq_handler,
+					  IRQF_SHARED | IRQF_ONESHOT,
 					  client->name, keypad_data);
 	if (error) {
 		dev_err(dev, "Unable to claim irq %d; error %d\n",
@@ -382,30 +353,21 @@ static int tca8418_keypad_probe(struct i2c_client *client,
 }
 
 static const struct i2c_device_id tca8418_id[] = {
-	{ TCA8418_NAME, 8418, },
+	{ "tca8418", 8418, },
 	{ }
 };
 MODULE_DEVICE_TABLE(i2c, tca8418_id);
 
-#ifdef CONFIG_OF
 static const struct of_device_id tca8418_dt_ids[] = {
 	{ .compatible = "ti,tca8418", },
 	{ }
 };
 MODULE_DEVICE_TABLE(of, tca8418_dt_ids);
 
-/*
- * The device tree based i2c loader looks for
- * "i2c:" + second_component_of(property("compatible"))
- * and therefore we need an alias to be found.
- */
-MODULE_ALIAS("i2c:tca8418");
-#endif
-
 static struct i2c_driver tca8418_keypad_driver = {
 	.driver = {
-		.name	= TCA8418_NAME,
-		.of_match_table = of_match_ptr(tca8418_dt_ids),
+		.name	= "tca8418_keypad",
+		.of_match_table = tca8418_dt_ids,
 	},
 	.probe		= tca8418_keypad_probe,
 	.id_table	= tca8418_id,
diff --git a/include/linux/input/tca8418_keypad.h b/include/linux/input/tca8418_keypad.h
deleted file mode 100644
index e71a85dc2cbd..000000000000
--- a/include/linux/input/tca8418_keypad.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * TCA8418 keypad platform support
- *
- * Copyright (C) 2011 Fuel7, Inc.  All rights reserved.
- *
- * Author: Kyle Manna <kyle.manna@fuel7.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
- * If you can't comply with GPLv2, alternative licensing terms may be
- * arranged. Please contact Fuel7, Inc. (http://fuel7.com/) for proprietary
- * alternative licensing inquiries.
- */
-
-#ifndef _TCA8418_KEYPAD_H
-#define _TCA8418_KEYPAD_H
-
-#include <linux/types.h>
-#include <linux/input/matrix_keypad.h>
-
-#define TCA8418_I2C_ADDR	0x34
-#define	TCA8418_NAME		"tca8418_keypad"
-
-struct tca8418_keypad_platform_data {
-	const struct matrix_keymap_data *keymap_data;
-	unsigned rows;
-	unsigned cols;
-	bool rep;
-	bool irq_is_gpio;
-};
-
-#endif
-- 
cgit v1.2.3


From 57292b58ddb58689e8c3b4c6eadbef10d9ca44dd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 31 Jan 2017 16:57:29 +0100
Subject: block: introduce blk_rq_is_passthrough

This can be used to check for fs vs non-fs requests and basically
removes all knowledge of BLOCK_PC specific from the block layer,
as well as preparing for removing the cmd_type field in struct request.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c                      |  8 ++++----
 block/blk-exec.c                      |  2 +-
 block/blk.h                           |  2 +-
 block/elevator.c                      |  4 ++--
 block/mq-deadline.c                   |  2 +-
 drivers/ata/libata-scsi.c             |  2 +-
 drivers/block/cciss.c                 | 36 +++++++++++++++++------------------
 drivers/nvme/host/fc.c                |  2 +-
 drivers/nvme/host/pci.c               |  4 ++--
 drivers/nvme/host/rdma.c              |  4 ++--
 drivers/nvme/target/loop.c            |  2 +-
 drivers/scsi/hpsa.c                   |  4 ++--
 drivers/scsi/scsi.c                   |  2 +-
 drivers/scsi/scsi_error.c             |  4 ++--
 drivers/scsi/scsi_lib.c               |  6 +++---
 drivers/scsi/smartpqi/smartpqi_init.c |  2 +-
 drivers/scsi/sun3_scsi.c              |  2 +-
 include/linux/blkdev.h                | 16 +++++++++++-----
 include/linux/blktrace_api.h          |  4 ++--
 include/scsi/scsi_cmnd.h              |  2 +-
 kernel/trace/blktrace.c               |  2 +-
 21 files changed, 59 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 95829523cded..44431086e4e7 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2506,10 +2506,10 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 	 * TODO: tj: This is too subtle.  It would be better to let
 	 * low level drivers do what they see fit.
 	 */
-	if (req->cmd_type == REQ_TYPE_FS)
+	if (!blk_rq_is_passthrough(req))
 		req->errors = 0;
 
-	if (error && req->cmd_type == REQ_TYPE_FS &&
+	if (error && !blk_rq_is_passthrough(req) &&
 	    !(req->rq_flags & RQF_QUIET)) {
 		char *error_type;
 
@@ -2581,7 +2581,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 	req->__data_len -= total_bytes;
 
 	/* update sector only for requests with clear definition of sector */
-	if (req->cmd_type == REQ_TYPE_FS)
+	if (!blk_rq_is_passthrough(req))
 		req->__sector += total_bytes >> 9;
 
 	/* mixed attributes always follow the first bio */
@@ -2659,7 +2659,7 @@ void blk_finish_request(struct request *req, int error)
 
 	BUG_ON(blk_queued_rq(req));
 
-	if (unlikely(laptop_mode) && req->cmd_type == REQ_TYPE_FS)
+	if (unlikely(laptop_mode) && !blk_rq_is_passthrough(req))
 		laptop_io_completion(&req->q->backing_dev_info);
 
 	blk_delete_timer(req);
diff --git a/block/blk-exec.c b/block/blk-exec.c
index ed51800f4b44..8cd0e9bc8dc8 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -51,7 +51,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 	int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
 
 	WARN_ON(irqs_disabled());
-	WARN_ON(rq->cmd_type == REQ_TYPE_FS);
+	WARN_ON(!blk_rq_is_passthrough(rq));
 
 	rq->rq_disk = bd_disk;
 	rq->end_io = done;
diff --git a/block/blk.h b/block/blk.h
index 9a716b5925a4..c1bd4bf9e645 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -249,7 +249,7 @@ static inline int blk_do_io_stat(struct request *rq)
 {
 	return rq->rq_disk &&
 	       (rq->rq_flags & RQF_IO_STAT) &&
-		(rq->cmd_type == REQ_TYPE_FS);
+		!blk_rq_is_passthrough(rq);
 }
 
 /*
diff --git a/block/elevator.c b/block/elevator.c
index ef7f59469acc..dba9be891a6b 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -634,7 +634,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
 
 	if (rq->rq_flags & RQF_SOFTBARRIER) {
 		/* barriers are scheduling boundary, update end_sector */
-		if (rq->cmd_type == REQ_TYPE_FS) {
+		if (!blk_rq_is_passthrough(rq)) {
 			q->end_sector = rq_end_sector(rq);
 			q->boundary_rq = rq;
 		}
@@ -676,7 +676,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
 		if (elv_attempt_insert_merge(q, rq))
 			break;
 	case ELEVATOR_INSERT_SORT:
-		BUG_ON(rq->cmd_type != REQ_TYPE_FS);
+		BUG_ON(blk_rq_is_passthrough(rq));
 		rq->rq_flags |= RQF_SORTED;
 		q->nr_sorted++;
 		if (rq_mergeable(rq)) {
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index d93ec713fa62..49583536698c 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -398,7 +398,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 	if (blk_mq_sched_bypass_insert(hctx, rq))
 		return;
 
-	if (at_head || rq->cmd_type != REQ_TYPE_FS) {
+	if (at_head || blk_rq_is_passthrough(rq)) {
 		if (at_head)
 			list_add(&rq->queuelist, &dd->dispatch);
 		else
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 6abd73975f87..c771d4c341ea 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -1265,7 +1265,7 @@ static void ata_scsi_sdev_config(struct scsi_device *sdev)
  */
 static int atapi_drain_needed(struct request *rq)
 {
-	if (likely(rq->cmd_type != REQ_TYPE_BLOCK_PC))
+	if (likely(!blk_rq_is_passthrough(rq)))
 		return 0;
 
 	if (!blk_rq_bytes(rq) || op_is_write(req_op(rq)))
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index b93bb73b49d2..53dc2971a3ba 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1854,7 +1854,7 @@ static void cciss_softirq_done(struct request *rq)
 	dev_dbg(&h->pdev->dev, "Done with %p\n", rq);
 
 	/* set the residual count for pc requests */
-	if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
+	if (blk_rq_is_passthrough(rq))
 		scsi_req(rq)->resid_len = c->err_info->ResidualCnt;
 
 	blk_end_request_all(rq, (rq->errors == 0) ? 0 : -EIO);
@@ -3083,7 +3083,7 @@ static inline int evaluate_target_status(ctlr_info_t *h,
 	driver_byte = DRIVER_OK;
 	msg_byte = cmd->err_info->CommandStatus; /* correct?  seems too device specific */
 
-	if (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC)
+	if (blk_rq_is_passthrough(cmd->rq))
 		host_byte = DID_PASSTHROUGH;
 	else
 		host_byte = DID_OK;
@@ -3092,7 +3092,7 @@ static inline int evaluate_target_status(ctlr_info_t *h,
 		host_byte, driver_byte);
 
 	if (cmd->err_info->ScsiStatus != SAM_STAT_CHECK_CONDITION) {
-		if (cmd->rq->cmd_type != REQ_TYPE_BLOCK_PC)
+		if (!blk_rq_is_passthrough(cmd->rq))
 			dev_warn(&h->pdev->dev, "cmd %p "
 			       "has SCSI Status 0x%x\n",
 			       cmd, cmd->err_info->ScsiStatus);
@@ -3103,16 +3103,16 @@ static inline int evaluate_target_status(ctlr_info_t *h,
 	sense_key = 0xf & cmd->err_info->SenseInfo[2];
 	/* no status or recovered error */
 	if (((sense_key == 0x0) || (sense_key == 0x1)) &&
-	    (cmd->rq->cmd_type != REQ_TYPE_BLOCK_PC))
+	    !blk_rq_is_passthrough(cmd->rq))
 		error_value = 0;
 
 	if (check_for_unit_attention(h, cmd)) {
-		*retry_cmd = !(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC);
+		*retry_cmd = !blk_rq_is_passthrough(cmd->rq);
 		return 0;
 	}
 
 	/* Not SG_IO or similar? */
-	if (cmd->rq->cmd_type != REQ_TYPE_BLOCK_PC) {
+	if (!blk_rq_is_passthrough(cmd->rq)) {
 		if (error_value != 0)
 			dev_warn(&h->pdev->dev, "cmd %p has CHECK CONDITION"
 			       " sense key = 0x%x\n", cmd, sense_key);
@@ -3146,14 +3146,14 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 		rq->errors = evaluate_target_status(h, cmd, &retry_cmd);
 		break;
 	case CMD_DATA_UNDERRUN:
-		if (cmd->rq->cmd_type == REQ_TYPE_FS) {
+		if (!blk_rq_is_passthrough(cmd->rq)) {
 			dev_warn(&h->pdev->dev, "cmd %p has"
 			       " completed with data underrun "
 			       "reported\n", cmd);
 		}
 		break;
 	case CMD_DATA_OVERRUN:
-		if (cmd->rq->cmd_type == REQ_TYPE_FS)
+		if (!blk_rq_is_passthrough(cmd->rq))
 			dev_warn(&h->pdev->dev, "cciss: cmd %p has"
 			       " completed with data overrun "
 			       "reported\n", cmd);
@@ -3163,7 +3163,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 		       "reported invalid\n", cmd);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+			blk_rq_is_passthrough(cmd->rq) ?
 				DID_PASSTHROUGH : DID_ERROR);
 		break;
 	case CMD_PROTOCOL_ERR:
@@ -3171,7 +3171,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 		       "protocol error\n", cmd);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+			blk_rq_is_passthrough(cmd->rq) ?
 				DID_PASSTHROUGH : DID_ERROR);
 		break;
 	case CMD_HARDWARE_ERR:
@@ -3179,7 +3179,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 		       " hardware error\n", cmd);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+			blk_rq_is_passthrough(cmd->rq) ?
 				DID_PASSTHROUGH : DID_ERROR);
 		break;
 	case CMD_CONNECTION_LOST:
@@ -3187,7 +3187,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 		       "connection lost\n", cmd);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+			blk_rq_is_passthrough(cmd->rq) ?
 				DID_PASSTHROUGH : DID_ERROR);
 		break;
 	case CMD_ABORTED:
@@ -3195,7 +3195,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 		       "aborted\n", cmd);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+			blk_rq_is_passthrough(cmd->rq) ?
 				DID_PASSTHROUGH : DID_ABORT);
 		break;
 	case CMD_ABORT_FAILED:
@@ -3203,7 +3203,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 		       "abort failed\n", cmd);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+			blk_rq_is_passthrough(cmd->rq) ?
 				DID_PASSTHROUGH : DID_ERROR);
 		break;
 	case CMD_UNSOLICITED_ABORT:
@@ -3218,21 +3218,21 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 				"%p retried too many times\n", cmd);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+			blk_rq_is_passthrough(cmd->rq) ?
 				DID_PASSTHROUGH : DID_ABORT);
 		break;
 	case CMD_TIMEOUT:
 		dev_warn(&h->pdev->dev, "cmd %p timedout\n", cmd);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+			blk_rq_is_passthrough(cmd->rq) ?
 				DID_PASSTHROUGH : DID_ERROR);
 		break;
 	case CMD_UNABORTABLE:
 		dev_warn(&h->pdev->dev, "cmd %p unabortable\n", cmd);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC ?
+			blk_rq_is_passthrough(cmd->rq) ?
 				DID_PASSTHROUGH : DID_ERROR);
 		break;
 	default:
@@ -3241,7 +3241,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 		       cmd->err_info->CommandStatus);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+			blk_rq_is_passthrough(cmd->rq) ?
 				DID_PASSTHROUGH : DID_ERROR);
 	}
 
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index fcc9dcfdf675..40c979b777b8 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -1937,7 +1937,7 @@ nvme_fc_complete_rq(struct request *rq)
 			return;
 		}
 
-		if (rq->cmd_type == REQ_TYPE_DRV_PRIV)
+		if (blk_rq_is_passthrough(rq))
 			error = rq->errors;
 		else
 			error = nvme_error_status(rq->errors);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 7103bce4ba4f..f29365b8c9be 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -588,7 +588,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	 */
 	if (ns && ns->ms && !blk_integrity_rq(req)) {
 		if (!(ns->pi_type && ns->ms == 8) &&
-					req->cmd_type != REQ_TYPE_DRV_PRIV) {
+		    !blk_rq_is_passthrough(req)) {
 			blk_mq_end_request(req, -EFAULT);
 			return BLK_MQ_RQ_QUEUE_OK;
 		}
@@ -645,7 +645,7 @@ static void nvme_complete_rq(struct request *req)
 			return;
 		}
 
-		if (req->cmd_type == REQ_TYPE_DRV_PRIV)
+		if (blk_rq_is_passthrough(req))
 			error = req->errors;
 		else
 			error = nvme_error_status(req->errors);
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 557f29b1f1bb..ecc79b2fcfaf 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1423,7 +1423,7 @@ static inline bool nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue,
 	if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags))) {
 		struct nvme_command *cmd = nvme_req(rq)->cmd;
 
-		if (rq->cmd_type != REQ_TYPE_DRV_PRIV ||
+		if (!blk_rq_is_passthrough(rq) ||
 		    cmd->common.opcode != nvme_fabrics_command ||
 		    cmd->fabrics.fctype != nvme_fabrics_type_connect)
 			return false;
@@ -1522,7 +1522,7 @@ static void nvme_rdma_complete_rq(struct request *rq)
 			return;
 		}
 
-		if (rq->cmd_type == REQ_TYPE_DRV_PRIV)
+		if (blk_rq_is_passthrough(rq))
 			error = rq->errors;
 		else
 			error = nvme_error_status(rq->errors);
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 9aaa70071ae5..f3862e38f574 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -104,7 +104,7 @@ static void nvme_loop_complete_rq(struct request *req)
 			return;
 		}
 
-		if (req->cmd_type == REQ_TYPE_DRV_PRIV)
+		if (blk_rq_is_passthrough(req))
 			error = req->errors;
 		else
 			error = nvme_error_status(req->errors);
diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
index cbc0c5fe5a60..c611412a8de9 100644
--- a/drivers/scsi/hpsa.c
+++ b/drivers/scsi/hpsa.c
@@ -5539,8 +5539,8 @@ static int hpsa_scsi_queue_command(struct Scsi_Host *sh, struct scsi_cmnd *cmd)
 	 * Retries always go down the normal I/O path.
 	 */
 	if (likely(cmd->retries == 0 &&
-		cmd->request->cmd_type == REQ_TYPE_FS &&
-		h->acciopath_status)) {
+			!blk_rq_is_passthrough(cmd->request) &&
+			h->acciopath_status)) {
 		rc = hpsa_ioaccel_submit(h, c, cmd, scsi3addr);
 		if (rc == 0)
 			return 0;
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 3d8d2153b448..7bfbcfa7af40 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -238,7 +238,7 @@ void scsi_finish_command(struct scsi_cmnd *cmd)
 				"(result %x)\n", cmd->result));
 
 	good_bytes = scsi_bufflen(cmd);
-        if (cmd->request->cmd_type != REQ_TYPE_BLOCK_PC) {
+        if (!blk_rq_is_passthrough(cmd->request)) {
 		int old_good_bytes = good_bytes;
 		drv = scsi_cmd_to_driver(cmd);
 		if (drv->done)
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 4b40f746d534..b4ce7bb5d2a9 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -1106,7 +1106,7 @@ static int scsi_request_sense(struct scsi_cmnd *scmd)
 
 static int scsi_eh_action(struct scsi_cmnd *scmd, int rtn)
 {
-	if (scmd->request->cmd_type != REQ_TYPE_BLOCK_PC) {
+	if (!blk_rq_is_passthrough(scmd->request)) {
 		struct scsi_driver *sdrv = scsi_cmd_to_driver(scmd);
 		if (sdrv->eh_action)
 			rtn = sdrv->eh_action(scmd, rtn);
@@ -1746,7 +1746,7 @@ check_type:
 	 * the check condition was retryable.
 	 */
 	if (scmd->request->cmd_flags & REQ_FAILFAST_DEV ||
-	    scmd->request->cmd_type == REQ_TYPE_BLOCK_PC)
+	    blk_rq_is_passthrough(scmd->request))
 		return 1;
 	else
 		return 0;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 8188e5c71f75..31629a7b728d 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -582,7 +582,7 @@ void scsi_run_host_queues(struct Scsi_Host *shost)
 
 static void scsi_uninit_cmd(struct scsi_cmnd *cmd)
 {
-	if (cmd->request->cmd_type == REQ_TYPE_FS) {
+	if (!blk_rq_is_passthrough(cmd->request)) {
 		struct scsi_driver *drv = scsi_cmd_to_driver(cmd);
 
 		if (drv->uninit_command)
@@ -806,7 +806,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 			sense_deferred = scsi_sense_is_deferred(&sshdr);
 	}
 
-	if (req->cmd_type == REQ_TYPE_BLOCK_PC) { /* SG_IO ioctl from block level */
+	if (blk_rq_is_passthrough(req)) {
 		if (result) {
 			if (sense_valid) {
 				/*
@@ -847,7 +847,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 		error = __scsi_error_from_host_byte(cmd, result);
 	}
 
-	/* no bidi support for !REQ_TYPE_BLOCK_PC yet */
+	/* no bidi support for !blk_rq_is_passthrough yet */
 	BUG_ON(blk_bidi_rq(req));
 
 	/*
diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c
index 8702d9cf8040..11c0dfb3dfa3 100644
--- a/drivers/scsi/smartpqi/smartpqi_init.c
+++ b/drivers/scsi/smartpqi/smartpqi_init.c
@@ -4499,7 +4499,7 @@ static int pqi_scsi_queue_command(struct Scsi_Host *shost,
 	if (pqi_is_logical_device(device)) {
 		raid_bypassed = false;
 		if (device->offload_enabled &&
-			scmd->request->cmd_type == REQ_TYPE_FS) {
+				!blk_rq_is_passthrough(scmd->request)) {
 			rc = pqi_raid_bypass_submit_scsi_cmd(ctrl_info, device,
 				scmd, queue_group);
 			if (rc == 0 ||
diff --git a/drivers/scsi/sun3_scsi.c b/drivers/scsi/sun3_scsi.c
index 88db6992420e..bcf7d05d1aab 100644
--- a/drivers/scsi/sun3_scsi.c
+++ b/drivers/scsi/sun3_scsi.c
@@ -260,7 +260,7 @@ static int sun3scsi_dma_xfer_len(struct NCR5380_hostdata *hostdata,
 {
 	int wanted_len = cmd->SCp.this_residual;
 
-	if (wanted_len < DMA_MIN_SIZE || cmd->request->cmd_type != REQ_TYPE_FS)
+	if (wanted_len < DMA_MIN_SIZE || blk_rq_is_passthrough(cmd->request))
 		return 0;
 
 	return wanted_len;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e4c5f284fe2d..7121be081517 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -242,6 +242,11 @@ struct request {
 	struct request *next_rq;
 };
 
+static inline bool blk_rq_is_passthrough(struct request *rq)
+{
+	return rq->cmd_type != REQ_TYPE_FS;
+}
+
 static inline unsigned short req_get_ioprio(struct request *req)
 {
 	return req->ioprio;
@@ -698,9 +703,10 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 	((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
 			     REQ_FAILFAST_DRIVER))
 
-#define blk_account_rq(rq) \
-	(((rq)->rq_flags & RQF_STARTED) && \
-	 ((rq)->cmd_type == REQ_TYPE_FS))
+static inline bool blk_account_rq(struct request *rq)
+{
+	return (rq->rq_flags & RQF_STARTED) && !blk_rq_is_passthrough(rq);
+}
 
 #define blk_rq_cpu_valid(rq)	((rq)->cpu != -1)
 #define blk_bidi_rq(rq)		((rq)->next_rq != NULL)
@@ -775,7 +781,7 @@ static inline void blk_clear_rl_full(struct request_list *rl, bool sync)
 
 static inline bool rq_mergeable(struct request *rq)
 {
-	if (rq->cmd_type != REQ_TYPE_FS)
+	if (blk_rq_is_passthrough(rq))
 		return false;
 
 	if (req_op(rq) == REQ_OP_FLUSH)
@@ -1049,7 +1055,7 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
 {
 	struct request_queue *q = rq->q;
 
-	if (unlikely(rq->cmd_type != REQ_TYPE_FS))
+	if (blk_rq_is_passthrough(rq))
 		return q->limits.max_hw_sectors;
 
 	if (!q->limits.chunk_sectors ||
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index a143a67a6f33..341f9418bd68 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -114,12 +114,12 @@ extern void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes);
 
 static inline sector_t blk_rq_trace_sector(struct request *rq)
 {
-	return (rq->cmd_type != REQ_TYPE_FS) ? 0 : blk_rq_pos(rq);
+	return blk_rq_is_passthrough(rq) ? 0 : blk_rq_pos(rq);
 }
 
 static inline unsigned int blk_rq_trace_nr_sectors(struct request *rq)
 {
-	return (rq->cmd_type != REQ_TYPE_FS) ? 0 : blk_rq_sectors(rq);
+	return blk_rq_is_passthrough(rq) ? 0 : blk_rq_sectors(rq);
 }
 
 #endif
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index f708f1acfc50..b379f93a2c48 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -151,7 +151,7 @@ static inline void *scsi_cmd_priv(struct scsi_cmnd *cmd)
 	return cmd + 1;
 }
 
-/* make sure not to use it with REQ_TYPE_BLOCK_PC commands */
+/* make sure not to use it with passthrough commands */
 static inline struct scsi_driver *scsi_cmd_to_driver(struct scsi_cmnd *cmd)
 {
 	return *(struct scsi_driver **)cmd->request->rq_disk->private_data;
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 512b34703ac3..e33050e3ea1c 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -712,7 +712,7 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
 	if (likely(!bt))
 		return;
 
-	if (rq->cmd_type != REQ_TYPE_FS)
+	if (blk_rq_is_passthrough(rq))
 		what |= BLK_TC_ACT(BLK_TC_PC);
 	else
 		what |= BLK_TC_ACT(BLK_TC_FS);
-- 
cgit v1.2.3


From 2f5a8e80f79dc82e00f4cca557dc9ceaf064b450 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 31 Jan 2017 16:57:30 +0100
Subject: ide: don't abuse cmd_type

Currently the legacy ide driver defines several request types of it's own,
which is in the way of removing that field entirely.

Instead add a type field to struct ide_request and use that to distinguish
the different types of IDE-internal requests.

It's a bit of a mess, but so is the surrounding code..

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/ide/ide-atapi.c    | 18 ++++++++++-----
 drivers/ide/ide-cd.c       | 25 ++++++++++++---------
 drivers/ide/ide-cd_ioctl.c |  1 +
 drivers/ide/ide-devsets.c  |  1 +
 drivers/ide/ide-disk.c     |  6 +++--
 drivers/ide/ide-eh.c       |  4 ++--
 drivers/ide/ide-floppy.c   | 18 ++++++++++-----
 drivers/ide/ide-io.c       |  8 +++----
 drivers/ide/ide-ioctls.c   |  4 +++-
 drivers/ide/ide-park.c     |  2 ++
 drivers/ide/ide-pm.c       | 16 ++++++++-----
 drivers/ide/ide-tape.c     |  6 +++--
 drivers/ide/ide-taskfile.c |  3 ++-
 include/linux/ide.h        | 56 +++++++++++++++++++++++++++++++++++++---------
 14 files changed, 117 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 7c826ecbd276..a8c650e8c92e 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -95,6 +95,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
 	rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
 	scsi_req_init(rq);
 	rq->cmd_type = REQ_TYPE_DRV_PRIV;
+	ide_req(rq)->type = ATA_PRIV_MISC;
 	rq->special = (char *)pc;
 
 	if (buf && bufflen) {
@@ -193,7 +194,7 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
 
 	BUG_ON(sense_len > sizeof(*sense));
 
-	if (rq->cmd_type == REQ_TYPE_ATA_SENSE || drive->sense_rq_armed)
+	if (ata_sense_request(rq) || drive->sense_rq_armed)
 		return;
 
 	memset(sense, 0, sizeof(*sense));
@@ -211,7 +212,8 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
 	}
 
 	sense_rq->rq_disk = rq->rq_disk;
-	sense_rq->cmd_type = REQ_TYPE_ATA_SENSE;
+	sense_rq->cmd_type = REQ_TYPE_DRV_PRIV;
+	ide_req(sense_rq)->type = ATA_PRIV_SENSE;
 	sense_rq->rq_flags |= RQF_PREEMPT;
 
 	req->cmd[0] = GPCMD_REQUEST_SENSE;
@@ -313,10 +315,14 @@ int ide_cd_get_xferlen(struct request *rq)
 	switch (rq->cmd_type) {
 	case REQ_TYPE_FS:
 		return 32768;
-	case REQ_TYPE_ATA_SENSE:
 	case REQ_TYPE_BLOCK_PC:
-	case REQ_TYPE_ATA_PC:
 		return blk_rq_bytes(rq);
+	case REQ_TYPE_DRV_PRIV:
+		switch (ide_req(rq)->type) {
+		case ATA_PRIV_PC:
+		case ATA_PRIV_SENSE:
+			return blk_rq_bytes(rq);
+		}
 	default:
 		return 0;
 	}
@@ -377,7 +383,7 @@ int ide_check_ireason(ide_drive_t *drive, struct request *rq, int len,
 				drive->name, __func__, ireason);
 	}
 
-	if (dev_is_idecd(drive) && rq->cmd_type == REQ_TYPE_ATA_PC)
+	if (dev_is_idecd(drive) && ata_pc_request(rq))
 		rq->rq_flags |= RQF_FAILED;
 
 	return 1;
@@ -480,7 +486,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		if (uptodate == 0)
 			drive->failed_pc = NULL;
 
-		if (rq->cmd_type == REQ_TYPE_DRV_PRIV) {
+		if (ata_misc_request(rq)) {
 			rq->errors = 0;
 			error = 0;
 		} else {
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 6eb98725e194..207af7816544 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -210,7 +210,7 @@ static void cdrom_analyze_sense_data(ide_drive_t *drive,
 static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq)
 {
 	/*
-	 * For REQ_TYPE_ATA_SENSE, "rq->special" points to the original
+	 * For ATA_PRIV_SENSE, "rq->special" points to the original
 	 * failed request.  Also, the sense data should be read
 	 * directly from rq which might be different from the original
 	 * sense buffer if it got copied during mapping.
@@ -282,7 +282,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 				  "stat 0x%x",
 				  rq->cmd[0], rq->cmd_type, err, stat);
 
-	if (rq->cmd_type == REQ_TYPE_ATA_SENSE) {
+	if (ata_sense_request(rq)) {
 		/*
 		 * We got an error trying to get sense info from the drive
 		 * (probably while trying to recover from a former error).
@@ -438,7 +438,8 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
 		rq = blk_get_request(drive->queue, write, __GFP_RECLAIM);
 		scsi_req_init(rq);
 		memcpy(scsi_req(rq)->cmd, cmd, BLK_MAX_CDB);
-		rq->cmd_type = REQ_TYPE_ATA_PC;
+		rq->cmd_type = REQ_TYPE_DRV_PRIV;
+		ide_req(rq)->type = ATA_PRIV_PC;
 		rq->rq_flags |= rq_flags;
 		rq->timeout = timeout;
 		if (buffer) {
@@ -520,7 +521,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 	ide_expiry_t *expiry = NULL;
 	int dma_error = 0, dma, thislen, uptodate = 0;
 	int write = (rq_data_dir(rq) == WRITE) ? 1 : 0, rc = 0;
-	int sense = (rq->cmd_type == REQ_TYPE_ATA_SENSE);
+	int sense = ata_sense_request(rq);
 	unsigned int timeout;
 	u16 len;
 	u8 ireason, stat;
@@ -785,18 +786,22 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 		if (cdrom_start_rw(drive, rq) == ide_stopped)
 			goto out_end;
 		break;
-	case REQ_TYPE_ATA_SENSE:
 	case REQ_TYPE_BLOCK_PC:
-	case REQ_TYPE_ATA_PC:
+	handle_pc:
 		if (!rq->timeout)
 			rq->timeout = ATAPI_WAIT_PC;
-
 		cdrom_do_block_pc(drive, rq);
 		break;
 	case REQ_TYPE_DRV_PRIV:
-		/* right now this can only be a reset... */
-		uptodate = 1;
-		goto out_end;
+		switch (ide_req(rq)->type) {
+		case ATA_PRIV_MISC:
+			/* right now this can only be a reset... */
+			uptodate = 1;
+			goto out_end;
+		case ATA_PRIV_SENSE:
+		case ATA_PRIV_PC:
+			goto handle_pc;
+		}
 	default:
 		BUG();
 	}
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c
index da0aa0153fb1..3f03eed0ff78 100644
--- a/drivers/ide/ide-cd_ioctl.c
+++ b/drivers/ide/ide-cd_ioctl.c
@@ -306,6 +306,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi)
 	rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
 	scsi_req_init(rq);
 	rq->cmd_type = REQ_TYPE_DRV_PRIV;
+	ide_req(rq)->type = ATA_PRIV_MISC;
 	rq->rq_flags = RQF_QUIET;
 	ret = blk_execute_rq(drive->queue, cd->disk, rq, 0);
 	blk_put_request(rq);
diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c
index fd56c9dd9bc7..c040b9de2b4e 100644
--- a/drivers/ide/ide-devsets.c
+++ b/drivers/ide/ide-devsets.c
@@ -168,6 +168,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting,
 	rq = blk_get_request(q, READ, __GFP_RECLAIM);
 	scsi_req_init(rq);
 	rq->cmd_type = REQ_TYPE_DRV_PRIV;
+	ide_req(rq)->type = ATA_PRIV_MISC;
 	scsi_req(rq)->cmd_len = 5;
 	scsi_req(rq)->cmd[0] = REQ_DEVSET_EXEC;
 	*(int *)&scsi_req(rq)->cmd[1] = arg;
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 3437c5b28599..69cf71729841 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -453,7 +453,8 @@ static int idedisk_prep_fn(struct request_queue *q, struct request *rq)
 	cmd->tf_flags = IDE_TFLAG_DYN;
 	cmd->protocol = ATA_PROT_NODATA;
 
-	rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
+	rq->cmd_type = REQ_TYPE_DRV_PRIV;
+	ide_req(rq)->type = ATA_PRIV_TASKFILE;
 	rq->special = cmd;
 	cmd->rq = rq;
 
@@ -479,7 +480,8 @@ static int set_multcount(ide_drive_t *drive, int arg)
 
 	rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
 	scsi_req_init(rq);
-	rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
+	rq->cmd_type = REQ_TYPE_DRV_PRIV;
+	ide_req(rq)->type = ATA_PRIV_TASKFILE;
 
 	drive->mult_req = arg;
 	drive->special_flags |= IDE_SFLAG_SET_MULTMODE;
diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c
index 35e5b892f9d7..1e4b1476e559 100644
--- a/drivers/ide/ide-eh.c
+++ b/drivers/ide/ide-eh.c
@@ -124,7 +124,7 @@ ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat)
 
 	/* retry only "normal" I/O: */
 	if (rq->cmd_type != REQ_TYPE_FS) {
-		if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) {
+		if (ata_taskfile_request(rq)) {
 			struct ide_cmd *cmd = rq->special;
 
 			if (cmd)
@@ -147,7 +147,7 @@ static inline void ide_complete_drive_reset(ide_drive_t *drive, int err)
 {
 	struct request *rq = drive->hwif->rq;
 
-	if (rq && rq->cmd_type == REQ_TYPE_DRV_PRIV &&
+	if (rq && ata_misc_request(rq) &&
 	    scsi_req(rq)->cmd[0] == REQ_DRIVE_RESET) {
 		if (err <= 0 && rq->errors == 0)
 			rq->errors = -EIO;
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 3bd678ad3519..24c6d363b3a4 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -97,7 +97,7 @@ static int ide_floppy_callback(ide_drive_t *drive, int dsc)
 			       "Aborting request!\n");
 	}
 
-	if (rq->cmd_type == REQ_TYPE_DRV_PRIV)
+	if (ata_misc_request(rq))
 		rq->errors = uptodate ? 0 : IDE_DRV_ERROR_GENERAL;
 
 	return uptodate;
@@ -246,7 +246,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 		} else
 			printk(KERN_ERR PFX "%s: I/O error\n", drive->name);
 
-		if (rq->cmd_type == REQ_TYPE_DRV_PRIV) {
+		if (ata_misc_request(rq)) {
 			rq->errors = 0;
 			ide_complete_rq(drive, 0, blk_rq_bytes(rq));
 			return ide_stopped;
@@ -265,14 +265,20 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 		pc = &floppy->queued_pc;
 		idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block);
 		break;
-	case REQ_TYPE_DRV_PRIV:
-	case REQ_TYPE_ATA_SENSE:
-		pc = (struct ide_atapi_pc *)rq->special;
-		break;
 	case REQ_TYPE_BLOCK_PC:
 		pc = &floppy->queued_pc;
 		idefloppy_blockpc_cmd(floppy, pc, rq);
 		break;
+	case REQ_TYPE_DRV_PRIV:
+		switch (ide_req(rq)->type) {
+		case ATA_PRIV_MISC:
+		case ATA_PRIV_SENSE:
+			pc = (struct ide_atapi_pc *)rq->special;
+			break;
+		default:
+			BUG();
+		}
+		break;
 	default:
 		BUG();
 	}
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 3378503afed4..6735c92b8da7 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -102,7 +102,7 @@ void ide_complete_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat, u8 err)
 			drive->dev_flags |= IDE_DFLAG_PARKED;
 	}
 
-	if (rq && rq->cmd_type == REQ_TYPE_ATA_TASKFILE) {
+	if (rq && ata_taskfile_request(rq)) {
 		struct ide_cmd *orig_cmd = rq->special;
 
 		if (cmd->tf_flags & IDE_TFLAG_DYN)
@@ -135,7 +135,7 @@ EXPORT_SYMBOL(ide_complete_rq);
 
 void ide_kill_rq(ide_drive_t *drive, struct request *rq)
 {
-	u8 drv_req = (rq->cmd_type == REQ_TYPE_DRV_PRIV) && rq->rq_disk;
+	u8 drv_req = ata_misc_request(rq) && rq->rq_disk;
 	u8 media = drive->media;
 
 	drive->failed_pc = NULL;
@@ -340,7 +340,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
 		if (drive->current_speed == 0xff)
 			ide_config_drive_speed(drive, drive->desired_speed);
 
-		if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE)
+		if (ata_taskfile_request(rq))
 			return execute_drive_cmd(drive, rq);
 		else if (ata_pm_request(rq)) {
 			struct ide_pm_state *pm = rq->special;
@@ -353,7 +353,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
 			    pm->pm_step == IDE_PM_COMPLETED)
 				ide_complete_pm_rq(drive, rq);
 			return startstop;
-		} else if (!rq->rq_disk && rq->cmd_type == REQ_TYPE_DRV_PRIV)
+		} else if (!rq->rq_disk && ata_misc_request(rq))
 			/*
 			 * TODO: Once all ULDs have been modified to
 			 * check for specific op codes rather than
diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c
index a5d22c6e956a..fc19e842deef 100644
--- a/drivers/ide/ide-ioctls.c
+++ b/drivers/ide/ide-ioctls.c
@@ -127,7 +127,8 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg)
 
 		rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
 		scsi_req_init(rq);
-		rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
+		rq->cmd_type = REQ_TYPE_DRV_PRIV;
+		ide_req(rq)->type = ATA_PRIV_TASKFILE;
 		err = blk_execute_rq(drive->queue, NULL, rq, 0);
 		blk_put_request(rq);
 
@@ -225,6 +226,7 @@ static int generic_drive_reset(ide_drive_t *drive)
 	rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
 	scsi_req_init(rq);
 	rq->cmd_type = REQ_TYPE_DRV_PRIV;
+	ide_req(rq)->type = ATA_PRIV_MISC;
 	scsi_req(rq)->cmd_len = 1;
 	scsi_req(rq)->cmd[0] = REQ_DRIVE_RESET;
 	if (blk_execute_rq(drive->queue, NULL, rq, 1))
diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
index c37604a75097..fc3c944ca4be 100644
--- a/drivers/ide/ide-park.c
+++ b/drivers/ide/ide-park.c
@@ -36,6 +36,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 	scsi_req(rq)->cmd[0] = REQ_PARK_HEADS;
 	scsi_req(rq)->cmd_len = 1;
 	rq->cmd_type = REQ_TYPE_DRV_PRIV;
+	ide_req(rq)->type = ATA_PRIV_MISC;
 	rq->special = &timeout;
 	rc = blk_execute_rq(q, NULL, rq, 1);
 	blk_put_request(rq);
@@ -54,6 +55,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 	scsi_req(rq)->cmd[0] = REQ_UNPARK_HEADS;
 	scsi_req(rq)->cmd_len = 1;
 	rq->cmd_type = REQ_TYPE_DRV_PRIV;
+	ide_req(rq)->type = ATA_PRIV_MISC;
 	elv_add_request(q, rq, ELEVATOR_INSERT_FRONT);
 
 out:
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index f6767aba7cfb..0c1296815dfa 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -20,7 +20,8 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 	memset(&rqpm, 0, sizeof(rqpm));
 	rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
 	scsi_req_init(rq);
-	rq->cmd_type = REQ_TYPE_ATA_PM_SUSPEND;
+	rq->cmd_type = REQ_TYPE_DRV_PRIV;
+	ide_req(rq)->type = ATA_PRIV_PM_SUSPEND;
 	rq->special = &rqpm;
 	rqpm.pm_step = IDE_PM_START_SUSPEND;
 	if (mesg.event == PM_EVENT_PRETHAW)
@@ -91,7 +92,8 @@ int generic_ide_resume(struct device *dev)
 	memset(&rqpm, 0, sizeof(rqpm));
 	rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
 	scsi_req_init(rq);
-	rq->cmd_type = REQ_TYPE_ATA_PM_RESUME;
+	rq->cmd_type = REQ_TYPE_DRV_PRIV;
+	ide_req(rq)->type = ATA_PRIV_PM_RESUME;
 	rq->rq_flags |= RQF_PREEMPT;
 	rq->special = &rqpm;
 	rqpm.pm_step = IDE_PM_START_RESUME;
@@ -223,10 +225,10 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
 
 #ifdef DEBUG_PM
 	printk("%s: completing PM request, %s\n", drive->name,
-	       (rq->cmd_type == REQ_TYPE_ATA_PM_SUSPEND) ? "suspend" : "resume");
+	       (ide_req(rq)->type == ATA_PRIV_PM_SUSPEND) ? "suspend" : "resume");
 #endif
 	spin_lock_irqsave(q->queue_lock, flags);
-	if (rq->cmd_type == REQ_TYPE_ATA_PM_SUSPEND)
+	if (ide_req(rq)->type == ATA_PRIV_PM_SUSPEND)
 		blk_stop_queue(q);
 	else
 		drive->dev_flags &= ~IDE_DFLAG_BLOCKED;
@@ -242,11 +244,13 @@ void ide_check_pm_state(ide_drive_t *drive, struct request *rq)
 {
 	struct ide_pm_state *pm = rq->special;
 
-	if (rq->cmd_type == REQ_TYPE_ATA_PM_SUSPEND &&
+	if (rq->cmd_type == REQ_TYPE_DRV_PRIV &&
+	    ide_req(rq)->type == ATA_PRIV_PM_SUSPEND &&
 	    pm->pm_step == IDE_PM_START_SUSPEND)
 		/* Mark drive blocked when starting the suspend sequence. */
 		drive->dev_flags |= IDE_DFLAG_BLOCKED;
-	else if (rq->cmd_type == REQ_TYPE_ATA_PM_RESUME &&
+	else if (rq->cmd_type == REQ_TYPE_DRV_PRIV &&
+	         ide_req(rq)->type == ATA_PRIV_PM_RESUME &&
 		 pm->pm_step == IDE_PM_START_RESUME) {
 		/*
 		 * The first thing we do on wakeup is to wait for BSY bit to
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index f6bc1e2bb035..37c7beabaacc 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -577,8 +577,9 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 		      req->cmd[0], (unsigned long long)blk_rq_pos(rq),
 		      blk_rq_sectors(rq));
 
-	BUG_ON(!(rq->cmd_type == REQ_TYPE_DRV_PRIV ||
-		 rq->cmd_type == REQ_TYPE_ATA_SENSE));
+	BUG_ON(rq->cmd_type != REQ_TYPE_DRV_PRIV);
+	BUG_ON(ide_req(rq)->type != ATA_PRIV_MISC &&
+	       ide_req(rq)->type != ATA_PRIV_SENSE);
 
 	/* Retry a failed packet command */
 	if (drive->failed_pc && drive->pc->c[0] == REQUEST_SENSE) {
@@ -856,6 +857,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
 	rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
 	scsi_req_init(rq);
 	rq->cmd_type = REQ_TYPE_DRV_PRIV;
+	ide_req(rq)->type = ATA_PRIV_MISC;
 	scsi_req(rq)->cmd[13] = cmd;
 	rq->rq_disk = tape->disk;
 	rq->__sector = tape->first_frame;
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index a393e13b8007..9ccc1d573393 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -432,7 +432,8 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
 
 	rq = blk_get_request(drive->queue, rw, __GFP_RECLAIM);
 	scsi_req_init(rq);
-	rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
+	rq->cmd_type = REQ_TYPE_DRV_PRIV;
+	ide_req(rq)->type = ATA_PRIV_TASKFILE;
 
 	/*
 	 * (ks) We transfer currently only whole sectors.
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 086fbe172817..5cc6caa94cac 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -40,24 +40,58 @@
 
 struct device;
 
-/* IDE-specific values for req->cmd_type */
-enum ata_cmd_type_bits {
-	REQ_TYPE_ATA_TASKFILE = REQ_TYPE_DRV_PRIV + 1,
-	REQ_TYPE_ATA_PC,
-	REQ_TYPE_ATA_SENSE,	/* sense request */
-	REQ_TYPE_ATA_PM_SUSPEND,/* suspend request */
-	REQ_TYPE_ATA_PM_RESUME,	/* resume request */
+/* values for ide_request.type */
+enum ata_priv_type {
+	ATA_PRIV_MISC,
+	ATA_PRIV_TASKFILE,
+	ATA_PRIV_PC,
+	ATA_PRIV_SENSE,		/* sense request */
+	ATA_PRIV_PM_SUSPEND,	/* suspend request */
+	ATA_PRIV_PM_RESUME,	/* resume request */
 };
 
-#define ata_pm_request(rq)	\
-	((rq)->cmd_type == REQ_TYPE_ATA_PM_SUSPEND || \
-	 (rq)->cmd_type == REQ_TYPE_ATA_PM_RESUME)
-
 struct ide_request {
 	struct scsi_request sreq;
 	u8 sense[SCSI_SENSE_BUFFERSIZE];
+	u8 type;
 };
 
+static inline struct ide_request *ide_req(struct request *rq)
+{
+	return blk_mq_rq_to_pdu(rq);
+}
+
+static inline bool ata_misc_request(struct request *rq)
+{
+	return rq->cmd_type == REQ_TYPE_DRV_PRIV &&
+               ide_req(rq)->type == ATA_PRIV_MISC;
+}
+
+static inline bool ata_taskfile_request(struct request *rq)
+{
+	return rq->cmd_type == REQ_TYPE_DRV_PRIV &&
+               ide_req(rq)->type == ATA_PRIV_TASKFILE;
+}
+
+static inline bool ata_pc_request(struct request *rq)
+{
+	return rq->cmd_type == REQ_TYPE_DRV_PRIV &&
+               ide_req(rq)->type == ATA_PRIV_PC;
+}
+
+static inline bool ata_sense_request(struct request *rq)
+{
+	return rq->cmd_type == REQ_TYPE_DRV_PRIV &&
+               ide_req(rq)->type == ATA_PRIV_SENSE;
+}
+
+static inline bool ata_pm_request(struct request *rq)
+{
+	return rq->cmd_type == REQ_TYPE_DRV_PRIV &&
+		(ide_req(rq)->type == ATA_PRIV_PM_SUSPEND ||
+		 ide_req(rq)->type == ATA_PRIV_PM_RESUME);
+}
+
 /* Error codes returned in rq->errors to the higher part of the driver. */
 enum {
 	IDE_DRV_ERROR_GENERAL	= 101,
-- 
cgit v1.2.3


From aebf526b53aea164508730427597d45f3e06b376 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 31 Jan 2017 16:57:31 +0100
Subject: block: fold cmd_type into the REQ_OP_ space

Instead of keeping two levels of indirection for requests types, fold it
all into the operations.  The little caveat here is that previously
cmd_type only applied to struct request, while the request and bio op
fields were set to plain REQ_OP_READ/WRITE even for passthrough
operations.

Instead this patch adds new REQ_OP_* for SCSI passthrough and driver
private requests, althought it has to add two for each so that we
can communicate the data in/out nature of the request.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c                        | 10 -----
 block/blk-core.c                   |  6 +--
 block/blk-flush.c                  |  1 -
 block/blk-map.c                    | 13 ++++---
 block/blk-mq-debugfs.c             |  4 +-
 block/bsg.c                        | 17 ++++-----
 block/scsi_ioctl.c                 |  9 +++--
 drivers/block/cciss.c              | 13 +++++--
 drivers/block/floppy.c             |  4 +-
 drivers/block/hd.c                 | 45 +++++++++++-----------
 drivers/block/mg_disk.c            | 31 +++++++++-------
 drivers/block/nbd.c                | 19 ++++++----
 drivers/block/null_blk.c           |  4 +-
 drivers/block/osdblk.c             |  6 ---
 drivers/block/paride/pd.c          | 15 +++-----
 drivers/block/pktcdvd.c            |  2 +-
 drivers/block/ps3disk.c            | 15 +++++---
 drivers/block/rbd.c                | 22 ++++++-----
 drivers/block/sx8.c                |  4 +-
 drivers/block/virtio_blk.c         | 61 +++++++++++++++---------------
 drivers/block/xen-blkfront.c       |  2 +-
 drivers/block/xsysace.c            |  2 +-
 drivers/cdrom/cdrom.c              |  2 +-
 drivers/cdrom/gdrom.c              | 29 ++++++++-------
 drivers/ide/ide-atapi.c            | 21 ++++++-----
 drivers/ide/ide-cd.c               | 76 ++++++++++++++++++++++----------------
 drivers/ide/ide-cd_ioctl.c         |  3 +-
 drivers/ide/ide-devsets.c          |  3 +-
 drivers/ide/ide-disk.c             |  9 ++---
 drivers/ide/ide-eh.c               |  2 +-
 drivers/ide/ide-floppy.c           | 19 +++++-----
 drivers/ide/ide-io.c               |  2 +-
 drivers/ide/ide-ioctls.c           |  6 +--
 drivers/ide/ide-park.c             |  6 +--
 drivers/ide/ide-pm.c               | 10 ++---
 drivers/ide/ide-tape.c             |  5 +--
 drivers/ide/ide-taskfile.c         |  6 +--
 drivers/mtd/mtd_blkdevs.c          | 13 +++----
 drivers/mtd/ubi/block.c            | 15 ++++----
 drivers/nvme/host/core.c           | 26 +++++++++----
 drivers/nvme/host/rdma.c           |  2 +-
 drivers/scsi/osd/osd_initiator.c   |  3 +-
 drivers/scsi/osst.c                |  3 +-
 drivers/scsi/scsi_error.c          |  2 +-
 drivers/scsi/scsi_lib.c            | 30 +++++++--------
 drivers/scsi/sg.c                  |  3 +-
 drivers/scsi/sr.c                  |  9 +++--
 drivers/scsi/st.c                  |  6 +--
 drivers/target/target_core_pscsi.c |  3 +-
 fs/nfsd/blocklayout.c              |  2 +-
 include/linux/blk_types.h          |  7 ++++
 include/linux/blkdev.h             | 22 +++++------
 include/linux/ide.h                | 14 +++----
 53 files changed, 338 insertions(+), 326 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index 2b375020fc49..9a2dd7145e83 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1227,9 +1227,6 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 	if (!bio)
 		goto out_bmd;
 
-	if (iter->type & WRITE)
-		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
-
 	ret = 0;
 
 	if (map_data) {
@@ -1394,12 +1391,6 @@ struct bio *bio_map_user_iov(struct request_queue *q,
 
 	kfree(pages);
 
-	/*
-	 * set data direction, and check if mapped pages need bouncing
-	 */
-	if (iter->type & WRITE)
-		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
-
 	bio_set_flag(bio, BIO_USER_MAPPED);
 
 	/*
@@ -1590,7 +1581,6 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
 		bio->bi_private = data;
 	} else {
 		bio->bi_end_io = bio_copy_kern_endio;
-		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
 	}
 
 	return bio;
diff --git a/block/blk-core.c b/block/blk-core.c
index 44431086e4e7..3266daaa343f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -158,8 +158,8 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 
 void blk_dump_rq_flags(struct request *rq, char *msg)
 {
-	printk(KERN_INFO "%s: dev %s: type=%x, flags=%llx\n", msg,
-		rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
+	printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
+		rq->rq_disk ? rq->rq_disk->disk_name : "?",
 		(unsigned long long) rq->cmd_flags);
 
 	printk(KERN_INFO "  sector %llu, nr/cnr %u/%u\n",
@@ -1593,7 +1593,6 @@ out:
 
 void init_request_from_bio(struct request *req, struct bio *bio)
 {
-	req->cmd_type = REQ_TYPE_FS;
 	if (bio->bi_opf & REQ_RAHEAD)
 		req->cmd_flags |= REQ_FAILFAST_MASK;
 
@@ -2983,7 +2982,6 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
 static void __blk_rq_prep_clone(struct request *dst, struct request *src)
 {
 	dst->cpu = src->cpu;
-	dst->cmd_type = src->cmd_type;
 	dst->__sector = blk_rq_pos(src);
 	dst->__data_len = blk_rq_bytes(src);
 	dst->nr_phys_segments = src->nr_phys_segments;
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 0a0358e48b76..968162579234 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -327,7 +327,6 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
 		blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
 	}
 
-	flush_rq->cmd_type = REQ_TYPE_FS;
 	flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
 	flush_rq->rq_flags |= RQF_FLUSH_SEQ;
 	flush_rq->rq_disk = first_rq->rq_disk;
diff --git a/block/blk-map.c b/block/blk-map.c
index 0acb6640ead7..2f18c2a0be1b 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -16,8 +16,6 @@
 int blk_rq_append_bio(struct request *rq, struct bio *bio)
 {
 	if (!rq->bio) {
-		rq->cmd_flags &= REQ_OP_MASK;
-		rq->cmd_flags |= (bio->bi_opf & REQ_OP_MASK);
 		blk_rq_bio_prep(rq->q, rq, bio);
 	} else {
 		if (!ll_back_merge_fn(rq->q, rq, bio))
@@ -62,6 +60,9 @@ static int __blk_rq_map_user_iov(struct request *rq,
 	if (IS_ERR(bio))
 		return PTR_ERR(bio);
 
+	bio->bi_opf &= ~REQ_OP_MASK;
+	bio->bi_opf |= req_op(rq);
+
 	if (map_data && map_data->null_mapped)
 		bio_set_flag(bio, BIO_NULL_MAPPED);
 
@@ -90,7 +91,7 @@ static int __blk_rq_map_user_iov(struct request *rq,
 }
 
 /**
- * blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage
+ * blk_rq_map_user_iov - map user data to a request, for passthrough requests
  * @q:		request queue where request should be inserted
  * @rq:		request to map data to
  * @map_data:   pointer to the rq_map_data holding pages (if necessary)
@@ -199,7 +200,7 @@ int blk_rq_unmap_user(struct bio *bio)
 EXPORT_SYMBOL(blk_rq_unmap_user);
 
 /**
- * blk_rq_map_kern - map kernel data to a request, for REQ_TYPE_BLOCK_PC usage
+ * blk_rq_map_kern - map kernel data to a request, for passthrough requests
  * @q:		request queue where request should be inserted
  * @rq:		request to fill
  * @kbuf:	the kernel buffer
@@ -234,8 +235,8 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 	if (IS_ERR(bio))
 		return PTR_ERR(bio);
 
-	if (!reading)
-		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+	bio->bi_opf &= ~REQ_OP_MASK;
+	bio->bi_opf |= req_op(rq);
 
 	if (do_copy)
 		rq->rq_flags |= RQF_COPY_USER;
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 5cd2b435a9f5..1e2a4a2ff623 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -88,8 +88,8 @@ static int blk_mq_debugfs_rq_show(struct seq_file *m, void *v)
 {
 	struct request *rq = list_entry_rq(v);
 
-	seq_printf(m, "%p {.cmd_type=%u, .cmd_flags=0x%x, .rq_flags=0x%x, .tag=%d, .internal_tag=%d}\n",
-		   rq, rq->cmd_type, rq->cmd_flags, (unsigned int)rq->rq_flags,
+	seq_printf(m, "%p {.cmd_flags=0x%x, .rq_flags=0x%x, .tag=%d, .internal_tag=%d}\n",
+		   rq, rq->cmd_flags, (unsigned int)rq->rq_flags,
 		   rq->tag, rq->internal_tag);
 	return 0;
 }
diff --git a/block/bsg.c b/block/bsg.c
index e34c3320956c..a9a8b8e0446f 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -177,7 +177,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
  * Check if sg_io_v4 from user is allowed and valid
  */
 static int
-bsg_validate_sgv4_hdr(struct sg_io_v4 *hdr, int *rw)
+bsg_validate_sgv4_hdr(struct sg_io_v4 *hdr, int *op)
 {
 	int ret = 0;
 
@@ -198,7 +198,7 @@ bsg_validate_sgv4_hdr(struct sg_io_v4 *hdr, int *rw)
 		ret = -EINVAL;
 	}
 
-	*rw = hdr->dout_xfer_len ? WRITE : READ;
+	*op = hdr->dout_xfer_len ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN;
 	return ret;
 }
 
@@ -210,8 +210,8 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm)
 {
 	struct request_queue *q = bd->queue;
 	struct request *rq, *next_rq = NULL;
-	int ret, rw;
-	unsigned int dxfer_len;
+	int ret;
+	unsigned int op, dxfer_len;
 	void __user *dxferp = NULL;
 	struct bsg_class_device *bcd = &q->bsg_dev;
 
@@ -226,14 +226,14 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm)
 		hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp,
 		hdr->din_xfer_len);
 
-	ret = bsg_validate_sgv4_hdr(hdr, &rw);
+	ret = bsg_validate_sgv4_hdr(hdr, &op);
 	if (ret)
 		return ERR_PTR(ret);
 
 	/*
 	 * map scatter-gather elements separately and string them to request
 	 */
-	rq = blk_get_request(q, rw, GFP_KERNEL);
+	rq = blk_get_request(q, op, GFP_KERNEL);
 	if (IS_ERR(rq))
 		return rq;
 	scsi_req_init(rq);
@@ -242,20 +242,19 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm)
 	if (ret)
 		goto out;
 
-	if (rw == WRITE && hdr->din_xfer_len) {
+	if (op == REQ_OP_SCSI_OUT && hdr->din_xfer_len) {
 		if (!test_bit(QUEUE_FLAG_BIDI, &q->queue_flags)) {
 			ret = -EOPNOTSUPP;
 			goto out;
 		}
 
-		next_rq = blk_get_request(q, READ, GFP_KERNEL);
+		next_rq = blk_get_request(q, REQ_OP_SCSI_IN, GFP_KERNEL);
 		if (IS_ERR(next_rq)) {
 			ret = PTR_ERR(next_rq);
 			next_rq = NULL;
 			goto out;
 		}
 		rq->next_rq = next_rq;
-		next_rq->cmd_type = rq->cmd_type;
 
 		dxferp = (void __user *)(unsigned long)hdr->din_xferp;
 		ret =  blk_rq_map_user(q, next_rq, NULL, dxferp,
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 7edf44f25e08..2a2fc768b27a 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -321,7 +321,8 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 		at_head = 1;
 
 	ret = -ENOMEM;
-	rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL);
+	rq = blk_get_request(q, writing ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN,
+			GFP_KERNEL);
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);
 	req = scsi_req(rq);
@@ -448,7 +449,8 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 
 	}
 
-	rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_RECLAIM);
+	rq = blk_get_request(q, in_len ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN,
+			__GFP_RECLAIM);
 	if (IS_ERR(rq)) {
 		err = PTR_ERR(rq);
 		goto error_free_buffer;
@@ -537,7 +539,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
 	struct request *rq;
 	int err;
 
-	rq = blk_get_request(q, WRITE, __GFP_RECLAIM);
+	rq = blk_get_request(q, REQ_OP_SCSI_OUT, __GFP_RECLAIM);
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);
 	scsi_req_init(rq);
@@ -745,7 +747,6 @@ void scsi_req_init(struct request *rq)
 {
 	struct scsi_request *req = scsi_req(rq);
 
-	rq->cmd_type = REQ_TYPE_BLOCK_PC;
 	memset(req->__cmd, 0, sizeof(req->__cmd));
 	req->cmd = req->__cmd;
 	req->cmd_len = BLK_MAX_CDB;
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 53dc2971a3ba..f21c2f0c5ccd 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -3394,7 +3394,9 @@ static void do_cciss_request(struct request_queue *q)
 		c->Header.SGList = h->max_cmd_sgentries;
 	set_performant_mode(h, c);
 
-	if (likely(creq->cmd_type == REQ_TYPE_FS)) {
+	switch (req_op(creq)) {
+	case REQ_OP_READ:
+	case REQ_OP_WRITE:
 		if(h->cciss_read == CCISS_READ_10) {
 			c->Request.CDB[1] = 0;
 			c->Request.CDB[2] = (start_blk >> 24) & 0xff; /* MSB */
@@ -3424,13 +3426,16 @@ static void do_cciss_request(struct request_queue *q)
 			c->Request.CDB[13]= blk_rq_sectors(creq) & 0xff;
 			c->Request.CDB[14] = c->Request.CDB[15] = 0;
 		}
-	} else if (creq->cmd_type == REQ_TYPE_BLOCK_PC) {
+		break;
+	case REQ_OP_SCSI_IN:
+	case REQ_OP_SCSI_OUT:
 		c->Request.CDBLen = scsi_req(creq)->cmd_len;
 		memcpy(c->Request.CDB, scsi_req(creq)->cmd, BLK_MAX_CDB);
 		scsi_req(creq)->sense = c->err_info->SenseInfo;
-	} else {
+		break;
+	default:
 		dev_warn(&h->pdev->dev, "bad request type %d\n",
-			creq->cmd_type);
+			creq->cmd_flags);
 		BUG();
 	}
 
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 184887af4b9f..45b4384f650c 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -2900,8 +2900,8 @@ static void do_fd_request(struct request_queue *q)
 		return;
 
 	if (WARN(atomic_read(&usage_count) == 0,
-		 "warning: usage count=0, current_req=%p sect=%ld type=%x flags=%llx\n",
-		 current_req, (long)blk_rq_pos(current_req), current_req->cmd_type,
+		 "warning: usage count=0, current_req=%p sect=%ld flags=%llx\n",
+		 current_req, (long)blk_rq_pos(current_req),
 		 (unsigned long long) current_req->cmd_flags))
 		return;
 
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index a9b48ed7a3cd..6043648da1e8 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -626,30 +626,29 @@ repeat:
 		req_data_dir(req) == READ ? "read" : "writ",
 		cyl, head, sec, nsect, bio_data(req->bio));
 #endif
-	if (req->cmd_type == REQ_TYPE_FS) {
-		switch (rq_data_dir(req)) {
-		case READ:
-			hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_READ,
-				&read_intr);
-			if (reset)
-				goto repeat;
-			break;
-		case WRITE:
-			hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_WRITE,
-				&write_intr);
-			if (reset)
-				goto repeat;
-			if (wait_DRQ()) {
-				bad_rw_intr();
-				goto repeat;
-			}
-			outsw(HD_DATA, bio_data(req->bio), 256);
-			break;
-		default:
-			printk("unknown hd-command\n");
-			hd_end_request_cur(-EIO);
-			break;
+
+	switch (req_op(req)) {
+	case REQ_OP_READ:
+		hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_READ,
+			&read_intr);
+		if (reset)
+			goto repeat;
+		break;
+	case REQ_OP_WRITE:
+		hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_WRITE,
+			&write_intr);
+		if (reset)
+			goto repeat;
+		if (wait_DRQ()) {
+			bad_rw_intr();
+			goto repeat;
 		}
+		outsw(HD_DATA, bio_data(req->bio), 256);
+		break;
+	default:
+		printk("unknown hd-command\n");
+		hd_end_request_cur(-EIO);
+		break;
 	}
 }
 
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index e937fcf71769..286f276f586e 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -670,15 +670,17 @@ static void mg_request_poll(struct request_queue *q)
 				break;
 		}
 
-		if (unlikely(host->req->cmd_type != REQ_TYPE_FS)) {
-			mg_end_request_cur(host, -EIO);
-			continue;
-		}
-
-		if (rq_data_dir(host->req) == READ)
+		switch (req_op(host->req)) {
+		case REQ_OP_READ:
 			mg_read(host->req);
-		else
+			break;
+		case REQ_OP_WRITE:
 			mg_write(host->req);
+			break;
+		default:
+			mg_end_request_cur(host, -EIO);
+			break;
+		}
 	}
 }
 
@@ -687,13 +689,15 @@ static unsigned int mg_issue_req(struct request *req,
 		unsigned int sect_num,
 		unsigned int sect_cnt)
 {
-	if (rq_data_dir(req) == READ) {
+	switch (req_op(host->req)) {
+	case REQ_OP_READ:
 		if (mg_out(host, sect_num, sect_cnt, MG_CMD_RD, &mg_read_intr)
 				!= MG_ERR_NONE) {
 			mg_bad_rw_intr(host);
 			return host->error;
 		}
-	} else {
+		break;
+	case REQ_OP_WRITE:
 		/* TODO : handler */
 		outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
 		if (mg_out(host, sect_num, sect_cnt, MG_CMD_WR, &mg_write_intr)
@@ -712,6 +716,10 @@ static unsigned int mg_issue_req(struct request *req,
 		mod_timer(&host->timer, jiffies + 3 * HZ);
 		outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base +
 				MG_REG_COMMAND);
+		break;
+	default:
+		mg_end_request_cur(host, -EIO);
+		break;
 	}
 	return MG_ERR_NONE;
 }
@@ -753,11 +761,6 @@ static void mg_request(struct request_queue *q)
 			continue;
 		}
 
-		if (unlikely(req->cmd_type != REQ_TYPE_FS)) {
-			mg_end_request_cur(host, -EIO);
-			continue;
-		}
-
 		if (!mg_issue_req(req, host, sect_num, sect_cnt))
 			return;
 	}
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 8bce1c7c18d5..58296e4427b0 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -271,17 +271,22 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
 	u32 type;
 	u32 tag = blk_mq_unique_tag(req);
 
-	if (req->cmd_type != REQ_TYPE_FS)
-		return -EIO;
-
-	if (req_op(req) == REQ_OP_DISCARD)
+	switch (req_op(req)) {
+	case REQ_OP_DISCARD:
 		type = NBD_CMD_TRIM;
-	else if (req_op(req) == REQ_OP_FLUSH)
+		break;
+	case REQ_OP_FLUSH:
 		type = NBD_CMD_FLUSH;
-	else if (rq_data_dir(req) == WRITE)
+		break;
+	case REQ_OP_WRITE:
 		type = NBD_CMD_WRITE;
-	else
+		break;
+	case REQ_OP_READ:
 		type = NBD_CMD_READ;
+		break;
+	default:
+		return -EIO;
+	}
 
 	if (rq_data_dir(req) == WRITE &&
 	    (nbd->flags & NBD_FLAG_READ_ONLY)) {
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index c0e14e54909b..4335b84a565b 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -431,11 +431,11 @@ static int null_lnvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
 	struct request *rq;
 	struct bio *bio = rqd->bio;
 
-	rq = blk_mq_alloc_request(q, bio_data_dir(bio), 0);
+	rq = blk_mq_alloc_request(q,
+		op_is_write(bio_op(bio)) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
 	if (IS_ERR(rq))
 		return -ENOMEM;
 
-	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	rq->__sector = bio->bi_iter.bi_sector;
 	rq->ioprio = bio_prio(bio);
 
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
index 92900f5f0b47..8127b8201a01 100644
--- a/drivers/block/osdblk.c
+++ b/drivers/block/osdblk.c
@@ -308,12 +308,6 @@ static void osdblk_rq_fn(struct request_queue *q)
 		if (!rq)
 			break;
 
-		/* filter out block requests we don't understand */
-		if (rq->cmd_type != REQ_TYPE_FS) {
-			blk_end_request_all(rq, 0);
-			continue;
-		}
-
 		/* deduce our operation (read, write, flush) */
 		/* I wish the block layer simplified cmd_type/cmd_flags/cmd[]
 		 * into a clearly defined set of RPC commands:
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index c3ed2fc72daa..644ba0888bd4 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -439,18 +439,16 @@ static int pd_retries = 0;	/* i/o error retry count */
 static int pd_block;		/* address of next requested block */
 static int pd_count;		/* number of blocks still to do */
 static int pd_run;		/* sectors in current cluster */
-static int pd_cmd;		/* current command READ/WRITE */
 static char *pd_buf;		/* buffer for request in progress */
 
 static enum action do_pd_io_start(void)
 {
-	if (pd_req->cmd_type == REQ_TYPE_DRV_PRIV) {
+	switch (req_op(pd_req)) {
+	case REQ_OP_DRV_IN:
 		phase = pd_special;
 		return pd_special();
-	}
-
-	pd_cmd = rq_data_dir(pd_req);
-	if (pd_cmd == READ || pd_cmd == WRITE) {
+	case REQ_OP_READ:
+	case REQ_OP_WRITE:
 		pd_block = blk_rq_pos(pd_req);
 		pd_count = blk_rq_cur_sectors(pd_req);
 		if (pd_block + pd_count > get_capacity(pd_req->rq_disk))
@@ -458,7 +456,7 @@ static enum action do_pd_io_start(void)
 		pd_run = blk_rq_sectors(pd_req);
 		pd_buf = bio_data(pd_req->bio);
 		pd_retries = 0;
-		if (pd_cmd == READ)
+		if (req_op(pd_req) == REQ_OP_READ)
 			return do_pd_read_start();
 		else
 			return do_pd_write_start();
@@ -723,11 +721,10 @@ static int pd_special_command(struct pd_unit *disk,
 	struct request *rq;
 	int err = 0;
 
-	rq = blk_get_request(disk->gd->queue, READ, __GFP_RECLAIM);
+	rq = blk_get_request(disk->gd->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);
 
-	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	rq->special = func;
 
 	err = blk_execute_rq(disk->gd->queue, disk->gd, rq, 0);
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 918a92ccd0c0..f4bd95943dc2 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -704,7 +704,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
 	int ret = 0;
 
 	rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ?
-			     WRITE : READ, __GFP_RECLAIM);
+			     REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, __GFP_RECLAIM);
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);
 	scsi_req_init(rq);
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index 76f33c84ce3d..a809e3e9feb8 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -196,16 +196,19 @@ static void ps3disk_do_request(struct ps3_storage_device *dev,
 	dev_dbg(&dev->sbd.core, "%s:%u\n", __func__, __LINE__);
 
 	while ((req = blk_fetch_request(q))) {
-		if (req_op(req) == REQ_OP_FLUSH) {
+		switch (req_op(req)) {
+		case REQ_OP_FLUSH:
 			if (ps3disk_submit_flush_request(dev, req))
-				break;
-		} else if (req->cmd_type == REQ_TYPE_FS) {
+				return;
+			break;
+		case REQ_OP_READ:
+		case REQ_OP_WRITE:
 			if (ps3disk_submit_request_sg(dev, req))
-				break;
-		} else {
+				return;
+			break;
+		default:
 			blk_dump_rq_flags(req, DEVICE_NAME " bad request");
 			__blk_end_request_all(req, -EIO);
-			continue;
 		}
 	}
 }
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 36d2b9f4e836..4d78cf605b21 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -4099,19 +4099,21 @@ static void rbd_queue_workfn(struct work_struct *work)
 	bool must_be_locked;
 	int result;
 
-	if (rq->cmd_type != REQ_TYPE_FS) {
-		dout("%s: non-fs request type %d\n", __func__,
-			(int) rq->cmd_type);
-		result = -EIO;
-		goto err;
-	}
-
-	if (req_op(rq) == REQ_OP_DISCARD)
+	switch (req_op(rq)) {
+	case REQ_OP_DISCARD:
 		op_type = OBJ_OP_DISCARD;
-	else if (req_op(rq) == REQ_OP_WRITE)
+		break;
+	case REQ_OP_WRITE:
 		op_type = OBJ_OP_WRITE;
-	else
+		break;
+	case REQ_OP_READ:
 		op_type = OBJ_OP_READ;
+		break;
+	default:
+		dout("%s: non-fs request type %d\n", __func__, req_op(rq));
+		result = -EIO;
+		goto err;
+	}
 
 	/* Ignore/skip any zero-length requests */
 
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 0e93ad7b8511..c8e072caf56f 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -567,7 +567,7 @@ static struct carm_request *carm_get_special(struct carm_host *host)
 	if (!crq)
 		return NULL;
 
-	rq = blk_get_request(host->oob_q, WRITE /* bogus */, GFP_KERNEL);
+	rq = blk_get_request(host->oob_q, REQ_OP_DRV_OUT, GFP_KERNEL);
 	if (IS_ERR(rq)) {
 		spin_lock_irqsave(&host->lock, flags);
 		carm_put_request(host, crq);
@@ -620,7 +620,6 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx)
 	spin_unlock_irq(&host->lock);
 
 	DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
-	crq->rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	crq->rq->special = crq;
 	blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
 
@@ -661,7 +660,6 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func)
 	crq->msg_bucket = (u32) rc;
 
 	DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
-	crq->rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	crq->rq->special = crq;
 	blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
 
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index d9491bdd3ad3..a363170e45b1 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -175,11 +175,12 @@ static inline void virtblk_request_done(struct request *req)
 	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
 	int error = virtblk_result(vbr);
 
-	switch (req->cmd_type) {
-	case REQ_TYPE_BLOCK_PC:
+	switch (req_op(req)) {
+	case REQ_OP_SCSI_IN:
+	case REQ_OP_SCSI_OUT:
 		virtblk_scsi_reques_done(req);
 		break;
-	case REQ_TYPE_DRV_PRIV:
+	case REQ_OP_DRV_IN:
 		req->errors = (error != 0);
 		break;
 	}
@@ -226,36 +227,35 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
 	int qid = hctx->queue_num;
 	int err;
 	bool notify = false;
+	u32 type;
 
 	BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
 
-	if (req_op(req) == REQ_OP_FLUSH) {
-		vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_FLUSH);
-		vbr->out_hdr.sector = 0;
-		vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(req));
-	} else {
-		switch (req->cmd_type) {
-		case REQ_TYPE_FS:
-			vbr->out_hdr.type = 0;
-			vbr->out_hdr.sector = cpu_to_virtio64(vblk->vdev, blk_rq_pos(req));
-			vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(req));
-			break;
-		case REQ_TYPE_BLOCK_PC:
-			vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_SCSI_CMD);
-			vbr->out_hdr.sector = 0;
-			vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(req));
-			break;
-		case REQ_TYPE_DRV_PRIV:
-			vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID);
-			vbr->out_hdr.sector = 0;
-			vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(req));
-			break;
-		default:
-			/* We don't put anything else in the queue. */
-			BUG();
-		}
+	switch (req_op(req)) {
+	case REQ_OP_READ:
+	case REQ_OP_WRITE:
+		type = 0;
+		break;
+	case REQ_OP_FLUSH:
+		type = VIRTIO_BLK_T_FLUSH;
+		break;
+	case REQ_OP_SCSI_IN:
+	case REQ_OP_SCSI_OUT:
+		type = VIRTIO_BLK_T_SCSI_CMD;
+		break;
+	case REQ_OP_DRV_IN:
+		type = VIRTIO_BLK_T_GET_ID;
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return BLK_MQ_RQ_QUEUE_ERROR;
 	}
 
+	vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, type);
+	vbr->out_hdr.sector = type ?
+		0 : cpu_to_virtio64(vblk->vdev, blk_rq_pos(req));
+	vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(req));
+
 	blk_mq_start_request(req);
 
 	num = blk_rq_map_sg(hctx->queue, req, vbr->sg);
@@ -267,7 +267,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
 	}
 
 	spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
-	if (req->cmd_type == REQ_TYPE_BLOCK_PC)
+	if (req_op(req) == REQ_OP_SCSI_IN || req_op(req) == REQ_OP_SCSI_OUT)
 		err = virtblk_add_req_scsi(vblk->vqs[qid].vq, vbr, vbr->sg, num);
 	else
 		err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num);
@@ -300,10 +300,9 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
 	struct request *req;
 	int err;
 
-	req = blk_get_request(q, READ, GFP_KERNEL);
+	req = blk_get_request(q, REQ_OP_DRV_IN, GFP_KERNEL);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
-	req->cmd_type = REQ_TYPE_DRV_PRIV;
 
 	err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
 	if (err)
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index b2bdfa81f929..9a8cb226cb3a 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -865,7 +865,7 @@ static inline void flush_requests(struct blkfront_ring_info *rinfo)
 static inline bool blkif_request_flush_invalid(struct request *req,
 					       struct blkfront_info *info)
 {
-	return ((req->cmd_type != REQ_TYPE_FS) ||
+	return (blk_rq_is_passthrough(req) ||
 		((req_op(req) == REQ_OP_FLUSH) &&
 		 !info->feature_flush) ||
 		((req->cmd_flags & REQ_FUA) &&
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index c4328d9d9981..757dce2147e0 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -468,7 +468,7 @@ static struct request *ace_get_next_request(struct request_queue *q)
 	struct request *req;
 
 	while ((req = blk_peek_request(q)) != NULL) {
-		if (req->cmd_type == REQ_TYPE_FS)
+		if (!blk_rq_is_passthrough(req))
 			break;
 		blk_start_request(req);
 		__blk_end_request_all(req, -EIO);
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 36f5237a8a69..8f496b4f3dd1 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -2191,7 +2191,7 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
 
 		len = nr * CD_FRAMESIZE_RAW;
 
-		rq = blk_get_request(q, READ, GFP_KERNEL);
+		rq = blk_get_request(q, REQ_OP_SCSI_IN, GFP_KERNEL);
 		if (IS_ERR(rq)) {
 			ret = PTR_ERR(rq);
 			break;
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index 584bc3126403..b6f1a5d95bc5 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -659,23 +659,24 @@ static void gdrom_request(struct request_queue *rq)
 	struct request *req;
 
 	while ((req = blk_fetch_request(rq)) != NULL) {
-		if (req->cmd_type != REQ_TYPE_FS) {
-			printk(KERN_DEBUG "gdrom: Non-fs request ignored\n");
-			__blk_end_request_all(req, -EIO);
-			continue;
-		}
-		if (rq_data_dir(req) != READ) {
+		switch (req_op(req)) {
+		case REQ_OP_READ:
+			/*
+			 * Add to list of deferred work and then schedule
+			 * workqueue.
+			 */
+			list_add_tail(&req->queuelist, &gdrom_deferred);
+			schedule_work(&work);
+			break;
+		case REQ_OP_WRITE:
 			pr_notice("Read only device - write request ignored\n");
 			__blk_end_request_all(req, -EIO);
-			continue;
+			break;
+		default:
+			printk(KERN_DEBUG "gdrom: Non-fs request ignored\n");
+			__blk_end_request_all(req, -EIO);
+			break;
 		}
-
-		/*
-		 * Add to list of deferred work and then schedule
-		 * workqueue.
-		 */
-		list_add_tail(&req->queuelist, &gdrom_deferred);
-		schedule_work(&work);
 	}
 }
 
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index a8c650e8c92e..feb30061123b 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -92,9 +92,8 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
 	struct request *rq;
 	int error;
 
-	rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
+	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
 	scsi_req_init(rq);
-	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	rq->special = (char *)pc;
 
@@ -212,7 +211,7 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
 	}
 
 	sense_rq->rq_disk = rq->rq_disk;
-	sense_rq->cmd_type = REQ_TYPE_DRV_PRIV;
+	sense_rq->cmd_flags = REQ_OP_DRV_IN;
 	ide_req(sense_rq)->type = ATA_PRIV_SENSE;
 	sense_rq->rq_flags |= RQF_PREEMPT;
 
@@ -312,19 +311,21 @@ EXPORT_SYMBOL_GPL(ide_cd_expiry);
 
 int ide_cd_get_xferlen(struct request *rq)
 {
-	switch (rq->cmd_type) {
-	case REQ_TYPE_FS:
+	switch (req_op(rq)) {
+	default:
 		return 32768;
-	case REQ_TYPE_BLOCK_PC:
+	case REQ_OP_SCSI_IN:
+	case REQ_OP_SCSI_OUT:
 		return blk_rq_bytes(rq);
-	case REQ_TYPE_DRV_PRIV:
+	case REQ_OP_DRV_IN:
+	case REQ_OP_DRV_OUT:
 		switch (ide_req(rq)->type) {
 		case ATA_PRIV_PC:
 		case ATA_PRIV_SENSE:
 			return blk_rq_bytes(rq);
+		default:
+			return 0;
 		}
-	default:
-		return 0;
 	}
 }
 EXPORT_SYMBOL_GPL(ide_cd_get_xferlen);
@@ -491,7 +492,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 			error = 0;
 		} else {
 
-			if (rq->cmd_type != REQ_TYPE_FS && uptodate <= 0) {
+			if (blk_rq_is_passthrough(rq) && uptodate <= 0) {
 				if (rq->errors == 0)
 					rq->errors = -EIO;
 			}
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 207af7816544..dc039d150357 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -176,7 +176,7 @@ static void cdrom_analyze_sense_data(ide_drive_t *drive,
 			if (!sense->valid)
 				break;
 			if (failed_command == NULL ||
-			    failed_command->cmd_type != REQ_TYPE_FS)
+			    blk_rq_is_passthrough(failed_command))
 				break;
 			sector = (sense->information[0] << 24) |
 				 (sense->information[1] << 16) |
@@ -293,7 +293,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 	}
 
 	/* if we have an error, pass CHECK_CONDITION as the SCSI status byte */
-	if (rq->cmd_type == REQ_TYPE_BLOCK_PC && !rq->errors)
+	if (blk_rq_is_scsi(rq) && !rq->errors)
 		rq->errors = SAM_STAT_CHECK_CONDITION;
 
 	if (blk_noretry_request(rq))
@@ -301,13 +301,13 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 
 	switch (sense_key) {
 	case NOT_READY:
-		if (rq->cmd_type == REQ_TYPE_FS && rq_data_dir(rq) == WRITE) {
+		if (req_op(rq) == REQ_OP_WRITE) {
 			if (ide_cd_breathe(drive, rq))
 				return 1;
 		} else {
 			cdrom_saw_media_change(drive);
 
-			if (rq->cmd_type == REQ_TYPE_FS &&
+			if (!blk_rq_is_passthrough(rq) &&
 			    !(rq->rq_flags & RQF_QUIET))
 				printk(KERN_ERR PFX "%s: tray open\n",
 					drive->name);
@@ -317,7 +317,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 	case UNIT_ATTENTION:
 		cdrom_saw_media_change(drive);
 
-		if (rq->cmd_type != REQ_TYPE_FS)
+		if (blk_rq_is_passthrough(rq))
 			return 0;
 
 		/*
@@ -365,7 +365,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 		do_end_request = 1;
 		break;
 	default:
-		if (rq->cmd_type != REQ_TYPE_FS)
+		if (blk_rq_is_passthrough(rq))
 			break;
 		if (err & ~ATA_ABORTED) {
 			/* go to the default handler for other errors */
@@ -376,7 +376,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 			do_end_request = 1;
 	}
 
-	if (rq->cmd_type != REQ_TYPE_FS) {
+	if (blk_rq_is_passthrough(rq)) {
 		rq->rq_flags |= RQF_FAILED;
 		do_end_request = 1;
 	}
@@ -435,10 +435,10 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
 		int error;
 		bool delay = false;
 
-		rq = blk_get_request(drive->queue, write, __GFP_RECLAIM);
+		rq = blk_get_request(drive->queue,
+			write ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN,  __GFP_RECLAIM);
 		scsi_req_init(rq);
 		memcpy(scsi_req(rq)->cmd, cmd, BLK_MAX_CDB);
-		rq->cmd_type = REQ_TYPE_DRV_PRIV;
 		ide_req(rq)->type = ATA_PRIV_PC;
 		rq->rq_flags |= rq_flags;
 		rq->timeout = timeout;
@@ -564,7 +564,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 
 	ide_read_bcount_and_ireason(drive, &len, &ireason);
 
-	thislen = (rq->cmd_type == REQ_TYPE_FS) ? len : cmd->nleft;
+	thislen = !blk_rq_is_passthrough(rq) ? len : cmd->nleft;
 	if (thislen > len)
 		thislen = len;
 
@@ -573,7 +573,8 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 
 	/* If DRQ is clear, the command has completed. */
 	if ((stat & ATA_DRQ) == 0) {
-		if (rq->cmd_type == REQ_TYPE_FS) {
+		switch (req_op(rq)) {
+		default:
 			/*
 			 * If we're not done reading/writing, complain.
 			 * Otherwise, complete the command normally.
@@ -587,7 +588,9 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 					rq->rq_flags |= RQF_FAILED;
 				uptodate = 0;
 			}
-		} else if (rq->cmd_type != REQ_TYPE_BLOCK_PC) {
+			goto out_end;
+		case REQ_OP_DRV_IN:
+		case REQ_OP_DRV_OUT:
 			ide_cd_request_sense_fixup(drive, cmd);
 
 			uptodate = cmd->nleft ? 0 : 1;
@@ -603,8 +606,11 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 
 			if (!uptodate)
 				rq->rq_flags |= RQF_FAILED;
+			goto out_end;
+		case REQ_OP_SCSI_IN:
+		case REQ_OP_SCSI_OUT:
+			goto out_end;
 		}
-		goto out_end;
 	}
 
 	rc = ide_check_ireason(drive, rq, len, ireason, write);
@@ -636,7 +642,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 
 	/* pad, if necessary */
 	if (len > 0) {
-		if (rq->cmd_type != REQ_TYPE_FS || write == 0)
+		if (blk_rq_is_passthrough(rq) || write == 0)
 			ide_pad_transfer(drive, write, len);
 		else {
 			printk(KERN_ERR PFX "%s: confused, missing data\n",
@@ -645,12 +651,18 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 		}
 	}
 
-	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
+	switch (req_op(rq)) {
+	case REQ_OP_SCSI_IN:
+	case REQ_OP_SCSI_OUT:
 		timeout = rq->timeout;
-	} else {
+		break;
+	case REQ_OP_DRV_IN:
+	case REQ_OP_DRV_OUT:
+		expiry = ide_cd_expiry;
+		/*FALLTHRU*/
+	default:
 		timeout = ATAPI_WAIT_PC;
-		if (rq->cmd_type != REQ_TYPE_FS)
-			expiry = ide_cd_expiry;
+		break;
 	}
 
 	hwif->expiry = expiry;
@@ -658,7 +670,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 	return ide_started;
 
 out_end:
-	if (rq->cmd_type == REQ_TYPE_BLOCK_PC && rc == 0) {
+	if (blk_rq_is_scsi(rq) && rc == 0) {
 		scsi_req(rq)->resid_len = 0;
 		blk_end_request_all(rq, 0);
 		hwif->rq = NULL;
@@ -666,7 +678,7 @@ out_end:
 		if (sense && uptodate)
 			ide_cd_complete_failed_rq(drive, rq);
 
-		if (rq->cmd_type == REQ_TYPE_FS) {
+		if (!blk_rq_is_passthrough(rq)) {
 			if (cmd->nleft == 0)
 				uptodate = 1;
 		} else {
@@ -679,7 +691,7 @@ out_end:
 				return ide_stopped;
 
 		/* make sure it's fully ended */
-		if (rq->cmd_type != REQ_TYPE_FS) {
+		if (blk_rq_is_passthrough(rq)) {
 			scsi_req(rq)->resid_len -= cmd->nbytes - cmd->nleft;
 			if (uptodate == 0 && (cmd->tf_flags & IDE_TFLAG_WRITE))
 				scsi_req(rq)->resid_len += cmd->last_xfer_len;
@@ -739,7 +751,7 @@ static void cdrom_do_block_pc(ide_drive_t *drive, struct request *rq)
 	ide_debug_log(IDE_DBG_PC, "rq->cmd[0]: 0x%x, rq->cmd_type: 0x%x",
 				  rq->cmd[0], rq->cmd_type);
 
-	if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
+	if (blk_rq_is_scsi(rq))
 		rq->rq_flags |= RQF_QUIET;
 	else
 		rq->rq_flags &= ~RQF_FAILED;
@@ -781,18 +793,20 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 	if (drive->debug_mask & IDE_DBG_RQ)
 		blk_dump_rq_flags(rq, "ide_cd_do_request");
 
-	switch (rq->cmd_type) {
-	case REQ_TYPE_FS:
+	switch (req_op(rq)) {
+	default:
 		if (cdrom_start_rw(drive, rq) == ide_stopped)
 			goto out_end;
 		break;
-	case REQ_TYPE_BLOCK_PC:
+	case REQ_OP_SCSI_IN:
+	case REQ_OP_SCSI_OUT:
 	handle_pc:
 		if (!rq->timeout)
 			rq->timeout = ATAPI_WAIT_PC;
 		cdrom_do_block_pc(drive, rq);
 		break;
-	case REQ_TYPE_DRV_PRIV:
+	case REQ_OP_DRV_IN:
+	case REQ_OP_DRV_OUT:
 		switch (ide_req(rq)->type) {
 		case ATA_PRIV_MISC:
 			/* right now this can only be a reset... */
@@ -801,9 +815,9 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 		case ATA_PRIV_SENSE:
 		case ATA_PRIV_PC:
 			goto handle_pc;
+		default:
+			BUG();
 		}
-	default:
-		BUG();
 	}
 
 	/* prepare sense request for this command */
@@ -816,7 +830,7 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 
 	cmd.rq = rq;
 
-	if (rq->cmd_type == REQ_TYPE_FS || blk_rq_bytes(rq)) {
+	if (!blk_rq_is_passthrough(rq) || blk_rq_bytes(rq)) {
 		ide_init_sg_cmd(&cmd, blk_rq_bytes(rq));
 		ide_map_sg(drive, &cmd);
 	}
@@ -1373,9 +1387,9 @@ static int ide_cdrom_prep_pc(struct request *rq)
 
 static int ide_cdrom_prep_fn(struct request_queue *q, struct request *rq)
 {
-	if (rq->cmd_type == REQ_TYPE_FS)
+	if (!blk_rq_is_passthrough(rq))
 		return ide_cdrom_prep_fs(q, rq);
-	else if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
+	else if (blk_rq_is_scsi(rq))
 		return ide_cdrom_prep_pc(rq);
 
 	return 0;
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c
index 3f03eed0ff78..9fcefbc8425e 100644
--- a/drivers/ide/ide-cd_ioctl.c
+++ b/drivers/ide/ide-cd_ioctl.c
@@ -303,9 +303,8 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi)
 	struct request *rq;
 	int ret;
 
-	rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
+	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
 	scsi_req_init(rq);
-	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	rq->rq_flags = RQF_QUIET;
 	ret = blk_execute_rq(drive->queue, cd->disk, rq, 0);
diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c
index c040b9de2b4e..a45dda5386e4 100644
--- a/drivers/ide/ide-devsets.c
+++ b/drivers/ide/ide-devsets.c
@@ -165,9 +165,8 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting,
 	if (!(setting->flags & DS_SYNC))
 		return setting->set(drive, arg);
 
-	rq = blk_get_request(q, READ, __GFP_RECLAIM);
+	rq = blk_get_request(q, REQ_OP_DRV_IN, __GFP_RECLAIM);
 	scsi_req_init(rq);
-	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	scsi_req(rq)->cmd_len = 5;
 	scsi_req(rq)->cmd[0] = REQ_DEVSET_EXEC;
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 69cf71729841..186159715b71 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -184,7 +184,7 @@ static ide_startstop_t ide_do_rw_disk(ide_drive_t *drive, struct request *rq,
 	ide_hwif_t *hwif = drive->hwif;
 
 	BUG_ON(drive->dev_flags & IDE_DFLAG_BLOCKED);
-	BUG_ON(rq->cmd_type != REQ_TYPE_FS);
+	BUG_ON(blk_rq_is_passthrough(rq));
 
 	ledtrig_disk_activity();
 
@@ -452,8 +452,8 @@ static int idedisk_prep_fn(struct request_queue *q, struct request *rq)
 	cmd->valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
 	cmd->tf_flags = IDE_TFLAG_DYN;
 	cmd->protocol = ATA_PROT_NODATA;
-
-	rq->cmd_type = REQ_TYPE_DRV_PRIV;
+	rq->cmd_flags &= ~REQ_OP_MASK;
+	rq->cmd_flags |= REQ_OP_DRV_OUT;
 	ide_req(rq)->type = ATA_PRIV_TASKFILE;
 	rq->special = cmd;
 	cmd->rq = rq;
@@ -478,9 +478,8 @@ static int set_multcount(ide_drive_t *drive, int arg)
 	if (drive->special_flags & IDE_SFLAG_SET_MULTMODE)
 		return -EBUSY;
 
-	rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
+	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
 	scsi_req_init(rq);
-	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	ide_req(rq)->type = ATA_PRIV_TASKFILE;
 
 	drive->mult_req = arg;
diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c
index 1e4b1476e559..cf3af6840368 100644
--- a/drivers/ide/ide-eh.c
+++ b/drivers/ide/ide-eh.c
@@ -123,7 +123,7 @@ ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat)
 		return ide_stopped;
 
 	/* retry only "normal" I/O: */
-	if (rq->cmd_type != REQ_TYPE_FS) {
+	if (blk_rq_is_passthrough(rq)) {
 		if (ata_taskfile_request(rq)) {
 			struct ide_cmd *cmd = rq->special;
 
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 24c6d363b3a4..a69e8013f1df 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -72,7 +72,7 @@ static int ide_floppy_callback(ide_drive_t *drive, int dsc)
 		drive->failed_pc = NULL;
 
 	if (pc->c[0] == GPCMD_READ_10 || pc->c[0] == GPCMD_WRITE_10 ||
-	    rq->cmd_type == REQ_TYPE_BLOCK_PC)
+	    (req_op(rq) == REQ_OP_SCSI_IN || req_op(rq) == REQ_OP_SCSI_OUT))
 		uptodate = 1; /* FIXME */
 	else if (pc->c[0] == GPCMD_REQUEST_SENSE) {
 
@@ -254,8 +254,8 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 			goto out_end;
 	}
 
-	switch (rq->cmd_type) {
-	case REQ_TYPE_FS:
+	switch (req_op(rq)) {
+	default:
 		if (((long)blk_rq_pos(rq) % floppy->bs_factor) ||
 		    (blk_rq_sectors(rq) % floppy->bs_factor)) {
 			printk(KERN_ERR PFX "%s: unsupported r/w rq size\n",
@@ -265,11 +265,13 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 		pc = &floppy->queued_pc;
 		idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block);
 		break;
-	case REQ_TYPE_BLOCK_PC:
+	case REQ_OP_SCSI_IN:
+	case REQ_OP_SCSI_OUT:
 		pc = &floppy->queued_pc;
 		idefloppy_blockpc_cmd(floppy, pc, rq);
 		break;
-	case REQ_TYPE_DRV_PRIV:
+	case REQ_OP_DRV_IN:
+	case REQ_OP_DRV_OUT:
 		switch (ide_req(rq)->type) {
 		case ATA_PRIV_MISC:
 		case ATA_PRIV_SENSE:
@@ -278,9 +280,6 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 		default:
 			BUG();
 		}
-		break;
-	default:
-		BUG();
 	}
 
 	ide_prep_sense(drive, rq);
@@ -292,7 +291,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 
 	cmd.rq = rq;
 
-	if (rq->cmd_type == REQ_TYPE_FS || blk_rq_bytes(rq)) {
+	if (!blk_rq_is_passthrough(rq) || blk_rq_bytes(rq)) {
 		ide_init_sg_cmd(&cmd, blk_rq_bytes(rq));
 		ide_map_sg(drive, &cmd);
 	}
@@ -302,7 +301,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 	return ide_floppy_issue_pc(drive, &cmd, pc);
 out_end:
 	drive->failed_pc = NULL;
-	if (rq->cmd_type != REQ_TYPE_FS && rq->errors == 0)
+	if (blk_rq_is_passthrough(rq) && rq->errors == 0)
 		rq->errors = -EIO;
 	ide_complete_rq(drive, -EIO, blk_rq_bytes(rq));
 	return ide_stopped;
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 6735c92b8da7..043b1fb963cb 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -145,7 +145,7 @@ void ide_kill_rq(ide_drive_t *drive, struct request *rq)
 	} else {
 		if (media == ide_tape)
 			rq->errors = IDE_DRV_ERROR_GENERAL;
-		else if (rq->cmd_type != REQ_TYPE_FS && rq->errors == 0)
+		else if (blk_rq_is_passthrough(rq) && rq->errors == 0)
 			rq->errors = -EIO;
 	}
 
diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c
index fc19e842deef..248a3e0ceb46 100644
--- a/drivers/ide/ide-ioctls.c
+++ b/drivers/ide/ide-ioctls.c
@@ -125,9 +125,8 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg)
 	if (NULL == (void *) arg) {
 		struct request *rq;
 
-		rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
+		rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
 		scsi_req_init(rq);
-		rq->cmd_type = REQ_TYPE_DRV_PRIV;
 		ide_req(rq)->type = ATA_PRIV_TASKFILE;
 		err = blk_execute_rq(drive->queue, NULL, rq, 0);
 		blk_put_request(rq);
@@ -223,9 +222,8 @@ static int generic_drive_reset(ide_drive_t *drive)
 	struct request *rq;
 	int ret = 0;
 
-	rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
+	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
 	scsi_req_init(rq);
-	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	scsi_req(rq)->cmd_len = 1;
 	scsi_req(rq)->cmd[0] = REQ_DRIVE_RESET;
diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
index fc3c944ca4be..101aed9a61ca 100644
--- a/drivers/ide/ide-park.c
+++ b/drivers/ide/ide-park.c
@@ -31,11 +31,10 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 	}
 	spin_unlock_irq(&hwif->lock);
 
-	rq = blk_get_request(q, READ, __GFP_RECLAIM);
+	rq = blk_get_request(q, REQ_OP_DRV_IN, __GFP_RECLAIM);
 	scsi_req_init(rq);
 	scsi_req(rq)->cmd[0] = REQ_PARK_HEADS;
 	scsi_req(rq)->cmd_len = 1;
-	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	rq->special = &timeout;
 	rc = blk_execute_rq(q, NULL, rq, 1);
@@ -47,14 +46,13 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 	 * Make sure that *some* command is sent to the drive after the
 	 * timeout has expired, so power management will be reenabled.
 	 */
-	rq = blk_get_request(q, READ, GFP_NOWAIT);
+	rq = blk_get_request(q, REQ_OP_DRV_IN, GFP_NOWAIT);
 	scsi_req_init(rq);
 	if (IS_ERR(rq))
 		goto out;
 
 	scsi_req(rq)->cmd[0] = REQ_UNPARK_HEADS;
 	scsi_req(rq)->cmd_len = 1;
-	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	elv_add_request(q, rq, ELEVATOR_INSERT_FRONT);
 
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index 0c1296815dfa..ec951be4b0c8 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -18,9 +18,8 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 	}
 
 	memset(&rqpm, 0, sizeof(rqpm));
-	rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
+	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
 	scsi_req_init(rq);
-	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	ide_req(rq)->type = ATA_PRIV_PM_SUSPEND;
 	rq->special = &rqpm;
 	rqpm.pm_step = IDE_PM_START_SUSPEND;
@@ -90,9 +89,8 @@ int generic_ide_resume(struct device *dev)
 	}
 
 	memset(&rqpm, 0, sizeof(rqpm));
-	rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
+	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
 	scsi_req_init(rq);
-	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	ide_req(rq)->type = ATA_PRIV_PM_RESUME;
 	rq->rq_flags |= RQF_PREEMPT;
 	rq->special = &rqpm;
@@ -244,12 +242,12 @@ void ide_check_pm_state(ide_drive_t *drive, struct request *rq)
 {
 	struct ide_pm_state *pm = rq->special;
 
-	if (rq->cmd_type == REQ_TYPE_DRV_PRIV &&
+	if (blk_rq_is_private(rq) &&
 	    ide_req(rq)->type == ATA_PRIV_PM_SUSPEND &&
 	    pm->pm_step == IDE_PM_START_SUSPEND)
 		/* Mark drive blocked when starting the suspend sequence. */
 		drive->dev_flags |= IDE_DFLAG_BLOCKED;
-	else if (rq->cmd_type == REQ_TYPE_DRV_PRIV &&
+	else if (blk_rq_is_private(rq) &&
 	         ide_req(rq)->type == ATA_PRIV_PM_RESUME &&
 		 pm->pm_step == IDE_PM_START_RESUME) {
 		/*
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 37c7beabaacc..3c1b7974d66d 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -577,7 +577,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 		      req->cmd[0], (unsigned long long)blk_rq_pos(rq),
 		      blk_rq_sectors(rq));
 
-	BUG_ON(rq->cmd_type != REQ_TYPE_DRV_PRIV);
+	BUG_ON(!blk_rq_is_private(rq));
 	BUG_ON(ide_req(rq)->type != ATA_PRIV_MISC &&
 	       ide_req(rq)->type != ATA_PRIV_SENSE);
 
@@ -854,9 +854,8 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
 	BUG_ON(cmd != REQ_IDETAPE_READ && cmd != REQ_IDETAPE_WRITE);
 	BUG_ON(size < 0 || size % tape->blk_size);
 
-	rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
+	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
 	scsi_req_init(rq);
-	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	scsi_req(rq)->cmd[13] = cmd;
 	rq->rq_disk = tape->disk;
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index 9ccc1d573393..247b9faccce1 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -428,11 +428,11 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
 {
 	struct request *rq;
 	int error;
-	int rw = !(cmd->tf_flags & IDE_TFLAG_WRITE) ? READ : WRITE;
 
-	rq = blk_get_request(drive->queue, rw, __GFP_RECLAIM);
+	rq = blk_get_request(drive->queue,
+		(cmd->tf_flags & IDE_TFLAG_WRITE) ?
+			REQ_OP_DRV_OUT : REQ_OP_DRV_IN, __GFP_RECLAIM);
 	scsi_req_init(rq);
-	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	ide_req(rq)->type = ATA_PRIV_TASKFILE;
 
 	/*
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index df8a5ef334c0..6b8d5cd7dbf6 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -84,9 +84,6 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
 	nsect = blk_rq_cur_bytes(req) >> tr->blkshift;
 	buf = bio_data(req->bio);
 
-	if (req->cmd_type != REQ_TYPE_FS)
-		return -EIO;
-
 	if (req_op(req) == REQ_OP_FLUSH)
 		return tr->flush(dev);
 
@@ -94,16 +91,16 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
 	    get_capacity(req->rq_disk))
 		return -EIO;
 
-	if (req_op(req) == REQ_OP_DISCARD)
+	switch (req_op(req)) {
+	case REQ_OP_DISCARD:
 		return tr->discard(dev, block, nsect);
-
-	if (rq_data_dir(req) == READ) {
+	case REQ_OP_READ:
 		for (; nsect > 0; nsect--, block++, buf += tr->blksize)
 			if (tr->readsect(dev, block, buf))
 				return -EIO;
 		rq_flush_dcache_pages(req);
 		return 0;
-	} else {
+	case REQ_OP_WRITE:
 		if (!tr->writesect)
 			return -EIO;
 
@@ -112,6 +109,8 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
 			if (tr->writesect(dev, block, buf))
 				return -EIO;
 		return 0;
+	default:
+		return -EIO;
 	}
 }
 
diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c
index d1e6931c132f..c80869e60909 100644
--- a/drivers/mtd/ubi/block.c
+++ b/drivers/mtd/ubi/block.c
@@ -323,16 +323,15 @@ static int ubiblock_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct ubiblock *dev = hctx->queue->queuedata;
 	struct ubiblock_pdu *pdu = blk_mq_rq_to_pdu(req);
 
-	if (req->cmd_type != REQ_TYPE_FS)
+	switch (req_op(req)) {
+	case REQ_OP_READ:
+		ubi_sgl_init(&pdu->usgl);
+		queue_work(dev->wq, &pdu->work);
+		return BLK_MQ_RQ_QUEUE_OK;
+	default:
 		return BLK_MQ_RQ_QUEUE_ERROR;
+	}
 
-	if (rq_data_dir(req) != READ)
-		return BLK_MQ_RQ_QUEUE_ERROR; /* Write not implemented */
-
-	ubi_sgl_init(&pdu->usgl);
-	queue_work(dev->wq, &pdu->work);
-
-	return BLK_MQ_RQ_QUEUE_OK;
 }
 
 static int ubiblock_init_request(void *data, struct request *req,
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 8a3c3e32a704..1640a5c8abbb 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -208,18 +208,18 @@ EXPORT_SYMBOL_GPL(nvme_requeue_req);
 struct request *nvme_alloc_request(struct request_queue *q,
 		struct nvme_command *cmd, unsigned int flags, int qid)
 {
+	unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN;
 	struct request *req;
 
 	if (qid == NVME_QID_ANY) {
-		req = blk_mq_alloc_request(q, nvme_is_write(cmd), flags);
+		req = blk_mq_alloc_request(q, op, flags);
 	} else {
-		req = blk_mq_alloc_request_hctx(q, nvme_is_write(cmd), flags,
+		req = blk_mq_alloc_request_hctx(q, op, flags,
 				qid ? qid - 1 : 0);
 	}
 	if (IS_ERR(req))
 		return req;
 
-	req->cmd_type = REQ_TYPE_DRV_PRIV;
 	req->cmd_flags |= REQ_FAILFAST_DRIVER;
 	nvme_req(req)->cmd = cmd;
 
@@ -309,17 +309,27 @@ int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
 {
 	int ret = BLK_MQ_RQ_QUEUE_OK;
 
-	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
+	switch (req_op(req)) {
+	case REQ_OP_DRV_IN:
+	case REQ_OP_DRV_OUT:
 		memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
-	else if (req_op(req) == REQ_OP_FLUSH)
+		break;
+	case REQ_OP_FLUSH:
 		nvme_setup_flush(ns, cmd);
-	else if (req_op(req) == REQ_OP_DISCARD)
+		break;
+	case REQ_OP_DISCARD:
 		ret = nvme_setup_discard(ns, req, cmd);
-	else
+		break;
+	case REQ_OP_READ:
+	case REQ_OP_WRITE:
 		nvme_setup_rw(ns, req, cmd);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return BLK_MQ_RQ_QUEUE_ERROR;
+	}
 
 	cmd->common.command_id = req->tag;
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(nvme_setup_cmd);
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index ecc79b2fcfaf..a75e95d42b3f 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1471,7 +1471,7 @@ static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
 	ib_dma_sync_single_for_device(dev, sqe->dma,
 			sizeof(struct nvme_command), DMA_TO_DEVICE);
 
-	if (rq->cmd_type == REQ_TYPE_FS && req_op(rq) == REQ_OP_FLUSH)
+	if (req_op(rq) == REQ_OP_FLUSH)
 		flush = true;
 	ret = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
 			req->mr->need_inval ? &req->reg_wr.wr : NULL, flush);
diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
index fcb040eebf47..30b905080c61 100644
--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -1565,7 +1565,8 @@ static struct request *_make_request(struct request_queue *q, bool has_write,
 	struct bio *bio = oii->bio;
 	int ret;
 
-	req = blk_get_request(q, has_write ? WRITE : READ, flags);
+	req = blk_get_request(q, has_write ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN,
+			flags);
 	if (IS_ERR(req))
 		return req;
 	scsi_req_init(req);
diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c
index d314aa5c71cf..451de6c5e3c9 100644
--- a/drivers/scsi/osst.c
+++ b/drivers/scsi/osst.c
@@ -367,7 +367,8 @@ static int osst_execute(struct osst_request *SRpnt, const unsigned char *cmd,
 	int err = 0;
 	int write = (data_direction == DMA_TO_DEVICE);
 
-	req = blk_get_request(SRpnt->stp->device->request_queue, write, GFP_KERNEL);
+	req = blk_get_request(SRpnt->stp->device->request_queue,
+			write ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, GFP_KERNEL);
 	if (IS_ERR(req))
 		return DRIVER_ERROR << 24;
 
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index b4ce7bb5d2a9..9e82fa5715bc 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -1974,7 +1974,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev)
 	 * blk_get_request with GFP_KERNEL (__GFP_RECLAIM) sleeps until a
 	 * request becomes available
 	 */
-	req = blk_get_request(sdev->request_queue, READ, GFP_KERNEL);
+	req = blk_get_request(sdev->request_queue, REQ_OP_SCSI_IN, GFP_KERNEL);
 	if (IS_ERR(req))
 		return;
 	rq = scsi_req(req);
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 31629a7b728d..90f65c8f487a 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -219,11 +219,12 @@ static int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 		 req_flags_t rq_flags, int *resid)
 {
 	struct request *req;
-	int write = (data_direction == DMA_TO_DEVICE);
 	struct scsi_request *rq;
 	int ret = DRIVER_ERROR << 24;
 
-	req = blk_get_request(sdev->request_queue, write, __GFP_RECLAIM);
+	req = blk_get_request(sdev->request_queue,
+			data_direction == DMA_TO_DEVICE ?
+			REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, __GFP_RECLAIM);
 	if (IS_ERR(req))
 		return ret;
 	rq = scsi_req(req);
@@ -839,8 +840,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 		}
 	} else if (blk_rq_bytes(req) == 0 && result && !sense_deferred) {
 		/*
-		 * Certain non BLOCK_PC requests are commands that don't
-		 * actually transfer anything (FLUSH), so cannot use
+		 * Flush commands do not transfers any data, and thus cannot use
 		 * good_bytes != blk_rq_bytes(req) as the signal for an error.
 		 * This sets the error explicitly for the problem case.
 		 */
@@ -859,8 +859,8 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 		blk_rq_sectors(req), good_bytes));
 
 	/*
-	 * Recovered errors need reporting, but they're always treated
-	 * as success, so fiddle the result code here.  For BLOCK_PC
+	 * Recovered errors need reporting, but they're always treated as
+	 * success, so fiddle the result code here.  For passthrough requests
 	 * we already took a copy of the original into rq->errors which
 	 * is what gets returned to the user
 	 */
@@ -874,7 +874,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 		else if (!(req->rq_flags & RQF_QUIET))
 			scsi_print_sense(cmd);
 		result = 0;
-		/* BLOCK_PC may have set error */
+		/* for passthrough error may be set */
 		error = 0;
 	}
 
@@ -1179,12 +1179,12 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
 	spin_unlock_irqrestore(&dev->list_lock, flags);
 }
 
-static int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req)
+static int scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req)
 {
 	struct scsi_cmnd *cmd = req->special;
 
 	/*
-	 * BLOCK_PC requests may transfer data, in which case they must
+	 * Passthrough requests may transfer data, in which case they must
 	 * a bio attached to them.  Or they might contain a SCSI command
 	 * that does not transfer data, in which case they may optionally
 	 * submit a request without an attached bio.
@@ -1207,7 +1207,7 @@ static int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req)
 }
 
 /*
- * Setup a REQ_TYPE_FS command.  These are simple request from filesystems
+ * Setup a normal block command.  These are simple request from filesystems
  * that still need to be translated to SCSI CDBs from the ULD.
  */
 static int scsi_setup_fs_cmnd(struct scsi_device *sdev, struct request *req)
@@ -1236,14 +1236,10 @@ static int scsi_setup_cmnd(struct scsi_device *sdev, struct request *req)
 	else
 		cmd->sc_data_direction = DMA_FROM_DEVICE;
 
-	switch (req->cmd_type) {
-	case REQ_TYPE_FS:
+	if (blk_rq_is_scsi(req))
+		return scsi_setup_scsi_cmnd(sdev, req);
+	else
 		return scsi_setup_fs_cmnd(sdev, req);
-	case REQ_TYPE_BLOCK_PC:
-		return scsi_setup_blk_pc_cmnd(sdev, req);
-	default:
-		return BLKPREP_KILL;
-	}
 }
 
 static int
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 226a8def7a8b..e0e308b7e01a 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1698,7 +1698,8 @@ sg_start_req(Sg_request *srp, unsigned char *cmd)
 	 * With scsi-mq disabled, blk_get_request() with GFP_KERNEL usually
 	 * does not sleep except under memory pressure.
 	 */
-	rq = blk_get_request(q, rw, GFP_KERNEL);
+	rq = blk_get_request(q, hp->dxfer_direction == SG_DXFER_TO_DEV ?
+			REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, GFP_KERNEL);
 	if (IS_ERR(rq)) {
 		kfree(long_cmdp);
 		return PTR_ERR(rq);
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 94352e4df831..2e1d910092d8 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -437,14 +437,17 @@ static int sr_init_command(struct scsi_cmnd *SCpnt)
 		goto out;
 	}
 
-	if (rq_data_dir(rq) == WRITE) {
+	switch (req_op(rq)) {
+	case REQ_OP_WRITE:
 		if (!cd->writeable)
 			goto out;
 		SCpnt->cmnd[0] = WRITE_10;
 		cd->cdi.media_written = 1;
-	} else if (rq_data_dir(rq) == READ) {
+		break;
+	case REQ_OP_READ:
 		SCpnt->cmnd[0] = READ_10;
-	} else {
+		break;
+	default:
 		blk_dump_rq_flags(rq, "Unknown sr command");
 		goto out;
 	}
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 4af900100a24..81212d4bd9bf 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -541,11 +541,11 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd,
 	struct scsi_request *rq;
 	struct rq_map_data *mdata = &SRpnt->stp->buffer->map_data;
 	int err = 0;
-	int write = (data_direction == DMA_TO_DEVICE);
 	struct scsi_tape *STp = SRpnt->stp;
 
-	req = blk_get_request(SRpnt->stp->device->request_queue, write,
-			      GFP_KERNEL);
+	req = blk_get_request(SRpnt->stp->device->request_queue,
+			data_direction == DMA_TO_DEVICE ?
+			REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, GFP_KERNEL);
 	if (IS_ERR(req))
 		return DRIVER_ERROR << 24;
 	rq = scsi_req(req);
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index e52f4e1a4675..a8f8e53f2f57 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -1005,7 +1005,8 @@ pscsi_execute_cmd(struct se_cmd *cmd)
 		scsi_command_size(cmd->t_task_cdb));
 
 	req = blk_get_request(pdv->pdv_sd->request_queue,
-			(cmd->data_direction == DMA_TO_DEVICE),
+			cmd->data_direction == DMA_TO_DEVICE ?
+			REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN,
 			GFP_KERNEL);
 	if (IS_ERR(req)) {
 		pr_err("PSCSI: blk_get_request() failed\n");
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 930d98caab5b..a06115e31612 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -223,7 +223,7 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev,
 	if (!buf)
 		return -ENOMEM;
 
-	rq = blk_get_request(q, READ, GFP_KERNEL);
+	rq = blk_get_request(q, REQ_OP_SCSI_IN, GFP_KERNEL);
 	if (IS_ERR(rq)) {
 		error = -ENOMEM;
 		goto out_free_buf;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 37c9a43c5e78..d703acb55d0f 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -162,6 +162,13 @@ enum req_opf {
 	/* write the zero filled sector many times */
 	REQ_OP_WRITE_ZEROES	= 8,
 
+	/* SCSI passthrough using struct scsi_request */
+	REQ_OP_SCSI_IN		= 32,
+	REQ_OP_SCSI_OUT		= 33,
+	/* Driver private requests */
+	REQ_OP_DRV_IN		= 34,
+	REQ_OP_DRV_OUT		= 35,
+
 	REQ_OP_LAST,
 };
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7121be081517..1e947e725528 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -70,15 +70,6 @@ struct request_list {
 	unsigned int		flags;
 };
 
-/*
- * request command types
- */
-enum rq_cmd_type_bits {
-	REQ_TYPE_FS		= 1,	/* fs request */
-	REQ_TYPE_BLOCK_PC,		/* scsi command */
-	REQ_TYPE_DRV_PRIV,		/* driver defined types from here */
-};
-
 /*
  * request flags */
 typedef __u32 __bitwise req_flags_t;
@@ -145,7 +136,6 @@ struct request {
 	struct blk_mq_ctx *mq_ctx;
 
 	int cpu;
-	unsigned cmd_type;
 	unsigned int cmd_flags;		/* op and common flags */
 	req_flags_t rq_flags;
 	unsigned long atomic_flags;
@@ -242,9 +232,19 @@ struct request {
 	struct request *next_rq;
 };
 
+static inline bool blk_rq_is_scsi(struct request *rq)
+{
+	return req_op(rq) == REQ_OP_SCSI_IN || req_op(rq) == REQ_OP_SCSI_OUT;
+}
+
+static inline bool blk_rq_is_private(struct request *rq)
+{
+	return req_op(rq) == REQ_OP_DRV_IN || req_op(rq) == REQ_OP_DRV_OUT;
+}
+
 static inline bool blk_rq_is_passthrough(struct request *rq)
 {
-	return rq->cmd_type != REQ_TYPE_FS;
+	return blk_rq_is_scsi(rq) || blk_rq_is_private(rq);
 }
 
 static inline unsigned short req_get_ioprio(struct request *req)
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 5cc6caa94cac..2f51c1724b5a 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -63,31 +63,27 @@ static inline struct ide_request *ide_req(struct request *rq)
 
 static inline bool ata_misc_request(struct request *rq)
 {
-	return rq->cmd_type == REQ_TYPE_DRV_PRIV &&
-               ide_req(rq)->type == ATA_PRIV_MISC;
+	return blk_rq_is_private(rq) && ide_req(rq)->type == ATA_PRIV_MISC;
 }
 
 static inline bool ata_taskfile_request(struct request *rq)
 {
-	return rq->cmd_type == REQ_TYPE_DRV_PRIV &&
-               ide_req(rq)->type == ATA_PRIV_TASKFILE;
+	return blk_rq_is_private(rq) && ide_req(rq)->type == ATA_PRIV_TASKFILE;
 }
 
 static inline bool ata_pc_request(struct request *rq)
 {
-	return rq->cmd_type == REQ_TYPE_DRV_PRIV &&
-               ide_req(rq)->type == ATA_PRIV_PC;
+	return blk_rq_is_private(rq) && ide_req(rq)->type == ATA_PRIV_PC;
 }
 
 static inline bool ata_sense_request(struct request *rq)
 {
-	return rq->cmd_type == REQ_TYPE_DRV_PRIV &&
-               ide_req(rq)->type == ATA_PRIV_SENSE;
+	return blk_rq_is_private(rq) && ide_req(rq)->type == ATA_PRIV_SENSE;
 }
 
 static inline bool ata_pm_request(struct request *rq)
 {
-	return rq->cmd_type == REQ_TYPE_DRV_PRIV &&
+	return blk_rq_is_private(rq) &&
 		(ide_req(rq)->type == ATA_PRIV_PM_SUSPEND ||
 		 ide_req(rq)->type == ATA_PRIV_PM_RESUME);
 }
-- 
cgit v1.2.3


From d486f1f204382557b5fbcb3ddbb5845cd4ba4e2c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 31 Jan 2017 12:34:41 -0700
Subject: block: move internal_tag to same cache line as tag

Since we removed cmd_type, we now have a hole in the struct. Move
the internal_tag member to the same cacheline as tag, since we
use them at the same time.

This doesn't fix the hole, just moves it elsewhere.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blkdev.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1e947e725528..11f7a8e86a89 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -138,6 +138,9 @@ struct request {
 	int cpu;
 	unsigned int cmd_flags;		/* op and common flags */
 	req_flags_t rq_flags;
+
+	int internal_tag;
+
 	unsigned long atomic_flags;
 
 	/* the following two fields are internal, NEVER access directly */
@@ -209,8 +212,6 @@ struct request {
 
 	unsigned short ioprio;
 
-	int internal_tag;
-
 	void *special;		/* opaque pointer available for LLD use */
 
 	int errors;
-- 
cgit v1.2.3


From db4545d9a7881db0a7e18599e6cd1adbcb93db33 Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Tue, 31 Jan 2017 13:21:34 +0000
Subject: x86/efi: Deduplicate efi_char16_printk()

Eliminate the separate 32-bit and 64x- bit code paths by way of the shiny
new efi_call_proto() macro.

No functional change intended.

Signed-off-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-efi@vger.kernel.org
Link: http://lkml.kernel.org/r/1485868902-20401-3-git-send-email-ard.biesheuvel@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/boot/compressed/eboot.c | 26 ++------------------------
 include/linux/efi.h              |  8 ++++----
 2 files changed, 6 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index f1cf284d631e..6d3aeabbce68 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -101,30 +101,8 @@ efi_open_volume(efi_system_table_t *sys_table, void *__image, void **__fh)
 
 void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str)
 {
-	unsigned long output_string;
-	size_t offset;
-
-	if (efi_early->is64) {
-		struct efi_simple_text_output_protocol_64 *out;
-		u64 *func;
-
-		offset = offsetof(typeof(*out), output_string);
-		output_string = efi_early->text_output + offset;
-		out = (typeof(out))(unsigned long)efi_early->text_output;
-		func = (u64 *)output_string;
-
-		efi_early->call(*func, out, str);
-	} else {
-		struct efi_simple_text_output_protocol_32 *out;
-		u32 *func;
-
-		offset = offsetof(typeof(*out), output_string);
-		output_string = efi_early->text_output + offset;
-		out = (typeof(out))(unsigned long)efi_early->text_output;
-		func = (u32 *)output_string;
-
-		efi_early->call(*func, out, str);
-	}
+	efi_call_proto(efi_simple_text_output_protocol, output_string,
+		       efi_early->text_output, str);
 }
 
 static efi_status_t
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 5b1af30ece55..6642c4d9d11d 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1240,17 +1240,17 @@ struct efivar_entry {
 	bool deleting;
 };
 
-struct efi_simple_text_output_protocol_32 {
+typedef struct {
 	u32 reset;
 	u32 output_string;
 	u32 test_string;
-};
+} efi_simple_text_output_protocol_32_t;
 
-struct efi_simple_text_output_protocol_64 {
+typedef struct {
 	u64 reset;
 	u64 output_string;
 	u64 test_string;
-};
+} efi_simple_text_output_protocol_64_t;
 
 struct efi_simple_text_output_protocol {
 	void *reset;
-- 
cgit v1.2.3


From a19ebf59e20880c87dd49b6336476307559ac5ba Mon Sep 17 00:00:00 2001
From: Sai Praneeth <sai.praneeth.prakhya@intel.com>
Date: Tue, 31 Jan 2017 13:21:36 +0000
Subject: efi: Introduce the EFI_MEM_ATTR bit and set it from the memory
 attributes table

UEFI v2.6 introduces a configuration table called
EFI_MEMORY_ATTRIBUTES_TABLE which provides additional information about
EFI runtime regions. Currently this table describes memory protections
that may be applied to the EFI Runtime code and data regions by the kernel.

Allocate a EFI_XXX bit to keep track of whether this feature is
published by firmware or not.

Signed-off-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Lee, Chun-Yi <jlee@suse.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi Shankar <ravi.v.shankar@intel.com>
Cc: Ricardo Neri <ricardo.neri@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-efi@vger.kernel.org
Link: http://lkml.kernel.org/r/1485868902-20401-5-git-send-email-ard.biesheuvel@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/firmware/efi/memattr.c | 1 +
 include/linux/efi.h            | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firmware/efi/memattr.c b/drivers/firmware/efi/memattr.c
index 236004b9a50d..402197460507 100644
--- a/drivers/firmware/efi/memattr.c
+++ b/drivers/firmware/efi/memattr.c
@@ -43,6 +43,7 @@ int __init efi_memattr_init(void)
 
 	tbl_size = sizeof(*tbl) + tbl->num_entries * tbl->desc_size;
 	memblock_reserve(efi.mem_attr_table, tbl_size);
+	set_bit(EFI_MEM_ATTR, &efi.flags);
 
 unmap:
 	early_memunmap(tbl, sizeof(*tbl));
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 6642c4d9d11d..5f632bf9969d 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1065,6 +1065,7 @@ extern int __init efi_setup_pcdp_console(char *);
 #define EFI_ARCH_1		7	/* First arch-specific bit */
 #define EFI_DBG			8	/* Print additional debug info at runtime */
 #define EFI_NX_PE_DATA		9	/* Can runtime data regions be mapped non-executable? */
+#define EFI_MEM_ATTR		10	/* Did firmware publish an EFI_MEMORY_ATTRIBUTES table? */
 
 #ifdef CONFIG_EFI
 /*
-- 
cgit v1.2.3


From c4c39c70c5fef43655019236bec8ba5e7273b868 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Tue, 31 Jan 2017 13:21:39 +0000
Subject: efi: Use typed function pointers for the runtime services table

Instead of using void pointers, and casting them to correctly typed
function pointers upon use, declare the runtime services pointers
as function pointers using their respective prototypes, for which
typedefs are already available.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-efi@vger.kernel.org
Link: http://lkml.kernel.org/r/1485868902-20401-8-git-send-email-ard.biesheuvel@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/efi.h | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/efi.h b/include/linux/efi.h
index 5f632bf9969d..85e9fdaa8d07 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -509,24 +509,6 @@ typedef struct {
 	u64 query_variable_info;
 } efi_runtime_services_64_t;
 
-typedef struct {
-	efi_table_hdr_t hdr;
-	void *get_time;
-	void *set_time;
-	void *get_wakeup_time;
-	void *set_wakeup_time;
-	void *set_virtual_address_map;
-	void *convert_pointer;
-	void *get_variable;
-	void *get_next_variable;
-	void *set_variable;
-	void *get_next_high_mono_count;
-	void *reset_system;
-	void *update_capsule;
-	void *query_capsule_caps;
-	void *query_variable_info;
-} efi_runtime_services_t;
-
 typedef efi_status_t efi_get_time_t (efi_time_t *tm, efi_time_cap_t *tc);
 typedef efi_status_t efi_set_time_t (efi_time_t *tm);
 typedef efi_status_t efi_get_wakeup_time_t (efi_bool_t *enabled, efi_bool_t *pending,
@@ -561,6 +543,24 @@ typedef efi_status_t efi_query_variable_store_t(u32 attributes,
 						unsigned long size,
 						bool nonblocking);
 
+typedef struct {
+	efi_table_hdr_t			hdr;
+	efi_get_time_t			*get_time;
+	efi_set_time_t			*set_time;
+	efi_get_wakeup_time_t		*get_wakeup_time;
+	efi_set_wakeup_time_t		*set_wakeup_time;
+	efi_set_virtual_address_map_t	*set_virtual_address_map;
+	void				*convert_pointer;
+	efi_get_variable_t		*get_variable;
+	efi_get_next_variable_t		*get_next_variable;
+	efi_set_variable_t		*set_variable;
+	efi_get_next_high_mono_count_t	*get_next_high_mono_count;
+	efi_reset_system_t		*reset_system;
+	efi_update_capsule_t		*update_capsule;
+	efi_query_capsule_caps_t	*query_capsule_caps;
+	efi_query_variable_info_t	*query_variable_info;
+} efi_runtime_services_t;
+
 void efi_native_runtime_setup(void);
 
 /*
-- 
cgit v1.2.3


From 7b0a911478c74ca02581d496f732c10e811e894f Mon Sep 17 00:00:00 2001
From: Dave Young <dyoung@redhat.com>
Date: Tue, 31 Jan 2017 13:21:40 +0000
Subject: efi/x86: Move the EFI BGRT init code to early init code

Before invoking the arch specific handler, efi_mem_reserve() reserves
the given memory region through memblock.

efi_bgrt_init() will call efi_mem_reserve() after mm_init(), at which
time memblock is dead and should not be used anymore.

The EFI BGRT code depends on ACPI initialization to get the BGRT ACPI
table, so move parsing of the BGRT table to ACPI early boot code to
ensure that efi_mem_reserve() in EFI BGRT code still use memblock safely.

Tested-by: Bhupesh Sharma <bhsharma@redhat.com>
Signed-off-by: Dave Young <dyoung@redhat.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-acpi@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Link: http://lkml.kernel.org/r/1485868902-20401-9-git-send-email-ard.biesheuvel@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/acpi/boot.c      |  9 ++++++
 arch/x86/platform/efi/efi-bgrt.c | 59 +++++++++++++++++-----------------------
 arch/x86/platform/efi/efi.c      |  5 ----
 drivers/acpi/bgrt.c              | 28 +++++++++++++------
 include/linux/efi-bgrt.h         | 11 ++++----
 init/main.c                      |  1 -
 6 files changed, 59 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 64422f850e95..7ff007ed899d 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -35,6 +35,7 @@
 #include <linux/bootmem.h>
 #include <linux/ioport.h>
 #include <linux/pci.h>
+#include <linux/efi-bgrt.h>
 
 #include <asm/irqdomain.h>
 #include <asm/pci_x86.h>
@@ -1557,6 +1558,12 @@ int __init early_acpi_boot_init(void)
 	return 0;
 }
 
+static int __init acpi_parse_bgrt(struct acpi_table_header *table)
+{
+	efi_bgrt_init(table);
+	return 0;
+}
+
 int __init acpi_boot_init(void)
 {
 	/* those are executed after early-quirks are executed */
@@ -1581,6 +1588,8 @@ int __init acpi_boot_init(void)
 	acpi_process_madt();
 
 	acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet);
+	if (IS_ENABLED(CONFIG_ACPI_BGRT))
+		acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt);
 
 	if (!acpi_noirq)
 		x86_init.pci.init = pci_acpi_init;
diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c
index 6aad870e8962..04ca8764f0c0 100644
--- a/arch/x86/platform/efi/efi-bgrt.c
+++ b/arch/x86/platform/efi/efi-bgrt.c
@@ -19,8 +19,7 @@
 #include <linux/efi.h>
 #include <linux/efi-bgrt.h>
 
-struct acpi_table_bgrt *bgrt_tab;
-void *__initdata bgrt_image;
+struct acpi_table_bgrt bgrt_tab;
 size_t __initdata bgrt_image_size;
 
 struct bmp_header {
@@ -28,66 +27,58 @@ struct bmp_header {
 	u32 size;
 } __packed;
 
-void __init efi_bgrt_init(void)
+void __init efi_bgrt_init(struct acpi_table_header *table)
 {
-	acpi_status status;
 	void *image;
 	struct bmp_header bmp_header;
+	struct acpi_table_bgrt *bgrt = &bgrt_tab;
 
 	if (acpi_disabled)
 		return;
 
-	status = acpi_get_table("BGRT", 0,
-	                        (struct acpi_table_header **)&bgrt_tab);
-	if (ACPI_FAILURE(status))
-		return;
-
-	if (bgrt_tab->header.length < sizeof(*bgrt_tab)) {
+	if (table->length < sizeof(bgrt_tab)) {
 		pr_notice("Ignoring BGRT: invalid length %u (expected %zu)\n",
-		       bgrt_tab->header.length, sizeof(*bgrt_tab));
+		       table->length, sizeof(bgrt_tab));
 		return;
 	}
-	if (bgrt_tab->version != 1) {
+	*bgrt = *(struct acpi_table_bgrt *)table;
+	if (bgrt->version != 1) {
 		pr_notice("Ignoring BGRT: invalid version %u (expected 1)\n",
-		       bgrt_tab->version);
-		return;
+		       bgrt->version);
+		goto out;
 	}
-	if (bgrt_tab->status & 0xfe) {
+	if (bgrt->status & 0xfe) {
 		pr_notice("Ignoring BGRT: reserved status bits are non-zero %u\n",
-		       bgrt_tab->status);
-		return;
+		       bgrt->status);
+		goto out;
 	}
-	if (bgrt_tab->image_type != 0) {
+	if (bgrt->image_type != 0) {
 		pr_notice("Ignoring BGRT: invalid image type %u (expected 0)\n",
-		       bgrt_tab->image_type);
-		return;
+		       bgrt->image_type);
+		goto out;
 	}
-	if (!bgrt_tab->image_address) {
+	if (!bgrt->image_address) {
 		pr_notice("Ignoring BGRT: null image address\n");
-		return;
+		goto out;
 	}
 
-	image = memremap(bgrt_tab->image_address, sizeof(bmp_header), MEMREMAP_WB);
+	image = early_memremap(bgrt->image_address, sizeof(bmp_header));
 	if (!image) {
 		pr_notice("Ignoring BGRT: failed to map image header memory\n");
-		return;
+		goto out;
 	}
 
 	memcpy(&bmp_header, image, sizeof(bmp_header));
-	memunmap(image);
+	early_memunmap(image, sizeof(bmp_header));
 	if (bmp_header.id != 0x4d42) {
 		pr_notice("Ignoring BGRT: Incorrect BMP magic number 0x%x (expected 0x4d42)\n",
 			bmp_header.id);
-		return;
+		goto out;
 	}
 	bgrt_image_size = bmp_header.size;
+	efi_mem_reserve(bgrt->image_address, bgrt_image_size);
 
-	bgrt_image = memremap(bgrt_tab->image_address, bmp_header.size, MEMREMAP_WB);
-	if (!bgrt_image) {
-		pr_notice("Ignoring BGRT: failed to map image memory\n");
-		bgrt_image = NULL;
-		return;
-	}
-
-	efi_mem_reserve(bgrt_tab->image_address, bgrt_image_size);
+	return;
+out:
+	memset(bgrt, 0, sizeof(bgrt_tab));
 }
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 274dfc481849..0d4becfc5145 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -542,11 +542,6 @@ void __init efi_init(void)
 		efi_print_memmap();
 }
 
-void __init efi_late_init(void)
-{
-	efi_bgrt_init();
-}
-
 void __init efi_set_executable(efi_memory_desc_t *md, bool executable)
 {
 	u64 addr, npages;
diff --git a/drivers/acpi/bgrt.c b/drivers/acpi/bgrt.c
index 75f128e766a9..ca28aa572aa9 100644
--- a/drivers/acpi/bgrt.c
+++ b/drivers/acpi/bgrt.c
@@ -15,40 +15,41 @@
 #include <linux/sysfs.h>
 #include <linux/efi-bgrt.h>
 
+static void *bgrt_image;
 static struct kobject *bgrt_kobj;
 
 static ssize_t show_version(struct device *dev,
 			    struct device_attribute *attr, char *buf)
 {
-	return snprintf(buf, PAGE_SIZE, "%d\n", bgrt_tab->version);
+	return snprintf(buf, PAGE_SIZE, "%d\n", bgrt_tab.version);
 }
 static DEVICE_ATTR(version, S_IRUGO, show_version, NULL);
 
 static ssize_t show_status(struct device *dev,
 			   struct device_attribute *attr, char *buf)
 {
-	return snprintf(buf, PAGE_SIZE, "%d\n", bgrt_tab->status);
+	return snprintf(buf, PAGE_SIZE, "%d\n", bgrt_tab.status);
 }
 static DEVICE_ATTR(status, S_IRUGO, show_status, NULL);
 
 static ssize_t show_type(struct device *dev,
 			 struct device_attribute *attr, char *buf)
 {
-	return snprintf(buf, PAGE_SIZE, "%d\n", bgrt_tab->image_type);
+	return snprintf(buf, PAGE_SIZE, "%d\n", bgrt_tab.image_type);
 }
 static DEVICE_ATTR(type, S_IRUGO, show_type, NULL);
 
 static ssize_t show_xoffset(struct device *dev,
 			    struct device_attribute *attr, char *buf)
 {
-	return snprintf(buf, PAGE_SIZE, "%d\n", bgrt_tab->image_offset_x);
+	return snprintf(buf, PAGE_SIZE, "%d\n", bgrt_tab.image_offset_x);
 }
 static DEVICE_ATTR(xoffset, S_IRUGO, show_xoffset, NULL);
 
 static ssize_t show_yoffset(struct device *dev,
 			    struct device_attribute *attr, char *buf)
 {
-	return snprintf(buf, PAGE_SIZE, "%d\n", bgrt_tab->image_offset_y);
+	return snprintf(buf, PAGE_SIZE, "%d\n", bgrt_tab.image_offset_y);
 }
 static DEVICE_ATTR(yoffset, S_IRUGO, show_yoffset, NULL);
 
@@ -84,15 +85,24 @@ static int __init bgrt_init(void)
 {
 	int ret;
 
-	if (!bgrt_image)
+	if (!bgrt_tab.image_address)
 		return -ENODEV;
 
+	bgrt_image = memremap(bgrt_tab.image_address, bgrt_image_size,
+			      MEMREMAP_WB);
+	if (!bgrt_image) {
+		pr_notice("Ignoring BGRT: failed to map image memory\n");
+		return -ENOMEM;
+	}
+
 	bin_attr_image.private = bgrt_image;
 	bin_attr_image.size = bgrt_image_size;
 
 	bgrt_kobj = kobject_create_and_add("bgrt", acpi_kobj);
-	if (!bgrt_kobj)
-		return -EINVAL;
+	if (!bgrt_kobj) {
+		ret = -EINVAL;
+		goto out_memmap;
+	}
 
 	ret = sysfs_create_group(bgrt_kobj, &bgrt_attribute_group);
 	if (ret)
@@ -102,6 +112,8 @@ static int __init bgrt_init(void)
 
 out_kobject:
 	kobject_put(bgrt_kobj);
+out_memmap:
+	memunmap(bgrt_image);
 	return ret;
 }
 device_initcall(bgrt_init);
diff --git a/include/linux/efi-bgrt.h b/include/linux/efi-bgrt.h
index 051b21fedf68..2fd3993c370b 100644
--- a/include/linux/efi-bgrt.h
+++ b/include/linux/efi-bgrt.h
@@ -1,20 +1,19 @@
 #ifndef _LINUX_EFI_BGRT_H
 #define _LINUX_EFI_BGRT_H
 
-#ifdef CONFIG_ACPI_BGRT
-
 #include <linux/acpi.h>
 
-void efi_bgrt_init(void);
+#ifdef CONFIG_ACPI_BGRT
+
+void efi_bgrt_init(struct acpi_table_header *table);
 
 /* The BGRT data itself; only valid if bgrt_image != NULL. */
-extern void *bgrt_image;
 extern size_t bgrt_image_size;
-extern struct acpi_table_bgrt *bgrt_tab;
+extern struct acpi_table_bgrt bgrt_tab;
 
 #else /* !CONFIG_ACPI_BGRT */
 
-static inline void efi_bgrt_init(void) {}
+static inline void efi_bgrt_init(struct acpi_table_header *table) {}
 
 #endif /* !CONFIG_ACPI_BGRT */
 
diff --git a/init/main.c b/init/main.c
index b0c9d6facef9..9648d707eea5 100644
--- a/init/main.c
+++ b/init/main.c
@@ -663,7 +663,6 @@ asmlinkage __visible void __init start_kernel(void)
 	sfi_init_late();
 
 	if (efi_enabled(EFI_RUNTIME_SERVICES)) {
-		efi_late_init();
 		efi_free_boot_services();
 	}
 
-- 
cgit v1.2.3


From 07e5f5e353aaa61696c8353d87050994a0c4648a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 31 Jan 2017 04:09:17 +0100
Subject: time: Introduce jiffies64_to_nsecs()

This will be needed for the cputime_t to nsec conversion.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1485832191-26889-2-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/jiffies.h  |  2 ++
 kernel/time/time.c       | 10 ++++++++++
 kernel/time/timeconst.bc |  6 ++++++
 3 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 589d14e970ad..624215cebee5 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -293,6 +293,8 @@ static inline u64 jiffies_to_nsecs(const unsigned long j)
 	return (u64)jiffies_to_usecs(j) * NSEC_PER_USEC;
 }
 
+extern u64 jiffies64_to_nsecs(u64 j);
+
 extern unsigned long __msecs_to_jiffies(const unsigned int m);
 #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
 /*
diff --git a/kernel/time/time.c b/kernel/time/time.c
index a3a9a8a029dc..25bdd2504571 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -702,6 +702,16 @@ u64 nsec_to_clock_t(u64 x)
 #endif
 }
 
+u64 jiffies64_to_nsecs(u64 j)
+{
+#if !(NSEC_PER_SEC % HZ)
+	return (NSEC_PER_SEC / HZ) * j;
+# else
+	return div_u64(j * HZ_TO_NSEC_NUM, HZ_TO_NSEC_DEN);
+#endif
+}
+EXPORT_SYMBOL(jiffies64_to_nsecs);
+
 /**
  * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
  *
diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc
index c48688904f9f..f83bbb81600b 100644
--- a/kernel/time/timeconst.bc
+++ b/kernel/time/timeconst.bc
@@ -98,6 +98,12 @@ define timeconst(hz) {
 		print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n"
 		print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n"
 		print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n"
+
+		cd=gcd(hz,1000000000)
+		print "#define HZ_TO_NSEC_NUM\t\t", 1000000000/cd, "\n"
+		print "#define HZ_TO_NSEC_DEN\t\t", hz/cd, "\n"
+		print "#define NSEC_TO_HZ_NUM\t\t", hz/cd, "\n"
+		print "#define NSEC_TO_HZ_DEN\t\t", 1000000000/cd, "\n"
 		print "\n"
 
 		print "#endif /* KERNEL_TIMECONST_H */\n"
-- 
cgit v1.2.3


From ba03ce822db234f8acb559de4a317a5c1f95c029 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 31 Jan 2017 04:09:18 +0100
Subject: sched/cputime: Remove the unused INIT_CPUTIME macro

It's a leftover from removed code.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1485832191-26889-3-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 69e6852fede1..5f60aed37701 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -620,13 +620,6 @@ struct task_cputime {
 #define prof_exp	stime
 #define sched_exp	sum_exec_runtime
 
-#define INIT_CPUTIME	\
-	(struct task_cputime) {					\
-		.utime = 0,					\
-		.stime = 0,					\
-		.sum_exec_runtime = 0,				\
-	}
-
 /*
  * This is the atomic variant of task_cputime, which can be used for
  * storing and updating task_cputime statistics without locking.
-- 
cgit v1.2.3


From 16a6d9be90373fb0b521850cd0185a4d460dd152 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 31 Jan 2017 04:09:21 +0100
Subject: sched/cputime: Convert guest time accounting to nsecs (u64)

cputime_t is being obsolete and replaced by nsecs units in order to make
internal timestamps less opaque and more granular.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1485832191-26889-6-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 fs/proc/array.c        |  6 +++---
 include/linux/sched.h  | 10 +++++-----
 kernel/sched/cputime.c |  8 ++++----
 3 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 51a4213afa2e..25b54cf0c042 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -402,7 +402,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	unsigned long cmin_flt = 0, cmaj_flt = 0;
 	unsigned long  min_flt = 0,  maj_flt = 0;
 	cputime_t cutime, cstime, utime, stime;
-	cputime_t cgtime, gtime;
+	u64 cgtime, gtime;
 	unsigned long rsslim = 0;
 	char tcomm[sizeof(task->comm)];
 	unsigned long flags;
@@ -542,8 +542,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	seq_put_decimal_ull(m, " ", task->rt_priority);
 	seq_put_decimal_ull(m, " ", task->policy);
 	seq_put_decimal_ull(m, " ", delayacct_blkio_ticks(task));
-	seq_put_decimal_ull(m, " ", cputime_to_clock_t(gtime));
-	seq_put_decimal_ll(m, " ", cputime_to_clock_t(cgtime));
+	seq_put_decimal_ull(m, " ", nsec_to_clock_t(gtime));
+	seq_put_decimal_ll(m, " ", nsec_to_clock_t(cgtime));
 
 	if (mm && permitted) {
 		seq_put_decimal_ull(m, " ", mm->start_data);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5f60aed37701..252ff25983c8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -774,8 +774,8 @@ struct signal_struct {
 	 */
 	seqlock_t stats_lock;
 	cputime_t utime, stime, cutime, cstime;
-	cputime_t gtime;
-	cputime_t cgtime;
+	u64 gtime;
+	u64 cgtime;
 	struct prev_cputime prev_cputime;
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
 	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
@@ -1658,7 +1658,7 @@ struct task_struct {
 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
 	cputime_t utimescaled, stimescaled;
 #endif
-	cputime_t gtime;
+	u64 gtime;
 	struct prev_cputime prev_cputime;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 	seqcount_t vtime_seqcount;
@@ -2254,7 +2254,7 @@ struct task_struct *try_get_task_struct(struct task_struct **ptask);
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 extern void task_cputime(struct task_struct *t,
 			 cputime_t *utime, cputime_t *stime);
-extern cputime_t task_gtime(struct task_struct *t);
+extern u64 task_gtime(struct task_struct *t);
 #else
 static inline void task_cputime(struct task_struct *t,
 				cputime_t *utime, cputime_t *stime)
@@ -2263,7 +2263,7 @@ static inline void task_cputime(struct task_struct *t,
 	*stime = t->stime;
 }
 
-static inline cputime_t task_gtime(struct task_struct *t)
+static inline u64 task_gtime(struct task_struct *t)
 {
 	return t->gtime;
 }
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 61e270926e94..8bcd98e2b821 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -158,7 +158,7 @@ void account_guest_time(struct task_struct *p, cputime_t cputime)
 	/* Add guest time to process. */
 	p->utime += cputime;
 	account_group_user_time(p, cputime);
-	p->gtime += cputime;
+	p->gtime += cputime_to_nsecs(cputime);
 
 	/* Add guest time to cpustat. */
 	if (task_nice(p) > 0) {
@@ -824,10 +824,10 @@ void vtime_init_idle(struct task_struct *t, int cpu)
 	local_irq_restore(flags);
 }
 
-cputime_t task_gtime(struct task_struct *t)
+u64 task_gtime(struct task_struct *t)
 {
 	unsigned int seq;
-	cputime_t gtime;
+	u64 gtime;
 
 	if (!vtime_accounting_enabled())
 		return t->gtime;
@@ -837,7 +837,7 @@ cputime_t task_gtime(struct task_struct *t)
 
 		gtime = t->gtime;
 		if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU)
-			gtime += vtime_delta(t);
+			gtime += cputime_to_nsecs(vtime_delta(t));
 
 	} while (read_seqcount_retry(&t->vtime_seqcount, seq));
 
-- 
cgit v1.2.3


From a1cecf2ba78e0a6de00ff99df34b662728535aa5 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 31 Jan 2017 04:09:22 +0100
Subject: sched/cputime: Introduce special task_cputime_t() API to return
 old-typed cputime

This API returns a task's cputime in cputime_t in order to ease the
conversion of cputime internals to use nsecs units instead. Blindly
converting all cputime readers to use this API now will later let us
convert more smoothly and step by step all these places to use the
new nsec based cputime.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1485832191-26889-7-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/alpha/kernel/osf_sys.c    |  2 +-
 arch/x86/kernel/apm_32.c       |  2 +-
 drivers/isdn/mISDN/stack.c     |  2 +-
 fs/binfmt_elf.c                |  6 +++---
 fs/binfmt_elf_fdpic.c          |  6 +++---
 include/linux/sched.h          | 32 ++++++++++++++++++++++++++---
 kernel/acct.c                  |  2 +-
 kernel/delayacct.c             |  4 ++--
 kernel/signal.c                |  4 ++--
 kernel/time/itimer.c           |  2 +-
 kernel/time/posix-cpu-timers.c | 46 +++++++++++++++++++++---------------------
 kernel/tsacct.c                |  6 +++---
 12 files changed, 70 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 54d8616644e2..0f92438d736b 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -1154,7 +1154,7 @@ SYSCALL_DEFINE2(osf_getrusage, int, who, struct rusage32 __user *, ru)
 	memset(&r, 0, sizeof(r));
 	switch (who) {
 	case RUSAGE_SELF:
-		task_cputime(current, &utime, &stime);
+		task_cputime_t(current, &utime, &stime);
 		utime_jiffies = cputime_to_jiffies(utime);
 		stime_jiffies = cputime_to_jiffies(stime);
 		jiffies_to_timeval32(utime_jiffies, &r.ru_utime);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 45d44c173cf9..89c84fcdd3c0 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -913,7 +913,7 @@ static int apm_cpu_idle(struct cpuidle_device *dev,
 	unsigned int bucket;
 
 recalc:
-	task_cputime(current, &utime, &stime);
+	task_cputime_t(current, &utime, &stime);
 	if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
 		use_apm_idle = 0;
 	} else if (jiffies_since_last_check > idle_period) {
diff --git a/drivers/isdn/mISDN/stack.c b/drivers/isdn/mISDN/stack.c
index 9cb4b621fbc3..0a3661767531 100644
--- a/drivers/isdn/mISDN/stack.c
+++ b/drivers/isdn/mISDN/stack.c
@@ -306,7 +306,7 @@ mISDNStackd(void *data)
 	       "msg %d sleep %d stopped\n",
 	       dev_name(&st->dev->dev), st->msg_cnt, st->sleep_cnt,
 	       st->stopped_cnt);
-	task_cputime(st->thread, &utime, &stime);
+	task_cputime_t(st->thread, &utime, &stime);
 	printk(KERN_DEBUG
 	       "mISDNStackd daemon for %s utime(%ld) stime(%ld)\n",
 	       dev_name(&st->dev->dev), utime, stime);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 422370293cfd..68b915650cae 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1421,19 +1421,19 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 	prstatus->pr_pgrp = task_pgrp_vnr(p);
 	prstatus->pr_sid = task_session_vnr(p);
 	if (thread_group_leader(p)) {
-		struct task_cputime cputime;
+		struct task_cputime_t cputime;
 
 		/*
 		 * This is the record for the group leader.  It shows the
 		 * group-wide total, not its individual thread total.
 		 */
-		thread_group_cputime(p, &cputime);
+		thread_group_cputime_t(p, &cputime);
 		cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
 		cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
 	} else {
 		cputime_t utime, stime;
 
-		task_cputime(p, &utime, &stime);
+		task_cputime_t(p, &utime, &stime);
 		cputime_to_timeval(utime, &prstatus->pr_utime);
 		cputime_to_timeval(stime, &prstatus->pr_stime);
 	}
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index d2e36f82c35d..6ccd9df7247a 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1342,19 +1342,19 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 	prstatus->pr_pgrp = task_pgrp_vnr(p);
 	prstatus->pr_sid = task_session_vnr(p);
 	if (thread_group_leader(p)) {
-		struct task_cputime cputime;
+		struct task_cputime_t cputime;
 
 		/*
 		 * This is the record for the group leader.  It shows the
 		 * group-wide total, not its individual thread total.
 		 */
-		thread_group_cputime(p, &cputime);
+		thread_group_cputime_t(p, &cputime);
 		cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
 		cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
 	} else {
 		cputime_t utime, stime;
 
-		task_cputime(p, &utime, &stime);
+		task_cputime_t(p, &utime, &stime);
 		cputime_to_timeval(utime, &prstatus->pr_utime);
 		cputime_to_timeval(stime, &prstatus->pr_stime);
 	}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 252ff25983c8..9cc722f77799 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -615,6 +615,13 @@ struct task_cputime {
 	unsigned long long sum_exec_runtime;
 };
 
+/* Temporary type to ease cputime_t to nsecs conversion */
+struct task_cputime_t {
+	cputime_t utime;
+	cputime_t stime;
+	unsigned long long sum_exec_runtime;
+};
+
 /* Alternate field names when used to cache expirations. */
 #define virt_exp	utime
 #define prof_exp	stime
@@ -748,7 +755,7 @@ struct signal_struct {
 	struct thread_group_cputimer cputimer;
 
 	/* Earliest-expiration cache. */
-	struct task_cputime cputime_expires;
+	struct task_cputime_t cputime_expires;
 
 #ifdef CONFIG_NO_HZ_FULL
 	atomic_t tick_dep_mask;
@@ -1682,7 +1689,7 @@ struct task_struct {
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
 	unsigned long min_flt, maj_flt;
 
-	struct task_cputime cputime_expires;
+	struct task_cputime_t cputime_expires;
 	struct list_head cpu_timers[3];
 
 /* process credentials */
@@ -2286,6 +2293,19 @@ static inline void task_cputime_scaled(struct task_struct *t,
 }
 #endif
 
+static inline void task_cputime_t(struct task_struct *t,
+				  cputime_t *utime, cputime_t *stime)
+{
+	task_cputime(t, utime, stime);
+}
+
+static inline void task_cputime_t_scaled(struct task_struct *t,
+					 cputime_t *utimescaled,
+					 cputime_t *stimescaled)
+{
+	task_cputime_scaled(t, utimescaled, stimescaled);
+}
+
 extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
 extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
 
@@ -3499,7 +3519,13 @@ static __always_inline bool need_resched(void)
  * Thread group CPU time accounting.
  */
 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
-void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
+void thread_group_cputimer(struct task_struct *tsk, struct task_cputime_t *times);
+
+static inline void thread_group_cputime_t(struct task_struct *tsk,
+					  struct task_cputime_t *times)
+{
+	thread_group_cputime(tsk, (struct task_cputime *)times);
+}
 
 /*
  * Reevaluate whether the task has signals pending delivery.
diff --git a/kernel/acct.c b/kernel/acct.c
index 74963d192c5d..b9b190a8eecf 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -559,7 +559,7 @@ void acct_collect(long exitcode, int group_dead)
 		pacct->ac_flag |= ACORE;
 	if (current->flags & PF_SIGNALED)
 		pacct->ac_flag |= AXSIG;
-	task_cputime(current, &utime, &stime);
+	task_cputime_t(current, &utime, &stime);
 	pacct->ac_utime += utime;
 	pacct->ac_stime += stime;
 	pacct->ac_minflt += current->min_flt;
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 435c14a45118..228640f2b3d2 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -87,12 +87,12 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	unsigned long flags, t1;
 	s64 tmp;
 
-	task_cputime(tsk, &utime, &stime);
+	task_cputime_t(tsk, &utime, &stime);
 	tmp = (s64)d->cpu_run_real_total;
 	tmp += cputime_to_nsecs(utime + stime);
 	d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
 
-	task_cputime_scaled(tsk, &utimescaled, &stimescaled);
+	task_cputime_t_scaled(tsk, &utimescaled, &stimescaled);
 	tmp = (s64)d->cpu_scaled_run_real_total;
 	tmp += cputime_to_nsecs(utimescaled + stimescaled);
 	d->cpu_scaled_run_real_total =
diff --git a/kernel/signal.c b/kernel/signal.c
index 3603d93a1968..218048a837ea 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1619,7 +1619,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
 				       task_uid(tsk));
 	rcu_read_unlock();
 
-	task_cputime(tsk, &utime, &stime);
+	task_cputime_t(tsk, &utime, &stime);
 	info.si_utime = cputime_to_clock_t(utime + tsk->signal->utime);
 	info.si_stime = cputime_to_clock_t(stime + tsk->signal->stime);
 
@@ -1704,7 +1704,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
 	info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
 	rcu_read_unlock();
 
-	task_cputime(tsk, &utime, &stime);
+	task_cputime_t(tsk, &utime, &stime);
 	info.si_utime = cputime_to_clock_t(utime);
 	info.si_stime = cputime_to_clock_t(stime);
 
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 8c89143f9ebf..f2d5097bcb6d 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -53,7 +53,7 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
 	cval = it->expires;
 	cinterval = it->incr;
 	if (cval) {
-		struct task_cputime cputime;
+		struct task_cputime_t cputime;
 		cputime_t t;
 
 		thread_group_cputimer(tsk, &cputime);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index e9e8c10f0d9a..d53ff711a2a8 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -115,7 +115,7 @@ static void bump_cpu_timer(struct k_itimer *timer,
  * Checks @cputime to see if all fields are zero.  Returns true if all fields
  * are zero, false if any field is nonzero.
  */
-static inline int task_cputime_zero(const struct task_cputime *cputime)
+static inline int task_cputime_zero(const struct task_cputime_t *cputime)
 {
 	if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
 		return 1;
@@ -126,7 +126,7 @@ static inline unsigned long long prof_ticks(struct task_struct *p)
 {
 	cputime_t utime, stime;
 
-	task_cputime(p, &utime, &stime);
+	task_cputime_t(p, &utime, &stime);
 
 	return cputime_to_expires(utime + stime);
 }
@@ -134,7 +134,7 @@ static inline unsigned long long virt_ticks(struct task_struct *p)
 {
 	cputime_t utime, stime;
 
-	task_cputime(p, &utime, &stime);
+	task_cputime_t(p, &utime, &stime);
 
 	return cputime_to_expires(utime);
 }
@@ -210,7 +210,7 @@ retry:
 	}
 }
 
-static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum)
+static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime_t *sum)
 {
 	__update_gt_cputime(&cputime_atomic->utime, sum->utime);
 	__update_gt_cputime(&cputime_atomic->stime, sum->stime);
@@ -218,7 +218,7 @@ static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct
 }
 
 /* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */
-static inline void sample_cputime_atomic(struct task_cputime *times,
+static inline void sample_cputime_atomic(struct task_cputime_t *times,
 					 struct task_cputime_atomic *atomic_times)
 {
 	times->utime = atomic64_read(&atomic_times->utime);
@@ -226,10 +226,10 @@ static inline void sample_cputime_atomic(struct task_cputime *times,
 	times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime);
 }
 
-void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
+void thread_group_cputimer(struct task_struct *tsk, struct task_cputime_t *times)
 {
 	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
-	struct task_cputime sum;
+	struct task_cputime_t sum;
 
 	/* Check if cputimer isn't running. This is accessed without locking. */
 	if (!READ_ONCE(cputimer->running)) {
@@ -238,7 +238,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 		 * values through the TIMER_ABSTIME flag, therefore we have
 		 * to synchronize the timer to the clock every time we start it.
 		 */
-		thread_group_cputime(tsk, &sum);
+		thread_group_cputime_t(tsk, &sum);
 		update_gt_cputime(&cputimer->cputime_atomic, &sum);
 
 		/*
@@ -262,21 +262,21 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
 				  struct task_struct *p,
 				  unsigned long long *sample)
 {
-	struct task_cputime cputime;
+	struct task_cputime_t cputime;
 
 	switch (CPUCLOCK_WHICH(which_clock)) {
 	default:
 		return -EINVAL;
 	case CPUCLOCK_PROF:
-		thread_group_cputime(p, &cputime);
+		thread_group_cputime_t(p, &cputime);
 		*sample = cputime_to_expires(cputime.utime + cputime.stime);
 		break;
 	case CPUCLOCK_VIRT:
-		thread_group_cputime(p, &cputime);
+		thread_group_cputime_t(p, &cputime);
 		*sample = cputime_to_expires(cputime.utime);
 		break;
 	case CPUCLOCK_SCHED:
-		thread_group_cputime(p, &cputime);
+		thread_group_cputime_t(p, &cputime);
 		*sample = cputime.sum_exec_runtime;
 		break;
 	}
@@ -466,7 +466,7 @@ static void arm_timer(struct k_itimer *timer)
 {
 	struct task_struct *p = timer->it.cpu.task;
 	struct list_head *head, *listpos;
-	struct task_cputime *cputime_expires;
+	struct task_cputime_t *cputime_expires;
 	struct cpu_timer_list *const nt = &timer->it.cpu;
 	struct cpu_timer_list *next;
 
@@ -562,7 +562,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
 				  struct task_struct *p,
 				  unsigned long long *sample)
 {
-	struct task_cputime cputime;
+	struct task_cputime_t cputime;
 
 	thread_group_cputimer(p, &cputime);
 	switch (CPUCLOCK_WHICH(which_clock)) {
@@ -761,7 +761,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 		/*
 		 * Protect against sighand release/switch in exit/exec and
 		 * also make timer sampling safe if it ends up calling
-		 * thread_group_cputime().
+		 * thread_group_cputime_t().
 		 */
 		sighand = lock_task_sighand(p, &flags);
 		if (unlikely(sighand == NULL)) {
@@ -826,7 +826,7 @@ static void check_thread_timers(struct task_struct *tsk,
 {
 	struct list_head *timers = tsk->cpu_timers;
 	struct signal_struct *const sig = tsk->signal;
-	struct task_cputime *tsk_expires = &tsk->cputime_expires;
+	struct task_cputime_t *tsk_expires = &tsk->cputime_expires;
 	unsigned long long expires;
 	unsigned long soft;
 
@@ -934,7 +934,7 @@ static void check_process_timers(struct task_struct *tsk,
 	unsigned long long utime, ptime, virt_expires, prof_expires;
 	unsigned long long sum_sched_runtime, sched_expires;
 	struct list_head *timers = sig->cpu_timers;
-	struct task_cputime cputime;
+	struct task_cputime_t cputime;
 	unsigned long soft;
 
 	/*
@@ -1037,7 +1037,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
 	} else {
 		/*
 		 * Protect arm_timer() and timer sampling in case of call to
-		 * thread_group_cputime().
+		 * thread_group_cputime_t().
 		 */
 		sighand = lock_task_sighand(p, &flags);
 		if (unlikely(sighand == NULL)) {
@@ -1080,8 +1080,8 @@ out:
  * Returns true if any field of the former is greater than the corresponding
  * field of the latter if the latter field is set.  Otherwise returns false.
  */
-static inline int task_cputime_expired(const struct task_cputime *sample,
-					const struct task_cputime *expires)
+static inline int task_cputime_expired(const struct task_cputime_t *sample,
+					const struct task_cputime_t *expires)
 {
 	if (expires->utime && sample->utime >= expires->utime)
 		return 1;
@@ -1108,9 +1108,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 	struct signal_struct *sig;
 
 	if (!task_cputime_zero(&tsk->cputime_expires)) {
-		struct task_cputime task_sample;
+		struct task_cputime_t task_sample;
 
-		task_cputime(tsk, &task_sample.utime, &task_sample.stime);
+		task_cputime_t(tsk, &task_sample.utime, &task_sample.stime);
 		task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime;
 		if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
 			return 1;
@@ -1133,7 +1133,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 	 */
 	if (READ_ONCE(sig->cputimer.running) &&
 	    !READ_ONCE(sig->cputimer.checking_timer)) {
-		struct task_cputime group_sample;
+		struct task_cputime_t group_sample;
 
 		sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic);
 
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index f8e26ab963ed..040d0a64d0d1 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -66,11 +66,11 @@ void bacct_add_tsk(struct user_namespace *user_ns,
 		task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0;
 	rcu_read_unlock();
 
-	task_cputime(tsk, &utime, &stime);
+	task_cputime_t(tsk, &utime, &stime);
 	stats->ac_utime = cputime_to_usecs(utime);
 	stats->ac_stime = cputime_to_usecs(stime);
 
-	task_cputime_scaled(tsk, &utimescaled, &stimescaled);
+	task_cputime_t_scaled(tsk, &utimescaled, &stimescaled);
 	stats->ac_utimescaled = cputime_to_usecs(utimescaled);
 	stats->ac_stimescaled = cputime_to_usecs(stimescaled);
 
@@ -159,7 +159,7 @@ void acct_update_integrals(struct task_struct *tsk)
 	unsigned long flags;
 
 	local_irq_save(flags);
-	task_cputime(tsk, &utime, &stime);
+	task_cputime_t(tsk, &utime, &stime);
 	__acct_update_integrals(tsk, utime, stime);
 	local_irq_restore(flags);
 }
-- 
cgit v1.2.3


From 5613fda9a503cd6137b120298902a34a1386b2c1 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 31 Jan 2017 04:09:23 +0100
Subject: sched/cputime: Convert task/group cputime to nsecs

Now that most cputime readers use the transition API which return the
task cputime in old style cputime_t, we can safely store the cputime in
nsecs. This will eventually make cputime statistics less opaque and more
granular. Back and forth convertions between cputime_t and nsecs in order
to deal with cputime_t random granularity won't be needed anymore.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1485832191-26889-8-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/alpha/kernel/osf_sys.c |  4 ++--
 arch/powerpc/kernel/time.c  |  4 ++--
 arch/s390/kernel/vtime.c    |  6 ++---
 arch/x86/kvm/hyperv.c       |  5 +++--
 fs/binfmt_elf.c             | 11 +++++++--
 fs/binfmt_elf_fdpic.c       |  4 ++--
 fs/proc/array.c             | 10 ++++-----
 include/linux/sched.h       | 55 ++++++++++++++++++++++++++++-----------------
 kernel/exit.c               |  4 ++--
 kernel/sched/cputime.c      | 35 ++++++++++++++---------------
 kernel/signal.c             |  4 ++--
 kernel/sys.c                | 16 ++++++-------
 12 files changed, 89 insertions(+), 69 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 0f92438d736b..82ccb43b795b 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -1163,8 +1163,8 @@ SYSCALL_DEFINE2(osf_getrusage, int, who, struct rusage32 __user *, ru)
 		r.ru_majflt = current->maj_flt;
 		break;
 	case RUSAGE_CHILDREN:
-		utime_jiffies = cputime_to_jiffies(current->signal->cutime);
-		stime_jiffies = cputime_to_jiffies(current->signal->cstime);
+		utime_jiffies = nsecs_to_jiffies(current->signal->cutime);
+		stime_jiffies = nsecs_to_jiffies(current->signal->cstime);
 		jiffies_to_timeval32(utime_jiffies, &r.ru_utime);
 		jiffies_to_timeval32(stime_jiffies, &r.ru_stime);
 		r.ru_minflt = current->signal->cmin_flt;
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 02e97305d22b..3cca82e065c9 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -396,7 +396,7 @@ void vtime_flush(struct task_struct *tsk)
 		account_user_time(tsk, acct->utime);
 
 	if (acct->utime_scaled)
-		tsk->utimescaled += acct->utime_scaled;
+		tsk->utimescaled += cputime_to_nsecs(acct->utime_scaled);
 
 	if (acct->gtime)
 		account_guest_time(tsk, acct->gtime);
@@ -411,7 +411,7 @@ void vtime_flush(struct task_struct *tsk)
 		account_system_index_time(tsk, acct->stime, CPUTIME_SYSTEM);
 
 	if (acct->stime_scaled)
-		tsk->stimescaled += acct->stime_scaled;
+		tsk->stimescaled += cputime_to_nsecs(acct->stime_scaled);
 
 	if (acct->hardirq_time)
 		account_system_index_time(tsk, acct->hardirq_time, CPUTIME_IRQ);
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index 0a9e5d67547d..f2fc27491604 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -114,7 +114,7 @@ static void account_system_index_scaled(struct task_struct *p,
 					cputime_t cputime, cputime_t scaled,
 					enum cpu_usage_stat index)
 {
-	p->stimescaled += scaled;
+	p->stimescaled += cputime_to_nsecs(scaled);
 	account_system_index_time(p, cputime, index);
 }
 
@@ -167,12 +167,12 @@ static int do_account_vtime(struct task_struct *tsk)
 	/* Push account value */
 	if (user) {
 		account_user_time(tsk, user);
-		tsk->utimescaled += scale_vtime(user);
+		tsk->utimescaled += cputime_to_nsecs(scale_vtime(user));
 	}
 
 	if (guest) {
 		account_guest_time(tsk, guest);
-		tsk->utimescaled += scale_vtime(guest);
+		tsk->utimescaled += cputime_to_nsecs(scale_vtime(guest));
 	}
 
 	if (system)
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 1572c35b4f1a..2ecd7dab4631 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -964,10 +964,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
 /* Calculate cpu time spent by current task in 100ns units */
 static u64 current_task_runtime_100ns(void)
 {
-	cputime_t utime, stime;
+	u64 utime, stime;
 
 	task_cputime_adjusted(current, &utime, &stime);
-	return div_u64(cputime_to_nsecs(utime + stime), 100);
+
+	return div_u64(utime + stime, 100);
 }
 
 static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 68b915650cae..6d451936a858 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1411,6 +1411,8 @@ static void fill_note(struct memelfnote *note, const char *name, int type,
 static void fill_prstatus(struct elf_prstatus *prstatus,
 		struct task_struct *p, long signr)
 {
+	struct timeval tv;
+
 	prstatus->pr_info.si_signo = prstatus->pr_cursig = signr;
 	prstatus->pr_sigpend = p->pending.signal.sig[0];
 	prstatus->pr_sighold = p->blocked.sig[0];
@@ -1437,8 +1439,13 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 		cputime_to_timeval(utime, &prstatus->pr_utime);
 		cputime_to_timeval(stime, &prstatus->pr_stime);
 	}
-	cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime);
-	cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime);
+	tv = ns_to_timeval(p->signal->cutime);
+	prstatus->pr_cutime.tv_sec = tv.tv_sec;
+	prstatus->pr_cutime.tv_usec = tv.tv_usec;
+
+	tv = ns_to_timeval(p->signal->cstime);
+	prstatus->pr_cstime.tv_sec = tv.tv_sec;
+	prstatus->pr_cstime.tv_usec = tv.tv_usec;
 }
 
 static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 6ccd9df7247a..e1f373460257 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1358,8 +1358,8 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 		cputime_to_timeval(utime, &prstatus->pr_utime);
 		cputime_to_timeval(stime, &prstatus->pr_stime);
 	}
-	cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime);
-	cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime);
+	prstatus->pr_cutime = ns_to_timeval(p->signal->cutime);
+	prstatus->pr_cstime = ns_to_timeval(p->signal->cstime);
 
 	prstatus->pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap;
 	prstatus->pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap;
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 25b54cf0c042..fe12b519d09b 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -401,7 +401,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	unsigned long long start_time;
 	unsigned long cmin_flt = 0, cmaj_flt = 0;
 	unsigned long  min_flt = 0,  maj_flt = 0;
-	cputime_t cutime, cstime, utime, stime;
+	u64 cutime, cstime, utime, stime;
 	u64 cgtime, gtime;
 	unsigned long rsslim = 0;
 	char tcomm[sizeof(task->comm)];
@@ -497,10 +497,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	seq_put_decimal_ull(m, " ", cmin_flt);
 	seq_put_decimal_ull(m, " ", maj_flt);
 	seq_put_decimal_ull(m, " ", cmaj_flt);
-	seq_put_decimal_ull(m, " ", cputime_to_clock_t(utime));
-	seq_put_decimal_ull(m, " ", cputime_to_clock_t(stime));
-	seq_put_decimal_ll(m, " ", cputime_to_clock_t(cutime));
-	seq_put_decimal_ll(m, " ", cputime_to_clock_t(cstime));
+	seq_put_decimal_ull(m, " ", nsec_to_clock_t(utime));
+	seq_put_decimal_ull(m, " ", nsec_to_clock_t(stime));
+	seq_put_decimal_ll(m, " ", nsec_to_clock_t(cutime));
+	seq_put_decimal_ll(m, " ", nsec_to_clock_t(cstime));
 	seq_put_decimal_ll(m, " ", priority);
 	seq_put_decimal_ll(m, " ", nice);
 	seq_put_decimal_ll(m, " ", num_threads);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9cc722f77799..b7ccc54b35cc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -585,8 +585,8 @@ struct cpu_itimer {
  */
 struct prev_cputime {
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	cputime_t utime;
-	cputime_t stime;
+	u64 utime;
+	u64 stime;
 	raw_spinlock_t lock;
 #endif
 };
@@ -601,8 +601,8 @@ static inline void prev_cputime_init(struct prev_cputime *prev)
 
 /**
  * struct task_cputime - collected CPU time counts
- * @utime:		time spent in user mode, in &cputime_t units
- * @stime:		time spent in kernel mode, in &cputime_t units
+ * @utime:		time spent in user mode, in nanoseconds
+ * @stime:		time spent in kernel mode, in nanoseconds
  * @sum_exec_runtime:	total time spent on the CPU, in nanoseconds
  *
  * This structure groups together three kinds of CPU time that are tracked for
@@ -610,8 +610,8 @@ static inline void prev_cputime_init(struct prev_cputime *prev)
  * these counts together and treat all three of them in parallel.
  */
 struct task_cputime {
-	cputime_t utime;
-	cputime_t stime;
+	u64 utime;
+	u64 stime;
 	unsigned long long sum_exec_runtime;
 };
 
@@ -780,7 +780,7 @@ struct signal_struct {
 	 * in __exit_signal, except for the group leader.
 	 */
 	seqlock_t stats_lock;
-	cputime_t utime, stime, cutime, cstime;
+	u64 utime, stime, cutime, cstime;
 	u64 gtime;
 	u64 cgtime;
 	struct prev_cputime prev_cputime;
@@ -1661,9 +1661,9 @@ struct task_struct {
 	int __user *set_child_tid;		/* CLONE_CHILD_SETTID */
 	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
 
-	cputime_t utime, stime;
+	u64 utime, stime;
 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
-	cputime_t utimescaled, stimescaled;
+	u64 utimescaled, stimescaled;
 #endif
 	u64 gtime;
 	struct prev_cputime prev_cputime;
@@ -2260,11 +2260,11 @@ struct task_struct *try_get_task_struct(struct task_struct **ptask);
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 extern void task_cputime(struct task_struct *t,
-			 cputime_t *utime, cputime_t *stime);
+			 u64 *utime, u64 *stime);
 extern u64 task_gtime(struct task_struct *t);
 #else
 static inline void task_cputime(struct task_struct *t,
-				cputime_t *utime, cputime_t *stime)
+				u64 *utime, u64 *stime)
 {
 	*utime = t->utime;
 	*stime = t->stime;
@@ -2278,16 +2278,16 @@ static inline u64 task_gtime(struct task_struct *t)
 
 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
 static inline void task_cputime_scaled(struct task_struct *t,
-				       cputime_t *utimescaled,
-				       cputime_t *stimescaled)
+				       u64 *utimescaled,
+				       u64 *stimescaled)
 {
 	*utimescaled = t->utimescaled;
 	*stimescaled = t->stimescaled;
 }
 #else
 static inline void task_cputime_scaled(struct task_struct *t,
-				       cputime_t *utimescaled,
-				       cputime_t *stimescaled)
+				       u64 *utimescaled,
+				       u64 *stimescaled)
 {
 	task_cputime(t, utimescaled, stimescaled);
 }
@@ -2296,18 +2296,26 @@ static inline void task_cputime_scaled(struct task_struct *t,
 static inline void task_cputime_t(struct task_struct *t,
 				  cputime_t *utime, cputime_t *stime)
 {
-	task_cputime(t, utime, stime);
+	u64 ut, st;
+
+	task_cputime(t, &ut, &st);
+	*utime = nsecs_to_cputime(ut);
+	*stime = nsecs_to_cputime(st);
 }
 
 static inline void task_cputime_t_scaled(struct task_struct *t,
 					 cputime_t *utimescaled,
 					 cputime_t *stimescaled)
 {
-	task_cputime_scaled(t, utimescaled, stimescaled);
+	u64 ut, st;
+
+	task_cputime_scaled(t, &ut, &st);
+	*utimescaled = nsecs_to_cputime(ut);
+	*stimescaled = nsecs_to_cputime(st);
 }
 
-extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
-extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
+extern void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st);
+extern void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st);
 
 /*
  * Per process flags
@@ -3522,9 +3530,14 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
 void thread_group_cputimer(struct task_struct *tsk, struct task_cputime_t *times);
 
 static inline void thread_group_cputime_t(struct task_struct *tsk,
-					  struct task_cputime_t *times)
+					  struct task_cputime_t *cputime)
 {
-	thread_group_cputime(tsk, (struct task_cputime *)times);
+	struct task_cputime times;
+
+	thread_group_cputime(tsk, &times);
+	cputime->utime = nsecs_to_cputime(times.utime);
+	cputime->stime = nsecs_to_cputime(times.stime);
+	cputime->sum_exec_runtime = times.sum_exec_runtime;
 }
 
 /*
diff --git a/kernel/exit.c b/kernel/exit.c
index 8f14b866f9f6..8e5e21338b3a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -86,7 +86,7 @@ static void __exit_signal(struct task_struct *tsk)
 	bool group_dead = thread_group_leader(tsk);
 	struct sighand_struct *sighand;
 	struct tty_struct *uninitialized_var(tty);
-	cputime_t utime, stime;
+	u64 utime, stime;
 
 	sighand = rcu_dereference_check(tsk->sighand,
 					lockdep_tasklist_lock_is_held());
@@ -1091,7 +1091,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 		struct signal_struct *sig = p->signal;
 		struct signal_struct *psig = current->signal;
 		unsigned long maxrss;
-		cputime_t tgutime, tgstime;
+		u64 tgutime, tgstime;
 
 		/*
 		 * The resource counters for the group leader are in its
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8bcd98e2b821..0bdef50d88bc 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -134,7 +134,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
 	int index;
 
 	/* Add user time to process. */
-	p->utime += cputime;
+	p->utime += cputime_to_nsecs(cputime);
 	account_group_user_time(p, cputime);
 
 	index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
@@ -156,7 +156,7 @@ void account_guest_time(struct task_struct *p, cputime_t cputime)
 	u64 *cpustat = kcpustat_this_cpu->cpustat;
 
 	/* Add guest time to process. */
-	p->utime += cputime;
+	p->utime += cputime_to_nsecs(cputime);
 	account_group_user_time(p, cputime);
 	p->gtime += cputime_to_nsecs(cputime);
 
@@ -180,7 +180,7 @@ void account_system_index_time(struct task_struct *p,
 			       cputime_t cputime, enum cpu_usage_stat index)
 {
 	/* Add system time to process. */
-	p->stime += cputime;
+	p->stime += cputime_to_nsecs(cputime);
 	account_group_system_time(p, cputime);
 
 	/* Add system time to cpustat. */
@@ -315,7 +315,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t)
 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 {
 	struct signal_struct *sig = tsk->signal;
-	cputime_t utime, stime;
+	u64 utime, stime;
 	struct task_struct *t;
 	unsigned int seq, nextseq;
 	unsigned long flags;
@@ -465,14 +465,14 @@ void vtime_account_irq_enter(struct task_struct *tsk)
 EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
 #endif /* __ARCH_HAS_VTIME_ACCOUNT */
 
-void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 {
 	*ut = p->utime;
 	*st = p->stime;
 }
 EXPORT_SYMBOL_GPL(task_cputime_adjusted);
 
-void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 {
 	struct task_cputime cputime;
 
@@ -543,7 +543,7 @@ void account_idle_ticks(unsigned long ticks)
  * Perform (stime * rtime) / total, but avoid multiplication overflow by
  * loosing precision when the numbers are big.
  */
-static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
+static u64 scale_stime(u64 stime, u64 rtime, u64 total)
 {
 	u64 scaled;
 
@@ -580,7 +580,7 @@ drop_precision:
 	 * followed by a 64/32->64 divide.
 	 */
 	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
-	return (__force cputime_t) scaled;
+	return scaled;
 }
 
 /*
@@ -605,14 +605,14 @@ drop_precision:
  */
 static void cputime_adjust(struct task_cputime *curr,
 			   struct prev_cputime *prev,
-			   cputime_t *ut, cputime_t *st)
+			   u64 *ut, u64 *st)
 {
-	cputime_t rtime, stime, utime;
+	u64 rtime, stime, utime;
 	unsigned long flags;
 
 	/* Serialize concurrent callers such that we can honour our guarantees */
 	raw_spin_lock_irqsave(&prev->lock, flags);
-	rtime = nsecs_to_cputime(curr->sum_exec_runtime);
+	rtime = curr->sum_exec_runtime;
 
 	/*
 	 * This is possible under two circumstances:
@@ -643,8 +643,7 @@ static void cputime_adjust(struct task_cputime *curr,
 		goto update;
 	}
 
-	stime = scale_stime((__force u64)stime, (__force u64)rtime,
-			    (__force u64)(stime + utime));
+	stime = scale_stime(stime, rtime, stime + utime);
 
 update:
 	/*
@@ -677,7 +676,7 @@ out:
 	raw_spin_unlock_irqrestore(&prev->lock, flags);
 }
 
-void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 {
 	struct task_cputime cputime = {
 		.sum_exec_runtime = p->se.sum_exec_runtime,
@@ -688,7 +687,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 }
 EXPORT_SYMBOL_GPL(task_cputime_adjusted);
 
-void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 {
 	struct task_cputime cputime;
 
@@ -849,9 +848,9 @@ u64 task_gtime(struct task_struct *t)
  * add up the pending nohz execution time since the last
  * cputime snapshot.
  */
-void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
+void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
 {
-	cputime_t delta;
+	u64 delta;
 	unsigned int seq;
 
 	if (!vtime_accounting_enabled()) {
@@ -870,7 +869,7 @@ void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
 		if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t))
 			continue;
 
-		delta = vtime_delta(t);
+		delta = cputime_to_nsecs(vtime_delta(t));
 
 		/*
 		 * Task runs either in user or kernel space, add pending nohz time to
diff --git a/kernel/signal.c b/kernel/signal.c
index 218048a837ea..b63522193076 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1620,8 +1620,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
 	rcu_read_unlock();
 
 	task_cputime_t(tsk, &utime, &stime);
-	info.si_utime = cputime_to_clock_t(utime + tsk->signal->utime);
-	info.si_stime = cputime_to_clock_t(stime + tsk->signal->stime);
+	info.si_utime = cputime_to_clock_t(utime + nsecs_to_cputime(tsk->signal->utime));
+	info.si_stime = cputime_to_clock_t(stime + nsecs_to_cputime(tsk->signal->stime));
 
 	info.si_status = tsk->exit_code & 0x7f;
 	if (tsk->exit_code & 0x80)
diff --git a/kernel/sys.c b/kernel/sys.c
index 842914ef7de4..7d4a9a6df956 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -881,15 +881,15 @@ SYSCALL_DEFINE0(getegid)
 
 void do_sys_times(struct tms *tms)
 {
-	cputime_t tgutime, tgstime, cutime, cstime;
+	u64 tgutime, tgstime, cutime, cstime;
 
 	thread_group_cputime_adjusted(current, &tgutime, &tgstime);
 	cutime = current->signal->cutime;
 	cstime = current->signal->cstime;
-	tms->tms_utime = cputime_to_clock_t(tgutime);
-	tms->tms_stime = cputime_to_clock_t(tgstime);
-	tms->tms_cutime = cputime_to_clock_t(cutime);
-	tms->tms_cstime = cputime_to_clock_t(cstime);
+	tms->tms_utime = nsec_to_clock_t(tgutime);
+	tms->tms_stime = nsec_to_clock_t(tgstime);
+	tms->tms_cutime = nsec_to_clock_t(cutime);
+	tms->tms_cstime = nsec_to_clock_t(cstime);
 }
 
 SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
@@ -1544,7 +1544,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 {
 	struct task_struct *t;
 	unsigned long flags;
-	cputime_t tgutime, tgstime, utime, stime;
+	u64 tgutime, tgstime, utime, stime;
 	unsigned long maxrss = 0;
 
 	memset((char *)r, 0, sizeof (*r));
@@ -1600,8 +1600,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 	unlock_task_sighand(p, &flags);
 
 out:
-	cputime_to_timeval(utime, &r->ru_utime);
-	cputime_to_timeval(stime, &r->ru_stime);
+	r->ru_utime = ns_to_timeval(utime);
+	r->ru_stime = ns_to_timeval(stime);
 
 	if (who != RUSAGE_CHILDREN) {
 		struct mm_struct *mm = get_task_mm(p);
-- 
cgit v1.2.3


From cd19c364b313c179410fcac8376330964cc9bfd9 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 31 Jan 2017 04:09:27 +0100
Subject: fs/binfmt: Convert obsolete cputime type to nsecs

Use the new nsec based cputime accessors as part of the whole cputime
conversion from cputime_t to nsecs.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1485832191-26889-12-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/mips/kernel/binfmt_elfn32.c  | 12 ++----------
 arch/mips/kernel/binfmt_elfo32.c  | 12 ++----------
 arch/parisc/kernel/binfmt_elf32.c | 11 ++---------
 fs/binfmt_elf.c                   | 26 ++++++++++----------------
 fs/binfmt_elf_fdpic.c             | 16 ++++++++--------
 fs/compat_binfmt_elf.c            | 18 ++----------------
 include/linux/compat.h            | 20 +++++++++++++++++++-
 7 files changed, 45 insertions(+), 70 deletions(-)

(limited to 'include/linux')

diff --git a/arch/mips/kernel/binfmt_elfn32.c b/arch/mips/kernel/binfmt_elfn32.c
index 9c7f3e136d50..4a2ff3953b99 100644
--- a/arch/mips/kernel/binfmt_elfn32.c
+++ b/arch/mips/kernel/binfmt_elfn32.c
@@ -99,15 +99,7 @@ jiffies_to_compat_timeval(unsigned long jiffies, struct compat_timeval *value)
 #undef TASK_SIZE
 #define TASK_SIZE TASK_SIZE32
 
-#undef cputime_to_timeval
-#define cputime_to_timeval cputime_to_compat_timeval
-static __inline__ void
-cputime_to_compat_timeval(const cputime_t cputime, struct compat_timeval *value)
-{
-	unsigned long jiffies = cputime_to_jiffies(cputime);
-
-	value->tv_usec = (jiffies % HZ) * (1000000L / HZ);
-	value->tv_sec = jiffies / HZ;
-}
+#undef ns_to_timeval
+#define ns_to_timeval ns_to_compat_timeval
 
 #include "../../../fs/binfmt_elf.c"
diff --git a/arch/mips/kernel/binfmt_elfo32.c b/arch/mips/kernel/binfmt_elfo32.c
index 1ab34322dd97..3916404e7fd1 100644
--- a/arch/mips/kernel/binfmt_elfo32.c
+++ b/arch/mips/kernel/binfmt_elfo32.c
@@ -102,15 +102,7 @@ jiffies_to_compat_timeval(unsigned long jiffies, struct compat_timeval *value)
 #undef TASK_SIZE
 #define TASK_SIZE TASK_SIZE32
 
-#undef cputime_to_timeval
-#define cputime_to_timeval cputime_to_compat_timeval
-static __inline__ void
-cputime_to_compat_timeval(const cputime_t cputime, struct compat_timeval *value)
-{
-	unsigned long jiffies = cputime_to_jiffies(cputime);
-
-	value->tv_usec = (jiffies % HZ) * (1000000L / HZ);
-	value->tv_sec = jiffies / HZ;
-}
+#undef ns_to_timeval
+#define ns_to_timeval ns_to_compat_timeval
 
 #include "../../../fs/binfmt_elf.c"
diff --git a/arch/parisc/kernel/binfmt_elf32.c b/arch/parisc/kernel/binfmt_elf32.c
index 00dc66f9c2ba..f2adcf33f8f2 100644
--- a/arch/parisc/kernel/binfmt_elf32.c
+++ b/arch/parisc/kernel/binfmt_elf32.c
@@ -91,14 +91,7 @@ struct elf_prpsinfo32
 	current->thread.map_base = DEFAULT_MAP_BASE32; \
 	current->thread.task_size = DEFAULT_TASK_SIZE32 \
 
-#undef cputime_to_timeval
-#define cputime_to_timeval cputime_to_compat_timeval
-static __inline__ void
-cputime_to_compat_timeval(const cputime_t cputime, struct compat_timeval *value)
-{
-	unsigned long jiffies = cputime_to_jiffies(cputime);
-	value->tv_usec = (jiffies % HZ) * (1000000L / HZ);
-	value->tv_sec = jiffies / HZ;
-}
+#undef ns_to_timeval
+#define ns_to_timeval ns_to_compat_timeval
 
 #include "../../../fs/binfmt_elf.c"
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 6d451936a858..e7bf01373bc4 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1411,8 +1411,6 @@ static void fill_note(struct memelfnote *note, const char *name, int type,
 static void fill_prstatus(struct elf_prstatus *prstatus,
 		struct task_struct *p, long signr)
 {
-	struct timeval tv;
-
 	prstatus->pr_info.si_signo = prstatus->pr_cursig = signr;
 	prstatus->pr_sigpend = p->pending.signal.sig[0];
 	prstatus->pr_sighold = p->blocked.sig[0];
@@ -1423,29 +1421,25 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 	prstatus->pr_pgrp = task_pgrp_vnr(p);
 	prstatus->pr_sid = task_session_vnr(p);
 	if (thread_group_leader(p)) {
-		struct task_cputime_t cputime;
+		struct task_cputime cputime;
 
 		/*
 		 * This is the record for the group leader.  It shows the
 		 * group-wide total, not its individual thread total.
 		 */
-		thread_group_cputime_t(p, &cputime);
-		cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
-		cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
+		thread_group_cputime(p, &cputime);
+		prstatus->pr_utime = ns_to_timeval(cputime.utime);
+		prstatus->pr_stime = ns_to_timeval(cputime.stime);
 	} else {
-		cputime_t utime, stime;
+		u64 utime, stime;
 
-		task_cputime_t(p, &utime, &stime);
-		cputime_to_timeval(utime, &prstatus->pr_utime);
-		cputime_to_timeval(stime, &prstatus->pr_stime);
+		task_cputime(p, &utime, &stime);
+		prstatus->pr_utime = ns_to_timeval(utime);
+		prstatus->pr_stime = ns_to_timeval(stime);
 	}
-	tv = ns_to_timeval(p->signal->cutime);
-	prstatus->pr_cutime.tv_sec = tv.tv_sec;
-	prstatus->pr_cutime.tv_usec = tv.tv_usec;
 
-	tv = ns_to_timeval(p->signal->cstime);
-	prstatus->pr_cstime.tv_sec = tv.tv_sec;
-	prstatus->pr_cstime.tv_usec = tv.tv_usec;
+	prstatus->pr_cutime = ns_to_timeval(p->signal->cutime);
+	prstatus->pr_cstime = ns_to_timeval(p->signal->cstime);
 }
 
 static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index e1f373460257..ffca4bbc3d63 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1342,21 +1342,21 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 	prstatus->pr_pgrp = task_pgrp_vnr(p);
 	prstatus->pr_sid = task_session_vnr(p);
 	if (thread_group_leader(p)) {
-		struct task_cputime_t cputime;
+		struct task_cputime cputime;
 
 		/*
 		 * This is the record for the group leader.  It shows the
 		 * group-wide total, not its individual thread total.
 		 */
-		thread_group_cputime_t(p, &cputime);
-		cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
-		cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
+		thread_group_cputime(p, &cputime);
+		prstatus->pr_utime = ns_to_timeval(cputime.utime);
+		prstatus->pr_stime = ns_to_timeval(cputime.stime);
 	} else {
-		cputime_t utime, stime;
+		u64 utime, stime;
 
-		task_cputime_t(p, &utime, &stime);
-		cputime_to_timeval(utime, &prstatus->pr_utime);
-		cputime_to_timeval(stime, &prstatus->pr_stime);
+		task_cputime(p, &utime, &stime);
+		prstatus->pr_utime = ns_to_timeval(utime);
+		prstatus->pr_stime = ns_to_timeval(stime);
 	}
 	prstatus->pr_cutime = ns_to_timeval(p->signal->cutime);
 	prstatus->pr_cstime = ns_to_timeval(p->signal->cstime);
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index 4d24d17bcfc1..504b3c3539dc 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -51,22 +51,8 @@
 #define elf_prstatus	compat_elf_prstatus
 #define elf_prpsinfo	compat_elf_prpsinfo
 
-/*
- * Compat version of cputime_to_compat_timeval, perhaps this
- * should be an inline in <linux/compat.h>.
- */
-static void cputime_to_compat_timeval(const cputime_t cputime,
-				      struct compat_timeval *value)
-{
-	struct timeval tv;
-	cputime_to_timeval(cputime, &tv);
-	value->tv_sec = tv.tv_sec;
-	value->tv_usec = tv.tv_usec;
-}
-
-#undef cputime_to_timeval
-#define cputime_to_timeval cputime_to_compat_timeval
-
+#undef ns_to_timeval
+#define ns_to_timeval ns_to_compat_timeval
 
 /*
  * To use this file, asm/elf.h must define compat_elf_check_arch.
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 63609398ef9f..9e40be522793 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -731,7 +731,25 @@ asmlinkage long compat_sys_fanotify_mark(int, unsigned int, __u32, __u32,
 static inline bool in_compat_syscall(void) { return is_compat_task(); }
 #endif
 
-#else
+/**
+ * ns_to_compat_timeval - Compat version of ns_to_timeval
+ * @nsec:	the nanoseconds value to be converted
+ *
+ * Returns the compat_timeval representation of the nsec parameter.
+ */
+static inline struct compat_timeval ns_to_compat_timeval(s64 nsec)
+{
+	struct timeval tv;
+	struct compat_timeval ctv;
+
+	tv = ns_to_timeval(nsec);
+	ctv.tv_sec = tv.tv_sec;
+	ctv.tv_usec = tv.tv_usec;
+
+	return ctv;
+}
+
+#else /* !CONFIG_COMPAT */
 
 #define is_compat_task() (0)
 static inline bool in_compat_syscall(void) { return false; }
-- 
cgit v1.2.3


From d4bc42af73bf297f7182ed978b19850553242195 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 31 Jan 2017 04:09:28 +0100
Subject: acct: Convert obsolete cputime type to nsecs

Use the new nsec based cputime accessors as part of the whole cputime
conversion from cputime_t to nsecs.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1485832191-26889-13-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 2 +-
 kernel/acct.c         | 9 +++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b7ccc54b35cc..75fc773c158d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -563,7 +563,7 @@ struct pacct_struct {
 	int			ac_flag;
 	long			ac_exitcode;
 	unsigned long		ac_mem;
-	cputime_t		ac_utime, ac_stime;
+	u64			ac_utime, ac_stime;
 	unsigned long		ac_minflt, ac_majflt;
 };
 
diff --git a/kernel/acct.c b/kernel/acct.c
index b9b190a8eecf..ca9cb55b5855 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -453,8 +453,8 @@ static void fill_ac(acct_t *ac)
 	spin_lock_irq(&current->sighand->siglock);
 	tty = current->signal->tty;	/* Safe as we hold the siglock */
 	ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
-	ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
-	ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
+	ac->ac_utime = encode_comp_t(nsec_to_AHZ(pacct->ac_utime));
+	ac->ac_stime = encode_comp_t(nsec_to_AHZ(pacct->ac_stime));
 	ac->ac_flag = pacct->ac_flag;
 	ac->ac_mem = encode_comp_t(pacct->ac_mem);
 	ac->ac_minflt = encode_comp_t(pacct->ac_minflt);
@@ -530,7 +530,7 @@ out:
 void acct_collect(long exitcode, int group_dead)
 {
 	struct pacct_struct *pacct = &current->signal->pacct;
-	cputime_t utime, stime;
+	u64 utime, stime;
 	unsigned long vsize = 0;
 
 	if (group_dead && current->mm) {
@@ -559,7 +559,8 @@ void acct_collect(long exitcode, int group_dead)
 		pacct->ac_flag |= ACORE;
 	if (current->flags & PF_SIGNALED)
 		pacct->ac_flag |= AXSIG;
-	task_cputime_t(current, &utime, &stime);
+
+	task_cputime(current, &utime, &stime);
 	pacct->ac_utime += utime;
 	pacct->ac_stime += stime;
 	pacct->ac_minflt += current->min_flt;
-- 
cgit v1.2.3


From 605dc2b31a2ab235570107cb650036b41e741165 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 31 Jan 2017 04:09:30 +0100
Subject: tsacct: Convert obsolete cputime type to nsecs

Use the new nsec based cputime accessors as part of the whole cputime
conversion from cputime_t to nsecs.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1485832191-26889-15-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h |  2 +-
 kernel/tsacct.c       | 27 ++++++++++++---------------
 2 files changed, 13 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 75fc773c158d..dc44366128d8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1815,7 +1815,7 @@ struct task_struct {
 #if defined(CONFIG_TASK_XACCT)
 	u64 acct_rss_mem1;	/* accumulated rss usage */
 	u64 acct_vm_mem1;	/* accumulated virtual memory usage */
-	cputime_t acct_timexpd;	/* stime + utime since last update */
+	u64 acct_timexpd;	/* stime + utime since last update */
 #endif
 #ifdef CONFIG_CPUSETS
 	nodemask_t mems_allowed;	/* Protected by alloc_lock */
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 040d0a64d0d1..5c21f0535056 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -31,7 +31,7 @@ void bacct_add_tsk(struct user_namespace *user_ns,
 		   struct taskstats *stats, struct task_struct *tsk)
 {
 	const struct cred *tcred;
-	cputime_t utime, stime, utimescaled, stimescaled;
+	u64 utime, stime, utimescaled, stimescaled;
 	u64 delta;
 
 	BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
@@ -66,13 +66,13 @@ void bacct_add_tsk(struct user_namespace *user_ns,
 		task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0;
 	rcu_read_unlock();
 
-	task_cputime_t(tsk, &utime, &stime);
-	stats->ac_utime = cputime_to_usecs(utime);
-	stats->ac_stime = cputime_to_usecs(stime);
+	task_cputime(tsk, &utime, &stime);
+	stats->ac_utime = div_u64(utime, NSEC_PER_USEC);
+	stats->ac_stime = div_u64(stime, NSEC_PER_USEC);
 
-	task_cputime_t_scaled(tsk, &utimescaled, &stimescaled);
-	stats->ac_utimescaled = cputime_to_usecs(utimescaled);
-	stats->ac_stimescaled = cputime_to_usecs(stimescaled);
+	task_cputime_scaled(tsk, &utimescaled, &stimescaled);
+	stats->ac_utimescaled = div_u64(utimescaled, NSEC_PER_USEC);
+	stats->ac_stimescaled = div_u64(stimescaled, NSEC_PER_USEC);
 
 	stats->ac_minflt = tsk->min_flt;
 	stats->ac_majflt = tsk->maj_flt;
@@ -123,18 +123,15 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 #undef MB
 
 static void __acct_update_integrals(struct task_struct *tsk,
-				    cputime_t utime, cputime_t stime)
+				    u64 utime, u64 stime)
 {
-	cputime_t time, dtime;
-	u64 delta;
+	u64 time, delta;
 
 	if (!likely(tsk->mm))
 		return;
 
 	time = stime + utime;
-	dtime = time - tsk->acct_timexpd;
-	/* Avoid division: cputime_t is often in nanoseconds already. */
-	delta = cputime_to_nsecs(dtime);
+	delta = time - tsk->acct_timexpd;
 
 	if (delta < TICK_NSEC)
 		return;
@@ -155,11 +152,11 @@ static void __acct_update_integrals(struct task_struct *tsk,
  */
 void acct_update_integrals(struct task_struct *tsk)
 {
-	cputime_t utime, stime;
+	u64 utime, stime;
 	unsigned long flags;
 
 	local_irq_save(flags);
-	task_cputime_t(tsk, &utime, &stime);
+	task_cputime(tsk, &utime, &stime);
 	__acct_update_integrals(tsk, utime, stime);
 	local_irq_restore(flags);
 }
-- 
cgit v1.2.3


From ebd7e7fc4bc63be5eaf9da903b8060b02dd711ea Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 31 Jan 2017 04:09:34 +0100
Subject: timers/posix-timers: Convert internals to use nsecs

Use the new nsec based cputime accessors as part of the whole cputime
conversion from cputime_t to nsecs.

Also convert posix-cpu-timers to use nsec based internal counters to
simplify it.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1485832191-26889-19-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/posix-timers.h   |  12 +--
 include/linux/sched.h          |   6 +-
 kernel/fork.c                  |   2 +-
 kernel/sched/cputime.c         |   6 +-
 kernel/sched/stats.h           |   4 +-
 kernel/time/itimer.c           |   6 +-
 kernel/time/posix-cpu-timers.c | 210 +++++++++++++++++------------------------
 7 files changed, 100 insertions(+), 146 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 62d44c176071..890de52362d0 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -8,19 +8,9 @@
 #include <linux/alarmtimer.h>
 
 
-static inline unsigned long long cputime_to_expires(cputime_t expires)
-{
-	return (__force unsigned long long)expires;
-}
-
-static inline cputime_t expires_to_cputime(unsigned long long expires)
-{
-	return (__force cputime_t)expires;
-}
-
 struct cpu_timer_list {
 	struct list_head entry;
-	unsigned long long expires, incr;
+	u64 expires, incr;
 	struct task_struct *task;
 	int firing;
 };
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dc44366128d8..baa6a2834e0f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -755,7 +755,7 @@ struct signal_struct {
 	struct thread_group_cputimer cputimer;
 
 	/* Earliest-expiration cache. */
-	struct task_cputime_t cputime_expires;
+	struct task_cputime cputime_expires;
 
 #ifdef CONFIG_NO_HZ_FULL
 	atomic_t tick_dep_mask;
@@ -1689,7 +1689,7 @@ struct task_struct {
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
 	unsigned long min_flt, maj_flt;
 
-	struct task_cputime_t cputime_expires;
+	struct task_cputime cputime_expires;
 	struct list_head cpu_timers[3];
 
 /* process credentials */
@@ -3527,7 +3527,7 @@ static __always_inline bool need_resched(void)
  * Thread group CPU time accounting.
  */
 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
-void thread_group_cputimer(struct task_struct *tsk, struct task_cputime_t *times);
+void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
 
 static inline void thread_group_cputime_t(struct task_struct *tsk,
 					  struct task_cputime_t *cputime)
diff --git a/kernel/fork.c b/kernel/fork.c
index 11c5c8ab827c..09992ff2f8fa 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1313,7 +1313,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
 
 	cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
 	if (cpu_limit != RLIM_INFINITY) {
-		sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
+		sig->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC;
 		sig->cputimer.running = true;
 	}
 
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index bee6c97b1e83..f7b9624c7df0 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -122,7 +122,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
 
 	/* Add user time to process. */
 	p->utime += cputime_to_nsecs(cputime);
-	account_group_user_time(p, cputime);
+	account_group_user_time(p, cputime_to_nsecs(cputime));
 
 	index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
 
@@ -144,7 +144,7 @@ void account_guest_time(struct task_struct *p, cputime_t cputime)
 
 	/* Add guest time to process. */
 	p->utime += cputime_to_nsecs(cputime);
-	account_group_user_time(p, cputime);
+	account_group_user_time(p, cputime_to_nsecs(cputime));
 	p->gtime += cputime_to_nsecs(cputime);
 
 	/* Add guest time to cpustat. */
@@ -168,7 +168,7 @@ void account_system_index_time(struct task_struct *p,
 {
 	/* Add system time to process. */
 	p->stime += cputime_to_nsecs(cputime);
-	account_group_system_time(p, cputime);
+	account_group_system_time(p, cputime_to_nsecs(cputime));
 
 	/* Add system time to cpustat. */
 	task_group_account_field(p, index, cputime_to_nsecs(cputime));
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 34659a853505..9788478a66d4 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -216,7 +216,7 @@ static inline bool cputimer_running(struct task_struct *tsk)
  * running CPU and update the utime field there.
  */
 static inline void account_group_user_time(struct task_struct *tsk,
-					   cputime_t cputime)
+					   u64 cputime)
 {
 	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 
@@ -237,7 +237,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
  * running CPU and update the stime field there.
  */
 static inline void account_group_system_time(struct task_struct *tsk,
-					     cputime_t cputime)
+					     u64 cputime)
 {
 	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index f2d5097bcb6d..bb01ff445ce9 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -53,15 +53,15 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
 	cval = it->expires;
 	cinterval = it->incr;
 	if (cval) {
-		struct task_cputime_t cputime;
+		struct task_cputime cputime;
 		cputime_t t;
 
 		thread_group_cputimer(tsk, &cputime);
 		if (clock_id == CPUCLOCK_PROF)
-			t = cputime.utime + cputime.stime;
+			t = nsecs_to_cputime(cputime.utime + cputime.stime);
 		else
 			/* CPUCLOCK_VIRT */
-			t = cputime.utime;
+			t = nsecs_to_cputime(cputime.utime);
 
 		if (cval < t)
 			/* about to fire */
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 8349e02b1c0c..45be3cec0dc2 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -50,39 +50,14 @@ static int check_clock(const clockid_t which_clock)
 	return error;
 }
 
-static inline unsigned long long
-timespec_to_sample(const clockid_t which_clock, const struct timespec *tp)
-{
-	unsigned long long ret;
-
-	ret = 0;		/* high half always zero when .cpu used */
-	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
-		ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
-	} else {
-		ret = cputime_to_expires(timespec_to_cputime(tp));
-	}
-	return ret;
-}
-
-static void sample_to_timespec(const clockid_t which_clock,
-			       unsigned long long expires,
-			       struct timespec *tp)
-{
-	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED)
-		*tp = ns_to_timespec(expires);
-	else
-		cputime_to_timespec((__force cputime_t)expires, tp);
-}
-
 /*
  * Update expiry time from increment, and increase overrun count,
  * given the current clock sample.
  */
-static void bump_cpu_timer(struct k_itimer *timer,
-			   unsigned long long now)
+static void bump_cpu_timer(struct k_itimer *timer, u64 now)
 {
 	int i;
-	unsigned long long delta, incr;
+	u64 delta, incr;
 
 	if (timer->it.cpu.incr == 0)
 		return;
@@ -115,28 +90,28 @@ static void bump_cpu_timer(struct k_itimer *timer,
  * Checks @cputime to see if all fields are zero.  Returns true if all fields
  * are zero, false if any field is nonzero.
  */
-static inline int task_cputime_zero(const struct task_cputime_t *cputime)
+static inline int task_cputime_zero(const struct task_cputime *cputime)
 {
 	if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
 		return 1;
 	return 0;
 }
 
-static inline unsigned long long prof_ticks(struct task_struct *p)
+static inline u64 prof_ticks(struct task_struct *p)
 {
-	cputime_t utime, stime;
+	u64 utime, stime;
 
-	task_cputime_t(p, &utime, &stime);
+	task_cputime(p, &utime, &stime);
 
-	return cputime_to_expires(utime + stime);
+	return utime + stime;
 }
-static inline unsigned long long virt_ticks(struct task_struct *p)
+static inline u64 virt_ticks(struct task_struct *p)
 {
-	cputime_t utime, stime;
+	u64 utime, stime;
 
-	task_cputime_t(p, &utime, &stime);
+	task_cputime(p, &utime, &stime);
 
-	return cputime_to_expires(utime);
+	return utime;
 }
 
 static int
@@ -176,8 +151,8 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
 /*
  * Sample a per-thread clock for the given task.
  */
-static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
-			    unsigned long long *sample)
+static int cpu_clock_sample(const clockid_t which_clock,
+			    struct task_struct *p, u64 *sample)
 {
 	switch (CPUCLOCK_WHICH(which_clock)) {
 	default:
@@ -210,7 +185,7 @@ retry:
 	}
 }
 
-static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime_t *sum)
+static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum)
 {
 	__update_gt_cputime(&cputime_atomic->utime, sum->utime);
 	__update_gt_cputime(&cputime_atomic->stime, sum->stime);
@@ -218,7 +193,7 @@ static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct
 }
 
 /* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */
-static inline void sample_cputime_atomic(struct task_cputime_t *times,
+static inline void sample_cputime_atomic(struct task_cputime *times,
 					 struct task_cputime_atomic *atomic_times)
 {
 	times->utime = atomic64_read(&atomic_times->utime);
@@ -226,10 +201,10 @@ static inline void sample_cputime_atomic(struct task_cputime_t *times,
 	times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime);
 }
 
-void thread_group_cputimer(struct task_struct *tsk, struct task_cputime_t *times)
+void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 {
 	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
-	struct task_cputime_t sum;
+	struct task_cputime sum;
 
 	/* Check if cputimer isn't running. This is accessed without locking. */
 	if (!READ_ONCE(cputimer->running)) {
@@ -238,7 +213,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime_t *times
 		 * values through the TIMER_ABSTIME flag, therefore we have
 		 * to synchronize the timer to the clock every time we start it.
 		 */
-		thread_group_cputime_t(tsk, &sum);
+		thread_group_cputime(tsk, &sum);
 		update_gt_cputime(&cputimer->cputime_atomic, &sum);
 
 		/*
@@ -260,23 +235,23 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime_t *times
  */
 static int cpu_clock_sample_group(const clockid_t which_clock,
 				  struct task_struct *p,
-				  unsigned long long *sample)
+				  u64 *sample)
 {
-	struct task_cputime_t cputime;
+	struct task_cputime cputime;
 
 	switch (CPUCLOCK_WHICH(which_clock)) {
 	default:
 		return -EINVAL;
 	case CPUCLOCK_PROF:
-		thread_group_cputime_t(p, &cputime);
-		*sample = cputime_to_expires(cputime.utime + cputime.stime);
+		thread_group_cputime(p, &cputime);
+		*sample = cputime.utime + cputime.stime;
 		break;
 	case CPUCLOCK_VIRT:
-		thread_group_cputime_t(p, &cputime);
-		*sample = cputime_to_expires(cputime.utime);
+		thread_group_cputime(p, &cputime);
+		*sample = cputime.utime;
 		break;
 	case CPUCLOCK_SCHED:
-		thread_group_cputime_t(p, &cputime);
+		thread_group_cputime(p, &cputime);
 		*sample = cputime.sum_exec_runtime;
 		break;
 	}
@@ -288,7 +263,7 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk,
 				    struct timespec *tp)
 {
 	int err = -EINVAL;
-	unsigned long long rtn;
+	u64 rtn;
 
 	if (CPUCLOCK_PERTHREAD(which_clock)) {
 		if (same_thread_group(tsk, current))
@@ -299,7 +274,7 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk,
 	}
 
 	if (!err)
-		sample_to_timespec(which_clock, rtn, tp);
+		*tp = ns_to_timespec(rtn);
 
 	return err;
 }
@@ -453,7 +428,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
 	cleanup_timers(tsk->signal->cpu_timers);
 }
 
-static inline int expires_gt(cputime_t expires, cputime_t new_exp)
+static inline int expires_gt(u64 expires, u64 new_exp)
 {
 	return expires == 0 || expires > new_exp;
 }
@@ -466,7 +441,7 @@ static void arm_timer(struct k_itimer *timer)
 {
 	struct task_struct *p = timer->it.cpu.task;
 	struct list_head *head, *listpos;
-	struct task_cputime_t *cputime_expires;
+	struct task_cputime *cputime_expires;
 	struct cpu_timer_list *const nt = &timer->it.cpu;
 	struct cpu_timer_list *next;
 
@@ -488,7 +463,7 @@ static void arm_timer(struct k_itimer *timer)
 	list_add(&nt->entry, listpos);
 
 	if (listpos == head) {
-		unsigned long long exp = nt->expires;
+		u64 exp = nt->expires;
 
 		/*
 		 * We are the new earliest-expiring POSIX 1.b timer, hence
@@ -499,16 +474,15 @@ static void arm_timer(struct k_itimer *timer)
 
 		switch (CPUCLOCK_WHICH(timer->it_clock)) {
 		case CPUCLOCK_PROF:
-			if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp)))
-				cputime_expires->prof_exp = expires_to_cputime(exp);
+			if (expires_gt(cputime_expires->prof_exp, exp))
+				cputime_expires->prof_exp = exp;
 			break;
 		case CPUCLOCK_VIRT:
-			if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp)))
-				cputime_expires->virt_exp = expires_to_cputime(exp);
+			if (expires_gt(cputime_expires->virt_exp, exp))
+				cputime_expires->virt_exp = exp;
 			break;
 		case CPUCLOCK_SCHED:
-			if (cputime_expires->sched_exp == 0 ||
-			    cputime_expires->sched_exp > exp)
+			if (expires_gt(cputime_expires->sched_exp, exp))
 				cputime_expires->sched_exp = exp;
 			break;
 		}
@@ -559,20 +533,19 @@ static void cpu_timer_fire(struct k_itimer *timer)
  * traversal.
  */
 static int cpu_timer_sample_group(const clockid_t which_clock,
-				  struct task_struct *p,
-				  unsigned long long *sample)
+				  struct task_struct *p, u64 *sample)
 {
-	struct task_cputime_t cputime;
+	struct task_cputime cputime;
 
 	thread_group_cputimer(p, &cputime);
 	switch (CPUCLOCK_WHICH(which_clock)) {
 	default:
 		return -EINVAL;
 	case CPUCLOCK_PROF:
-		*sample = cputime_to_expires(cputime.utime + cputime.stime);
+		*sample = cputime.utime + cputime.stime;
 		break;
 	case CPUCLOCK_VIRT:
-		*sample = cputime_to_expires(cputime.utime);
+		*sample = cputime.utime;
 		break;
 	case CPUCLOCK_SCHED:
 		*sample = cputime.sum_exec_runtime;
@@ -593,12 +566,12 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
 	unsigned long flags;
 	struct sighand_struct *sighand;
 	struct task_struct *p = timer->it.cpu.task;
-	unsigned long long old_expires, new_expires, old_incr, val;
+	u64 old_expires, new_expires, old_incr, val;
 	int ret;
 
 	WARN_ON_ONCE(p == NULL);
 
-	new_expires = timespec_to_sample(timer->it_clock, &new->it_value);
+	new_expires = timespec_to_ns(&new->it_value);
 
 	/*
 	 * Protect against sighand release/switch in exit/exec and p->cpu_timers
@@ -659,9 +632,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
 			bump_cpu_timer(timer, val);
 			if (val < timer->it.cpu.expires) {
 				old_expires = timer->it.cpu.expires - val;
-				sample_to_timespec(timer->it_clock,
-						   old_expires,
-						   &old->it_value);
+				old->it_value = ns_to_timespec(old_expires);
 			} else {
 				old->it_value.tv_nsec = 1;
 				old->it_value.tv_sec = 0;
@@ -699,8 +670,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
 	 * Install the new reload setting, and
 	 * set up the signal and overrun bookkeeping.
 	 */
-	timer->it.cpu.incr = timespec_to_sample(timer->it_clock,
-						&new->it_interval);
+	timer->it.cpu.incr = timespec_to_ns(&new->it_interval);
 
 	/*
 	 * This acts as a modification timestamp for the timer,
@@ -723,17 +693,15 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
 
 	ret = 0;
  out:
-	if (old) {
-		sample_to_timespec(timer->it_clock,
-				   old_incr, &old->it_interval);
-	}
+	if (old)
+		old->it_interval = ns_to_timespec(old_incr);
 
 	return ret;
 }
 
 static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 {
-	unsigned long long now;
+	u64 now;
 	struct task_struct *p = timer->it.cpu.task;
 
 	WARN_ON_ONCE(p == NULL);
@@ -741,8 +709,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 	/*
 	 * Easy part: convert the reload time.
 	 */
-	sample_to_timespec(timer->it_clock,
-			   timer->it.cpu.incr, &itp->it_interval);
+	itp->it_interval = ns_to_timespec(timer->it.cpu.incr);
 
 	if (timer->it.cpu.expires == 0) {	/* Timer not armed at all.  */
 		itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
@@ -761,7 +728,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 		/*
 		 * Protect against sighand release/switch in exit/exec and
 		 * also make timer sampling safe if it ends up calling
-		 * thread_group_cputime_t().
+		 * thread_group_cputime().
 		 */
 		sighand = lock_task_sighand(p, &flags);
 		if (unlikely(sighand == NULL)) {
@@ -771,8 +738,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 			 * Call the timer disarmed, nothing else to do.
 			 */
 			timer->it.cpu.expires = 0;
-			sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
-					   &itp->it_value);
+			itp->it_value = ns_to_timespec(timer->it.cpu.expires);
 			return;
 		} else {
 			cpu_timer_sample_group(timer->it_clock, p, &now);
@@ -781,9 +747,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 	}
 
 	if (now < timer->it.cpu.expires) {
-		sample_to_timespec(timer->it_clock,
-				   timer->it.cpu.expires - now,
-				   &itp->it_value);
+		itp->it_value = ns_to_timespec(timer->it.cpu.expires - now);
 	} else {
 		/*
 		 * The timer should have expired already, but the firing
@@ -826,8 +790,8 @@ static void check_thread_timers(struct task_struct *tsk,
 {
 	struct list_head *timers = tsk->cpu_timers;
 	struct signal_struct *const sig = tsk->signal;
-	struct task_cputime_t *tsk_expires = &tsk->cputime_expires;
-	unsigned long long expires;
+	struct task_cputime *tsk_expires = &tsk->cputime_expires;
+	u64 expires;
 	unsigned long soft;
 
 	/*
@@ -838,10 +802,10 @@ static void check_thread_timers(struct task_struct *tsk,
 		return;
 
 	expires = check_timers_list(timers, firing, prof_ticks(tsk));
-	tsk_expires->prof_exp = expires_to_cputime(expires);
+	tsk_expires->prof_exp = expires;
 
 	expires = check_timers_list(++timers, firing, virt_ticks(tsk));
-	tsk_expires->virt_exp = expires_to_cputime(expires);
+	tsk_expires->virt_exp = expires;
 
 	tsk_expires->sched_exp = check_timers_list(++timers, firing,
 						   tsk->se.sum_exec_runtime);
@@ -891,13 +855,12 @@ static inline void stop_process_timers(struct signal_struct *sig)
 }
 
 static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
-			     unsigned long long *expires,
-			     unsigned long long cur_time, int signo)
+			     u64 *expires, u64 cur_time, int signo)
 {
 	if (!it->expires)
 		return;
 
-	if (cur_time >= it->expires) {
+	if (cur_time >= cputime_to_nsecs(it->expires)) {
 		if (it->incr) {
 			it->expires += it->incr;
 			it->error += it->incr_error;
@@ -915,8 +878,8 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
 		__group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
 	}
 
-	if (it->expires && (!*expires || it->expires < *expires)) {
-		*expires = it->expires;
+	if (it->expires && (!*expires || cputime_to_nsecs(it->expires) < *expires)) {
+		*expires = cputime_to_nsecs(it->expires);
 	}
 }
 
@@ -929,10 +892,10 @@ static void check_process_timers(struct task_struct *tsk,
 				 struct list_head *firing)
 {
 	struct signal_struct *const sig = tsk->signal;
-	unsigned long long utime, ptime, virt_expires, prof_expires;
-	unsigned long long sum_sched_runtime, sched_expires;
+	u64 utime, ptime, virt_expires, prof_expires;
+	u64 sum_sched_runtime, sched_expires;
 	struct list_head *timers = sig->cpu_timers;
-	struct task_cputime_t cputime;
+	struct task_cputime cputime;
 	unsigned long soft;
 
 	/*
@@ -952,8 +915,8 @@ static void check_process_timers(struct task_struct *tsk,
 	 * Collect the current process totals.
 	 */
 	thread_group_cputimer(tsk, &cputime);
-	utime = cputime_to_expires(cputime.utime);
-	ptime = utime + cputime_to_expires(cputime.stime);
+	utime = cputime.utime;
+	ptime = utime + cputime.stime;
 	sum_sched_runtime = cputime.sum_exec_runtime;
 
 	prof_expires = check_timers_list(timers, firing, ptime);
@@ -969,10 +932,10 @@ static void check_process_timers(struct task_struct *tsk,
 			 SIGVTALRM);
 	soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
 	if (soft != RLIM_INFINITY) {
-		unsigned long psecs = cputime_to_secs(ptime);
+		unsigned long psecs = div_u64(ptime, NSEC_PER_SEC);
 		unsigned long hard =
 			READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
-		cputime_t x;
+		u64 x;
 		if (psecs >= hard) {
 			/*
 			 * At the hard limit, we just die.
@@ -991,14 +954,13 @@ static void check_process_timers(struct task_struct *tsk,
 				sig->rlim[RLIMIT_CPU].rlim_cur = soft;
 			}
 		}
-		x = secs_to_cputime(soft);
-		if (!prof_expires || x < prof_expires) {
+		x = soft * NSEC_PER_SEC;
+		if (!prof_expires || x < prof_expires)
 			prof_expires = x;
-		}
 	}
 
-	sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires);
-	sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires);
+	sig->cputime_expires.prof_exp = prof_expires;
+	sig->cputime_expires.virt_exp = virt_expires;
 	sig->cputime_expires.sched_exp = sched_expires;
 	if (task_cputime_zero(&sig->cputime_expires))
 		stop_process_timers(sig);
@@ -1015,7 +977,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
 	struct sighand_struct *sighand;
 	unsigned long flags;
 	struct task_struct *p = timer->it.cpu.task;
-	unsigned long long now;
+	u64 now;
 
 	WARN_ON_ONCE(p == NULL);
 
@@ -1035,7 +997,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
 	} else {
 		/*
 		 * Protect arm_timer() and timer sampling in case of call to
-		 * thread_group_cputime_t().
+		 * thread_group_cputime().
 		 */
 		sighand = lock_task_sighand(p, &flags);
 		if (unlikely(sighand == NULL)) {
@@ -1078,8 +1040,8 @@ out:
  * Returns true if any field of the former is greater than the corresponding
  * field of the latter if the latter field is set.  Otherwise returns false.
  */
-static inline int task_cputime_expired(const struct task_cputime_t *sample,
-					const struct task_cputime_t *expires)
+static inline int task_cputime_expired(const struct task_cputime *sample,
+					const struct task_cputime *expires)
 {
 	if (expires->utime && sample->utime >= expires->utime)
 		return 1;
@@ -1106,9 +1068,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 	struct signal_struct *sig;
 
 	if (!task_cputime_zero(&tsk->cputime_expires)) {
-		struct task_cputime_t task_sample;
+		struct task_cputime task_sample;
 
-		task_cputime_t(tsk, &task_sample.utime, &task_sample.stime);
+		task_cputime(tsk, &task_sample.utime, &task_sample.stime);
 		task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime;
 		if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
 			return 1;
@@ -1131,7 +1093,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 	 */
 	if (READ_ONCE(sig->cputimer.running) &&
 	    !READ_ONCE(sig->cputimer.checking_timer)) {
-		struct task_cputime_t group_sample;
+		struct task_cputime group_sample;
 
 		sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic);
 
@@ -1214,7 +1176,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 			   cputime_t *newval, cputime_t *oldval)
 {
-	unsigned long long now;
+	u64 now, new;
 
 	WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED);
 	cpu_timer_sample_group(clock_idx, tsk, &now);
@@ -1226,31 +1188,33 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 		 * it to be absolute.
 		 */
 		if (*oldval) {
-			if (*oldval <= now) {
+			if (cputime_to_nsecs(*oldval) <= now) {
 				/* Just about to fire. */
 				*oldval = cputime_one_jiffy;
 			} else {
-				*oldval -= now;
+				*oldval -= nsecs_to_cputime(now);
 			}
 		}
 
 		if (!*newval)
 			return;
-		*newval += now;
+		*newval += nsecs_to_cputime(now);
 	}
 
+	new = cputime_to_nsecs(*newval);
+
 	/*
 	 * Update expiration cache if we are the earliest timer, or eventually
 	 * RLIMIT_CPU limit is earlier than prof_exp cpu timer expire.
 	 */
 	switch (clock_idx) {
 	case CPUCLOCK_PROF:
-		if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval))
-			tsk->signal->cputime_expires.prof_exp = *newval;
+		if (expires_gt(tsk->signal->cputime_expires.prof_exp, new))
+			tsk->signal->cputime_expires.prof_exp = new;
 		break;
 	case CPUCLOCK_VIRT:
-		if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval))
-			tsk->signal->cputime_expires.virt_exp = *newval;
+		if (expires_gt(tsk->signal->cputime_expires.virt_exp, new))
+			tsk->signal->cputime_expires.virt_exp = new;
 		break;
 	}
 
@@ -1308,7 +1272,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 		/*
 		 * We were interrupted by a signal.
 		 */
-		sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
+		*rqtp = ns_to_timespec(timer.it.cpu.expires);
 		error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
 		if (!error) {
 			/*
-- 
cgit v1.2.3


From 858cf3a8c59968e7c5f7c1a1192459a0d52d1ab4 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 31 Jan 2017 04:09:35 +0100
Subject: timers/itimer: Convert internal cputime_t units to nsec

Use the new nsec based cputime accessors as part of the whole cputime
conversion from cputime_t to nsecs.

Also convert itimers to use nsec based internal counters. This simplifies
it and removes the whole game with error/inc_error which served to deal
with cputime_t random granularity.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1485832191-26889-20-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/posix-timers.h   |  2 +-
 include/linux/sched.h          |  6 ++--
 include/trace/events/timer.h   | 26 ++++++++---------
 kernel/time/itimer.c           | 64 +++++++++++++++---------------------------
 kernel/time/posix-cpu-timers.c | 43 +++++++++++-----------------
 5 files changed, 55 insertions(+), 86 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 890de52362d0..64aa189efe21 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -119,7 +119,7 @@ void run_posix_cpu_timers(struct task_struct *task);
 void posix_cpu_timers_exit(struct task_struct *task);
 void posix_cpu_timers_exit_group(struct task_struct *task);
 void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx,
-			   cputime_t *newval, cputime_t *oldval);
+			   u64 *newval, u64 *oldval);
 
 long clock_nanosleep_restart(struct restart_block *restart_block);
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index baa6a2834e0f..268fdd713089 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -568,10 +568,8 @@ struct pacct_struct {
 };
 
 struct cpu_itimer {
-	cputime_t expires;
-	cputime_t incr;
-	u32 error;
-	u32 incr_error;
+	u64 expires;
+	u64 incr;
 };
 
 /**
diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h
index 1448637616d6..1bca99dbb98f 100644
--- a/include/trace/events/timer.h
+++ b/include/trace/events/timer.h
@@ -269,17 +269,17 @@ DEFINE_EVENT(hrtimer_class, hrtimer_cancel,
 TRACE_EVENT(itimer_state,
 
 	TP_PROTO(int which, const struct itimerval *const value,
-		 cputime_t expires),
+		 unsigned long long expires),
 
 	TP_ARGS(which, value, expires),
 
 	TP_STRUCT__entry(
-		__field(	int,		which		)
-		__field(	cputime_t,	expires		)
-		__field(	long,		value_sec	)
-		__field(	long,		value_usec	)
-		__field(	long,		interval_sec	)
-		__field(	long,		interval_usec	)
+		__field(	int,			which		)
+		__field(	unsigned long long,	expires		)
+		__field(	long,			value_sec	)
+		__field(	long,			value_usec	)
+		__field(	long,			interval_sec	)
+		__field(	long,			interval_usec	)
 	),
 
 	TP_fast_assign(
@@ -292,7 +292,7 @@ TRACE_EVENT(itimer_state,
 	),
 
 	TP_printk("which=%d expires=%llu it_value=%ld.%ld it_interval=%ld.%ld",
-		  __entry->which, (unsigned long long)__entry->expires,
+		  __entry->which, __entry->expires,
 		  __entry->value_sec, __entry->value_usec,
 		  __entry->interval_sec, __entry->interval_usec)
 );
@@ -305,14 +305,14 @@ TRACE_EVENT(itimer_state,
  */
 TRACE_EVENT(itimer_expire,
 
-	TP_PROTO(int which, struct pid *pid, cputime_t now),
+	TP_PROTO(int which, struct pid *pid, unsigned long long now),
 
 	TP_ARGS(which, pid, now),
 
 	TP_STRUCT__entry(
-		__field( int ,		which	)
-		__field( pid_t,		pid	)
-		__field( cputime_t,	now	)
+		__field( int ,			which	)
+		__field( pid_t,			pid	)
+		__field( unsigned long long,	now	)
 	),
 
 	TP_fast_assign(
@@ -322,7 +322,7 @@ TRACE_EVENT(itimer_expire,
 	),
 
 	TP_printk("which=%d pid=%d now=%llu", __entry->which,
-		  (int) __entry->pid, (unsigned long long)__entry->now)
+		  (int) __entry->pid, __entry->now)
 );
 
 #ifdef CONFIG_NO_HZ_COMMON
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index bb01ff445ce9..a95f13c31464 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -45,35 +45,35 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer)
 static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
 			   struct itimerval *const value)
 {
-	cputime_t cval, cinterval;
+	u64 val, interval;
 	struct cpu_itimer *it = &tsk->signal->it[clock_id];
 
 	spin_lock_irq(&tsk->sighand->siglock);
 
-	cval = it->expires;
-	cinterval = it->incr;
-	if (cval) {
+	val = it->expires;
+	interval = it->incr;
+	if (val) {
 		struct task_cputime cputime;
-		cputime_t t;
+		u64 t;
 
 		thread_group_cputimer(tsk, &cputime);
 		if (clock_id == CPUCLOCK_PROF)
-			t = nsecs_to_cputime(cputime.utime + cputime.stime);
+			t = cputime.utime + cputime.stime;
 		else
 			/* CPUCLOCK_VIRT */
-			t = nsecs_to_cputime(cputime.utime);
+			t = cputime.utime;
 
-		if (cval < t)
+		if (val < t)
 			/* about to fire */
-			cval = cputime_one_jiffy;
+			val = TICK_NSEC;
 		else
-			cval = cval - t;
+			val -= t;
 	}
 
 	spin_unlock_irq(&tsk->sighand->siglock);
 
-	cputime_to_timeval(cval, &value->it_value);
-	cputime_to_timeval(cinterval, &value->it_interval);
+	value->it_value = ns_to_timeval(val);
+	value->it_interval = ns_to_timeval(interval);
 }
 
 int do_getitimer(int which, struct itimerval *value)
@@ -129,55 +129,35 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
-static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns)
-{
-	struct timespec ts;
-	s64 cpu_ns;
-
-	cputime_to_timespec(ct, &ts);
-	cpu_ns = timespec_to_ns(&ts);
-
-	return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns;
-}
-
 static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
 			   const struct itimerval *const value,
 			   struct itimerval *const ovalue)
 {
-	cputime_t cval, nval, cinterval, ninterval;
-	s64 ns_ninterval, ns_nval;
-	u32 error, incr_error;
+	u64 oval, nval, ointerval, ninterval;
 	struct cpu_itimer *it = &tsk->signal->it[clock_id];
 
-	nval = timeval_to_cputime(&value->it_value);
-	ns_nval = timeval_to_ns(&value->it_value);
-	ninterval = timeval_to_cputime(&value->it_interval);
-	ns_ninterval = timeval_to_ns(&value->it_interval);
-
-	error = cputime_sub_ns(nval, ns_nval);
-	incr_error = cputime_sub_ns(ninterval, ns_ninterval);
+	nval = timeval_to_ns(&value->it_value);
+	ninterval = timeval_to_ns(&value->it_interval);
 
 	spin_lock_irq(&tsk->sighand->siglock);
 
-	cval = it->expires;
-	cinterval = it->incr;
-	if (cval || nval) {
+	oval = it->expires;
+	ointerval = it->incr;
+	if (oval || nval) {
 		if (nval > 0)
-			nval += cputime_one_jiffy;
-		set_process_cpu_timer(tsk, clock_id, &nval, &cval);
+			nval += TICK_NSEC;
+		set_process_cpu_timer(tsk, clock_id, &nval, &oval);
 	}
 	it->expires = nval;
 	it->incr = ninterval;
-	it->error = error;
-	it->incr_error = incr_error;
 	trace_itimer_state(clock_id == CPUCLOCK_VIRT ?
 			   ITIMER_VIRTUAL : ITIMER_PROF, value, nval);
 
 	spin_unlock_irq(&tsk->sighand->siglock);
 
 	if (ovalue) {
-		cputime_to_timeval(cval, &ovalue->it_value);
-		cputime_to_timeval(cinterval, &ovalue->it_interval);
+		ovalue->it_value = ns_to_timeval(oval);
+		ovalue->it_interval = ns_to_timeval(ointerval);
 	}
 }
 
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 45be3cec0dc2..b4377a5e4269 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -20,10 +20,10 @@
  */
 void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
 {
-	cputime_t cputime = secs_to_cputime(rlim_new);
+	u64 nsecs = rlim_new * NSEC_PER_SEC;
 
 	spin_lock_irq(&task->sighand->siglock);
-	set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL);
+	set_process_cpu_timer(task, CPUCLOCK_PROF, &nsecs, NULL);
 	spin_unlock_irq(&task->sighand->siglock);
 }
 
@@ -860,17 +860,11 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
 	if (!it->expires)
 		return;
 
-	if (cur_time >= cputime_to_nsecs(it->expires)) {
-		if (it->incr) {
+	if (cur_time >= it->expires) {
+		if (it->incr)
 			it->expires += it->incr;
-			it->error += it->incr_error;
-			if (it->error >= TICK_NSEC) {
-				it->expires -= cputime_one_jiffy;
-				it->error -= TICK_NSEC;
-			}
-		} else {
+		else
 			it->expires = 0;
-		}
 
 		trace_itimer_expire(signo == SIGPROF ?
 				    ITIMER_PROF : ITIMER_VIRTUAL,
@@ -878,9 +872,8 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
 		__group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
 	}
 
-	if (it->expires && (!*expires || cputime_to_nsecs(it->expires) < *expires)) {
-		*expires = cputime_to_nsecs(it->expires);
-	}
+	if (it->expires && (!*expires || it->expires < *expires))
+		*expires = it->expires;
 }
 
 /*
@@ -1174,9 +1167,9 @@ void run_posix_cpu_timers(struct task_struct *tsk)
  * The tsk->sighand->siglock must be held by the caller.
  */
 void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
-			   cputime_t *newval, cputime_t *oldval)
+			   u64 *newval, u64 *oldval)
 {
-	u64 now, new;
+	u64 now;
 
 	WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED);
 	cpu_timer_sample_group(clock_idx, tsk, &now);
@@ -1188,33 +1181,31 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 		 * it to be absolute.
 		 */
 		if (*oldval) {
-			if (cputime_to_nsecs(*oldval) <= now) {
+			if (*oldval <= now) {
 				/* Just about to fire. */
-				*oldval = cputime_one_jiffy;
+				*oldval = TICK_NSEC;
 			} else {
-				*oldval -= nsecs_to_cputime(now);
+				*oldval -= now;
 			}
 		}
 
 		if (!*newval)
 			return;
-		*newval += nsecs_to_cputime(now);
+		*newval += now;
 	}
 
-	new = cputime_to_nsecs(*newval);
-
 	/*
 	 * Update expiration cache if we are the earliest timer, or eventually
 	 * RLIMIT_CPU limit is earlier than prof_exp cpu timer expire.
 	 */
 	switch (clock_idx) {
 	case CPUCLOCK_PROF:
-		if (expires_gt(tsk->signal->cputime_expires.prof_exp, new))
-			tsk->signal->cputime_expires.prof_exp = new;
+		if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval))
+			tsk->signal->cputime_expires.prof_exp = *newval;
 		break;
 	case CPUCLOCK_VIRT:
-		if (expires_gt(tsk->signal->cputime_expires.virt_exp, new))
-			tsk->signal->cputime_expires.virt_exp = new;
+		if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval))
+			tsk->signal->cputime_expires.virt_exp = *newval;
 		break;
 	}
 
-- 
cgit v1.2.3


From 71ea47b197de9a53bc3747a8b1c667df9c6a4c68 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 31 Jan 2017 04:09:36 +0100
Subject: sched/cputime: Remove temporary cputime_t accessors

Now that the whole cputime conversion to nsec units is complete, we
can remove the compatibility accessors.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1485832191-26889-21-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 40 ----------------------------------------
 1 file changed, 40 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 268fdd713089..d17645402767 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -29,7 +29,6 @@ struct sched_param {
 
 #include <asm/page.h>
 #include <asm/ptrace.h>
-#include <linux/cputime.h>
 
 #include <linux/smp.h>
 #include <linux/sem.h>
@@ -613,13 +612,6 @@ struct task_cputime {
 	unsigned long long sum_exec_runtime;
 };
 
-/* Temporary type to ease cputime_t to nsecs conversion */
-struct task_cputime_t {
-	cputime_t utime;
-	cputime_t stime;
-	unsigned long long sum_exec_runtime;
-};
-
 /* Alternate field names when used to cache expirations. */
 #define virt_exp	utime
 #define prof_exp	stime
@@ -2291,27 +2283,6 @@ static inline void task_cputime_scaled(struct task_struct *t,
 }
 #endif
 
-static inline void task_cputime_t(struct task_struct *t,
-				  cputime_t *utime, cputime_t *stime)
-{
-	u64 ut, st;
-
-	task_cputime(t, &ut, &st);
-	*utime = nsecs_to_cputime(ut);
-	*stime = nsecs_to_cputime(st);
-}
-
-static inline void task_cputime_t_scaled(struct task_struct *t,
-					 cputime_t *utimescaled,
-					 cputime_t *stimescaled)
-{
-	u64 ut, st;
-
-	task_cputime_scaled(t, &ut, &st);
-	*utimescaled = nsecs_to_cputime(ut);
-	*stimescaled = nsecs_to_cputime(st);
-}
-
 extern void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st);
 extern void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st);
 
@@ -3527,17 +3498,6 @@ static __always_inline bool need_resched(void)
 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
 void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
 
-static inline void thread_group_cputime_t(struct task_struct *tsk,
-					  struct task_cputime_t *cputime)
-{
-	struct task_cputime times;
-
-	thread_group_cputime(tsk, &times);
-	cputime->utime = nsecs_to_cputime(times.utime);
-	cputime->stime = nsecs_to_cputime(times.stime);
-	cputime->sum_exec_runtime = times.sum_exec_runtime;
-}
-
 /*
  * Reevaluate whether the task has signals pending delivery.
  * Wake the task if so.
-- 
cgit v1.2.3


From 23244a5c8003d4154161a8289a7d3783b0237c08 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 31 Jan 2017 04:09:37 +0100
Subject: sched/cputime: Push time to account_user_time() in nsecs

This is one more step toward converting cputime accounting to pure nsecs.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1485832191-26889-22-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/ia64/kernel/time.c     |  2 +-
 arch/powerpc/kernel/time.c  |  2 +-
 arch/s390/kernel/vtime.c    |  2 +-
 include/linux/kernel_stat.h |  2 +-
 kernel/sched/cputime.c      | 42 ++++++++++++++++++++++++------------------
 5 files changed, 28 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index d040f12ea9f9..ed98b26047c2 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -67,7 +67,7 @@ void vtime_flush(struct task_struct *tsk)
 	cputime_t delta;
 
 	if (ti->utime)
-		account_user_time(tsk, cycle_to_cputime(ti->utime));
+		account_user_time(tsk, cputime_to_nsecs(cycle_to_cputime(ti->utime)));
 
 	if (ti->gtime)
 		account_guest_time(tsk, cycle_to_cputime(ti->gtime));
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 3cca82e065c9..c3931d816190 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -393,7 +393,7 @@ void vtime_flush(struct task_struct *tsk)
 	struct cpu_accounting_data *acct = get_accounting(tsk);
 
 	if (acct->utime)
-		account_user_time(tsk, acct->utime);
+		account_user_time(tsk, cputime_to_nsecs(acct->utime));
 
 	if (acct->utime_scaled)
 		tsk->utimescaled += cputime_to_nsecs(acct->utime_scaled);
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index f2fc27491604..ca206d122302 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -166,7 +166,7 @@ static int do_account_vtime(struct task_struct *tsk)
 
 	/* Push account value */
 	if (user) {
-		account_user_time(tsk, user);
+		account_user_time(tsk, cputime_to_nsecs(user));
 		tsk->utimescaled += cputime_to_nsecs(scale_vtime(user));
 	}
 
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index c3e38ded2d73..b716001ac23e 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -78,7 +78,7 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu)
 	return kstat_cpu(cpu).irqs_sum;
 }
 
-extern void account_user_time(struct task_struct *, cputime_t);
+extern void account_user_time(struct task_struct *, u64);
 extern void account_guest_time(struct task_struct *, cputime_t);
 extern void account_system_time(struct task_struct *, int, cputime_t);
 extern void account_system_index_time(struct task_struct *, cputime_t,
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f7b9624c7df0..55d31c35833a 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -116,18 +116,18 @@ static inline void task_group_account_field(struct task_struct *p, int index,
  * @p: the process that the cpu time gets accounted to
  * @cputime: the cpu time spent in user space since the last update
  */
-void account_user_time(struct task_struct *p, cputime_t cputime)
+void account_user_time(struct task_struct *p, u64 cputime)
 {
 	int index;
 
 	/* Add user time to process. */
-	p->utime += cputime_to_nsecs(cputime);
-	account_group_user_time(p, cputime_to_nsecs(cputime));
+	p->utime += cputime;
+	account_group_user_time(p, cputime);
 
 	index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
 
 	/* Add user time to cpustat. */
-	task_group_account_field(p, index, cputime_to_nsecs(cputime));
+	task_group_account_field(p, index, cputime);
 
 	/* Account for user time used */
 	acct_account_cputime(p);
@@ -363,8 +363,9 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 					 struct rq *rq, int ticks)
 {
-	u64 cputime = (__force u64) cputime_one_jiffy * ticks;
+	u64 old_cputime = (__force u64) cputime_one_jiffy * ticks;
 	cputime_t other;
+	u64 cputime;
 
 	/*
 	 * When returning from idle, many ticks can get accounted at
@@ -374,9 +375,11 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 	 * other time can exceed ticks occasionally.
 	 */
 	other = account_other_time(ULONG_MAX);
-	if (other >= cputime)
+	if (other >= old_cputime)
 		return;
-	cputime -= other;
+
+	old_cputime -= other;
+	cputime = cputime_to_nsecs(old_cputime);
 
 	if (this_cpu_ksoftirqd() == p) {
 		/*
@@ -384,15 +387,16 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 		 * So, we have to handle it separately here.
 		 * Also, p->stime needs to be updated for ksoftirqd.
 		 */
-		account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
+		account_system_index_time(p, old_cputime, CPUTIME_SOFTIRQ);
 	} else if (user_tick) {
 		account_user_time(p, cputime);
 	} else if (p == rq->idle) {
-		account_idle_time(cputime);
+		account_idle_time(old_cputime);
 	} else if (p->flags & PF_VCPU) { /* System time or guest time */
-		account_guest_time(p, cputime);
+
+		account_guest_time(p, old_cputime);
 	} else {
-		account_system_index_time(p, cputime, CPUTIME_SYSTEM);
+		account_system_index_time(p, old_cputime, CPUTIME_SYSTEM);
 	}
 }
 
@@ -473,7 +477,8 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
  */
 void account_process_tick(struct task_struct *p, int user_tick)
 {
-	cputime_t cputime, steal;
+	cputime_t old_cputime, steal;
+	u64 cputime;
 	struct rq *rq = this_rq();
 
 	if (vtime_accounting_cpu_enabled())
@@ -484,20 +489,21 @@ void account_process_tick(struct task_struct *p, int user_tick)
 		return;
 	}
 
-	cputime = cputime_one_jiffy;
+	old_cputime = cputime_one_jiffy;
 	steal = steal_account_process_time(ULONG_MAX);
 
-	if (steal >= cputime)
+	if (steal >= old_cputime)
 		return;
 
-	cputime -= steal;
+	old_cputime -= steal;
+	cputime = cputime_to_nsecs(old_cputime);
 
 	if (user_tick)
 		account_user_time(p, cputime);
 	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
-		account_system_time(p, HARDIRQ_OFFSET, cputime);
+		account_system_time(p, HARDIRQ_OFFSET, old_cputime);
 	else
-		account_idle_time(cputime);
+		account_idle_time(old_cputime);
 }
 
 /*
@@ -736,7 +742,7 @@ void vtime_account_user(struct task_struct *tsk)
 	tsk->vtime_snap_whence = VTIME_SYS;
 	if (vtime_delta(tsk)) {
 		delta_cpu = get_vtime_delta(tsk);
-		account_user_time(tsk, delta_cpu);
+		account_user_time(tsk, cputime_to_nsecs(delta_cpu));
 	}
 	write_seqcount_end(&tsk->vtime_seqcount);
 }
-- 
cgit v1.2.3


From be9095ed4fb3cf69e9fdf64e28ff6b5bd0ec7215 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 31 Jan 2017 04:09:38 +0100
Subject: sched/cputime: Push time to account_steal_time() in nsecs

This is one more step toward converting cputime accounting to pure nsecs.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1485832191-26889-23-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/powerpc/kernel/time.c  |  2 +-
 arch/s390/kernel/vtime.c    |  2 +-
 include/linux/kernel_stat.h |  2 +-
 kernel/sched/cputime.c      | 11 ++++++-----
 4 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index c3931d816190..53e5982edacf 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -402,7 +402,7 @@ void vtime_flush(struct task_struct *tsk)
 		account_guest_time(tsk, acct->gtime);
 
 	if (acct->steal_time)
-		account_steal_time(acct->steal_time);
+		account_steal_time(cputime_to_nsecs(acct->steal_time));
 
 	if (acct->idle_time)
 		account_idle_time(acct->idle_time);
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index ca206d122302..1e7023c6ef23 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -188,7 +188,7 @@ static int do_account_vtime(struct task_struct *tsk)
 	steal = S390_lowcore.steal_timer;
 	if ((s64) steal > 0) {
 		S390_lowcore.steal_timer = 0;
-		account_steal_time(steal);
+		account_steal_time(cputime_to_nsecs(steal));
 	}
 
 	return virt_timer_forward(user + guest + system + hardirq + softirq);
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index b716001ac23e..1d55d10abf9d 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -83,7 +83,7 @@ extern void account_guest_time(struct task_struct *, cputime_t);
 extern void account_system_time(struct task_struct *, int, cputime_t);
 extern void account_system_index_time(struct task_struct *, cputime_t,
 				      enum cpu_usage_stat);
-extern void account_steal_time(cputime_t);
+extern void account_steal_time(u64);
 extern void account_idle_time(cputime_t);
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 55d31c35833a..9a8028760930 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -207,11 +207,11 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
  * Account for involuntary wait time.
  * @cputime: the cpu time spent in involuntary wait
  */
-void account_steal_time(cputime_t cputime)
+void account_steal_time(u64 cputime)
 {
 	u64 *cpustat = kcpustat_this_cpu->cpustat;
 
-	cpustat[CPUTIME_STEAL] += cputime_to_nsecs(cputime);
+	cpustat[CPUTIME_STEAL] += cputime;
 }
 
 /*
@@ -239,14 +239,15 @@ static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
 #ifdef CONFIG_PARAVIRT
 	if (static_key_false(&paravirt_steal_enabled)) {
 		cputime_t steal_cputime;
-		u64 steal;
+		u64 steal, rounded;
 
 		steal = paravirt_steal_clock(smp_processor_id());
 		steal -= this_rq()->prev_steal_time;
 
 		steal_cputime = min(nsecs_to_cputime(steal), maxtime);
-		account_steal_time(steal_cputime);
-		this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime);
+		rounded = cputime_to_nsecs(steal_cputime);
+		account_steal_time(rounded);
+		this_rq()->prev_steal_time += rounded;
 
 		return steal_cputime;
 	}
-- 
cgit v1.2.3


From 18b43a9bd7ae91185e398dd983fb4fffb9e81b3a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 31 Jan 2017 04:09:39 +0100
Subject: sched/cputime: Push time to account_idle_time() in nsecs

This is one more step toward converting cputime accounting to pure nsecs.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1485832191-26889-24-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/ia64/kernel/time.c     |  2 +-
 arch/powerpc/kernel/time.c  |  2 +-
 arch/s390/kernel/idle.c     |  2 +-
 include/linux/kernel_stat.h |  2 +-
 kernel/sched/cputime.c      | 18 +++++++++---------
 5 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index ed98b26047c2..5dc801d97790 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -73,7 +73,7 @@ void vtime_flush(struct task_struct *tsk)
 		account_guest_time(tsk, cycle_to_cputime(ti->gtime));
 
 	if (ti->idle_time)
-		account_idle_time(cycle_to_cputime(ti->idle_time));
+		account_idle_time(cputime_to_nsecs(cycle_to_cputime(ti->idle_time)));
 
 	if (ti->stime) {
 		delta = cycle_to_cputime(ti->stime);
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 53e5982edacf..739897a10fd3 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -405,7 +405,7 @@ void vtime_flush(struct task_struct *tsk)
 		account_steal_time(cputime_to_nsecs(acct->steal_time));
 
 	if (acct->idle_time)
-		account_idle_time(acct->idle_time);
+		account_idle_time(cputime_to_nsecs(acct->idle_time));
 
 	if (acct->stime)
 		account_system_index_time(tsk, acct->stime, CPUTIME_SYSTEM);
diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c
index 7a55c29b0b33..99f1d8185ae2 100644
--- a/arch/s390/kernel/idle.c
+++ b/arch/s390/kernel/idle.c
@@ -43,7 +43,7 @@ void enabled_wait(void)
 	idle->clock_idle_enter = idle->clock_idle_exit = 0ULL;
 	idle->idle_time += idle_time;
 	idle->idle_count++;
-	account_idle_time(idle_time);
+	account_idle_time(cputime_to_nsecs(idle_time));
 	write_seqcount_end(&idle->seqcount);
 }
 NOKPROBE_SYMBOL(enabled_wait);
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 1d55d10abf9d..e1cd8970e096 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -84,7 +84,7 @@ extern void account_system_time(struct task_struct *, int, cputime_t);
 extern void account_system_index_time(struct task_struct *, cputime_t,
 				      enum cpu_usage_stat);
 extern void account_steal_time(u64);
-extern void account_idle_time(cputime_t);
+extern void account_idle_time(u64);
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 static inline void account_process_tick(struct task_struct *tsk, int user)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 9a8028760930..fd5375f956fe 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -218,15 +218,15 @@ void account_steal_time(u64 cputime)
  * Account for idle time.
  * @cputime: the cpu time spent in idle wait
  */
-void account_idle_time(cputime_t cputime)
+void account_idle_time(u64 cputime)
 {
 	u64 *cpustat = kcpustat_this_cpu->cpustat;
 	struct rq *rq = this_rq();
 
 	if (atomic_read(&rq->nr_iowait) > 0)
-		cpustat[CPUTIME_IOWAIT] += cputime_to_nsecs(cputime);
+		cpustat[CPUTIME_IOWAIT] += cputime;
 	else
-		cpustat[CPUTIME_IDLE] += cputime_to_nsecs(cputime);
+		cpustat[CPUTIME_IDLE] += cputime;
 }
 
 /*
@@ -392,7 +392,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 	} else if (user_tick) {
 		account_user_time(p, cputime);
 	} else if (p == rq->idle) {
-		account_idle_time(old_cputime);
+		account_idle_time(cputime);
 	} else if (p->flags & PF_VCPU) { /* System time or guest time */
 
 		account_guest_time(p, old_cputime);
@@ -504,7 +504,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
 	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
 		account_system_time(p, HARDIRQ_OFFSET, old_cputime);
 	else
-		account_idle_time(old_cputime);
+		account_idle_time(cputime);
 }
 
 /*
@@ -513,15 +513,15 @@ void account_process_tick(struct task_struct *p, int user_tick)
  */
 void account_idle_ticks(unsigned long ticks)
 {
-	cputime_t cputime, steal;
+	u64 cputime, steal;
 
 	if (sched_clock_irqtime) {
 		irqtime_account_idle_ticks(ticks);
 		return;
 	}
 
-	cputime = jiffies_to_cputime(ticks);
-	steal = steal_account_process_time(ULONG_MAX);
+	cputime = ticks * TICK_NSEC;
+	steal = cputime_to_nsecs(steal_account_process_time(ULONG_MAX));
 
 	if (steal >= cputime)
 		return;
@@ -787,7 +787,7 @@ void vtime_account_idle(struct task_struct *tsk)
 {
 	cputime_t delta_cpu = get_vtime_delta(tsk);
 
-	account_idle_time(delta_cpu);
+	account_idle_time(cputime_to_nsecs(delta_cpu));
 }
 
 void arch_vtime_task_switch(struct task_struct *prev)
-- 
cgit v1.2.3


From fb8b049c988f1ff460b063b8a41ea9a3c79921c2 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 31 Jan 2017 04:09:40 +0100
Subject: sched/cputime: Push time to account_system_time() in nsecs

This is one more step toward converting cputime accounting to pure nsecs.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1485832191-26889-25-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/ia64/kernel/time.c     | 11 ++++++-----
 arch/powerpc/kernel/time.c  | 15 ++++++++-------
 arch/s390/kernel/idle.c     |  2 +-
 arch/s390/kernel/vtime.c    |  6 +++---
 include/linux/kernel_stat.h |  7 +++----
 kernel/sched/cputime.c      | 39 +++++++++++++++++++--------------------
 6 files changed, 40 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 5dc801d97790..f15bca4776a7 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -21,6 +21,7 @@
 #include <linux/timex.h>
 #include <linux/timekeeper_internal.h>
 #include <linux/platform_device.h>
+#include <linux/cputime.h>
 
 #include <asm/machvec.h>
 #include <asm/delay.h>
@@ -64,29 +65,29 @@ extern cputime_t cycle_to_cputime(u64 cyc);
 void vtime_flush(struct task_struct *tsk)
 {
 	struct thread_info *ti = task_thread_info(tsk);
-	cputime_t delta;
+	u64 delta;
 
 	if (ti->utime)
 		account_user_time(tsk, cputime_to_nsecs(cycle_to_cputime(ti->utime)));
 
 	if (ti->gtime)
-		account_guest_time(tsk, cycle_to_cputime(ti->gtime));
+		account_guest_time(tsk, cputime_to_nsecs(cycle_to_cputime(ti->gtime)));
 
 	if (ti->idle_time)
 		account_idle_time(cputime_to_nsecs(cycle_to_cputime(ti->idle_time)));
 
 	if (ti->stime) {
-		delta = cycle_to_cputime(ti->stime);
+		delta = cputime_to_nsecs(cycle_to_cputime(ti->stime));
 		account_system_index_time(tsk, delta, CPUTIME_SYSTEM);
 	}
 
 	if (ti->hardirq_time) {
-		delta = cycle_to_cputime(ti->hardirq_time);
+		delta = cputime_to_nsecs(cycle_to_cputime(ti->hardirq_time));
 		account_system_index_time(tsk, delta, CPUTIME_IRQ);
 	}
 
 	if (ti->softirq_time) {
-		delta = cycle_to_cputime(ti->softirq_time);
+		delta = cputime_to_nsecs(cycle_to_cputime(ti->softirq_time));
 		account_system_index_time(tsk, delta, CPUTIME_SOFTIRQ);
 	}
 
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 739897a10fd3..01f53bfe100b 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -57,6 +57,7 @@
 #include <linux/clk-provider.h>
 #include <linux/suspend.h>
 #include <linux/rtc.h>
+#include <linux/cputime.h>
 #include <asm/trace.h>
 
 #include <asm/io.h>
@@ -72,7 +73,6 @@
 #include <asm/smp.h>
 #include <asm/vdso_datapage.h>
 #include <asm/firmware.h>
-#include <asm/cputime.h>
 #include <asm/asm-prototypes.h>
 
 /* powerpc clocksource/clockevent code */
@@ -399,7 +399,7 @@ void vtime_flush(struct task_struct *tsk)
 		tsk->utimescaled += cputime_to_nsecs(acct->utime_scaled);
 
 	if (acct->gtime)
-		account_guest_time(tsk, acct->gtime);
+		account_guest_time(tsk, cputime_to_nsecs(acct->gtime));
 
 	if (acct->steal_time)
 		account_steal_time(cputime_to_nsecs(acct->steal_time));
@@ -408,16 +408,17 @@ void vtime_flush(struct task_struct *tsk)
 		account_idle_time(cputime_to_nsecs(acct->idle_time));
 
 	if (acct->stime)
-		account_system_index_time(tsk, acct->stime, CPUTIME_SYSTEM);
-
+		account_system_index_time(tsk, cputime_to_nsecs(acct->stime),
+					  CPUTIME_SYSTEM);
 	if (acct->stime_scaled)
 		tsk->stimescaled += cputime_to_nsecs(acct->stime_scaled);
 
 	if (acct->hardirq_time)
-		account_system_index_time(tsk, acct->hardirq_time, CPUTIME_IRQ);
-
+		account_system_index_time(tsk, cputime_to_nsecs(acct->hardirq_time),
+					  CPUTIME_IRQ);
 	if (acct->softirq_time)
-		account_system_index_time(tsk, acct->softirq_time, CPUTIME_SOFTIRQ);
+		account_system_index_time(tsk, cputime_to_nsecs(acct->softirq_time),
+					  CPUTIME_SOFTIRQ);
 
 	acct->utime = 0;
 	acct->utime_scaled = 0;
diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c
index 99f1d8185ae2..5c0e08e761c3 100644
--- a/arch/s390/kernel/idle.c
+++ b/arch/s390/kernel/idle.c
@@ -12,7 +12,7 @@
 #include <linux/notifier.h>
 #include <linux/init.h>
 #include <linux/cpu.h>
-#include <asm/cputime.h>
+#include <linux/cputime.h>
 #include <asm/nmi.h>
 #include <asm/smp.h>
 #include "entry.h"
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index 1e7023c6ef23..b4a3e9e06ef2 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -6,13 +6,13 @@
  */
 
 #include <linux/kernel_stat.h>
+#include <linux/cputime.h>
 #include <linux/export.h>
 #include <linux/kernel.h>
 #include <linux/timex.h>
 #include <linux/types.h>
 #include <linux/time.h>
 
-#include <asm/cputime.h>
 #include <asm/vtimer.h>
 #include <asm/vtime.h>
 #include <asm/cpu_mf.h>
@@ -115,7 +115,7 @@ static void account_system_index_scaled(struct task_struct *p,
 					enum cpu_usage_stat index)
 {
 	p->stimescaled += cputime_to_nsecs(scaled);
-	account_system_index_time(p, cputime, index);
+	account_system_index_time(p, cputime_to_nsecs(cputime), index);
 }
 
 /*
@@ -171,7 +171,7 @@ static int do_account_vtime(struct task_struct *tsk)
 	}
 
 	if (guest) {
-		account_guest_time(tsk, guest);
+		account_guest_time(tsk, cputime_to_nsecs(guest));
 		tsk->utimescaled += cputime_to_nsecs(scale_vtime(guest));
 	}
 
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index e1cd8970e096..66be8b6beceb 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -9,7 +9,6 @@
 #include <linux/sched.h>
 #include <linux/vtime.h>
 #include <asm/irq.h>
-#include <linux/cputime.h>
 
 /*
  * 'kernel_stat.h' contains the definitions needed for doing
@@ -79,9 +78,9 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu)
 }
 
 extern void account_user_time(struct task_struct *, u64);
-extern void account_guest_time(struct task_struct *, cputime_t);
-extern void account_system_time(struct task_struct *, int, cputime_t);
-extern void account_system_index_time(struct task_struct *, cputime_t,
+extern void account_guest_time(struct task_struct *, u64);
+extern void account_system_time(struct task_struct *, int, u64);
+extern void account_system_index_time(struct task_struct *, u64,
 				      enum cpu_usage_stat);
 extern void account_steal_time(u64);
 extern void account_idle_time(u64);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index fd5375f956fe..d28e9c53727c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -4,6 +4,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/static_key.h>
 #include <linux/context_tracking.h>
+#include <linux/cputime.h>
 #include "sched.h"
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
@@ -138,22 +139,22 @@ void account_user_time(struct task_struct *p, u64 cputime)
  * @p: the process that the cpu time gets accounted to
  * @cputime: the cpu time spent in virtual machine since the last update
  */
-void account_guest_time(struct task_struct *p, cputime_t cputime)
+void account_guest_time(struct task_struct *p, u64 cputime)
 {
 	u64 *cpustat = kcpustat_this_cpu->cpustat;
 
 	/* Add guest time to process. */
-	p->utime += cputime_to_nsecs(cputime);
-	account_group_user_time(p, cputime_to_nsecs(cputime));
-	p->gtime += cputime_to_nsecs(cputime);
+	p->utime += cputime;
+	account_group_user_time(p, cputime);
+	p->gtime += cputime;
 
 	/* Add guest time to cpustat. */
 	if (task_nice(p) > 0) {
-		cpustat[CPUTIME_NICE] += cputime_to_nsecs(cputime);
-		cpustat[CPUTIME_GUEST_NICE] += cputime_to_nsecs(cputime);
+		cpustat[CPUTIME_NICE] += cputime;
+		cpustat[CPUTIME_GUEST_NICE] += cputime;
 	} else {
-		cpustat[CPUTIME_USER] += cputime_to_nsecs(cputime);
-		cpustat[CPUTIME_GUEST] += cputime_to_nsecs(cputime);
+		cpustat[CPUTIME_USER] += cputime;
+		cpustat[CPUTIME_GUEST] += cputime;
 	}
 }
 
@@ -164,14 +165,14 @@ void account_guest_time(struct task_struct *p, cputime_t cputime)
  * @index: pointer to cpustat field that has to be updated
  */
 void account_system_index_time(struct task_struct *p,
-			       cputime_t cputime, enum cpu_usage_stat index)
+			       u64 cputime, enum cpu_usage_stat index)
 {
 	/* Add system time to process. */
-	p->stime += cputime_to_nsecs(cputime);
-	account_group_system_time(p, cputime_to_nsecs(cputime));
+	p->stime += cputime;
+	account_group_system_time(p, cputime);
 
 	/* Add system time to cpustat. */
-	task_group_account_field(p, index, cputime_to_nsecs(cputime));
+	task_group_account_field(p, index, cputime);
 
 	/* Account for system time used */
 	acct_account_cputime(p);
@@ -183,8 +184,7 @@ void account_system_index_time(struct task_struct *p,
  * @hardirq_offset: the offset to subtract from hardirq_count()
  * @cputime: the cpu time spent in kernel space since the last update
  */
-void account_system_time(struct task_struct *p, int hardirq_offset,
-			 cputime_t cputime)
+void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
 {
 	int index;
 
@@ -388,16 +388,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 		 * So, we have to handle it separately here.
 		 * Also, p->stime needs to be updated for ksoftirqd.
 		 */
-		account_system_index_time(p, old_cputime, CPUTIME_SOFTIRQ);
+		account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
 	} else if (user_tick) {
 		account_user_time(p, cputime);
 	} else if (p == rq->idle) {
 		account_idle_time(cputime);
 	} else if (p->flags & PF_VCPU) { /* System time or guest time */
-
-		account_guest_time(p, old_cputime);
+		account_guest_time(p, cputime);
 	} else {
-		account_system_index_time(p, old_cputime, CPUTIME_SYSTEM);
+		account_system_index_time(p, cputime, CPUTIME_SYSTEM);
 	}
 }
 
@@ -502,7 +501,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
 	if (user_tick)
 		account_user_time(p, cputime);
 	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
-		account_system_time(p, HARDIRQ_OFFSET, old_cputime);
+		account_system_time(p, HARDIRQ_OFFSET, cputime);
 	else
 		account_idle_time(cputime);
 }
@@ -722,7 +721,7 @@ static void __vtime_account_system(struct task_struct *tsk)
 {
 	cputime_t delta_cpu = get_vtime_delta(tsk);
 
-	account_system_time(tsk, irq_count(), delta_cpu);
+	account_system_time(tsk, irq_count(), cputime_to_nsecs(delta_cpu));
 }
 
 void vtime_account_system(struct task_struct *tsk)
-- 
cgit v1.2.3


From 934ad473552a48d303a54279518aa19cf674567d Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 31 Jan 2017 04:09:50 +0100
Subject: sched/cputime: Remove unused nsec_to_cputime()

It's unused now.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1485832191-26889-35-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/cputime.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cputime.h b/include/linux/cputime.h
index f2eb2ee535ca..a257d6690621 100644
--- a/include/linux/cputime.h
+++ b/include/linux/cputime.h
@@ -8,9 +8,4 @@
 	(cputime_to_usecs(__ct) * NSEC_PER_USEC)
 #endif
 
-#ifndef nsecs_to_cputime
-# define nsecs_to_cputime(__nsecs)	\
-	usecs_to_cputime((__nsecs) / NSEC_PER_USEC)
-#endif
-
 #endif /* __LINUX_CPUTIME_H */
-- 
cgit v1.2.3


From b672592f022152155fde7db99aafbcf04a2c3ba5 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 31 Jan 2017 04:09:51 +0100
Subject: sched/cputime: Remove generic asm headers

cputime_t is now only used by two architectures:

	* powerpc (when CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y)
	* s390

And since the core doesn't use it anymore, we don't need any arch support
from the others. So we can remove their stub implementations.

A final cleanup would be to provide an efficient pure arch
implementation of cputime_to_nsec() for s390 and powerpc and finally
remove include/linux/cputime.h .

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1485832191-26889-36-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/alpha/include/asm/Kbuild      | 1 -
 arch/arc/include/asm/Kbuild        | 1 -
 arch/arm/include/asm/Kbuild        | 1 -
 arch/arm64/include/asm/Kbuild      | 1 -
 arch/avr32/include/asm/Kbuild      | 1 -
 arch/blackfin/include/asm/Kbuild   | 1 -
 arch/c6x/include/asm/Kbuild        | 1 -
 arch/cris/include/asm/Kbuild       | 1 -
 arch/frv/include/asm/Kbuild        | 1 -
 arch/h8300/include/asm/Kbuild      | 1 -
 arch/hexagon/include/asm/Kbuild    | 1 -
 arch/ia64/include/asm/cputime.h    | 4 +---
 arch/m32r/include/asm/Kbuild       | 1 -
 arch/m68k/include/asm/Kbuild       | 1 -
 arch/metag/include/asm/Kbuild      | 1 -
 arch/microblaze/include/asm/Kbuild | 1 -
 arch/mips/include/asm/Kbuild       | 1 -
 arch/mn10300/include/asm/Kbuild    | 1 -
 arch/nios2/include/asm/Kbuild      | 1 -
 arch/openrisc/include/asm/Kbuild   | 1 -
 arch/parisc/include/asm/Kbuild     | 1 -
 arch/powerpc/include/asm/cputime.h | 4 +---
 arch/score/include/asm/Kbuild      | 1 -
 arch/sh/include/asm/Kbuild         | 1 -
 arch/sparc/include/asm/Kbuild      | 1 -
 arch/tile/include/asm/Kbuild       | 1 -
 arch/um/include/asm/Kbuild         | 1 -
 arch/unicore32/include/asm/Kbuild  | 1 -
 arch/x86/include/asm/Kbuild        | 1 -
 arch/xtensa/include/asm/Kbuild     | 1 -
 include/asm-generic/cputime.h      | 7 -------
 include/linux/cputime.h            | 2 ++
 32 files changed, 4 insertions(+), 41 deletions(-)
 delete mode 100644 include/asm-generic/cputime.h

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index bf8475ce85ee..baa152b9348e 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -1,7 +1,6 @@
 
 
 generic-y += clkdev.h
-generic-y += cputime.h
 generic-y += exec.h
 generic-y += export.h
 generic-y += irq_work.h
diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild
index c332604606dd..63a04013d05a 100644
--- a/arch/arc/include/asm/Kbuild
+++ b/arch/arc/include/asm/Kbuild
@@ -2,7 +2,6 @@ generic-y += auxvec.h
 generic-y += bitsperlong.h
 generic-y += bugs.h
 generic-y += clkdev.h
-generic-y += cputime.h
 generic-y += device.h
 generic-y += div64.h
 generic-y += emergency-restart.h
diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild
index efb21757d41f..b14e8c7d71bd 100644
--- a/arch/arm/include/asm/Kbuild
+++ b/arch/arm/include/asm/Kbuild
@@ -2,7 +2,6 @@
 
 generic-y += bitsperlong.h
 generic-y += clkdev.h
-generic-y += cputime.h
 generic-y += current.h
 generic-y += early_ioremap.h
 generic-y += emergency-restart.h
diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild
index 8365a84c2640..a12f1afc95a3 100644
--- a/arch/arm64/include/asm/Kbuild
+++ b/arch/arm64/include/asm/Kbuild
@@ -1,6 +1,5 @@
 generic-y += bugs.h
 generic-y += clkdev.h
-generic-y += cputime.h
 generic-y += delay.h
 generic-y += div64.h
 generic-y += dma.h
diff --git a/arch/avr32/include/asm/Kbuild b/arch/avr32/include/asm/Kbuild
index 241b9b9729d8..3d7ef2c17a7c 100644
--- a/arch/avr32/include/asm/Kbuild
+++ b/arch/avr32/include/asm/Kbuild
@@ -1,6 +1,5 @@
 
 generic-y += clkdev.h
-generic-y += cputime.h
 generic-y += delay.h
 generic-y += device.h
 generic-y += div64.h
diff --git a/arch/blackfin/include/asm/Kbuild b/arch/blackfin/include/asm/Kbuild
index 2fb67b59d188..d6fa60b158be 100644
--- a/arch/blackfin/include/asm/Kbuild
+++ b/arch/blackfin/include/asm/Kbuild
@@ -2,7 +2,6 @@
 generic-y += auxvec.h
 generic-y += bitsperlong.h
 generic-y += bugs.h
-generic-y += cputime.h
 generic-y += current.h
 generic-y += device.h
 generic-y += div64.h
diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild
index 64465e7e2245..4e9f57433f3a 100644
--- a/arch/c6x/include/asm/Kbuild
+++ b/arch/c6x/include/asm/Kbuild
@@ -5,7 +5,6 @@ generic-y += barrier.h
 generic-y += bitsperlong.h
 generic-y += bugs.h
 generic-y += clkdev.h
-generic-y += cputime.h
 generic-y += current.h
 generic-y += device.h
 generic-y += div64.h
diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild
index 1778805f6380..9f19e19bff9d 100644
--- a/arch/cris/include/asm/Kbuild
+++ b/arch/cris/include/asm/Kbuild
@@ -4,7 +4,6 @@ generic-y += barrier.h
 generic-y += bitsperlong.h
 generic-y += clkdev.h
 generic-y += cmpxchg.h
-generic-y += cputime.h
 generic-y += device.h
 generic-y += div64.h
 generic-y += errno.h
diff --git a/arch/frv/include/asm/Kbuild b/arch/frv/include/asm/Kbuild
index 1fa084cf1a43..0f5b0d5d313c 100644
--- a/arch/frv/include/asm/Kbuild
+++ b/arch/frv/include/asm/Kbuild
@@ -1,6 +1,5 @@
 
 generic-y += clkdev.h
-generic-y += cputime.h
 generic-y += exec.h
 generic-y += irq_work.h
 generic-y += mcs_spinlock.h
diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild
index 373cb23301e3..5efd0c87f3c0 100644
--- a/arch/h8300/include/asm/Kbuild
+++ b/arch/h8300/include/asm/Kbuild
@@ -5,7 +5,6 @@ generic-y += bugs.h
 generic-y += cacheflush.h
 generic-y += checksum.h
 generic-y += clkdev.h
-generic-y += cputime.h
 generic-y += current.h
 generic-y += delay.h
 generic-y += device.h
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild
index db8ddabc6bd2..a43a7c90e4af 100644
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@ -6,7 +6,6 @@ generic-y += barrier.h
 generic-y += bug.h
 generic-y += bugs.h
 generic-y += clkdev.h
-generic-y += cputime.h
 generic-y += current.h
 generic-y += device.h
 generic-y += div64.h
diff --git a/arch/ia64/include/asm/cputime.h b/arch/ia64/include/asm/cputime.h
index 44bcffc5681c..3d665c0627a8 100644
--- a/arch/ia64/include/asm/cputime.h
+++ b/arch/ia64/include/asm/cputime.h
@@ -18,9 +18,7 @@
 #ifndef __IA64_CPUTIME_H
 #define __IA64_CPUTIME_H
 
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-# include <asm-generic/cputime.h>
-#else
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 extern void arch_vtime_task_switch(struct task_struct *tsk);
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
diff --git a/arch/m32r/include/asm/Kbuild b/arch/m32r/include/asm/Kbuild
index 860e440611c9..652100b64a71 100644
--- a/arch/m32r/include/asm/Kbuild
+++ b/arch/m32r/include/asm/Kbuild
@@ -1,6 +1,5 @@
 
 generic-y += clkdev.h
-generic-y += cputime.h
 generic-y += exec.h
 generic-y += irq_work.h
 generic-y += kvm_para.h
diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild
index 1f2e5d31cb24..6c76d6c24b3d 100644
--- a/arch/m68k/include/asm/Kbuild
+++ b/arch/m68k/include/asm/Kbuild
@@ -1,7 +1,6 @@
 generic-y += barrier.h
 generic-y += bitsperlong.h
 generic-y += clkdev.h
-generic-y += cputime.h
 generic-y += device.h
 generic-y += emergency-restart.h
 generic-y += errno.h
diff --git a/arch/metag/include/asm/Kbuild b/arch/metag/include/asm/Kbuild
index 167150c701d1..d3731f0db73b 100644
--- a/arch/metag/include/asm/Kbuild
+++ b/arch/metag/include/asm/Kbuild
@@ -2,7 +2,6 @@ generic-y += auxvec.h
 generic-y += bitsperlong.h
 generic-y += bugs.h
 generic-y += clkdev.h
-generic-y += cputime.h
 generic-y += current.h
 generic-y += device.h
 generic-y += dma.h
diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild
index b0ae88c9fed9..6275eb051801 100644
--- a/arch/microblaze/include/asm/Kbuild
+++ b/arch/microblaze/include/asm/Kbuild
@@ -1,7 +1,6 @@
 
 generic-y += barrier.h
 generic-y += clkdev.h
-generic-y += cputime.h
 generic-y += device.h
 generic-y += exec.h
 generic-y += irq_work.h
diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild
index 3269b742a75e..994b1c4392be 100644
--- a/arch/mips/include/asm/Kbuild
+++ b/arch/mips/include/asm/Kbuild
@@ -1,7 +1,6 @@
 # MIPS headers
 generic-(CONFIG_GENERIC_CSUM) += checksum.h
 generic-y += clkdev.h
-generic-y += cputime.h
 generic-y += current.h
 generic-y += dma-contiguous.h
 generic-y += emergency-restart.h
diff --git a/arch/mn10300/include/asm/Kbuild b/arch/mn10300/include/asm/Kbuild
index 1c8dd0f5cd5d..97f64c723a0c 100644
--- a/arch/mn10300/include/asm/Kbuild
+++ b/arch/mn10300/include/asm/Kbuild
@@ -1,7 +1,6 @@
 
 generic-y += barrier.h
 generic-y += clkdev.h
-generic-y += cputime.h
 generic-y += exec.h
 generic-y += irq_work.h
 generic-y += mcs_spinlock.h
diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild
index d63330e88379..35b0e883761a 100644
--- a/arch/nios2/include/asm/Kbuild
+++ b/arch/nios2/include/asm/Kbuild
@@ -6,7 +6,6 @@ generic-y += bitsperlong.h
 generic-y += bug.h
 generic-y += bugs.h
 generic-y += clkdev.h
-generic-y += cputime.h
 generic-y += current.h
 generic-y += device.h
 generic-y += div64.h
diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
index 2832f031fb11..ef8d1ccc3e45 100644
--- a/arch/openrisc/include/asm/Kbuild
+++ b/arch/openrisc/include/asm/Kbuild
@@ -12,7 +12,6 @@ generic-y += checksum.h
 generic-y += clkdev.h
 generic-y += cmpxchg-local.h
 generic-y += cmpxchg.h
-generic-y += cputime.h
 generic-y += current.h
 generic-y += device.h
 generic-y += div64.h
diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild
index 91f53c07f410..4e179d770d69 100644
--- a/arch/parisc/include/asm/Kbuild
+++ b/arch/parisc/include/asm/Kbuild
@@ -2,7 +2,6 @@
 generic-y += auxvec.h
 generic-y += barrier.h
 generic-y += clkdev.h
-generic-y += cputime.h
 generic-y += device.h
 generic-y += div64.h
 generic-y += emergency-restart.h
diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h
index 6ec0ba6f1a61..99b541865d8d 100644
--- a/arch/powerpc/include/asm/cputime.h
+++ b/arch/powerpc/include/asm/cputime.h
@@ -16,9 +16,7 @@
 #ifndef __POWERPC_CPUTIME_H
 #define __POWERPC_CPUTIME_H
 
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-#include <asm-generic/cputime.h>
-#else
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 
 #include <linux/types.h>
 #include <linux/time.h>
diff --git a/arch/score/include/asm/Kbuild b/arch/score/include/asm/Kbuild
index a05218ff3fe4..51970bb6c4fe 100644
--- a/arch/score/include/asm/Kbuild
+++ b/arch/score/include/asm/Kbuild
@@ -4,7 +4,6 @@ header-y +=
 
 generic-y += barrier.h
 generic-y += clkdev.h
-generic-y += cputime.h
 generic-y += irq_work.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild
index 751c3373a92c..cf2a75063b53 100644
--- a/arch/sh/include/asm/Kbuild
+++ b/arch/sh/include/asm/Kbuild
@@ -1,7 +1,6 @@
 
 generic-y += bitsperlong.h
 generic-y += clkdev.h
-generic-y += cputime.h
 generic-y += current.h
 generic-y += delay.h
 generic-y += div64.h
diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild
index 0569bfac4afb..e9e837bc3158 100644
--- a/arch/sparc/include/asm/Kbuild
+++ b/arch/sparc/include/asm/Kbuild
@@ -2,7 +2,6 @@
 
 
 generic-y += clkdev.h
-generic-y += cputime.h
 generic-y += div64.h
 generic-y += emergency-restart.h
 generic-y += exec.h
diff --git a/arch/tile/include/asm/Kbuild b/arch/tile/include/asm/Kbuild
index 2d1f5638974c..51a339feceac 100644
--- a/arch/tile/include/asm/Kbuild
+++ b/arch/tile/include/asm/Kbuild
@@ -4,7 +4,6 @@ header-y += ../arch/
 generic-y += bug.h
 generic-y += bugs.h
 generic-y += clkdev.h
-generic-y += cputime.h
 generic-y += div64.h
 generic-y += emergency-restart.h
 generic-y += errno.h
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index 052f7f6d0551..90c281cd7e1d 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -1,7 +1,6 @@
 generic-y += barrier.h
 generic-y += bug.h
 generic-y += clkdev.h
-generic-y += cputime.h
 generic-y += current.h
 generic-y += delay.h
 generic-y += device.h
diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild
index 256c45b3ae34..5d51ade89f4c 100644
--- a/arch/unicore32/include/asm/Kbuild
+++ b/arch/unicore32/include/asm/Kbuild
@@ -4,7 +4,6 @@ generic-y += auxvec.h
 generic-y += bitsperlong.h
 generic-y += bugs.h
 generic-y += clkdev.h
-generic-y += cputime.h
 generic-y += current.h
 generic-y += device.h
 generic-y += div64.h
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 2b892e2313a9..5d6a53fd7521 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -7,7 +7,6 @@ generated-y += unistd_64_x32.h
 generated-y += xen-hypercalls.h
 
 generic-y += clkdev.h
-generic-y += cputime.h
 generic-y += dma-contiguous.h
 generic-y += early_ioremap.h
 generic-y += mcs_spinlock.h
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index b7fbaa56b51a..9e9760b20be5 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -1,7 +1,6 @@
 generic-y += bitsperlong.h
 generic-y += bug.h
 generic-y += clkdev.h
-generic-y += cputime.h
 generic-y += div64.h
 generic-y += dma-contiguous.h
 generic-y += emergency-restart.h
diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h
deleted file mode 100644
index 358e54777b56..000000000000
--- a/include/asm-generic/cputime.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _ASM_GENERIC_CPUTIME_H
-#define _ASM_GENERIC_CPUTIME_H
-
-#include <linux/time.h>
-#include <linux/jiffies.h>
-
-#endif
diff --git a/include/linux/cputime.h b/include/linux/cputime.h
index a257d6690621..a691dc4ddc13 100644
--- a/include/linux/cputime.h
+++ b/include/linux/cputime.h
@@ -1,6 +1,7 @@
 #ifndef __LINUX_CPUTIME_H
 #define __LINUX_CPUTIME_H
 
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 #include <asm/cputime.h>
 
 #ifndef cputime_to_nsecs
@@ -8,4 +9,5 @@
 	(cputime_to_usecs(__ct) * NSEC_PER_USEC)
 #endif
 
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 #endif /* __LINUX_CPUTIME_H */
-- 
cgit v1.2.3


From 7e1f9467d1e48c64c27d6a32de1bfd1b9cdb1002 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Sun, 29 Jan 2017 07:42:12 -0800
Subject: sched/wait, rcuwait: Fix typo in comment

Forgot to update the comment after renaming the call.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dave@stgolabs.net
Link: http://lkml.kernel.org/r/1485704532-9290-1-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/rcuwait.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h
index 0e93d56c7ab2..a4ede51b3e7c 100644
--- a/include/linux/rcuwait.h
+++ b/include/linux/rcuwait.h
@@ -47,7 +47,7 @@ extern void rcuwait_wake_up(struct rcuwait *w);
 	for (;;) {							\
 		/*							\
 		 * Implicit barrier (A) pairs with (B) in		\
-		 * rcuwait_trywake().					\
+		 * rcuwait_wake_up().					\
 		 */							\
 		set_current_state(TASK_UNINTERRUPTIBLE);		\
 		if (condition)						\
-- 
cgit v1.2.3


From 0754445d71c37a7afd4f0790a9be4cf53c1b8cc4 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Sun, 29 Jan 2017 07:15:31 -0800
Subject: sched/wake_q: Clarify queue reinit comment

As of:

  bcc9a76d5ac ("locking/rwsem: Reinit wake_q after use")

the comment regarding the list reinitialization no longer applies,
update it with the new wake_q_init() helper.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Acked-by: Waiman Long <longman@redhat.com>
Cc: peterz@infradead.org
Cc: longman@redhat.com
Cc: akpm@linux-foundation.org
Cc: paulmck@linux.vnet.ibm.com
Cc: torvalds@linux-foundation.org
Link: http://lkml.kernel.org/r/20170129151531.GA2444@linux-80c1.suse
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4e62b378bd65..1cc0dede2122 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1000,8 +1000,8 @@ enum cpu_idle_type {
  *
  * The DEFINE_WAKE_Q macro declares and initializes the list head.
  * wake_up_q() does NOT reinitialize the list; it's expected to be
- * called near the end of a function, where the fact that the queue is
- * not used again will be easy to see by inspection.
+ * called near the end of a function. Otherwise, the list can be
+ * re-initialized for later re-use by wake_q_init().
  *
  * Note that this can cause spurious wakeups. schedule() callers
  * must ensure the call is done inside a loop, confirming that the
-- 
cgit v1.2.3


From 733ce725aa4bfa9063be053bfe7f4597d76f0dd1 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Fri, 27 Jan 2017 12:38:16 -0500
Subject: sched/clock: Add dummy clear_sched_clock_stable() stub function

In commit:

  acb04058de494 ("sched/clock: Fix hotplug crash")

the PARISC code gained a call to this function.

However the prototype for it is within a CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y
 #ifdef/#endif.  That, combined with this:

  arch/parisc/Kconfig:    select HAVE_UNSTABLE_SCHED_CLOCK if SMP

means that PARISC can have it either enabled or disabled, resulting
in the following build fail:

  arch/parisc/kernel/setup.c:180:2: error: implicit declaration of
  function 'clear_sched_clock_stable' [-Werror=implicit-function-declaration]

Add a no-op stub for the non-SMP case to prevent this.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: acb04058de49 ("sched/clock: Fix hotplug crash")
Link: http://lkml.kernel.org/r/20170127173816.22733-1-paul.gortmaker@windriver.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d17645402767..e2ed46d3ed71 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2511,6 +2511,10 @@ static inline void sched_clock_tick(void)
 {
 }
 
+static inline void clear_sched_clock_stable(void)
+{
+}
+
 static inline void sched_clock_idle_sleep_event(void)
 {
 }
-- 
cgit v1.2.3


From 975e155ed8732cb81f55c021c441ae662dd040b5 Mon Sep 17 00:00:00 2001
From: Shile Zhang <shile.zhang@nokia.com>
Date: Sat, 28 Jan 2017 22:00:49 +0800
Subject: sched/rt: Show the 'sched_rr_timeslice' SCHED_RR timeslice tuning
 knob in milliseconds

We added the 'sched_rr_timeslice_ms' SCHED_RR tuning knob in this commit:

  ce0dbbbb30ae ("sched/rt: Add a tuning knob to allow changing SCHED_RR timeslice")

... which name suggests to users that it's in milliseconds, while in reality
it's being set in milliseconds but the result is shown in jiffies.

This is obviously confusing when HZ is not 1000, it makes it appear like the
value set failed, such as HZ=100:

  root# echo 100 > /proc/sys/kernel/sched_rr_timeslice_ms
  root# cat /proc/sys/kernel/sched_rr_timeslice_ms
  10

Fix this to be milliseconds all around.

Signed-off-by: Shile Zhang <shile.zhang@nokia.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1485612049-20923-1-git-send-email-shile.zhang@nokia.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/sysctl.h | 1 +
 kernel/sched/core.c          | 5 +++--
 kernel/sched/rt.c            | 1 +
 kernel/sysctl.c              | 2 +-
 4 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 441145351301..49308e142aae 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -59,6 +59,7 @@ extern unsigned int sysctl_sched_cfs_bandwidth_slice;
 extern unsigned int sysctl_sched_autogroup_enabled;
 #endif
 
+extern int sysctl_sched_rr_timeslice;
 extern int sched_rr_timeslice;
 
 extern int sched_rr_handler(struct ctl_table *table, int write,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d01f9d047397..10e18faaa632 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8471,8 +8471,9 @@ int sched_rr_handler(struct ctl_table *table, int write,
 	/* make sure that internally we keep jiffies */
 	/* also, writing zero resets timeslice to default */
 	if (!ret && write) {
-		sched_rr_timeslice = sched_rr_timeslice <= 0 ?
-			RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
+		sched_rr_timeslice =
+			sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
+			msecs_to_jiffies(sysctl_sched_rr_timeslice);
 	}
 	mutex_unlock(&mutex);
 	return ret;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 704f2b89abf1..4101f9d1aa40 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -9,6 +9,7 @@
 #include <linux/irq_work.h>
 
 int sched_rr_timeslice = RR_TIMESLICE;
+int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
 
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1aea594a54db..bb260ceb3718 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -416,7 +416,7 @@ static struct ctl_table kern_table[] = {
 	},
 	{
 		.procname	= "sched_rr_timeslice_ms",
-		.data		= &sched_rr_timeslice,
+		.data		= &sysctl_sched_rr_timeslice,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= sched_rr_handler,
-- 
cgit v1.2.3


From 93faccbbfa958a9668d3ab4e30f38dd205cee8d8 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 1 Feb 2017 06:06:16 +1300
Subject: fs: Better permission checking for submounts

To support unprivileged users mounting filesystems two permission
checks have to be performed: a test to see if the user allowed to
create a mount in the mount namespace, and a test to see if
the user is allowed to access the specified filesystem.

The automount case is special in that mounting the original filesystem
grants permission to mount the sub-filesystems, to any user who
happens to stumble across the their mountpoint and satisfies the
ordinary filesystem permission checks.

Attempting to handle the automount case by using override_creds
almost works.  It preserves the idea that permission to mount
the original filesystem is permission to mount the sub-filesystem.
Unfortunately using override_creds messes up the filesystems
ordinary permission checks.

Solve this by being explicit that a mount is a submount by introducing
vfs_submount, and using it where appropriate.

vfs_submount uses a new mount internal mount flags MS_SUBMOUNT, to let
sget and friends know that a mount is a submount so they can take appropriate
action.

sget and sget_userns are modified to not perform any permission checks
on submounts.

follow_automount is modified to stop using override_creds as that
has proven problemantic.

do_mount is modified to always remove the new MS_SUBMOUNT flag so
that we know userspace will never by able to specify it.

autofs4 is modified to stop using current_real_cred that was put in
there to handle the previous version of submount permission checking.

cifs is modified to pass the mountpoint all of the way down to vfs_submount.

debugfs is modified to pass the mountpoint all of the way down to
trace_automount by adding a new parameter.  To make this change easier
a new typedef debugfs_automount_t is introduced to capture the type of
the debugfs automount function.

Cc: stable@vger.kernel.org
Fixes: 069d5ac9ae0d ("autofs:  Fix automounts by using current_real_cred()->uid")
Fixes: aeaa4a79ff6a ("fs: Call d_automount with the filesystems creds")
Reviewed-by: Trond Myklebust <trond.myklebust@primarydata.com>
Reviewed-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/afs/mntpt.c          |  2 +-
 fs/autofs4/waitq.c      |  4 ++--
 fs/cifs/cifs_dfs_ref.c  |  7 ++++---
 fs/debugfs/inode.c      |  8 ++++----
 fs/namei.c              |  3 ---
 fs/namespace.c          | 17 ++++++++++++++++-
 fs/nfs/namespace.c      |  2 +-
 fs/nfs/nfs4namespace.c  |  2 +-
 fs/super.c              | 13 ++++++++++---
 include/linux/debugfs.h |  3 ++-
 include/linux/mount.h   |  3 +++
 include/uapi/linux/fs.h |  1 +
 kernel/trace/trace.c    |  4 ++--
 13 files changed, 47 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 81dd075356b9..d4fb0afc0097 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -202,7 +202,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 
 	/* try and do the mount */
 	_debug("--- attempting mount %s -o %s ---", devname, options);
-	mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options);
+	mnt = vfs_submount(mntpt, &afs_fs_type, devname, options);
 	_debug("--- mount result %p ---", mnt);
 
 	free_page((unsigned long) devname);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 1278335ce366..79fbd85db4ba 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -436,8 +436,8 @@ int autofs4_wait(struct autofs_sb_info *sbi,
 		memcpy(&wq->name, &qstr, sizeof(struct qstr));
 		wq->dev = autofs4_get_dev(sbi);
 		wq->ino = autofs4_get_ino(sbi);
-		wq->uid = current_real_cred()->uid;
-		wq->gid = current_real_cred()->gid;
+		wq->uid = current_cred()->uid;
+		wq->gid = current_cred()->gid;
 		wq->pid = pid;
 		wq->tgid = tgid;
 		wq->status = -EINTR; /* Status return if interrupted */
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index ec9dbbcca3b9..9156be545b0f 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -245,7 +245,8 @@ compose_mount_options_err:
  * @fullpath:		full path in UNC format
  * @ref:		server's referral
  */
-static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
+static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt,
+		struct cifs_sb_info *cifs_sb,
 		const char *fullpath, const struct dfs_info3_param *ref)
 {
 	struct vfsmount *mnt;
@@ -259,7 +260,7 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
 	if (IS_ERR(mountdata))
 		return (struct vfsmount *)mountdata;
 
-	mnt = vfs_kern_mount(&cifs_fs_type, 0, devname, mountdata);
+	mnt = vfs_submount(mntpt, &cifs_fs_type, devname, mountdata);
 	kfree(mountdata);
 	kfree(devname);
 	return mnt;
@@ -334,7 +335,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
 			mnt = ERR_PTR(-EINVAL);
 			break;
 		}
-		mnt = cifs_dfs_do_refmount(cifs_sb,
+		mnt = cifs_dfs_do_refmount(mntpt, cifs_sb,
 				full_path, referrals + i);
 		cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n",
 			 __func__, referrals[i].node_name, mnt);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index f17fcf89e18e..1e30f74a9527 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -187,9 +187,9 @@ static const struct super_operations debugfs_super_operations = {
 
 static struct vfsmount *debugfs_automount(struct path *path)
 {
-	struct vfsmount *(*f)(void *);
-	f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata;
-	return f(d_inode(path->dentry)->i_private);
+	debugfs_automount_t f;
+	f = (debugfs_automount_t)path->dentry->d_fsdata;
+	return f(path->dentry, d_inode(path->dentry)->i_private);
 }
 
 static const struct dentry_operations debugfs_dops = {
@@ -504,7 +504,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir);
  */
 struct dentry *debugfs_create_automount(const char *name,
 					struct dentry *parent,
-					struct vfsmount *(*f)(void *),
+					debugfs_automount_t f,
 					void *data)
 {
 	struct dentry *dentry = start_creating(name, parent);
diff --git a/fs/namei.c b/fs/namei.c
index 6fa3e9138fe4..da689c9c005e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1100,7 +1100,6 @@ static int follow_automount(struct path *path, struct nameidata *nd,
 			    bool *need_mntput)
 {
 	struct vfsmount *mnt;
-	const struct cred *old_cred;
 	int err;
 
 	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
@@ -1129,9 +1128,7 @@ static int follow_automount(struct path *path, struct nameidata *nd,
 	if (nd->total_link_count >= 40)
 		return -ELOOP;
 
-	old_cred = override_creds(&init_cred);
 	mnt = path->dentry->d_op->d_automount(path);
-	revert_creds(old_cred);
 	if (IS_ERR(mnt)) {
 		/*
 		 * The filesystem is allowed to return -EISDIR here to indicate
diff --git a/fs/namespace.c b/fs/namespace.c
index 487ba30bb5c6..089a6b23135a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -989,6 +989,21 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 }
 EXPORT_SYMBOL_GPL(vfs_kern_mount);
 
+struct vfsmount *
+vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
+	     const char *name, void *data)
+{
+	/* Until it is worked out how to pass the user namespace
+	 * through from the parent mount to the submount don't support
+	 * unprivileged mounts with submounts.
+	 */
+	if (mountpoint->d_sb->s_user_ns != &init_user_ns)
+		return ERR_PTR(-EPERM);
+
+	return vfs_kern_mount(type, MS_SUBMOUNT, name, data);
+}
+EXPORT_SYMBOL_GPL(vfs_submount);
+
 static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 					int flag)
 {
@@ -2794,7 +2809,7 @@ long do_mount(const char *dev_name, const char __user *dir_name,
 
 	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
 		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
-		   MS_STRICTATIME | MS_NOREMOTELOCK);
+		   MS_STRICTATIME | MS_NOREMOTELOCK | MS_SUBMOUNT);
 
 	if (flags & MS_REMOUNT)
 		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 5551e8ef67fd..e49d831c4e85 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -226,7 +226,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
 					   const char *devname,
 					   struct nfs_clone_mount *mountdata)
 {
-	return vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata);
+	return vfs_submount(mountdata->dentry, &nfs_xdev_fs_type, devname, mountdata);
 }
 
 /**
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index d21104912676..d8b040bd9814 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -279,7 +279,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
 				mountdata->hostname,
 				mountdata->mnt_path);
 
-		mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata);
+		mnt = vfs_submount(mountdata->dentry, &nfs4_referral_fs_type, page, mountdata);
 		if (!IS_ERR(mnt))
 			break;
 	}
diff --git a/fs/super.c b/fs/super.c
index 1709ed029a2c..4185844f7a12 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -469,7 +469,7 @@ struct super_block *sget_userns(struct file_system_type *type,
 	struct super_block *old;
 	int err;
 
-	if (!(flags & MS_KERNMOUNT) &&
+	if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) &&
 	    !(type->fs_flags & FS_USERNS_MOUNT) &&
 	    !capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
@@ -499,7 +499,7 @@ retry:
 	}
 	if (!s) {
 		spin_unlock(&sb_lock);
-		s = alloc_super(type, flags, user_ns);
+		s = alloc_super(type, (flags & ~MS_SUBMOUNT), user_ns);
 		if (!s)
 			return ERR_PTR(-ENOMEM);
 		goto retry;
@@ -540,8 +540,15 @@ struct super_block *sget(struct file_system_type *type,
 {
 	struct user_namespace *user_ns = current_user_ns();
 
+	/* We don't yet pass the user namespace of the parent
+	 * mount through to here so always use &init_user_ns
+	 * until that changes.
+	 */
+	if (flags & MS_SUBMOUNT)
+		user_ns = &init_user_ns;
+
 	/* Ensure the requestor has permissions over the target filesystem */
-	if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN))
+	if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) && !ns_capable(user_ns, CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 
 	return sget_userns(type, test, set, flags, user_ns, data);
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index 014cc564d1c4..233006be30aa 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -97,9 +97,10 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent);
 struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
 				      const char *dest);
 
+typedef struct vfsmount *(*debugfs_automount_t)(struct dentry *, void *);
 struct dentry *debugfs_create_automount(const char *name,
 					struct dentry *parent,
-					struct vfsmount *(*f)(void *),
+					debugfs_automount_t f,
 					void *data);
 
 void debugfs_remove(struct dentry *dentry);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index c6f55158d5e5..8e0352af06b7 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -90,6 +90,9 @@ struct file_system_type;
 extern struct vfsmount *vfs_kern_mount(struct file_system_type *type,
 				      int flags, const char *name,
 				      void *data);
+extern struct vfsmount *vfs_submount(const struct dentry *mountpoint,
+				     struct file_system_type *type,
+				     const char *name, void *data);
 
 extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list);
 extern void mark_mounts_for_expiry(struct list_head *mounts);
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 36da93fbf188..048a85e9f017 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -132,6 +132,7 @@ struct inodes_stat_t {
 #define MS_LAZYTIME	(1<<25) /* Update the on-disk [acm]times lazily */
 
 /* These sb flags are internal to the kernel */
+#define MS_SUBMOUNT     (1<<26)
 #define MS_NOREMOTELOCK	(1<<27)
 #define MS_NOSEC	(1<<28)
 #define MS_BORN		(1<<29)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d7449783987a..310f0ea0d1a2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7503,7 +7503,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 	ftrace_init_tracefs(tr, d_tracer);
 }
 
-static struct vfsmount *trace_automount(void *ingore)
+static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
 {
 	struct vfsmount *mnt;
 	struct file_system_type *type;
@@ -7516,7 +7516,7 @@ static struct vfsmount *trace_automount(void *ingore)
 	type = get_fs_type("tracefs");
 	if (!type)
 		return NULL;
-	mnt = vfs_kern_mount(type, 0, "tracefs", NULL);
+	mnt = vfs_submount(mntpt, type, "tracefs", NULL);
 	put_filesystem(type);
 	if (IS_ERR(mnt))
 		return NULL;
-- 
cgit v1.2.3


From cb9c68363efb6d1f950ec55fb06e031ee70db5fc Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 23 Jan 2017 18:21:56 +0100
Subject: skbuff: add and use skb_nfct helper

Followup patch renames skb->nfct and changes its type so add a helper to
avoid intrusive rename change later.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/skbuff.h                         | 13 ++++++++++---
 include/net/netfilter/nf_conntrack_core.h      |  2 +-
 net/core/skbuff.c                              |  2 +-
 net/ipv4/netfilter/ipt_SYNPROXY.c              |  8 ++++----
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c   |  2 +-
 net/ipv4/netfilter/nf_defrag_ipv4.c            |  4 ++--
 net/ipv4/netfilter/nf_dup_ipv4.c               |  2 +-
 net/ipv6/netfilter/ip6t_SYNPROXY.c             |  8 ++++----
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c |  4 ++--
 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c      |  4 ++--
 net/netfilter/nf_conntrack_core.c              |  4 ++--
 net/netfilter/nf_nat_helper.c                  |  2 +-
 net/netfilter/xt_CT.c                          |  2 +-
 net/openvswitch/conntrack.c                    |  6 +++---
 net/sched/cls_flow.c                           |  2 +-
 15 files changed, 36 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b53c0cfd417e..276431e047af 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3553,6 +3553,15 @@ static inline void skb_remcsum_process(struct sk_buff *skb, void *ptr,
 	skb->csum = csum_add(skb->csum, delta);
 }
 
+static inline struct nf_conntrack *skb_nfct(const struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	return skb->nfct;
+#else
+	return NULL;
+#endif
+}
+
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 void nf_conntrack_destroy(struct nf_conntrack *nfct);
 static inline void nf_conntrack_put(struct nf_conntrack *nfct)
@@ -3652,9 +3661,7 @@ static inline bool skb_irq_freeable(const struct sk_buff *skb)
 #if IS_ENABLED(CONFIG_XFRM)
 		!skb->sp &&
 #endif
-#if IS_ENABLED(CONFIG_NF_CONNTRACK)
-		!skb->nfct &&
-#endif
+		!skb_nfct(skb) &&
 		!skb->_skb_refdst &&
 		!skb_has_frag_list(skb);
 }
diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index 62e17d1319ff..84ec7ca5f195 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -62,7 +62,7 @@ int __nf_conntrack_confirm(struct sk_buff *skb);
 /* Confirm a connection: returns NF_DROP if packet must be dropped. */
 static inline int nf_conntrack_confirm(struct sk_buff *skb)
 {
-	struct nf_conn *ct = (struct nf_conn *)skb->nfct;
+	struct nf_conn *ct = (struct nf_conn *)skb_nfct(skb);
 	int ret = NF_ACCEPT;
 
 	if (ct && !nf_ct_is_untracked(ct)) {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 5a03730fbc1a..cac3ebfb4b45 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -655,7 +655,7 @@ static void skb_release_head_state(struct sk_buff *skb)
 		skb->destructor(skb);
 	}
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
-	nf_conntrack_put(skb->nfct);
+	nf_conntrack_put(skb_nfct(skb));
 #endif
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 	nf_bridge_put(skb->nf_bridge);
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 30c0de53e254..a12d4f0aa674 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -107,8 +107,8 @@ synproxy_send_client_synack(struct net *net,
 
 	synproxy_build_options(nth, opts);
 
-	synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
-			  niph, nth, tcp_hdr_size);
+	synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+			  IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
 }
 
 static void
@@ -230,8 +230,8 @@ synproxy_send_client_ack(struct net *net,
 
 	synproxy_build_options(nth, opts);
 
-	synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
-			  niph, nth, tcp_hdr_size);
+	synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+			  IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
 }
 
 static bool
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 566afac98a88..478a025909fc 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -137,7 +137,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
 	enum ip_conntrack_info ctinfo;
 	struct nf_conntrack_zone tmp;
 
-	NF_CT_ASSERT(skb->nfct == NULL);
+	NF_CT_ASSERT(!skb_nfct(skb));
 	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
 
 	/* Are they talking about one of our connections? */
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 49bd6a54404f..346bf7ccac08 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -45,7 +45,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
 {
 	u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
-	if (skb->nfct) {
+	if (skb_nfct(skb)) {
 		enum ip_conntrack_info ctinfo;
 		const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
 
@@ -75,7 +75,7 @@ static unsigned int ipv4_conntrack_defrag(void *priv,
 #if !IS_ENABLED(CONFIG_NF_NAT)
 	/* Previously seen (loopback)?  Ignore.  Do this before
 	   fragment check. */
-	if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
+	if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb)))
 		return NF_ACCEPT;
 #endif
 #endif
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index a981ef7151ca..1a5e1f53ceaa 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -71,7 +71,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
 	nf_reset(skb);
 	skb->nfct     = &nf_ct_untracked_get()->ct_general;
 	skb->nfctinfo = IP_CT_NEW;
-	nf_conntrack_get(skb->nfct);
+	nf_conntrack_get(skb_nfct(skb));
 #endif
 	/*
 	 * If we are in PREROUTING/INPUT, decrease the TTL to mitigate potential
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index 98c8dd38575a..2dc01d2c6ec0 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -121,8 +121,8 @@ synproxy_send_client_synack(struct net *net,
 
 	synproxy_build_options(nth, opts);
 
-	synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
-			  niph, nth, tcp_hdr_size);
+	synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+			  IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
 }
 
 static void
@@ -244,8 +244,8 @@ synproxy_send_client_ack(struct net *net,
 
 	synproxy_build_options(nth, opts);
 
-	synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
-			  niph, nth, tcp_hdr_size);
+	synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+			  IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
 }
 
 static bool
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 44b9af3f813e..09f1661a4e88 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -153,7 +153,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
 	enum ip_conntrack_info ctinfo;
 	struct nf_conntrack_zone tmp;
 
-	NF_CT_ASSERT(skb->nfct == NULL);
+	NF_CT_ASSERT(!skb_nfct(skb));
 
 	/* Are they talking about one of our connections? */
 	if (!nf_ct_get_tuplepr(skb,
@@ -224,7 +224,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl,
 	    noct_valid_new[type]) {
 		skb->nfct = &nf_ct_untracked_get()->ct_general;
 		skb->nfctinfo = IP_CT_NEW;
-		nf_conntrack_get(skb->nfct);
+		nf_conntrack_get(skb_nfct(skb));
 		return NF_ACCEPT;
 	}
 
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index 8e0bdd058787..ada60d1a991b 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -37,7 +37,7 @@ static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
 {
 	u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
-	if (skb->nfct) {
+	if (skb_nfct(skb)) {
 		enum ip_conntrack_info ctinfo;
 		const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
 
@@ -61,7 +61,7 @@ static unsigned int ipv6_defrag(void *priv,
 
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 	/* Previously seen (loopback)?	*/
-	if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
+	if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb)))
 		return NF_ACCEPT;
 #endif
 
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index adb7af3a4c4c..78aebf0ee6e3 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1357,7 +1357,7 @@ repeat:
 		goto out;
 	}
 
-	NF_CT_ASSERT(skb->nfct);
+	NF_CT_ASSERT(skb_nfct(skb));
 
 	/* Decide what timeout policy we want to apply to this flow. */
 	timeouts = nf_ct_timeout_lookup(net, ct, l4proto);
@@ -1528,7 +1528,7 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
 	/* Attach to new skbuff, and increment count */
 	nskb->nfct = &ct->ct_general;
 	nskb->nfctinfo = ctinfo;
-	nf_conntrack_get(nskb->nfct);
+	nf_conntrack_get(skb_nfct(nskb));
 }
 
 /* Bring out ya dead! */
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
index 2840abb5bb99..211661cb2c90 100644
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -60,7 +60,7 @@ static void mangle_contents(struct sk_buff *skb,
 		__skb_trim(skb, skb->len + rep_len - match_len);
 	}
 
-	if (nf_ct_l3num((struct nf_conn *)skb->nfct) == NFPROTO_IPV4) {
+	if (nf_ct_l3num((struct nf_conn *)skb_nfct(skb)) == NFPROTO_IPV4) {
 		/* fix IP hdr checksum information */
 		ip_hdr(skb)->tot_len = htons(skb->len);
 		ip_send_check(ip_hdr(skb));
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 26b0bccfa0c5..cd7e29910ae1 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -415,7 +415,7 @@ notrack_tg(struct sk_buff *skb, const struct xt_action_param *par)
 
 	skb->nfct = &nf_ct_untracked_get()->ct_general;
 	skb->nfctinfo = IP_CT_NEW;
-	nf_conntrack_get(skb->nfct);
+	nf_conntrack_get(skb_nfct(skb));
 
 	return XT_CONTINUE;
 }
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 6b78bab27755..452557946147 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -721,8 +721,8 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 
 		/* Associate skb with specified zone. */
 		if (tmpl) {
-			if (skb->nfct)
-				nf_conntrack_put(skb->nfct);
+			if (skb_nfct(skb))
+				nf_conntrack_put(skb_nfct(skb));
 			nf_conntrack_get(&tmpl->ct_general);
 			skb->nfct = &tmpl->ct_general;
 			skb->nfctinfo = IP_CT_NEW;
@@ -819,7 +819,7 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 		if (err)
 			return err;
 
-		ct = (struct nf_conn *)skb->nfct;
+		ct = (struct nf_conn *)skb_nfct(skb);
 		if (ct)
 			nf_ct_deliver_cached_events(ct);
 	}
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 6575aba87630..3d6b9286c203 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -129,7 +129,7 @@ static u32 flow_get_mark(const struct sk_buff *skb)
 static u32 flow_get_nfct(const struct sk_buff *skb)
 {
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
-	return addr_fold(skb->nfct);
+	return addr_fold(skb_nfct(skb));
 #else
 	return 0;
 #endif
-- 
cgit v1.2.3


From a9e419dc7be6997409dca6d1b9daf3cc7046902f Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 23 Jan 2017 18:21:59 +0100
Subject: netfilter: merge ctinfo into nfct pointer storage area

After this change conntrack operations (lookup, creation, matching from
ruleset) only access one instead of two sk_buff cache lines.

This works for normal conntracks because those are allocated from a slab
that guarantees hw cacheline or 8byte alignment (whatever is larger)
so the 3 bits needed for ctinfo won't overlap with nf_conn addresses.

Template allocation now does manual address alignment (see previous change)
on arches that don't have sufficent kmalloc min alignment.

Some spots intentionally use skb->_nfct instead of skb_nfct() helpers,
this is to avoid undoing the skb_nfct() use when we remove untracked
conntrack object in the future.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/skbuff.h                  | 21 +++++++++------------
 include/net/netfilter/nf_conntrack.h    | 11 ++++++-----
 net/ipv6/netfilter/nf_dup_ipv6.c        |  2 +-
 net/netfilter/core.c                    |  2 +-
 net/netfilter/nf_conntrack_core.c       | 11 ++++++-----
 net/netfilter/nf_conntrack_standalone.c |  3 +++
 net/netfilter/xt_CT.c                   |  4 ++--
 7 files changed, 28 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 276431e047af..ac0bc085b139 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -585,7 +585,6 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
  *	@cloned: Head may be cloned (check refcnt to be sure)
  *	@ip_summed: Driver fed us an IP checksum
  *	@nohdr: Payload reference only, must not modify header
- *	@nfctinfo: Relationship of this skb to the connection
  *	@pkt_type: Packet class
  *	@fclone: skbuff clone status
  *	@ipvs_property: skbuff is owned by ipvs
@@ -594,7 +593,7 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
  *	@nf_trace: netfilter packet trace flag
  *	@protocol: Packet protocol from driver
  *	@destructor: Destruct function
- *	@nfct: Associated connection, if any
+ *	@_nfct: Associated connection, if any (with nfctinfo bits)
  *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
  *	@skb_iif: ifindex of device we arrived on
  *	@tc_index: Traffic control index
@@ -668,7 +667,7 @@ struct sk_buff {
 	struct	sec_path	*sp;
 #endif
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
-	struct nf_conntrack	*nfct;
+	unsigned long		 _nfct;
 #endif
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 	struct nf_bridge_info	*nf_bridge;
@@ -721,7 +720,6 @@ struct sk_buff {
 	__u8			pkt_type:3;
 	__u8			pfmemalloc:1;
 	__u8			ignore_df:1;
-	__u8			nfctinfo:3;
 
 	__u8			nf_trace:1;
 	__u8			ip_summed:2;
@@ -836,6 +834,7 @@ static inline bool skb_pfmemalloc(const struct sk_buff *skb)
 #define SKB_DST_NOREF	1UL
 #define SKB_DST_PTRMASK	~(SKB_DST_NOREF)
 
+#define SKB_NFCT_PTRMASK	~(7UL)
 /**
  * skb_dst - returns skb dst_entry
  * @skb: buffer
@@ -3556,7 +3555,7 @@ static inline void skb_remcsum_process(struct sk_buff *skb, void *ptr,
 static inline struct nf_conntrack *skb_nfct(const struct sk_buff *skb)
 {
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
-	return skb->nfct;
+	return (void *)(skb->_nfct & SKB_NFCT_PTRMASK);
 #else
 	return NULL;
 #endif
@@ -3590,8 +3589,8 @@ static inline void nf_bridge_get(struct nf_bridge_info *nf_bridge)
 static inline void nf_reset(struct sk_buff *skb)
 {
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
-	nf_conntrack_put(skb->nfct);
-	skb->nfct = NULL;
+	nf_conntrack_put(skb_nfct(skb));
+	skb->_nfct = 0;
 #endif
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 	nf_bridge_put(skb->nf_bridge);
@@ -3611,10 +3610,8 @@ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src,
 			     bool copy)
 {
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
-	dst->nfct = src->nfct;
-	nf_conntrack_get(src->nfct);
-	if (copy)
-		dst->nfctinfo = src->nfctinfo;
+	dst->_nfct = src->_nfct;
+	nf_conntrack_get(skb_nfct(src));
 #endif
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 	dst->nf_bridge  = src->nf_bridge;
@@ -3629,7 +3626,7 @@ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src,
 static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src)
 {
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
-	nf_conntrack_put(dst->nfct);
+	nf_conntrack_put(skb_nfct(dst));
 #endif
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 	nf_bridge_put(dst->nf_bridge);
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 06d3d2d24fe0..f540f9ad2af4 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -76,7 +76,7 @@ struct nf_conn {
 	/* Usage count in here is 1 for hash table, 1 per skb,
 	 * plus 1 for any connection(s) we are `master' for
 	 *
-	 * Hint, SKB address this struct and refcnt via skb->nfct and
+	 * Hint, SKB address this struct and refcnt via skb->_nfct and
 	 * helpers nf_conntrack_get() and nf_conntrack_put().
 	 * Helper nf_ct_put() equals nf_conntrack_put() by dec refcnt,
 	 * beware nf_ct_get() is different and don't inc refcnt.
@@ -164,13 +164,15 @@ int nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
 			     const struct nf_conn *ignored_conntrack);
 
 #define NFCT_INFOMASK	7UL
+#define NFCT_PTRMASK	~(NFCT_INFOMASK)
 
 /* Return conntrack_info and tuple hash for given skb. */
 static inline struct nf_conn *
 nf_ct_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
 {
-	*ctinfo = skb->nfctinfo;
-	return (struct nf_conn *)skb->nfct;
+	*ctinfo = skb->_nfct & NFCT_INFOMASK;
+
+	return (struct nf_conn *)(skb->_nfct & NFCT_PTRMASK);
 }
 
 /* decrement reference count on a conntrack */
@@ -347,8 +349,7 @@ void nf_ct_tmpl_free(struct nf_conn *tmpl);
 static inline void
 nf_ct_set(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info info)
 {
-	skb->nfct = &ct->ct_general;
-	skb->nfctinfo = info;
+	skb->_nfct = (unsigned long)ct | info;
 }
 
 #define NF_CT_STAT_INC(net, count)	  __this_cpu_inc((net)->ct.stat->count)
diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c
index ff04f6a7f45b..888ecd106e5f 100644
--- a/net/ipv6/netfilter/nf_dup_ipv6.c
+++ b/net/ipv6/netfilter/nf_dup_ipv6.c
@@ -59,7 +59,7 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum,
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 	nf_reset(skb);
 	nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW);
-	nf_conntrack_get(skb->nfct);
+	nf_conntrack_get(skb_nfct(skb));
 #endif
 	if (hooknum == NF_INET_PRE_ROUTING ||
 	    hooknum == NF_INET_LOCAL_IN) {
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index ce6adfae521a..a87a6f8a74d8 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -375,7 +375,7 @@ void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb)
 {
 	void (*attach)(struct sk_buff *, const struct sk_buff *);
 
-	if (skb->nfct) {
+	if (skb->_nfct) {
 		rcu_read_lock();
 		attach = rcu_dereference(ip_ct_attach);
 		if (attach)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 768968fba7f6..47c4ea53daa6 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1239,7 +1239,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 	return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
 }
 
-/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
+/* On success, returns conntrack ptr, sets skb->_nfct | ctinfo */
 static inline struct nf_conn *
 resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
 		  struct sk_buff *skb,
@@ -1323,7 +1323,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 			NF_CT_STAT_INC_ATOMIC(net, ignore);
 			return NF_ACCEPT;
 		}
-		skb->nfct = NULL;
+		skb->_nfct = 0;
 	}
 
 	/* rcu_read_lock()ed by nf_hook_thresh */
@@ -1352,7 +1352,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 			goto out;
 		}
 		/* ICMP[v6] protocol trackers may assign one conntrack. */
-		if (skb->nfct)
+		if (skb->_nfct)
 			goto out;
 	}
 repeat:
@@ -1383,7 +1383,7 @@ repeat:
 		 * the netfilter core what to do */
 		pr_debug("nf_conntrack_in: Can't track with proto module\n");
 		nf_conntrack_put(&ct->ct_general);
-		skb->nfct = NULL;
+		skb->_nfct = 0;
 		NF_CT_STAT_INC_ATOMIC(net, invalid);
 		if (ret == -NF_DROP)
 			NF_CT_STAT_INC_ATOMIC(net, drop);
@@ -1878,7 +1878,8 @@ int nf_conntrack_init_start(void)
 	nf_conntrack_max = max_factor * nf_conntrack_htable_size;
 
 	nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
-						sizeof(struct nf_conn), 0,
+						sizeof(struct nf_conn),
+						NFCT_INFOMASK + 1,
 						SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
 	if (!nf_conntrack_cachep)
 		goto err_cachep;
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index d009ae663453..2256147dcaad 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -642,6 +642,9 @@ static int __init nf_conntrack_standalone_init(void)
 	if (ret < 0)
 		goto out_start;
 
+	BUILD_BUG_ON(SKB_NFCT_PTRMASK != NFCT_PTRMASK);
+	BUILD_BUG_ON(NFCT_INFOMASK <= IP_CT_NUMBER);
+
 #ifdef CONFIG_SYSCTL
 	nf_ct_netfilter_header =
 		register_net_sysctl(&init_net, "net", nf_ct_netfilter_table);
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 51f00e1e1208..b008db0184b8 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -23,7 +23,7 @@
 static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct)
 {
 	/* Previously seen (loopback)? Ignore. */
-	if (skb->nfct != NULL)
+	if (skb->_nfct != 0)
 		return XT_CONTINUE;
 
 	/* special case the untracked ct : we want the percpu object */
@@ -409,7 +409,7 @@ static unsigned int
 notrack_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	/* Previously seen (loopback)? Ignore. */
-	if (skb->nfct != NULL)
+	if (skb->_nfct != 0)
 		return XT_CONTINUE;
 
 	nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW);
-- 
cgit v1.2.3


From f44f1ab5a2dcd4e16eab850fd08e40ff2d0c28d4 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 2 Feb 2017 15:56:49 +0100
Subject: block: Unhash block device inodes on gendisk destruction

Currently, block device inodes stay around after corresponding gendisk
hash died until memory reclaim finds them and frees them. Since we will
make block device inode pin the bdi, we want to free the block device
inode as soon as the device goes away so that bdi does not stay around
unnecessarily. Furthermore we need to avoid issues when new device with
the same major,minor pair gets created since reusing the bdi structure
would be rather difficult in this case.

Unhashing block device inode on gendisk destruction nicely deals with
these problems. Once last block device inode reference is dropped (which
may be directly in del_gendisk()), the inode gets evicted. Furthermore if
the major,minor pair gets reallocated, we are guaranteed to get new
block device inode even if old block device inode is not yet evicted and
thus we avoid issues with possible reuse of bdi.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/genhd.c      |  2 ++
 fs/block_dev.c     | 15 +++++++++++++++
 include/linux/fs.h |  1 +
 3 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/block/genhd.c b/block/genhd.c
index fcd6d4fae657..f2f22d0e8e14 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -648,6 +648,8 @@ void del_gendisk(struct gendisk *disk)
 	disk_part_iter_init(&piter, disk,
 			     DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
 	while ((part = disk_part_iter_next(&piter))) {
+		bdev_unhash_inode(MKDEV(disk->major,
+					disk->first_minor + part->partno));
 		invalidate_partition(disk, part->partno);
 		delete_partition(disk, part->partno);
 	}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 5db5d1340d69..ed6a34be7a1e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -954,6 +954,21 @@ static int bdev_set(struct inode *inode, void *data)
 
 static LIST_HEAD(all_bdevs);
 
+/*
+ * If there is a bdev inode for this device, unhash it so that it gets evicted
+ * as soon as last inode reference is dropped.
+ */
+void bdev_unhash_inode(dev_t dev)
+{
+	struct inode *inode;
+
+	inode = ilookup5(blockdev_superblock, hash(dev), bdev_test, &dev);
+	if (inode) {
+		remove_inode_hash(inode);
+		iput(inode);
+	}
+}
+
 struct block_device *bdget(dev_t dev)
 {
 	struct block_device *bdev;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2ba074328894..702cb6c50194 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2342,6 +2342,7 @@ extern struct kmem_cache *names_cachep;
 #ifdef CONFIG_BLOCK
 extern int register_blkdev(unsigned int, const char *);
 extern void unregister_blkdev(unsigned int, const char *);
+extern void bdev_unhash_inode(dev_t dev);
 extern struct block_device *bdget(dev_t);
 extern struct block_device *bdgrab(struct block_device *bdev);
 extern void bd_set_size(struct block_device *, loff_t size);
-- 
cgit v1.2.3


From dc3b17cc8bf21307c7e076e7c778d5db756f7871 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 2 Feb 2017 15:56:50 +0100
Subject: block: Use pointer to backing_dev_info from request_queue

We will want to have struct backing_dev_info allocated separately from
struct request_queue. As the first step add pointer to backing_dev_info
to request_queue and convert all users touching it. No functional
changes in this patch.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c             |  6 +++---
 block/blk-core.c               | 27 ++++++++++++++-------------
 block/blk-integrity.c          |  4 ++--
 block/blk-settings.c           |  2 +-
 block/blk-sysfs.c              |  8 ++++----
 block/blk-wbt.c                |  8 ++++----
 block/genhd.c                  |  2 +-
 drivers/block/aoe/aoeblk.c     |  4 ++--
 drivers/block/drbd/drbd_main.c |  6 +++---
 drivers/block/drbd/drbd_nl.c   | 12 +++++++-----
 drivers/block/drbd/drbd_proc.c |  2 +-
 drivers/block/drbd/drbd_req.c  |  2 +-
 drivers/block/pktcdvd.c        |  4 ++--
 drivers/block/rbd.c            |  2 +-
 drivers/md/bcache/request.c    | 10 +++++-----
 drivers/md/bcache/super.c      |  8 ++++----
 drivers/md/dm-cache-target.c   |  2 +-
 drivers/md/dm-era-target.c     |  2 +-
 drivers/md/dm-table.c          |  2 +-
 drivers/md/dm-thin.c           |  2 +-
 drivers/md/dm.c                |  6 +++---
 drivers/md/linear.c            |  2 +-
 drivers/md/md.c                |  6 +++---
 drivers/md/multipath.c         |  2 +-
 drivers/md/raid0.c             |  6 +++---
 drivers/md/raid1.c             |  4 ++--
 drivers/md/raid10.c            | 10 +++++-----
 drivers/md/raid5.c             | 12 ++++++------
 fs/gfs2/ops_fstype.c           |  2 +-
 fs/nilfs2/super.c              |  2 +-
 fs/super.c                     |  2 +-
 include/linux/blkdev.h         |  3 ++-
 mm/page-writeback.c            |  4 ++--
 33 files changed, 90 insertions(+), 86 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index fb59a3edc778..37fe595cfd70 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -184,7 +184,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 		goto err_free_blkg;
 	}
 
-	wb_congested = wb_congested_get_create(&q->backing_dev_info,
+	wb_congested = wb_congested_get_create(q->backing_dev_info,
 					       blkcg->css.id,
 					       GFP_NOWAIT | __GFP_NOWARN);
 	if (!wb_congested) {
@@ -469,8 +469,8 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
 const char *blkg_dev_name(struct blkcg_gq *blkg)
 {
 	/* some drivers (floppy) instantiate a queue w/o disk registered */
-	if (blkg->q->backing_dev_info.dev)
-		return dev_name(blkg->q->backing_dev_info.dev);
+	if (blkg->q->backing_dev_info->dev)
+		return dev_name(blkg->q->backing_dev_info->dev);
 	return NULL;
 }
 EXPORT_SYMBOL_GPL(blkg_dev_name);
diff --git a/block/blk-core.c b/block/blk-core.c
index 3266daaa343f..dcac0352c14c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -75,7 +75,7 @@ static void blk_clear_congested(struct request_list *rl, int sync)
 	 * flip its congestion state for events on other blkcgs.
 	 */
 	if (rl == &rl->q->root_rl)
-		clear_wb_congested(rl->q->backing_dev_info.wb.congested, sync);
+		clear_wb_congested(rl->q->backing_dev_info->wb.congested, sync);
 #endif
 }
 
@@ -86,7 +86,7 @@ static void blk_set_congested(struct request_list *rl, int sync)
 #else
 	/* see blk_clear_congested() */
 	if (rl == &rl->q->root_rl)
-		set_wb_congested(rl->q->backing_dev_info.wb.congested, sync);
+		set_wb_congested(rl->q->backing_dev_info->wb.congested, sync);
 #endif
 }
 
@@ -117,7 +117,7 @@ struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
 
-	return &q->backing_dev_info;
+	return q->backing_dev_info;
 }
 EXPORT_SYMBOL(blk_get_backing_dev_info);
 
@@ -575,7 +575,7 @@ void blk_cleanup_queue(struct request_queue *q)
 	blk_flush_integrity();
 
 	/* @q won't process any more request, flush async actions */
-	del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
+	del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer);
 	blk_sync_queue(q);
 
 	if (q->mq_ops)
@@ -587,7 +587,7 @@ void blk_cleanup_queue(struct request_queue *q)
 		q->queue_lock = &q->__queue_lock;
 	spin_unlock_irq(lock);
 
-	bdi_unregister(&q->backing_dev_info);
+	bdi_unregister(q->backing_dev_info);
 
 	/* @q is and will stay empty, shutdown and put */
 	blk_put_queue(q);
@@ -728,17 +728,18 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	if (!q->bio_split)
 		goto fail_id;
 
-	q->backing_dev_info.ra_pages =
+	q->backing_dev_info = &q->_backing_dev_info;
+	q->backing_dev_info->ra_pages =
 			(VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
-	q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK;
-	q->backing_dev_info.name = "block";
+	q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
+	q->backing_dev_info->name = "block";
 	q->node = node_id;
 
-	err = bdi_init(&q->backing_dev_info);
+	err = bdi_init(q->backing_dev_info);
 	if (err)
 		goto fail_split;
 
-	setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
+	setup_timer(&q->backing_dev_info->laptop_mode_wb_timer,
 		    laptop_mode_timer_fn, (unsigned long) q);
 	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
 	INIT_LIST_HEAD(&q->queue_head);
@@ -788,7 +789,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 fail_ref:
 	percpu_ref_exit(&q->q_usage_counter);
 fail_bdi:
-	bdi_destroy(&q->backing_dev_info);
+	bdi_destroy(q->backing_dev_info);
 fail_split:
 	bioset_free(q->bio_split);
 fail_id:
@@ -1182,7 +1183,7 @@ fail_elvpriv:
 	 * disturb iosched and blkcg but weird is bettern than dead.
 	 */
 	printk_ratelimited(KERN_WARNING "%s: dev %s: request aux data allocation failed, iosched may be disturbed\n",
-			   __func__, dev_name(q->backing_dev_info.dev));
+			   __func__, dev_name(q->backing_dev_info->dev));
 
 	rq->rq_flags &= ~RQF_ELVPRIV;
 	rq->elv.icq = NULL;
@@ -2659,7 +2660,7 @@ void blk_finish_request(struct request *req, int error)
 	BUG_ON(blk_queued_rq(req));
 
 	if (unlikely(laptop_mode) && !blk_rq_is_passthrough(req))
-		laptop_io_completion(&req->q->backing_dev_info);
+		laptop_io_completion(req->q->backing_dev_info);
 
 	blk_delete_timer(req);
 
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index d69c5c79f98e..9f0ff5ba4f84 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -443,10 +443,10 @@ void blk_integrity_revalidate(struct gendisk *disk)
 		return;
 
 	if (bi->profile)
-		disk->queue->backing_dev_info.capabilities |=
+		disk->queue->backing_dev_info->capabilities |=
 			BDI_CAP_STABLE_WRITES;
 	else
-		disk->queue->backing_dev_info.capabilities &=
+		disk->queue->backing_dev_info->capabilities &=
 			~BDI_CAP_STABLE_WRITES;
 }
 
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 529e55f52a03..6eb19bcbf3cb 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -253,7 +253,7 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto
 	max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors);
 	max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS);
 	limits->max_sectors = max_sectors;
-	q->backing_dev_info.io_pages = max_sectors >> (PAGE_SHIFT - 9);
+	q->backing_dev_info->io_pages = max_sectors >> (PAGE_SHIFT - 9);
 }
 EXPORT_SYMBOL(blk_queue_max_hw_sectors);
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 894f77342fd4..05841be1f30f 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -89,7 +89,7 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
 
 static ssize_t queue_ra_show(struct request_queue *q, char *page)
 {
-	unsigned long ra_kb = q->backing_dev_info.ra_pages <<
+	unsigned long ra_kb = q->backing_dev_info->ra_pages <<
 					(PAGE_SHIFT - 10);
 
 	return queue_var_show(ra_kb, (page));
@@ -104,7 +104,7 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count)
 	if (ret < 0)
 		return ret;
 
-	q->backing_dev_info.ra_pages = ra_kb >> (PAGE_SHIFT - 10);
+	q->backing_dev_info->ra_pages = ra_kb >> (PAGE_SHIFT - 10);
 
 	return ret;
 }
@@ -236,7 +236,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 
 	spin_lock_irq(q->queue_lock);
 	q->limits.max_sectors = max_sectors_kb << 1;
-	q->backing_dev_info.io_pages = max_sectors_kb >> (PAGE_SHIFT - 10);
+	q->backing_dev_info->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10);
 	spin_unlock_irq(q->queue_lock);
 
 	return ret;
@@ -799,7 +799,7 @@ static void blk_release_queue(struct kobject *kobj)
 		container_of(kobj, struct request_queue, kobj);
 
 	wbt_exit(q);
-	bdi_exit(&q->backing_dev_info);
+	bdi_exit(q->backing_dev_info);
 	blkcg_exit_queue(q);
 
 	if (q->elevator) {
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index f0a9c07b4c7a..1aedb1f7ee0c 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -96,7 +96,7 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
  */
 static bool wb_recent_wait(struct rq_wb *rwb)
 {
-	struct bdi_writeback *wb = &rwb->queue->backing_dev_info.wb;
+	struct bdi_writeback *wb = &rwb->queue->backing_dev_info->wb;
 
 	return time_before(jiffies, wb->dirty_sleep + HZ);
 }
@@ -279,7 +279,7 @@ enum {
 
 static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
 {
-	struct backing_dev_info *bdi = &rwb->queue->backing_dev_info;
+	struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
 	u64 thislat;
 
 	/*
@@ -339,7 +339,7 @@ static int latency_exceeded(struct rq_wb *rwb)
 
 static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
 {
-	struct backing_dev_info *bdi = &rwb->queue->backing_dev_info;
+	struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
 
 	trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
 			rwb->wb_background, rwb->wb_normal, rwb->wb_max);
@@ -423,7 +423,7 @@ static void wb_timer_fn(unsigned long data)
 
 	status = latency_exceeded(rwb);
 
-	trace_wbt_timer(&rwb->queue->backing_dev_info, status, rwb->scale_step,
+	trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step,
 			inflight);
 
 	/*
diff --git a/block/genhd.c b/block/genhd.c
index f2f22d0e8e14..d9ccd42f3675 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -613,7 +613,7 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
 	disk_alloc_events(disk);
 
 	/* Register BDI before referencing it from bdev */
-	bdi = &disk->queue->backing_dev_info;
+	bdi = disk->queue->backing_dev_info;
 	bdi_register_owner(bdi, disk_to_dev(disk));
 
 	blk_register_region(disk_devt(disk), disk->minors, NULL,
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index ec9d8610b25f..027b876370bc 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -396,8 +396,8 @@ aoeblk_gdalloc(void *vp)
 	WARN_ON(d->gd);
 	WARN_ON(d->flags & DEVFL_UP);
 	blk_queue_max_hw_sectors(q, BLK_DEF_MAX_SECTORS);
-	q->backing_dev_info.name = "aoe";
-	q->backing_dev_info.ra_pages = READ_AHEAD / PAGE_SIZE;
+	q->backing_dev_info->name = "aoe";
+	q->backing_dev_info->ra_pages = READ_AHEAD / PAGE_SIZE;
 	d->bufpool = mp;
 	d->blkq = gd->queue = q;
 	q->queuedata = d;
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 83482721bc01..d305f05be648 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2462,7 +2462,7 @@ static int drbd_congested(void *congested_data, int bdi_bits)
 
 	if (get_ldev(device)) {
 		q = bdev_get_queue(device->ldev->backing_bdev);
-		r = bdi_congested(&q->backing_dev_info, bdi_bits);
+		r = bdi_congested(q->backing_dev_info, bdi_bits);
 		put_ldev(device);
 		if (r)
 			reason = 'b';
@@ -2834,8 +2834,8 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
 	/* we have no partitions. we contain only ourselves. */
 	device->this_bdev->bd_contains = device->this_bdev;
 
-	q->backing_dev_info.congested_fn = drbd_congested;
-	q->backing_dev_info.congested_data = device;
+	q->backing_dev_info->congested_fn = drbd_congested;
+	q->backing_dev_info->congested_data = device;
 
 	blk_queue_make_request(q, drbd_make_request);
 	blk_queue_write_cache(q, true, true);
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index f35db29cac76..908c704e20aa 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1328,11 +1328,13 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi
 	if (b) {
 		blk_queue_stack_limits(q, b);
 
-		if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
+		if (q->backing_dev_info->ra_pages !=
+		    b->backing_dev_info->ra_pages) {
 			drbd_info(device, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
-				 q->backing_dev_info.ra_pages,
-				 b->backing_dev_info.ra_pages);
-			q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
+				 q->backing_dev_info->ra_pages,
+				 b->backing_dev_info->ra_pages);
+			q->backing_dev_info->ra_pages =
+						b->backing_dev_info->ra_pages;
 		}
 	}
 	fixup_discard_if_not_supported(q);
@@ -3345,7 +3347,7 @@ static void device_to_statistics(struct device_statistics *s,
 		s->dev_disk_flags = md->flags;
 		q = bdev_get_queue(device->ldev->backing_bdev);
 		s->dev_lower_blocked =
-			bdi_congested(&q->backing_dev_info,
+			bdi_congested(q->backing_dev_info,
 				      (1 << WB_async_congested) |
 				      (1 << WB_sync_congested));
 		put_ldev(device);
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index be2b93fd2c11..8378142f7a55 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -288,7 +288,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
 			seq_printf(seq, "%2d: cs:Unconfigured\n", i);
 		} else {
 			/* reset device->congestion_reason */
-			bdi_rw_congested(&device->rq_queue->backing_dev_info);
+			bdi_rw_congested(device->rq_queue->backing_dev_info);
 
 			nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
 			wp = nc ? nc->wire_protocol - DRBD_PROT_A + 'A' : ' ';
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index de279fe4e4fd..cb6bdb75d52d 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -938,7 +938,7 @@ static bool remote_due_to_read_balancing(struct drbd_device *device, sector_t se
 
 	switch (rbm) {
 	case RB_CONGESTED_REMOTE:
-		bdi = &device->ldev->backing_bdev->bd_disk->queue->backing_dev_info;
+		bdi = device->ldev->backing_bdev->bd_disk->queue->backing_dev_info;
 		return bdi_read_congested(bdi);
 	case RB_LEAST_PENDING:
 		return atomic_read(&device->local_cnt) >
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index f4bd95943dc2..66d846ba85a9 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -1243,7 +1243,7 @@ try_next_bio:
 	 		&& pd->bio_queue_size <= pd->write_congestion_off);
 	spin_unlock(&pd->lock);
 	if (wakeup) {
-		clear_bdi_congested(&pd->disk->queue->backing_dev_info,
+		clear_bdi_congested(pd->disk->queue->backing_dev_info,
 					BLK_RW_ASYNC);
 	}
 
@@ -2370,7 +2370,7 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio)
 	spin_lock(&pd->lock);
 	if (pd->write_congestion_on > 0
 	    && pd->bio_queue_size >= pd->write_congestion_on) {
-		set_bdi_congested(&q->backing_dev_info, BLK_RW_ASYNC);
+		set_bdi_congested(q->backing_dev_info, BLK_RW_ASYNC);
 		do {
 			spin_unlock(&pd->lock);
 			congestion_wait(BLK_RW_ASYNC, HZ);
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 4d78cf605b21..588721f30a22 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -4526,7 +4526,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 	q->limits.discard_zeroes_data = 1;
 
 	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
-		q->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES;
+		q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
 
 	disk->queue = q;
 
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 01035e718c1c..709c9cc34369 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -1009,7 +1009,7 @@ static int cached_dev_congested(void *data, int bits)
 	struct request_queue *q = bdev_get_queue(dc->bdev);
 	int ret = 0;
 
-	if (bdi_congested(&q->backing_dev_info, bits))
+	if (bdi_congested(q->backing_dev_info, bits))
 		return 1;
 
 	if (cached_dev_get(dc)) {
@@ -1018,7 +1018,7 @@ static int cached_dev_congested(void *data, int bits)
 
 		for_each_cache(ca, d->c, i) {
 			q = bdev_get_queue(ca->bdev);
-			ret |= bdi_congested(&q->backing_dev_info, bits);
+			ret |= bdi_congested(q->backing_dev_info, bits);
 		}
 
 		cached_dev_put(dc);
@@ -1032,7 +1032,7 @@ void bch_cached_dev_request_init(struct cached_dev *dc)
 	struct gendisk *g = dc->disk.disk;
 
 	g->queue->make_request_fn		= cached_dev_make_request;
-	g->queue->backing_dev_info.congested_fn = cached_dev_congested;
+	g->queue->backing_dev_info->congested_fn = cached_dev_congested;
 	dc->disk.cache_miss			= cached_dev_cache_miss;
 	dc->disk.ioctl				= cached_dev_ioctl;
 }
@@ -1125,7 +1125,7 @@ static int flash_dev_congested(void *data, int bits)
 
 	for_each_cache(ca, d->c, i) {
 		q = bdev_get_queue(ca->bdev);
-		ret |= bdi_congested(&q->backing_dev_info, bits);
+		ret |= bdi_congested(q->backing_dev_info, bits);
 	}
 
 	return ret;
@@ -1136,7 +1136,7 @@ void bch_flash_dev_request_init(struct bcache_device *d)
 	struct gendisk *g = d->disk;
 
 	g->queue->make_request_fn		= flash_dev_make_request;
-	g->queue->backing_dev_info.congested_fn = flash_dev_congested;
+	g->queue->backing_dev_info->congested_fn = flash_dev_congested;
 	d->cache_miss				= flash_dev_cache_miss;
 	d->ioctl				= flash_dev_ioctl;
 }
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 3a19cbc8b230..85e3f21c2514 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -807,7 +807,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
 	blk_queue_make_request(q, NULL);
 	d->disk->queue			= q;
 	q->queuedata			= d;
-	q->backing_dev_info.congested_data = d;
+	q->backing_dev_info->congested_data = d;
 	q->limits.max_hw_sectors	= UINT_MAX;
 	q->limits.max_sectors		= UINT_MAX;
 	q->limits.max_segment_size	= UINT_MAX;
@@ -1132,9 +1132,9 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
 	set_capacity(dc->disk.disk,
 		     dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
 
-	dc->disk.disk->queue->backing_dev_info.ra_pages =
-		max(dc->disk.disk->queue->backing_dev_info.ra_pages,
-		    q->backing_dev_info.ra_pages);
+	dc->disk.disk->queue->backing_dev_info->ra_pages =
+		max(dc->disk.disk->queue->backing_dev_info->ra_pages,
+		    q->backing_dev_info->ra_pages);
 
 	bch_cached_dev_request_init(dc);
 	bch_cached_dev_writeback_init(dc);
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 5b9cf56de8ef..894bc14469c8 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -2284,7 +2284,7 @@ static void do_waker(struct work_struct *ws)
 static int is_congested(struct dm_dev *dev, int bdi_bits)
 {
 	struct request_queue *q = bdev_get_queue(dev->bdev);
-	return bdi_congested(&q->backing_dev_info, bdi_bits);
+	return bdi_congested(q->backing_dev_info, bdi_bits);
 }
 
 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c
index bf2b2676cb8a..9fab33b113c4 100644
--- a/drivers/md/dm-era-target.c
+++ b/drivers/md/dm-era-target.c
@@ -1379,7 +1379,7 @@ static void stop_worker(struct era *era)
 static int dev_is_congested(struct dm_dev *dev, int bdi_bits)
 {
 	struct request_queue *q = bdev_get_queue(dev->bdev);
-	return bdi_congested(&q->backing_dev_info, bdi_bits);
+	return bdi_congested(q->backing_dev_info, bdi_bits);
 }
 
 static int era_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 0a427de23ed2..3ad16d9c9d5a 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1750,7 +1750,7 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
 		char b[BDEVNAME_SIZE];
 
 		if (likely(q))
-			r |= bdi_congested(&q->backing_dev_info, bdi_bits);
+			r |= bdi_congested(q->backing_dev_info, bdi_bits);
 		else
 			DMWARN_LIMIT("%s: any_congested: nonexistent device %s",
 				     dm_device_name(t->md),
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 110982db4b48..2b266a2b5035 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -2711,7 +2711,7 @@ static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
 		return 1;
 
 	q = bdev_get_queue(pt->data_dev->bdev);
-	return bdi_congested(&q->backing_dev_info, bdi_bits);
+	return bdi_congested(q->backing_dev_info, bdi_bits);
 }
 
 static void requeue_bios(struct pool *pool)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index ff4a29a97ad3..9e958bc94fed 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1313,7 +1313,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
 			 * With request-based DM we only need to check the
 			 * top-level queue for congestion.
 			 */
-			r = md->queue->backing_dev_info.wb.state & bdi_bits;
+			r = md->queue->backing_dev_info->wb.state & bdi_bits;
 		} else {
 			map = dm_get_live_table_fast(md);
 			if (map)
@@ -1396,7 +1396,7 @@ void dm_init_md_queue(struct mapped_device *md)
 	 * - must do so here (in alloc_dev callchain) before queue is used
 	 */
 	md->queue->queuedata = md;
-	md->queue->backing_dev_info.congested_data = md;
+	md->queue->backing_dev_info->congested_data = md;
 }
 
 void dm_init_normal_md_queue(struct mapped_device *md)
@@ -1407,7 +1407,7 @@ void dm_init_normal_md_queue(struct mapped_device *md)
 	/*
 	 * Initialize aspects of queue that aren't relevant for blk-mq
 	 */
-	md->queue->backing_dev_info.congested_fn = dm_any_congested;
+	md->queue->backing_dev_info->congested_fn = dm_any_congested;
 	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
 }
 
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 5975c9915684..f1c7bbac31a5 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -62,7 +62,7 @@ static int linear_congested(struct mddev *mddev, int bits)
 
 	for (i = 0; i < mddev->raid_disks && !ret ; i++) {
 		struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev);
-		ret |= bdi_congested(&q->backing_dev_info, bits);
+		ret |= bdi_congested(q->backing_dev_info, bits);
 	}
 
 	return ret;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 82821ee0d57f..ede3b2ad45db 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5341,8 +5341,8 @@ int md_run(struct mddev *mddev)
 			queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mddev->queue);
 		else
 			queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, mddev->queue);
-		mddev->queue->backing_dev_info.congested_data = mddev;
-		mddev->queue->backing_dev_info.congested_fn = md_congested;
+		mddev->queue->backing_dev_info->congested_data = mddev;
+		mddev->queue->backing_dev_info->congested_fn = md_congested;
 	}
 	if (pers->sync_request) {
 		if (mddev->kobj.sd &&
@@ -5699,7 +5699,7 @@ static int do_md_stop(struct mddev *mddev, int mode,
 
 		__md_stop_writes(mddev);
 		__md_stop(mddev);
-		mddev->queue->backing_dev_info.congested_fn = NULL;
+		mddev->queue->backing_dev_info->congested_fn = NULL;
 
 		/* tell userspace to handle 'inactive' */
 		sysfs_notify_dirent_safe(mddev->sysfs_state);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index aa8c4e5c1ee2..d457afa672d5 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -169,7 +169,7 @@ static int multipath_congested(struct mddev *mddev, int bits)
 		if (rdev && !test_bit(Faulty, &rdev->flags)) {
 			struct request_queue *q = bdev_get_queue(rdev->bdev);
 
-			ret |= bdi_congested(&q->backing_dev_info, bits);
+			ret |= bdi_congested(q->backing_dev_info, bits);
 			/* Just like multipath_map, we just check the
 			 * first available device
 			 */
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 848365d474f3..d6585239bff2 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -41,7 +41,7 @@ static int raid0_congested(struct mddev *mddev, int bits)
 	for (i = 0; i < raid_disks && !ret ; i++) {
 		struct request_queue *q = bdev_get_queue(devlist[i]->bdev);
 
-		ret |= bdi_congested(&q->backing_dev_info, bits);
+		ret |= bdi_congested(q->backing_dev_info, bits);
 	}
 	return ret;
 }
@@ -420,8 +420,8 @@ static int raid0_run(struct mddev *mddev)
 		 */
 		int stripe = mddev->raid_disks *
 			(mddev->chunk_sectors << 9) / PAGE_SIZE;
-		if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
-			mddev->queue->backing_dev_info.ra_pages = 2* stripe;
+		if (mddev->queue->backing_dev_info->ra_pages < 2* stripe)
+			mddev->queue->backing_dev_info->ra_pages = 2* stripe;
 	}
 
 	dump_zones(mddev);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 67b0365d49b5..830ff2b20346 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -744,9 +744,9 @@ static int raid1_congested(struct mddev *mddev, int bits)
 			 * non-congested targets, it can be removed
 			 */
 			if ((bits & (1 << WB_async_congested)) || 1)
-				ret |= bdi_congested(&q->backing_dev_info, bits);
+				ret |= bdi_congested(q->backing_dev_info, bits);
 			else
-				ret &= bdi_congested(&q->backing_dev_info, bits);
+				ret &= bdi_congested(q->backing_dev_info, bits);
 		}
 	}
 	rcu_read_unlock();
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 1920756828df..6bc5c2a85160 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -860,7 +860,7 @@ static int raid10_congested(struct mddev *mddev, int bits)
 		if (rdev && !test_bit(Faulty, &rdev->flags)) {
 			struct request_queue *q = bdev_get_queue(rdev->bdev);
 
-			ret |= bdi_congested(&q->backing_dev_info, bits);
+			ret |= bdi_congested(q->backing_dev_info, bits);
 		}
 	}
 	rcu_read_unlock();
@@ -3841,8 +3841,8 @@ static int raid10_run(struct mddev *mddev)
 		 * maybe...
 		 */
 		stripe /= conf->geo.near_copies;
-		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
-			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
+		if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
+			mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
 	}
 
 	if (md_integrity_register(mddev))
@@ -4643,8 +4643,8 @@ static void end_reshape(struct r10conf *conf)
 		int stripe = conf->geo.raid_disks *
 			((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
 		stripe /= conf->geo.near_copies;
-		if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
-			conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
+		if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
+			conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
 	}
 	conf->fullsync = 0;
 }
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 36c13e4be9c9..c0312d3f516a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6264,10 +6264,10 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
 		mddev_suspend(mddev);
 		conf->skip_copy = new;
 		if (new)
-			mddev->queue->backing_dev_info.capabilities |=
+			mddev->queue->backing_dev_info->capabilities |=
 				BDI_CAP_STABLE_WRITES;
 		else
-			mddev->queue->backing_dev_info.capabilities &=
+			mddev->queue->backing_dev_info->capabilities &=
 				~BDI_CAP_STABLE_WRITES;
 		mddev_resume(mddev);
 	}
@@ -7086,8 +7086,8 @@ static int raid5_run(struct mddev *mddev)
 		int data_disks = conf->previous_raid_disks - conf->max_degraded;
 		int stripe = data_disks *
 			((mddev->chunk_sectors << 9) / PAGE_SIZE);
-		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
-			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
+		if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
+			mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
 
 		chunk_size = mddev->chunk_sectors << 9;
 		blk_queue_io_min(mddev->queue, chunk_size);
@@ -7696,8 +7696,8 @@ static void end_reshape(struct r5conf *conf)
 			int data_disks = conf->raid_disks - conf->max_degraded;
 			int stripe = data_disks * ((conf->chunk_sectors << 9)
 						   / PAGE_SIZE);
-			if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
-				conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
+			if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
+				conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
 		}
 	}
 }
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index a34308df927f..c0e5b9a8bf5f 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1226,7 +1226,7 @@ static int set_gfs2_super(struct super_block *s, void *data)
 	 * We set the bdi here to the queue backing, file systems can
 	 * overwrite this in ->fill_super()
 	 */
-	s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
+	s->s_bdi = bdev_get_queue(s->s_bdev)->backing_dev_info;
 	return 0;
 }
 
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 12eeae62a2b1..e1872f36147f 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1068,7 +1068,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_time_gran = 1;
 	sb->s_max_links = NILFS_LINK_MAX;
 
-	sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info;
+	sb->s_bdi = bdev_get_queue(sb->s_bdev)->backing_dev_info;
 
 	err = load_nilfs(nilfs, sb);
 	if (err)
diff --git a/fs/super.c b/fs/super.c
index 1709ed029a2c..ea662b0e5e78 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1047,7 +1047,7 @@ static int set_bdev_super(struct super_block *s, void *data)
 	 * We set the bdi here to the queue backing, file systems can
 	 * overwrite this in ->fill_super()
 	 */
-	s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
+	s->s_bdi = bdev_get_queue(s->s_bdev)->backing_dev_info;
 	return 0;
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 11f7a8e86a89..a75e42de34ab 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -432,7 +432,8 @@ struct request_queue {
 	 */
 	struct delayed_work	delay_work;
 
-	struct backing_dev_info	backing_dev_info;
+	struct backing_dev_info	*backing_dev_info;
+	struct backing_dev_info	_backing_dev_info;
 
 	/*
 	 * The queue owner gets to use this for whatever they like.
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 290e8b7d3181..216449825859 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1988,11 +1988,11 @@ void laptop_mode_timer_fn(unsigned long data)
 	 * We want to write everything out, not just down to the dirty
 	 * threshold
 	 */
-	if (!bdi_has_dirty_io(&q->backing_dev_info))
+	if (!bdi_has_dirty_io(q->backing_dev_info))
 		return;
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(wb, &q->backing_dev_info.wb_list, bdi_node)
+	list_for_each_entry_rcu(wb, &q->backing_dev_info->wb_list, bdi_node)
 		if (wb_has_dirty_io(wb))
 			wb_start_writeback(wb, nr_pages, true,
 					   WB_REASON_LAPTOP_TIMER);
-- 
cgit v1.2.3


From d03f6cdc1fc422accb734c7c07a661a0018d8631 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 2 Feb 2017 15:56:51 +0100
Subject: block: Dynamically allocate and refcount backing_dev_info

Instead of storing backing_dev_info inside struct request_queue,
allocate it dynamically, reference count it, and free it when the last
reference is dropped. Currently only request_queue holds the reference
but in the following patch we add other users referencing
backing_dev_info.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c                 | 12 +++++-------
 block/blk-sysfs.c                |  2 +-
 include/linux/backing-dev-defs.h |  2 ++
 include/linux/backing-dev.h      | 10 +++++++++-
 include/linux/blkdev.h           |  1 -
 mm/backing-dev.c                 | 34 +++++++++++++++++++++++++++++++++-
 6 files changed, 50 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index dcac0352c14c..d2bba4700e65 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -713,7 +713,6 @@ static void blk_rq_timed_out_timer(unsigned long data)
 struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 {
 	struct request_queue *q;
-	int err;
 
 	q = kmem_cache_alloc_node(blk_requestq_cachep,
 				gfp_mask | __GFP_ZERO, node_id);
@@ -728,17 +727,16 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	if (!q->bio_split)
 		goto fail_id;
 
-	q->backing_dev_info = &q->_backing_dev_info;
+	q->backing_dev_info = bdi_alloc_node(gfp_mask, node_id);
+	if (!q->backing_dev_info)
+		goto fail_split;
+
 	q->backing_dev_info->ra_pages =
 			(VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
 	q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
 	q->backing_dev_info->name = "block";
 	q->node = node_id;
 
-	err = bdi_init(q->backing_dev_info);
-	if (err)
-		goto fail_split;
-
 	setup_timer(&q->backing_dev_info->laptop_mode_wb_timer,
 		    laptop_mode_timer_fn, (unsigned long) q);
 	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
@@ -789,7 +787,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 fail_ref:
 	percpu_ref_exit(&q->q_usage_counter);
 fail_bdi:
-	bdi_destroy(q->backing_dev_info);
+	bdi_put(q->backing_dev_info);
 fail_split:
 	bioset_free(q->bio_split);
 fail_id:
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 05841be1f30f..3e204789b8d3 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -799,7 +799,7 @@ static void blk_release_queue(struct kobject *kobj)
 		container_of(kobj, struct request_queue, kobj);
 
 	wbt_exit(q);
-	bdi_exit(q->backing_dev_info);
+	bdi_put(q->backing_dev_info);
 	blkcg_exit_queue(q);
 
 	if (q->elevator) {
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index e850e76acaaf..ad955817916d 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -10,6 +10,7 @@
 #include <linux/flex_proportions.h>
 #include <linux/timer.h>
 #include <linux/workqueue.h>
+#include <linux/kref.h>
 
 struct page;
 struct device;
@@ -144,6 +145,7 @@ struct backing_dev_info {
 
 	char *name;
 
+	struct kref refcnt;	/* Reference counter for the structure */
 	unsigned int capabilities; /* Device capabilities */
 	unsigned int min_ratio;
 	unsigned int max_ratio, max_prop_frac;
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 43b93a947e61..efb6ca992d05 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -18,7 +18,14 @@
 #include <linux/slab.h>
 
 int __must_check bdi_init(struct backing_dev_info *bdi);
-void bdi_exit(struct backing_dev_info *bdi);
+
+static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi)
+{
+	kref_get(&bdi->refcnt);
+	return bdi;
+}
+
+void bdi_put(struct backing_dev_info *bdi);
 
 __printf(3, 4)
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
@@ -29,6 +36,7 @@ void bdi_unregister(struct backing_dev_info *bdi);
 
 int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
 void bdi_destroy(struct backing_dev_info *bdi);
+struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id);
 
 void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
 			bool range_cyclic, enum wb_reason reason);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a75e42de34ab..e77c1039fd0e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -433,7 +433,6 @@ struct request_queue {
 	struct delayed_work	delay_work;
 
 	struct backing_dev_info	*backing_dev_info;
-	struct backing_dev_info	_backing_dev_info;
 
 	/*
 	 * The queue owner gets to use this for whatever they like.
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 3bfed5ab2475..28ce6cf7b2ff 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -237,6 +237,7 @@ static __init int bdi_class_init(void)
 
 	bdi_class->dev_groups = bdi_dev_groups;
 	bdi_debug_init();
+
 	return 0;
 }
 postcore_initcall(bdi_class_init);
@@ -776,6 +777,7 @@ int bdi_init(struct backing_dev_info *bdi)
 
 	bdi->dev = NULL;
 
+	kref_init(&bdi->refcnt);
 	bdi->min_ratio = 0;
 	bdi->max_ratio = 100;
 	bdi->max_prop_frac = FPROP_FRAC_BASE;
@@ -791,6 +793,22 @@ int bdi_init(struct backing_dev_info *bdi)
 }
 EXPORT_SYMBOL(bdi_init);
 
+struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
+{
+	struct backing_dev_info *bdi;
+
+	bdi = kmalloc_node(sizeof(struct backing_dev_info),
+			   gfp_mask | __GFP_ZERO, node_id);
+	if (!bdi)
+		return NULL;
+
+	if (bdi_init(bdi)) {
+		kfree(bdi);
+		return NULL;
+	}
+	return bdi;
+}
+
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		const char *fmt, ...)
 {
@@ -871,12 +889,26 @@ void bdi_unregister(struct backing_dev_info *bdi)
 	}
 }
 
-void bdi_exit(struct backing_dev_info *bdi)
+static void bdi_exit(struct backing_dev_info *bdi)
 {
 	WARN_ON_ONCE(bdi->dev);
 	wb_exit(&bdi->wb);
 }
 
+static void release_bdi(struct kref *ref)
+{
+	struct backing_dev_info *bdi =
+			container_of(ref, struct backing_dev_info, refcnt);
+
+	bdi_exit(bdi);
+	kfree(bdi);
+}
+
+void bdi_put(struct backing_dev_info *bdi)
+{
+	kref_put(&bdi->refcnt, release_bdi);
+}
+
 void bdi_destroy(struct backing_dev_info *bdi)
 {
 	bdi_unregister(bdi);
-- 
cgit v1.2.3


From b1d2dc5659b41741f5a29b2ade76ffb4e5bb13d8 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 2 Feb 2017 15:56:52 +0100
Subject: block: Make blk_get_backing_dev_info() safe without open bdev

Currenly blk_get_backing_dev_info() is not safe to be called when the
block device is not open as bdev->bd_disk is NULL in that case. However
inode_to_bdi() uses this function and may be call called from flusher
worker or other writeback related functions without bdev being open
which leads to crashes such as:

[113031.075540] Unable to handle kernel paging request for data at address 0x00000000
[113031.075614] Faulting instruction address: 0xc0000000003692e0
0:mon> t
[c0000000fb65f900] c00000000036cb6c writeback_sb_inodes+0x30c/0x590
[c0000000fb65fa10] c00000000036ced4 __writeback_inodes_wb+0xe4/0x150
[c0000000fb65fa70] c00000000036d33c wb_writeback+0x30c/0x450
[c0000000fb65fb40] c00000000036e198 wb_workfn+0x268/0x580
[c0000000fb65fc50] c0000000000f3470 process_one_work+0x1e0/0x590
[c0000000fb65fce0] c0000000000f38c8 worker_thread+0xa8/0x660
[c0000000fb65fd80] c0000000000fc4b0 kthread+0x110/0x130
[c0000000fb65fe30] c0000000000098f0 ret_from_kernel_thread+0x5c/0x6c

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c   | 8 +++-----
 fs/block_dev.c     | 7 +++++++
 include/linux/fs.h | 1 +
 3 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index d2bba4700e65..6375674e719b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -110,14 +110,12 @@ void blk_queue_congestion_threshold(struct request_queue *q)
  * @bdev:	device
  *
  * Locates the passed device's request queue and returns the address of its
- * backing_dev_info.  This function can only be called if @bdev is opened
- * and the return value is never NULL.
+ * backing_dev_info. The return value is never NULL however we may return
+ * &noop_backing_dev_info if the bdev is not currently open.
  */
 struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
 {
-	struct request_queue *q = bdev_get_queue(bdev);
-
-	return q->backing_dev_info;
+	return bdev->bd_bdi;
 }
 EXPORT_SYMBOL(blk_get_backing_dev_info);
 
diff --git a/fs/block_dev.c b/fs/block_dev.c
index ed6a34be7a1e..601b71b76d7f 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -884,6 +884,8 @@ static void bdev_evict_inode(struct inode *inode)
 	spin_lock(&bdev_lock);
 	list_del_init(&bdev->bd_list);
 	spin_unlock(&bdev_lock);
+	if (bdev->bd_bdi != &noop_backing_dev_info)
+		bdi_put(bdev->bd_bdi);
 }
 
 static const struct super_operations bdev_sops = {
@@ -986,6 +988,7 @@ struct block_device *bdget(dev_t dev)
 		bdev->bd_contains = NULL;
 		bdev->bd_super = NULL;
 		bdev->bd_inode = inode;
+		bdev->bd_bdi = &noop_backing_dev_info;
 		bdev->bd_block_size = (1 << inode->i_blkbits);
 		bdev->bd_part_count = 0;
 		bdev->bd_invalidated = 0;
@@ -1542,6 +1545,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 		bdev->bd_disk = disk;
 		bdev->bd_queue = disk->queue;
 		bdev->bd_contains = bdev;
+		if (bdev->bd_bdi == &noop_backing_dev_info)
+			bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
 
 		if (!partno) {
 			ret = -ENXIO;
@@ -1637,6 +1642,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	bdev->bd_disk = NULL;
 	bdev->bd_part = NULL;
 	bdev->bd_queue = NULL;
+	bdi_put(bdev->bd_bdi);
+	bdev->bd_bdi = &noop_backing_dev_info;
 	if (bdev != bdev->bd_contains)
 		__blkdev_put(bdev->bd_contains, mode, 1);
 	bdev->bd_contains = NULL;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 702cb6c50194..c930cbc19342 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -423,6 +423,7 @@ struct block_device {
 	int			bd_invalidated;
 	struct gendisk *	bd_disk;
 	struct request_queue *  bd_queue;
+	struct backing_dev_info *bd_bdi;
 	struct list_head	bd_list;
 	/*
 	 * Private data.  You must have bd_claim'ed the block_device
-- 
cgit v1.2.3


From efa7c9f97e3ef624e9a398bf69c15f58eea9f0e8 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 2 Feb 2017 15:56:53 +0100
Subject: block: Get rid of blk_get_backing_dev_info()

blk_get_backing_dev_info() is now a simple dereference. Remove that
function and simplify some code around that.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c            | 14 --------------
 block/compat_ioctl.c        |  7 ++-----
 block/ioctl.c               |  7 ++-----
 fs/btrfs/disk-io.c          |  2 +-
 fs/btrfs/volumes.c          |  2 +-
 fs/xfs/xfs_buf.c            |  3 +--
 fs/xfs/xfs_buf.h            |  1 -
 include/linux/backing-dev.h |  2 +-
 include/linux/blkdev.h      |  1 -
 9 files changed, 8 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 6375674e719b..a3c29270807e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -105,20 +105,6 @@ void blk_queue_congestion_threshold(struct request_queue *q)
 	q->nr_congestion_off = nr;
 }
 
-/**
- * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
- * @bdev:	device
- *
- * Locates the passed device's request queue and returns the address of its
- * backing_dev_info. The return value is never NULL however we may return
- * &noop_backing_dev_info if the bdev is not currently open.
- */
-struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
-{
-	return bdev->bd_bdi;
-}
-EXPORT_SYMBOL(blk_get_backing_dev_info);
-
 void blk_rq_init(struct request_queue *q, struct request *rq)
 {
 	memset(rq, 0, sizeof(*rq));
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 556826ac7cb4..570021a0dc1c 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -661,7 +661,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	struct block_device *bdev = inode->i_bdev;
 	struct gendisk *disk = bdev->bd_disk;
 	fmode_t mode = file->f_mode;
-	struct backing_dev_info *bdi;
 	loff_t size;
 	unsigned int max_sectors;
 
@@ -708,9 +707,8 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	case BLKFRAGET:
 		if (!arg)
 			return -EINVAL;
-		bdi = blk_get_backing_dev_info(bdev);
 		return compat_put_long(arg,
-				       (bdi->ra_pages * PAGE_SIZE) / 512);
+			       (bdev->bd_bdi->ra_pages * PAGE_SIZE) / 512);
 	case BLKROGET: /* compatible */
 		return compat_put_int(arg, bdev_read_only(bdev) != 0);
 	case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */
@@ -728,8 +726,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	case BLKFRASET:
 		if (!capable(CAP_SYS_ADMIN))
 			return -EACCES;
-		bdi = blk_get_backing_dev_info(bdev);
-		bdi->ra_pages = (arg * 512) / PAGE_SIZE;
+		bdev->bd_bdi->ra_pages = (arg * 512) / PAGE_SIZE;
 		return 0;
 	case BLKGETSIZE:
 		size = i_size_read(bdev->bd_inode);
diff --git a/block/ioctl.c b/block/ioctl.c
index be7f4de3eb3c..7b88820b93d9 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -505,7 +505,6 @@ static int blkdev_bszset(struct block_device *bdev, fmode_t mode,
 int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 			unsigned long arg)
 {
-	struct backing_dev_info *bdi;
 	void __user *argp = (void __user *)arg;
 	loff_t size;
 	unsigned int max_sectors;
@@ -532,8 +531,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	case BLKFRAGET:
 		if (!arg)
 			return -EINVAL;
-		bdi = blk_get_backing_dev_info(bdev);
-		return put_long(arg, (bdi->ra_pages * PAGE_SIZE) / 512);
+		return put_long(arg, (bdev->bd_bdi->ra_pages*PAGE_SIZE) / 512);
 	case BLKROGET:
 		return put_int(arg, bdev_read_only(bdev) != 0);
 	case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */
@@ -560,8 +558,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	case BLKFRASET:
 		if(!capable(CAP_SYS_ADMIN))
 			return -EACCES;
-		bdi = blk_get_backing_dev_info(bdev);
-		bdi->ra_pages = (arg * 512) / PAGE_SIZE;
+		bdev->bd_bdi->ra_pages = (arg * 512) / PAGE_SIZE;
 		return 0;
 	case BLKBSZSET:
 		return blkdev_bszset(bdev, mode, argp);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 18004169552c..37a31b12bb0c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1800,7 +1800,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 	list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) {
 		if (!device->bdev)
 			continue;
-		bdi = blk_get_backing_dev_info(device->bdev);
+		bdi = device->bdev->bd_bdi;
 		if (bdi_congested(bdi, bdi_bits)) {
 			ret = 1;
 			break;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3c3c69c0eee4..b2e70073a10d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -366,7 +366,7 @@ static noinline void run_scheduled_bios(struct btrfs_device *device)
 	 */
 	blk_start_plug(&plug);
 
-	bdi = blk_get_backing_dev_info(device->bdev);
+	bdi = device->bdev->bd_bdi;
 	limit = btrfs_async_submit_limit(fs_info);
 	limit = limit * 2 / 3;
 
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 7f0a01f7b592..8bbec20aa138 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -757,7 +757,7 @@ xfs_buf_readahead_map(
 	int			nmaps,
 	const struct xfs_buf_ops *ops)
 {
-	if (bdi_read_congested(target->bt_bdi))
+	if (bdi_read_congested(target->bt_bdev->bd_bdi))
 		return;
 
 	xfs_buf_read_map(target, map, nmaps,
@@ -1790,7 +1790,6 @@ xfs_alloc_buftarg(
 	btp->bt_mount = mp;
 	btp->bt_dev =  bdev->bd_dev;
 	btp->bt_bdev = bdev;
-	btp->bt_bdi = blk_get_backing_dev_info(bdev);
 
 	if (xfs_setsize_buftarg_early(btp, bdev))
 		goto error;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 8a9d3a9599f0..3c867e5a63e1 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -109,7 +109,6 @@ typedef unsigned int xfs_buf_flags_t;
 typedef struct xfs_buftarg {
 	dev_t			bt_dev;
 	struct block_device	*bt_bdev;
-	struct backing_dev_info	*bt_bdi;
 	struct xfs_mount	*bt_mount;
 	unsigned int		bt_meta_sectorsize;
 	size_t			bt_meta_sectormask;
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index efb6ca992d05..c52a48cb9a66 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -191,7 +191,7 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
 	sb = inode->i_sb;
 #ifdef CONFIG_BLOCK
 	if (sb_is_blkdev_sb(sb))
-		return blk_get_backing_dev_info(I_BDEV(inode));
+		return I_BDEV(inode)->bd_bdi;
 #endif
 	return sb->s_bdi;
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e77c1039fd0e..b31137e2afd0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1187,7 +1187,6 @@ extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
 extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
 extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);
-extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
 
 static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
 {
-- 
cgit v1.2.3


From 0dba1314d4f81115dce711292ec7981d17231064 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 1 Feb 2017 14:05:23 -0800
Subject: scsi, block: fix duplicate bdi name registration crashes

Warnings of the following form occur because scsi reuses a devt number
while the block layer still has it referenced as the name of the bdi
[1]:

 WARNING: CPU: 1 PID: 93 at fs/sysfs/dir.c:31 sysfs_warn_dup+0x62/0x80
 sysfs: cannot create duplicate filename '/devices/virtual/bdi/8:192'
 [..]
 Call Trace:
  dump_stack+0x86/0xc3
  __warn+0xcb/0xf0
  warn_slowpath_fmt+0x5f/0x80
  ? kernfs_path_from_node+0x4f/0x60
  sysfs_warn_dup+0x62/0x80
  sysfs_create_dir_ns+0x77/0x90
  kobject_add_internal+0xb2/0x350
  kobject_add+0x75/0xd0
  device_add+0x15a/0x650
  device_create_groups_vargs+0xe0/0xf0
  device_create_vargs+0x1c/0x20
  bdi_register+0x90/0x240
  ? lockdep_init_map+0x57/0x200
  bdi_register_owner+0x36/0x60
  device_add_disk+0x1bb/0x4e0
  ? __pm_runtime_use_autosuspend+0x5c/0x70
  sd_probe_async+0x10d/0x1c0
  async_run_entry_fn+0x39/0x170

This is a brute-force fix to pass the devt release information from
sd_probe() to the locations where we register the bdi,
device_add_disk(), and unregister the bdi, blk_cleanup_queue().

Thanks to Omar for the quick reproducer script [2]. This patch survives
where an unmodified kernel fails in a few seconds.

[1]: https://marc.info/?l=linux-scsi&m=147116857810716&w=4
[2]: http://marc.info/?l=linux-block&m=148554717109098&w=2

Cc: James Bottomley <James.Bottomley@hansenpartnership.com>
Cc: Bart Van Assche <bart.vanassche@sandisk.com>
Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
Cc: Jan Kara <jack@suse.cz>
Reported-by: Omar Sandoval <osandov@osandov.com>
Tested-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       |  1 +
 block/genhd.c          | 21 +++++++++++++++++++++
 drivers/scsi/sd.c      | 41 +++++++++++++++++++++++++++++++++--------
 include/linux/blkdev.h |  1 +
 include/linux/genhd.h  |  8 ++++++++
 5 files changed, 64 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index a3c29270807e..a5726e01f839 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -572,6 +572,7 @@ void blk_cleanup_queue(struct request_queue *q)
 	spin_unlock_irq(lock);
 
 	bdi_unregister(q->backing_dev_info);
+	put_disk_devt(q->disk_devt);
 
 	/* @q is and will stay empty, shutdown and put */
 	blk_put_queue(q);
diff --git a/block/genhd.c b/block/genhd.c
index d9ccd42f3675..3631cd480295 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -572,6 +572,20 @@ exit:
 	disk_part_iter_exit(&piter);
 }
 
+void put_disk_devt(struct disk_devt *disk_devt)
+{
+	if (disk_devt && atomic_dec_and_test(&disk_devt->count))
+		disk_devt->release(disk_devt);
+}
+EXPORT_SYMBOL(put_disk_devt);
+
+void get_disk_devt(struct disk_devt *disk_devt)
+{
+	if (disk_devt)
+		atomic_inc(&disk_devt->count);
+}
+EXPORT_SYMBOL(get_disk_devt);
+
 /**
  * device_add_disk - add partitioning information to kernel list
  * @parent: parent device for the disk
@@ -612,6 +626,13 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
 
 	disk_alloc_events(disk);
 
+	/*
+	 * Take a reference on the devt and assign it to queue since it
+	 * must not be reallocated while the bdi is registered
+	 */
+	disk->queue->disk_devt = disk->disk_devt;
+	get_disk_devt(disk->disk_devt);
+
 	/* Register BDI before referencing it from bdev */
 	bdi = disk->queue->backing_dev_info;
 	bdi_register_owner(bdi, disk_to_dev(disk));
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index bd2fb4d52efa..dff709e22370 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -3064,6 +3064,23 @@ static void sd_probe_async(void *data, async_cookie_t cookie)
 	put_device(&sdkp->dev);
 }
 
+struct sd_devt {
+	int idx;
+	struct disk_devt disk_devt;
+};
+
+void sd_devt_release(struct disk_devt *disk_devt)
+{
+	struct sd_devt *sd_devt = container_of(disk_devt, struct sd_devt,
+			disk_devt);
+
+	spin_lock(&sd_index_lock);
+	ida_remove(&sd_index_ida, sd_devt->idx);
+	spin_unlock(&sd_index_lock);
+
+	kfree(sd_devt);
+}
+
 /**
  *	sd_probe - called during driver initialization and whenever a
  *	new scsi device is attached to the system. It is called once
@@ -3085,6 +3102,7 @@ static void sd_probe_async(void *data, async_cookie_t cookie)
 static int sd_probe(struct device *dev)
 {
 	struct scsi_device *sdp = to_scsi_device(dev);
+	struct sd_devt *sd_devt;
 	struct scsi_disk *sdkp;
 	struct gendisk *gd;
 	int index;
@@ -3110,9 +3128,13 @@ static int sd_probe(struct device *dev)
 	if (!sdkp)
 		goto out;
 
+	sd_devt = kzalloc(sizeof(*sd_devt), GFP_KERNEL);
+	if (!sd_devt)
+		goto out_free;
+
 	gd = alloc_disk(SD_MINORS);
 	if (!gd)
-		goto out_free;
+		goto out_free_devt;
 
 	do {
 		if (!ida_pre_get(&sd_index_ida, GFP_KERNEL))
@@ -3128,6 +3150,11 @@ static int sd_probe(struct device *dev)
 		goto out_put;
 	}
 
+	atomic_set(&sd_devt->disk_devt.count, 1);
+	sd_devt->disk_devt.release = sd_devt_release;
+	sd_devt->idx = index;
+	gd->disk_devt = &sd_devt->disk_devt;
+
 	error = sd_format_disk_name("sd", index, gd->disk_name, DISK_NAME_LEN);
 	if (error) {
 		sdev_printk(KERN_WARNING, sdp, "SCSI disk (sd) name length exceeded.\n");
@@ -3167,13 +3194,14 @@ static int sd_probe(struct device *dev)
 	return 0;
 
  out_free_index:
-	spin_lock(&sd_index_lock);
-	ida_remove(&sd_index_ida, index);
-	spin_unlock(&sd_index_lock);
+	put_disk_devt(&sd_devt->disk_devt);
+	sd_devt = NULL;
  out_put:
 	put_disk(gd);
  out_free:
 	kfree(sdkp);
+ out_free_devt:
+	kfree(sd_devt);
  out:
 	scsi_autopm_put_device(sdp);
 	return error;
@@ -3232,10 +3260,7 @@ static void scsi_disk_release(struct device *dev)
 	struct scsi_disk *sdkp = to_scsi_disk(dev);
 	struct gendisk *disk = sdkp->disk;
 	
-	spin_lock(&sd_index_lock);
-	ida_remove(&sd_index_ida, sdkp->index);
-	spin_unlock(&sd_index_lock);
-
+	put_disk_devt(disk->disk_devt);
 	disk->private_data = NULL;
 	put_disk(disk);
 	put_device(&sdkp->device->sdev_gendev);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b31137e2afd0..f84fbe55d3b3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -433,6 +433,7 @@ struct request_queue {
 	struct delayed_work	delay_work;
 
 	struct backing_dev_info	*backing_dev_info;
+	struct disk_devt	*disk_devt;
 
 	/*
 	 * The queue owner gets to use this for whatever they like.
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 76f39754e7b0..a999d281a2f1 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -167,6 +167,13 @@ struct blk_integrity {
 };
 
 #endif	/* CONFIG_BLK_DEV_INTEGRITY */
+struct disk_devt {
+	atomic_t count;
+	void (*release)(struct disk_devt *disk_devt);
+};
+
+void put_disk_devt(struct disk_devt *disk_devt);
+void get_disk_devt(struct disk_devt *disk_devt);
 
 struct gendisk {
 	/* major, first_minor and minors are input parameters only,
@@ -176,6 +183,7 @@ struct gendisk {
 	int first_minor;
 	int minors;                     /* maximum number of minors, =1 for
                                          * disks that can't be partitioned. */
+	struct disk_devt *disk_devt;
 
 	char disk_name[DISK_NAME_LEN];	/* name of major driver */
 	char *(*devnode)(struct gendisk *gd, umode_t *mode);
-- 
cgit v1.2.3


From 1c83a9aab807f7452c4957b2401e1cbf43941820 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Thu, 2 Feb 2017 11:52:14 -0500
Subject: ext4: move halfmd4 into hash.c directly

The "half md4" transform should not be used by any new code. And
fortunately, it's only used now by ext4. Since ext4 supports several
hashing methods, at some point it might be desirable to move to
something like SipHash. As an intermediate step, remove half md4 from
cryptohash.h and lib, and make it just a local function in ext4's
hash.c. There's precedent for doing this; the other function ext can use
for its hashes -- TEA -- is also implemented in the same place. Also, by
being a local function, this might allow gcc to perform some additional
optimizations.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/hash.c             | 71 +++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/cryptohash.h |  2 --
 lib/Makefile               |  2 +-
 lib/halfmd4.c              | 67 -------------------------------------------
 4 files changed, 71 insertions(+), 71 deletions(-)
 delete mode 100644 lib/halfmd4.c

(limited to 'include/linux')

diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index e026aa941fd5..38b8a96eb97c 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -10,7 +10,8 @@
  */
 
 #include <linux/fs.h>
-#include <linux/cryptohash.h>
+#include <linux/compiler.h>
+#include <linux/bitops.h>
 #include "ext4.h"
 
 #define DELTA 0x9E3779B9
@@ -32,6 +33,74 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
 	buf[1] += b1;
 }
 
+/* F, G and H are basic MD4 functions: selection, majority, parity */
+#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
+#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z)))
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+
+/*
+ * The generic round function.  The application is so specific that
+ * we don't bother protecting all the arguments with parens, as is generally
+ * good macro practice, in favor of extra legibility.
+ * Rotation is separate from addition to prevent recomputation
+ */
+#define ROUND(f, a, b, c, d, x, s)	\
+	(a += f(b, c, d) + x, a = rol32(a, s))
+#define K1 0
+#define K2 013240474631UL
+#define K3 015666365641UL
+
+/*
+ * Basic cut-down MD4 transform.  Returns only 32 bits of result.
+ */
+static __u32 half_md4_transform(__u32 buf[4], __u32 const in[8])
+{
+	__u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3];
+
+	/* Round 1 */
+	ROUND(F, a, b, c, d, in[0] + K1,  3);
+	ROUND(F, d, a, b, c, in[1] + K1,  7);
+	ROUND(F, c, d, a, b, in[2] + K1, 11);
+	ROUND(F, b, c, d, a, in[3] + K1, 19);
+	ROUND(F, a, b, c, d, in[4] + K1,  3);
+	ROUND(F, d, a, b, c, in[5] + K1,  7);
+	ROUND(F, c, d, a, b, in[6] + K1, 11);
+	ROUND(F, b, c, d, a, in[7] + K1, 19);
+
+	/* Round 2 */
+	ROUND(G, a, b, c, d, in[1] + K2,  3);
+	ROUND(G, d, a, b, c, in[3] + K2,  5);
+	ROUND(G, c, d, a, b, in[5] + K2,  9);
+	ROUND(G, b, c, d, a, in[7] + K2, 13);
+	ROUND(G, a, b, c, d, in[0] + K2,  3);
+	ROUND(G, d, a, b, c, in[2] + K2,  5);
+	ROUND(G, c, d, a, b, in[4] + K2,  9);
+	ROUND(G, b, c, d, a, in[6] + K2, 13);
+
+	/* Round 3 */
+	ROUND(H, a, b, c, d, in[3] + K3,  3);
+	ROUND(H, d, a, b, c, in[7] + K3,  9);
+	ROUND(H, c, d, a, b, in[2] + K3, 11);
+	ROUND(H, b, c, d, a, in[6] + K3, 15);
+	ROUND(H, a, b, c, d, in[1] + K3,  3);
+	ROUND(H, d, a, b, c, in[5] + K3,  9);
+	ROUND(H, c, d, a, b, in[0] + K3, 11);
+	ROUND(H, b, c, d, a, in[4] + K3, 15);
+
+	buf[0] += a;
+	buf[1] += b;
+	buf[2] += c;
+	buf[3] += d;
+
+	return buf[1]; /* "most hashed" word */
+}
+#undef ROUND
+#undef K1
+#undef K2
+#undef K3
+#undef F
+#undef G
+#undef H
 
 /* The old legacy hash */
 static __u32 dx_hack_hash_unsigned(const char *name, int len)
diff --git a/include/linux/cryptohash.h b/include/linux/cryptohash.h
index f4754282c9c2..3252799832cf 100644
--- a/include/linux/cryptohash.h
+++ b/include/linux/cryptohash.h
@@ -15,6 +15,4 @@ void sha_transform(__u32 *digest, const char *data, __u32 *W);
 
 void md5_transform(__u32 *hash, __u32 const *in);
 
-__u32 half_md4_transform(__u32 buf[4], __u32 const in[8]);
-
 #endif
diff --git a/lib/Makefile b/lib/Makefile
index bc4073a8cd08..19ea76149a37 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -31,7 +31,7 @@ lib-$(CONFIG_HAS_DMA) += dma-noop.o
 lib-y	+= kobject.o klist.o
 obj-y	+= lockref.o
 
-obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
+obj-y += bcd.o div64.o sort.o parser.o debug_locks.o random32.o \
 	 bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \
 	 gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
 	 bsearch.o find_bit.o llist.o memweight.o kfifo.o \
diff --git a/lib/halfmd4.c b/lib/halfmd4.c
deleted file mode 100644
index 137e861d9690..000000000000
--- a/lib/halfmd4.c
+++ /dev/null
@@ -1,67 +0,0 @@
-#include <linux/compiler.h>
-#include <linux/export.h>
-#include <linux/cryptohash.h>
-#include <linux/bitops.h>
-
-/* F, G and H are basic MD4 functions: selection, majority, parity */
-#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
-#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z)))
-#define H(x, y, z) ((x) ^ (y) ^ (z))
-
-/*
- * The generic round function.  The application is so specific that
- * we don't bother protecting all the arguments with parens, as is generally
- * good macro practice, in favor of extra legibility.
- * Rotation is separate from addition to prevent recomputation
- */
-#define ROUND(f, a, b, c, d, x, s)	\
-	(a += f(b, c, d) + x, a = rol32(a, s))
-#define K1 0
-#define K2 013240474631UL
-#define K3 015666365641UL
-
-/*
- * Basic cut-down MD4 transform.  Returns only 32 bits of result.
- */
-__u32 half_md4_transform(__u32 buf[4], __u32 const in[8])
-{
-	__u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3];
-
-	/* Round 1 */
-	ROUND(F, a, b, c, d, in[0] + K1,  3);
-	ROUND(F, d, a, b, c, in[1] + K1,  7);
-	ROUND(F, c, d, a, b, in[2] + K1, 11);
-	ROUND(F, b, c, d, a, in[3] + K1, 19);
-	ROUND(F, a, b, c, d, in[4] + K1,  3);
-	ROUND(F, d, a, b, c, in[5] + K1,  7);
-	ROUND(F, c, d, a, b, in[6] + K1, 11);
-	ROUND(F, b, c, d, a, in[7] + K1, 19);
-
-	/* Round 2 */
-	ROUND(G, a, b, c, d, in[1] + K2,  3);
-	ROUND(G, d, a, b, c, in[3] + K2,  5);
-	ROUND(G, c, d, a, b, in[5] + K2,  9);
-	ROUND(G, b, c, d, a, in[7] + K2, 13);
-	ROUND(G, a, b, c, d, in[0] + K2,  3);
-	ROUND(G, d, a, b, c, in[2] + K2,  5);
-	ROUND(G, c, d, a, b, in[4] + K2,  9);
-	ROUND(G, b, c, d, a, in[6] + K2, 13);
-
-	/* Round 3 */
-	ROUND(H, a, b, c, d, in[3] + K3,  3);
-	ROUND(H, d, a, b, c, in[7] + K3,  9);
-	ROUND(H, c, d, a, b, in[2] + K3, 11);
-	ROUND(H, b, c, d, a, in[6] + K3, 15);
-	ROUND(H, a, b, c, d, in[1] + K3,  3);
-	ROUND(H, d, a, b, c, in[5] + K3,  9);
-	ROUND(H, c, d, a, b, in[0] + K3, 11);
-	ROUND(H, b, c, d, a, in[4] + K3, 15);
-
-	buf[0] += a;
-	buf[1] += b;
-	buf[2] += c;
-	buf[3] += d;
-
-	return buf[1]; /* "most hashed" word */
-}
-EXPORT_SYMBOL(half_md4_transform);
-- 
cgit v1.2.3


From a7c5437b0bbec5165df6eb1efc5e37dc40b9e05d Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Tue, 31 Jan 2017 14:53:17 -0800
Subject: debugfs: add debugfs_lookup()

We don't always have easy access to the dentry of a file or directory we
created in debugfs. Add a helper which allows us to get a dentry we
previously created.

The motivation for this change is a problem with blktrace and the blk-mq
debugfs entries introduced in 07e4fead45e6 ("blk-mq: create debugfs
directory tree"). Namely, in some cases, the directory that blktrace
needs to create may already exist, but in other cases, it may not. We
_could_ rely on a bunch of implied knowledge to decide whether to create
the directory or not, but it's much cleaner on our end to just look it
up.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/debugfs/inode.c      | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/debugfs.h |  8 ++++++++
 2 files changed, 44 insertions(+)

(limited to 'include/linux')

diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index f17fcf89e18e..7fb1732a3630 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -248,6 +248,42 @@ static struct file_system_type debug_fs_type = {
 };
 MODULE_ALIAS_FS("debugfs");
 
+/**
+ * debugfs_lookup() - look up an existing debugfs file
+ * @name: a pointer to a string containing the name of the file to look up.
+ * @parent: a pointer to the parent dentry of the file.
+ *
+ * This function will return a pointer to a dentry if it succeeds.  If the file
+ * doesn't exist or an error occurs, %NULL will be returned.  The returned
+ * dentry must be passed to dput() when it is no longer needed.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.
+ */
+struct dentry *debugfs_lookup(const char *name, struct dentry *parent)
+{
+	struct dentry *dentry;
+
+	if (IS_ERR(parent))
+		return NULL;
+
+	if (!parent)
+		parent = debugfs_mount->mnt_root;
+
+	inode_lock(d_inode(parent));
+	dentry = lookup_one_len(name, parent, strlen(name));
+	inode_unlock(d_inode(parent));
+
+	if (IS_ERR(dentry))
+		return NULL;
+	if (!d_really_is_positive(dentry)) {
+		dput(dentry);
+		return NULL;
+	}
+	return dentry;
+}
+EXPORT_SYMBOL_GPL(debugfs_lookup);
+
 static struct dentry *start_creating(const char *name, struct dentry *parent)
 {
 	struct dentry *dentry;
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index 014cc564d1c4..c0befcf41b58 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -80,6 +80,8 @@ static const struct file_operations __fops = {				\
 
 #if defined(CONFIG_DEBUG_FS)
 
+struct dentry *debugfs_lookup(const char *name, struct dentry *parent);
+
 struct dentry *debugfs_create_file(const char *name, umode_t mode,
 				   struct dentry *parent, void *data,
 				   const struct file_operations *fops);
@@ -181,6 +183,12 @@ ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
  * want to duplicate the design decision mistakes of procfs and devfs again.
  */
 
+static inline struct dentry *debugfs_lookup(const char *name,
+					    struct dentry *parent)
+{
+	return ERR_PTR(-ENODEV);
+}
+
 static inline struct dentry *debugfs_create_file(const char *name, umode_t mode,
 					struct dentry *parent, void *data,
 					const struct file_operations *fops)
-- 
cgit v1.2.3


From 03796c149a99e14506db9de3adba710c26f83ba9 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Tue, 31 Jan 2017 14:53:18 -0800
Subject: block: fix debugfs config conditional in struct request_queue

The debugfs dentries are only used for CONFIG_BLK_DEBUG_FS, so make them
conditional on that instead of CONFIG_DEBUG_FS.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f84fbe55d3b3..e0bac14347e6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -570,7 +570,7 @@ struct request_queue {
 	struct list_head	tag_set_list;
 	struct bio_set		*bio_split;
 
-#ifdef CONFIG_DEBUG_FS
+#ifdef CONFIG_BLK_DEBUG_FS
 	struct dentry		*debugfs_dir;
 	struct dentry		*mq_debugfs_dir;
 #endif
-- 
cgit v1.2.3


From a428d314ebcf65842fd64ad850c02c280586e74d Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Tue, 31 Jan 2017 14:53:19 -0800
Subject: blktrace: make do_blk_trace_setup() static

This isn't used outside of blktrace.c anymore.

Fixes: 62c2a7d969f3 ("block: push BKL into blktrace ioctls")
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blktrace_api.h | 4 ----
 kernel/trace/blktrace.c      | 6 +++---
 2 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 341f9418bd68..d2e908586e3d 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -30,9 +30,6 @@ struct blk_trace {
 
 extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *);
 extern void blk_trace_shutdown(struct request_queue *);
-extern int do_blk_trace_setup(struct request_queue *q, char *name,
-			      dev_t dev, struct block_device *bdev,
-			      struct blk_user_trace_setup *buts);
 extern __printf(2, 3)
 void __trace_note_message(struct blk_trace *, const char *fmt, ...);
 
@@ -80,7 +77,6 @@ extern struct attribute_group blk_trace_attr_group;
 #else /* !CONFIG_BLK_DEV_IO_TRACE */
 # define blk_trace_ioctl(bdev, cmd, arg)		(-ENOTTY)
 # define blk_trace_shutdown(q)				do { } while (0)
-# define do_blk_trace_setup(q, name, dev, bdev, buts)	(-ENOTTY)
 # define blk_add_driver_data(q, rq, data, len)		do {} while (0)
 # define blk_trace_setup(q, name, dev, bdev, arg)	(-ENOTTY)
 # define blk_trace_startstop(q, start)			(-ENOTTY)
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index e33050e3ea1c..84763e0c83cf 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -433,9 +433,9 @@ static void blk_trace_setup_lba(struct blk_trace *bt,
 /*
  * Setup everything required to start tracing
  */
-int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
-		       struct block_device *bdev,
-		       struct blk_user_trace_setup *buts)
+static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+			      struct block_device *bdev,
+			      struct blk_user_trace_setup *buts)
 {
 	struct blk_trace *bt = NULL;
 	struct dentry *dir = NULL;
-- 
cgit v1.2.3


From b343d77b94b6702082f4f402e1492e71ef87095b Mon Sep 17 00:00:00 2001
From: Marek Szyprowski <m.szyprowski@samsung.com>
Date: Mon, 30 Jan 2017 13:38:22 +0100
Subject: soc: samsung: pmu: Add register defines for pad retention control

Add PMU defines related to pad retention control. Will be later used
by the Exynos pin controller driver.

Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
---
 include/linux/soc/samsung/exynos-regs-pmu.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/soc/samsung/exynos-regs-pmu.h b/include/linux/soc/samsung/exynos-regs-pmu.h
index 9786c62d7159..49df0a01a2cc 100644
--- a/include/linux/soc/samsung/exynos-regs-pmu.h
+++ b/include/linux/soc/samsung/exynos-regs-pmu.h
@@ -631,4 +631,20 @@
 					 | EXYNOS5420_KFC_USE_STANDBY_WFI2  \
 					 | EXYNOS5420_KFC_USE_STANDBY_WFI3)
 
+/* For EXYNOS5433 */
+#define EXYNOS5433_PAD_RETENTION_AUD_OPTION			(0x3028)
+#define EXYNOS5433_PAD_RETENTION_MMC2_OPTION			(0x30C8)
+#define EXYNOS5433_PAD_RETENTION_TOP_OPTION			(0x3108)
+#define EXYNOS5433_PAD_RETENTION_UART_OPTION			(0x3128)
+#define EXYNOS5433_PAD_RETENTION_MMC0_OPTION			(0x3148)
+#define EXYNOS5433_PAD_RETENTION_MMC1_OPTION			(0x3168)
+#define EXYNOS5433_PAD_RETENTION_EBIA_OPTION			(0x3188)
+#define EXYNOS5433_PAD_RETENTION_EBIB_OPTION			(0x31A8)
+#define EXYNOS5433_PAD_RETENTION_SPI_OPTION			(0x31C8)
+#define EXYNOS5433_PAD_RETENTION_MIF_OPTION			(0x31E8)
+#define EXYNOS5433_PAD_RETENTION_USBXTI_OPTION			(0x3228)
+#define EXYNOS5433_PAD_RETENTION_BOOTLDO_OPTION			(0x3248)
+#define EXYNOS5433_PAD_RETENTION_UFS_OPTION			(0x3268)
+#define EXYNOS5433_PAD_RETENTION_FSYSGENIO_OPTION		(0x32A8)
+
 #endif /* __LINUX_SOC_EXYNOS_REGS_PMU_H */
-- 
cgit v1.2.3


From a45463cbf3f9dcdae683033c256f50bded513d6a Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 1 Feb 2017 18:01:17 +0100
Subject: workqueue: avoid clang warning

Building with clang shows lots of warning like:

drivers/amba/bus.c:447:8: warning: implicit conversion from 'long long' to 'int' changes value from 4294967248 to -48
      [-Wconstant-conversion]
static DECLARE_DELAYED_WORK(deferred_retry_work, amba_deferred_retry_func);
       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
include/linux/workqueue.h:187:26: note: expanded from macro 'DECLARE_DELAYED_WORK'
        struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, 0)
                                ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
include/linux/workqueue.h:177:10: note: expanded from macro '__DELAYED_WORK_INITIALIZER'
        .work = __WORK_INITIALIZER((n).work, (f)),                      \
                ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
include/linux/workqueue.h:170:10: note: expanded from macro '__WORK_INITIALIZER'
        .data = WORK_DATA_STATIC_INIT(),                                \
                ^~~~~~~~~~~~~~~~~~~~~~~
include/linux/workqueue.h:111:39: note: expanded from macro 'WORK_DATA_STATIC_INIT'
        ATOMIC_LONG_INIT(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC)
        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~
include/asm-generic/atomic-long.h:32:41: note: expanded from macro 'ATOMIC_LONG_INIT'
 #define ATOMIC_LONG_INIT(i)     ATOMIC_INIT(i)
                                ~~~~~~~~~~~~^~
arch/arm/include/asm/atomic.h:21:27: note: expanded from macro 'ATOMIC_INIT'
 #define ATOMIC_INIT(i)  { (i) }
                        ~  ^

This makes the type cast explicit, which shuts up the warning.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index a26cc437293c..bde063cefd04 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -106,9 +106,9 @@ struct work_struct {
 #endif
 };
 
-#define WORK_DATA_INIT()	ATOMIC_LONG_INIT(WORK_STRUCT_NO_POOL)
+#define WORK_DATA_INIT()	ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL)
 #define WORK_DATA_STATIC_INIT()	\
-	ATOMIC_LONG_INIT(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC)
+	ATOMIC_LONG_INIT((unsigned long)(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC))
 
 struct delayed_work {
 	struct work_struct work;
-- 
cgit v1.2.3


From e4cf8a38fc0d257b4070066e46a678f4777fecaa Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Wed, 1 Feb 2017 03:40:06 +0100
Subject: net: phy: Marvell: Add mv88e6390 internal PHY

The mv88e6390 Ethernet switch has internal PHYs. These PHYs don't have
an model ID in the ID2 register. So the MDIO driver in the switch
intercepts reads to this register, and returns the switch family ID.
Extend the Marvell PHY driver by including this ID, and treat the PHY
as a 88E1540.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/marvell.c   | 20 ++++++++++++++++++++
 include/linux/marvell_phy.h |  6 ++++++
 2 files changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index a3e3733813a7..1a0ac48cbc50 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -2142,6 +2142,25 @@ static struct phy_driver marvell_drivers[] = {
 		.get_strings = marvell_get_strings,
 		.get_stats = marvell_get_stats,
 	},
+	{
+		.phy_id = MARVELL_PHY_ID_88E6390,
+		.phy_id_mask = MARVELL_PHY_ID_MASK,
+		.name = "Marvell 88E6390",
+		.features = PHY_GBIT_FEATURES,
+		.flags = PHY_HAS_INTERRUPT,
+		.probe = m88e1510_probe,
+		.config_init = &marvell_config_init,
+		.config_aneg = &m88e1510_config_aneg,
+		.read_status = &marvell_read_status,
+		.ack_interrupt = &marvell_ack_interrupt,
+		.config_intr = &marvell_config_intr,
+		.did_interrupt = &m88e1121_did_interrupt,
+		.resume = &genphy_resume,
+		.suspend = &genphy_suspend,
+		.get_sset_count = marvell_get_sset_count,
+		.get_strings = marvell_get_strings,
+		.get_stats = marvell_get_stats,
+	},
 };
 
 module_phy_driver(marvell_drivers);
@@ -2160,6 +2179,7 @@ static struct mdio_device_id __maybe_unused marvell_tbl[] = {
 	{ MARVELL_PHY_ID_88E1510, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E1540, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E3016, MARVELL_PHY_ID_MASK },
+	{ MARVELL_PHY_ID_88E6390, MARVELL_PHY_ID_MASK },
 	{ }
 };
 
diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h
index a57f0dfb6db7..3d616d7f65bf 100644
--- a/include/linux/marvell_phy.h
+++ b/include/linux/marvell_phy.h
@@ -19,6 +19,12 @@
 #define MARVELL_PHY_ID_88E1540		0x01410eb0
 #define MARVELL_PHY_ID_88E3016		0x01410e60
 
+/* The MV88e6390 Ethernet switch contains embedded PHYs. These PHYs do
+ * not have a model ID. So the switch driver traps reads to the ID2
+ * register and returns the switch family ID
+ */
+#define MARVELL_PHY_ID_88E6390		0x01410f90
+
 /* struct phy_device dev_flags definitions */
 #define MARVELL_PHY_M1145_FLAGS_RESISTANCE	0x00000001
 #define MARVELL_PHY_M1118_DNS323_LEDS		0x00000002
-- 
cgit v1.2.3


From 60f06fde4cff6f6134decbf09199b5e047bc3355 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Thu, 2 Feb 2017 00:35:03 +0100
Subject: net: phy: marvell: Add support for 88e1545 PHY

The 88e1545 PHYs are discrete Marvell PHYs, found in a quad package on
the zii-devel-b board. Add support for it to the Marvell PHY driver.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/marvell.c   | 21 +++++++++++++++++++++
 include/linux/marvell_phy.h |  1 +
 2 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index 1a0ac48cbc50..f9d0fa315a47 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -2122,6 +2122,26 @@ static struct phy_driver marvell_drivers[] = {
 		.get_strings = marvell_get_strings,
 		.get_stats = marvell_get_stats,
 	},
+	{
+		.phy_id = MARVELL_PHY_ID_88E1545,
+		.phy_id_mask = MARVELL_PHY_ID_MASK,
+		.name = "Marvell 88E1545",
+		.probe = m88e1510_probe,
+		.remove = &marvell_remove,
+		.features = PHY_GBIT_FEATURES,
+		.flags = PHY_HAS_INTERRUPT,
+		.config_init = &marvell_config_init,
+		.config_aneg = &m88e1510_config_aneg,
+		.read_status = &marvell_read_status,
+		.ack_interrupt = &marvell_ack_interrupt,
+		.config_intr = &marvell_config_intr,
+		.did_interrupt = &m88e1121_did_interrupt,
+		.resume = &genphy_resume,
+		.suspend = &genphy_suspend,
+		.get_sset_count = marvell_get_sset_count,
+		.get_strings = marvell_get_strings,
+		.get_stats = marvell_get_stats,
+	},
 	{
 		.phy_id = MARVELL_PHY_ID_88E3016,
 		.phy_id_mask = MARVELL_PHY_ID_MASK,
@@ -2178,6 +2198,7 @@ static struct mdio_device_id __maybe_unused marvell_tbl[] = {
 	{ MARVELL_PHY_ID_88E1116R, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E1510, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E1540, MARVELL_PHY_ID_MASK },
+	{ MARVELL_PHY_ID_88E1545, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E3016, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E6390, MARVELL_PHY_ID_MASK },
 	{ }
diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h
index 3d616d7f65bf..4055cf8cc978 100644
--- a/include/linux/marvell_phy.h
+++ b/include/linux/marvell_phy.h
@@ -17,6 +17,7 @@
 #define MARVELL_PHY_ID_88E1116R		0x01410e40
 #define MARVELL_PHY_ID_88E1510		0x01410dd0
 #define MARVELL_PHY_ID_88E1540		0x01410eb0
+#define MARVELL_PHY_ID_88E1545		0x01410ea0
 #define MARVELL_PHY_ID_88E3016		0x01410e60
 
 /* The MV88e6390 Ethernet switch contains embedded PHYs. These PHYs do
-- 
cgit v1.2.3


From 0f1b92cbdd0309afae0af1963e8cccddb3d2eaff Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 30 Jan 2017 18:06:11 +0300
Subject: introduce the walk_process_tree() helper

Add the new helper to walk the process tree, the next patch adds a user.
Note that it visits the group leaders only, proc_visitor can do
for_each_thread itself or we can trivially extend walk_process_tree() to
do this.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 include/linux/sched.h |  3 +++
 kernel/fork.c         | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d2334229167f..6261bfc12853 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3053,6 +3053,9 @@ extern bool current_is_single_threaded(void);
 #define for_each_process_thread(p, t)	\
 	for_each_process(p) for_each_thread(p, t)
 
+typedef int (*proc_visitor)(struct task_struct *p, void *data);
+void walk_process_tree(struct task_struct *top, proc_visitor, void *);
+
 static inline int get_nr_threads(struct task_struct *tsk)
 {
 	return tsk->signal->nr_threads;
diff --git a/kernel/fork.c b/kernel/fork.c
index 11c5c8ab827c..135b7a49ad59 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2053,6 +2053,38 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
 }
 #endif
 
+void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
+{
+	struct task_struct *leader, *parent, *child;
+	int res;
+
+	read_lock(&tasklist_lock);
+	leader = top = top->group_leader;
+down:
+	for_each_thread(leader, parent) {
+		list_for_each_entry(child, &parent->children, sibling) {
+			res = visitor(child, data);
+			if (res) {
+				if (res < 0)
+					goto out;
+				leader = child;
+				goto down;
+			}
+up:
+			;
+		}
+	}
+
+	if (leader != top) {
+		child = leader;
+		parent = child->real_parent;
+		leader = parent->group_leader;
+		goto up;
+	}
+out:
+	read_unlock(&tasklist_lock);
+}
+
 #ifndef ARCH_MIN_MMSTRUCT_ALIGN
 #define ARCH_MIN_MMSTRUCT_ALIGN 0
 #endif
-- 
cgit v1.2.3


From c3485ee0d560b182e1e0f67d67246718739f0782 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Thu, 2 Feb 2017 13:48:05 -0600
Subject: tty_port: Add port client functions

Introduce a client (upward direction) operations struct for tty_port
clients. Initially supported operations are for receiving data and write
wake-up. This will allow for having clients other than an ldisc.

Convert the calls to the ldisc to use the client ops as the default
operations.

Signed-off-by: Rob Herring <robh@kernel.org>
Reviewed-By: Sebastian Reichel <sre@kernel.org>
Tested-By: Sebastian Reichel <sre@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/tty_buffer.c | 17 +++--------------
 drivers/tty/tty_port.c   | 46 ++++++++++++++++++++++++++++++++++++++++------
 include/linux/tty.h      |  9 ++++++++-
 3 files changed, 51 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c
index f4dc3e296dd5..4e7a4e9dcf4d 100644
--- a/drivers/tty/tty_buffer.c
+++ b/drivers/tty/tty_buffer.c
@@ -437,7 +437,7 @@ int tty_ldisc_receive_buf(struct tty_ldisc *ld, const unsigned char *p,
 EXPORT_SYMBOL_GPL(tty_ldisc_receive_buf);
 
 static int
-receive_buf(struct tty_ldisc *ld, struct tty_buffer *head, int count)
+receive_buf(struct tty_port *port, struct tty_buffer *head, int count)
 {
 	unsigned char *p = char_buf_ptr(head, head->read);
 	char	      *f = NULL;
@@ -445,7 +445,7 @@ receive_buf(struct tty_ldisc *ld, struct tty_buffer *head, int count)
 	if (~head->flags & TTYB_NORMAL)
 		f = flag_buf_ptr(head, head->read);
 
-	return tty_ldisc_receive_buf(ld, p, f, count);
+	return port->client_ops->receive_buf(port, p, f, count);
 }
 
 /**
@@ -465,16 +465,6 @@ static void flush_to_ldisc(struct work_struct *work)
 {
 	struct tty_port *port = container_of(work, struct tty_port, buf.work);
 	struct tty_bufhead *buf = &port->buf;
-	struct tty_struct *tty;
-	struct tty_ldisc *disc;
-
-	tty = READ_ONCE(port->itty);
-	if (tty == NULL)
-		return;
-
-	disc = tty_ldisc_ref(tty);
-	if (disc == NULL)
-		return;
 
 	mutex_lock(&buf->lock);
 
@@ -504,7 +494,7 @@ static void flush_to_ldisc(struct work_struct *work)
 			continue;
 		}
 
-		count = receive_buf(disc, head, count);
+		count = receive_buf(port, head, count);
 		if (!count)
 			break;
 		head->read += count;
@@ -512,7 +502,6 @@ static void flush_to_ldisc(struct work_struct *work)
 
 	mutex_unlock(&buf->lock);
 
-	tty_ldisc_deref(disc);
 }
 
 /**
diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c
index 1d8804843103..8d9886b06037 100644
--- a/drivers/tty/tty_port.c
+++ b/drivers/tty/tty_port.c
@@ -17,6 +17,44 @@
 #include <linux/delay.h>
 #include <linux/module.h>
 
+static int tty_port_default_receive_buf(struct tty_port *port,
+					const unsigned char *p,
+					const unsigned char *f, size_t count)
+{
+	int ret;
+	struct tty_struct *tty;
+	struct tty_ldisc *disc;
+
+	tty = READ_ONCE(port->itty);
+	if (!tty)
+		return 0;
+
+	disc = tty_ldisc_ref(tty);
+	if (!disc)
+		return 0;
+
+	ret = tty_ldisc_receive_buf(disc, p, (char *)f, count);
+
+	tty_ldisc_deref(disc);
+
+	return ret;
+}
+
+static void tty_port_default_wakeup(struct tty_port *port)
+{
+	struct tty_struct *tty = tty_port_tty_get(port);
+
+	if (tty) {
+		tty_wakeup(tty);
+		tty_kref_put(tty);
+	}
+}
+
+static const struct tty_port_client_operations default_client_ops = {
+	.receive_buf = tty_port_default_receive_buf,
+	.write_wakeup = tty_port_default_wakeup,
+};
+
 void tty_port_init(struct tty_port *port)
 {
 	memset(port, 0, sizeof(*port));
@@ -28,6 +66,7 @@ void tty_port_init(struct tty_port *port)
 	spin_lock_init(&port->lock);
 	port->close_delay = (50 * HZ) / 100;
 	port->closing_wait = (3000 * HZ) / 100;
+	port->client_ops = &default_client_ops;
 	kref_init(&port->kref);
 }
 EXPORT_SYMBOL(tty_port_init);
@@ -272,12 +311,7 @@ EXPORT_SYMBOL_GPL(tty_port_tty_hangup);
  */
 void tty_port_tty_wakeup(struct tty_port *port)
 {
-	struct tty_struct *tty = tty_port_tty_get(port);
-
-	if (tty) {
-		tty_wakeup(tty);
-		tty_kref_put(tty);
-	}
+	port->client_ops->write_wakeup(port);
 }
 EXPORT_SYMBOL_GPL(tty_port_tty_wakeup);
 
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 21c0fabfed60..1017e904c0a3 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -217,12 +217,18 @@ struct tty_port_operations {
 	/* Called on the final put of a port */
 	void (*destruct)(struct tty_port *port);
 };
-	
+
+struct tty_port_client_operations {
+	int (*receive_buf)(struct tty_port *port, const unsigned char *, const unsigned char *, size_t);
+	void (*write_wakeup)(struct tty_port *port);
+};
+
 struct tty_port {
 	struct tty_bufhead	buf;		/* Locked internally */
 	struct tty_struct	*tty;		/* Back pointer */
 	struct tty_struct	*itty;		/* internal back ptr */
 	const struct tty_port_operations *ops;	/* Port operations */
+	const struct tty_port_client_operations *client_ops; /* Port client operations */
 	spinlock_t		lock;		/* Lock protecting tty field */
 	int			blocked_open;	/* Waiting to open */
 	int			count;		/* Usage count */
@@ -241,6 +247,7 @@ struct tty_port {
 						   based drain is needed else
 						   set to size of fifo */
 	struct kref		kref;		/* Ref counter */
+	void 			*client_data;
 };
 
 /* tty_port::iflags bits -- use atomic bit ops */
-- 
cgit v1.2.3


From cd6484e1830be260abfba80a9c7d8f65531126d6 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Thu, 2 Feb 2017 13:48:07 -0600
Subject: serdev: Introduce new bus for serial attached devices

The serdev bus is designed for devices such as Bluetooth, WiFi, GPS
and NFC connected to UARTs on host processors. Tradionally these have
been handled with tty line disciplines, rfkill, and userspace glue such
as hciattach. This approach has many drawbacks since it doesn't fit
into the Linux driver model. Handling of sideband signals, power control
and firmware loading are the main issues.

This creates a serdev bus with controllers (i.e. host serial ports) and
attached devices. Typically, these are point to point connections, but
some devices have muxing protocols or a h/w mux is conceivable. Any
muxing is not yet supported with the serdev bus.

Signed-off-by: Rob Herring <robh@kernel.org>
Reviewed-By: Sebastian Reichel <sre@kernel.org>
Tested-By: Sebastian Reichel <sre@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 MAINTAINERS                 |   8 +
 drivers/char/Kconfig        |   1 +
 drivers/tty/Makefile        |   1 +
 drivers/tty/serdev/Kconfig  |   8 +
 drivers/tty/serdev/Makefile |   3 +
 drivers/tty/serdev/core.c   | 421 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/serdev.h      | 241 +++++++++++++++++++++++++
 7 files changed, 683 insertions(+)
 create mode 100644 drivers/tty/serdev/Kconfig
 create mode 100644 drivers/tty/serdev/Makefile
 create mode 100644 drivers/tty/serdev/core.c
 create mode 100644 include/linux/serdev.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index c36976d3bd1a..90db2b5d418c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10831,6 +10831,14 @@ S:	Maintained
 F:	Documentation/devicetree/bindings/serial/
 F:	drivers/tty/serial/
 
+SERIAL DEVICE BUS
+M:	Rob Herring <robh@kernel.org>
+L:	linux-serial@vger.kernel.org
+S:	Maintained
+F:	Documentation/devicetree/bindings/serial/slave-device.txt
+F:	drivers/tty/serdev/
+F:	include/linux/serdev.h
+
 SERIAL IR RECEIVER
 M:	Sean Young <sean@mess.org>
 L:	linux-media@vger.kernel.org
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index fde005ef9d36..db8f74bbaf3e 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -46,6 +46,7 @@ config SGI_MBCS
          say Y or M here, otherwise say N.
 
 source "drivers/tty/serial/Kconfig"
+source "drivers/tty/serdev/Kconfig"
 
 config TTY_PRINTK
 	tristate "TTY driver to output user messages via printk"
diff --git a/drivers/tty/Makefile b/drivers/tty/Makefile
index 5817e2397463..b95bed92da9f 100644
--- a/drivers/tty/Makefile
+++ b/drivers/tty/Makefile
@@ -13,6 +13,7 @@ obj-$(CONFIG_R3964)		+= n_r3964.o
 obj-y				+= vt/
 obj-$(CONFIG_HVC_DRIVER)	+= hvc/
 obj-y				+= serial/
+obj-$(CONFIG_SERIAL_DEV_BUS)	+= serdev/
 
 # tty drivers
 obj-$(CONFIG_AMIGA_BUILTIN_SERIAL) += amiserial.o
diff --git a/drivers/tty/serdev/Kconfig b/drivers/tty/serdev/Kconfig
new file mode 100644
index 000000000000..3b6ecd187bef
--- /dev/null
+++ b/drivers/tty/serdev/Kconfig
@@ -0,0 +1,8 @@
+#
+# Serial bus device driver configuration
+#
+menuconfig SERIAL_DEV_BUS
+	tristate "Serial device bus"
+	help
+	  Core support for devices connected via a serial port.
+
diff --git a/drivers/tty/serdev/Makefile b/drivers/tty/serdev/Makefile
new file mode 100644
index 000000000000..01a9b62183f4
--- /dev/null
+++ b/drivers/tty/serdev/Makefile
@@ -0,0 +1,3 @@
+serdev-objs := core.o
+
+obj-$(CONFIG_SERIAL_DEV_BUS) += serdev.o
diff --git a/drivers/tty/serdev/core.c b/drivers/tty/serdev/core.c
new file mode 100644
index 000000000000..f4c6c90add78
--- /dev/null
+++ b/drivers/tty/serdev/core.c
@@ -0,0 +1,421 @@
+/*
+ * Copyright (C) 2016-2017 Linaro Ltd., Rob Herring <robh@kernel.org>
+ *
+ * Based on drivers/spmi/spmi.c:
+ * Copyright (c) 2012-2015, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/errno.h>
+#include <linux/idr.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/serdev.h>
+#include <linux/slab.h>
+
+static bool is_registered;
+static DEFINE_IDA(ctrl_ida);
+
+static void serdev_device_release(struct device *dev)
+{
+	struct serdev_device *serdev = to_serdev_device(dev);
+	kfree(serdev);
+}
+
+static const struct device_type serdev_device_type = {
+	.release	= serdev_device_release,
+};
+
+static void serdev_ctrl_release(struct device *dev)
+{
+	struct serdev_controller *ctrl = to_serdev_controller(dev);
+	ida_simple_remove(&ctrl_ida, ctrl->nr);
+	kfree(ctrl);
+}
+
+static const struct device_type serdev_ctrl_type = {
+	.release	= serdev_ctrl_release,
+};
+
+static int serdev_device_match(struct device *dev, struct device_driver *drv)
+{
+	/* TODO: ACPI and platform matching */
+	return of_driver_match_device(dev, drv);
+}
+
+static int serdev_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+	/* TODO: ACPI and platform modalias */
+	return of_device_uevent_modalias(dev, env);
+}
+
+/**
+ * serdev_device_add() - add a device previously constructed via serdev_device_alloc()
+ * @serdev:	serdev_device to be added
+ */
+int serdev_device_add(struct serdev_device *serdev)
+{
+	struct device *parent = serdev->dev.parent;
+	int err;
+
+	dev_set_name(&serdev->dev, "%s-%d", dev_name(parent), serdev->nr);
+
+	err = device_add(&serdev->dev);
+	if (err < 0) {
+		dev_err(&serdev->dev, "Can't add %s, status %d\n",
+			dev_name(&serdev->dev), err);
+		goto err_device_add;
+	}
+
+	dev_dbg(&serdev->dev, "device %s registered\n", dev_name(&serdev->dev));
+
+err_device_add:
+	return err;
+}
+EXPORT_SYMBOL_GPL(serdev_device_add);
+
+/**
+ * serdev_device_remove(): remove an serdev device
+ * @serdev:	serdev_device to be removed
+ */
+void serdev_device_remove(struct serdev_device *serdev)
+{
+	device_unregister(&serdev->dev);
+}
+EXPORT_SYMBOL_GPL(serdev_device_remove);
+
+int serdev_device_open(struct serdev_device *serdev)
+{
+	struct serdev_controller *ctrl = serdev->ctrl;
+
+	if (!ctrl || !ctrl->ops->open)
+		return -EINVAL;
+
+	return ctrl->ops->open(ctrl);
+}
+EXPORT_SYMBOL_GPL(serdev_device_open);
+
+void serdev_device_close(struct serdev_device *serdev)
+{
+	struct serdev_controller *ctrl = serdev->ctrl;
+
+	if (!ctrl || !ctrl->ops->close)
+		return;
+
+	ctrl->ops->close(ctrl);
+}
+EXPORT_SYMBOL_GPL(serdev_device_close);
+
+int serdev_device_write_buf(struct serdev_device *serdev,
+			    const unsigned char *buf, size_t count)
+{
+	struct serdev_controller *ctrl = serdev->ctrl;
+
+	if (!ctrl || !ctrl->ops->write_buf)
+		return -EINVAL;
+
+	return ctrl->ops->write_buf(ctrl, buf, count);
+}
+EXPORT_SYMBOL_GPL(serdev_device_write_buf);
+
+void serdev_device_write_flush(struct serdev_device *serdev)
+{
+	struct serdev_controller *ctrl = serdev->ctrl;
+
+	if (!ctrl || !ctrl->ops->write_flush)
+		return;
+
+	ctrl->ops->write_flush(ctrl);
+}
+EXPORT_SYMBOL_GPL(serdev_device_write_flush);
+
+int serdev_device_write_room(struct serdev_device *serdev)
+{
+	struct serdev_controller *ctrl = serdev->ctrl;
+
+	if (!ctrl || !ctrl->ops->write_room)
+		return 0;
+
+	return serdev->ctrl->ops->write_room(ctrl);
+}
+EXPORT_SYMBOL_GPL(serdev_device_write_room);
+
+unsigned int serdev_device_set_baudrate(struct serdev_device *serdev, unsigned int speed)
+{
+	struct serdev_controller *ctrl = serdev->ctrl;
+
+	if (!ctrl || !ctrl->ops->set_baudrate)
+		return 0;
+
+	return ctrl->ops->set_baudrate(ctrl, speed);
+
+}
+EXPORT_SYMBOL_GPL(serdev_device_set_baudrate);
+
+void serdev_device_set_flow_control(struct serdev_device *serdev, bool enable)
+{
+	struct serdev_controller *ctrl = serdev->ctrl;
+
+	if (!ctrl || !ctrl->ops->set_flow_control)
+		return;
+
+	ctrl->ops->set_flow_control(ctrl, enable);
+}
+EXPORT_SYMBOL_GPL(serdev_device_set_flow_control);
+
+static int serdev_drv_probe(struct device *dev)
+{
+	const struct serdev_device_driver *sdrv = to_serdev_device_driver(dev->driver);
+
+	return sdrv->probe(to_serdev_device(dev));
+}
+
+static int serdev_drv_remove(struct device *dev)
+{
+	const struct serdev_device_driver *sdrv = to_serdev_device_driver(dev->driver);
+
+	sdrv->remove(to_serdev_device(dev));
+	return 0;
+}
+
+static ssize_t modalias_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	ssize_t len = of_device_get_modalias(dev, buf, PAGE_SIZE - 2);
+	buf[len] = '\n';
+	buf[len+1] = 0;
+	return len+1;
+}
+
+static struct device_attribute serdev_device_attrs[] = {
+	__ATTR_RO(modalias),
+	__ATTR_NULL
+};
+
+static struct bus_type serdev_bus_type = {
+	.name		= "serial",
+	.match		= serdev_device_match,
+	.probe		= serdev_drv_probe,
+	.remove		= serdev_drv_remove,
+	.uevent		= serdev_uevent,
+	.dev_attrs	= serdev_device_attrs,
+};
+
+/**
+ * serdev_controller_alloc() - Allocate a new serdev device
+ * @ctrl:	associated controller
+ *
+ * Caller is responsible for either calling serdev_device_add() to add the
+ * newly allocated controller, or calling serdev_device_put() to discard it.
+ */
+struct serdev_device *serdev_device_alloc(struct serdev_controller *ctrl)
+{
+	struct serdev_device *serdev;
+
+	serdev = kzalloc(sizeof(*serdev), GFP_KERNEL);
+	if (!serdev)
+		return NULL;
+
+	serdev->ctrl = ctrl;
+	ctrl->serdev = serdev;
+	device_initialize(&serdev->dev);
+	serdev->dev.parent = &ctrl->dev;
+	serdev->dev.bus = &serdev_bus_type;
+	serdev->dev.type = &serdev_device_type;
+	return serdev;
+}
+EXPORT_SYMBOL_GPL(serdev_device_alloc);
+
+/**
+ * serdev_controller_alloc() - Allocate a new serdev controller
+ * @parent:	parent device
+ * @size:	size of private data
+ *
+ * Caller is responsible for either calling serdev_controller_add() to add the
+ * newly allocated controller, or calling serdev_controller_put() to discard it.
+ * The allocated private data region may be accessed via
+ * serdev_controller_get_drvdata()
+ */
+struct serdev_controller *serdev_controller_alloc(struct device *parent,
+					      size_t size)
+{
+	struct serdev_controller *ctrl;
+	int id;
+
+	if (WARN_ON(!parent))
+		return NULL;
+
+	ctrl = kzalloc(sizeof(*ctrl) + size, GFP_KERNEL);
+	if (!ctrl)
+		return NULL;
+
+	device_initialize(&ctrl->dev);
+	ctrl->dev.type = &serdev_ctrl_type;
+	ctrl->dev.bus = &serdev_bus_type;
+	ctrl->dev.parent = parent;
+	ctrl->dev.of_node = parent->of_node;
+	serdev_controller_set_drvdata(ctrl, &ctrl[1]);
+
+	id = ida_simple_get(&ctrl_ida, 0, 0, GFP_KERNEL);
+	if (id < 0) {
+		dev_err(parent,
+			"unable to allocate serdev controller identifier.\n");
+		serdev_controller_put(ctrl);
+		return NULL;
+	}
+
+	ctrl->nr = id;
+	dev_set_name(&ctrl->dev, "serial%d", id);
+
+	dev_dbg(&ctrl->dev, "allocated controller 0x%p id %d\n", ctrl, id);
+	return ctrl;
+}
+EXPORT_SYMBOL_GPL(serdev_controller_alloc);
+
+static int of_serdev_register_devices(struct serdev_controller *ctrl)
+{
+	struct device_node *node;
+	struct serdev_device *serdev = NULL;
+	int err;
+	bool found = false;
+
+	for_each_available_child_of_node(ctrl->dev.of_node, node) {
+		if (!of_get_property(node, "compatible", NULL))
+			continue;
+
+		dev_dbg(&ctrl->dev, "adding child %s\n", node->full_name);
+
+		serdev = serdev_device_alloc(ctrl);
+		if (!serdev)
+			continue;
+
+		serdev->dev.of_node = node;
+
+		err = serdev_device_add(serdev);
+		if (err) {
+			dev_err(&serdev->dev,
+				"failure adding device. status %d\n", err);
+			serdev_device_put(serdev);
+		} else
+			found = true;
+	}
+	if (!found)
+		return -ENODEV;
+
+	return 0;
+}
+
+/**
+ * serdev_controller_add() - Add an serdev controller
+ * @ctrl:	controller to be registered.
+ *
+ * Register a controller previously allocated via serdev_controller_alloc() with
+ * the serdev core.
+ */
+int serdev_controller_add(struct serdev_controller *ctrl)
+{
+	int ret;
+
+	/* Can't register until after driver model init */
+	if (WARN_ON(!is_registered))
+		return -EAGAIN;
+
+	ret = device_add(&ctrl->dev);
+	if (ret)
+		return ret;
+
+	ret = of_serdev_register_devices(ctrl);
+	if (ret)
+		goto out_dev_del;
+
+	dev_dbg(&ctrl->dev, "serdev%d registered: dev:%p\n",
+		ctrl->nr, &ctrl->dev);
+	return 0;
+
+out_dev_del:
+	device_del(&ctrl->dev);
+	return ret;
+};
+EXPORT_SYMBOL_GPL(serdev_controller_add);
+
+/* Remove a device associated with a controller */
+static int serdev_remove_device(struct device *dev, void *data)
+{
+	struct serdev_device *serdev = to_serdev_device(dev);
+	if (dev->type == &serdev_device_type)
+		serdev_device_remove(serdev);
+	return 0;
+}
+
+/**
+ * serdev_controller_remove(): remove an serdev controller
+ * @ctrl:	controller to remove
+ *
+ * Remove a serdev controller.  Caller is responsible for calling
+ * serdev_controller_put() to discard the allocated controller.
+ */
+void serdev_controller_remove(struct serdev_controller *ctrl)
+{
+	int dummy;
+
+	if (!ctrl)
+		return;
+
+	dummy = device_for_each_child(&ctrl->dev, NULL,
+				      serdev_remove_device);
+	device_del(&ctrl->dev);
+}
+EXPORT_SYMBOL_GPL(serdev_controller_remove);
+
+/**
+ * serdev_driver_register() - Register client driver with serdev core
+ * @sdrv:	client driver to be associated with client-device.
+ *
+ * This API will register the client driver with the serdev framework.
+ * It is typically called from the driver's module-init function.
+ */
+int __serdev_device_driver_register(struct serdev_device_driver *sdrv, struct module *owner)
+{
+	sdrv->driver.bus = &serdev_bus_type;
+	sdrv->driver.owner = owner;
+
+	/* force drivers to async probe so I/O is possible in probe */
+        sdrv->driver.probe_type = PROBE_PREFER_ASYNCHRONOUS;
+
+	return driver_register(&sdrv->driver);
+}
+EXPORT_SYMBOL_GPL(__serdev_device_driver_register);
+
+static void __exit serdev_exit(void)
+{
+	bus_unregister(&serdev_bus_type);
+}
+module_exit(serdev_exit);
+
+static int __init serdev_init(void)
+{
+	int ret;
+
+	ret = bus_register(&serdev_bus_type);
+	if (ret)
+		return ret;
+
+	is_registered = true;
+	return 0;
+}
+/* Must be before serial drivers register */
+postcore_initcall(serdev_init);
+
+MODULE_AUTHOR("Rob Herring <robh@kernel.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Serial attached device bus");
diff --git a/include/linux/serdev.h b/include/linux/serdev.h
new file mode 100644
index 000000000000..3ac26c1a8aab
--- /dev/null
+++ b/include/linux/serdev.h
@@ -0,0 +1,241 @@
+/*
+ * Copyright (C) 2016-2017 Linaro Ltd., Rob Herring <robh@kernel.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#ifndef _LINUX_SERDEV_H
+#define _LINUX_SERDEV_H
+
+#include <linux/types.h>
+#include <linux/device.h>
+
+struct serdev_controller;
+struct serdev_device;
+
+/*
+ * serdev device structures
+ */
+
+/**
+ * struct serdev_device_ops - Callback operations for a serdev device
+ * @receive_buf:	Function called with data received from device.
+ * @write_wakeup:	Function called when ready to transmit more data.
+ */
+struct serdev_device_ops {
+	int (*receive_buf)(struct serdev_device *, const unsigned char *, size_t);
+	void (*write_wakeup)(struct serdev_device *);
+};
+
+/**
+ * struct serdev_device - Basic representation of an serdev device
+ * @dev:	Driver model representation of the device.
+ * @nr:		Device number on serdev bus.
+ * @ctrl:	serdev controller managing this device.
+ * @ops:	Device operations.
+ */
+struct serdev_device {
+	struct device dev;
+	int nr;
+	struct serdev_controller *ctrl;
+	const struct serdev_device_ops *ops;
+};
+
+static inline struct serdev_device *to_serdev_device(struct device *d)
+{
+	return container_of(d, struct serdev_device, dev);
+}
+
+/**
+ * struct serdev_device_driver - serdev slave device driver
+ * @driver:	serdev device drivers should initialize name field of this
+ *		structure.
+ * @probe:	binds this driver to a serdev device.
+ * @remove:	unbinds this driver from the serdev device.
+ */
+struct serdev_device_driver {
+	struct device_driver driver;
+	int	(*probe)(struct serdev_device *);
+	void	(*remove)(struct serdev_device *);
+};
+
+static inline struct serdev_device_driver *to_serdev_device_driver(struct device_driver *d)
+{
+	return container_of(d, struct serdev_device_driver, driver);
+}
+
+/*
+ * serdev controller structures
+ */
+struct serdev_controller_ops {
+	int (*write_buf)(struct serdev_controller *, const unsigned char *, size_t);
+	void (*write_flush)(struct serdev_controller *);
+	int (*write_room)(struct serdev_controller *);
+	int (*open)(struct serdev_controller *);
+	void (*close)(struct serdev_controller *);
+	void (*set_flow_control)(struct serdev_controller *, bool);
+	unsigned int (*set_baudrate)(struct serdev_controller *, unsigned int);
+};
+
+/**
+ * struct serdev_controller - interface to the serdev controller
+ * @dev:	Driver model representation of the device.
+ * @nr:		number identifier for this controller/bus.
+ * @serdev:	Pointer to slave device for this controller.
+ * @ops:	Controller operations.
+ */
+struct serdev_controller {
+	struct device		dev;
+	unsigned int		nr;
+	struct serdev_device	*serdev;
+	const struct serdev_controller_ops *ops;
+};
+
+static inline struct serdev_controller *to_serdev_controller(struct device *d)
+{
+	return container_of(d, struct serdev_controller, dev);
+}
+
+static inline void *serdev_device_get_drvdata(const struct serdev_device *serdev)
+{
+	return dev_get_drvdata(&serdev->dev);
+}
+
+static inline void serdev_device_set_drvdata(struct serdev_device *serdev, void *data)
+{
+	dev_set_drvdata(&serdev->dev, data);
+}
+
+/**
+ * serdev_device_put() - decrement serdev device refcount
+ * @serdev	serdev device.
+ */
+static inline void serdev_device_put(struct serdev_device *serdev)
+{
+	if (serdev)
+		put_device(&serdev->dev);
+}
+
+static inline void serdev_device_set_client_ops(struct serdev_device *serdev,
+					      const struct serdev_device_ops *ops)
+{
+	serdev->ops = ops;
+}
+
+static inline
+void *serdev_controller_get_drvdata(const struct serdev_controller *ctrl)
+{
+	return ctrl ? dev_get_drvdata(&ctrl->dev) : NULL;
+}
+
+static inline void serdev_controller_set_drvdata(struct serdev_controller *ctrl,
+					       void *data)
+{
+	dev_set_drvdata(&ctrl->dev, data);
+}
+
+/**
+ * serdev_controller_put() - decrement controller refcount
+ * @ctrl	serdev controller.
+ */
+static inline void serdev_controller_put(struct serdev_controller *ctrl)
+{
+	if (ctrl)
+		put_device(&ctrl->dev);
+}
+
+struct serdev_device *serdev_device_alloc(struct serdev_controller *);
+int serdev_device_add(struct serdev_device *);
+void serdev_device_remove(struct serdev_device *);
+
+struct serdev_controller *serdev_controller_alloc(struct device *, size_t);
+int serdev_controller_add(struct serdev_controller *);
+void serdev_controller_remove(struct serdev_controller *);
+
+static inline void serdev_controller_write_wakeup(struct serdev_controller *ctrl)
+{
+	struct serdev_device *serdev = ctrl->serdev;
+
+	if (!serdev || !serdev->ops->write_wakeup)
+		return;
+
+	serdev->ops->write_wakeup(ctrl->serdev);
+}
+
+static inline int serdev_controller_receive_buf(struct serdev_controller *ctrl,
+					      const unsigned char *data,
+					      size_t count)
+{
+	struct serdev_device *serdev = ctrl->serdev;
+
+	if (!serdev || !serdev->ops->receive_buf)
+		return -EINVAL;
+
+	return serdev->ops->receive_buf(ctrl->serdev, data, count);
+}
+
+#if IS_ENABLED(CONFIG_SERIAL_DEV_BUS)
+
+int serdev_device_open(struct serdev_device *);
+void serdev_device_close(struct serdev_device *);
+unsigned int serdev_device_set_baudrate(struct serdev_device *, unsigned int);
+void serdev_device_set_flow_control(struct serdev_device *, bool);
+int serdev_device_write_buf(struct serdev_device *, const unsigned char *, size_t);
+void serdev_device_write_flush(struct serdev_device *);
+int serdev_device_write_room(struct serdev_device *);
+
+/*
+ * serdev device driver functions
+ */
+int __serdev_device_driver_register(struct serdev_device_driver *, struct module *);
+#define serdev_device_driver_register(sdrv) \
+	__serdev_device_driver_register(sdrv, THIS_MODULE)
+
+/**
+ * serdev_device_driver_unregister() - unregister an serdev client driver
+ * @sdrv:	the driver to unregister
+ */
+static inline void serdev_device_driver_unregister(struct serdev_device_driver *sdrv)
+{
+	if (sdrv)
+		driver_unregister(&sdrv->driver);
+}
+
+#define module_serdev_device_driver(__serdev_device_driver) \
+	module_driver(__serdev_device_driver, serdev_device_driver_register, \
+			serdev_device_driver_unregister)
+
+#else
+
+static inline int serdev_device_open(struct serdev_device *sdev)
+{
+	return -ENODEV;
+}
+static inline void serdev_device_close(struct serdev_device *sdev) {}
+static inline unsigned int serdev_device_set_baudrate(struct serdev_device *sdev, unsigned int baudrate)
+{
+	return 0;
+}
+static inline void serdev_device_set_flow_control(struct serdev_device *sdev, bool enable) {}
+static inline int serdev_device_write_buf(struct serdev_device *sdev, const unsigned char *buf, size_t count)
+{
+	return -ENODEV;
+}
+static inline void serdev_device_write_flush(struct serdev_device *sdev) {}
+static inline int serdev_device_write_room(struct serdev_device *sdev)
+{
+	return 0;
+}
+
+#define serdev_device_driver_register(x)
+#define serdev_device_driver_unregister(x)
+
+#endif /* CONFIG_SERIAL_DEV_BUS */
+
+#endif /*_LINUX_SERDEV_H */
-- 
cgit v1.2.3


From bed35c6dfa6a36233c3e1238a40dc1ae67955898 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Thu, 2 Feb 2017 13:48:08 -0600
Subject: serdev: add a tty port controller driver

Add a serdev controller driver for tty ports.

The controller is registered with serdev when tty ports are registered
with the TTY core. As the TTY core is built-in only, this has the side
effect of making serdev built-in as well.

Signed-off-by: Rob Herring <robh@kernel.org>
Reviewed-By: Sebastian Reichel <sre@kernel.org>
Tested-By: Sebastian Reichel <sre@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serdev/Kconfig          |   8 ++
 drivers/tty/serdev/Makefile         |   2 +
 drivers/tty/serdev/serdev-ttyport.c | 224 ++++++++++++++++++++++++++++++++++++
 include/linux/serdev.h              |  21 ++++
 4 files changed, 255 insertions(+)
 create mode 100644 drivers/tty/serdev/serdev-ttyport.c

(limited to 'include/linux')

diff --git a/drivers/tty/serdev/Kconfig b/drivers/tty/serdev/Kconfig
index 3b6ecd187bef..cdc6b820cf93 100644
--- a/drivers/tty/serdev/Kconfig
+++ b/drivers/tty/serdev/Kconfig
@@ -6,3 +6,11 @@ menuconfig SERIAL_DEV_BUS
 	help
 	  Core support for devices connected via a serial port.
 
+if SERIAL_DEV_BUS
+
+config SERIAL_DEV_CTRL_TTYPORT
+	bool "Serial device TTY port controller"
+	depends on TTY
+	depends on SERIAL_DEV_BUS != m
+
+endif
diff --git a/drivers/tty/serdev/Makefile b/drivers/tty/serdev/Makefile
index 01a9b62183f4..0cbdb9444d9d 100644
--- a/drivers/tty/serdev/Makefile
+++ b/drivers/tty/serdev/Makefile
@@ -1,3 +1,5 @@
 serdev-objs := core.o
 
 obj-$(CONFIG_SERIAL_DEV_BUS) += serdev.o
+
+obj-$(CONFIG_SERIAL_DEV_CTRL_TTYPORT) += serdev-ttyport.o
diff --git a/drivers/tty/serdev/serdev-ttyport.c b/drivers/tty/serdev/serdev-ttyport.c
new file mode 100644
index 000000000000..683320b81a2b
--- /dev/null
+++ b/drivers/tty/serdev/serdev-ttyport.c
@@ -0,0 +1,224 @@
+/*
+ * Copyright (C) 2016-2017 Linaro Ltd., Rob Herring <robh@kernel.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/serdev.h>
+#include <linux/tty.h>
+#include <linux/tty_driver.h>
+
+#define SERPORT_ACTIVE		1
+
+struct serport {
+	struct tty_port *port;
+	struct tty_struct *tty;
+	struct tty_driver *tty_drv;
+	int tty_idx;
+	unsigned long flags;
+};
+
+/*
+ * Callback functions from the tty port.
+ */
+
+static int ttyport_receive_buf(struct tty_port *port, const unsigned char *cp,
+				const unsigned char *fp, size_t count)
+{
+	struct serdev_controller *ctrl = port->client_data;
+	struct serport *serport = serdev_controller_get_drvdata(ctrl);
+
+	if (!test_bit(SERPORT_ACTIVE, &serport->flags))
+		return 0;
+
+	return serdev_controller_receive_buf(ctrl, cp, count);
+}
+
+static void ttyport_write_wakeup(struct tty_port *port)
+{
+	struct serdev_controller *ctrl = port->client_data;
+	struct serport *serport = serdev_controller_get_drvdata(ctrl);
+
+	if (!test_and_clear_bit(TTY_DO_WRITE_WAKEUP, &port->tty->flags))
+		return;
+
+	if (test_bit(SERPORT_ACTIVE, &serport->flags))
+		serdev_controller_write_wakeup(ctrl);
+}
+
+static const struct tty_port_client_operations client_ops = {
+	.receive_buf = ttyport_receive_buf,
+	.write_wakeup = ttyport_write_wakeup,
+};
+
+/*
+ * Callback functions from the serdev core.
+ */
+
+static int ttyport_write_buf(struct serdev_controller *ctrl, const unsigned char *data, size_t len)
+{
+	struct serport *serport = serdev_controller_get_drvdata(ctrl);
+	struct tty_struct *tty = serport->tty;
+
+	if (!test_bit(SERPORT_ACTIVE, &serport->flags))
+		return 0;
+
+	set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags);
+	return tty->ops->write(serport->tty, data, len);
+}
+
+static void ttyport_write_flush(struct serdev_controller *ctrl)
+{
+	struct serport *serport = serdev_controller_get_drvdata(ctrl);
+	struct tty_struct *tty = serport->tty;
+
+	tty_driver_flush_buffer(tty);
+}
+
+static int ttyport_write_room(struct serdev_controller *ctrl)
+{
+	struct serport *serport = serdev_controller_get_drvdata(ctrl);
+	struct tty_struct *tty = serport->tty;
+
+	return tty_write_room(tty);
+}
+
+static int ttyport_open(struct serdev_controller *ctrl)
+{
+	struct serport *serport = serdev_controller_get_drvdata(ctrl);
+	struct tty_struct *tty;
+	struct ktermios ktermios;
+
+	tty = tty_init_dev(serport->tty_drv, serport->tty_idx);
+	serport->tty = tty;
+
+	serport->port->client_ops = &client_ops;
+	serport->port->client_data = ctrl;
+
+	if (tty->ops->open)
+		tty->ops->open(serport->tty, NULL);
+	else
+		tty_port_open(serport->port, tty, NULL);
+
+	/* Bring the UART into a known 8 bits no parity hw fc state */
+	ktermios = tty->termios;
+	ktermios.c_iflag &= ~(IGNBRK | BRKINT | PARMRK | ISTRIP |
+			      INLCR | IGNCR | ICRNL | IXON);
+	ktermios.c_oflag &= ~OPOST;
+	ktermios.c_lflag &= ~(ECHO | ECHONL | ICANON | ISIG | IEXTEN);
+	ktermios.c_cflag &= ~(CSIZE | PARENB);
+	ktermios.c_cflag |= CS8;
+	ktermios.c_cflag |= CRTSCTS;
+	tty_set_termios(tty, &ktermios);
+
+	set_bit(SERPORT_ACTIVE, &serport->flags);
+
+	tty_unlock(serport->tty);
+	return 0;
+}
+
+static void ttyport_close(struct serdev_controller *ctrl)
+{
+	struct serport *serport = serdev_controller_get_drvdata(ctrl);
+	struct tty_struct *tty = serport->tty;
+
+	clear_bit(SERPORT_ACTIVE, &serport->flags);
+
+	if (tty->ops->close)
+		tty->ops->close(tty, NULL);
+
+	tty_release_struct(tty, serport->tty_idx);
+}
+
+static unsigned int ttyport_set_baudrate(struct serdev_controller *ctrl, unsigned int speed)
+{
+	struct serport *serport = serdev_controller_get_drvdata(ctrl);
+	struct tty_struct *tty = serport->tty;
+	struct ktermios ktermios = tty->termios;
+
+	ktermios.c_cflag &= ~CBAUD;
+	tty_termios_encode_baud_rate(&ktermios, speed, speed);
+
+	/* tty_set_termios() return not checked as it is always 0 */
+	tty_set_termios(tty, &ktermios);
+	return speed;
+}
+
+static void ttyport_set_flow_control(struct serdev_controller *ctrl, bool enable)
+{
+	struct serport *serport = serdev_controller_get_drvdata(ctrl);
+	struct tty_struct *tty = serport->tty;
+	struct ktermios ktermios = tty->termios;
+
+	if (enable)
+		ktermios.c_cflag |= CRTSCTS;
+	else
+		ktermios.c_cflag &= ~CRTSCTS;
+
+	tty_set_termios(tty, &ktermios);
+}
+
+static const struct serdev_controller_ops ctrl_ops = {
+	.write_buf = ttyport_write_buf,
+	.write_flush = ttyport_write_flush,
+	.write_room = ttyport_write_room,
+	.open = ttyport_open,
+	.close = ttyport_close,
+	.set_flow_control = ttyport_set_flow_control,
+	.set_baudrate = ttyport_set_baudrate,
+};
+
+struct device *serdev_tty_port_register(struct tty_port *port,
+					struct device *parent,
+					struct tty_driver *drv, int idx)
+{
+	struct serdev_controller *ctrl;
+	struct serport *serport;
+	int ret;
+
+	if (!port || !drv || !parent)
+		return ERR_PTR(-ENODEV);
+
+	ctrl = serdev_controller_alloc(parent, sizeof(struct serport));
+	if (!ctrl)
+		return ERR_PTR(-ENOMEM);
+	serport = serdev_controller_get_drvdata(ctrl);
+
+	serport->port = port;
+	serport->tty_idx = idx;
+	serport->tty_drv = drv;
+
+	ctrl->ops = &ctrl_ops;
+
+	ret = serdev_controller_add(ctrl);
+	if (ret)
+		goto err_controller_put;
+
+	dev_info(&ctrl->dev, "tty port %s%d registered\n", drv->name, idx);
+	return &ctrl->dev;
+
+err_controller_put:
+	serdev_controller_put(ctrl);
+	return ERR_PTR(ret);
+}
+
+void serdev_tty_port_unregister(struct tty_port *port)
+{
+	struct serdev_controller *ctrl = port->client_data;
+	struct serport *serport = serdev_controller_get_drvdata(ctrl);
+
+	if (!serport)
+		return;
+
+	serdev_controller_remove(ctrl);
+	port->client_ops = NULL;
+	port->client_data = NULL;
+	serdev_controller_put(ctrl);
+}
diff --git a/include/linux/serdev.h b/include/linux/serdev.h
index 3ac26c1a8aab..9519da6253a8 100644
--- a/include/linux/serdev.h
+++ b/include/linux/serdev.h
@@ -238,4 +238,25 @@ static inline int serdev_device_write_room(struct serdev_device *sdev)
 
 #endif /* CONFIG_SERIAL_DEV_BUS */
 
+/*
+ * serdev hooks into TTY core
+ */
+struct tty_port;
+struct tty_driver;
+
+#ifdef CONFIG_SERIAL_DEV_CTRL_TTYPORT
+struct device *serdev_tty_port_register(struct tty_port *port,
+					struct device *parent,
+					struct tty_driver *drv, int idx);
+void serdev_tty_port_unregister(struct tty_port *port);
+#else
+static inline struct device *serdev_tty_port_register(struct tty_port *port,
+					   struct device *parent,
+					   struct tty_driver *drv, int idx)
+{
+	return ERR_PTR(-ENODEV);
+}
+static inline void serdev_tty_port_unregister(struct tty_port *port) {}
+#endif /* CONFIG_SERIAL_DEV_CTRL_TTYPORT */
+
 #endif /*_LINUX_SERDEV_H */
-- 
cgit v1.2.3


From 3bb434cdcc6af3d4e70ba041e6f596e465d11e14 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 1 Feb 2017 15:42:57 +0100
Subject: vmw_vmci: switch to pci_irq_alloc_vectors

Cleans up the IRQ management code a lot, including removing a lot of
state from the per-device structure.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/vmw_vmci/vmci_guest.c | 75 ++++++++++----------------------------
 include/linux/vmw_vmci_defs.h      |  7 ----
 2 files changed, 20 insertions(+), 62 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/misc/vmw_vmci/vmci_guest.c b/drivers/misc/vmw_vmci/vmci_guest.c
index 189b32519748..9d659542a335 100644
--- a/drivers/misc/vmw_vmci/vmci_guest.c
+++ b/drivers/misc/vmw_vmci/vmci_guest.c
@@ -54,10 +54,7 @@ struct vmci_guest_device {
 	struct device *dev;	/* PCI device we are attached to */
 	void __iomem *iobase;
 
-	unsigned int irq;
-	unsigned int intr_type;
 	bool exclusive_vectors;
-	struct msix_entry msix_entries[VMCI_MAX_INTRS];
 
 	struct tasklet_struct datagram_tasklet;
 	struct tasklet_struct bm_tasklet;
@@ -368,30 +365,6 @@ static void vmci_process_bitmap(unsigned long data)
 	vmci_dbell_scan_notification_entries(dev->notification_bitmap);
 }
 
-/*
- * Enable MSI-X.  Try exclusive vectors first, then shared vectors.
- */
-static int vmci_enable_msix(struct pci_dev *pdev,
-			    struct vmci_guest_device *vmci_dev)
-{
-	int i;
-	int result;
-
-	for (i = 0; i < VMCI_MAX_INTRS; ++i) {
-		vmci_dev->msix_entries[i].entry = i;
-		vmci_dev->msix_entries[i].vector = i;
-	}
-
-	result = pci_enable_msix_exact(pdev,
-				       vmci_dev->msix_entries, VMCI_MAX_INTRS);
-	if (result == 0)
-		vmci_dev->exclusive_vectors = true;
-	else if (result == -ENOSPC)
-		result = pci_enable_msix_exact(pdev, vmci_dev->msix_entries, 1);
-
-	return result;
-}
-
 /*
  * Interrupt handler for legacy or MSI interrupt, or for first MSI-X
  * interrupt (vector VMCI_INTR_DATAGRAM).
@@ -406,7 +379,7 @@ static irqreturn_t vmci_interrupt(int irq, void *_dev)
 	 * Otherwise we must read the ICR to determine what to do.
 	 */
 
-	if (dev->intr_type == VMCI_INTR_TYPE_MSIX && dev->exclusive_vectors) {
+	if (dev->exclusive_vectors) {
 		tasklet_schedule(&dev->datagram_tasklet);
 	} else {
 		unsigned int icr;
@@ -491,7 +464,6 @@ static int vmci_guest_probe_device(struct pci_dev *pdev,
 	}
 
 	vmci_dev->dev = &pdev->dev;
-	vmci_dev->intr_type = VMCI_INTR_TYPE_INTX;
 	vmci_dev->exclusive_vectors = false;
 	vmci_dev->iobase = iobase;
 
@@ -592,26 +564,26 @@ static int vmci_guest_probe_device(struct pci_dev *pdev,
 	 * Enable interrupts.  Try MSI-X first, then MSI, and then fallback on
 	 * legacy interrupts.
 	 */
-	if (!vmci_disable_msix && !vmci_enable_msix(pdev, vmci_dev)) {
-		vmci_dev->intr_type = VMCI_INTR_TYPE_MSIX;
-		vmci_dev->irq = vmci_dev->msix_entries[0].vector;
-	} else if (!vmci_disable_msi && !pci_enable_msi(pdev)) {
-		vmci_dev->intr_type = VMCI_INTR_TYPE_MSI;
-		vmci_dev->irq = pdev->irq;
+	error = pci_alloc_irq_vectors(pdev, VMCI_MAX_INTRS, VMCI_MAX_INTRS,
+			PCI_IRQ_MSIX);
+	if (error) {
+		error = pci_alloc_irq_vectors(pdev, 1, 1,
+				PCI_IRQ_MSIX | PCI_IRQ_MSI | PCI_IRQ_LEGACY);
+		if (error)
+			goto err_remove_bitmap;
 	} else {
-		vmci_dev->intr_type = VMCI_INTR_TYPE_INTX;
-		vmci_dev->irq = pdev->irq;
+		vmci_dev->exclusive_vectors = true;
 	}
 
 	/*
 	 * Request IRQ for legacy or MSI interrupts, or for first
 	 * MSI-X vector.
 	 */
-	error = request_irq(vmci_dev->irq, vmci_interrupt, IRQF_SHARED,
-			    KBUILD_MODNAME, vmci_dev);
+	error = request_irq(pci_irq_vector(pdev, 0), vmci_interrupt,
+			    IRQF_SHARED, KBUILD_MODNAME, vmci_dev);
 	if (error) {
 		dev_err(&pdev->dev, "Irq %u in use: %d\n",
-			vmci_dev->irq, error);
+			pci_irq_vector(pdev, 0), error);
 		goto err_disable_msi;
 	}
 
@@ -622,13 +594,13 @@ static int vmci_guest_probe_device(struct pci_dev *pdev,
 	 * between the vectors.
 	 */
 	if (vmci_dev->exclusive_vectors) {
-		error = request_irq(vmci_dev->msix_entries[1].vector,
+		error = request_irq(pci_irq_vector(pdev, 1),
 				    vmci_interrupt_bm, 0, KBUILD_MODNAME,
 				    vmci_dev);
 		if (error) {
 			dev_err(&pdev->dev,
 				"Failed to allocate irq %u: %d\n",
-				vmci_dev->msix_entries[1].vector, error);
+				pci_irq_vector(pdev, 1), error);
 			goto err_free_irq;
 		}
 	}
@@ -651,15 +623,12 @@ static int vmci_guest_probe_device(struct pci_dev *pdev,
 	return 0;
 
 err_free_irq:
-	free_irq(vmci_dev->irq, vmci_dev);
+	free_irq(pci_irq_vector(pdev, 0), vmci_dev);
 	tasklet_kill(&vmci_dev->datagram_tasklet);
 	tasklet_kill(&vmci_dev->bm_tasklet);
 
 err_disable_msi:
-	if (vmci_dev->intr_type == VMCI_INTR_TYPE_MSIX)
-		pci_disable_msix(pdev);
-	else if (vmci_dev->intr_type == VMCI_INTR_TYPE_MSI)
-		pci_disable_msi(pdev);
+	pci_free_irq_vectors(pdev);
 
 	vmci_err = vmci_event_unsubscribe(ctx_update_sub_id);
 	if (vmci_err < VMCI_SUCCESS)
@@ -719,14 +688,10 @@ static void vmci_guest_remove_device(struct pci_dev *pdev)
 	 * MSI-X, we might have multiple vectors, each with their own
 	 * IRQ, which we must free too.
 	 */
-	free_irq(vmci_dev->irq, vmci_dev);
-	if (vmci_dev->intr_type == VMCI_INTR_TYPE_MSIX) {
-		if (vmci_dev->exclusive_vectors)
-			free_irq(vmci_dev->msix_entries[1].vector, vmci_dev);
-		pci_disable_msix(pdev);
-	} else if (vmci_dev->intr_type == VMCI_INTR_TYPE_MSI) {
-		pci_disable_msi(pdev);
-	}
+	if (vmci_dev->exclusive_vectors)
+		free_irq(pci_irq_vector(pdev, 1), vmci_dev);
+	free_irq(pci_irq_vector(pdev, 0), vmci_dev);
+	pci_free_irq_vectors(pdev);
 
 	tasklet_kill(&vmci_dev->datagram_tasklet);
 	tasklet_kill(&vmci_dev->bm_tasklet);
diff --git a/include/linux/vmw_vmci_defs.h b/include/linux/vmw_vmci_defs.h
index 1bd31a38c51e..b724ef7005de 100644
--- a/include/linux/vmw_vmci_defs.h
+++ b/include/linux/vmw_vmci_defs.h
@@ -54,13 +54,6 @@
 #define VMCI_IMR_DATAGRAM      0x1
 #define VMCI_IMR_NOTIFICATION  0x2
 
-/* Interrupt type. */
-enum {
-	VMCI_INTR_TYPE_INTX = 0,
-	VMCI_INTR_TYPE_MSI = 1,
-	VMCI_INTR_TYPE_MSIX = 2,
-};
-
 /* Maximum MSI/MSI-X interrupt vectors in the device. */
 #define VMCI_MAX_INTRS 2
 
-- 
cgit v1.2.3


From d44fa3d46079dc095c1346fa6e5bc96dca1ead41 Mon Sep 17 00:00:00 2001
From: Agustin Vega-Frias <agustinv@codeaurora.org>
Date: Thu, 2 Feb 2017 18:23:58 -0500
Subject: ACPI: Add support for ResourceSource/IRQ domain mapping

ACPI extended IRQ resources may contain a ResourceSource to specify
an alternate interrupt controller. Introduce acpi_irq_get and use it
to implement ResourceSource/IRQ domain mapping.

The new API is similar to of_irq_get and allows re-initialization
of a platform resource from the ACPI extended IRQ resource, and
provides proper behavior for probe deferral when the domain is not
yet present when called.

Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Acked-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Reviewed-by: Hanjun Guo <hanjun.guo@linaro.org>
Tested-by: Hanjun Guo <hanjun.guo@linaro.org>
Signed-off-by: Agustin Vega-Frias <agustinv@codeaurora.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 drivers/acpi/Makefile   |   2 +-
 drivers/acpi/gsi.c      |  98 ----------------
 drivers/acpi/irq.c      | 297 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/base/platform.c |  10 ++
 include/linux/acpi.h    |  10 ++
 5 files changed, 318 insertions(+), 99 deletions(-)
 delete mode 100644 drivers/acpi/gsi.c
 create mode 100644 drivers/acpi/irq.c

(limited to 'include/linux')

diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index 9ed087853dee..a391bbc48105 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -55,7 +55,7 @@ acpi-$(CONFIG_DEBUG_FS)		+= debugfs.o
 acpi-$(CONFIG_ACPI_NUMA)	+= numa.o
 acpi-$(CONFIG_ACPI_PROCFS_POWER) += cm_sbs.o
 acpi-y				+= acpi_lpat.o
-acpi-$(CONFIG_ACPI_GENERIC_GSI) += gsi.o
+acpi-$(CONFIG_ACPI_GENERIC_GSI) += irq.o
 acpi-$(CONFIG_ACPI_WATCHDOG)	+= acpi_watchdog.o
 
 # These are (potentially) separate modules
diff --git a/drivers/acpi/gsi.c b/drivers/acpi/gsi.c
deleted file mode 100644
index ee9e0f27b2bf..000000000000
--- a/drivers/acpi/gsi.c
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * ACPI GSI IRQ layer
- *
- * Copyright (C) 2015 ARM Ltd.
- * Author: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/acpi.h>
-#include <linux/irq.h>
-#include <linux/irqdomain.h>
-#include <linux/of.h>
-
-enum acpi_irq_model_id acpi_irq_model;
-
-static struct fwnode_handle *acpi_gsi_domain_id;
-
-/**
- * acpi_gsi_to_irq() - Retrieve the linux irq number for a given GSI
- * @gsi: GSI IRQ number to map
- * @irq: pointer where linux IRQ number is stored
- *
- * irq location updated with irq value [>0 on success, 0 on failure]
- *
- * Returns: linux IRQ number on success (>0)
- *          -EINVAL on failure
- */
-int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
-{
-	struct irq_domain *d = irq_find_matching_fwnode(acpi_gsi_domain_id,
-							DOMAIN_BUS_ANY);
-
-	*irq = irq_find_mapping(d, gsi);
-	/*
-	 * *irq == 0 means no mapping, that should
-	 * be reported as a failure
-	 */
-	return (*irq > 0) ? *irq : -EINVAL;
-}
-EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
-
-/**
- * acpi_register_gsi() - Map a GSI to a linux IRQ number
- * @dev: device for which IRQ has to be mapped
- * @gsi: GSI IRQ number
- * @trigger: trigger type of the GSI number to be mapped
- * @polarity: polarity of the GSI to be mapped
- *
- * Returns: a valid linux IRQ number on success
- *          -EINVAL on failure
- */
-int acpi_register_gsi(struct device *dev, u32 gsi, int trigger,
-		      int polarity)
-{
-	struct irq_fwspec fwspec;
-
-	if (WARN_ON(!acpi_gsi_domain_id)) {
-		pr_warn("GSI: No registered irqchip, giving up\n");
-		return -EINVAL;
-	}
-
-	fwspec.fwnode = acpi_gsi_domain_id;
-	fwspec.param[0] = gsi;
-	fwspec.param[1] = acpi_dev_get_irq_type(trigger, polarity);
-	fwspec.param_count = 2;
-
-	return irq_create_fwspec_mapping(&fwspec);
-}
-EXPORT_SYMBOL_GPL(acpi_register_gsi);
-
-/**
- * acpi_unregister_gsi() - Free a GSI<->linux IRQ number mapping
- * @gsi: GSI IRQ number
- */
-void acpi_unregister_gsi(u32 gsi)
-{
-	struct irq_domain *d = irq_find_matching_fwnode(acpi_gsi_domain_id,
-							DOMAIN_BUS_ANY);
-	int irq = irq_find_mapping(d, gsi);
-
-	irq_dispose_mapping(irq);
-}
-EXPORT_SYMBOL_GPL(acpi_unregister_gsi);
-
-/**
- * acpi_set_irq_model - Setup the GSI irqdomain information
- * @model: the value assigned to acpi_irq_model
- * @fwnode: the irq_domain identifier for mapping and looking up
- *          GSI interrupts
- */
-void __init acpi_set_irq_model(enum acpi_irq_model_id model,
-			       struct fwnode_handle *fwnode)
-{
-	acpi_irq_model = model;
-	acpi_gsi_domain_id = fwnode;
-}
diff --git a/drivers/acpi/irq.c b/drivers/acpi/irq.c
new file mode 100644
index 000000000000..830299a74b84
--- /dev/null
+++ b/drivers/acpi/irq.c
@@ -0,0 +1,297 @@
+/*
+ * ACPI GSI IRQ layer
+ *
+ * Copyright (C) 2015 ARM Ltd.
+ * Author: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/acpi.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
+#include <linux/of.h>
+
+enum acpi_irq_model_id acpi_irq_model;
+
+static struct fwnode_handle *acpi_gsi_domain_id;
+
+/**
+ * acpi_gsi_to_irq() - Retrieve the linux irq number for a given GSI
+ * @gsi: GSI IRQ number to map
+ * @irq: pointer where linux IRQ number is stored
+ *
+ * irq location updated with irq value [>0 on success, 0 on failure]
+ *
+ * Returns: linux IRQ number on success (>0)
+ *          -EINVAL on failure
+ */
+int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
+{
+	struct irq_domain *d = irq_find_matching_fwnode(acpi_gsi_domain_id,
+							DOMAIN_BUS_ANY);
+
+	*irq = irq_find_mapping(d, gsi);
+	/*
+	 * *irq == 0 means no mapping, that should
+	 * be reported as a failure
+	 */
+	return (*irq > 0) ? *irq : -EINVAL;
+}
+EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
+
+/**
+ * acpi_register_gsi() - Map a GSI to a linux IRQ number
+ * @dev: device for which IRQ has to be mapped
+ * @gsi: GSI IRQ number
+ * @trigger: trigger type of the GSI number to be mapped
+ * @polarity: polarity of the GSI to be mapped
+ *
+ * Returns: a valid linux IRQ number on success
+ *          -EINVAL on failure
+ */
+int acpi_register_gsi(struct device *dev, u32 gsi, int trigger,
+		      int polarity)
+{
+	struct irq_fwspec fwspec;
+
+	if (WARN_ON(!acpi_gsi_domain_id)) {
+		pr_warn("GSI: No registered irqchip, giving up\n");
+		return -EINVAL;
+	}
+
+	fwspec.fwnode = acpi_gsi_domain_id;
+	fwspec.param[0] = gsi;
+	fwspec.param[1] = acpi_dev_get_irq_type(trigger, polarity);
+	fwspec.param_count = 2;
+
+	return irq_create_fwspec_mapping(&fwspec);
+}
+EXPORT_SYMBOL_GPL(acpi_register_gsi);
+
+/**
+ * acpi_unregister_gsi() - Free a GSI<->linux IRQ number mapping
+ * @gsi: GSI IRQ number
+ */
+void acpi_unregister_gsi(u32 gsi)
+{
+	struct irq_domain *d = irq_find_matching_fwnode(acpi_gsi_domain_id,
+							DOMAIN_BUS_ANY);
+	int irq = irq_find_mapping(d, gsi);
+
+	irq_dispose_mapping(irq);
+}
+EXPORT_SYMBOL_GPL(acpi_unregister_gsi);
+
+/**
+ * acpi_get_irq_source_fwhandle() - Retrieve fwhandle from IRQ resource source.
+ * @source: acpi_resource_source to use for the lookup.
+ *
+ * Description:
+ * Retrieve the fwhandle of the device referenced by the given IRQ resource
+ * source.
+ *
+ * Return:
+ * The referenced device fwhandle or NULL on failure
+ */
+static struct fwnode_handle *
+acpi_get_irq_source_fwhandle(const struct acpi_resource_source *source)
+{
+	struct fwnode_handle *result;
+	struct acpi_device *device;
+	acpi_handle handle;
+	acpi_status status;
+
+	if (!source->string_length)
+		return acpi_gsi_domain_id;
+
+	status = acpi_get_handle(NULL, source->string_ptr, &handle);
+	if (WARN_ON(ACPI_FAILURE(status)))
+		return NULL;
+
+	device = acpi_bus_get_acpi_device(handle);
+	if (WARN_ON(!device))
+		return NULL;
+
+	result = &device->fwnode;
+	acpi_bus_put_acpi_device(device);
+	return result;
+}
+
+/*
+ * Context for the resource walk used to lookup IRQ resources.
+ * Contains a return code, the lookup index, and references to the flags
+ * and fwspec where the result is returned.
+ */
+struct acpi_irq_parse_one_ctx {
+	int rc;
+	unsigned int index;
+	unsigned long *res_flags;
+	struct irq_fwspec *fwspec;
+};
+
+/**
+ * acpi_irq_parse_one_match - Handle a matching IRQ resource.
+ * @fwnode: matching fwnode
+ * @hwirq: hardware IRQ number
+ * @triggering: triggering attributes of hwirq
+ * @polarity: polarity attributes of hwirq
+ * @polarity: polarity attributes of hwirq
+ * @shareable: shareable attributes of hwirq
+ * @ctx: acpi_irq_parse_one_ctx updated by this function
+ *
+ * Description:
+ * Handle a matching IRQ resource by populating the given ctx with
+ * the information passed.
+ */
+static inline void acpi_irq_parse_one_match(struct fwnode_handle *fwnode,
+					    u32 hwirq, u8 triggering,
+					    u8 polarity, u8 shareable,
+					    struct acpi_irq_parse_one_ctx *ctx)
+{
+	if (!fwnode)
+		return;
+	ctx->rc = 0;
+	*ctx->res_flags = acpi_dev_irq_flags(triggering, polarity, shareable);
+	ctx->fwspec->fwnode = fwnode;
+	ctx->fwspec->param[0] = hwirq;
+	ctx->fwspec->param[1] = acpi_dev_get_irq_type(triggering, polarity);
+	ctx->fwspec->param_count = 2;
+}
+
+/**
+ * acpi_irq_parse_one_cb - Handle the given resource.
+ * @ares: resource to handle
+ * @context: context for the walk
+ *
+ * Description:
+ * This is called by acpi_walk_resources passing each resource returned by
+ * the _CRS method. We only inspect IRQ resources. Since IRQ resources
+ * might contain multiple interrupts we check if the index is within this
+ * one's interrupt array, otherwise we subtract the current resource IRQ
+ * count from the lookup index to prepare for the next resource.
+ * Once a match is found we call acpi_irq_parse_one_match to populate
+ * the result and end the walk by returning AE_CTRL_TERMINATE.
+ *
+ * Return:
+ * AE_OK if the walk should continue, AE_CTRL_TERMINATE if a matching
+ * IRQ resource was found.
+ */
+static acpi_status acpi_irq_parse_one_cb(struct acpi_resource *ares,
+					 void *context)
+{
+	struct acpi_irq_parse_one_ctx *ctx = context;
+	struct acpi_resource_irq *irq;
+	struct acpi_resource_extended_irq *eirq;
+	struct fwnode_handle *fwnode;
+
+	switch (ares->type) {
+	case ACPI_RESOURCE_TYPE_IRQ:
+		irq = &ares->data.irq;
+		if (ctx->index >= irq->interrupt_count) {
+			ctx->index -= irq->interrupt_count;
+			return AE_OK;
+		}
+		fwnode = acpi_gsi_domain_id;
+		acpi_irq_parse_one_match(fwnode, irq->interrupts[ctx->index],
+					 irq->triggering, irq->polarity,
+					 irq->sharable, ctx);
+		return AE_CTRL_TERMINATE;
+	case ACPI_RESOURCE_TYPE_EXTENDED_IRQ:
+		eirq = &ares->data.extended_irq;
+		if (eirq->producer_consumer == ACPI_PRODUCER)
+			return AE_OK;
+		if (ctx->index >= eirq->interrupt_count) {
+			ctx->index -= eirq->interrupt_count;
+			return AE_OK;
+		}
+		fwnode = acpi_get_irq_source_fwhandle(&eirq->resource_source);
+		acpi_irq_parse_one_match(fwnode, eirq->interrupts[ctx->index],
+					 eirq->triggering, eirq->polarity,
+					 eirq->sharable, ctx);
+		return AE_CTRL_TERMINATE;
+	}
+
+	return AE_OK;
+}
+
+/**
+ * acpi_irq_parse_one - Resolve an interrupt for a device
+ * @handle: the device whose interrupt is to be resolved
+ * @index: index of the interrupt to resolve
+ * @fwspec: structure irq_fwspec filled by this function
+ * @flags: resource flags filled by this function
+ *
+ * Description:
+ * Resolves an interrupt for a device by walking its CRS resources to find
+ * the appropriate ACPI IRQ resource and populating the given struct irq_fwspec
+ * and flags.
+ *
+ * Return:
+ * The result stored in ctx.rc by the callback, or the default -EINVAL value
+ * if an error occurs.
+ */
+static int acpi_irq_parse_one(acpi_handle handle, unsigned int index,
+			      struct irq_fwspec *fwspec, unsigned long *flags)
+{
+	struct acpi_irq_parse_one_ctx ctx = { -EINVAL, index, flags, fwspec };
+
+	acpi_walk_resources(handle, METHOD_NAME__CRS, acpi_irq_parse_one_cb, &ctx);
+	return ctx.rc;
+}
+
+/**
+ * acpi_irq_get - Lookup an ACPI IRQ resource and use it to initialize resource.
+ * @handle: ACPI device handle
+ * @index:  ACPI IRQ resource index to lookup
+ * @res:    Linux IRQ resource to initialize
+ *
+ * Description:
+ * Look for the ACPI IRQ resource with the given index and use it to initialize
+ * the given Linux IRQ resource.
+ *
+ * Return:
+ * 0 on success
+ * -EINVAL if an error occurs
+ * -EPROBE_DEFER if the IRQ lookup/conversion failed
+ */
+int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res)
+{
+	struct irq_fwspec fwspec;
+	struct irq_domain *domain;
+	unsigned long flags;
+	int rc;
+
+	rc = acpi_irq_parse_one(handle, index, &fwspec, &flags);
+	if (rc)
+		return rc;
+
+	domain = irq_find_matching_fwnode(fwspec.fwnode, DOMAIN_BUS_ANY);
+	if (!domain)
+		return -EPROBE_DEFER;
+
+	rc = irq_create_fwspec_mapping(&fwspec);
+	if (rc <= 0)
+		return -EINVAL;
+
+	res->start = rc;
+	res->end = rc;
+	res->flags = flags;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(acpi_irq_get);
+
+/**
+ * acpi_set_irq_model - Setup the GSI irqdomain information
+ * @model: the value assigned to acpi_irq_model
+ * @fwnode: the irq_domain identifier for mapping and looking up
+ *          GSI interrupts
+ */
+void __init acpi_set_irq_model(enum acpi_irq_model_id model,
+			       struct fwnode_handle *fwnode)
+{
+	acpi_irq_model = model;
+	acpi_gsi_domain_id = fwnode;
+}
diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index c4af00385502..647e4761dbf3 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -102,6 +102,16 @@ int platform_get_irq(struct platform_device *dev, unsigned int num)
 	}
 
 	r = platform_get_resource(dev, IORESOURCE_IRQ, num);
+	if (has_acpi_companion(&dev->dev)) {
+		if (r && r->flags & IORESOURCE_DISABLED) {
+			int ret;
+
+			ret = acpi_irq_get(ACPI_HANDLE(&dev->dev), num, r);
+			if (ret)
+				return ret;
+		}
+	}
+
 	/*
 	 * The resources may pass trigger flags to the irqs that need
 	 * to be set up. It so happens that the trigger flags for
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 5b36974ed60a..8e577c2cb0ce 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1153,4 +1153,14 @@ int parse_spcr(bool earlycon);
 static inline int parse_spcr(bool earlycon) { return 0; }
 #endif
 
+#if IS_ENABLED(CONFIG_ACPI_GENERIC_GSI)
+int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res);
+#else
+static inline
+int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res)
+{
+	return -EINVAL;
+}
+#endif
+
 #endif	/*_LINUX_ACPI_H*/
-- 
cgit v1.2.3


From 3541f9e8bdebce02458882b66b638d7302c1f616 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 2 Feb 2017 08:04:56 -0800
Subject: tcp: add tcp_mss_clamp() helper

Small cleanup factorizing code doing the TCP_MAXSEG clamping.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  9 +++++++++
 net/ipv4/tcp_ipv4.c      |  5 +----
 net/ipv4/tcp_minisocks.c |  7 ++-----
 net/ipv4/tcp_output.c    | 14 ++++----------
 net/ipv6/tcp_ipv6.c      |  5 +----
 5 files changed, 17 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index f88f4649ba6f..cfc2d9506ce8 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -445,4 +445,13 @@ static inline void tcp_saved_syn_free(struct tcp_sock *tp)
 
 struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk);
 
+static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss)
+{
+	/* We use READ_ONCE() here because socket might not be locked.
+	 * This happens for listeners.
+	 */
+	u16 user_mss = READ_ONCE(tp->rx_opt.user_mss);
+
+	return (user_mss && user_mss < mss) ? user_mss : mss;
+}
 #endif	/* _LINUX_TCP_H */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 8c9e9aa17d66..8c124d4ef4b7 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1324,10 +1324,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
 	tcp_ca_openreq_child(newsk, dst);
 
 	tcp_sync_mss(newsk, dst_mtu(dst));
-	newtp->advmss = dst_metric_advmss(dst);
-	if (tcp_sk(sk)->rx_opt.user_mss &&
-	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
-		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
+	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
 
 	tcp_initialize_rcv_mss(newsk);
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index bdb443471c39..dff7d2aaf861 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -360,15 +360,12 @@ void tcp_openreq_init_rwin(struct request_sock *req,
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
 	const struct tcp_sock *tp = tcp_sk(sk_listener);
-	u16 user_mss = READ_ONCE(tp->rx_opt.user_mss);
 	int full_space = tcp_full_space(sk_listener);
-	int mss = dst_metric_advmss(dst);
 	u32 window_clamp;
 	__u8 rcv_wscale;
+	int mss;
 
-	if (user_mss && user_mss < mss)
-		mss = user_mss;
-
+	mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
 	window_clamp = READ_ONCE(tp->window_clamp);
 	/* Set this up on the first call only */
 	req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 6d5bab8a3ea6..956bea9a5394 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3062,7 +3062,6 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	struct sk_buff *skb;
 	int tcp_header_size;
 	struct tcphdr *th;
-	u16 user_mss;
 	int mss;
 
 	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
@@ -3092,10 +3091,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	}
 	skb_dst_set(skb, dst);
 
-	mss = dst_metric_advmss(dst);
-	user_mss = READ_ONCE(tp->rx_opt.user_mss);
-	if (user_mss && user_mss < mss)
-		mss = user_mss;
+	mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
 
 	memset(&opts, 0, sizeof(opts));
 #ifdef CONFIG_SYN_COOKIES
@@ -3201,9 +3197,7 @@ static void tcp_connect_init(struct sock *sk)
 
 	if (!tp->window_clamp)
 		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
-	tp->advmss = dst_metric_advmss(dst);
-	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
-		tp->advmss = tp->rx_opt.user_mss;
+	tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
 
 	tcp_initialize_rcv_mss(sk);
 
@@ -3280,8 +3274,8 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 	 * user-MSS. Reserve maximum option space for middleboxes that add
 	 * private TCP options. The cost is reduced data space in SYN :(
 	 */
-	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp)
-		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+	tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
+
 	space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
 		MAX_TCP_OPTION_SPACE;
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 64834ec5ab73..6b9fc63fd4d2 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1147,10 +1147,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 	tcp_ca_openreq_child(newsk, dst);
 
 	tcp_sync_mss(newsk, dst_mtu(dst));
-	newtp->advmss = dst_metric_advmss(dst);
-	if (tcp_sk(sk)->rx_opt.user_mss &&
-	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
-		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
+	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
 
 	tcp_initialize_rcv_mss(newsk);
 
-- 
cgit v1.2.3


From 680a0873e193bae666439f4b5e32c758e68f114c Mon Sep 17 00:00:00 2001
From: Andy Gross <andy.gross@linaro.org>
Date: Wed, 1 Feb 2017 11:28:27 -0600
Subject: arm: kernel: Add SMC structure parameter

This patch adds a quirk parameter to the arm_smccc_(smc/hvc) calls.
The quirk structure allows for specialized SMC operations due to SoC
specific requirements.  The current arm_smccc_(smc/hvc) is renamed and
macros are used instead to specify the standard arm_smccc_(smc/hvc) or
the arm_smccc_(smc/hvc)_quirk function.

This patch and partial implementation was suggested by Will Deacon.

Signed-off-by: Andy Gross <andy.gross@linaro.org>
Reviewed-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm/kernel/armksyms.c      |  4 ++--
 arch/arm/kernel/smccc-call.S    | 14 ++++++++------
 arch/arm64/kernel/arm64ksyms.c  |  4 ++--
 arch/arm64/kernel/asm-offsets.c |  7 +++++--
 arch/arm64/kernel/smccc-call.S  | 14 ++++++++------
 include/linux/arm-smccc.h       | 40 ++++++++++++++++++++++++++++++++--------
 6 files changed, 57 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c
index 7e45f69a0ddc..8e8d20cdbce7 100644
--- a/arch/arm/kernel/armksyms.c
+++ b/arch/arm/kernel/armksyms.c
@@ -178,6 +178,6 @@ EXPORT_SYMBOL(__pv_offset);
 #endif
 
 #ifdef CONFIG_HAVE_ARM_SMCCC
-EXPORT_SYMBOL(arm_smccc_smc);
-EXPORT_SYMBOL(arm_smccc_hvc);
+EXPORT_SYMBOL(__arm_smccc_smc);
+EXPORT_SYMBOL(__arm_smccc_hvc);
 #endif
diff --git a/arch/arm/kernel/smccc-call.S b/arch/arm/kernel/smccc-call.S
index 2e48b674aab1..e5d43066b889 100644
--- a/arch/arm/kernel/smccc-call.S
+++ b/arch/arm/kernel/smccc-call.S
@@ -46,17 +46,19 @@ UNWIND(	.fnend)
 /*
  * void smccc_smc(unsigned long a0, unsigned long a1, unsigned long a2,
  *		  unsigned long a3, unsigned long a4, unsigned long a5,
- *		  unsigned long a6, unsigned long a7, struct arm_smccc_res *res)
+ *		  unsigned long a6, unsigned long a7, struct arm_smccc_res *res,
+ *		  struct arm_smccc_quirk *quirk)
  */
-ENTRY(arm_smccc_smc)
+ENTRY(__arm_smccc_smc)
 	SMCCC SMCCC_SMC
-ENDPROC(arm_smccc_smc)
+ENDPROC(__arm_smccc_smc)
 
 /*
  * void smccc_hvc(unsigned long a0, unsigned long a1, unsigned long a2,
  *		  unsigned long a3, unsigned long a4, unsigned long a5,
- *		  unsigned long a6, unsigned long a7, struct arm_smccc_res *res)
+ *		  unsigned long a6, unsigned long a7, struct arm_smccc_res *res,
+ *		  struct arm_smccc_quirk *quirk)
  */
-ENTRY(arm_smccc_hvc)
+ENTRY(__arm_smccc_hvc)
 	SMCCC SMCCC_HVC
-ENDPROC(arm_smccc_hvc)
+ENDPROC(__arm_smccc_hvc)
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index 78f368039c79..e9c4dc9e0ada 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -73,5 +73,5 @@ NOKPROBE_SYMBOL(_mcount);
 #endif
 
 	/* arm-smccc */
-EXPORT_SYMBOL(arm_smccc_smc);
-EXPORT_SYMBOL(arm_smccc_hvc);
+EXPORT_SYMBOL(__arm_smccc_smc);
+EXPORT_SYMBOL(__arm_smccc_hvc);
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index bc049afc73a7..b3bb7ef97bc8 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -143,8 +143,11 @@ int main(void)
   DEFINE(SLEEP_STACK_DATA_SYSTEM_REGS,	offsetof(struct sleep_stack_data, system_regs));
   DEFINE(SLEEP_STACK_DATA_CALLEE_REGS,	offsetof(struct sleep_stack_data, callee_saved_regs));
 #endif
-  DEFINE(ARM_SMCCC_RES_X0_OFFS,	offsetof(struct arm_smccc_res, a0));
-  DEFINE(ARM_SMCCC_RES_X2_OFFS,	offsetof(struct arm_smccc_res, a2));
+  DEFINE(ARM_SMCCC_RES_X0_OFFS,		offsetof(struct arm_smccc_res, a0));
+  DEFINE(ARM_SMCCC_RES_X2_OFFS,		offsetof(struct arm_smccc_res, a2));
+  DEFINE(ARM_SMCCC_QUIRK_ID_OFFS,	offsetof(struct arm_smccc_quirk, id));
+  DEFINE(ARM_SMCCC_QUIRK_STATE_OFFS,	offsetof(struct arm_smccc_quirk, state));
+
   BLANK();
   DEFINE(HIBERN_PBE_ORIG,	offsetof(struct pbe, orig_address));
   DEFINE(HIBERN_PBE_ADDR,	offsetof(struct pbe, address));
diff --git a/arch/arm64/kernel/smccc-call.S b/arch/arm64/kernel/smccc-call.S
index ae0496fa4235..ba60a8cb07d2 100644
--- a/arch/arm64/kernel/smccc-call.S
+++ b/arch/arm64/kernel/smccc-call.S
@@ -27,17 +27,19 @@
 /*
  * void arm_smccc_smc(unsigned long a0, unsigned long a1, unsigned long a2,
  *		  unsigned long a3, unsigned long a4, unsigned long a5,
- *		  unsigned long a6, unsigned long a7, struct arm_smccc_res *res)
+ *		  unsigned long a6, unsigned long a7, struct arm_smccc_res *res,
+ *		  struct arm_smccc_quirk *quirk)
  */
-ENTRY(arm_smccc_smc)
+ENTRY(__arm_smccc_smc)
 	SMCCC	smc
-ENDPROC(arm_smccc_smc)
+ENDPROC(__arm_smccc_smc)
 
 /*
  * void arm_smccc_hvc(unsigned long a0, unsigned long a1, unsigned long a2,
  *		  unsigned long a3, unsigned long a4, unsigned long a5,
- *		  unsigned long a6, unsigned long a7, struct arm_smccc_res *res)
+ *		  unsigned long a6, unsigned long a7, struct arm_smccc_res *res,
+ *		  struct arm_smccc_quirk *quirk)
  */
-ENTRY(arm_smccc_hvc)
+ENTRY(__arm_smccc_hvc)
 	SMCCC	hvc
-ENDPROC(arm_smccc_hvc)
+ENDPROC(__arm_smccc_hvc)
diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index b5abfda80465..c66f8ae94b5a 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -72,33 +72,57 @@ struct arm_smccc_res {
 };
 
 /**
- * arm_smccc_smc() - make SMC calls
+ * struct arm_smccc_quirk - Contains quirk information
+ * @id: quirk identification
+ * @state: quirk specific information
+ * @a6: Qualcomm quirk entry for returning post-smc call contents of a6
+ */
+struct arm_smccc_quirk {
+	int	id;
+	union {
+		unsigned long a6;
+	} state;
+};
+
+/**
+ * __arm_smccc_smc() - make SMC calls
  * @a0-a7: arguments passed in registers 0 to 7
  * @res: result values from registers 0 to 3
+ * @quirk: points to an arm_smccc_quirk, or NULL when no quirks are required.
  *
  * This function is used to make SMC calls following SMC Calling Convention.
  * The content of the supplied param are copied to registers 0 to 7 prior
  * to the SMC instruction. The return values are updated with the content
- * from register 0 to 3 on return from the SMC instruction.
+ * from register 0 to 3 on return from the SMC instruction.  An optional
+ * quirk structure provides vendor specific behavior.
  */
-asmlinkage void arm_smccc_smc(unsigned long a0, unsigned long a1,
+asmlinkage void __arm_smccc_smc(unsigned long a0, unsigned long a1,
 			unsigned long a2, unsigned long a3, unsigned long a4,
 			unsigned long a5, unsigned long a6, unsigned long a7,
-			struct arm_smccc_res *res);
+			struct arm_smccc_res *res, struct arm_smccc_quirk *quirk);
 
 /**
- * arm_smccc_hvc() - make HVC calls
+ * __arm_smccc_hvc() - make HVC calls
  * @a0-a7: arguments passed in registers 0 to 7
  * @res: result values from registers 0 to 3
  *
  * This function is used to make HVC calls following SMC Calling
  * Convention.  The content of the supplied param are copied to registers 0
  * to 7 prior to the HVC instruction. The return values are updated with
- * the content from register 0 to 3 on return from the HVC instruction.
+ * the content from register 0 to 3 on return from the HVC instruction.  An
+ * optional quirk structure provides vendor specific behavior.
  */
-asmlinkage void arm_smccc_hvc(unsigned long a0, unsigned long a1,
+asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
 			unsigned long a2, unsigned long a3, unsigned long a4,
 			unsigned long a5, unsigned long a6, unsigned long a7,
-			struct arm_smccc_res *res);
+			struct arm_smccc_res *res, struct arm_smccc_quirk *quirk);
+
+#define arm_smccc_smc(...) __arm_smccc_smc(__VA_ARGS__, NULL)
+
+#define arm_smccc_smc_quirk(...) __arm_smccc_smc(__VA_ARGS__)
+
+#define arm_smccc_hvc(...) __arm_smccc_hvc(__VA_ARGS__, NULL)
+
+#define arm_smccc_hvc_quirk(...) __arm_smccc_hvc(__VA_ARGS__)
 
 #endif /*__LINUX_ARM_SMCCC_H*/
-- 
cgit v1.2.3


From 82bcd087029f6056506ea929f11af02622230901 Mon Sep 17 00:00:00 2001
From: Andy Gross <andy.gross@linaro.org>
Date: Wed, 1 Feb 2017 11:28:28 -0600
Subject: firmware: qcom: scm: Fix interrupted SCM calls

This patch adds a Qualcomm specific quirk to the arm_smccc_smc call.

On Qualcomm ARM64 platforms, the SMC call can return before it has
completed.  If this occurs, the call can be restarted, but it requires
using the returned session ID value from the interrupted SMC call.

The quirk stores off the session ID from the interrupted call in the
quirk structure so that it can be used by the caller.

This patch folds in a fix given by Sricharan R:
https://lkml.org/lkml/2016/9/28/272

Signed-off-by: Andy Gross <andy.gross@linaro.org>
Reviewed-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/kernel/smccc-call.S |  9 ++++++++-
 drivers/firmware/qcom_scm-64.c | 13 ++++++++++---
 include/linux/arm-smccc.h      | 11 ++++++++---
 3 files changed, 26 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kernel/smccc-call.S b/arch/arm64/kernel/smccc-call.S
index ba60a8cb07d2..62522342e1e4 100644
--- a/arch/arm64/kernel/smccc-call.S
+++ b/arch/arm64/kernel/smccc-call.S
@@ -12,6 +12,7 @@
  *
  */
 #include <linux/linkage.h>
+#include <linux/arm-smccc.h>
 #include <asm/asm-offsets.h>
 
 	.macro SMCCC instr
@@ -20,7 +21,13 @@
 	ldr	x4, [sp]
 	stp	x0, x1, [x4, #ARM_SMCCC_RES_X0_OFFS]
 	stp	x2, x3, [x4, #ARM_SMCCC_RES_X2_OFFS]
-	ret
+	ldr	x4, [sp, #8]
+	cbz	x4, 1f /* no quirk structure */
+	ldr	x9, [x4, #ARM_SMCCC_QUIRK_ID_OFFS]
+	cmp	x9, #ARM_SMCCC_QUIRK_QCOM_A6
+	b.ne	1f
+	str	x6, [x4, ARM_SMCCC_QUIRK_STATE_OFFS]
+1:	ret
 	.cfi_endproc
 	.endm
 
diff --git a/drivers/firmware/qcom_scm-64.c b/drivers/firmware/qcom_scm-64.c
index 4a0f5ead4fb5..1e2e5198db53 100644
--- a/drivers/firmware/qcom_scm-64.c
+++ b/drivers/firmware/qcom_scm-64.c
@@ -91,6 +91,7 @@ static int qcom_scm_call(struct device *dev, u32 svc_id, u32 cmd_id,
 	dma_addr_t args_phys = 0;
 	void *args_virt = NULL;
 	size_t alloc_len;
+	struct arm_smccc_quirk quirk = {.id = ARM_SMCCC_QUIRK_QCOM_A6};
 
 	if (unlikely(arglen > N_REGISTER_ARGS)) {
 		alloc_len = N_EXT_QCOM_SCM_ARGS * sizeof(u64);
@@ -131,10 +132,16 @@ static int qcom_scm_call(struct device *dev, u32 svc_id, u32 cmd_id,
 					 qcom_smccc_convention,
 					 ARM_SMCCC_OWNER_SIP, fn_id);
 
+		quirk.state.a6 = 0;
+
 		do {
-			arm_smccc_smc(cmd, desc->arginfo, desc->args[0],
-				      desc->args[1], desc->args[2], x5, 0, 0,
-				      res);
+			arm_smccc_smc_quirk(cmd, desc->arginfo, desc->args[0],
+				      desc->args[1], desc->args[2], x5,
+				      quirk.state.a6, 0, res, &quirk);
+
+			if (res->a0 == QCOM_SCM_INTERRUPTED)
+				cmd = res->a0;
+
 		} while (res->a0 == QCOM_SCM_INTERRUPTED);
 
 		mutex_unlock(&qcom_scm_lock);
diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index c66f8ae94b5a..b67934164401 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -14,9 +14,6 @@
 #ifndef __LINUX_ARM_SMCCC_H
 #define __LINUX_ARM_SMCCC_H
 
-#include <linux/linkage.h>
-#include <linux/types.h>
-
 /*
  * This file provides common defines for ARM SMC Calling Convention as
  * specified in
@@ -60,6 +57,13 @@
 #define ARM_SMCCC_OWNER_TRUSTED_OS	50
 #define ARM_SMCCC_OWNER_TRUSTED_OS_END	63
 
+#define ARM_SMCCC_QUIRK_NONE		0
+#define ARM_SMCCC_QUIRK_QCOM_A6		1 /* Save/restore register a6 */
+
+#ifndef __ASSEMBLY__
+
+#include <linux/linkage.h>
+#include <linux/types.h>
 /**
  * struct arm_smccc_res - Result from SMC/HVC call
  * @a0-a3 result values from registers 0 to 3
@@ -125,4 +129,5 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
 
 #define arm_smccc_hvc_quirk(...) __arm_smccc_hvc(__VA_ARGS__)
 
+#endif /*__ASSEMBLY__*/
 #endif /*__LINUX_ARM_SMCCC_H*/
-- 
cgit v1.2.3


From b3c7ef0adadc5768e0baa786213c6bd1ce521a77 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Tue, 31 Jan 2017 22:59:53 -0800
Subject: bridge: uapi: add per vlan tunnel info

New nested netlink attribute to associate tunnel info per vlan.
This is used by bridge driver to send tunnel metadata to
bridge ports in vlan tunnel mode. This patch also adds new per
port flag IFLA_BRPORT_VLAN_TUNNEL to enable vlan tunnel mode.
off by default.

One example use for this is a vxlan bridging gateway or vtep
which maps vlans to vn-segments (or vnis). User can configure
per-vlan tunnel information which the bridge driver can use
to bridge vlan into the corresponding vn-segment.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_bridge.h      |  1 +
 include/uapi/linux/if_bridge.h | 11 +++++++++++
 include/uapi/linux/if_link.h   |  1 +
 3 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index debc9d5904e5..c5847dc75a93 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -47,6 +47,7 @@ struct br_ip_list {
 #define BR_PROXYARP_WIFI	BIT(10)
 #define BR_MCAST_FLOOD		BIT(11)
 #define BR_MULTICAST_TO_UNICAST	BIT(12)
+#define BR_VLAN_TUNNEL		BIT(13)
 
 #define BR_DEFAULT_AGEING_TIME	(300 * HZ)
 
diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index ab92bca6d448..a9e6244ce438 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -118,6 +118,7 @@ enum {
 	IFLA_BRIDGE_FLAGS,
 	IFLA_BRIDGE_MODE,
 	IFLA_BRIDGE_VLAN_INFO,
+	IFLA_BRIDGE_VLAN_TUNNEL_INFO,
 	__IFLA_BRIDGE_MAX,
 };
 #define IFLA_BRIDGE_MAX (__IFLA_BRIDGE_MAX - 1)
@@ -134,6 +135,16 @@ struct bridge_vlan_info {
 	__u16 vid;
 };
 
+enum {
+	IFLA_BRIDGE_VLAN_TUNNEL_UNSPEC,
+	IFLA_BRIDGE_VLAN_TUNNEL_ID,
+	IFLA_BRIDGE_VLAN_TUNNEL_VID,
+	IFLA_BRIDGE_VLAN_TUNNEL_FLAGS,
+	__IFLA_BRIDGE_VLAN_TUNNEL_MAX,
+};
+
+#define IFLA_BRIDGE_VLAN_TUNNEL_MAX (__IFLA_BRIDGE_VLAN_TUNNEL_MAX - 1)
+
 struct bridge_vlan_xstats {
 	__u64 rx_bytes;
 	__u64 rx_packets;
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index b9aa5641ebe5..320fc1e747ee 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -322,6 +322,7 @@ enum {
 	IFLA_BRPORT_PAD,
 	IFLA_BRPORT_MCAST_FLOOD,
 	IFLA_BRPORT_MCAST_TO_UCAST,
+	IFLA_BRPORT_VLAN_TUNNEL,
 	__IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
-- 
cgit v1.2.3


From 3898fac1f488c76e0eef5b5267b4ba8112a82ac4 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 2 Feb 2017 17:09:54 +0100
Subject: trace: rename trace_print_hex_seq arg and add kdoc

Steven suggested to improve trace_print_hex_seq() a bit after commit
2acae0d5b0f7 ("trace: add variant without spacing in trace_print_hex_seq")
in two ways: i) by adding a kdoc comment for the helper function
itself and ii) by renaming 'spacing' argument into 'concatenate'
to better denote that we don't add spaces between each hex bytes.

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/trace_events.h |  2 +-
 include/trace/trace_events.h |  4 ++--
 kernel/trace/trace_output.c  | 15 +++++++++++++--
 3 files changed, 16 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index cfa475a0e9ca..0f165507495c 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -34,7 +34,7 @@ const char *trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
 
 const char *trace_print_hex_seq(struct trace_seq *p,
 				const unsigned char *buf, int len,
-				bool spacing);
+				bool concatenate);
 
 const char *trace_print_array_seq(struct trace_seq *p,
 				   const void *buf, int count,
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index 9f684629c3cf..5c06f4af8323 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -298,11 +298,11 @@ TRACE_MAKE_SYSTEM_STR();
 
 #undef __print_hex
 #define __print_hex(buf, buf_len)					\
-	trace_print_hex_seq(p, buf, buf_len, true)
+	trace_print_hex_seq(p, buf, buf_len, false)
 
 #undef __print_hex_str
 #define __print_hex_str(buf, buf_len)					\
-	trace_print_hex_seq(p, buf, buf_len, false)
+	trace_print_hex_seq(p, buf, buf_len, true)
 
 #undef __print_array
 #define __print_array(array, count, el_size)				\
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 30a144b1b9ee..aea6a1218c7d 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -162,15 +162,26 @@ trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
 }
 EXPORT_SYMBOL_GPL(trace_print_bitmask_seq);
 
+/**
+ * trace_print_hex_seq - print buffer as hex sequence
+ * @p: trace seq struct to write to
+ * @buf: The buffer to print
+ * @buf_len: Length of @buf in bytes
+ * @concatenate: Print @buf as single hex string or with spacing
+ *
+ * Prints the passed buffer as a hex sequence either as a whole,
+ * single hex string if @concatenate is true or with spacing after
+ * each byte in case @concatenate is false.
+ */
 const char *
 trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len,
-		    bool spacing)
+		    bool concatenate)
 {
 	int i;
 	const char *ret = trace_seq_buffer_ptr(p);
 
 	for (i = 0; i < buf_len; i++)
-		trace_seq_printf(p, "%s%2.2x", !spacing || i == 0 ? "" : " ",
+		trace_seq_printf(p, "%s%2.2x", concatenate || i == 0 ? "" : " ",
 				 buf[i]);
 	trace_seq_putc(p, 0);
 
-- 
cgit v1.2.3


From b862815c3ee7b49ec20a9ab25da55a5f0bcbb95e Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Fri, 3 Feb 2017 10:29:05 +0100
Subject: list: introduce list_for_each_entry_from_reverse helper

Similar to list_for_each_entry_continue and its reverse variant
list_for_each_entry_continue_reverse, introduce reverse helper for
list_for_each_entry_from.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/list.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/list.h b/include/linux/list.h
index d1039ecaf94f..ae537fa46216 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -526,6 +526,19 @@ static inline void list_splice_tail_init(struct list_head *list,
 	for (; &pos->member != (head);					\
 	     pos = list_next_entry(pos, member))
 
+/**
+ * list_for_each_entry_from_reverse - iterate backwards over list of given type
+ *                                    from the current point
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the list_head within the struct.
+ *
+ * Iterate backwards over list of given type, continuing from current position.
+ */
+#define list_for_each_entry_from_reverse(pos, head, member)		\
+	for (; &pos->member != (head);					\
+	     pos = list_prev_entry(pos, member))
+
 /**
  * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
  * @pos:	the type * to use as a loop cursor.
-- 
cgit v1.2.3


From 44091d29f2075972aede47ef17e1e70db3d51190 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Fri, 3 Feb 2017 10:29:06 +0100
Subject: lib: Introduce priority array area manager

This introduces a infrastructure for management of linear priority
areas. Priority order in an array matters, however order of items inside
a priority group does not matter.

As an initial implementation, L-sort algorithm is used. It is quite
trivial. More advanced algorithm called P-sort will be introduced as a
follow-up. The infrastructure is prepared for other algos.

Alongside this, a testing module is introduced as well.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS            |   8 +
 include/linux/parman.h |  76 ++++++++++
 lib/Kconfig            |   3 +
 lib/Kconfig.debug      |  10 ++
 lib/Makefile           |   3 +
 lib/parman.c           | 376 ++++++++++++++++++++++++++++++++++++++++++++++
 lib/test_parman.c      | 395 +++++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 871 insertions(+)
 create mode 100644 include/linux/parman.h
 create mode 100644 lib/parman.c
 create mode 100644 lib/test_parman.c

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index c15dc53f9d66..a9368bba9b37 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9382,6 +9382,14 @@ F:	drivers/video/fbdev/sti*
 F:	drivers/video/console/sti*
 F:	drivers/video/logo/logo_parisc*
 
+PARMAN
+M:	Jiri Pirko <jiri@mellanox.com>
+L:	netdev@vger.kernel.org
+S:	Supported
+F:	lib/parman.c
+F:	lib/test_parman.c
+F:	include/linux/parman.h
+
 PC87360 HARDWARE MONITORING DRIVER
 M:	Jim Cromie <jim.cromie@gmail.com>
 L:	linux-hwmon@vger.kernel.org
diff --git a/include/linux/parman.h b/include/linux/parman.h
new file mode 100644
index 000000000000..3c8cccc7d4da
--- /dev/null
+++ b/include/linux/parman.h
@@ -0,0 +1,76 @@
+/*
+ * include/linux/parman.h - Manager for linear priority array areas
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2017 Jiri Pirko <jiri@mellanox.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _PARMAN_H
+#define _PARMAN_H
+
+#include <linux/list.h>
+
+enum parman_algo_type {
+	PARMAN_ALGO_TYPE_LSORT,
+};
+
+struct parman_item {
+	struct list_head list;
+	unsigned long index;
+};
+
+struct parman_prio {
+	struct list_head list;
+	struct list_head item_list;
+	unsigned long priority;
+};
+
+struct parman_ops {
+	unsigned long base_count;
+	unsigned long resize_step;
+	int (*resize)(void *priv, unsigned long new_count);
+	void (*move)(void *priv, unsigned long from_index,
+		     unsigned long to_index, unsigned long count);
+	enum parman_algo_type algo;
+};
+
+struct parman;
+
+struct parman *parman_create(const struct parman_ops *ops, void *priv);
+void parman_destroy(struct parman *parman);
+void parman_prio_init(struct parman *parman, struct parman_prio *prio,
+		      unsigned long priority);
+void parman_prio_fini(struct parman_prio *prio);
+int parman_item_add(struct parman *parman, struct parman_prio *prio,
+		    struct parman_item *item);
+void parman_item_remove(struct parman *parman, struct parman_prio *prio,
+			struct parman_item *item);
+
+#endif
diff --git a/lib/Kconfig b/lib/Kconfig
index 260a80e313b9..5d644f180fe5 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -550,4 +550,7 @@ config STACKDEPOT
 config SBITMAP
 	bool
 
+config PARMAN
+	tristate "parman"
+
 endmenu
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 15969abf00a6..433a788500be 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1826,6 +1826,16 @@ config TEST_HASH
 	  This is intended to help people writing architecture-specific
 	  optimized versions.  If unsure, say N.
 
+config TEST_PARMAN
+	tristate "Perform selftest on priority array manager"
+	default n
+	depends on PARMAN
+	help
+	  Enable this option to test priority array manager on boot
+	  (or module load).
+
+	  If unsure, say N.
+
 endmenu # runtime tests
 
 config PROVIDE_OHCI1394_DMA_INIT
diff --git a/lib/Makefile b/lib/Makefile
index 7b3008d58600..1c039a4f60e5 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -56,6 +56,7 @@ obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_key_base.o
 obj-$(CONFIG_TEST_PRINTF) += test_printf.o
 obj-$(CONFIG_TEST_BITMAP) += test_bitmap.o
 obj-$(CONFIG_TEST_UUID) += test_uuid.o
+obj-$(CONFIG_TEST_PARMAN) += test_parman.o
 
 ifeq ($(CONFIG_DEBUG_KOBJECT),y)
 CFLAGS_kobject.o += -DDEBUG
@@ -230,3 +231,5 @@ obj-$(CONFIG_UBSAN) += ubsan.o
 UBSAN_SANITIZE_ubsan.o := n
 
 obj-$(CONFIG_SBITMAP) += sbitmap.o
+
+obj-$(CONFIG_PARMAN) += parman.o
diff --git a/lib/parman.c b/lib/parman.c
new file mode 100644
index 000000000000..c6e42a8db824
--- /dev/null
+++ b/lib/parman.c
@@ -0,0 +1,376 @@
+/*
+ * lib/parman.c - Manager for linear priority array areas
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2017 Jiri Pirko <jiri@mellanox.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/list.h>
+#include <linux/err.h>
+#include <linux/parman.h>
+
+struct parman_algo {
+	int (*item_add)(struct parman *parman, struct parman_prio *prio,
+			struct parman_item *item);
+	void (*item_remove)(struct parman *parman, struct parman_prio *prio,
+			    struct parman_item *item);
+};
+
+struct parman {
+	const struct parman_ops *ops;
+	void *priv;
+	const struct parman_algo *algo;
+	unsigned long count;
+	unsigned long limit_count;
+	struct list_head prio_list;
+};
+
+static int parman_enlarge(struct parman *parman)
+{
+	unsigned long new_count = parman->limit_count +
+				  parman->ops->resize_step;
+	int err;
+
+	err = parman->ops->resize(parman->priv, new_count);
+	if (err)
+		return err;
+	parman->limit_count = new_count;
+	return 0;
+}
+
+static int parman_shrink(struct parman *parman)
+{
+	unsigned long new_count = parman->limit_count -
+				  parman->ops->resize_step;
+	int err;
+
+	if (new_count < parman->ops->base_count)
+		return 0;
+	err = parman->ops->resize(parman->priv, new_count);
+	if (err)
+		return err;
+	parman->limit_count = new_count;
+	return 0;
+}
+
+static bool parman_prio_used(struct parman_prio *prio)
+
+{
+	return !list_empty(&prio->item_list);
+}
+
+static struct parman_item *parman_prio_first_item(struct parman_prio *prio)
+{
+	return list_first_entry(&prio->item_list,
+				typeof(struct parman_item), list);
+}
+
+static unsigned long parman_prio_first_index(struct parman_prio *prio)
+{
+	return parman_prio_first_item(prio)->index;
+}
+
+static struct parman_item *parman_prio_last_item(struct parman_prio *prio)
+{
+	return list_last_entry(&prio->item_list,
+			       typeof(struct parman_item), list);
+}
+
+static unsigned long parman_prio_last_index(struct parman_prio *prio)
+{
+	return parman_prio_last_item(prio)->index;
+}
+
+static unsigned long parman_lsort_new_index_find(struct parman *parman,
+						 struct parman_prio *prio)
+{
+	list_for_each_entry_from_reverse(prio, &parman->prio_list, list) {
+		if (!parman_prio_used(prio))
+			continue;
+		return parman_prio_last_index(prio) + 1;
+	}
+	return 0;
+}
+
+static void __parman_prio_move(struct parman *parman, struct parman_prio *prio,
+			       struct parman_item *item, unsigned long to_index,
+			       unsigned long count)
+{
+	parman->ops->move(parman->priv, item->index, to_index, count);
+}
+
+static void parman_prio_shift_down(struct parman *parman,
+				   struct parman_prio *prio)
+{
+	struct parman_item *item;
+	unsigned long to_index;
+
+	if (!parman_prio_used(prio))
+		return;
+	item = parman_prio_first_item(prio);
+	to_index = parman_prio_last_index(prio) + 1;
+	__parman_prio_move(parman, prio, item, to_index, 1);
+	list_move_tail(&item->list, &prio->item_list);
+	item->index = to_index;
+}
+
+static void parman_prio_shift_up(struct parman *parman,
+				 struct parman_prio *prio)
+{
+	struct parman_item *item;
+	unsigned long to_index;
+
+	if (!parman_prio_used(prio))
+		return;
+	item = parman_prio_last_item(prio);
+	to_index = parman_prio_first_index(prio) - 1;
+	__parman_prio_move(parman, prio, item, to_index, 1);
+	list_move(&item->list, &prio->item_list);
+	item->index = to_index;
+}
+
+static void parman_prio_item_remove(struct parman *parman,
+				    struct parman_prio *prio,
+				    struct parman_item *item)
+{
+	struct parman_item *last_item;
+	unsigned long to_index;
+
+	last_item = parman_prio_last_item(prio);
+	if (last_item == item) {
+		list_del(&item->list);
+		return;
+	}
+	to_index = item->index;
+	__parman_prio_move(parman, prio, last_item, to_index, 1);
+	list_del(&last_item->list);
+	list_replace(&item->list, &last_item->list);
+	last_item->index = to_index;
+}
+
+static int parman_lsort_item_add(struct parman *parman,
+				 struct parman_prio *prio,
+				 struct parman_item *item)
+{
+	struct parman_prio *prio2;
+	unsigned long new_index;
+	int err;
+
+	if (parman->count + 1 > parman->limit_count) {
+		err = parman_enlarge(parman);
+		if (err)
+			return err;
+	}
+
+	new_index = parman_lsort_new_index_find(parman, prio);
+	list_for_each_entry_reverse(prio2, &parman->prio_list, list) {
+		if (prio2 == prio)
+			break;
+		parman_prio_shift_down(parman, prio2);
+	}
+	item->index = new_index;
+	list_add_tail(&item->list, &prio->item_list);
+	parman->count++;
+	return 0;
+}
+
+static void parman_lsort_item_remove(struct parman *parman,
+				     struct parman_prio *prio,
+				     struct parman_item *item)
+{
+	parman_prio_item_remove(parman, prio, item);
+	list_for_each_entry_continue(prio, &parman->prio_list, list)
+		parman_prio_shift_up(parman, prio);
+	parman->count--;
+	if (parman->limit_count - parman->count >= parman->ops->resize_step)
+		parman_shrink(parman);
+}
+
+static const struct parman_algo parman_lsort = {
+	.item_add	= parman_lsort_item_add,
+	.item_remove	= parman_lsort_item_remove,
+};
+
+static const struct parman_algo *parman_algos[] = {
+	&parman_lsort,
+};
+
+/**
+ * parman_create - creates a new parman instance
+ * @ops:	caller-specific callbacks
+ * @priv:	pointer to a private data passed to the ops
+ *
+ * Note: all locking must be provided by the caller.
+ *
+ * Each parman instance manages an array area with chunks of entries
+ * with the same priority. Consider following example:
+ *
+ * item 1 with prio 10
+ * item 2 with prio 10
+ * item 3 with prio 10
+ * item 4 with prio 20
+ * item 5 with prio 20
+ * item 6 with prio 30
+ * item 7 with prio 30
+ * item 8 with prio 30
+ *
+ * In this example, there are 3 priority chunks. The order of the priorities
+ * matters, however the order of items within a single priority chunk does not
+ * matter. So the same array could be ordered as follows:
+ *
+ * item 2 with prio 10
+ * item 3 with prio 10
+ * item 1 with prio 10
+ * item 5 with prio 20
+ * item 4 with prio 20
+ * item 7 with prio 30
+ * item 8 with prio 30
+ * item 6 with prio 30
+ *
+ * The goal of parman is to maintain the priority ordering. The caller
+ * provides @ops with callbacks parman uses to move the items
+ * and resize the array area.
+ *
+ * Returns a pointer to newly created parman instance in case of success,
+ * otherwise it returns NULL.
+ */
+struct parman *parman_create(const struct parman_ops *ops, void *priv)
+{
+	struct parman *parman;
+
+	parman = kzalloc(sizeof(*parman), GFP_KERNEL);
+	if (!parman)
+		return NULL;
+	INIT_LIST_HEAD(&parman->prio_list);
+	parman->ops = ops;
+	parman->priv = priv;
+	parman->limit_count = ops->base_count;
+	parman->algo = parman_algos[ops->algo];
+	return parman;
+}
+EXPORT_SYMBOL(parman_create);
+
+/**
+ * parman_destroy - destroys existing parman instance
+ * @parman:	parman instance
+ *
+ * Note: all locking must be provided by the caller.
+ */
+void parman_destroy(struct parman *parman)
+{
+	WARN_ON(!list_empty(&parman->prio_list));
+	kfree(parman);
+}
+EXPORT_SYMBOL(parman_destroy);
+
+/**
+ * parman_prio_init - initializes a parman priority chunk
+ * @parman:	parman instance
+ * @prio:	parman prio structure to be initialized
+ * @prority:	desired priority of the chunk
+ *
+ * Note: all locking must be provided by the caller.
+ *
+ * Before caller could add an item with certain priority, he has to
+ * initialize a priority chunk for it using this function.
+ */
+void parman_prio_init(struct parman *parman, struct parman_prio *prio,
+		      unsigned long priority)
+{
+	struct parman_prio *prio2;
+	struct list_head *pos;
+
+	INIT_LIST_HEAD(&prio->item_list);
+	prio->priority = priority;
+
+	/* Position inside the list according to priority */
+	list_for_each(pos, &parman->prio_list) {
+		prio2 = list_entry(pos, typeof(*prio2), list);
+		if (prio2->priority > prio->priority)
+			break;
+	}
+	list_add_tail(&prio->list, pos);
+}
+EXPORT_SYMBOL(parman_prio_init);
+
+/**
+ * parman_prio_fini - finalizes use of parman priority chunk
+ * @prio:	parman prio structure
+ *
+ * Note: all locking must be provided by the caller.
+ */
+void parman_prio_fini(struct parman_prio *prio)
+{
+	WARN_ON(parman_prio_used(prio));
+	list_del(&prio->list);
+}
+EXPORT_SYMBOL(parman_prio_fini);
+
+/**
+ * parman_item_add - adds a parman item under defined priority
+ * @parman:	parman instance
+ * @prio:	parman prio instance to add the item to
+ * @item:	parman item instance
+ *
+ * Note: all locking must be provided by the caller.
+ *
+ * Adds item to a array managed by parman instance under the specified priority.
+ *
+ * Returns 0 in case of success, negative number to indicate an error.
+ */
+int parman_item_add(struct parman *parman, struct parman_prio *prio,
+		    struct parman_item *item)
+{
+	return parman->algo->item_add(parman, prio, item);
+}
+EXPORT_SYMBOL(parman_item_add);
+
+/**
+ * parman_item_del - deletes parman item
+ * @parman:	parman instance
+ * @prio:	parman prio instance to delete the item from
+ * @item:	parman item instance
+ *
+ * Note: all locking must be provided by the caller.
+ */
+void parman_item_remove(struct parman *parman, struct parman_prio *prio,
+			struct parman_item *item)
+{
+	parman->algo->item_remove(parman, prio, item);
+}
+EXPORT_SYMBOL(parman_item_remove);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Jiri Pirko <jiri@mellanox.com>");
+MODULE_DESCRIPTION("Priority-based array manager");
diff --git a/lib/test_parman.c b/lib/test_parman.c
new file mode 100644
index 000000000000..fe9f3a785804
--- /dev/null
+++ b/lib/test_parman.c
@@ -0,0 +1,395 @@
+/*
+ * lib/test_parman.c - Test module for parman
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2017 Jiri Pirko <jiri@mellanox.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/err.h>
+#include <linux/random.h>
+#include <linux/parman.h>
+
+#define TEST_PARMAN_PRIO_SHIFT 7 /* defines number of prios for testing */
+#define TEST_PARMAN_PRIO_COUNT BIT(TEST_PARMAN_PRIO_SHIFT)
+#define TEST_PARMAN_PRIO_MASK (TEST_PARMAN_PRIO_COUNT - 1)
+
+#define TEST_PARMAN_ITEM_SHIFT 13 /* defines a total number
+				   * of items for testing
+				   */
+#define TEST_PARMAN_ITEM_COUNT BIT(TEST_PARMAN_ITEM_SHIFT)
+#define TEST_PARMAN_ITEM_MASK (TEST_PARMAN_ITEM_COUNT - 1)
+
+#define TEST_PARMAN_BASE_SHIFT 8
+#define TEST_PARMAN_BASE_COUNT BIT(TEST_PARMAN_BASE_SHIFT)
+#define TEST_PARMAN_RESIZE_STEP_SHIFT 7
+#define TEST_PARMAN_RESIZE_STEP_COUNT BIT(TEST_PARMAN_RESIZE_STEP_SHIFT)
+
+#define TEST_PARMAN_BULK_MAX_SHIFT (2 + TEST_PARMAN_RESIZE_STEP_SHIFT)
+#define TEST_PARMAN_BULK_MAX_COUNT BIT(TEST_PARMAN_BULK_MAX_SHIFT)
+#define TEST_PARMAN_BULK_MAX_MASK (TEST_PARMAN_BULK_MAX_COUNT - 1)
+
+#define TEST_PARMAN_RUN_BUDGET (TEST_PARMAN_ITEM_COUNT * 256)
+
+struct test_parman_prio {
+	struct parman_prio parman_prio;
+	unsigned long priority;
+};
+
+struct test_parman_item {
+	struct parman_item parman_item;
+	struct test_parman_prio *prio;
+	bool used;
+};
+
+struct test_parman {
+	struct parman *parman;
+	struct test_parman_item **prio_array;
+	unsigned long prio_array_limit;
+	struct test_parman_prio prios[TEST_PARMAN_PRIO_COUNT];
+	struct test_parman_item items[TEST_PARMAN_ITEM_COUNT];
+	struct rnd_state rnd;
+	unsigned long run_budget;
+	unsigned long bulk_budget;
+	bool bulk_noop;
+	unsigned int used_items;
+};
+
+#define ITEM_PTRS_SIZE(count) (sizeof(struct test_parman_item *) * (count))
+
+static int test_parman_resize(void *priv, unsigned long new_count)
+{
+	struct test_parman *test_parman = priv;
+	struct test_parman_item **prio_array;
+	unsigned long old_count;
+
+	prio_array = krealloc(test_parman->prio_array,
+			      ITEM_PTRS_SIZE(new_count), GFP_KERNEL);
+	if (new_count == 0)
+		return 0;
+	if (!prio_array)
+		return -ENOMEM;
+	old_count = test_parman->prio_array_limit;
+	if (new_count > old_count)
+		memset(&prio_array[old_count], 0,
+		       ITEM_PTRS_SIZE(new_count - old_count));
+	test_parman->prio_array = prio_array;
+	test_parman->prio_array_limit = new_count;
+	return 0;
+}
+
+static void test_parman_move(void *priv, unsigned long from_index,
+			     unsigned long to_index, unsigned long count)
+{
+	struct test_parman *test_parman = priv;
+	struct test_parman_item **prio_array = test_parman->prio_array;
+
+	memmove(&prio_array[to_index], &prio_array[from_index],
+		ITEM_PTRS_SIZE(count));
+	memset(&prio_array[from_index], 0, ITEM_PTRS_SIZE(count));
+}
+
+static const struct parman_ops test_parman_lsort_ops = {
+	.base_count	= TEST_PARMAN_BASE_COUNT,
+	.resize_step	= TEST_PARMAN_RESIZE_STEP_COUNT,
+	.resize		= test_parman_resize,
+	.move		= test_parman_move,
+	.algo		= PARMAN_ALGO_TYPE_LSORT,
+};
+
+static void test_parman_rnd_init(struct test_parman *test_parman)
+{
+	prandom_seed_state(&test_parman->rnd, 3141592653589793238ULL);
+}
+
+static u32 test_parman_rnd_get(struct test_parman *test_parman)
+{
+	return prandom_u32_state(&test_parman->rnd);
+}
+
+static unsigned long test_parman_priority_gen(struct test_parman *test_parman)
+{
+	unsigned long priority;
+	int i;
+
+again:
+	priority = test_parman_rnd_get(test_parman);
+	if (priority == 0)
+		goto again;
+
+	for (i = 0; i < TEST_PARMAN_PRIO_COUNT; i++) {
+		struct test_parman_prio *prio = &test_parman->prios[i];
+
+		if (prio->priority == 0)
+			break;
+		if (prio->priority == priority)
+			goto again;
+	}
+	return priority;
+}
+
+static void test_parman_prios_init(struct test_parman *test_parman)
+{
+	int i;
+
+	for (i = 0; i < TEST_PARMAN_PRIO_COUNT; i++) {
+		struct test_parman_prio *prio = &test_parman->prios[i];
+
+		/* Assign random uniqueue priority to each prio structure */
+		prio->priority = test_parman_priority_gen(test_parman);
+		parman_prio_init(test_parman->parman, &prio->parman_prio,
+				 prio->priority);
+	}
+}
+
+static void test_parman_prios_fini(struct test_parman *test_parman)
+{
+	int i;
+
+	for (i = 0; i < TEST_PARMAN_PRIO_COUNT; i++) {
+		struct test_parman_prio *prio = &test_parman->prios[i];
+
+		parman_prio_fini(&prio->parman_prio);
+	}
+}
+
+static void test_parman_items_init(struct test_parman *test_parman)
+{
+	int i;
+
+	for (i = 0; i < TEST_PARMAN_ITEM_COUNT; i++) {
+		struct test_parman_item *item = &test_parman->items[i];
+		unsigned int prio_index = test_parman_rnd_get(test_parman) &
+					  TEST_PARMAN_PRIO_MASK;
+
+		/* Assign random prio to each item structure */
+		item->prio = &test_parman->prios[prio_index];
+	}
+}
+
+static void test_parman_items_fini(struct test_parman *test_parman)
+{
+	int i;
+
+	for (i = 0; i < TEST_PARMAN_ITEM_COUNT; i++) {
+		struct test_parman_item *item = &test_parman->items[i];
+
+		if (!item->used)
+			continue;
+		parman_item_remove(test_parman->parman,
+				   &item->prio->parman_prio,
+				   &item->parman_item);
+	}
+}
+
+static struct test_parman *test_parman_create(const struct parman_ops *ops)
+{
+	struct test_parman *test_parman;
+	int err;
+
+	test_parman = kzalloc(sizeof(*test_parman), GFP_KERNEL);
+	if (!test_parman)
+		return ERR_PTR(-ENOMEM);
+	err = test_parman_resize(test_parman, TEST_PARMAN_BASE_COUNT);
+	if (err)
+		goto err_resize;
+	test_parman->parman = parman_create(ops, test_parman);
+	if (!test_parman->parman) {
+		err = -ENOMEM;
+		goto err_parman_create;
+	}
+	test_parman_rnd_init(test_parman);
+	test_parman_prios_init(test_parman);
+	test_parman_items_init(test_parman);
+	test_parman->run_budget = TEST_PARMAN_RUN_BUDGET;
+	return test_parman;
+
+err_parman_create:
+	test_parman_resize(test_parman, 0);
+err_resize:
+	kfree(test_parman);
+	return ERR_PTR(err);
+}
+
+static void test_parman_destroy(struct test_parman *test_parman)
+{
+	test_parman_items_fini(test_parman);
+	test_parman_prios_fini(test_parman);
+	parman_destroy(test_parman->parman);
+	test_parman_resize(test_parman, 0);
+	kfree(test_parman);
+}
+
+static bool test_parman_run_check_budgets(struct test_parman *test_parman)
+{
+	if (test_parman->run_budget-- == 0)
+		return false;
+	if (test_parman->bulk_budget-- != 0)
+		return true;
+
+	test_parman->bulk_budget = test_parman_rnd_get(test_parman) &
+				   TEST_PARMAN_BULK_MAX_MASK;
+	test_parman->bulk_noop = test_parman_rnd_get(test_parman) & 1;
+	return true;
+}
+
+static int test_parman_run(struct test_parman *test_parman)
+{
+	unsigned int i = test_parman_rnd_get(test_parman);
+	int err;
+
+	while (test_parman_run_check_budgets(test_parman)) {
+		unsigned int item_index = i++ & TEST_PARMAN_ITEM_MASK;
+		struct test_parman_item *item = &test_parman->items[item_index];
+
+		if (test_parman->bulk_noop)
+			continue;
+
+		if (!item->used) {
+			err = parman_item_add(test_parman->parman,
+					      &item->prio->parman_prio,
+					      &item->parman_item);
+			if (err)
+				return err;
+			test_parman->prio_array[item->parman_item.index] = item;
+			test_parman->used_items++;
+		} else {
+			test_parman->prio_array[item->parman_item.index] = NULL;
+			parman_item_remove(test_parman->parman,
+					   &item->prio->parman_prio,
+					   &item->parman_item);
+			test_parman->used_items--;
+		}
+		item->used = !item->used;
+	}
+	return 0;
+}
+
+static int test_parman_check_array(struct test_parman *test_parman,
+				   bool gaps_allowed)
+{
+	unsigned int last_unused_items = 0;
+	unsigned long last_priority = 0;
+	unsigned int used_items = 0;
+	int i;
+
+	if (test_parman->prio_array_limit < TEST_PARMAN_BASE_COUNT) {
+		pr_err("Array limit is lower than the base count (%lu < %lu)\n",
+		       test_parman->prio_array_limit, TEST_PARMAN_BASE_COUNT);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < test_parman->prio_array_limit; i++) {
+		struct test_parman_item *item = test_parman->prio_array[i];
+
+		if (!item) {
+			last_unused_items++;
+			continue;
+		}
+		if (last_unused_items && !gaps_allowed) {
+			pr_err("Gap found in array even though they are forbidden\n");
+			return -EINVAL;
+		}
+
+		last_unused_items = 0;
+		used_items++;
+
+		if (item->prio->priority < last_priority) {
+			pr_err("Item belongs under higher priority then the last one (current: %lu, previous: %lu)\n",
+			       item->prio->priority, last_priority);
+			return -EINVAL;
+		}
+		last_priority = item->prio->priority;
+
+		if (item->parman_item.index != i) {
+			pr_err("Item has different index in compare to where it actualy is (%lu != %d)\n",
+			       item->parman_item.index, i);
+			return -EINVAL;
+		}
+	}
+
+	if (used_items != test_parman->used_items) {
+		pr_err("Number of used items in array does not match (%u != %u)\n",
+		       used_items, test_parman->used_items);
+		return -EINVAL;
+	}
+
+	if (last_unused_items >= TEST_PARMAN_RESIZE_STEP_COUNT) {
+		pr_err("Number of unused item at the end of array is bigger than resize step (%u >= %lu)\n",
+		       last_unused_items, TEST_PARMAN_RESIZE_STEP_COUNT);
+		return -EINVAL;
+	}
+
+	pr_info("Priority array check successful\n");
+
+	return 0;
+}
+
+static int test_parman_lsort(void)
+{
+	struct test_parman *test_parman;
+	int err;
+
+	test_parman = test_parman_create(&test_parman_lsort_ops);
+	if (IS_ERR(test_parman))
+		return PTR_ERR(test_parman);
+
+	err = test_parman_run(test_parman);
+	if (err)
+		goto out;
+
+	err = test_parman_check_array(test_parman, false);
+	if (err)
+		goto out;
+out:
+	test_parman_destroy(test_parman);
+	return err;
+}
+
+static int __init test_parman_init(void)
+{
+	return test_parman_lsort();
+}
+
+static void __exit test_parman_exit(void)
+{
+}
+
+module_init(test_parman_init);
+module_exit(test_parman_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Jiri Pirko <jiri@mellanox.com>");
+MODULE_DESCRIPTION("Test module for parman");
-- 
cgit v1.2.3


From 79e7fff47b7bb4124ef970a13eac4fdeddd1fc25 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 2 Feb 2017 18:43:28 -0800
Subject: net: remove support for per driver ndo_busy_poll()

We added generic support for busy polling in NAPI layer in linux-4.5

No network driver uses ndo_busy_poll() anymore, we can get rid
of the pointer in struct net_device_ops, and its use in sk_busy_loop()

Saves NETIF_F_BUSY_POLL features bit.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdev_features.h |  2 --
 include/linux/netdevice.h       |  3 ---
 net/core/dev.c                  | 15 ---------------
 net/core/ethtool.c              |  1 -
 4 files changed, 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 9c6c8ef2e9e7..9a0419594e84 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -71,7 +71,6 @@ enum {
 	NETIF_F_HW_VLAN_STAG_RX_BIT,	/* Receive VLAN STAG HW acceleration */
 	NETIF_F_HW_VLAN_STAG_FILTER_BIT,/* Receive filtering on VLAN STAGs */
 	NETIF_F_HW_L2FW_DOFFLOAD_BIT,	/* Allow L2 Forwarding in Hardware */
-	NETIF_F_BUSY_POLL_BIT,		/* Busy poll */
 
 	NETIF_F_HW_TC_BIT,		/* Offload TC infrastructure */
 
@@ -134,7 +133,6 @@ enum {
 #define NETIF_F_HW_VLAN_STAG_RX	__NETIF_F(HW_VLAN_STAG_RX)
 #define NETIF_F_HW_VLAN_STAG_TX	__NETIF_F(HW_VLAN_STAG_TX)
 #define NETIF_F_HW_L2FW_DOFFLOAD	__NETIF_F(HW_L2FW_DOFFLOAD)
-#define NETIF_F_BUSY_POLL	__NETIF_F(BUSY_POLL)
 #define NETIF_F_HW_TC		__NETIF_F(HW_TC)
 
 #define for_each_netdev_feature(mask_addr, bit)	\
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f3878fbe7786..6f18b509fb2f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1184,9 +1184,6 @@ struct net_device_ops {
 	int			(*ndo_netpoll_setup)(struct net_device *dev,
 						     struct netpoll_info *info);
 	void			(*ndo_netpoll_cleanup)(struct net_device *dev);
-#endif
-#ifdef CONFIG_NET_RX_BUSY_POLL
-	int			(*ndo_busy_poll)(struct napi_struct *dev);
 #endif
 	int			(*ndo_set_vf_mac)(struct net_device *dev,
 						  int queue, u8 *mac);
diff --git a/net/core/dev.c b/net/core/dev.c
index 727b6fda0e8c..4cde8bfb9bab 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4978,7 +4978,6 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
 {
 	unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
 	int (*napi_poll)(struct napi_struct *napi, int budget);
-	int (*busy_poll)(struct napi_struct *dev);
 	void *have_poll_lock = NULL;
 	struct napi_struct *napi;
 	int rc;
@@ -4993,17 +4992,10 @@ restart:
 	if (!napi)
 		goto out;
 
-	/* Note: ndo_busy_poll method is optional in linux-4.5 */
-	busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
-
 	preempt_disable();
 	for (;;) {
 		rc = 0;
 		local_bh_disable();
-		if (busy_poll) {
-			rc = busy_poll(napi);
-			goto count;
-		}
 		if (!napi_poll) {
 			unsigned long val = READ_ONCE(napi->state);
 
@@ -6956,13 +6948,6 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
 		features &= ~dev->gso_partial_features;
 	}
 
-#ifdef CONFIG_NET_RX_BUSY_POLL
-	if (dev->netdev_ops->ndo_busy_poll)
-		features |= NETIF_F_BUSY_POLL;
-	else
-#endif
-		features &= ~NETIF_F_BUSY_POLL;
-
 	return features;
 }
 
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 6b3eee0834a0..d5f412b3093d 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -102,7 +102,6 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_RXFCS_BIT] =            "rx-fcs",
 	[NETIF_F_RXALL_BIT] =            "rx-all",
 	[NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
-	[NETIF_F_BUSY_POLL_BIT] =        "busy-poll",
 	[NETIF_F_HW_TC_BIT] =		 "hw-tc-offload",
 };
 
-- 
cgit v1.2.3


From f9f41e3ef99ac9d4e91b07634362e393fb929aad Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 5 Jan 2017 10:17:27 +0530
Subject: cpufreq: Remove policy create/remove notifiers

Those were added by:

commit fcd7af917abb ("cpufreq: stats: handle cpufreq_unregister_driver()
and suspend/resume properly")

but aren't used anymore since:

commit 1aefc75b2449 ("cpufreq: stats: Make the stats code non-modular").

Remove them. Also remove the redundant parameter to the respective
routines.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 16 +++++-----------
 include/linux/cpufreq.h   |  2 --
 2 files changed, 5 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index cc475eff90b3..53268bebdf1e 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1078,15 +1078,11 @@ err_free_policy:
 	return NULL;
 }
 
-static void cpufreq_policy_put_kobj(struct cpufreq_policy *policy, bool notify)
+static void cpufreq_policy_put_kobj(struct cpufreq_policy *policy)
 {
 	struct kobject *kobj;
 	struct completion *cmp;
 
-	if (notify)
-		blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
-					     CPUFREQ_REMOVE_POLICY, policy);
-
 	down_write(&policy->rwsem);
 	cpufreq_stats_free_table(policy);
 	kobj = &policy->kobj;
@@ -1104,7 +1100,7 @@ static void cpufreq_policy_put_kobj(struct cpufreq_policy *policy, bool notify)
 	pr_debug("wait complete\n");
 }
 
-static void cpufreq_policy_free(struct cpufreq_policy *policy, bool notify)
+static void cpufreq_policy_free(struct cpufreq_policy *policy)
 {
 	unsigned long flags;
 	int cpu;
@@ -1117,7 +1113,7 @@ static void cpufreq_policy_free(struct cpufreq_policy *policy, bool notify)
 		per_cpu(cpufreq_cpu_data, cpu) = NULL;
 	write_unlock_irqrestore(&cpufreq_driver_lock, flags);
 
-	cpufreq_policy_put_kobj(policy, notify);
+	cpufreq_policy_put_kobj(policy);
 	free_cpumask_var(policy->real_cpus);
 	free_cpumask_var(policy->related_cpus);
 	free_cpumask_var(policy->cpus);
@@ -1244,8 +1240,6 @@ static int cpufreq_online(unsigned int cpu)
 			goto out_exit_policy;
 
 		cpufreq_stats_create_table(policy);
-		blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
-				CPUFREQ_CREATE_POLICY, policy);
 
 		write_lock_irqsave(&cpufreq_driver_lock, flags);
 		list_add(&policy->policy_list, &cpufreq_policy_list);
@@ -1282,7 +1276,7 @@ out_exit_policy:
 	if (cpufreq_driver->exit)
 		cpufreq_driver->exit(policy);
 out_free_policy:
-	cpufreq_policy_free(policy, !new_policy);
+	cpufreq_policy_free(policy);
 	return ret;
 }
 
@@ -1403,7 +1397,7 @@ static void cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
 	remove_cpu_dev_symlink(policy, dev);
 
 	if (cpumask_empty(policy->real_cpus))
-		cpufreq_policy_free(policy, true);
+		cpufreq_policy_free(policy);
 }
 
 /**
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 7e05c5e4e45c..a597bb825ae1 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -416,8 +416,6 @@ static inline void cpufreq_resume(void) {}
 #define CPUFREQ_ADJUST			(0)
 #define CPUFREQ_NOTIFY			(1)
 #define CPUFREQ_START			(2)
-#define CPUFREQ_CREATE_POLICY		(3)
-#define CPUFREQ_REMOVE_POLICY		(4)
 
 #ifdef CONFIG_CPU_FREQ
 int cpufreq_register_notifier(struct notifier_block *nb, unsigned int list);
-- 
cgit v1.2.3


From 052f573f5cca0ce0a16de409012660565bd792df Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 5 Jan 2017 11:34:31 +0530
Subject: cpufreq: Remove CPUFREQ_START notifier event

Its not used anymore, remove it.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c             | 3 ---
 drivers/cpufreq/ppc_cbe_cpufreq_pmi.c | 3 ---
 include/linux/cpufreq.h               | 1 -
 3 files changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 53268bebdf1e..408479540566 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1246,9 +1246,6 @@ static int cpufreq_online(unsigned int cpu)
 		write_unlock_irqrestore(&cpufreq_driver_lock, flags);
 	}
 
-	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
-				     CPUFREQ_START, policy);
-
 	ret = cpufreq_init_policy(policy);
 	if (ret) {
 		pr_err("%s: Failed to initialize policy for cpu: %d (%d)\n",
diff --git a/drivers/cpufreq/ppc_cbe_cpufreq_pmi.c b/drivers/cpufreq/ppc_cbe_cpufreq_pmi.c
index dc112481a408..eeaa92251512 100644
--- a/drivers/cpufreq/ppc_cbe_cpufreq_pmi.c
+++ b/drivers/cpufreq/ppc_cbe_cpufreq_pmi.c
@@ -100,9 +100,6 @@ static int pmi_notifier(struct notifier_block *nb,
 	/* Should this really be called for CPUFREQ_ADJUST and CPUFREQ_NOTIFY
 	 * policy events?)
 	 */
-	if (event == CPUFREQ_START)
-		return 0;
-
 	node = cbe_cpu_to_node(policy->cpu);
 
 	pr_debug("got notified, event=%lu, node=%u\n", event, node);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index a597bb825ae1..b07838b1fc60 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -415,7 +415,6 @@ static inline void cpufreq_resume(void) {}
 /* Policy Notifiers  */
 #define CPUFREQ_ADJUST			(0)
 #define CPUFREQ_NOTIFY			(1)
-#define CPUFREQ_START			(2)
 
 #ifdef CONFIG_CPU_FREQ
 int cpufreq_register_notifier(struct notifier_block *nb, unsigned int list);
-- 
cgit v1.2.3


From 565ebe8073f84ced436a18e76a5ba8e6bb73dfb3 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Fri, 3 Feb 2017 15:26:25 +0530
Subject: cpufreq: Fix typos in comments

- s/freqnency/frequency/
- s/accomodating/accommodating/

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/cpufreq.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index b07838b1fc60..87165f06a307 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -31,7 +31,7 @@
 
 #define CPUFREQ_ETERNAL			(-1)
 #define CPUFREQ_NAME_LEN		16
-/* Print length for names. Extra 1 space for accomodating '\n' in prints */
+/* Print length for names. Extra 1 space for accommodating '\n' in prints */
 #define CPUFREQ_NAME_PLEN		(CPUFREQ_NAME_LEN + 1)
 
 struct cpufreq_governor;
@@ -115,7 +115,7 @@ struct cpufreq_policy {
 	 *   guarantee that frequency can be changed on any CPU sharing the
 	 *   policy and that the change will affect all of the policy CPUs then.
 	 * - fast_switch_enabled is to be set by governors that support fast
-	 *   freqnency switching with the help of cpufreq_enable_fast_switch().
+	 *   frequency switching with the help of cpufreq_enable_fast_switch().
 	 */
 	bool			fast_switch_possible;
 	bool			fast_switch_enabled;
-- 
cgit v1.2.3


From 668802c25729a8e3423015c33c05f1c3be3858e9 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Mon, 30 Jan 2017 12:57:43 -0500
Subject: tick/broadcast: Reduce lock cacheline contention

It was observed that on an Intel x86 system without the ARAT (Always
running APIC timer) feature and with fairly large number of CPUs as
well as CPUs coming in and out of intel_idle frequently, the lock
contention on the tick_broadcast_lock can become significant.

To reduce contention, the lock is put into its own cacheline and all
the cpumask_var_t variables are put into the __read_mostly section.

Running the SP benchmark of the NAS Parallel Benchmarks on a 4-socket
16-core 32-thread Nehalam system, the performance number improved
from 3353.94 Mop/s to 3469.31 Mop/s when this patch was applied on
a 4.9.6 kernel.  This is a 3.4% improvement.

Signed-off-by: Waiman Long <longman@redhat.com>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/1485799063-20857-1-git-send-email-longman@redhat.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/cpumask.h      |  7 ++++++-
 kernel/time/tick-broadcast.c | 15 ++++++++-------
 2 files changed, 14 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index c717f5ea88cb..23c1a6d09ec5 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -649,11 +649,15 @@ static inline size_t cpumask_size(void)
  * used. Please use this_cpu_cpumask_var_t in those cases. The direct use
  * of this_cpu_ptr() or this_cpu_read() will lead to failures when the
  * other type of cpumask_var_t implementation is configured.
+ *
+ * Please also note that __cpumask_var_read_mostly can be used to declare
+ * a cpumask_var_t variable itself (not its content) as read mostly.
  */
 #ifdef CONFIG_CPUMASK_OFFSTACK
 typedef struct cpumask *cpumask_var_t;
 
-#define this_cpu_cpumask_var_ptr(x) this_cpu_read(x)
+#define this_cpu_cpumask_var_ptr(x)	this_cpu_read(x)
+#define __cpumask_var_read_mostly	__read_mostly
 
 bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node);
 bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags);
@@ -667,6 +671,7 @@ void free_bootmem_cpumask_var(cpumask_var_t mask);
 typedef struct cpumask cpumask_var_t[1];
 
 #define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x)
+#define __cpumask_var_read_mostly
 
 static inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
 {
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 3109204c87cc..244c93544150 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -29,12 +29,13 @@
  */
 
 static struct tick_device tick_broadcast_device;
-static cpumask_var_t tick_broadcast_mask;
-static cpumask_var_t tick_broadcast_on;
-static cpumask_var_t tmpmask;
-static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
+static cpumask_var_t tick_broadcast_mask __cpumask_var_read_mostly;
+static cpumask_var_t tick_broadcast_on __cpumask_var_read_mostly;
+static cpumask_var_t tmpmask __cpumask_var_read_mostly;
 static int tick_broadcast_forced;
 
+static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
+
 #ifdef CONFIG_TICK_ONESHOT
 static void tick_broadcast_clear_oneshot(int cpu);
 static void tick_resume_broadcast_oneshot(struct clock_event_device *bc);
@@ -517,9 +518,9 @@ void tick_resume_broadcast(void)
 
 #ifdef CONFIG_TICK_ONESHOT
 
-static cpumask_var_t tick_broadcast_oneshot_mask;
-static cpumask_var_t tick_broadcast_pending_mask;
-static cpumask_var_t tick_broadcast_force_mask;
+static cpumask_var_t tick_broadcast_oneshot_mask __cpumask_var_read_mostly;
+static cpumask_var_t tick_broadcast_pending_mask __cpumask_var_read_mostly;
+static cpumask_var_t tick_broadcast_force_mask __cpumask_var_read_mostly;
 
 /*
  * Exposed for debugging: see timer_list.c
-- 
cgit v1.2.3


From 4b0947974e593d52aace18ca5c7e2746fdebae60 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Thu, 2 Feb 2017 14:53:10 +0100
Subject: gpio: Rename devm_get_gpiod_from_child()

Rename devm_get_gpiod_from_child() into
devm_fwnode_get_gpiod_from_child() to reflect the fact that this
function is operating on a fwnode object.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Acked-by: Jacek Anaszewski <jacek.anaszewski@gmail.com>
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/devres.c                     | 15 ++++++++-------
 drivers/input/keyboard/gpio_keys.c        |  6 ++++--
 drivers/input/keyboard/gpio_keys_polled.c |  8 ++++----
 drivers/leds/leds-gpio.c                  |  5 +++--
 drivers/video/fbdev/amba-clcd-nomadik.c   | 16 ++++++++--------
 include/linux/gpio/consumer.h             | 20 ++++++++++----------
 6 files changed, 37 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/devres.c b/drivers/gpio/devres.c
index 89969e887e5d..cd1bd1c3d2c0 100644
--- a/drivers/gpio/devres.c
+++ b/drivers/gpio/devres.c
@@ -123,7 +123,8 @@ struct gpio_desc *__must_check devm_gpiod_get_index(struct device *dev,
 EXPORT_SYMBOL(devm_gpiod_get_index);
 
 /**
- * devm_get_gpiod_from_child - get a GPIO descriptor from a device's child node
+ * devm_fwnode_get_gpiod_from_child - get a GPIO descriptor from a device's
+ *				      child node
  * @dev:	GPIO consumer
  * @con_id:	function within the GPIO consumer
  * @child:	firmware node (child of @dev)
@@ -135,11 +136,11 @@ EXPORT_SYMBOL(devm_gpiod_get_index);
  * On successfull request the GPIO pin is configured in accordance with
  * provided @flags.
  */
-struct gpio_desc *devm_get_gpiod_from_child(struct device *dev,
-					    const char *con_id,
-					    struct fwnode_handle *child,
-					    enum gpiod_flags flags,
-					    const char *label)
+struct gpio_desc *devm_fwnode_get_gpiod_from_child(struct device *dev,
+						   const char *con_id,
+						   struct fwnode_handle *child,
+						   enum gpiod_flags flags,
+						   const char *label)
 {
 	static const char * const suffixes[] = { "gpios", "gpio" };
 	char prop_name[32]; /* 32 is max size of property name */
@@ -174,7 +175,7 @@ struct gpio_desc *devm_get_gpiod_from_child(struct device *dev,
 
 	return desc;
 }
-EXPORT_SYMBOL(devm_get_gpiod_from_child);
+EXPORT_SYMBOL(devm_fwnode_get_gpiod_from_child);
 
 /**
  * devm_gpiod_get_index_optional - Resource-managed gpiod_get_index_optional()
diff --git a/drivers/input/keyboard/gpio_keys.c b/drivers/input/keyboard/gpio_keys.c
index 2ec74d90ef78..13372782dffe 100644
--- a/drivers/input/keyboard/gpio_keys.c
+++ b/drivers/input/keyboard/gpio_keys.c
@@ -481,8 +481,10 @@ static int gpio_keys_setup_key(struct platform_device *pdev,
 	spin_lock_init(&bdata->lock);
 
 	if (child) {
-		bdata->gpiod = devm_get_gpiod_from_child(dev, NULL, child,
-							 GPIOD_IN, desc);
+		bdata->gpiod = devm_fwnode_get_gpiod_from_child(dev, NULL,
+								child,
+								GPIOD_IN,
+								desc);
 		if (IS_ERR(bdata->gpiod)) {
 			error = PTR_ERR(bdata->gpiod);
 			if (error == -ENOENT) {
diff --git a/drivers/input/keyboard/gpio_keys_polled.c b/drivers/input/keyboard/gpio_keys_polled.c
index 5e35c565e341..b3c8d5e7985c 100644
--- a/drivers/input/keyboard/gpio_keys_polled.c
+++ b/drivers/input/keyboard/gpio_keys_polled.c
@@ -303,10 +303,10 @@ static int gpio_keys_polled_probe(struct platform_device *pdev)
 				return -EINVAL;
 			}
 
-			bdata->gpiod = devm_get_gpiod_from_child(dev, NULL,
-								 child,
-								 GPIOD_IN,
-								 button->desc);
+			bdata->gpiod = devm_fwnode_get_gpiod_from_child(dev,
+								NULL, child,
+								GPIOD_IN,
+								button->desc);
 			if (IS_ERR(bdata->gpiod)) {
 				error = PTR_ERR(bdata->gpiod);
 				if (error != -EPROBE_DEFER)
diff --git a/drivers/leds/leds-gpio.c b/drivers/leds/leds-gpio.c
index 6c4825d96693..066fc7590729 100644
--- a/drivers/leds/leds-gpio.c
+++ b/drivers/leds/leds-gpio.c
@@ -182,8 +182,9 @@ static struct gpio_leds_priv *gpio_leds_create(struct platform_device *pdev)
 			return ERR_PTR(-EINVAL);
 		}
 
-		led.gpiod = devm_get_gpiod_from_child(dev, NULL, child,
-						      GPIOD_ASIS, led.name);
+		led.gpiod = devm_fwnode_get_gpiod_from_child(dev, NULL, child,
+							     GPIOD_ASIS,
+							     led.name);
 		if (IS_ERR(led.gpiod)) {
 			fwnode_handle_put(child);
 			return ERR_CAST(led.gpiod);
diff --git a/drivers/video/fbdev/amba-clcd-nomadik.c b/drivers/video/fbdev/amba-clcd-nomadik.c
index 9175ad9034e1..476ff3f4d466 100644
--- a/drivers/video/fbdev/amba-clcd-nomadik.c
+++ b/drivers/video/fbdev/amba-clcd-nomadik.c
@@ -185,26 +185,26 @@ static void tpg110_init(struct device *dev, struct device_node *np,
 	dev_info(dev, "TPG110 display init\n");
 
 	/* This asserts the GRESTB signal, putting the display into reset */
-	grestb = devm_get_gpiod_from_child(dev, "grestb", &np->fwnode,
-					   GPIOD_OUT_HIGH, "grestb");
+	grestb = devm_fwnode_get_gpiod_from_child(dev, "grestb", &np->fwnode,
+						  GPIOD_OUT_HIGH, "grestb");
 	if (IS_ERR(grestb)) {
 		dev_err(dev, "no GRESTB GPIO\n");
 		return;
 	}
-	scen = devm_get_gpiod_from_child(dev, "scen", &np->fwnode,
-					 GPIOD_OUT_LOW, "scen");
+	scen = devm_fwnode_get_gpiod_from_child(dev, "scen", &np->fwnode,
+						GPIOD_OUT_LOW, "scen");
 	if (IS_ERR(scen)) {
 		dev_err(dev, "no SCEN GPIO\n");
 		return;
 	}
-	scl = devm_get_gpiod_from_child(dev, "scl", &np->fwnode, GPIOD_OUT_LOW,
-					"scl");
+	scl = devm_fwnode_get_gpiod_from_child(dev, "scl", &np->fwnode,
+					       GPIOD_OUT_LOW, "scl");
 	if (IS_ERR(scl)) {
 		dev_err(dev, "no SCL GPIO\n");
 		return;
 	}
-	sda = devm_get_gpiod_from_child(dev, "sda", &np->fwnode, GPIOD_OUT_LOW,
-					"sda");
+	sda = devm_fwnode_get_gpiod_from_child(dev, "sda", &np->fwnode,
+					       GPIOD_OUT_LOW, "sda");
 	if (IS_ERR(sda)) {
 		dev_err(dev, "no SDA GPIO\n");
 		return;
diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index 80bad7ebde04..f32f49e96c0b 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -138,11 +138,11 @@ struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode,
 					 const char *propname,
 					 enum gpiod_flags dflags,
 					 const char *label);
-struct gpio_desc *devm_get_gpiod_from_child(struct device *dev,
-					    const char *con_id,
-					    struct fwnode_handle *child,
-					    enum gpiod_flags flags,
-					    const char *label);
+struct gpio_desc *devm_fwnode_get_gpiod_from_child(struct device *dev,
+						   const char *con_id,
+						   struct fwnode_handle *child,
+						   enum gpiod_flags flags,
+						   const char *label);
 #else /* CONFIG_GPIOLIB */
 
 static inline int gpiod_count(struct device *dev, const char *con_id)
@@ -425,11 +425,11 @@ struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode,
 }
 
 static inline
-struct gpio_desc *devm_get_gpiod_from_child(struct device *dev,
-					    const char *con_id,
-					    struct fwnode_handle *child,
-					    enum gpiod_flags flags,
-					    const char *label)
+struct gpio_desc *devm_fwnode_get_gpiod_from_child(struct device *dev,
+						   const char *con_id,
+						   struct fwnode_handle *child,
+						   enum gpiod_flags flags,
+						   const char *label)
 {
 	return ERR_PTR(-ENOSYS);
 }
-- 
cgit v1.2.3


From 537b94dafce29af6a3e923d216472cfc2f3659af Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Thu, 2 Feb 2017 14:53:11 +0100
Subject: gpio: Add the devm_fwnode_get_index_gpiod_from_child() helper

devm_fwnode_get_gpiod_from_child() currently allows GPIO users to
request a GPIO that is defined in a child fwnode instead of directly in
the device fwnode.
Extend this API by adding the devm_fwnode_get_index_gpiod_from_child()
helper which does the same except you can also specify an index in case
the 'xx-gpios' property describe several GPIOs.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/devres.c         | 20 +++++++++++---------
 drivers/gpio/gpiolib.c        |  9 +++++----
 include/linux/gpio/consumer.h | 31 +++++++++++++++++++++----------
 3 files changed, 37 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/devres.c b/drivers/gpio/devres.c
index cd1bd1c3d2c0..b2bbcaae6a1f 100644
--- a/drivers/gpio/devres.c
+++ b/drivers/gpio/devres.c
@@ -123,10 +123,11 @@ struct gpio_desc *__must_check devm_gpiod_get_index(struct device *dev,
 EXPORT_SYMBOL(devm_gpiod_get_index);
 
 /**
- * devm_fwnode_get_gpiod_from_child - get a GPIO descriptor from a device's
- *				      child node
+ * devm_fwnode_get_index_gpiod_from_child - get a GPIO descriptor from a
+ *					    device's child node
  * @dev:	GPIO consumer
  * @con_id:	function within the GPIO consumer
+ * @index:	index of the GPIO to obtain in the consumer
  * @child:	firmware node (child of @dev)
  * @flags:	GPIO initialization flags
  *
@@ -136,11 +137,11 @@ EXPORT_SYMBOL(devm_gpiod_get_index);
  * On successfull request the GPIO pin is configured in accordance with
  * provided @flags.
  */
-struct gpio_desc *devm_fwnode_get_gpiod_from_child(struct device *dev,
-						   const char *con_id,
-						   struct fwnode_handle *child,
-						   enum gpiod_flags flags,
-						   const char *label)
+struct gpio_desc *devm_fwnode_get_index_gpiod_from_child(struct device *dev,
+						const char *con_id, int index,
+						struct fwnode_handle *child,
+						enum gpiod_flags flags,
+						const char *label)
 {
 	static const char * const suffixes[] = { "gpios", "gpio" };
 	char prop_name[32]; /* 32 is max size of property name */
@@ -161,7 +162,8 @@ struct gpio_desc *devm_fwnode_get_gpiod_from_child(struct device *dev,
 			snprintf(prop_name, sizeof(prop_name), "%s",
 							       suffixes[i]);
 
-		desc = fwnode_get_named_gpiod(child, prop_name, flags, label);
+		desc = fwnode_get_named_gpiod(child, prop_name, index, flags,
+					      label);
 		if (!IS_ERR(desc) || (PTR_ERR(desc) != -ENOENT))
 			break;
 	}
@@ -175,7 +177,7 @@ struct gpio_desc *devm_fwnode_get_gpiod_from_child(struct device *dev,
 
 	return desc;
 }
-EXPORT_SYMBOL(devm_fwnode_get_gpiod_from_child);
+EXPORT_SYMBOL(devm_fwnode_get_index_gpiod_from_child);
 
 /**
  * devm_gpiod_get_index_optional - Resource-managed gpiod_get_index_optional()
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 20c6c51cbe10..5e5d48c3512c 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -3309,6 +3309,7 @@ EXPORT_SYMBOL_GPL(gpiod_get_index);
  * fwnode_get_named_gpiod - obtain a GPIO from firmware node
  * @fwnode:	handle of the firmware node
  * @propname:	name of the firmware property representing the GPIO
+ * @index:	index of the GPIO to obtain in the consumer
  * @dflags:	GPIO initialization flags
  *
  * This function can be used for drivers that get their configuration
@@ -3324,7 +3325,7 @@ EXPORT_SYMBOL_GPL(gpiod_get_index);
  * In case of error an ERR_PTR() is returned.
  */
 struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode,
-					 const char *propname,
+					 const char *propname, int index,
 					 enum gpiod_flags dflags,
 					 const char *label)
 {
@@ -3340,8 +3341,8 @@ struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode,
 	if (is_of_node(fwnode)) {
 		enum of_gpio_flags flags;
 
-		desc = of_get_named_gpiod_flags(to_of_node(fwnode), propname, 0,
-						&flags);
+		desc = of_get_named_gpiod_flags(to_of_node(fwnode), propname,
+						index, &flags);
 		if (!IS_ERR(desc)) {
 			active_low = flags & OF_GPIO_ACTIVE_LOW;
 			single_ended = flags & OF_GPIO_SINGLE_ENDED;
@@ -3349,7 +3350,7 @@ struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode,
 	} else if (is_acpi_node(fwnode)) {
 		struct acpi_gpio_info info;
 
-		desc = acpi_node_get_gpiod(fwnode, propname, 0, &info);
+		desc = acpi_node_get_gpiod(fwnode, propname, index, &info);
 		if (!IS_ERR(desc))
 			active_low = info.polarity == GPIO_ACTIVE_LOW;
 	}
diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index f32f49e96c0b..ea9b01d40017 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -135,14 +135,14 @@ int desc_to_gpio(const struct gpio_desc *desc);
 struct fwnode_handle;
 
 struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode,
-					 const char *propname,
+					 const char *propname, int index,
 					 enum gpiod_flags dflags,
 					 const char *label);
-struct gpio_desc *devm_fwnode_get_gpiod_from_child(struct device *dev,
-						   const char *con_id,
-						   struct fwnode_handle *child,
-						   enum gpiod_flags flags,
-						   const char *label);
+struct gpio_desc *devm_fwnode_get_index_gpiod_from_child(struct device *dev,
+						const char *con_id, int index,
+						struct fwnode_handle *child,
+						enum gpiod_flags flags,
+						const char *label);
 #else /* CONFIG_GPIOLIB */
 
 static inline int gpiod_count(struct device *dev, const char *con_id)
@@ -417,13 +417,25 @@ struct fwnode_handle;
 
 static inline
 struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode,
-					 const char *propname,
+					 const char *propname, int index,
 					 enum gpiod_flags dflags,
 					 const char *label)
 {
 	return ERR_PTR(-ENOSYS);
 }
 
+static inline
+struct gpio_desc *devm_fwnode_get_index_gpiod_from_child(struct device *dev,
+						const char *con_id, int index,
+						struct fwnode_handle *child,
+						enum gpiod_flags flags,
+						const char *label)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+#endif /* CONFIG_GPIOLIB */
+
 static inline
 struct gpio_desc *devm_fwnode_get_gpiod_from_child(struct device *dev,
 						   const char *con_id,
@@ -431,11 +443,10 @@ struct gpio_desc *devm_fwnode_get_gpiod_from_child(struct device *dev,
 						   enum gpiod_flags flags,
 						   const char *label)
 {
-	return ERR_PTR(-ENOSYS);
+	return devm_fwnode_get_index_gpiod_from_child(dev, con_id, 0, child,
+						      flags, label);
 }
 
-#endif /* CONFIG_GPIOLIB */
-
 #if IS_ENABLED(CONFIG_GPIOLIB) && IS_ENABLED(CONFIG_GPIO_SYSFS)
 
 int gpiod_export(struct gpio_desc *desc, bool direction_may_change);
-- 
cgit v1.2.3


From 02c1602ee7b3e3d062c3eacd374d6a6e3a2ebb73 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 4 Feb 2017 15:25:02 -0800
Subject: net: remove __napi_complete()

All __napi_complete() callers have been converted to
use the more standard napi_complete_done(),
we can now remove this NAPI method for good.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  1 -
 net/core/dev.c            | 24 +++---------------------
 2 files changed, 3 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6f18b509fb2f..014fbe256d55 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -463,7 +463,6 @@ static inline bool napi_reschedule(struct napi_struct *napi)
 	return false;
 }
 
-bool __napi_complete(struct napi_struct *n);
 bool napi_complete_done(struct napi_struct *n, int work_done);
 /**
  *	napi_complete - NAPI processing complete
diff --git a/net/core/dev.c b/net/core/dev.c
index 42ba0379575a..404d2e6d5d32 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4883,23 +4883,6 @@ void __napi_schedule_irqoff(struct napi_struct *n)
 }
 EXPORT_SYMBOL(__napi_schedule_irqoff);
 
-bool __napi_complete(struct napi_struct *n)
-{
-	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
-
-	/* Some drivers call us directly, instead of calling
-	 * napi_complete_done().
-	 */
-	if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
-		return false;
-
-	list_del_init(&n->poll_list);
-	smp_mb__before_atomic();
-	clear_bit(NAPI_STATE_SCHED, &n->state);
-	return true;
-}
-EXPORT_SYMBOL(__napi_complete);
-
 bool napi_complete_done(struct napi_struct *n, int work_done)
 {
 	unsigned long flags;
@@ -4926,14 +4909,13 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
 		else
 			napi_gro_flush(n, false);
 	}
-	if (likely(list_empty(&n->poll_list))) {
-		WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
-	} else {
+	if (unlikely(!list_empty(&n->poll_list))) {
 		/* If n->poll_list is not empty, we need to mask irqs */
 		local_irq_save(flags);
-		__napi_complete(n);
+		list_del_init(&n->poll_list);
 		local_irq_restore(flags);
 	}
+	WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
 	return true;
 }
 EXPORT_SYMBOL(napi_complete_done);
-- 
cgit v1.2.3


From 656441478ed55d960df5f3ccdf5a0f8c61dfd0b3 Mon Sep 17 00:00:00 2001
From: Mark Marshall <mark.marshall@omicronenergy.com>
Date: Thu, 26 Jan 2017 16:18:27 +0100
Subject: mtd: nand: ifc: Fix location of eccstat registers for IFC V1.0

The commit 7a654172161c ("mtd/ifc: Add support for IFC controller
version 2.0") added support for version 2.0 of the IFC controller.
The version 2.0 controller has the ECC status registers at a different
location to the previous versions.

Correct the fsl_ifc_nand structure so that the ECC status can be read
from the correct location for both version 1.0 and 2.0 of the controller.

Cc: stable@vger.kernel.org
Fixes: 7a654172161c ("mtd/ifc: Add support for IFC controller version 2.0")
Signed-off-by: Mark Marshall <mark.marshall@omicronenergy.com>
Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
---
 drivers/mtd/nand/fsl_ifc_nand.c | 8 +++++++-
 include/linux/fsl_ifc.h         | 8 ++++++--
 2 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/fsl_ifc_nand.c b/drivers/mtd/nand/fsl_ifc_nand.c
index 0a177b1bfe3e..d1570f512f0b 100644
--- a/drivers/mtd/nand/fsl_ifc_nand.c
+++ b/drivers/mtd/nand/fsl_ifc_nand.c
@@ -258,9 +258,15 @@ static void fsl_ifc_run_command(struct mtd_info *mtd)
 		int bufnum = nctrl->page & priv->bufnum_mask;
 		int sector = bufnum * chip->ecc.steps;
 		int sector_end = sector + chip->ecc.steps - 1;
+		__be32 *eccstat_regs;
+
+		if (ctrl->version >= FSL_IFC_VERSION_2_0_0)
+			eccstat_regs = ifc->ifc_nand.v2_nand_eccstat;
+		else
+			eccstat_regs = ifc->ifc_nand.v1_nand_eccstat;
 
 		for (i = sector / 4; i <= sector_end / 4; i++)
-			eccstat[i] = ifc_in32(&ifc->ifc_nand.nand_eccstat[i]);
+			eccstat[i] = ifc_in32(&eccstat_regs[i]);
 
 		for (i = sector; i <= sector_end; i++) {
 			errors = check_read_ecc(mtd, ctrl, eccstat, i);
diff --git a/include/linux/fsl_ifc.h b/include/linux/fsl_ifc.h
index 3f9778cbc79d..c332f0a45607 100644
--- a/include/linux/fsl_ifc.h
+++ b/include/linux/fsl_ifc.h
@@ -733,8 +733,12 @@ struct fsl_ifc_nand {
 	__be32 nand_erattr1;
 	u32 res19[0x10];
 	__be32 nand_fsr;
-	u32 res20[0x3];
-	__be32 nand_eccstat[6];
+	u32 res20;
+	/* The V1 nand_eccstat is actually 4 words that overlaps the
+	 * V2 nand_eccstat.
+	 */
+	__be32 v1_nand_eccstat[2];
+	__be32 v2_nand_eccstat[6];
 	u32 res21[0x1c];
 	__be32 nanndcr;
 	u32 res22[0x2];
-- 
cgit v1.2.3


From a61d5ce9cc56e2e41bbb1ad62ca7a16d7e7567bd Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Thu, 8 Dec 2016 12:58:45 +0200
Subject: net/mlx5: Fix static checker warnings

For some reason, sparse doesn't like using an expression of type (!x)
with a bitwise | and &.  In order to mitigate that, we use a local variable.

This removes the following sparse complaints on the core driver
(and similar ones on the IB driver too):

drivers/net/ethernet/mellanox/mlx5/core/srq.c:83:9: warning: dubious: !x & y
drivers/net/ethernet/mellanox/mlx5/core/srq.c:96:9: warning: dubious: !x & y
drivers/net/ethernet/mellanox/mlx5/core/port.c:59:9: warning: dubious: !x & y
drivers/net/ethernet/mellanox/mlx5/core/vport.c:561:9: warning: dubious: !x & y

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Matan Barak <matanb@mellanox.com>
Reported-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/device.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 7b6cd67a263f..dd9a263ed368 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -67,10 +67,11 @@
 
 /* insert a value to a struct */
 #define MLX5_SET(typ, p, fld, v) do { \
+	u32 _v = v; \
 	BUILD_BUG_ON(__mlx5_st_sz_bits(typ) % 32);             \
 	*((__be32 *)(p) + __mlx5_dw_off(typ, fld)) = \
 	cpu_to_be32((be32_to_cpu(*((__be32 *)(p) + __mlx5_dw_off(typ, fld))) & \
-		     (~__mlx5_dw_mask(typ, fld))) | (((v) & __mlx5_mask(typ, fld)) \
+		     (~__mlx5_dw_mask(typ, fld))) | (((_v) & __mlx5_mask(typ, fld)) \
 		     << __mlx5_dw_bit_off(typ, fld))); \
 } while (0)
 
-- 
cgit v1.2.3


From a4077ce5871304f8a78f80b74b18b6052a410f1a Mon Sep 17 00:00:00 2001
From: "Andrey Jr. Melnikov" <temnota.am@gmail.com>
Date: Thu, 8 Dec 2016 19:57:08 +0300
Subject: mtd: nand: Add Winbond manufacturer id

Add WINBOND manufacturer id.

Signed-off-by: Andrey Jr. Melnikov <temnota.am@gmail.com>
Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
---
 drivers/mtd/nand/nand_ids.c | 1 +
 include/linux/mtd/nand.h    | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/nand_ids.c b/drivers/mtd/nand/nand_ids.c
index b3a332f37e14..4a2f75b0c200 100644
--- a/drivers/mtd/nand/nand_ids.c
+++ b/drivers/mtd/nand/nand_ids.c
@@ -185,6 +185,7 @@ struct nand_manufacturers nand_manuf_ids[] = {
 	{NAND_MFR_SANDISK, "SanDisk"},
 	{NAND_MFR_INTEL, "Intel"},
 	{NAND_MFR_ATO, "ATO"},
+	{NAND_MFR_WINBOND, "Winbond"},
 	{0x0, "Unknown"}
 };
 
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index f67915c7726f..f6017a1a35d6 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -959,6 +959,7 @@ static inline void nand_set_controller_data(struct nand_chip *chip, void *priv)
 #define NAND_MFR_SANDISK	0x45
 #define NAND_MFR_INTEL		0x89
 #define NAND_MFR_ATO		0x9b
+#define NAND_MFR_WINBOND	0xef
 
 /* The maximum expected count of bytes in the NAND ID sequence */
 #define NAND_MAX_ID_LEN 8
-- 
cgit v1.2.3


From a1831bb9403720db6d4c033fe2d6bd0116dd28fe Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Wed, 1 Feb 2017 17:53:04 +0000
Subject: iommu/dma: Remove bogus dma_supported() implementation

Back when this was first written, dma_supported() was somewhat of a
murky mess, with subtly different interpretations being relied upon in
various places. The "does device X support DMA to address range Y?"
uses assuming Y to be physical addresses, which motivated the current
iommu_dma_supported() implementation and are alluded to in the comment
therein, have since been cleaned up, leaving only the far less ambiguous
"can device X drive address bits Y" usage internal to DMA API mask
setting. As such, there is no reason to keep a slightly misleading
callback which does nothing but duplicate the current default behaviour;
we already constrain IOVA allocations to the iommu_domain aperture where
necessary, so let's leave DMA mask business to architecture-specific
code where it belongs.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 arch/arm64/mm/dma-mapping.c |  1 -
 drivers/iommu/dma-iommu.c   | 10 ----------
 include/linux/dma-iommu.h   |  1 -
 3 files changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index ae9f817eadf2..4a14b25163fb 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -799,7 +799,6 @@ static struct dma_map_ops iommu_dma_ops = {
 	.sync_sg_for_device = __iommu_sync_sg_for_device,
 	.map_resource = iommu_dma_map_resource,
 	.unmap_resource = iommu_dma_unmap_resource,
-	.dma_supported = iommu_dma_supported,
 	.mapping_error = iommu_dma_mapping_error,
 };
 
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 1c9ac26e3b68..48d36ce59efb 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -734,16 +734,6 @@ void iommu_dma_unmap_resource(struct device *dev, dma_addr_t handle,
 	__iommu_dma_unmap(iommu_get_domain_for_dev(dev), handle);
 }
 
-int iommu_dma_supported(struct device *dev, u64 mask)
-{
-	/*
-	 * 'Special' IOMMUs which don't have the same addressing capability
-	 * as the CPU will have to wait until we have some way to query that
-	 * before they'll be able to use this framework.
-	 */
-	return 1;
-}
-
 int iommu_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	return dma_addr == DMA_ERROR_CODE;
diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
index 3a846f9ec0fd..5725c94b1f12 100644
--- a/include/linux/dma-iommu.h
+++ b/include/linux/dma-iommu.h
@@ -67,7 +67,6 @@ dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys,
 		size_t size, enum dma_data_direction dir, unsigned long attrs);
 void iommu_dma_unmap_resource(struct device *dev, dma_addr_t handle,
 		size_t size, enum dma_data_direction dir, unsigned long attrs);
-int iommu_dma_supported(struct device *dev, u64 mask);
 int iommu_dma_mapping_error(struct device *dev, dma_addr_t dma_addr);
 
 /* The DMA API isn't _quite_ the whole story, though... */
-- 
cgit v1.2.3


From d254586c34538c0014280806c5d4795697cf21e5 Mon Sep 17 00:00:00 2001
From: David Jander <david@protonic.nl>
Date: Fri, 10 Oct 2014 17:30:10 +0200
Subject: can: rx-offload: Add support for HW fifo based irq offloading

Some CAN controllers have a usable FIFO already but can still benefit
from off-loading the CAN controller FIFO. The CAN frames of the FIFO are
read and put into a skb queue during interrupt and then transmitted in a
NAPI context.

Signed-off-by: David Jander <david@protonic.nl>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/Makefile       |   3 +-
 drivers/net/can/rx-offload.c   | 156 +++++++++++++++++++++++++++++++++++++++++
 include/linux/can/rx-offload.h |  51 ++++++++++++++
 3 files changed, 209 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/can/rx-offload.c
 create mode 100644 include/linux/can/rx-offload.h

(limited to 'include/linux')

diff --git a/drivers/net/can/Makefile b/drivers/net/can/Makefile
index 7a85495dbb0c..0da4f2f5c7e3 100644
--- a/drivers/net/can/Makefile
+++ b/drivers/net/can/Makefile
@@ -6,7 +6,8 @@ obj-$(CONFIG_CAN_VCAN)		+= vcan.o
 obj-$(CONFIG_CAN_SLCAN)		+= slcan.o
 
 obj-$(CONFIG_CAN_DEV)		+= can-dev.o
-can-dev-y			:= dev.o
+can-dev-y			+= dev.o
+can-dev-y			+= rx-offload.o
 
 can-dev-$(CONFIG_CAN_LEDS)	+= led.o
 
diff --git a/drivers/net/can/rx-offload.c b/drivers/net/can/rx-offload.c
new file mode 100644
index 000000000000..7267c61762c7
--- /dev/null
+++ b/drivers/net/can/rx-offload.c
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2014 David Jander, Protonic Holland
+ * Copyright (C) 2014-2017 Pengutronix, Marc Kleine-Budde <kernel@pengutronix.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the version 2 of the GNU General Public License
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/can/dev.h>
+#include <linux/can/rx-offload.h>
+
+static int can_rx_offload_napi_poll(struct napi_struct *napi, int quota)
+{
+	struct can_rx_offload *offload = container_of(napi, struct can_rx_offload, napi);
+	struct net_device *dev = offload->dev;
+	struct net_device_stats *stats = &dev->stats;
+	struct sk_buff *skb;
+	int work_done = 0;
+
+	while ((work_done < quota) &&
+	       (skb = skb_dequeue(&offload->skb_queue))) {
+		struct can_frame *cf = (struct can_frame *)skb->data;
+
+		work_done++;
+		stats->rx_packets++;
+		stats->rx_bytes += cf->can_dlc;
+		netif_receive_skb(skb);
+	}
+
+	if (work_done < quota) {
+		napi_complete_done(napi, work_done);
+
+		/* Check if there was another interrupt */
+		if (!skb_queue_empty(&offload->skb_queue))
+			napi_reschedule(&offload->napi);
+	}
+
+	can_led_event(offload->dev, CAN_LED_EVENT_RX);
+
+	return work_done;
+}
+
+static struct sk_buff *can_rx_offload_offload_one(struct can_rx_offload *offload, unsigned int n)
+{
+	struct sk_buff *skb = NULL;
+	struct can_frame *cf;
+	int ret;
+
+	/* If queue is full or skb not available, read to discard mailbox */
+	if (likely(skb_queue_len(&offload->skb_queue) <=
+		   offload->skb_queue_len_max))
+		skb = alloc_can_skb(offload->dev, &cf);
+
+	if (!skb) {
+		struct can_frame cf_overflow;
+
+		ret = offload->mailbox_read(offload, &cf_overflow, n);
+		if (ret)
+			offload->dev->stats.rx_dropped++;
+
+		return NULL;
+	}
+
+	ret = offload->mailbox_read(offload, cf, n);
+	if (!ret) {
+		kfree_skb(skb);
+		return NULL;
+	}
+
+	return skb;
+}
+
+int can_rx_offload_irq_offload_fifo(struct can_rx_offload *offload)
+{
+	struct sk_buff *skb;
+	int received = 0;
+
+	while ((skb = can_rx_offload_offload_one(offload, 0))) {
+		skb_queue_tail(&offload->skb_queue, skb);
+		received++;
+	}
+
+	if (received)
+		can_rx_offload_schedule(offload);
+
+	return received;
+}
+EXPORT_SYMBOL_GPL(can_rx_offload_irq_offload_fifo);
+
+int can_rx_offload_irq_queue_err_skb(struct can_rx_offload *offload, struct sk_buff *skb)
+{
+	if (skb_queue_len(&offload->skb_queue) >
+	    offload->skb_queue_len_max)
+		return -ENOMEM;
+
+	skb_queue_tail(&offload->skb_queue, skb);
+	can_rx_offload_schedule(offload);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(can_rx_offload_irq_queue_err_skb);
+
+static int can_rx_offload_init_queue(struct net_device *dev, struct can_rx_offload *offload, unsigned int weight)
+{
+	offload->dev = dev;
+
+	/* Limit queue len to 4x the weight (rounted to next power of two) */
+	offload->skb_queue_len_max = 2 << fls(weight);
+	offload->skb_queue_len_max *= 4;
+	skb_queue_head_init(&offload->skb_queue);
+
+	can_rx_offload_reset(offload);
+	netif_napi_add(dev, &offload->napi, can_rx_offload_napi_poll, weight);
+
+	dev_dbg(dev->dev.parent, "%s: skb_queue_len_max=%d\n",
+		__func__, offload->skb_queue_len_max);
+
+	return 0;
+}
+
+int can_rx_offload_add_fifo(struct net_device *dev, struct can_rx_offload *offload, unsigned int weight)
+{
+	if (!offload->mailbox_read)
+		return -EINVAL;
+
+	return can_rx_offload_init_queue(dev, offload, weight);
+}
+EXPORT_SYMBOL_GPL(can_rx_offload_add_fifo);
+
+void can_rx_offload_enable(struct can_rx_offload *offload)
+{
+	can_rx_offload_reset(offload);
+	napi_enable(&offload->napi);
+}
+EXPORT_SYMBOL_GPL(can_rx_offload_enable);
+
+void can_rx_offload_del(struct can_rx_offload *offload)
+{
+	netif_napi_del(&offload->napi);
+	skb_queue_purge(&offload->skb_queue);
+}
+EXPORT_SYMBOL_GPL(can_rx_offload_del);
+
+void can_rx_offload_reset(struct can_rx_offload *offload)
+{
+}
+EXPORT_SYMBOL_GPL(can_rx_offload_reset);
diff --git a/include/linux/can/rx-offload.h b/include/linux/can/rx-offload.h
new file mode 100644
index 000000000000..cadfe9869600
--- /dev/null
+++ b/include/linux/can/rx-offload.h
@@ -0,0 +1,51 @@
+/*
+ * linux/can/rx-offload.h
+ *
+ * Copyright (c) 2014 David Jander, Protonic Holland
+ * Copyright (c) 2014-2017 Pengutronix, Marc Kleine-Budde <kernel@pengutronix.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the version 2 of the GNU General Public License
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _CAN_RX_OFFLOAD_H
+#define _CAN_RX_OFFLOAD_H
+
+#include <linux/netdevice.h>
+#include <linux/can.h>
+
+struct can_rx_offload {
+	struct net_device *dev;
+
+	unsigned int (*mailbox_read)(struct can_rx_offload *offload, struct can_frame *cf, unsigned int mb);
+
+	struct sk_buff_head skb_queue;
+	u32 skb_queue_len_max;
+
+	struct napi_struct napi;
+};
+
+int can_rx_offload_add_fifo(struct net_device *dev, struct can_rx_offload *offload, unsigned int weight);
+int can_rx_offload_irq_offload_fifo(struct can_rx_offload *offload);
+int can_rx_offload_irq_queue_err_skb(struct can_rx_offload *offload, struct sk_buff *skb);
+void can_rx_offload_reset(struct can_rx_offload *offload);
+void can_rx_offload_del(struct can_rx_offload *offload);
+void can_rx_offload_enable(struct can_rx_offload *offload);
+
+static inline void can_rx_offload_schedule(struct can_rx_offload *offload)
+{
+	napi_schedule(&offload->napi);
+}
+
+static inline void can_rx_offload_disable(struct can_rx_offload *offload)
+{
+	napi_disable(&offload->napi);
+}
+
+#endif /* !_CAN_RX_OFFLOAD_H */
-- 
cgit v1.2.3


From 3abbac0b5dd3e3b3dfb30a4885cdde5c20e1cb83 Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Tue, 23 Sep 2014 15:28:21 +0200
Subject: can: rx-offload: Add support for timestamp based irq offloading

Some CAN controllers don't implement a FIFO in hardware, but fill their
mailboxes in a particular order (from lowest to highest or highest to lowest).
This makes problems to read the frames in the correct order from the hardware,
as new frames might be filled into just read (low) mailboxes. This gets worse,
when following new frames are received into not read (higher) mailboxes.

On the bright side some these CAN controllers put a timestamp on each received
CAN frame. This patch adds support to offload CAN frames in interrupt context,
order them by timestamp and then transmitted in a NAPI context.

Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/rx-offload.c   | 137 ++++++++++++++++++++++++++++++++++++++++-
 include/linux/can/rx-offload.h |  10 ++-
 2 files changed, 144 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/can/rx-offload.c b/drivers/net/can/rx-offload.c
index 7267c61762c7..f394f77d7528 100644
--- a/drivers/net/can/rx-offload.c
+++ b/drivers/net/can/rx-offload.c
@@ -18,6 +18,33 @@
 #include <linux/can/dev.h>
 #include <linux/can/rx-offload.h>
 
+struct can_rx_offload_cb {
+	u32 timestamp;
+};
+
+static inline struct can_rx_offload_cb *can_rx_offload_get_cb(struct sk_buff *skb)
+{
+	BUILD_BUG_ON(sizeof(struct can_rx_offload_cb) > sizeof(skb->cb));
+
+	return (struct can_rx_offload_cb *)skb->cb;
+}
+
+static inline bool can_rx_offload_le(struct can_rx_offload *offload, unsigned int a, unsigned int b)
+{
+	if (offload->inc)
+		return a <= b;
+	else
+		return a >= b;
+}
+
+static inline unsigned int can_rx_offload_inc(struct can_rx_offload *offload, unsigned int *val)
+{
+	if (offload->inc)
+		return (*val)++;
+	else
+		return (*val)--;
+}
+
 static int can_rx_offload_napi_poll(struct napi_struct *napi, int quota)
 {
 	struct can_rx_offload *offload = container_of(napi, struct can_rx_offload, napi);
@@ -49,9 +76,50 @@ static int can_rx_offload_napi_poll(struct napi_struct *napi, int quota)
 	return work_done;
 }
 
+static inline void __skb_queue_add_sort(struct sk_buff_head *head, struct sk_buff *new,
+					int (*compare)(struct sk_buff *a, struct sk_buff *b))
+{
+	struct sk_buff *pos, *insert = (struct sk_buff *)head;
+
+	skb_queue_reverse_walk(head, pos) {
+		const struct can_rx_offload_cb *cb_pos, *cb_new;
+
+		cb_pos = can_rx_offload_get_cb(pos);
+		cb_new = can_rx_offload_get_cb(new);
+
+		netdev_dbg(new->dev,
+			   "%s: pos=0x%08x, new=0x%08x, diff=%10d, queue_len=%d\n",
+			   __func__,
+			   cb_pos->timestamp, cb_new->timestamp,
+			   cb_new->timestamp - cb_pos->timestamp,
+			   skb_queue_len(head));
+
+		if (compare(pos, new) < 0)
+			continue;
+		insert = pos;
+		break;
+	}
+
+	__skb_queue_after(head, insert, new);
+}
+
+static int can_rx_offload_compare(struct sk_buff *a, struct sk_buff *b)
+{
+	const struct can_rx_offload_cb *cb_a, *cb_b;
+
+	cb_a = can_rx_offload_get_cb(a);
+	cb_b = can_rx_offload_get_cb(b);
+
+	/* Substract two u32 and return result as int, to keep
+	 * difference steady around the u32 overflow.
+	 */
+	return cb_b->timestamp - cb_a->timestamp;
+}
+
 static struct sk_buff *can_rx_offload_offload_one(struct can_rx_offload *offload, unsigned int n)
 {
 	struct sk_buff *skb = NULL;
+	struct can_rx_offload_cb *cb;
 	struct can_frame *cf;
 	int ret;
 
@@ -62,15 +130,18 @@ static struct sk_buff *can_rx_offload_offload_one(struct can_rx_offload *offload
 
 	if (!skb) {
 		struct can_frame cf_overflow;
+		u32 timestamp;
 
-		ret = offload->mailbox_read(offload, &cf_overflow, n);
+		ret = offload->mailbox_read(offload, &cf_overflow,
+					    &timestamp, n);
 		if (ret)
 			offload->dev->stats.rx_dropped++;
 
 		return NULL;
 	}
 
-	ret = offload->mailbox_read(offload, cf, n);
+	cb = can_rx_offload_get_cb(skb);
+	ret = offload->mailbox_read(offload, cf, &cb->timestamp, n);
 	if (!ret) {
 		kfree_skb(skb);
 		return NULL;
@@ -79,6 +150,48 @@ static struct sk_buff *can_rx_offload_offload_one(struct can_rx_offload *offload
 	return skb;
 }
 
+int can_rx_offload_irq_offload_timestamp(struct can_rx_offload *offload, u64 pending)
+{
+	struct sk_buff_head skb_queue;
+	unsigned int i;
+
+	__skb_queue_head_init(&skb_queue);
+
+	for (i = offload->mb_first;
+	     can_rx_offload_le(offload, i, offload->mb_last);
+	     can_rx_offload_inc(offload, &i)) {
+		struct sk_buff *skb;
+
+		if (!(pending & BIT_ULL(i)))
+			continue;
+
+		skb = can_rx_offload_offload_one(offload, i);
+		if (!skb)
+			break;
+
+		__skb_queue_add_sort(&skb_queue, skb, can_rx_offload_compare);
+	}
+
+	if (!skb_queue_empty(&skb_queue)) {
+		unsigned long flags;
+		u32 queue_len;
+
+		spin_lock_irqsave(&offload->skb_queue.lock, flags);
+		skb_queue_splice_tail(&skb_queue, &offload->skb_queue);
+		spin_unlock_irqrestore(&offload->skb_queue.lock, flags);
+
+		if ((queue_len = skb_queue_len(&offload->skb_queue)) >
+		    (offload->skb_queue_len_max / 8))
+			netdev_dbg(offload->dev, "%s: queue_len=%d\n",
+				   __func__, queue_len);
+
+		can_rx_offload_schedule(offload);
+	}
+
+	return skb_queue_len(&skb_queue);
+}
+EXPORT_SYMBOL_GPL(can_rx_offload_irq_offload_timestamp);
+
 int can_rx_offload_irq_offload_fifo(struct can_rx_offload *offload)
 {
 	struct sk_buff *skb;
@@ -127,6 +240,26 @@ static int can_rx_offload_init_queue(struct net_device *dev, struct can_rx_offlo
 	return 0;
 }
 
+int can_rx_offload_add_timestamp(struct net_device *dev, struct can_rx_offload *offload)
+{
+	unsigned int weight;
+
+	if (offload->mb_first > BITS_PER_LONG_LONG ||
+	    offload->mb_last > BITS_PER_LONG_LONG || !offload->mailbox_read)
+		return -EINVAL;
+
+	if (offload->mb_first < offload->mb_last) {
+		offload->inc = true;
+		weight = offload->mb_last - offload->mb_first;
+	} else {
+		offload->inc = false;
+		weight = offload->mb_first - offload->mb_last;
+	}
+
+	return can_rx_offload_init_queue(dev, offload, weight);;
+}
+EXPORT_SYMBOL_GPL(can_rx_offload_add_timestamp);
+
 int can_rx_offload_add_fifo(struct net_device *dev, struct can_rx_offload *offload, unsigned int weight)
 {
 	if (!offload->mailbox_read)
diff --git a/include/linux/can/rx-offload.h b/include/linux/can/rx-offload.h
index cadfe9869600..cb31683bbe15 100644
--- a/include/linux/can/rx-offload.h
+++ b/include/linux/can/rx-offload.h
@@ -23,15 +23,23 @@
 struct can_rx_offload {
 	struct net_device *dev;
 
-	unsigned int (*mailbox_read)(struct can_rx_offload *offload, struct can_frame *cf, unsigned int mb);
+	unsigned int (*mailbox_read)(struct can_rx_offload *offload, struct can_frame *cf,
+				     u32 *timestamp, unsigned int mb);
 
 	struct sk_buff_head skb_queue;
 	u32 skb_queue_len_max;
 
+	unsigned int mb_first;
+	unsigned int mb_last;
+
 	struct napi_struct napi;
+
+	bool inc;
 };
 
+int can_rx_offload_add_timestamp(struct net_device *dev, struct can_rx_offload *offload);
 int can_rx_offload_add_fifo(struct net_device *dev, struct can_rx_offload *offload, unsigned int weight);
+int can_rx_offload_irq_offload_timestamp(struct can_rx_offload *offload, u64 reg);
 int can_rx_offload_irq_offload_fifo(struct can_rx_offload *offload);
 int can_rx_offload_irq_queue_err_skb(struct can_rx_offload *offload, struct sk_buff *skb);
 void can_rx_offload_reset(struct can_rx_offload *offload);
-- 
cgit v1.2.3


From f32f5bd2eb7e91a5090c06bbe1ed14bffbbda5d3 Mon Sep 17 00:00:00 2001
From: Daniel Jurgens <danielj@mellanox.com>
Date: Thu, 19 Nov 2015 17:12:26 +0200
Subject: net/mlx5: Configure cache line size for start and end padding

There is a hardware feature that will pad the start or end of a DMA to
be cache line aligned to avoid RMWs on the last cache line. The default
cache line size setting for this feature is 64B. This change configures
the hardware to use 128B alignment on systems with 128B cache lines.

In addition we lower bound MPWRQ stride by HCA cacheline in mlx5e,
MPWRQ stride should be at least the HCA cacheline, the current default
is 64B and in case HCA_CAP.cach_line_128byte capability is set, MPWRQ RX
stride will automatically be aligned to 128B.

Signed-off-by: Daniel Jurgens <danielj@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      | 9 +++++++--
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/main.c    | 6 ++++++
 include/linux/mlx5/mlx5_ifc.h                     | 6 ++++--
 4 files changed, 19 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 9b52c58cd528..9b23d3329847 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -70,8 +70,13 @@
 
 #define MLX5_RX_HEADROOM NET_SKB_PAD
 
-#define MLX5_MPWRQ_LOG_STRIDE_SIZE		6  /* >= 6, HW restriction */
-#define MLX5_MPWRQ_LOG_STRIDE_SIZE_CQE_COMPRESS	8  /* >= 6, HW restriction */
+#define MLX5_MPWRQ_MIN_LOG_STRIDE_SZ(mdev) \
+	(6 + MLX5_CAP_GEN(mdev, cache_line_128byte)) /* HW restriction */
+#define MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, req) \
+	max_t(u32, MLX5_MPWRQ_MIN_LOG_STRIDE_SZ(mdev), req)
+#define MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev)       MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, 6)
+#define MLX5_MPWRQ_CQE_CMPRS_LOG_STRIDE_SZ(mdev) MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, 8)
+
 #define MLX5_MPWRQ_LOG_WQE_SZ			18
 #define MLX5_MPWRQ_WQE_PAGE_ORDER  (MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT > 0 ? \
 				    MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT : 0)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index ab6f4d3b8063..1b7fe43ab22b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -89,8 +89,8 @@ static void mlx5e_set_rq_type_params(struct mlx5e_priv *priv, u8 rq_type)
 			MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE_MPW;
 		priv->params.mpwqe_log_stride_sz =
 			MLX5E_GET_PFLAG(priv, MLX5E_PFLAG_RX_CQE_COMPRESS) ?
-			MLX5_MPWRQ_LOG_STRIDE_SIZE_CQE_COMPRESS :
-			MLX5_MPWRQ_LOG_STRIDE_SIZE;
+			MLX5_MPWRQ_CQE_CMPRS_LOG_STRIDE_SZ(priv->mdev) :
+			MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(priv->mdev);
 		priv->params.mpwqe_log_num_strides = MLX5_MPWRQ_LOG_WQE_SZ -
 			priv->params.mpwqe_log_stride_sz;
 		break;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index f7e50ba67f94..c4242a4e8130 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -543,6 +543,12 @@ static int handle_hca_cap(struct mlx5_core_dev *dev)
 
 	MLX5_SET(cmd_hca_cap, set_hca_cap, log_uar_page_sz, PAGE_SHIFT - 12);
 
+	if (MLX5_CAP_GEN_MAX(dev, cache_line_128byte))
+		MLX5_SET(cmd_hca_cap,
+			 set_hca_cap,
+			 cache_line_128byte,
+			 cache_line_size() == 128 ? 1 : 0);
+
 	err = set_caps(dev, set_ctx, set_sz,
 		       MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE);
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index a919dfb920ae..cc8ae860cd45 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -804,10 +804,12 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         reserved_at_150[0xa];
 	u8         log_max_ra_res_qp[0x6];
 
-	u8         pad_cap[0x1];
+	u8         end_pad[0x1];
 	u8         cc_query_allowed[0x1];
 	u8         cc_modify_allowed[0x1];
-	u8         reserved_at_163[0xd];
+	u8         start_pad[0x1];
+	u8         cache_line_128byte[0x1];
+	u8         reserved_at_163[0xb];
 	u8         gid_table_size[0x10];
 
 	u8         out_of_seq_cnt[0x1];
-- 
cgit v1.2.3


From 2b31f7ae5f645edd852addfca445895b5806f3f9 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Mon, 28 Nov 2016 18:04:50 +0200
Subject: net/mlx5: TX WQE update

Add new TX WQE fields for Connect-X5 vlan insertion support,
type and vlan_tci, when type = MLX5_ETH_WQE_INSERT_VLAN the
HW will insert the vlan and prio fields (vlan_tci) to the packet.

Those bits and the inline header fields are mutually exclusive, and
valid only when:
MLX5_CAP_ETH(mdev, wqe_inline_mode) == MLX5_CAP_INLINE_MODE_NOT_REQUIRED
and MLX5_CAP_ETH(mdev, wqe_vlan_insert),
who will be set in ConnectX-5 and later HW generations.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
---
 drivers/infiniband/hw/mlx5/qp.c                 |  6 +++---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c |  8 ++++----
 include/linux/mlx5/mlx5_ifc.h                   |  3 ++-
 include/linux/mlx5/qp.h                         | 16 ++++++++++++++--
 5 files changed, 25 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 6a83fb32599d..e31bf11ae64f 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -2984,20 +2984,20 @@ static void *set_eth_seg(struct mlx5_wqe_eth_seg *eseg,
 
 	if (wr->opcode == IB_WR_LSO) {
 		struct ib_ud_wr *ud_wr = container_of(wr, struct ib_ud_wr, wr);
-		int size_of_inl_hdr_start = sizeof(eseg->inline_hdr_start);
+		int size_of_inl_hdr_start = sizeof(eseg->inline_hdr.start);
 		u64 left, leftlen, copysz;
 		void *pdata = ud_wr->header;
 
 		left = ud_wr->hlen;
 		eseg->mss = cpu_to_be16(ud_wr->mss);
-		eseg->inline_hdr_sz = cpu_to_be16(left);
+		eseg->inline_hdr.sz = cpu_to_be16(left);
 
 		/*
 		 * check if there is space till the end of queue, if yes,
 		 * copy all in one shot, otherwise copy till the end of queue,
 		 * rollback and than the copy the left
 		 */
-		leftlen = qend - (void *)eseg->inline_hdr_start;
+		leftlen = qend - (void *)eseg->inline_hdr.start;
 		copysz = min_t(u64, leftlen, left);
 
 		memcpy(seg - size_of_inl_hdr_start, pdata, copysz);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index fd8dff6acc12..965e69e9ff1e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -687,8 +687,8 @@ static inline bool mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq,
 	memset(wqe, 0, sizeof(*wqe));
 
 	/* copy the inline part */
-	memcpy(eseg->inline_hdr_start, xdp->data, MLX5E_XDP_MIN_INLINE);
-	eseg->inline_hdr_sz = cpu_to_be16(MLX5E_XDP_MIN_INLINE);
+	memcpy(eseg->inline_hdr.start, xdp->data, MLX5E_XDP_MIN_INLINE);
+	eseg->inline_hdr.sz = cpu_to_be16(MLX5E_XDP_MIN_INLINE);
 
 	dseg = (struct mlx5_wqe_data_seg *)cseg + (MLX5E_XDP_TX_DS_COUNT - 1);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index cfb68371c397..678c07c8fbb0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -284,18 +284,18 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
 	wi->num_bytes = num_bytes;
 
 	if (skb_vlan_tag_present(skb)) {
-		mlx5e_insert_vlan(eseg->inline_hdr_start, skb, ihs, &skb_data,
+		mlx5e_insert_vlan(eseg->inline_hdr.start, skb, ihs, &skb_data,
 				  &skb_len);
 		ihs += VLAN_HLEN;
 	} else {
-		memcpy(eseg->inline_hdr_start, skb_data, ihs);
+		memcpy(eseg->inline_hdr.start, skb_data, ihs);
 		mlx5e_tx_skb_pull_inline(&skb_data, &skb_len, ihs);
 	}
 
-	eseg->inline_hdr_sz = cpu_to_be16(ihs);
+	eseg->inline_hdr.sz = cpu_to_be16(ihs);
 
 	ds_cnt  = sizeof(*wqe) / MLX5_SEND_WQE_DS;
-	ds_cnt += DIV_ROUND_UP(ihs - sizeof(eseg->inline_hdr_start),
+	ds_cnt += DIV_ROUND_UP(ihs - sizeof(eseg->inline_hdr.start),
 			       MLX5_SEND_WQE_DS);
 	dseg    = (struct mlx5_wqe_data_seg *)cseg + ds_cnt;
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index cc8ae860cd45..afcd4736d8df 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -577,7 +577,8 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits {
 	u8         lro_cap[0x1];
 	u8         lro_psh_flag[0x1];
 	u8         lro_time_stamp[0x1];
-	u8         reserved_at_5[0x3];
+	u8         reserved_at_5[0x2];
+	u8         wqe_vlan_insert[0x1];
 	u8         self_lb_en_modifiable[0x1];
 	u8         reserved_at_9[0x2];
 	u8         max_lso_cap[0x5];
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index 219c699c17b7..3096370fe831 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -221,14 +221,26 @@ enum {
 	MLX5_ETH_WQE_L4_CSUM            = 1 << 7,
 };
 
+enum {
+	MLX5_ETH_WQE_INSERT_VLAN        = 1 << 15,
+};
+
 struct mlx5_wqe_eth_seg {
 	u8              rsvd0[4];
 	u8              cs_flags;
 	u8              rsvd1;
 	__be16          mss;
 	__be32          rsvd2;
-	__be16          inline_hdr_sz;
-	u8              inline_hdr_start[2];
+	union {
+		struct {
+			__be16 sz;
+			u8     start[2];
+		} inline_hdr;
+		struct {
+			__be16 type;
+			__be16 vlan_tci;
+		} insert;
+	};
 };
 
 struct mlx5_wqe_xrc_seg {
-- 
cgit v1.2.3


From a8eca326151ee1beac82a4fd86d9edad3a37aaed Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Mon, 6 Feb 2017 16:20:14 +0100
Subject: net: remove ndo_neigh_{construct, destroy} from stacked devices

In commit 18bfb924f000 ("net: introduce default neigh_construct/destroy
ndo calls for L2 upper devices") we added these ndos to stacked devices
such as team and bond, so that calls will be propagated to mlxsw.

However, previous commit removed the reliance on these ndos and no new
users of these ndos have appeared since above mentioned commit. We can
therefore safely remove this dead code.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c |  2 --
 drivers/net/team/team.c         |  2 --
 include/linux/netdevice.h       |  4 ----
 net/8021q/vlan_dev.c            |  2 --
 net/bridge/br_device.c          |  2 --
 net/core/dev.c                  | 44 -----------------------------------------
 6 files changed, 56 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index e6af04716cf7..6321f12630c8 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4145,8 +4145,6 @@ static const struct net_device_ops bond_netdev_ops = {
 	.ndo_add_slave		= bond_enslave,
 	.ndo_del_slave		= bond_release,
 	.ndo_fix_features	= bond_fix_features,
-	.ndo_neigh_construct	= netdev_default_l2upper_neigh_construct,
-	.ndo_neigh_destroy	= netdev_default_l2upper_neigh_destroy,
 	.ndo_bridge_setlink	= switchdev_port_bridge_setlink,
 	.ndo_bridge_getlink	= switchdev_port_bridge_getlink,
 	.ndo_bridge_dellink	= switchdev_port_bridge_dellink,
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index a3711769544b..4a24b5d15f5a 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -2001,8 +2001,6 @@ static const struct net_device_ops team_netdev_ops = {
 	.ndo_add_slave		= team_add_slave,
 	.ndo_del_slave		= team_del_slave,
 	.ndo_fix_features	= team_fix_features,
-	.ndo_neigh_construct	= netdev_default_l2upper_neigh_construct,
-	.ndo_neigh_destroy	= netdev_default_l2upper_neigh_destroy,
 	.ndo_change_carrier     = team_change_carrier,
 	.ndo_bridge_setlink	= switchdev_port_bridge_setlink,
 	.ndo_bridge_getlink	= switchdev_port_bridge_getlink,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 014fbe256d55..58afbd1cc659 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3888,10 +3888,6 @@ void *netdev_lower_dev_get_private(struct net_device *dev,
 				   struct net_device *lower_dev);
 void netdev_lower_state_changed(struct net_device *lower_dev,
 				void *lower_state_info);
-int netdev_default_l2upper_neigh_construct(struct net_device *dev,
-					   struct neighbour *n);
-void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
-					  struct neighbour *n);
 
 /* RSS keys are 40 or 52 bytes long */
 #define NETDEV_RSS_KEY_LEN 52
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 116455ac3db5..e97ab824e368 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -791,8 +791,6 @@ static const struct net_device_ops vlan_netdev_ops = {
 	.ndo_netpoll_cleanup	= vlan_dev_netpoll_cleanup,
 #endif
 	.ndo_fix_features	= vlan_dev_fix_features,
-	.ndo_neigh_construct	= netdev_default_l2upper_neigh_construct,
-	.ndo_neigh_destroy	= netdev_default_l2upper_neigh_destroy,
 	.ndo_fdb_add		= switchdev_port_fdb_add,
 	.ndo_fdb_del		= switchdev_port_fdb_del,
 	.ndo_fdb_dump		= switchdev_port_fdb_dump,
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 6c46d1b4cdbb..5ba0b558f8ae 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -347,8 +347,6 @@ static const struct net_device_ops br_netdev_ops = {
 	.ndo_add_slave		 = br_add_slave,
 	.ndo_del_slave		 = br_del_slave,
 	.ndo_fix_features        = br_fix_features,
-	.ndo_neigh_construct	 = netdev_default_l2upper_neigh_construct,
-	.ndo_neigh_destroy	 = netdev_default_l2upper_neigh_destroy,
 	.ndo_fdb_add		 = br_fdb_add,
 	.ndo_fdb_del		 = br_fdb_delete,
 	.ndo_fdb_dump		 = br_fdb_dump,
diff --git a/net/core/dev.c b/net/core/dev.c
index 404d2e6d5d32..3e1a60102e64 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6111,50 +6111,6 @@ void netdev_lower_state_changed(struct net_device *lower_dev,
 }
 EXPORT_SYMBOL(netdev_lower_state_changed);
 
-int netdev_default_l2upper_neigh_construct(struct net_device *dev,
-					   struct neighbour *n)
-{
-	struct net_device *lower_dev, *stop_dev;
-	struct list_head *iter;
-	int err;
-
-	netdev_for_each_lower_dev(dev, lower_dev, iter) {
-		if (!lower_dev->netdev_ops->ndo_neigh_construct)
-			continue;
-		err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
-		if (err) {
-			stop_dev = lower_dev;
-			goto rollback;
-		}
-	}
-	return 0;
-
-rollback:
-	netdev_for_each_lower_dev(dev, lower_dev, iter) {
-		if (lower_dev == stop_dev)
-			break;
-		if (!lower_dev->netdev_ops->ndo_neigh_destroy)
-			continue;
-		lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
-	}
-	return err;
-}
-EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
-
-void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
-					  struct neighbour *n)
-{
-	struct net_device *lower_dev;
-	struct list_head *iter;
-
-	netdev_for_each_lower_dev(dev, lower_dev, iter) {
-		if (!lower_dev->netdev_ops->ndo_neigh_destroy)
-			continue;
-		lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
-	}
-}
-EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
-
 static void dev_change_rx_flags(struct net_device *dev, int flags)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
-- 
cgit v1.2.3


From 455a7b238cd6bc68c4a550cbbd37c1e22b64f71c Mon Sep 17 00:00:00 2001
From: Scott Bauer <scott.bauer@intel.com>
Date: Fri, 3 Feb 2017 12:50:31 -0700
Subject: block: Add Sed-opal library

This patch implements the necessary logic to bring an Opal
enabled drive out of a factory-enabled into a working
Opal state.

This patch set also enables logic to save a password to
be replayed during a resume from suspend.

Signed-off-by: Scott Bauer <scott.bauer@intel.com>
Signed-off-by: Rafael Antognolli <Rafael.Antognolli@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 MAINTAINERS              |   11 +
 block/Kconfig            |    7 +
 block/Makefile           |    1 +
 block/opal_proto.h       |  429 ++++++++
 block/sed-opal.c         | 2448 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/sed-opal.h |  178 ++++
 6 files changed, 3074 insertions(+)
 create mode 100644 block/opal_proto.h
 create mode 100644 block/sed-opal.c
 create mode 100644 include/linux/sed-opal.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index f481713fe44c..f96181c927be 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11075,6 +11075,17 @@ L:	linux-mmc@vger.kernel.org
 S:	Maintained
 F:	drivers/mmc/host/sdhci-spear.c
 
+SECURE ENCRYPTING DEVICE (SED) OPAL DRIVER
+M:	Scott Bauer <scott.bauer@intel.com>
+M:	Jonathan Derrick <jonathan.derrick@intel.com>
+M:	Rafael Antognolli <rafael.antognolli@intel.com>
+L:	linux-nvme@lists.infradead.org
+S:	Supported
+F:	block/sed*
+F:	block/opal_proto.h
+F:	include/linux/sed*
+F:	include/uapi/linux/sed*
+
 SECURITY SUBSYSTEM
 M:	James Morris <james.l.morris@oracle.com>
 M:	"Serge E. Hallyn" <serge@hallyn.com>
diff --git a/block/Kconfig b/block/Kconfig
index 7f659b23850c..1aef809affae 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -159,6 +159,13 @@ config BLK_DEBUG_FS
 	Unless you are building a kernel for a tiny system, you should
 	say Y here.
 
+config BLK_SED_OPAL
+	bool "Logic for interfacing with Opal enabled SEDs"
+	---help---
+	Builds Logic for interfacing with Opal enabled controllers.
+	Enabling this option enables users to setup/unlock/lock
+	Locking ranges for SED devices using the Opal protocol.
+
 menu "Partition Types"
 
 source "block/partitions/Kconfig"
diff --git a/block/Makefile b/block/Makefile
index 317165f8708c..6ba1b1bc9529 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -27,3 +27,4 @@ obj-$(CONFIG_BLK_MQ_PCI)	+= blk-mq-pci.o
 obj-$(CONFIG_BLK_DEV_ZONED)	+= blk-zoned.o
 obj-$(CONFIG_BLK_WBT)		+= blk-wbt.o
 obj-$(CONFIG_BLK_DEBUG_FS)	+= blk-mq-debugfs.o
+obj-$(CONFIG_BLK_SED_OPAL)	+= sed-opal.o
diff --git a/block/opal_proto.h b/block/opal_proto.h
new file mode 100644
index 000000000000..af9abc56c157
--- /dev/null
+++ b/block/opal_proto.h
@@ -0,0 +1,429 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Authors:
+ *    Rafael Antognolli <rafael.antognolli@intel.com>
+ *    Scott  Bauer      <scott.bauer@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#include <linux/types.h>
+
+#ifndef _OPAL_PROTO_H
+#define _OPAL_PROTO_H
+
+#define DTAERROR_NO_METHOD_STATUS 0x89
+#define GENERIC_HOST_SESSION_NUM 0x41
+
+#define TPER_SYNC_SUPPORTED 0x01
+
+#define TINY_ATOM_DATA_MASK 0x3F
+#define TINY_ATOM_SIGNED 0x40
+
+#define SHORT_ATOM_ID 0x80
+#define SHORT_ATOM_BYTESTRING 0x20
+#define SHORT_ATOM_SIGNED 0x10
+#define SHORT_ATOM_LEN_MASK 0xF
+
+#define MEDIUM_ATOM_ID 0xC0
+#define MEDIUM_ATOM_BYTESTRING 0x10
+#define MEDIUM_ATOM_SIGNED 0x8
+#define MEDIUM_ATOM_LEN_MASK 0x7
+
+#define LONG_ATOM_ID 0xe0
+#define LONG_ATOM_BYTESTRING 0x2
+#define LONG_ATOM_SIGNED 0x1
+
+/* Derived from TCG Core spec 2.01 Section:
+ * 3.2.2.1
+ * Data Type
+ */
+#define TINY_ATOM_BYTE   0x7F
+#define SHORT_ATOM_BYTE  0xBF
+#define MEDIUM_ATOM_BYTE 0xDF
+#define LONG_ATOM_BYTE   0xE3
+
+#define OPAL_INVAL_PARAM 12
+#define OPAL_MANUFACTURED_INACTIVE 0x08
+#define OPAL_DISCOVERY_COMID 0x0001
+
+#define LOCKING_RANGE_NON_GLOBAL 0x03
+/*
+ * User IDs used in the TCG storage SSCs
+ * Derived from: TCG_Storage_Architecture_Core_Spec_v2.01_r1.00
+ * Section: 6.3 Assigned UIDs
+ */
+#define OPAL_UID_LENGTH 8
+#define OPAL_METHOD_LENGTH 8
+#define OPAL_MSID_KEYLEN 15
+#define OPAL_UID_LENGTH_HALF 4
+
+/* Enum to index OPALUID array */
+enum opal_uid {
+	/* users */
+	OPAL_SMUID_UID,
+	OPAL_THISSP_UID,
+	OPAL_ADMINSP_UID,
+	OPAL_LOCKINGSP_UID,
+	OPAL_ENTERPRISE_LOCKINGSP_UID,
+	OPAL_ANYBODY_UID,
+	OPAL_SID_UID,
+	OPAL_ADMIN1_UID,
+	OPAL_USER1_UID,
+	OPAL_USER2_UID,
+	OPAL_PSID_UID,
+	OPAL_ENTERPRISE_BANDMASTER0_UID,
+	OPAL_ENTERPRISE_ERASEMASTER_UID,
+	/* tables */
+	OPAL_LOCKINGRANGE_GLOBAL,
+	OPAL_LOCKINGRANGE_ACE_RDLOCKED,
+	OPAL_LOCKINGRANGE_ACE_WRLOCKED,
+	OPAL_MBRCONTROL,
+	OPAL_MBR,
+	OPAL_AUTHORITY_TABLE,
+	OPAL_C_PIN_TABLE,
+	OPAL_LOCKING_INFO_TABLE,
+	OPAL_ENTERPRISE_LOCKING_INFO_TABLE,
+	/* C_PIN_TABLE object ID's */
+	OPAL_C_PIN_MSID,
+	OPAL_C_PIN_SID,
+	OPAL_C_PIN_ADMIN1,
+	/* half UID's (only first 4 bytes used) */
+	OPAL_HALF_UID_AUTHORITY_OBJ_REF,
+	OPAL_HALF_UID_BOOLEAN_ACE,
+	/* omitted optional parameter */
+	OPAL_UID_HEXFF,
+};
+
+#define OPAL_METHOD_LENGTH 8
+
+/* Enum for indexing the OPALMETHOD array */
+enum opal_method {
+	OPAL_PROPERTIES,
+	OPAL_STARTSESSION,
+	OPAL_REVERT,
+	OPAL_ACTIVATE,
+	OPAL_EGET,
+	OPAL_ESET,
+	OPAL_NEXT,
+	OPAL_EAUTHENTICATE,
+	OPAL_GETACL,
+	OPAL_GENKEY,
+	OPAL_REVERTSP,
+	OPAL_GET,
+	OPAL_SET,
+	OPAL_AUTHENTICATE,
+	OPAL_RANDOM,
+	OPAL_ERASE,
+};
+
+enum opal_token {
+	/* Boolean */
+	OPAL_TRUE = 0x01,
+	OPAL_FALSE = 0x00,
+	OPAL_BOOLEAN_EXPR = 0x03,
+	/* cellblocks */
+	OPAL_TABLE = 0x00,
+	OPAL_STARTROW = 0x01,
+	OPAL_ENDROW = 0x02,
+	OPAL_STARTCOLUMN = 0x03,
+	OPAL_ENDCOLUMN = 0x04,
+	OPAL_VALUES = 0x01,
+	/* authority table */
+	OPAL_PIN = 0x03,
+	/* locking tokens */
+	OPAL_RANGESTART = 0x03,
+	OPAL_RANGELENGTH = 0x04,
+	OPAL_READLOCKENABLED = 0x05,
+	OPAL_WRITELOCKENABLED = 0x06,
+	OPAL_READLOCKED = 0x07,
+	OPAL_WRITELOCKED = 0x08,
+	OPAL_ACTIVEKEY = 0x0A,
+	/* locking info table */
+	OPAL_MAXRANGES = 0x04,
+	 /* mbr control */
+	OPAL_MBRENABLE = 0x01,
+	OPAL_MBRDONE = 0x02,
+	/* properties */
+	OPAL_HOSTPROPERTIES = 0x00,
+	/* atoms */
+	OPAL_STARTLIST = 0xf0,
+	OPAL_ENDLIST = 0xf1,
+	OPAL_STARTNAME = 0xf2,
+	OPAL_ENDNAME = 0xf3,
+	OPAL_CALL = 0xf8,
+	OPAL_ENDOFDATA = 0xf9,
+	OPAL_ENDOFSESSION = 0xfa,
+	OPAL_STARTTRANSACTON = 0xfb,
+	OPAL_ENDTRANSACTON = 0xfC,
+	OPAL_EMPTYATOM = 0xff,
+	OPAL_WHERE = 0x00,
+};
+
+/* Locking state for a locking range */
+enum opal_lockingstate {
+	OPAL_LOCKING_READWRITE = 0x01,
+	OPAL_LOCKING_READONLY = 0x02,
+	OPAL_LOCKING_LOCKED = 0x03,
+};
+
+/* Packets derived from:
+ * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00
+ * Secion: 3.2.3 ComPackets, Packets & Subpackets
+ */
+
+/* Comm Packet (header) for transmissions. */
+struct opal_compacket {
+	__be32 reserved0;
+	u8 extendedComID[4];
+	__be32 outstandingData;
+	__be32 minTransfer;
+	__be32 length;
+};
+
+/* Packet structure. */
+struct opal_packet {
+	__be32 tsn;
+	__be32 hsn;
+	__be32 seq_number;
+	__be16 reserved0;
+	__be16 ack_type;
+	__be32 acknowledgment;
+	__be32 length;
+};
+
+/* Data sub packet header */
+struct opal_data_subpacket {
+	u8 reserved0[6];
+	__be16 kind;
+	__be32 length;
+};
+
+/* header of a response */
+struct opal_header {
+	struct opal_compacket cp;
+	struct opal_packet pkt;
+	struct opal_data_subpacket subpkt;
+};
+
+#define FC_TPER       0x0001
+#define FC_LOCKING    0x0002
+#define FC_GEOMETRY   0x0003
+#define FC_ENTERPRISE 0x0100
+#define FC_DATASTORE  0x0202
+#define FC_SINGLEUSER 0x0201
+#define FC_OPALV100   0x0200
+#define FC_OPALV200   0x0203
+
+/*
+ * The Discovery 0 Header. As defined in
+ * Opal SSC Documentation
+ * Section: 3.3.5 Capability Discovery
+ */
+struct d0_header {
+	__be32 length; /* the length of the header 48 in 2.00.100 */
+	__be32 revision; /**< revision of the header 1 in 2.00.100 */
+	__be32 reserved01;
+	__be32 reserved02;
+	/*
+	 * the remainder of the structure is vendor specific and will not be
+	 * addressed now
+	 */
+	u8 ignored[32];
+};
+
+/*
+ * TPer Feature Descriptor. Contains flags indicating support for the
+ * TPer features described in the OPAL specification. The names match the
+ * OPAL terminology
+ *
+ * code == 0x001 in 2.00.100
+ */
+struct d0_tper_features {
+	/*
+	 * supported_features bits:
+	 * bit 7: reserved
+	 * bit 6: com ID management
+	 * bit 5: reserved
+	 * bit 4: streaming support
+	 * bit 3: buffer management
+	 * bit 2: ACK/NACK
+	 * bit 1: async
+	 * bit 0: sync
+	 */
+	u8 supported_features;
+	/*
+	 * bytes 5 through 15 are reserved, but we represent the first 3 as
+	 * u8 to keep the other two 32bits integers aligned.
+	 */
+	u8 reserved01[3];
+	__be32 reserved02;
+	__be32 reserved03;
+};
+
+/*
+ * Locking Feature Descriptor. Contains flags indicating support for the
+ * locking features described in the OPAL specification. The names match the
+ * OPAL terminology
+ *
+ * code == 0x0002 in 2.00.100
+ */
+struct d0_locking_features {
+	/*
+	 * supported_features bits:
+	 * bits 6-7: reserved
+	 * bit 5: MBR done
+	 * bit 4: MBR enabled
+	 * bit 3: media encryption
+	 * bit 2: locked
+	 * bit 1: locking enabled
+	 * bit 0: locking supported
+	 */
+	u8 supported_features;
+	/*
+	 * bytes 5 through 15 are reserved, but we represent the first 3 as
+	 * u8 to keep the other two 32bits integers aligned.
+	 */
+	u8 reserved01[3];
+	__be32 reserved02;
+	__be32 reserved03;
+};
+
+/*
+ * Geometry Feature Descriptor. Contains flags indicating support for the
+ * geometry features described in the OPAL specification. The names match the
+ * OPAL terminology
+ *
+ * code == 0x0003 in 2.00.100
+ */
+struct d0_geometry_features {
+	/*
+	 * skip 32 bits from header, needed to align the struct to 64 bits.
+	 */
+	u8 header[4];
+	/*
+	 * reserved01:
+	 * bits 1-6: reserved
+	 * bit 0: align
+	 */
+	u8 reserved01;
+	u8 reserved02[7];
+	__be32 logical_block_size;
+	__be64 alignment_granularity;
+	__be64 lowest_aligned_lba;
+};
+
+/*
+ * Enterprise SSC Feature
+ *
+ * code == 0x0100
+ */
+struct d0_enterprise_ssc {
+	__be16 baseComID;
+	__be16 numComIDs;
+	/* range_crossing:
+	 * bits 1-6: reserved
+	 * bit 0: range crossing
+	 */
+	u8 range_crossing;
+	u8 reserved01;
+	__be16 reserved02;
+	__be32 reserved03;
+	__be32 reserved04;
+};
+
+/*
+ * Opal V1 feature
+ *
+ * code == 0x0200
+ */
+struct d0_opal_v100 {
+	__be16 baseComID;
+	__be16 numComIDs;
+};
+
+/*
+ * Single User Mode feature
+ *
+ * code == 0x0201
+ */
+struct d0_single_user_mode {
+	__be32 num_locking_objects;
+	/* reserved01:
+	 * bit 0: any
+	 * bit 1: all
+	 * bit 2: policy
+	 * bits 3-7: reserved
+	 */
+	u8 reserved01;
+	u8 reserved02;
+	__be16 reserved03;
+	__be32 reserved04;
+};
+
+/*
+ * Additonal Datastores feature
+ *
+ * code == 0x0202
+ */
+struct d0_datastore_table {
+	__be16 reserved01;
+	__be16 max_tables;
+	__be32 max_size_tables;
+	__be32 table_size_alignment;
+};
+
+/*
+ * OPAL 2.0 feature
+ *
+ * code == 0x0203
+ */
+struct d0_opal_v200 {
+	__be16 baseComID;
+	__be16 numComIDs;
+	/* range_crossing:
+	 * bits 1-6: reserved
+	 * bit 0: range crossing
+	 */
+	u8 range_crossing;
+	/* num_locking_admin_auth:
+	 * not aligned to 16 bits, so use two u8.
+	 * stored in big endian:
+	 * 0: MSB
+	 * 1: LSB
+	 */
+	u8 num_locking_admin_auth[2];
+	/* num_locking_user_auth:
+	 * not aligned to 16 bits, so use two u8.
+	 * stored in big endian:
+	 * 0: MSB
+	 * 1: LSB
+	 */
+	u8 num_locking_user_auth[2];
+	u8 initialPIN;
+	u8 revertedPIN;
+	u8 reserved01;
+	__be32 reserved02;
+};
+
+/* Union of features used to parse the discovery 0 response */
+struct d0_features {
+	__be16 code;
+	/*
+	 * r_version bits:
+	 * bits 4-7: version
+	 * bits 0-3: reserved
+	 */
+	u8 r_version;
+	u8 length;
+	u8 features[];
+};
+
+#endif /* _OPAL_PROTO_H */
diff --git a/block/sed-opal.c b/block/sed-opal.c
new file mode 100644
index 000000000000..bf1406e5159b
--- /dev/null
+++ b/block/sed-opal.c
@@ -0,0 +1,2448 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Authors:
+ *    Scott  Bauer      <scott.bauer@intel.com>
+ *    Rafael Antognolli <rafael.antognolli@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":OPAL: " fmt
+
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/genhd.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <uapi/linux/sed-opal.h>
+#include <linux/sed-opal.h>
+#include <linux/string.h>
+#include <linux/kdev_t.h>
+
+#include "opal_proto.h"
+
+static const u8 opaluid[][OPAL_UID_LENGTH] = {
+	/* users */
+	[OPAL_SMUID_UID] =
+		{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff },
+	[OPAL_THISSP_UID] =
+		{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 },
+	[OPAL_ADMINSP_UID] =
+		{ 0x00, 0x00, 0x02, 0x05, 0x00, 0x00, 0x00, 0x01 },
+	[OPAL_LOCKINGSP_UID] =
+		{ 0x00, 0x00, 0x02, 0x05, 0x00, 0x00, 0x00, 0x02 },
+	[OPAL_ENTERPRISE_LOCKINGSP_UID] =
+		{ 0x00, 0x00, 0x02, 0x05, 0x00, 0x01, 0x00, 0x01 },
+	[OPAL_ANYBODY_UID] =
+		{ 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x01 },
+	[OPAL_SID_UID] =
+		{ 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x06 },
+	[OPAL_ADMIN1_UID] =
+		{ 0x00, 0x00, 0x00, 0x09, 0x00, 0x01, 0x00, 0x01 },
+	[OPAL_USER1_UID] =
+		{ 0x00, 0x00, 0x00, 0x09, 0x00, 0x03, 0x00, 0x01 },
+	[OPAL_USER2_UID] =
+		{ 0x00, 0x00, 0x00, 0x09, 0x00, 0x03, 0x00, 0x02 },
+	[OPAL_PSID_UID] =
+		{ 0x00, 0x00, 0x00, 0x09, 0x00, 0x01, 0xff, 0x01 },
+	[OPAL_ENTERPRISE_BANDMASTER0_UID] =
+		{ 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x80, 0x01 },
+	[OPAL_ENTERPRISE_ERASEMASTER_UID] =
+		{ 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x84, 0x01 },
+
+	/* tables */
+
+	[OPAL_LOCKINGRANGE_GLOBAL] =
+		{ 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x01 },
+	[OPAL_LOCKINGRANGE_ACE_RDLOCKED] =
+		{ 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xE0, 0x01 },
+	[OPAL_LOCKINGRANGE_ACE_WRLOCKED] =
+		{ 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xE8, 0x01 },
+	[OPAL_MBRCONTROL] =
+		{ 0x00, 0x00, 0x08, 0x03, 0x00, 0x00, 0x00, 0x01 },
+	[OPAL_MBR] =
+		{ 0x00, 0x00, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00 },
+	[OPAL_AUTHORITY_TABLE] =
+		{ 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x00},
+	[OPAL_C_PIN_TABLE] =
+		{ 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x00},
+	[OPAL_LOCKING_INFO_TABLE] =
+		{ 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x01 },
+	[OPAL_ENTERPRISE_LOCKING_INFO_TABLE] =
+		{ 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x00 },
+
+	/* C_PIN_TABLE object ID's */
+
+        [OPAL_C_PIN_MSID] =
+		{ 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x84, 0x02},
+	[OPAL_C_PIN_SID] =
+		{ 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x01},
+	[OPAL_C_PIN_ADMIN1] =
+		{ 0x00, 0x00, 0x00, 0x0B, 0x00, 0x01, 0x00, 0x01},
+
+	/* half UID's (only first 4 bytes used) */
+
+	[OPAL_HALF_UID_AUTHORITY_OBJ_REF] =
+		{ 0x00, 0x00, 0x0C, 0x05, 0xff, 0xff, 0xff, 0xff },
+	[OPAL_HALF_UID_BOOLEAN_ACE] =
+		{ 0x00, 0x00, 0x04, 0x0E, 0xff, 0xff, 0xff, 0xff },
+
+	/* special value for omitted optional parameter */
+	[OPAL_UID_HEXFF] =
+		{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+};
+
+/*
+ * TCG Storage SSC Methods.
+ * Derived from: TCG_Storage_Architecture_Core_Spec_v2.01_r1.00
+ * Section: 6.3 Assigned UIDs
+ */
+static const u8 opalmethod[][OPAL_UID_LENGTH] = {
+	[OPAL_PROPERTIES] =
+		{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x01 },
+	[OPAL_STARTSESSION] =
+		{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x02 },
+	[OPAL_REVERT] =
+		{ 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x02, 0x02 },
+	[OPAL_ACTIVATE] =
+		{ 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x02, 0x03 },
+	[OPAL_EGET] =
+		{ 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06 },
+	[OPAL_ESET] =
+		{ 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x07 },
+	[OPAL_NEXT] =
+		{ 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x08 },
+	[OPAL_EAUTHENTICATE] =
+		{ 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x0c },
+	[OPAL_GETACL] =
+		{ 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x0d },
+	[OPAL_GENKEY] =
+		{ 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x10 },
+	[OPAL_REVERTSP] =
+		{ 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x11 },
+	[OPAL_GET] =
+		{ 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x16 },
+	[OPAL_SET] =
+		{ 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x17 },
+	[OPAL_AUTHENTICATE] =
+		{ 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x1c },
+	[OPAL_RANDOM] =
+		{ 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x06, 0x01 },
+	[OPAL_ERASE] =
+		{ 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x08, 0x03 },
+};
+
+typedef int (cont_fn)(struct opal_dev *dev);
+
+static int end_opal_session_error(struct opal_dev *dev);
+
+struct opal_suspend_data {
+	struct opal_lock_unlock unlk;
+	u8 lr;
+	struct list_head node;
+};
+
+/*
+ * Derived from:
+ * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00
+ * Section: 5.1.5 Method Status Codes
+ */
+static const char * const opal_errors[] = {
+	"Success",
+	"Not Authorized",
+	"Unknown Error",
+	"SP Busy",
+	"SP Failed",
+	"SP Disabled",
+	"SP Frozen",
+	"No Sessions Available",
+	"Uniqueness Conflict",
+	"Insufficient Space",
+	"Insufficient Rows",
+	"Invalid Function",
+	"Invalid Parameter",
+	"Invalid Reference",
+	"Unknown Error",
+	"TPER Malfunction",
+	"Transaction Failure",
+	"Response Overflow",
+	"Authority Locked Out",
+};
+
+static const char *opal_error_to_human(int error)
+{
+	if (error == 0x3f)
+		return "Failed";
+
+	if (error >= ARRAY_SIZE(opal_errors) || error < 0)
+		return "Unknown Error";
+
+	return opal_errors[error];
+}
+
+static void print_buffer(const u8 *ptr, u32 length)
+{
+#ifdef DEBUG
+	print_hex_dump_bytes("OPAL: ", DUMP_PREFIX_OFFSET, ptr, length);
+	pr_debug("\n");
+#endif
+}
+
+static bool check_tper(const void *data)
+{
+	const struct d0_tper_features *tper = data;
+	u8 flags = tper->supported_features;
+
+	if (!(flags & TPER_SYNC_SUPPORTED)) {
+		pr_err("TPer sync not supported. flags = %d\n",
+		       tper->supported_features);
+		return false;
+	}
+
+	return true;
+}
+
+static bool check_sum(const void *data)
+{
+	const struct d0_single_user_mode *sum = data;
+	u32 nlo = be32_to_cpu(sum->num_locking_objects);
+
+	if (nlo == 0) {
+		pr_err("Need at least one locking object.\n");
+		return false;
+	}
+
+	pr_debug("Number of locking objects: %d\n", nlo);
+
+	return true;
+}
+
+static u16 get_comid_v100(const void *data)
+{
+	const struct d0_opal_v100 *v100 = data;
+
+	return be16_to_cpu(v100->baseComID);
+}
+
+static u16 get_comid_v200(const void *data)
+{
+	const struct d0_opal_v200 *v200 = data;
+
+	return be16_to_cpu(v200->baseComID);
+}
+
+static int opal_send_cmd(struct opal_dev *dev)
+{
+	return dev->send_recv(dev, dev->comid, TCG_SECP_01,
+			      dev->cmd, IO_BUFFER_LENGTH,
+			      true);
+}
+
+static int opal_recv_cmd(struct opal_dev *dev)
+{
+	return dev->send_recv(dev, dev->comid, TCG_SECP_01,
+			      dev->resp, IO_BUFFER_LENGTH,
+			      false);
+}
+
+static int opal_recv_check(struct opal_dev *dev)
+{
+	size_t buflen = IO_BUFFER_LENGTH;
+	void *buffer = dev->resp;
+	struct opal_header *hdr = buffer;
+	int ret;
+
+	do {
+		pr_debug("Sent OPAL command: outstanding=%d, minTransfer=%d\n",
+			 hdr->cp.outstandingData,
+			 hdr->cp.minTransfer);
+
+		if (hdr->cp.outstandingData == 0 ||
+		    hdr->cp.minTransfer != 0)
+			return 0;
+
+		memset(buffer, 0, buflen);
+		ret = opal_recv_cmd(dev);
+	} while (!ret);
+
+	return ret;
+}
+
+static int opal_send_recv(struct opal_dev *dev, cont_fn *cont)
+{
+	int ret;
+
+	ret = opal_send_cmd(dev);
+	if (ret)
+		return ret;
+	ret = opal_recv_cmd(dev);
+	if (ret)
+		return ret;
+	ret = opal_recv_check(dev);
+	if (ret)
+		return ret;
+	return cont(dev);
+}
+
+static void check_geometry(struct opal_dev *dev, const void *data)
+{
+	const struct d0_geometry_features *geo = data;
+
+	dev->align = geo->alignment_granularity;
+	dev->lowest_lba = geo->lowest_aligned_lba;
+}
+
+static int next(struct opal_dev *dev)
+{
+	opal_step func;
+	int error = 0;
+
+	do {
+		func = dev->funcs[dev->state];
+		if (!func)
+			break;
+
+		error = func(dev);
+		if (error) {
+			pr_err("Error on step function: %d with error %d: %s\n",
+			       dev->state, error,
+			       opal_error_to_human(error));
+
+			/* For each OPAL command we do a discovery0 then we
+			 * start some sort of session.
+			 * If we haven't passed state 1 then there was an error
+			 * on discovery0 or during the attempt to start a
+			 * session. Therefore we shouldn't attempt to terminate
+			 * a session, as one has not yet been created.
+			 */
+			if (dev->state > 1)
+				return end_opal_session_error(dev);
+		}
+		dev->state++;
+	} while (!error);
+
+	return error;
+}
+
+static int opal_discovery0_end(struct opal_dev *dev)
+{
+	bool found_com_id = false, supported = true, single_user = false;
+	const struct d0_header *hdr = (struct d0_header *)dev->resp;
+	const u8 *epos = dev->resp, *cpos = dev->resp;
+	u16 comid = 0;
+
+	print_buffer(dev->resp, be32_to_cpu(hdr->length));
+
+	epos += be32_to_cpu(hdr->length); /* end of buffer */
+	cpos += sizeof(*hdr); /* current position on buffer */
+
+	while (cpos < epos && supported) {
+		const struct d0_features *body =
+			(const struct d0_features *)cpos;
+
+		switch (be16_to_cpu(body->code)) {
+		case FC_TPER:
+			supported = check_tper(body->features);
+			break;
+		case FC_SINGLEUSER:
+			single_user = check_sum(body->features);
+			break;
+		case FC_GEOMETRY:
+			check_geometry(dev, body);
+			break;
+		case FC_LOCKING:
+		case FC_ENTERPRISE:
+		case FC_DATASTORE:
+			/* some ignored properties */
+			pr_debug("Found OPAL feature description: %d\n",
+				 be16_to_cpu(body->code));
+			break;
+		case FC_OPALV100:
+			comid = get_comid_v100(body->features);
+			found_com_id = true;
+			break;
+		case FC_OPALV200:
+			comid = get_comid_v200(body->features);
+			found_com_id = true;
+			break;
+		case 0xbfff ... 0xffff:
+			/* vendor specific, just ignore */
+			break;
+		default:
+			pr_debug("OPAL Unknown feature: %d\n",
+				 be16_to_cpu(body->code));
+
+		}
+		cpos += body->length + 4;
+	}
+
+	if (!supported) {
+		pr_err("This device is not Opal enabled. Not Supported!\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (!single_user)
+		pr_warn("Device doesn't support single user mode\n");
+
+
+	if (!found_com_id) {
+		pr_warn("Could not find OPAL comid for device. Returning early\n");
+		return -EOPNOTSUPP;;
+	}
+
+	dev->comid = comid;
+
+	return 0;
+}
+
+static int opal_discovery0(struct opal_dev *dev)
+{
+	int ret;
+
+	memset(dev->resp, 0, IO_BUFFER_LENGTH);
+	dev->comid = OPAL_DISCOVERY_COMID;
+	ret = opal_recv_cmd(dev);
+	if (ret)
+		return ret;
+	return opal_discovery0_end(dev);
+}
+
+static void add_token_u8(int *err, struct opal_dev *cmd, u8 tok)
+{
+	if (*err)
+		return;
+	if (cmd->pos >= IO_BUFFER_LENGTH - 1) {
+		pr_err("Error adding u8: end of buffer.\n");
+		*err = -ERANGE;
+		return;
+	}
+	cmd->cmd[cmd->pos++] = tok;
+}
+
+static void add_short_atom_header(struct opal_dev *cmd, bool bytestring,
+				  bool has_sign, int len)
+{
+	u8 atom;
+	int err = 0;
+
+	atom = SHORT_ATOM_ID;
+	atom |= bytestring ? SHORT_ATOM_BYTESTRING : 0;
+	atom |= has_sign ? SHORT_ATOM_SIGNED : 0;
+	atom |= len & SHORT_ATOM_LEN_MASK;
+
+	add_token_u8(&err, cmd, atom);
+}
+
+static void add_medium_atom_header(struct opal_dev *cmd, bool bytestring,
+				   bool has_sign, int len)
+{
+	u8 header0;
+
+	header0 = MEDIUM_ATOM_ID;
+	header0 |= bytestring ? MEDIUM_ATOM_BYTESTRING : 0;
+	header0 |= has_sign ? MEDIUM_ATOM_SIGNED : 0;
+	header0 |= (len >> 8) & MEDIUM_ATOM_LEN_MASK;
+	cmd->cmd[cmd->pos++] = header0;
+	cmd->cmd[cmd->pos++] = len;
+}
+
+static void add_token_u64(int *err, struct opal_dev *cmd, u64 number)
+{
+
+	size_t len;
+	int msb;
+	u8 n;
+
+	if (!(number & ~TINY_ATOM_DATA_MASK)) {
+		add_token_u8(err, cmd, number);
+		return;
+	}
+
+	msb = fls(number);
+	len = DIV_ROUND_UP(msb, 4);
+
+	if (cmd->pos >= IO_BUFFER_LENGTH - len - 1) {
+		pr_err("Error adding u64: end of buffer.\n");
+		*err = -ERANGE;
+		return;
+	}
+	add_short_atom_header(cmd, false, false, len);
+	while (len--) {
+		n = number >> (len * 8);
+		add_token_u8(err, cmd, n);
+	}
+}
+
+static void add_token_bytestring(int *err, struct opal_dev *cmd,
+				 const u8 *bytestring, size_t len)
+{
+	size_t header_len = 1;
+	bool is_short_atom = true;
+
+	if (*err)
+		return;
+
+	if (len & ~SHORT_ATOM_LEN_MASK) {
+		header_len = 2;
+		is_short_atom = false;
+	}
+
+	if (len >= IO_BUFFER_LENGTH - cmd->pos - header_len) {
+		pr_err("Error adding bytestring: end of buffer.\n");
+		*err = -ERANGE;
+		return;
+	}
+
+	if (is_short_atom)
+		add_short_atom_header(cmd, true, false, len);
+	else
+		add_medium_atom_header(cmd, true, false, len);
+
+	memcpy(&cmd->cmd[cmd->pos], bytestring, len);
+	cmd->pos += len;
+
+}
+
+static int build_locking_range(u8 *buffer, size_t length, u8 lr)
+{
+	if (length > OPAL_UID_LENGTH) {
+		pr_err("Can't build locking range. Length OOB\n");
+		return -ERANGE;
+	}
+
+	memcpy(buffer, opaluid[OPAL_LOCKINGRANGE_GLOBAL], OPAL_UID_LENGTH);
+
+	if (lr == 0)
+		return 0;
+	buffer[5] = LOCKING_RANGE_NON_GLOBAL;
+	buffer[7] = lr;
+
+	return 0;
+}
+
+static int build_locking_user(u8 *buffer, size_t length, u8 lr)
+{
+	if (length > OPAL_UID_LENGTH) {
+		pr_err("Can't build locking range user, Length OOB\n");
+		return -ERANGE;
+	}
+
+	memcpy(buffer, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH);
+
+	buffer[7] = lr + 1;
+
+	return 0;
+}
+
+static void set_comid(struct opal_dev *cmd, u16 comid)
+{
+	struct opal_header *hdr = (struct opal_header *)cmd->cmd;
+
+	hdr->cp.extendedComID[0] = comid >> 8;
+	hdr->cp.extendedComID[1] = comid;
+	hdr->cp.extendedComID[2] = 0;
+	hdr->cp.extendedComID[3] = 0;
+}
+
+static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn)
+{
+	struct opal_header *hdr;
+	int err = 0;
+
+	add_token_u8(&err, cmd, OPAL_ENDOFDATA);
+	add_token_u8(&err, cmd, OPAL_STARTLIST);
+	add_token_u8(&err, cmd, 0);
+	add_token_u8(&err, cmd, 0);
+	add_token_u8(&err, cmd, 0);
+	add_token_u8(&err, cmd, OPAL_ENDLIST);
+
+	if (err) {
+		pr_err("Error finalizing command.\n");
+		return -EFAULT;
+	}
+
+	hdr = (struct opal_header *) cmd->cmd;
+
+	hdr->pkt.tsn = cpu_to_be32(tsn);
+	hdr->pkt.hsn = cpu_to_be32(hsn);
+
+	hdr->subpkt.length = cpu_to_be32(cmd->pos - sizeof(*hdr));
+	while (cmd->pos % 4) {
+		if (cmd->pos >= IO_BUFFER_LENGTH) {
+			pr_err("Error: Buffer overrun\n");
+			return -ERANGE;
+		}
+		cmd->cmd[cmd->pos++] = 0;
+	}
+	hdr->pkt.length = cpu_to_be32(cmd->pos - sizeof(hdr->cp) -
+				      sizeof(hdr->pkt));
+	hdr->cp.length = cpu_to_be32(cmd->pos - sizeof(hdr->cp));
+
+	return 0;
+}
+
+static enum opal_response_token token_type(const struct parsed_resp *resp,
+					   int n)
+{
+	const struct opal_resp_tok *tok;
+
+	if (n >= resp->num) {
+		pr_err("Token number doesn't exist: %d, resp: %d\n",
+		       n, resp->num);
+		return OPAL_DTA_TOKENID_INVALID;
+	}
+
+	tok = &resp->toks[n];
+	if (tok->len == 0) {
+		pr_err("Token length must be non-zero\n");
+		return OPAL_DTA_TOKENID_INVALID;
+	}
+
+	return tok->type;
+}
+
+/*
+ * This function returns 0 in case of invalid token. One should call
+ * token_type() first to find out if the token is valid or not.
+ */
+static enum opal_token response_get_token(const struct parsed_resp *resp,
+					  int n)
+{
+	const struct opal_resp_tok *tok;
+
+	if (n >= resp->num) {
+		pr_err("Token number doesn't exist: %d, resp: %d\n",
+		       n, resp->num);
+		return 0;
+	}
+
+	tok = &resp->toks[n];
+	if (tok->len == 0) {
+		pr_err("Token length must be non-zero\n");
+		return 0;
+	}
+
+	return tok->pos[0];
+}
+
+static size_t response_parse_tiny(struct opal_resp_tok *tok,
+				  const u8 *pos)
+{
+	tok->pos = pos;
+	tok->len = 1;
+	tok->width = OPAL_WIDTH_TINY;
+
+	if (pos[0] & TINY_ATOM_SIGNED) {
+		tok->type = OPAL_DTA_TOKENID_SINT;
+	} else {
+		tok->type = OPAL_DTA_TOKENID_UINT;
+		tok->stored.u = pos[0] & 0x3f;
+	}
+
+	return tok->len;
+}
+
+static size_t response_parse_short(struct opal_resp_tok *tok,
+				   const u8 *pos)
+{
+	tok->pos = pos;
+	tok->len = (pos[0] & SHORT_ATOM_LEN_MASK) + 1;
+	tok->width = OPAL_WIDTH_SHORT;
+
+	if (pos[0] & SHORT_ATOM_BYTESTRING) {
+		tok->type = OPAL_DTA_TOKENID_BYTESTRING;
+	} else if (pos[0] & SHORT_ATOM_SIGNED) {
+		tok->type = OPAL_DTA_TOKENID_SINT;
+	} else {
+		u64 u_integer = 0;
+		int i, b = 0;
+
+		tok->type = OPAL_DTA_TOKENID_UINT;
+		if (tok->len > 9) {
+			pr_warn("uint64 with more than 8 bytes\n");
+			return -EINVAL;
+		}
+		for (i = tok->len - 1; i > 0; i--) {
+			u_integer |= ((u64)pos[i] << (8 * b));
+			b++;
+		}
+		tok->stored.u = u_integer;
+	}
+
+	return tok->len;
+}
+
+static size_t response_parse_medium(struct opal_resp_tok *tok,
+				    const u8 *pos)
+{
+	tok->pos = pos;
+	tok->len = (((pos[0] & MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2;
+	tok->width = OPAL_WIDTH_MEDIUM;
+
+	if (pos[0] & MEDIUM_ATOM_BYTESTRING)
+		tok->type = OPAL_DTA_TOKENID_BYTESTRING;
+	else if (pos[0] & MEDIUM_ATOM_SIGNED)
+		tok->type = OPAL_DTA_TOKENID_SINT;
+	else
+		tok->type = OPAL_DTA_TOKENID_UINT;
+
+	return tok->len;
+}
+
+static size_t response_parse_long(struct opal_resp_tok *tok,
+				  const u8 *pos)
+{
+	tok->pos = pos;
+	tok->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4;
+	tok->width = OPAL_WIDTH_LONG;
+
+	if (pos[0] & LONG_ATOM_BYTESTRING)
+		tok->type = OPAL_DTA_TOKENID_BYTESTRING;
+	else if (pos[0] & LONG_ATOM_SIGNED)
+		tok->type = OPAL_DTA_TOKENID_SINT;
+	else
+		tok->type = OPAL_DTA_TOKENID_UINT;
+
+	return tok->len;
+}
+
+static size_t response_parse_token(struct opal_resp_tok *tok,
+				   const u8 *pos)
+{
+	tok->pos = pos;
+	tok->len = 1;
+	tok->type = OPAL_DTA_TOKENID_TOKEN;
+	tok->width = OPAL_WIDTH_TOKEN;
+
+	return tok->len;
+}
+
+static int response_parse(const u8 *buf, size_t length,
+			  struct parsed_resp *resp)
+{
+	const struct opal_header *hdr;
+	struct opal_resp_tok *iter;
+	int num_entries = 0;
+	int total;
+	size_t token_length;
+	const u8 *pos;
+
+	if (!buf)
+		return -EFAULT;
+
+	if (!resp)
+		return -EFAULT;
+
+	hdr = (struct opal_header *)buf;
+	pos = buf;
+	pos += sizeof(*hdr);
+
+	pr_debug("Response size: cp: %d, pkt: %d, subpkt: %d\n",
+		 be32_to_cpu(hdr->cp.length),
+		 be32_to_cpu(hdr->pkt.length),
+		 be32_to_cpu(hdr->subpkt.length));
+
+	if (hdr->cp.length == 0 || hdr->pkt.length == 0 ||
+	    hdr->subpkt.length == 0) {
+		pr_err("Bad header length. cp: %d, pkt: %d, subpkt: %d\n",
+		       be32_to_cpu(hdr->cp.length),
+		       be32_to_cpu(hdr->pkt.length),
+		       be32_to_cpu(hdr->subpkt.length));
+		print_buffer(pos, sizeof(*hdr));
+		return -EINVAL;
+	}
+
+	if (pos > buf + length)
+		return -EFAULT;
+
+	iter = resp->toks;
+	total = be32_to_cpu(hdr->subpkt.length);
+	print_buffer(pos, total);
+	while (total > 0) {
+		if (pos[0] <= TINY_ATOM_BYTE) /* tiny atom */
+			token_length = response_parse_tiny(iter, pos);
+		else if (pos[0] <= SHORT_ATOM_BYTE) /* short atom */
+			token_length = response_parse_short(iter, pos);
+		else if (pos[0] <= MEDIUM_ATOM_BYTE) /* medium atom */
+			token_length = response_parse_medium(iter, pos);
+		else if (pos[0] <= LONG_ATOM_BYTE) /* long atom */
+			token_length = response_parse_long(iter, pos);
+		else /* TOKEN */
+			token_length = response_parse_token(iter, pos);
+
+		if (token_length == -EINVAL)
+			return -EINVAL;
+
+		pos += token_length;
+		total -= token_length;
+		iter++;
+		num_entries++;
+	}
+
+	if (num_entries == 0) {
+		pr_err("Couldn't parse response.\n");
+		return -EINVAL;
+	}
+	resp->num = num_entries;
+
+	return 0;
+}
+
+static size_t response_get_string(const struct parsed_resp *resp, int n,
+				  const char **store)
+{
+	*store = NULL;
+	if (!resp) {
+		pr_err("Response is NULL\n");
+		return 0;
+	}
+
+	if (n > resp->num) {
+		pr_err("Response has %d tokens. Can't access %d\n",
+		       resp->num, n);
+		return 0;
+	}
+
+	if (resp->toks[n].type != OPAL_DTA_TOKENID_BYTESTRING) {
+		pr_err("Token is not a byte string!\n");
+		return 0;
+	}
+
+	*store = resp->toks[n].pos + 1;
+	return resp->toks[n].len - 1;
+}
+
+static u64 response_get_u64(const struct parsed_resp *resp, int n)
+{
+	if (!resp) {
+		pr_err("Response is NULL\n");
+		return 0;
+	}
+
+	if (n > resp->num) {
+		pr_err("Response has %d tokens. Can't access %d\n",
+		       resp->num, n);
+		return 0;
+	}
+
+	if (resp->toks[n].type != OPAL_DTA_TOKENID_UINT) {
+		pr_err("Token is not unsigned it: %d\n",
+		       resp->toks[n].type);
+		return 0;
+	}
+
+	if (!(resp->toks[n].width == OPAL_WIDTH_TINY ||
+	      resp->toks[n].width == OPAL_WIDTH_SHORT)) {
+		pr_err("Atom is not short or tiny: %d\n",
+		       resp->toks[n].width);
+		return 0;
+	}
+
+	return resp->toks[n].stored.u;
+}
+
+static u8 response_status(const struct parsed_resp *resp)
+{
+	if (token_type(resp, 0) == OPAL_DTA_TOKENID_TOKEN &&
+	    response_get_token(resp, 0) == OPAL_ENDOFSESSION) {
+		return 0;
+	}
+
+	if (resp->num < 5)
+		return DTAERROR_NO_METHOD_STATUS;
+
+	if (token_type(resp, resp->num - 1) != OPAL_DTA_TOKENID_TOKEN ||
+	    token_type(resp, resp->num - 5) != OPAL_DTA_TOKENID_TOKEN ||
+	    response_get_token(resp, resp->num - 1) != OPAL_ENDLIST ||
+	    response_get_token(resp, resp->num - 5) != OPAL_STARTLIST)
+		return DTAERROR_NO_METHOD_STATUS;
+
+	return response_get_u64(resp, resp->num - 4);
+}
+
+/* Parses and checks for errors */
+static int parse_and_check_status(struct opal_dev *dev)
+{
+	int error;
+
+	print_buffer(dev->cmd, dev->pos);
+
+	error = response_parse(dev->resp, IO_BUFFER_LENGTH, &dev->parsed);
+	if (error) {
+		pr_err("Couldn't parse response.\n");
+		return error;
+	}
+
+	return response_status(&dev->parsed);
+}
+
+static void clear_opal_cmd(struct opal_dev *dev)
+{
+	dev->pos = sizeof(struct opal_header);
+	memset(dev->cmd, 0, IO_BUFFER_LENGTH);
+}
+
+static int start_opal_session_cont(struct opal_dev *dev)
+{
+	u32 hsn, tsn;
+	int error = 0;
+
+	error = parse_and_check_status(dev);
+	if (error)
+		return error;
+
+	hsn = response_get_u64(&dev->parsed, 4);
+	tsn = response_get_u64(&dev->parsed, 5);
+
+	if (hsn == 0 && tsn == 0) {
+		pr_err("Couldn't authenticate session\n");
+		return -EPERM;
+	}
+
+	dev->hsn = hsn;
+	dev->tsn = tsn;
+	return 0;
+}
+
+static void add_suspend_info(struct opal_dev *dev,
+			     struct opal_suspend_data *sus)
+{
+	struct opal_suspend_data *iter;
+
+	list_for_each_entry(iter, &dev->unlk_lst, node) {
+		if (iter->lr == sus->lr) {
+			list_del(&iter->node);
+			kfree(iter);
+			break;
+		}
+	}
+	list_add_tail(&sus->node, &dev->unlk_lst);
+}
+
+static int end_session_cont(struct opal_dev *dev)
+{
+	dev->hsn = 0;
+	dev->tsn = 0;
+	return parse_and_check_status(dev);
+}
+
+static int finalize_and_send(struct opal_dev *dev, cont_fn cont)
+{
+	int ret;
+
+	ret = cmd_finalize(dev, dev->hsn, dev->tsn);
+	if (ret) {
+		pr_err("Error finalizing command buffer: %d\n", ret);
+		return ret;
+	}
+
+	print_buffer(dev->cmd, dev->pos);
+
+	return opal_send_recv(dev, cont);
+}
+
+static int gen_key(struct opal_dev *dev)
+{
+	const u8 *method;
+	u8 uid[OPAL_UID_LENGTH];
+	int err = 0;
+
+	clear_opal_cmd(dev);
+	set_comid(dev, dev->comid);
+
+	memcpy(uid, dev->prev_data, min(sizeof(uid), dev->prev_d_len));
+	method = opalmethod[OPAL_GENKEY];
+	kfree(dev->prev_data);
+	dev->prev_data = NULL;
+
+	add_token_u8(&err, dev, OPAL_CALL);
+	add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
+	add_token_bytestring(&err, dev, opalmethod[OPAL_GENKEY],
+			     OPAL_UID_LENGTH);
+	add_token_u8(&err, dev, OPAL_STARTLIST);
+	add_token_u8(&err, dev, OPAL_ENDLIST);
+
+	if (err) {
+		pr_err("Error building gen key command\n");
+		return err;
+
+	}
+	return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int get_active_key_cont(struct opal_dev *dev)
+{
+	const char *activekey;
+	size_t keylen;
+	int error = 0;
+
+	error = parse_and_check_status(dev);
+	if (error)
+		return error;
+	keylen = response_get_string(&dev->parsed, 4, &activekey);
+	if (!activekey) {
+		pr_err("%s: Couldn't extract the Activekey from the response\n",
+		       __func__);
+		return OPAL_INVAL_PARAM;
+	}
+	dev->prev_data = kmemdup(activekey, keylen, GFP_KERNEL);
+
+	if (!dev->prev_data)
+		return -ENOMEM;
+
+	dev->prev_d_len = keylen;
+
+	return 0;
+}
+
+static int get_active_key(struct opal_dev *dev)
+{
+	u8 uid[OPAL_UID_LENGTH];
+	int err = 0;
+	u8 *lr;
+
+	clear_opal_cmd(dev);
+	set_comid(dev, dev->comid);
+	lr = dev->func_data[dev->state];
+
+	err = build_locking_range(uid, sizeof(uid), *lr);
+	if (err)
+		return err;
+
+	err = 0;
+	add_token_u8(&err, dev, OPAL_CALL);
+	add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
+	add_token_bytestring(&err, dev, opalmethod[OPAL_GET], OPAL_UID_LENGTH);
+	add_token_u8(&err, dev, OPAL_STARTLIST);
+	add_token_u8(&err, dev, OPAL_STARTLIST);
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_u8(&err, dev, 3); /* startCloumn */
+	add_token_u8(&err, dev, 10); /* ActiveKey */
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_u8(&err, dev, 4); /* endColumn */
+	add_token_u8(&err, dev, 10); /* ActiveKey */
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+	add_token_u8(&err, dev, OPAL_ENDLIST);
+	add_token_u8(&err, dev, OPAL_ENDLIST);
+	if (err) {
+		pr_err("Error building get active key command\n");
+		return err;
+	}
+
+	return finalize_and_send(dev, get_active_key_cont);
+}
+
+static int generic_lr_enable_disable(struct opal_dev *dev,
+				     u8 *uid, bool rle, bool wle,
+				     bool rl, bool wl)
+{
+	int err = 0;
+
+	add_token_u8(&err, dev, OPAL_CALL);
+	add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
+	add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH);
+
+	add_token_u8(&err, dev, OPAL_STARTLIST);
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_u8(&err, dev, OPAL_VALUES);
+	add_token_u8(&err, dev, OPAL_STARTLIST);
+
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_u8(&err, dev, 5); /* ReadLockEnabled */
+	add_token_u8(&err, dev, rle);
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_u8(&err, dev, 6); /* WriteLockEnabled */
+	add_token_u8(&err, dev, wle);
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_u8(&err, dev, OPAL_READLOCKED);
+	add_token_u8(&err, dev, rl);
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_u8(&err, dev, OPAL_WRITELOCKED);
+	add_token_u8(&err, dev, wl);
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+
+	add_token_u8(&err, dev, OPAL_ENDLIST);
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+	add_token_u8(&err, dev, OPAL_ENDLIST);
+	return err;
+}
+
+static inline int enable_global_lr(struct opal_dev *dev, u8 *uid,
+				   struct opal_user_lr_setup *setup)
+{
+	int err;
+
+	err = generic_lr_enable_disable(dev, uid, !!setup->RLE, !!setup->WLE,
+					0, 0);
+	if (err)
+		pr_err("Failed to create enable global lr command\n");
+	return err;
+}
+
+static int setup_locking_range(struct opal_dev *dev)
+{
+	u8 uid[OPAL_UID_LENGTH];
+	struct opal_user_lr_setup *setup;
+	u8 lr;
+	int err = 0;
+
+	clear_opal_cmd(dev);
+	set_comid(dev, dev->comid);
+
+	setup = dev->func_data[dev->state];
+	lr = setup->session.opal_key.lr;
+	err = build_locking_range(uid, sizeof(uid), lr);
+	if (err)
+		return err;
+
+	if (lr == 0)
+		err = enable_global_lr(dev, uid, setup);
+	else {
+		add_token_u8(&err, dev, OPAL_CALL);
+		add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
+		add_token_bytestring(&err, dev, opalmethod[OPAL_SET],
+				     OPAL_UID_LENGTH);
+
+		add_token_u8(&err, dev, OPAL_STARTLIST);
+		add_token_u8(&err, dev, OPAL_STARTNAME);
+		add_token_u8(&err, dev, OPAL_VALUES);
+		add_token_u8(&err, dev, OPAL_STARTLIST);
+
+		add_token_u8(&err, dev, OPAL_STARTNAME);
+		add_token_u8(&err, dev, 3); /* Ranges Start */
+		add_token_u64(&err, dev, setup->range_start);
+		add_token_u8(&err, dev, OPAL_ENDNAME);
+
+		add_token_u8(&err, dev, OPAL_STARTNAME);
+		add_token_u8(&err, dev, 4); /* Ranges length */
+		add_token_u64(&err, dev, setup->range_length);
+		add_token_u8(&err, dev, OPAL_ENDNAME);
+
+		add_token_u8(&err, dev, OPAL_STARTNAME);
+		add_token_u8(&err, dev, 5); /*ReadLockEnabled */
+		add_token_u64(&err, dev, !!setup->RLE);
+		add_token_u8(&err, dev, OPAL_ENDNAME);
+
+		add_token_u8(&err, dev, OPAL_STARTNAME);
+		add_token_u8(&err, dev, 6); /*WriteLockEnabled*/
+		add_token_u64(&err, dev, !!setup->WLE);
+		add_token_u8(&err, dev, OPAL_ENDNAME);
+
+		add_token_u8(&err, dev, OPAL_ENDLIST);
+		add_token_u8(&err, dev, OPAL_ENDNAME);
+		add_token_u8(&err, dev, OPAL_ENDLIST);
+
+	}
+	if (err) {
+		pr_err("Error building Setup Locking range command.\n");
+		return err;
+
+	}
+
+	return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int start_generic_opal_session(struct opal_dev *dev,
+				      enum opal_uid auth,
+				      enum opal_uid sp_type,
+				      const char *key,
+				      u8 key_len)
+{
+	u32 hsn;
+	int err = 0;
+
+	if (key == NULL && auth != OPAL_ANYBODY_UID) {
+		pr_err("%s: Attempted to open ADMIN_SP Session without a Host" \
+		       "Challenge, and not as the Anybody UID\n", __func__);
+		return OPAL_INVAL_PARAM;
+	}
+
+	clear_opal_cmd(dev);
+
+	set_comid(dev, dev->comid);
+	hsn = GENERIC_HOST_SESSION_NUM;
+
+	add_token_u8(&err, dev, OPAL_CALL);
+	add_token_bytestring(&err, dev, opaluid[OPAL_SMUID_UID],
+			     OPAL_UID_LENGTH);
+	add_token_bytestring(&err, dev, opalmethod[OPAL_STARTSESSION],
+			     OPAL_UID_LENGTH);
+	add_token_u8(&err, dev, OPAL_STARTLIST);
+	add_token_u64(&err, dev, hsn);
+	add_token_bytestring(&err, dev, opaluid[sp_type], OPAL_UID_LENGTH);
+	add_token_u8(&err, dev, 1);
+
+	switch (auth) {
+	case OPAL_ANYBODY_UID:
+		add_token_u8(&err, dev, OPAL_ENDLIST);
+		break;
+	case OPAL_ADMIN1_UID:
+	case OPAL_SID_UID:
+		add_token_u8(&err, dev, OPAL_STARTNAME);
+		add_token_u8(&err, dev, 0); /* HostChallenge */
+		add_token_bytestring(&err, dev, key, key_len);
+		add_token_u8(&err, dev, OPAL_ENDNAME);
+		add_token_u8(&err, dev, OPAL_STARTNAME);
+		add_token_u8(&err, dev, 3); /* HostSignAuth */
+		add_token_bytestring(&err, dev, opaluid[auth],
+				     OPAL_UID_LENGTH);
+		add_token_u8(&err, dev, OPAL_ENDNAME);
+		add_token_u8(&err, dev, OPAL_ENDLIST);
+		break;
+	default:
+		pr_err("Cannot start Admin SP session with auth %d\n", auth);
+		return OPAL_INVAL_PARAM;
+	}
+
+	if (err) {
+		pr_err("Error building start adminsp session command.\n");
+		return err;
+	}
+
+	return finalize_and_send(dev, start_opal_session_cont);
+}
+
+static int start_anybodyASP_opal_session(struct opal_dev *dev)
+{
+	return start_generic_opal_session(dev, OPAL_ANYBODY_UID,
+					  OPAL_ADMINSP_UID, NULL, 0);
+}
+
+static int start_SIDASP_opal_session(struct opal_dev *dev)
+{
+	int ret;
+	const u8 *key = dev->prev_data;
+	struct opal_key *okey;
+
+	if (!key) {
+		okey = dev->func_data[dev->state];
+		ret = start_generic_opal_session(dev, OPAL_SID_UID,
+						 OPAL_ADMINSP_UID,
+						 okey->key,
+						 okey->key_len);
+	} else {
+		ret = start_generic_opal_session(dev, OPAL_SID_UID,
+						 OPAL_ADMINSP_UID,
+						 key, dev->prev_d_len);
+		kfree(key);
+		dev->prev_data = NULL;
+	}
+	return ret;
+}
+
+static inline int start_admin1LSP_opal_session(struct opal_dev *dev)
+{
+	struct opal_key *key = dev->func_data[dev->state];
+
+	return start_generic_opal_session(dev, OPAL_ADMIN1_UID,
+					  OPAL_LOCKINGSP_UID,
+					  key->key, key->key_len);
+}
+
+static int start_auth_opal_session(struct opal_dev *dev)
+{
+	u8 lk_ul_user[OPAL_UID_LENGTH];
+	int err = 0;
+
+	struct opal_session_info *session = dev->func_data[dev->state];
+	size_t keylen = session->opal_key.key_len;
+	u8 *key = session->opal_key.key;
+	u32 hsn = GENERIC_HOST_SESSION_NUM;
+
+	clear_opal_cmd(dev);
+	set_comid(dev, dev->comid);
+
+	if (session->sum) {
+		err = build_locking_user(lk_ul_user, sizeof(lk_ul_user),
+					 session->opal_key.lr);
+		if (err)
+			return err;
+
+	} else if (session->who != OPAL_ADMIN1 && !session->sum) {
+		err = build_locking_user(lk_ul_user, sizeof(lk_ul_user),
+					 session->who - 1);
+		if (err)
+			return err;
+	} else
+		memcpy(lk_ul_user, opaluid[OPAL_ADMIN1_UID], OPAL_UID_LENGTH);
+
+	add_token_u8(&err, dev, OPAL_CALL);
+	add_token_bytestring(&err, dev, opaluid[OPAL_SMUID_UID],
+			     OPAL_UID_LENGTH);
+	add_token_bytestring(&err, dev, opalmethod[OPAL_STARTSESSION],
+			     OPAL_UID_LENGTH);
+
+	add_token_u8(&err, dev, OPAL_STARTLIST);
+	add_token_u64(&err, dev, hsn);
+	add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID],
+			     OPAL_UID_LENGTH);
+	add_token_u8(&err, dev, 1);
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_u8(&err, dev, 0);
+	add_token_bytestring(&err, dev, key, keylen);
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_u8(&err, dev, 3);
+	add_token_bytestring(&err, dev, lk_ul_user, OPAL_UID_LENGTH);
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+	add_token_u8(&err, dev, OPAL_ENDLIST);
+
+	if (err) {
+		pr_err("Error building STARTSESSION command.\n");
+		return err;
+	}
+
+	return finalize_and_send(dev, start_opal_session_cont);
+}
+
+static int revert_tper(struct opal_dev *dev)
+{
+	int err = 0;
+
+	clear_opal_cmd(dev);
+	set_comid(dev, dev->comid);
+
+	add_token_u8(&err, dev, OPAL_CALL);
+	add_token_bytestring(&err, dev, opaluid[OPAL_ADMINSP_UID],
+			     OPAL_UID_LENGTH);
+	add_token_bytestring(&err, dev, opalmethod[OPAL_REVERT],
+			     OPAL_UID_LENGTH);
+	add_token_u8(&err, dev, OPAL_STARTLIST);
+	add_token_u8(&err, dev, OPAL_ENDLIST);
+	if (err) {
+		pr_err("Error building REVERT TPER command.\n");
+		return err;
+	}
+
+	return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int internal_activate_user(struct opal_dev *dev)
+{
+	struct opal_session_info *session = dev->func_data[dev->state];
+	u8 uid[OPAL_UID_LENGTH];
+	int err = 0;
+
+	clear_opal_cmd(dev);
+	set_comid(dev, dev->comid);
+
+	memcpy(uid, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH);
+	uid[7] = session->who;
+
+	add_token_u8(&err, dev, OPAL_CALL);
+	add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
+	add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH);
+	add_token_u8(&err, dev, OPAL_STARTLIST);
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_u8(&err, dev, OPAL_VALUES);
+	add_token_u8(&err, dev, OPAL_STARTLIST);
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_u8(&err, dev, 5); /* Enabled */
+	add_token_u8(&err, dev, OPAL_TRUE);
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+	add_token_u8(&err, dev, OPAL_ENDLIST);
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+	add_token_u8(&err, dev, OPAL_ENDLIST);
+
+	if (err) {
+		pr_err("Error building Activate UserN command.\n");
+		return err;
+	}
+
+	return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int erase_locking_range(struct opal_dev *dev)
+{
+	struct opal_session_info *session;
+	u8 uid[OPAL_UID_LENGTH];
+	int err = 0;
+
+	clear_opal_cmd(dev);
+	set_comid(dev, dev->comid);
+	session = dev->func_data[dev->state];
+
+	if (build_locking_range(uid, sizeof(uid), session->opal_key.lr) < 0)
+		return -ERANGE;
+
+	add_token_u8(&err, dev, OPAL_CALL);
+	add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
+	add_token_bytestring(&err, dev, opalmethod[OPAL_ERASE],
+			     OPAL_UID_LENGTH);
+	add_token_u8(&err, dev, OPAL_STARTLIST);
+	add_token_u8(&err, dev, OPAL_ENDLIST);
+
+	if (err) {
+		pr_err("Error building Erase Locking Range Command.\n");
+		return err;
+	}
+	return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int set_mbr_done(struct opal_dev *dev)
+{
+	u8 mbr_done_tf = *(u8 *)dev->func_data[dev->state];
+	int err = 0;
+
+	clear_opal_cmd(dev);
+	set_comid(dev, dev->comid);
+
+	add_token_u8(&err, dev, OPAL_CALL);
+	add_token_bytestring(&err, dev, opaluid[OPAL_MBRCONTROL],
+			     OPAL_UID_LENGTH);
+	add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH);
+	add_token_u8(&err, dev, OPAL_STARTLIST);
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_u8(&err, dev, OPAL_VALUES);
+	add_token_u8(&err, dev, OPAL_STARTLIST);
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_u8(&err, dev, 2); /* Done */
+	add_token_u8(&err, dev, mbr_done_tf); /* Done T or F */
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+	add_token_u8(&err, dev, OPAL_ENDLIST);
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+	add_token_u8(&err, dev, OPAL_ENDLIST);
+
+	if (err) {
+		pr_err("Error Building set MBR Done command\n");
+		return err;
+	}
+
+	return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int set_mbr_enable_disable(struct opal_dev *dev)
+{
+	u8 mbr_en_dis = *(u8 *)dev->func_data[dev->state];
+	int err = 0;
+
+	clear_opal_cmd(dev);
+	set_comid(dev, dev->comid);
+
+	add_token_u8(&err, dev, OPAL_CALL);
+	add_token_bytestring(&err, dev, opaluid[OPAL_MBRCONTROL],
+			     OPAL_UID_LENGTH);
+	add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH);
+	add_token_u8(&err, dev, OPAL_STARTLIST);
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_u8(&err, dev, OPAL_VALUES);
+	add_token_u8(&err, dev, OPAL_STARTLIST);
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_u8(&err, dev, 1);
+	add_token_u8(&err, dev, mbr_en_dis);
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+	add_token_u8(&err, dev, OPAL_ENDLIST);
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+	add_token_u8(&err, dev, OPAL_ENDLIST);
+
+	if (err) {
+		pr_err("Error Building set MBR done command\n");
+		return err;
+	}
+
+	return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int generic_pw_cmd(u8 *key, size_t key_len, u8 *cpin_uid,
+			  struct opal_dev *dev)
+{
+	int err = 0;
+
+	clear_opal_cmd(dev);
+	set_comid(dev, dev->comid);
+
+	add_token_u8(&err, dev, OPAL_CALL);
+	add_token_bytestring(&err, dev, cpin_uid, OPAL_UID_LENGTH);
+	add_token_bytestring(&err, dev, opalmethod[OPAL_SET],
+			     OPAL_UID_LENGTH);
+	add_token_u8(&err, dev, OPAL_STARTLIST);
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_u8(&err, dev, OPAL_VALUES);
+	add_token_u8(&err, dev, OPAL_STARTLIST);
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_u8(&err, dev, 3); /* PIN */
+	add_token_bytestring(&err, dev, key, key_len);
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+	add_token_u8(&err, dev, OPAL_ENDLIST);
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+	add_token_u8(&err, dev, OPAL_ENDLIST);
+
+	return err;
+}
+
+static int set_new_pw(struct opal_dev *dev)
+{
+	u8 cpin_uid[OPAL_UID_LENGTH];
+	struct opal_session_info *usr = dev->func_data[dev->state];
+
+
+	memcpy(cpin_uid, opaluid[OPAL_C_PIN_ADMIN1], OPAL_UID_LENGTH);
+
+	if (usr->who != OPAL_ADMIN1) {
+		cpin_uid[5] = 0x03;
+		if (usr->sum)
+			cpin_uid[7] = usr->opal_key.lr + 1;
+		else
+			cpin_uid[7] = usr->who;
+	}
+
+	if (generic_pw_cmd(usr->opal_key.key, usr->opal_key.key_len,
+			   cpin_uid, dev)) {
+		pr_err("Error building set password command.\n");
+		return -ERANGE;
+	}
+
+	return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int set_sid_cpin_pin(struct opal_dev *dev)
+{
+	u8 cpin_uid[OPAL_UID_LENGTH];
+	struct opal_key *key = dev->func_data[dev->state];
+
+	memcpy(cpin_uid, opaluid[OPAL_C_PIN_SID], OPAL_UID_LENGTH);
+
+	if (generic_pw_cmd(key->key, key->key_len, cpin_uid, dev)) {
+		pr_err("Error building Set SID cpin\n");
+		return -ERANGE;
+	}
+	return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int add_user_to_lr(struct opal_dev *dev)
+{
+	u8 lr_buffer[OPAL_UID_LENGTH];
+	u8 user_uid[OPAL_UID_LENGTH];
+	struct opal_lock_unlock *lkul;
+	int err = 0;
+
+	clear_opal_cmd(dev);
+	set_comid(dev, dev->comid);
+
+	lkul = dev->func_data[dev->state];
+
+	memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_RDLOCKED],
+	       OPAL_UID_LENGTH);
+
+	if (lkul->l_state == OPAL_RW)
+		memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_WRLOCKED],
+		       OPAL_UID_LENGTH);
+
+	lr_buffer[7] = lkul->session.opal_key.lr;
+
+	memcpy(user_uid, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH);
+
+	user_uid[7] = lkul->session.who;
+
+	add_token_u8(&err, dev, OPAL_CALL);
+	add_token_bytestring(&err, dev, lr_buffer, OPAL_UID_LENGTH);
+	add_token_bytestring(&err, dev, opalmethod[OPAL_SET],
+			     OPAL_UID_LENGTH);
+
+	add_token_u8(&err, dev, OPAL_STARTLIST);
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_u8(&err, dev, OPAL_VALUES);
+
+	add_token_u8(&err, dev, OPAL_STARTLIST);
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_u8(&err, dev, 3);
+
+	add_token_u8(&err, dev, OPAL_STARTLIST);
+
+
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_bytestring(&err, dev,
+			     opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF],
+			     OPAL_UID_LENGTH/2);
+	add_token_bytestring(&err, dev, user_uid, OPAL_UID_LENGTH);
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+
+
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_bytestring(&err, dev,
+			     opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF],
+			     OPAL_UID_LENGTH/2);
+	add_token_bytestring(&err, dev, user_uid, OPAL_UID_LENGTH);
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+
+
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_bytestring(&err, dev, opaluid[OPAL_HALF_UID_BOOLEAN_ACE],
+			     OPAL_UID_LENGTH/2);
+	add_token_u8(&err, dev, 1);
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+
+
+	add_token_u8(&err, dev, OPAL_ENDLIST);
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+	add_token_u8(&err, dev, OPAL_ENDLIST);
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+	add_token_u8(&err, dev, OPAL_ENDLIST);
+
+	if (err) {
+		pr_err("Error building add user to locking range command.\n");
+		return err;
+	}
+
+	return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int lock_unlock_locking_range(struct opal_dev *dev)
+{
+	u8 lr_buffer[OPAL_UID_LENGTH];
+	const u8 *method;
+	struct opal_lock_unlock *lkul;
+	u8 read_locked = 1, write_locked = 1;
+	int err = 0;
+
+	clear_opal_cmd(dev);
+	set_comid(dev, dev->comid);
+
+	method = opalmethod[OPAL_SET];
+	lkul = dev->func_data[dev->state];
+	if (build_locking_range(lr_buffer, sizeof(lr_buffer),
+				lkul->session.opal_key.lr) < 0)
+		return -ERANGE;
+
+	switch (lkul->l_state) {
+	case OPAL_RO:
+		read_locked = 0;
+		write_locked = 1;
+		break;
+	case OPAL_RW:
+		read_locked = 0;
+		write_locked = 0;
+		break;
+	case OPAL_LK:
+		/* vars are initalized to locked */
+		break;
+	default:
+		pr_err("Tried to set an invalid locking state... returning to uland\n");
+		return OPAL_INVAL_PARAM;
+	}
+
+	add_token_u8(&err, dev, OPAL_CALL);
+	add_token_bytestring(&err, dev, lr_buffer, OPAL_UID_LENGTH);
+	add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH);
+	add_token_u8(&err, dev, OPAL_STARTLIST);
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_u8(&err, dev, OPAL_VALUES);
+	add_token_u8(&err, dev, OPAL_STARTLIST);
+
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_u8(&err, dev, OPAL_READLOCKED);
+	add_token_u8(&err, dev, read_locked);
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_u8(&err, dev, OPAL_WRITELOCKED);
+	add_token_u8(&err, dev, write_locked);
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+
+	add_token_u8(&err, dev, OPAL_ENDLIST);
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+	add_token_u8(&err, dev, OPAL_ENDLIST);
+
+	if (err) {
+		pr_err("Error building SET command.\n");
+		return err;
+	}
+	return finalize_and_send(dev, parse_and_check_status);
+}
+
+
+static int lock_unlock_locking_range_sum(struct opal_dev *dev)
+{
+	u8 lr_buffer[OPAL_UID_LENGTH];
+	u8 read_locked = 1, write_locked = 1;
+	const u8 *method;
+	struct opal_lock_unlock *lkul;
+	int ret;
+
+	clear_opal_cmd(dev);
+	set_comid(dev, dev->comid);
+
+	method = opalmethod[OPAL_SET];
+	lkul = dev->func_data[dev->state];
+	if (build_locking_range(lr_buffer, sizeof(lr_buffer),
+				lkul->session.opal_key.lr) < 0)
+		return -ERANGE;
+
+	switch (lkul->l_state) {
+	case OPAL_RO:
+		read_locked = 0;
+		write_locked = 1;
+		break;
+	case OPAL_RW:
+		read_locked = 0;
+		write_locked = 0;
+		break;
+	case OPAL_LK:
+		/* vars are initalized to locked */
+		break;
+	default:
+		pr_err("Tried to set an invalid locking state.\n");
+		return OPAL_INVAL_PARAM;
+	}
+	ret = generic_lr_enable_disable(dev, lr_buffer, 1, 1,
+					read_locked, write_locked);
+
+	if (ret < 0) {
+		pr_err("Error building SET command.\n");
+		return ret;
+	}
+	return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int activate_lsp(struct opal_dev *dev)
+{
+	struct opal_lr_act *opal_act;
+	u8 user_lr[OPAL_UID_LENGTH];
+	u8 uint_3 = 0x83;
+	int err = 0, i;
+
+	clear_opal_cmd(dev);
+	set_comid(dev, dev->comid);
+
+	opal_act = dev->func_data[dev->state];
+
+	add_token_u8(&err, dev, OPAL_CALL);
+	add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID],
+			     OPAL_UID_LENGTH);
+	add_token_bytestring(&err, dev, opalmethod[OPAL_ACTIVATE],
+			     OPAL_UID_LENGTH);
+
+
+	if (opal_act->sum) {
+		err = build_locking_range(user_lr, sizeof(user_lr),
+					  opal_act->lr[0]);
+		if (err)
+			return err;
+
+		add_token_u8(&err, dev, OPAL_STARTLIST);
+		add_token_u8(&err, dev, OPAL_STARTNAME);
+		add_token_u8(&err, dev, uint_3);
+		add_token_u8(&err, dev, 6);
+		add_token_u8(&err, dev, 0);
+		add_token_u8(&err, dev, 0);
+
+		add_token_u8(&err, dev, OPAL_STARTLIST);
+		add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH);
+		for (i = 1; i < opal_act->num_lrs; i++) {
+			user_lr[7] = opal_act->lr[i];
+			add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH);
+		}
+		add_token_u8(&err, dev, OPAL_ENDLIST);
+		add_token_u8(&err, dev, OPAL_ENDNAME);
+		add_token_u8(&err, dev, OPAL_ENDLIST);
+
+	} else {
+		add_token_u8(&err, dev, OPAL_STARTLIST);
+		add_token_u8(&err, dev, OPAL_ENDLIST);
+	}
+
+	if (err) {
+		pr_err("Error building Activate LockingSP command.\n");
+		return err;
+	}
+
+	return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int get_lsp_lifecycle_cont(struct opal_dev *dev)
+{
+	u8 lc_status;
+	int error = 0;
+
+	error = parse_and_check_status(dev);
+	if (error)
+		return error;
+
+	lc_status = response_get_u64(&dev->parsed, 4);
+	/* 0x08 is Manufacured Inactive */
+	/* 0x09 is Manufactured */
+	if (lc_status != OPAL_MANUFACTURED_INACTIVE) {
+		pr_err("Couldn't determine the status of the Lifcycle state\n");
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+/* Determine if we're in the Manufactured Inactive or Active state */
+static int get_lsp_lifecycle(struct opal_dev *dev)
+{
+	int err = 0;
+
+	clear_opal_cmd(dev);
+	set_comid(dev, dev->comid);
+
+	add_token_u8(&err, dev, OPAL_CALL);
+	add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID],
+			     OPAL_UID_LENGTH);
+	add_token_bytestring(&err, dev, opalmethod[OPAL_GET], OPAL_UID_LENGTH);
+
+	add_token_u8(&err, dev, OPAL_STARTLIST);
+	add_token_u8(&err, dev, OPAL_STARTLIST);
+
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_u8(&err, dev, 3); /* Start Column */
+	add_token_u8(&err, dev, 6); /* Lifecycle Column */
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_u8(&err, dev, 4); /* End Column */
+	add_token_u8(&err, dev, 6); /* Lifecycle Column */
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+
+	add_token_u8(&err, dev, OPAL_ENDLIST);
+	add_token_u8(&err, dev, OPAL_ENDLIST);
+
+	if (err) {
+		pr_err("Error Building GET Lifecycle Status command\n");
+		return err;
+	}
+
+	return finalize_and_send(dev, get_lsp_lifecycle_cont);
+}
+
+static int get_msid_cpin_pin_cont(struct opal_dev *dev)
+{
+	const char *msid_pin;
+	size_t strlen;
+	int error = 0;
+
+	error = parse_and_check_status(dev);
+	if (error)
+		return error;
+
+	strlen = response_get_string(&dev->parsed, 4, &msid_pin);
+	if (!msid_pin) {
+		pr_err("%s: Couldn't extract PIN from response\n", __func__);
+		return OPAL_INVAL_PARAM;
+	}
+
+	dev->prev_data = kmemdup(msid_pin, strlen, GFP_KERNEL);
+	if (!dev->prev_data)
+		return -ENOMEM;
+
+	dev->prev_d_len = strlen;
+
+	return 0;
+}
+
+static int get_msid_cpin_pin(struct opal_dev *dev)
+{
+	int err = 0;
+
+	clear_opal_cmd(dev);
+	set_comid(dev, dev->comid);
+
+
+	add_token_u8(&err, dev, OPAL_CALL);
+	add_token_bytestring(&err, dev, opaluid[OPAL_C_PIN_MSID],
+			     OPAL_UID_LENGTH);
+	add_token_bytestring(&err, dev, opalmethod[OPAL_GET], OPAL_UID_LENGTH);
+
+	add_token_u8(&err, dev, OPAL_STARTLIST);
+	add_token_u8(&err, dev, OPAL_STARTLIST);
+
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_u8(&err, dev, 3); /* Start Column */
+	add_token_u8(&err, dev, 3); /* PIN */
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+
+	add_token_u8(&err, dev, OPAL_STARTNAME);
+	add_token_u8(&err, dev, 4); /* End Column */
+	add_token_u8(&err, dev, 3); /* Lifecycle Column */
+	add_token_u8(&err, dev, OPAL_ENDNAME);
+
+	add_token_u8(&err, dev, OPAL_ENDLIST);
+	add_token_u8(&err, dev, OPAL_ENDLIST);
+
+	if (err) {
+		pr_err("Error building Get MSID CPIN PIN command.\n");
+		return err;
+	}
+
+	return finalize_and_send(dev, get_msid_cpin_pin_cont);
+}
+
+static int build_end_opal_session(struct opal_dev *dev)
+{
+	int err = 0;
+
+	clear_opal_cmd(dev);
+
+	set_comid(dev, dev->comid);
+	add_token_u8(&err, dev, OPAL_ENDOFSESSION);
+	return err;
+}
+
+static int end_opal_session(struct opal_dev *dev)
+{
+	int ret = build_end_opal_session(dev);
+
+	if (ret < 0)
+		return ret;
+	return finalize_and_send(dev, end_session_cont);
+}
+
+static int end_opal_session_error(struct opal_dev *dev)
+{
+	const opal_step error_end_session[] = {
+		end_opal_session,
+		NULL,
+	};
+	dev->funcs = error_end_session;
+	dev->state = 0;
+	return next(dev);
+}
+
+static inline void setup_opal_dev(struct opal_dev *dev,
+				  const opal_step *funcs)
+{
+	dev->state = 0;
+	dev->funcs = funcs;
+	dev->tsn = 0;
+	dev->hsn = 0;
+	dev->func_data = NULL;
+	dev->prev_data = NULL;
+}
+
+static int check_opal_support(struct opal_dev *dev)
+{
+	static const opal_step funcs[] = {
+		opal_discovery0,
+		NULL
+	};
+	int ret;
+
+	mutex_lock(&dev->dev_lock);
+	setup_opal_dev(dev, funcs);
+	ret = next(dev);
+	dev->supported = !ret;
+	mutex_unlock(&dev->dev_lock);
+	return ret;
+}
+
+void init_opal_dev(struct opal_dev *opal_dev, sec_send_recv *send_recv)
+{
+	if (opal_dev->initialized)
+		return;
+	INIT_LIST_HEAD(&opal_dev->unlk_lst);
+	mutex_init(&opal_dev->dev_lock);
+	opal_dev->send_recv = send_recv;
+	if (check_opal_support(opal_dev) < 0)
+		pr_warn("Opal is not supported on this device\n");
+	opal_dev->initialized = true;
+}
+EXPORT_SYMBOL(init_opal_dev);
+
+static int opal_secure_erase_locking_range(struct opal_dev *dev,
+					   struct opal_session_info *opal_session)
+{
+	void *data[3] = { NULL };
+	static const opal_step erase_funcs[] = {
+		opal_discovery0,
+		start_auth_opal_session,
+		get_active_key,
+		gen_key,
+		end_opal_session,
+		NULL,
+	};
+	int ret;
+
+	mutex_lock(&dev->dev_lock);
+	setup_opal_dev(dev, erase_funcs);
+
+	dev->func_data = data;
+	dev->func_data[1] = opal_session;
+	dev->func_data[2] = &opal_session->opal_key.lr;
+
+	ret = next(dev);
+	mutex_unlock(&dev->dev_lock);
+	return ret;
+}
+
+static int opal_erase_locking_range(struct opal_dev *dev,
+				    struct opal_session_info *opal_session)
+{
+	void *data[3] = { NULL };
+	static const opal_step erase_funcs[] = {
+		opal_discovery0,
+		start_auth_opal_session,
+		erase_locking_range,
+		end_opal_session,
+		NULL,
+	};
+	int ret;
+
+	mutex_lock(&dev->dev_lock);
+	setup_opal_dev(dev, erase_funcs);
+
+	dev->func_data = data;
+	dev->func_data[1] = opal_session;
+	dev->func_data[2] = opal_session;
+
+	ret = next(dev);
+	mutex_unlock(&dev->dev_lock);
+	return ret;
+}
+
+static int opal_enable_disable_shadow_mbr(struct opal_dev *dev,
+					  struct opal_mbr_data *opal_mbr)
+{
+	void *func_data[6] = { NULL };
+	static const opal_step mbr_funcs[] = {
+		opal_discovery0,
+		start_admin1LSP_opal_session,
+		set_mbr_done,
+		end_opal_session,
+		start_admin1LSP_opal_session,
+		set_mbr_enable_disable,
+		end_opal_session,
+		NULL,
+	};
+	int ret;
+
+	if (opal_mbr->enable_disable != OPAL_MBR_ENABLE &&
+	    opal_mbr->enable_disable != OPAL_MBR_DISABLE)
+		return -EINVAL;
+
+	mutex_lock(&dev->dev_lock);
+	setup_opal_dev(dev, mbr_funcs);
+	dev->func_data = func_data;
+	dev->func_data[1] = &opal_mbr->key;
+	dev->func_data[2] = &opal_mbr->enable_disable;
+	dev->func_data[4] = &opal_mbr->key;
+	dev->func_data[5] = &opal_mbr->enable_disable;
+	ret = next(dev);
+	mutex_unlock(&dev->dev_lock);
+	return ret;
+}
+
+static int opal_save(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk)
+{
+	struct opal_suspend_data *suspend;
+
+	suspend = kzalloc(sizeof(*suspend), GFP_KERNEL);
+	if (!suspend)
+		return -ENOMEM;
+
+	suspend->unlk = *lk_unlk;
+	suspend->lr = lk_unlk->session.opal_key.lr;
+
+	mutex_lock(&dev->dev_lock);
+	setup_opal_dev(dev, NULL);
+	add_suspend_info(dev, suspend);
+	mutex_unlock(&dev->dev_lock);
+	return 0;
+}
+
+static int opal_add_user_to_lr(struct opal_dev *dev,
+			       struct opal_lock_unlock *lk_unlk)
+{
+	void *func_data[3] = { NULL };
+	static const opal_step funcs[] = {
+		opal_discovery0,
+		start_admin1LSP_opal_session,
+		add_user_to_lr,
+		end_opal_session,
+		NULL
+	};
+	int ret;
+
+	if (lk_unlk->l_state != OPAL_RO &&
+	    lk_unlk->l_state != OPAL_RW) {
+		pr_err("Locking state was not RO or RW\n");
+		return -EINVAL;
+	}
+	if (lk_unlk->session.who < OPAL_USER1 &&
+	    lk_unlk->session.who > OPAL_USER9) {
+		pr_err("Authority was not within the range of users: %d\n",
+		       lk_unlk->session.who);
+		return -EINVAL;
+	}
+	if (lk_unlk->session.sum) {
+		pr_err("%s not supported in sum. Use setup locking range\n",
+		       __func__);
+		return -EINVAL;
+	}
+
+	mutex_lock(&dev->dev_lock);
+	setup_opal_dev(dev, funcs);
+	dev->func_data = func_data;
+	dev->func_data[1] = &lk_unlk->session.opal_key;
+	dev->func_data[2] = lk_unlk;
+	ret = next(dev);
+	mutex_unlock(&dev->dev_lock);
+	return ret;
+}
+
+static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal)
+{
+	void *data[2] = { NULL };
+	static const opal_step revert_funcs[] = {
+		opal_discovery0,
+		start_SIDASP_opal_session,
+		revert_tper, /* controller will terminate session */
+		NULL,
+	};
+	int ret;
+
+	mutex_lock(&dev->dev_lock);
+	setup_opal_dev(dev, revert_funcs);
+	dev->func_data = data;
+	dev->func_data[1] = opal;
+	ret = next(dev);
+	mutex_unlock(&dev->dev_lock);
+	return ret;
+}
+
+static int __opal_lock_unlock_sum(struct opal_dev *dev)
+{
+	static const opal_step ulk_funcs_sum[] = {
+		opal_discovery0,
+		start_auth_opal_session,
+		lock_unlock_locking_range_sum,
+		end_opal_session,
+		NULL
+	};
+
+	dev->funcs = ulk_funcs_sum;
+	return next(dev);
+}
+
+static int __opal_lock_unlock(struct opal_dev *dev)
+{
+	static const opal_step _unlock_funcs[] = {
+		opal_discovery0,
+		start_auth_opal_session,
+		lock_unlock_locking_range,
+		end_opal_session,
+		NULL
+	};
+
+	dev->funcs = _unlock_funcs;
+	return next(dev);
+}
+
+static int opal_lock_unlock(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk)
+{
+	void *func_data[3] = { NULL };
+	int ret;
+
+	if (lk_unlk->session.who < OPAL_ADMIN1 ||
+	    lk_unlk->session.who > OPAL_USER9)
+		return -EINVAL;
+
+	mutex_lock(&dev->dev_lock);
+	setup_opal_dev(dev, NULL);
+	dev->func_data = func_data;
+	dev->func_data[1] = &lk_unlk->session;
+	dev->func_data[2] = lk_unlk;
+
+	if (lk_unlk->session.sum)
+		ret = __opal_lock_unlock_sum(dev);
+	else
+		ret = __opal_lock_unlock(dev);
+
+	mutex_unlock(&dev->dev_lock);
+	return ret;
+}
+
+static int opal_take_ownership(struct opal_dev *dev, struct opal_key *opal)
+{
+	static const opal_step owner_funcs[] = {
+		opal_discovery0,
+		start_anybodyASP_opal_session,
+		get_msid_cpin_pin,
+		end_opal_session,
+		start_SIDASP_opal_session,
+		set_sid_cpin_pin,
+		end_opal_session,
+		NULL
+	};
+	void *data[6] = { NULL };
+	int ret;
+
+	if (!dev)
+		return -ENODEV;
+
+	mutex_lock(&dev->dev_lock);
+	setup_opal_dev(dev, owner_funcs);
+	dev->func_data = data;
+	dev->func_data[4] = opal;
+	dev->func_data[5] = opal;
+	ret = next(dev);
+	mutex_unlock(&dev->dev_lock);
+	return ret;
+}
+
+static int opal_activate_lsp(struct opal_dev *dev, struct opal_lr_act *opal_lr_act)
+{
+	void *data[4] = { NULL };
+	static const opal_step active_funcs[] = {
+		opal_discovery0,
+		start_SIDASP_opal_session, /* Open session as SID auth */
+		get_lsp_lifecycle,
+		activate_lsp,
+		end_opal_session,
+		NULL
+	};
+	int ret;
+
+	if (!opal_lr_act->num_lrs || opal_lr_act->num_lrs > OPAL_MAX_LRS)
+		return -EINVAL;
+
+	mutex_lock(&dev->dev_lock);
+	setup_opal_dev(dev, active_funcs);
+	dev->func_data = data;
+	dev->func_data[1] = &opal_lr_act->key;
+	dev->func_data[3] = opal_lr_act;
+	ret = next(dev);
+	mutex_unlock(&dev->dev_lock);
+	return ret;
+}
+
+static int opal_setup_locking_range(struct opal_dev *dev,
+				    struct opal_user_lr_setup *opal_lrs)
+{
+	void *data[3] = { NULL };
+	static const opal_step lr_funcs[] = {
+		opal_discovery0,
+		start_auth_opal_session,
+		setup_locking_range,
+		end_opal_session,
+		NULL,
+	};
+	int ret;
+
+	mutex_lock(&dev->dev_lock);
+	setup_opal_dev(dev, lr_funcs);
+	dev->func_data = data;
+	dev->func_data[1] = &opal_lrs->session;
+	dev->func_data[2] = opal_lrs;
+	ret = next(dev);
+	mutex_unlock(&dev->dev_lock);
+	return ret;
+}
+
+static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
+{
+	static const opal_step pw_funcs[] = {
+		opal_discovery0,
+		start_auth_opal_session,
+		set_new_pw,
+		end_opal_session,
+		NULL
+	};
+	void *data[3] = { NULL };
+	int ret;
+
+	if (opal_pw->session.who < OPAL_ADMIN1 ||
+	    opal_pw->session.who > OPAL_USER9  ||
+	    opal_pw->new_user_pw.who < OPAL_ADMIN1 ||
+	    opal_pw->new_user_pw.who > OPAL_USER9)
+		return -EINVAL;
+
+	mutex_lock(&dev->dev_lock);
+	setup_opal_dev(dev, pw_funcs);
+	dev->func_data = data;
+	dev->func_data[1] = (void *) &opal_pw->session;
+	dev->func_data[2] = (void *) &opal_pw->new_user_pw;
+
+	ret = next(dev);
+	mutex_unlock(&dev->dev_lock);
+	return ret;
+}
+
+static int opal_activate_user(struct opal_dev *dev,
+			      struct opal_session_info *opal_session)
+{
+	static const opal_step act_funcs[] = {
+		opal_discovery0,
+		start_admin1LSP_opal_session,
+		internal_activate_user,
+		end_opal_session,
+		NULL
+	};
+	void *data[3] = { NULL };
+	int ret;
+
+	/* We can't activate Admin1 it's active as manufactured */
+	if (opal_session->who < OPAL_USER1 &&
+	    opal_session->who > OPAL_USER9) {
+		pr_err("Who was not a valid user: %d\n", opal_session->who);
+		return -EINVAL;
+	}
+
+	mutex_lock(&dev->dev_lock);
+	setup_opal_dev(dev, act_funcs);
+	dev->func_data = data;
+	dev->func_data[1] = &opal_session->opal_key;
+	dev->func_data[2] = opal_session;
+	ret = next(dev);
+	mutex_unlock(&dev->dev_lock);
+	return ret;
+}
+
+bool opal_unlock_from_suspend(struct opal_dev *dev)
+{
+	struct opal_suspend_data *suspend;
+	void *func_data[3] = { NULL };
+	bool was_failure = false;
+	int ret = 0;
+
+	if (!dev)
+		return false;
+	if (!dev->supported)
+		return false;
+
+	mutex_lock(&dev->dev_lock);
+	setup_opal_dev(dev, NULL);
+	dev->func_data = func_data;
+
+	list_for_each_entry(suspend, &dev->unlk_lst, node) {
+		dev->state = 0;
+		dev->func_data[1] = &suspend->unlk.session;
+		dev->func_data[2] = &suspend->unlk;
+		dev->tsn = 0;
+		dev->hsn = 0;
+
+		if (suspend->unlk.session.sum)
+			ret = __opal_lock_unlock_sum(dev);
+		else
+			ret = __opal_lock_unlock(dev);
+		if (ret) {
+			pr_warn("Failed to unlock LR %hhu with sum %d\n",
+				suspend->unlk.session.opal_key.lr,
+				suspend->unlk.session.sum);
+			was_failure = true;
+		}
+	}
+	mutex_unlock(&dev->dev_lock);
+	return was_failure;
+}
+EXPORT_SYMBOL(opal_unlock_from_suspend);
+
+int sed_ioctl(struct opal_dev *dev, unsigned int cmd, unsigned long ptr)
+{
+	void __user *arg = (void __user *)ptr;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+	if (!dev->supported) {
+		pr_err("Not supported\n");
+		return -ENOTSUPP;
+	}
+
+	switch (cmd) {
+	case IOC_OPAL_SAVE: {
+		struct opal_lock_unlock lk_unlk;
+
+		if (copy_from_user(&lk_unlk, arg, sizeof(lk_unlk)))
+			return -EFAULT;
+		return opal_save(dev, &lk_unlk);
+	}
+	case IOC_OPAL_LOCK_UNLOCK: {
+		struct opal_lock_unlock lk_unlk;
+
+		if (copy_from_user(&lk_unlk, arg, sizeof(lk_unlk)))
+			return -EFAULT;
+		return opal_lock_unlock(dev, &lk_unlk);
+	}
+	case IOC_OPAL_TAKE_OWNERSHIP: {
+		struct opal_key opal_key;
+
+		if (copy_from_user(&opal_key, arg, sizeof(opal_key)))
+			return -EFAULT;
+		return opal_take_ownership(dev, &opal_key);
+	}
+	case IOC_OPAL_ACTIVATE_LSP: {
+		struct opal_lr_act opal_lr_act;
+
+		if (copy_from_user(&opal_lr_act, arg, sizeof(opal_lr_act)))
+			return -EFAULT;
+		return opal_activate_lsp(dev, &opal_lr_act);
+	}
+	case IOC_OPAL_SET_PW: {
+		struct opal_new_pw opal_pw;
+
+		if (copy_from_user(&opal_pw, arg, sizeof(opal_pw)))
+			return -EFAULT;
+		return opal_set_new_pw(dev, &opal_pw);
+	}
+	case IOC_OPAL_ACTIVATE_USR: {
+		struct opal_session_info session;
+
+		if (copy_from_user(&session, arg, sizeof(session)))
+			return -EFAULT;
+		return opal_activate_user(dev, &session);
+	}
+	case IOC_OPAL_REVERT_TPR: {
+		struct opal_key opal_key;
+
+		if (copy_from_user(&opal_key, arg, sizeof(opal_key)))
+			return -EFAULT;
+		return opal_reverttper(dev, &opal_key);
+	}
+	case IOC_OPAL_LR_SETUP: {
+		struct opal_user_lr_setup lrs;
+
+		if (copy_from_user(&lrs, arg, sizeof(lrs)))
+			return -EFAULT;
+		return opal_setup_locking_range(dev, &lrs);
+	}
+	case IOC_OPAL_ADD_USR_TO_LR: {
+		struct opal_lock_unlock lk_unlk;
+
+		if (copy_from_user(&lk_unlk, arg, sizeof(lk_unlk)))
+			return -EFAULT;
+		return opal_add_user_to_lr(dev, &lk_unlk);
+	}
+	case IOC_OPAL_ENABLE_DISABLE_MBR: {
+		struct opal_mbr_data mbr;
+
+		if (copy_from_user(&mbr, arg, sizeof(mbr)))
+			return -EFAULT;
+		return opal_enable_disable_shadow_mbr(dev, &mbr);
+	}
+	case IOC_OPAL_ERASE_LR: {
+		struct opal_session_info session;
+
+		if (copy_from_user(&session, arg, sizeof(session)))
+			return -EFAULT;
+		return opal_erase_locking_range(dev, &session);
+	}
+	case IOC_OPAL_SECURE_ERASE_LR: {
+		struct opal_session_info session;
+
+		if (copy_from_user(&session, arg, sizeof(session)))
+			return -EFAULT;
+		return opal_secure_erase_locking_range(dev, &session);
+	}
+	default:
+		pr_warn("No such Opal Ioctl %u\n", cmd);
+	}
+	return -ENOTTY;
+}
+EXPORT_SYMBOL_GPL(sed_ioctl);
diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h
new file mode 100644
index 000000000000..af1a85eae193
--- /dev/null
+++ b/include/linux/sed-opal.h
@@ -0,0 +1,178 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Authors:
+ *    Rafael Antognolli <rafael.antognolli@intel.com>
+ *    Scott  Bauer      <scott.bauer@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef LINUX_OPAL_H
+#define LINUX_OPAL_H
+
+#include <uapi/linux/sed-opal.h>
+#include <linux/kernel.h>
+
+/*
+ * These constant values come from:
+ * SPC-4 section
+ * 6.30 SECURITY PROTOCOL IN command / table 265.
+ */
+enum {
+	TCG_SECP_00 = 0,
+	TCG_SECP_01,
+};
+struct opal_dev;
+
+#define IO_BUFFER_LENGTH 2048
+#define MAX_TOKS 64
+
+typedef int (*opal_step)(struct opal_dev *dev);
+typedef int (sec_send_recv)(struct opal_dev *ctx, u16 spsp, u8 secp,
+			    void *buffer, size_t len, bool send);
+
+
+enum opal_atom_width {
+	OPAL_WIDTH_TINY,
+	OPAL_WIDTH_SHORT,
+	OPAL_WIDTH_MEDIUM,
+	OPAL_WIDTH_LONG,
+	OPAL_WIDTH_TOKEN
+};
+
+/*
+ * Token defs derived from:
+ * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00
+ * 3.2.2 Data Stream Encoding
+ */
+enum opal_response_token {
+	OPAL_DTA_TOKENID_BYTESTRING = 0xe0,
+	OPAL_DTA_TOKENID_SINT = 0xe1,
+	OPAL_DTA_TOKENID_UINT = 0xe2,
+	OPAL_DTA_TOKENID_TOKEN = 0xe3, /* actual token is returned */
+	OPAL_DTA_TOKENID_INVALID = 0X0
+};
+
+/*
+ * On the parsed response, we don't store again the toks that are already
+ * stored in the response buffer. Instead, for each token, we just store a
+ * pointer to the position in the buffer where the token starts, and the size
+ * of the token in bytes.
+ */
+struct opal_resp_tok {
+	const u8 *pos;
+	size_t len;
+	enum opal_response_token type;
+	enum opal_atom_width width;
+	union {
+		u64 u;
+		s64 s;
+	} stored;
+};
+
+/*
+ * From the response header it's not possible to know how many tokens there are
+ * on the payload. So we hardcode that the maximum will be MAX_TOKS, and later
+ * if we start dealing with messages that have more than that, we can increase
+ * this number. This is done to avoid having to make two passes through the
+ * response, the first one counting how many tokens we have and the second one
+ * actually storing the positions.
+ */
+struct parsed_resp {
+	int num;
+	struct opal_resp_tok toks[MAX_TOKS];
+};
+
+/**
+ * struct opal_dev - The structure representing a OPAL enabled SED.
+ * @supported: Whether or not OPAL is supported on this controller.
+ * @send_recv: The combined sec_send/sec_recv function pointer.
+ * @opal_step: A series of opal methods that are necessary to complete a command.
+ * @func_data: An array of parameters for the opal methods above.
+ * @state: Describes the current opal_step we're working on.
+ * @dev_lock: Locks the entire opal_dev structure.
+ * @parsed: Parsed response from controller.
+ * @prev_data: Data returned from a method to the controller.
+ * @unlk_lst: A list of Locking ranges to unlock on this device during a resume.
+ */
+struct opal_dev {
+	bool initialized;
+	bool supported;
+	sec_send_recv *send_recv;
+
+	const opal_step *funcs;
+	void **func_data;
+	int state;
+	struct mutex dev_lock;
+	u16 comid;
+	u32 hsn;
+	u32 tsn;
+	u64 align;
+	u64 lowest_lba;
+
+	size_t pos;
+	u8 cmd[IO_BUFFER_LENGTH];
+	u8 resp[IO_BUFFER_LENGTH];
+
+	struct parsed_resp parsed;
+	size_t prev_d_len;
+	void *prev_data;
+
+	struct list_head unlk_lst;
+};
+
+#ifdef CONFIG_BLK_SED_OPAL
+bool opal_unlock_from_suspend(struct opal_dev *dev);
+void init_opal_dev(struct opal_dev *opal_dev, sec_send_recv *send_recv);
+int sed_ioctl(struct opal_dev *dev, unsigned int cmd, unsigned long ptr);
+
+static inline bool is_sed_ioctl(unsigned int cmd)
+{
+	switch (cmd) {
+	case IOC_OPAL_SAVE:
+	case IOC_OPAL_LOCK_UNLOCK:
+	case IOC_OPAL_TAKE_OWNERSHIP:
+	case IOC_OPAL_ACTIVATE_LSP:
+	case IOC_OPAL_SET_PW:
+	case IOC_OPAL_ACTIVATE_USR:
+	case IOC_OPAL_REVERT_TPR:
+	case IOC_OPAL_LR_SETUP:
+	case IOC_OPAL_ADD_USR_TO_LR:
+	case IOC_OPAL_ENABLE_DISABLE_MBR:
+	case IOC_OPAL_ERASE_LR:
+	case IOC_OPAL_SECURE_ERASE_LR:
+		return true;
+	}
+	return false;
+}
+#else
+static inline bool is_sed_ioctl(unsigned int cmd)
+{
+	return false;
+}
+
+static inline int sed_ioctl(struct opal_dev *dev, unsigned int cmd,
+			    unsigned long ptr)
+{
+	return 0;
+}
+static inline bool opal_unlock_from_suspend(struct opal_dev *dev)
+{
+	return false;
+}
+static inline void init_opal_dev(struct opal_dev *opal_dev,
+				 sec_send_recv *send_recv)
+{
+	opal_dev->supported = false;
+	opal_dev->initialized = true;
+}
+#endif /* CONFIG_BLK_SED_OPAL */
+#endif /* LINUX_OPAL_H */
-- 
cgit v1.2.3


From 2aad40d911eeb7dcac91c669f2762a28134f0eb1 Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Fri, 27 Jan 2017 03:12:57 -0800
Subject: remoteproc: Move qcom_mdt_loader into drivers/soc/qcom

With the remoteproc parts cleaned out of the MDT loader we can move it
to drivers/soc/qcom.

Acked-by: Andy Gross <andy.gross@linaro.org>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 drivers/remoteproc/Kconfig           |   3 -
 drivers/remoteproc/Makefile          |   1 -
 drivers/remoteproc/qcom_adsp_pil.c   |   2 +-
 drivers/remoteproc/qcom_mdt_loader.c | 205 -----------------------------------
 drivers/remoteproc/qcom_mdt_loader.h |  13 ---
 drivers/remoteproc/qcom_q6v5_pil.c   |   2 +-
 drivers/remoteproc/qcom_wcnss.c      |   2 +-
 drivers/soc/qcom/Kconfig             |   4 +
 drivers/soc/qcom/Makefile            |   1 +
 drivers/soc/qcom/mdt_loader.c        | 204 ++++++++++++++++++++++++++++++++++
 include/linux/soc/qcom/mdt_loader.h  |  18 +++
 11 files changed, 230 insertions(+), 225 deletions(-)
 delete mode 100644 drivers/remoteproc/qcom_mdt_loader.c
 delete mode 100644 drivers/remoteproc/qcom_mdt_loader.h
 create mode 100644 drivers/soc/qcom/mdt_loader.c
 create mode 100644 include/linux/soc/qcom/mdt_loader.h

(limited to 'include/linux')

diff --git a/drivers/remoteproc/Kconfig b/drivers/remoteproc/Kconfig
index 71ea703190c6..555dba04b5ae 100644
--- a/drivers/remoteproc/Kconfig
+++ b/drivers/remoteproc/Kconfig
@@ -87,9 +87,6 @@ config QCOM_ADSP_PIL
 config QCOM_RPROC_COMMON
 	tristate
 
-config QCOM_MDT_LOADER
-	tristate
-
 config QCOM_Q6V5_PIL
 	tristate "Qualcomm Hexagon V5 Peripherial Image Loader"
 	depends on OF && ARCH_QCOM
diff --git a/drivers/remoteproc/Makefile b/drivers/remoteproc/Makefile
index d4f9525a226d..ffc5e430df27 100644
--- a/drivers/remoteproc/Makefile
+++ b/drivers/remoteproc/Makefile
@@ -12,7 +12,6 @@ obj-$(CONFIG_OMAP_REMOTEPROC)		+= omap_remoteproc.o
 obj-$(CONFIG_WKUP_M3_RPROC)		+= wkup_m3_rproc.o
 obj-$(CONFIG_DA8XX_REMOTEPROC)		+= da8xx_remoteproc.o
 obj-$(CONFIG_QCOM_ADSP_PIL)		+= qcom_adsp_pil.o
-obj-$(CONFIG_QCOM_MDT_LOADER)		+= qcom_mdt_loader.o
 obj-$(CONFIG_QCOM_RPROC_COMMON)		+= qcom_common.o
 obj-$(CONFIG_QCOM_Q6V5_PIL)		+= qcom_q6v5_pil.o
 obj-$(CONFIG_QCOM_WCNSS_PIL)		+= qcom_wcnss_pil.o
diff --git a/drivers/remoteproc/qcom_adsp_pil.c b/drivers/remoteproc/qcom_adsp_pil.c
index c1ee5c818b42..301b820216f6 100644
--- a/drivers/remoteproc/qcom_adsp_pil.c
+++ b/drivers/remoteproc/qcom_adsp_pil.c
@@ -26,11 +26,11 @@
 #include <linux/qcom_scm.h>
 #include <linux/regulator/consumer.h>
 #include <linux/remoteproc.h>
+#include <linux/soc/qcom/mdt_loader.h>
 #include <linux/soc/qcom/smem.h>
 #include <linux/soc/qcom/smem_state.h>
 
 #include "qcom_common.h"
-#include "qcom_mdt_loader.h"
 #include "remoteproc_internal.h"
 
 struct adsp_data {
diff --git a/drivers/remoteproc/qcom_mdt_loader.c b/drivers/remoteproc/qcom_mdt_loader.c
deleted file mode 100644
index 790ab31f31ff..000000000000
--- a/drivers/remoteproc/qcom_mdt_loader.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
- * Qualcomm Peripheral Image Loader
- *
- * Copyright (C) 2016 Linaro Ltd
- * Copyright (C) 2015 Sony Mobile Communications Inc
- * Copyright (c) 2012-2013, The Linux Foundation. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-
-#include <linux/device.h>
-#include <linux/elf.h>
-#include <linux/firmware.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/qcom_scm.h>
-#include <linux/sizes.h>
-#include <linux/slab.h>
-
-#include "qcom_mdt_loader.h"
-
-static bool mdt_phdr_valid(const struct elf32_phdr *phdr)
-{
-	if (phdr->p_type != PT_LOAD)
-		return false;
-
-	if ((phdr->p_flags & QCOM_MDT_TYPE_MASK) == QCOM_MDT_TYPE_HASH)
-		return false;
-
-	if (!phdr->p_memsz)
-		return false;
-
-	return true;
-}
-
-/**
- * qcom_mdt_get_size() - acquire size of the memory region needed to load mdt
- * @fw:		firmware object for the mdt file
- *
- * Returns size of the loaded firmware blob, or -EINVAL on failure.
- */
-ssize_t qcom_mdt_get_size(const struct firmware *fw)
-{
-	const struct elf32_phdr *phdrs;
-	const struct elf32_phdr *phdr;
-	const struct elf32_hdr *ehdr;
-	phys_addr_t min_addr = (phys_addr_t)ULLONG_MAX;
-	phys_addr_t max_addr = 0;
-	int i;
-
-	ehdr = (struct elf32_hdr *)fw->data;
-	phdrs = (struct elf32_phdr *)(ehdr + 1);
-
-	for (i = 0; i < ehdr->e_phnum; i++) {
-		phdr = &phdrs[i];
-
-		if (!mdt_phdr_valid(phdr))
-			continue;
-
-		if (phdr->p_paddr < min_addr)
-			min_addr = phdr->p_paddr;
-
-		if (phdr->p_paddr + phdr->p_memsz > max_addr)
-			max_addr = ALIGN(phdr->p_paddr + phdr->p_memsz, SZ_4K);
-	}
-
-	return min_addr < max_addr ? max_addr - min_addr : -EINVAL;
-}
-EXPORT_SYMBOL_GPL(qcom_mdt_get_size);
-
-/**
- * qcom_mdt_load() - load the firmware which header is loaded as fw
- * @dev:	device handle to associate resources with
- * @fw:		firmware object for the mdt file
- * @firmware:	name of the firmware, for construction of segment file names
- * @pas_id:	PAS identifier
- * @mem_region:	allocated memory region to load firmware into
- * @mem_phys:	physical address of allocated memory region
- * @mem_size:	size of the allocated memory region
- *
- * Returns 0 on success, negative errno otherwise.
- */
-int qcom_mdt_load(struct device *dev, const struct firmware *fw,
-		  const char *firmware, int pas_id, void *mem_region,
-		  phys_addr_t mem_phys, size_t mem_size)
-{
-	const struct elf32_phdr *phdrs;
-	const struct elf32_phdr *phdr;
-	const struct elf32_hdr *ehdr;
-	const struct firmware *seg_fw;
-	phys_addr_t mem_reloc;
-	phys_addr_t min_addr = (phys_addr_t)ULLONG_MAX;
-	phys_addr_t max_addr = 0;
-	size_t fw_name_len;
-	size_t offset;
-	char *fw_name;
-	bool relocate = false;
-	void *ptr;
-	int ret;
-	int i;
-
-	if (!fw || !mem_region || !mem_phys || !mem_size)
-		return -EINVAL;
-
-	ehdr = (struct elf32_hdr *)fw->data;
-	phdrs = (struct elf32_phdr *)(ehdr + 1);
-
-	fw_name_len = strlen(firmware);
-	if (fw_name_len <= 4)
-		return -EINVAL;
-
-	fw_name = kstrdup(firmware, GFP_KERNEL);
-	if (!fw_name)
-		return -ENOMEM;
-
-	ret = qcom_scm_pas_init_image(pas_id, fw->data, fw->size);
-	if (ret) {
-		dev_err(dev, "invalid firmware metadata\n");
-		goto out;
-	}
-
-	for (i = 0; i < ehdr->e_phnum; i++) {
-		phdr = &phdrs[i];
-
-		if (!mdt_phdr_valid(phdr))
-			continue;
-
-		if (phdr->p_flags & QCOM_MDT_RELOCATABLE)
-			relocate = true;
-
-		if (phdr->p_paddr < min_addr)
-			min_addr = phdr->p_paddr;
-
-		if (phdr->p_paddr + phdr->p_memsz > max_addr)
-			max_addr = ALIGN(phdr->p_paddr + phdr->p_memsz, SZ_4K);
-	}
-
-	if (relocate) {
-		ret = qcom_scm_pas_mem_setup(pas_id, mem_phys, max_addr - min_addr);
-		if (ret) {
-			dev_err(dev, "unable to setup relocation\n");
-			goto out;
-		}
-
-		/*
-		 * The image is relocatable, so offset each segment based on
-		 * the lowest segment address.
-		 */
-		mem_reloc = min_addr;
-	} else {
-		/*
-		 * Image is not relocatable, so offset each segment based on
-		 * the allocated physical chunk of memory.
-		 */
-		mem_reloc = mem_phys;
-	}
-
-	for (i = 0; i < ehdr->e_phnum; i++) {
-		phdr = &phdrs[i];
-
-		if (!mdt_phdr_valid(phdr))
-			continue;
-
-		offset = phdr->p_paddr - mem_reloc;
-		if (offset < 0 || offset + phdr->p_memsz > mem_size) {
-			dev_err(dev, "segment outside memory range\n");
-			ret = -EINVAL;
-			break;
-		}
-
-		ptr = mem_region + offset;
-
-		if (phdr->p_filesz) {
-			sprintf(fw_name + fw_name_len - 3, "b%02d", i);
-			ret = request_firmware(&seg_fw, fw_name, dev);
-			if (ret) {
-				dev_err(dev, "failed to load %s\n", fw_name);
-				break;
-			}
-
-			memcpy(ptr, seg_fw->data, seg_fw->size);
-
-			release_firmware(seg_fw);
-		}
-
-		if (phdr->p_memsz > phdr->p_filesz)
-			memset(ptr + phdr->p_filesz, 0, phdr->p_memsz - phdr->p_filesz);
-	}
-
-out:
-	kfree(fw_name);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(qcom_mdt_load);
-
-MODULE_DESCRIPTION("Firmware parser for Qualcomm MDT format");
-MODULE_LICENSE("GPL v2");
diff --git a/drivers/remoteproc/qcom_mdt_loader.h b/drivers/remoteproc/qcom_mdt_loader.h
deleted file mode 100644
index ff6e45b77326..000000000000
--- a/drivers/remoteproc/qcom_mdt_loader.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef __QCOM_MDT_LOADER_H__
-#define __QCOM_MDT_LOADER_H__
-
-#define QCOM_MDT_TYPE_MASK	(7 << 24)
-#define QCOM_MDT_TYPE_HASH	(2 << 24)
-#define QCOM_MDT_RELOCATABLE	BIT(27)
-
-ssize_t qcom_mdt_get_size(const struct firmware *fw);
-int qcom_mdt_load(struct device *dev, const struct firmware *fw,
-		  const char *fw_name, int pas_id, void *mem_region,
-		  phys_addr_t mem_phys, size_t mem_size);
-
-#endif
diff --git a/drivers/remoteproc/qcom_q6v5_pil.c b/drivers/remoteproc/qcom_q6v5_pil.c
index 2e44c06e604a..b129261d7e5e 100644
--- a/drivers/remoteproc/qcom_q6v5_pil.c
+++ b/drivers/remoteproc/qcom_q6v5_pil.c
@@ -29,12 +29,12 @@
 #include <linux/regulator/consumer.h>
 #include <linux/remoteproc.h>
 #include <linux/reset.h>
+#include <linux/soc/qcom/mdt_loader.h>
 #include <linux/soc/qcom/smem.h>
 #include <linux/soc/qcom/smem_state.h>
 
 #include "remoteproc_internal.h"
 #include "qcom_common.h"
-#include "qcom_mdt_loader.h"
 
 #include <linux/qcom_scm.h>
 
diff --git a/drivers/remoteproc/qcom_wcnss.c b/drivers/remoteproc/qcom_wcnss.c
index fbb25ea4ae8a..781211c144c6 100644
--- a/drivers/remoteproc/qcom_wcnss.c
+++ b/drivers/remoteproc/qcom_wcnss.c
@@ -28,12 +28,12 @@
 #include <linux/qcom_scm.h>
 #include <linux/regulator/consumer.h>
 #include <linux/remoteproc.h>
+#include <linux/soc/qcom/mdt_loader.h>
 #include <linux/soc/qcom/smem.h>
 #include <linux/soc/qcom/smem_state.h>
 #include <linux/rpmsg/qcom_smd.h>
 
 #include "qcom_common.h"
-#include "qcom_mdt_loader.h"
 #include "remoteproc_internal.h"
 #include "qcom_wcnss.h"
 
diff --git a/drivers/soc/qcom/Kconfig b/drivers/soc/qcom/Kconfig
index 461b387d03cc..78b1bb7bcf20 100644
--- a/drivers/soc/qcom/Kconfig
+++ b/drivers/soc/qcom/Kconfig
@@ -10,6 +10,10 @@ config QCOM_GSBI
           functions for connecting the underlying serial UART, SPI, and I2C
           devices to the output pins.
 
+config QCOM_MDT_LOADER
+	tristate
+	select QCOM_SCM
+
 config QCOM_PM
 	bool "Qualcomm Power Management"
 	depends on ARCH_QCOM && !ARM64
diff --git a/drivers/soc/qcom/Makefile b/drivers/soc/qcom/Makefile
index fdd664edf0bd..1f30260b06b8 100644
--- a/drivers/soc/qcom/Makefile
+++ b/drivers/soc/qcom/Makefile
@@ -1,4 +1,5 @@
 obj-$(CONFIG_QCOM_GSBI)	+=	qcom_gsbi.o
+obj-$(CONFIG_QCOM_MDT_LOADER)	+= mdt_loader.o
 obj-$(CONFIG_QCOM_PM)	+=	spm.o
 obj-$(CONFIG_QCOM_SMD) +=	smd.o
 obj-$(CONFIG_QCOM_SMD_RPM)	+= smd-rpm.o
diff --git a/drivers/soc/qcom/mdt_loader.c b/drivers/soc/qcom/mdt_loader.c
new file mode 100644
index 000000000000..98b2373c3a97
--- /dev/null
+++ b/drivers/soc/qcom/mdt_loader.c
@@ -0,0 +1,204 @@
+/*
+ * Qualcomm Peripheral Image Loader
+ *
+ * Copyright (C) 2016 Linaro Ltd
+ * Copyright (C) 2015 Sony Mobile Communications Inc
+ * Copyright (c) 2012-2013, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/device.h>
+#include <linux/elf.h>
+#include <linux/firmware.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/qcom_scm.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/soc/qcom/mdt_loader.h>
+
+static bool mdt_phdr_valid(const struct elf32_phdr *phdr)
+{
+	if (phdr->p_type != PT_LOAD)
+		return false;
+
+	if ((phdr->p_flags & QCOM_MDT_TYPE_MASK) == QCOM_MDT_TYPE_HASH)
+		return false;
+
+	if (!phdr->p_memsz)
+		return false;
+
+	return true;
+}
+
+/**
+ * qcom_mdt_get_size() - acquire size of the memory region needed to load mdt
+ * @fw:		firmware object for the mdt file
+ *
+ * Returns size of the loaded firmware blob, or -EINVAL on failure.
+ */
+ssize_t qcom_mdt_get_size(const struct firmware *fw)
+{
+	const struct elf32_phdr *phdrs;
+	const struct elf32_phdr *phdr;
+	const struct elf32_hdr *ehdr;
+	phys_addr_t min_addr = (phys_addr_t)ULLONG_MAX;
+	phys_addr_t max_addr = 0;
+	int i;
+
+	ehdr = (struct elf32_hdr *)fw->data;
+	phdrs = (struct elf32_phdr *)(ehdr + 1);
+
+	for (i = 0; i < ehdr->e_phnum; i++) {
+		phdr = &phdrs[i];
+
+		if (!mdt_phdr_valid(phdr))
+			continue;
+
+		if (phdr->p_paddr < min_addr)
+			min_addr = phdr->p_paddr;
+
+		if (phdr->p_paddr + phdr->p_memsz > max_addr)
+			max_addr = ALIGN(phdr->p_paddr + phdr->p_memsz, SZ_4K);
+	}
+
+	return min_addr < max_addr ? max_addr - min_addr : -EINVAL;
+}
+EXPORT_SYMBOL_GPL(qcom_mdt_get_size);
+
+/**
+ * qcom_mdt_load() - load the firmware which header is loaded as fw
+ * @dev:	device handle to associate resources with
+ * @fw:		firmware object for the mdt file
+ * @firmware:	name of the firmware, for construction of segment file names
+ * @pas_id:	PAS identifier
+ * @mem_region:	allocated memory region to load firmware into
+ * @mem_phys:	physical address of allocated memory region
+ * @mem_size:	size of the allocated memory region
+ *
+ * Returns 0 on success, negative errno otherwise.
+ */
+int qcom_mdt_load(struct device *dev, const struct firmware *fw,
+		  const char *firmware, int pas_id, void *mem_region,
+		  phys_addr_t mem_phys, size_t mem_size)
+{
+	const struct elf32_phdr *phdrs;
+	const struct elf32_phdr *phdr;
+	const struct elf32_hdr *ehdr;
+	const struct firmware *seg_fw;
+	phys_addr_t mem_reloc;
+	phys_addr_t min_addr = (phys_addr_t)ULLONG_MAX;
+	phys_addr_t max_addr = 0;
+	size_t fw_name_len;
+	size_t offset;
+	char *fw_name;
+	bool relocate = false;
+	void *ptr;
+	int ret;
+	int i;
+
+	if (!fw || !mem_region || !mem_phys || !mem_size)
+		return -EINVAL;
+
+	ehdr = (struct elf32_hdr *)fw->data;
+	phdrs = (struct elf32_phdr *)(ehdr + 1);
+
+	fw_name_len = strlen(firmware);
+	if (fw_name_len <= 4)
+		return -EINVAL;
+
+	fw_name = kstrdup(firmware, GFP_KERNEL);
+	if (!fw_name)
+		return -ENOMEM;
+
+	ret = qcom_scm_pas_init_image(pas_id, fw->data, fw->size);
+	if (ret) {
+		dev_err(dev, "invalid firmware metadata\n");
+		goto out;
+	}
+
+	for (i = 0; i < ehdr->e_phnum; i++) {
+		phdr = &phdrs[i];
+
+		if (!mdt_phdr_valid(phdr))
+			continue;
+
+		if (phdr->p_flags & QCOM_MDT_RELOCATABLE)
+			relocate = true;
+
+		if (phdr->p_paddr < min_addr)
+			min_addr = phdr->p_paddr;
+
+		if (phdr->p_paddr + phdr->p_memsz > max_addr)
+			max_addr = ALIGN(phdr->p_paddr + phdr->p_memsz, SZ_4K);
+	}
+
+	if (relocate) {
+		ret = qcom_scm_pas_mem_setup(pas_id, mem_phys, max_addr - min_addr);
+		if (ret) {
+			dev_err(dev, "unable to setup relocation\n");
+			goto out;
+		}
+
+		/*
+		 * The image is relocatable, so offset each segment based on
+		 * the lowest segment address.
+		 */
+		mem_reloc = min_addr;
+	} else {
+		/*
+		 * Image is not relocatable, so offset each segment based on
+		 * the allocated physical chunk of memory.
+		 */
+		mem_reloc = mem_phys;
+	}
+
+	for (i = 0; i < ehdr->e_phnum; i++) {
+		phdr = &phdrs[i];
+
+		if (!mdt_phdr_valid(phdr))
+			continue;
+
+		offset = phdr->p_paddr - mem_reloc;
+		if (offset < 0 || offset + phdr->p_memsz > mem_size) {
+			dev_err(dev, "segment outside memory range\n");
+			ret = -EINVAL;
+			break;
+		}
+
+		ptr = mem_region + offset;
+
+		if (phdr->p_filesz) {
+			sprintf(fw_name + fw_name_len - 3, "b%02d", i);
+			ret = request_firmware(&seg_fw, fw_name, dev);
+			if (ret) {
+				dev_err(dev, "failed to load %s\n", fw_name);
+				break;
+			}
+
+			memcpy(ptr, seg_fw->data, seg_fw->size);
+
+			release_firmware(seg_fw);
+		}
+
+		if (phdr->p_memsz > phdr->p_filesz)
+			memset(ptr + phdr->p_filesz, 0, phdr->p_memsz - phdr->p_filesz);
+	}
+
+out:
+	kfree(fw_name);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(qcom_mdt_load);
+
+MODULE_DESCRIPTION("Firmware parser for Qualcomm MDT format");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/soc/qcom/mdt_loader.h b/include/linux/soc/qcom/mdt_loader.h
new file mode 100644
index 000000000000..f423001db3a9
--- /dev/null
+++ b/include/linux/soc/qcom/mdt_loader.h
@@ -0,0 +1,18 @@
+#ifndef __QCOM_MDT_LOADER_H__
+#define __QCOM_MDT_LOADER_H__
+
+#include <linux/types.h>
+
+#define QCOM_MDT_TYPE_MASK	(7 << 24)
+#define QCOM_MDT_TYPE_HASH	(2 << 24)
+#define QCOM_MDT_RELOCATABLE	BIT(27)
+
+struct device;
+struct firmware;
+
+ssize_t qcom_mdt_get_size(const struct firmware *fw);
+int qcom_mdt_load(struct device *dev, const struct firmware *fw,
+		  const char *fw_name, int pas_id, void *mem_region,
+		  phys_addr_t mem_phys, size_t mem_size);
+
+#endif
-- 
cgit v1.2.3


From 4d29b2e5ad37a50b186f828d73086cdbf36580bf Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 2 Feb 2017 01:30:49 +0100
Subject: PM / core: Update kerneldoc comments in pm.h

Refresh the struct dev_pm_ops kerneldoc comment, so that it looks
better and is more readable after processing by Sphinx, and drop
the kerneldoc marker from a few other comments ("PM_EVENT_ messages"
and a couple of enum types declarations) which are not proper
kerneldoc and generally confuse Sphinx.

Also change the comment describing struct dev_pm_domain into
a kerneldoc one.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 include/linux/pm.h | 113 +++++++++++++++++++++++++++--------------------------
 1 file changed, 57 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pm.h b/include/linux/pm.h
index f926af41e122..10867b11d503 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -64,24 +64,7 @@ typedef struct pm_message {
 } pm_message_t;
 
 /**
- * struct dev_pm_ops - device PM callbacks
- *
- * Several device power state transitions are externally visible, affecting
- * the state of pending I/O queues and (for drivers that touch hardware)
- * interrupts, wakeups, DMA, and other hardware state.  There may also be
- * internal transitions to various low-power modes which are transparent
- * to the rest of the driver stack (such as a driver that's ON gating off
- * clocks which are not in active use).
- *
- * The externally visible transitions are handled with the help of callbacks
- * included in this structure in such a way that two levels of callbacks are
- * involved.  First, the PM core executes callbacks provided by PM domains,
- * device types, classes and bus types.  They are the subsystem-level callbacks
- * supposed to execute callbacks provided by device drivers, although they may
- * choose not to do that.  If the driver callbacks are executed, they have to
- * collaborate with the subsystem-level callbacks to achieve the goals
- * appropriate for the given system transition, given transition phase and the
- * subsystem the device belongs to.
+ * struct dev_pm_ops - device PM callbacks.
  *
  * @prepare: The principal role of this callback is to prevent new children of
  *	the device from being registered after it has returned (the driver's
@@ -240,34 +223,6 @@ typedef struct pm_message {
  *	driver's interrupt handler, which is guaranteed not to run while
  *	@restore_noirq() is being executed.  Analogous to @resume_noirq().
  *
- * All of the above callbacks, except for @complete(), return error codes.
- * However, the error codes returned by the resume operations, @resume(),
- * @thaw(), @restore(), @resume_noirq(), @thaw_noirq(), and @restore_noirq(), do
- * not cause the PM core to abort the resume transition during which they are
- * returned.  The error codes returned in those cases are only printed by the PM
- * core to the system logs for debugging purposes.  Still, it is recommended
- * that drivers only return error codes from their resume methods in case of an
- * unrecoverable failure (i.e. when the device being handled refuses to resume
- * and becomes unusable) to allow us to modify the PM core in the future, so
- * that it can avoid attempting to handle devices that failed to resume and
- * their children.
- *
- * It is allowed to unregister devices while the above callbacks are being
- * executed.  However, a callback routine must NOT try to unregister the device
- * it was called for, although it may unregister children of that device (for
- * example, if it detects that a child was unplugged while the system was
- * asleep).
- *
- * Refer to Documentation/power/admin-guide/devices.rst for more information about the role
- * of the above callbacks in the system suspend process.
- *
- * There also are callbacks related to runtime power management of devices.
- * Again, these callbacks are executed by the PM core only for subsystems
- * (PM domains, device types, classes and bus types) and the subsystem-level
- * callbacks are supposed to invoke the driver callbacks.  Moreover, the exact
- * actions to be performed by a device driver's callbacks generally depend on
- * the platform and subsystem the device belongs to.
- *
  * @runtime_suspend: Prepare the device for a condition in which it won't be
  *	able to communicate with the CPU(s) and RAM due to power management.
  *	This need not mean that the device should be put into a low-power state.
@@ -287,11 +242,54 @@ typedef struct pm_message {
  *	Check these conditions, and return 0 if it's appropriate to let the PM
  *	core queue a suspend request for the device.
  *
- * Refer to Documentation/power/runtime_pm.txt for more information about the
- * role of the above callbacks in device runtime power management.
+ * Several device power state transitions are externally visible, affecting
+ * the state of pending I/O queues and (for drivers that touch hardware)
+ * interrupts, wakeups, DMA, and other hardware state.  There may also be
+ * internal transitions to various low-power modes which are transparent
+ * to the rest of the driver stack (such as a driver that's ON gating off
+ * clocks which are not in active use).
  *
+ * The externally visible transitions are handled with the help of callbacks
+ * included in this structure in such a way that, typically, two levels of
+ * callbacks are involved.  First, the PM core executes callbacks provided by PM
+ * domains, device types, classes and bus types.  They are the subsystem-level
+ * callbacks expected to execute callbacks provided by device drivers, although
+ * they may choose not to do that.  If the driver callbacks are executed, they
+ * have to collaborate with the subsystem-level callbacks to achieve the goals
+ * appropriate for the given system transition, given transition phase and the
+ * subsystem the device belongs to.
+ *
+ * All of the above callbacks, except for @complete(), return error codes.
+ * However, the error codes returned by @resume(), @thaw(), @restore(),
+ * @resume_noirq(), @thaw_noirq(), and @restore_noirq(), do not cause the PM
+ * core to abort the resume transition during which they are returned.  The
+ * error codes returned in those cases are only printed to the system logs for
+ * debugging purposes.  Still, it is recommended that drivers only return error
+ * codes from their resume methods in case of an unrecoverable failure (i.e.
+ * when the device being handled refuses to resume and becomes unusable) to
+ * allow the PM core to be modified in the future, so that it can avoid
+ * attempting to handle devices that failed to resume and their children.
+ *
+ * It is allowed to unregister devices while the above callbacks are being
+ * executed.  However, a callback routine MUST NOT try to unregister the device
+ * it was called for, although it may unregister children of that device (for
+ * example, if it detects that a child was unplugged while the system was
+ * asleep).
+ *
+ * Refer to Documentation/power/devices.txt for more information about the role
+ * of the above callbacks in the system suspend process.
+ *
+ * There also are callbacks related to runtime power management of devices.
+ * Again, as a rule these callbacks are executed by the PM core for subsystems
+ * (PM domains, device types, classes and bus types) and the subsystem-level
+ * callbacks are expected to invoke the driver callbacks.  Moreover, the exact
+ * actions to be performed by a device driver's callbacks generally depend on
+ * the platform and subsystem the device belongs to.
+ *
+ * Refer to Documentation/power/runtime_pm.txt for more information about the
+ * role of the @runtime_suspend(), @runtime_resume() and @runtime_idle()
+ * callbacks in device runtime power management.
  */
-
 struct dev_pm_ops {
 	int (*prepare)(struct device *dev);
 	void (*complete)(struct device *dev);
@@ -391,7 +389,7 @@ const struct dev_pm_ops name = { \
 	SET_RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \
 }
 
-/**
+/*
  * PM_EVENT_ messages
  *
  * The following PM_EVENT_ messages are defined for the internal use of the PM
@@ -487,7 +485,7 @@ const struct dev_pm_ops name = { \
 
 #define PMSG_IS_AUTO(msg)	(((msg).event & PM_EVENT_AUTO) != 0)
 
-/**
+/*
  * Device run-time power management status.
  *
  * These status labels are used internally by the PM core to indicate the
@@ -517,7 +515,7 @@ enum rpm_status {
 	RPM_SUSPENDING,
 };
 
-/**
+/*
  * Device run-time power management request types.
  *
  * RPM_REQ_NONE		Do nothing.
@@ -616,15 +614,18 @@ extern void update_pm_runtime_accounting(struct device *dev);
 extern int dev_pm_get_subsys_data(struct device *dev);
 extern void dev_pm_put_subsys_data(struct device *dev);
 
-/*
- * Power domains provide callbacks that are executed during system suspend,
- * hibernation, system resume and during runtime PM transitions along with
- * subsystem-level and driver-level callbacks.
+/**
+ * struct dev_pm_domain - power management domain representation.
  *
+ * @ops: Power management operations associated with this domain.
  * @detach: Called when removing a device from the domain.
  * @activate: Called before executing probe routines for bus types and drivers.
  * @sync: Called after successful driver probe.
  * @dismiss: Called after unsuccessful driver probe and after driver removal.
+ *
+ * Power domains provide callbacks that are executed during system suspend,
+ * hibernation, system resume and during runtime PM transitions instead of
+ * subsystem-level and driver-level callbacks.
  */
 struct dev_pm_domain {
 	struct dev_pm_ops	ops;
-- 
cgit v1.2.3


From 2728b2d2e5be4b828a523a06089cd605419fc65c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 2 Feb 2017 01:32:13 +0100
Subject: PM / core / docs: Convert sleep states API document to reST

Move the document describing the system sleep state transitions API
for devices to Documentation/driver-api/pm/, convert it to reST and
update it to use current terminology.  Also remove the remaining
reference to the old version of it from pm.h.

The new document still contains references to some documents in the
.txt format that will be converted later.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/driver-api/index.rst      |   1 +
 Documentation/driver-api/pm/conf.py     |  10 +
 Documentation/driver-api/pm/devices.rst | 732 ++++++++++++++++++++++++++++++++
 Documentation/driver-api/pm/index.rst   |  15 +
 Documentation/driver-api/pm/types.rst   |   5 +
 Documentation/power/devices.txt         | 716 -------------------------------
 include/linux/pm.h                      |   3 -
 7 files changed, 763 insertions(+), 719 deletions(-)
 create mode 100644 Documentation/driver-api/pm/conf.py
 create mode 100644 Documentation/driver-api/pm/devices.rst
 create mode 100644 Documentation/driver-api/pm/index.rst
 create mode 100644 Documentation/driver-api/pm/types.rst
 delete mode 100644 Documentation/power/devices.txt

(limited to 'include/linux')

diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst
index 9cdf81b59ec5..ea580c0aa232 100644
--- a/Documentation/driver-api/index.rst
+++ b/Documentation/driver-api/index.rst
@@ -16,6 +16,7 @@ available subsections can be seen below.
 
    basics
    infrastructure
+   pm/index
    device-io
    dma-buf
    device_link
diff --git a/Documentation/driver-api/pm/conf.py b/Documentation/driver-api/pm/conf.py
new file mode 100644
index 000000000000..a89fac11272f
--- /dev/null
+++ b/Documentation/driver-api/pm/conf.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8; mode: python -*-
+
+project = "Device Power Management"
+
+tags.add("subproject")
+
+latex_documents = [
+    ('index', 'pm.tex', project,
+     'The kernel development community', 'manual'),
+]
diff --git a/Documentation/driver-api/pm/devices.rst b/Documentation/driver-api/pm/devices.rst
new file mode 100644
index 000000000000..f165cf6f733e
--- /dev/null
+++ b/Documentation/driver-api/pm/devices.rst
@@ -0,0 +1,732 @@
+.. |struct| replace:: :c:type:`struct`
+
+==============================
+Device Power Management Basics
+==============================
+
+::
+
+ Copyright (c) 2010-2011 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
+ Copyright (c) 2010 Alan Stern <stern@rowland.harvard.edu>
+ Copyright (c) 2016 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+Most of the code in Linux is device drivers, so most of the Linux power
+management (PM) code is also driver-specific.  Most drivers will do very
+little; others, especially for platforms with small batteries (like cell
+phones), will do a lot.
+
+This writeup gives an overview of how drivers interact with system-wide
+power management goals, emphasizing the models and interfaces that are
+shared by everything that hooks up to the driver model core.  Read it as
+background for the domain-specific work you'd do with any specific driver.
+
+
+Two Models for Device Power Management
+======================================
+
+Drivers will use one or both of these models to put devices into low-power
+states:
+
+    System Sleep model:
+
+	Drivers can enter low-power states as part of entering system-wide
+	low-power states like "suspend" (also known as "suspend-to-RAM"), or
+	(mostly for systems with disks) "hibernation" (also known as
+	"suspend-to-disk").
+
+	This is something that device, bus, and class drivers collaborate on
+	by implementing various role-specific suspend and resume methods to
+	cleanly power down hardware and software subsystems, then reactivate
+	them without loss of data.
+
+	Some drivers can manage hardware wakeup events, which make the system
+	leave the low-power state.  This feature may be enabled or disabled
+	using the relevant :file:`/sys/devices/.../power/wakeup` file (for
+	Ethernet drivers the ioctl interface used by ethtool may also be used
+	for this purpose); enabling it may cost some power usage, but let the
+	whole system enter low-power states more often.
+
+    Runtime Power Management model:
+
+	Devices may also be put into low-power states while the system is
+	running, independently of other power management activity in principle.
+	However, devices are not generally independent of each other (for
+	example, a parent device cannot be suspended unless all of its child
+	devices have been suspended).  Moreover, depending on the bus type the
+	device is on, it may be necessary to carry out some bus-specific
+	operations on the device for this purpose.  Devices put into low power
+	states at run time may require special handling during system-wide power
+	transitions (suspend or hibernation).
+
+	For these reasons not only the device driver itself, but also the
+	appropriate subsystem (bus type, device type or device class) driver and
+	the PM core are involved in runtime power management.  As in the system
+	sleep power management case, they need to collaborate by implementing
+	various role-specific suspend and resume methods, so that the hardware
+	is cleanly powered down and reactivated without data or service loss.
+
+There's not a lot to be said about those low-power states except that they are
+very system-specific, and often device-specific.  Also, that if enough devices
+have been put into low-power states (at runtime), the effect may be very similar
+to entering some system-wide low-power state (system sleep) ... and that
+synergies exist, so that several drivers using runtime PM might put the system
+into a state where even deeper power saving options are available.
+
+Most suspended devices will have quiesced all I/O: no more DMA or IRQs (except
+for wakeup events), no more data read or written, and requests from upstream
+drivers are no longer accepted.  A given bus or platform may have different
+requirements though.
+
+Examples of hardware wakeup events include an alarm from a real time clock,
+network wake-on-LAN packets, keyboard or mouse activity, and media insertion
+or removal (for PCMCIA, MMC/SD, USB, and so on).
+
+Interfaces for Entering System Sleep States
+===========================================
+
+There are programming interfaces provided for subsystems (bus type, device type,
+device class) and device drivers to allow them to participate in the power
+management of devices they are concerned with.  These interfaces cover both
+system sleep and runtime power management.
+
+
+Device Power Management Operations
+----------------------------------
+
+Device power management operations, at the subsystem level as well as at the
+device driver level, are implemented by defining and populating objects of type
+|struct| :c:type:`dev_pm_ops` defined in :file:`include/linux/pm.h`.
+The roles of the methods included in it will be explained in what follows.  For
+now, it should be sufficient to remember that the last three methods are
+specific to runtime power management while the remaining ones are used during
+system-wide power transitions.
+
+There also is a deprecated "old" or "legacy" interface for power management
+operations available at least for some subsystems.  This approach does not use
+|struct| :c:type:`dev_pm_ops` objects and it is suitable only for implementing
+system sleep power management methods in a limited way.  Therefore it is not
+described in this document, so please refer directly to the source code for more
+information about it.
+
+
+Subsystem-Level Methods
+-----------------------
+
+The core methods to suspend and resume devices reside in
+|struct| :c:type:`dev_pm_ops` pointed to by the :c:member:`ops`
+member of |struct| :c:type:`dev_pm_domain`, or by the :c:member:`pm`
+member of |struct| :c:type:`bus_type`, |struct| :c:type:`device_type` and
+|struct| :c:type:`class`.  They are mostly of interest to the people writing
+infrastructure for platforms and buses, like PCI or USB, or device type and
+device class drivers.  They also are relevant to the writers of device drivers
+whose subsystems (PM domains, device types, device classes and bus types) don't
+provide all power management methods.
+
+Bus drivers implement these methods as appropriate for the hardware and the
+drivers using it; PCI works differently from USB, and so on.  Not many people
+write subsystem-level drivers; most driver code is a "device driver" that builds
+on top of bus-specific framework code.
+
+For more information on these driver calls, see the description later;
+they are called in phases for every device, respecting the parent-child
+sequencing in the driver model tree.
+
+
+:file:`/sys/devices/.../power/wakeup` files
+-------------------------------------------
+
+All device objects in the driver model contain fields that control the handling
+of system wakeup events (hardware signals that can force the system out of a
+sleep state).  These fields are initialized by bus or device driver code using
+:c:func:`device_set_wakeup_capable()` and :c:func:`device_set_wakeup_enable()`,
+defined in :file:`include/linux/pm_wakeup.h`.
+
+The :c:member:`power.can_wakeup` flag just records whether the device (and its
+driver) can physically support wakeup events.  The
+:c:func:`device_set_wakeup_capable()` routine affects this flag.  The
+:c:member:`power.wakeup` field is a pointer to an object of type
+|struct| :c:type:`wakeup_source` used for controlling whether or not
+the device should use its system wakeup mechanism and for notifying the
+PM core of system wakeup events signaled by the device.  This object is only
+present for wakeup-capable devices (i.e. devices whose
+:c:member:`can_wakeup` flags are set) and is created (or removed) by
+:c:func:`device_set_wakeup_capable()`.
+
+Whether or not a device is capable of issuing wakeup events is a hardware
+matter, and the kernel is responsible for keeping track of it.  By contrast,
+whether or not a wakeup-capable device should issue wakeup events is a policy
+decision, and it is managed by user space through a sysfs attribute: the
+:file:`power/wakeup` file.  User space can write the "enabled" or "disabled"
+strings to it to indicate whether or not, respectively, the device is supposed
+to signal system wakeup.  This file is only present if the
+:c:member:`power.wakeup` object exists for the given device and is created (or
+removed) along with that object, by :c:func:`device_set_wakeup_capable()`.
+Reads from the file will return the corresponding string.
+
+The initial value in the :file:`power/wakeup` file is "disabled" for the
+majority of devices; the major exceptions are power buttons, keyboards, and
+Ethernet adapters whose WoL (wake-on-LAN) feature has been set up with ethtool.
+It should also default to "enabled" for devices that don't generate wakeup
+requests on their own but merely forward wakeup requests from one bus to another
+(like PCI Express ports).
+
+The :c:func:`device_may_wakeup()` routine returns true only if the
+:c:member:`power.wakeup` object exists and the corresponding :file:`power/wakeup`
+file contains the "enabled" string.  This information is used by subsystems,
+like the PCI bus type code, to see whether or not to enable the devices' wakeup
+mechanisms.  If device wakeup mechanisms are enabled or disabled directly by
+drivers, they also should use :c:func:`device_may_wakeup()` to decide what to do
+during a system sleep transition.  Device drivers, however, are not expected to
+call :c:func:`device_set_wakeup_enable()` directly in any case.
+
+It ought to be noted that system wakeup is conceptually different from "remote
+wakeup" used by runtime power management, although it may be supported by the
+same physical mechanism.  Remote wakeup is a feature allowing devices in
+low-power states to trigger specific interrupts to signal conditions in which
+they should be put into the full-power state.  Those interrupts may or may not
+be used to signal system wakeup events, depending on the hardware design.  On
+some systems it is impossible to trigger them from system sleep states.  In any
+case, remote wakeup should always be enabled for runtime power management for
+all devices and drivers that support it.
+
+
+:file:`/sys/devices/.../power/control` files
+--------------------------------------------
+
+Each device in the driver model has a flag to control whether it is subject to
+runtime power management.  This flag, :c:member:`runtime_auto`, is initialized
+by the bus type (or generally subsystem) code using :c:func:`pm_runtime_allow()`
+or :c:func:`pm_runtime_forbid()`; the default is to allow runtime power
+management.
+
+The setting can be adjusted by user space by writing either "on" or "auto" to
+the device's :file:`power/control` sysfs file.  Writing "auto" calls
+:c:func:`pm_runtime_allow()`, setting the flag and allowing the device to be
+runtime power-managed by its driver.  Writing "on" calls
+:c:func:`pm_runtime_forbid()`, clearing the flag, returning the device to full
+power if it was in a low-power state, and preventing the
+device from being runtime power-managed.  User space can check the current value
+of the :c:member:`runtime_auto` flag by reading that file.
+
+The device's :c:member:`runtime_auto` flag has no effect on the handling of
+system-wide power transitions.  In particular, the device can (and in the
+majority of cases should and will) be put into a low-power state during a
+system-wide transition to a sleep state even though its :c:member:`runtime_auto`
+flag is clear.
+
+For more information about the runtime power management framework, refer to
+:file:`Documentation/power/runtime_pm.txt`.
+
+
+Calling Drivers to Enter and Leave System Sleep States
+======================================================
+
+When the system goes into a sleep state, each device's driver is asked to
+suspend the device by putting it into a state compatible with the target
+system state.  That's usually some version of "off", but the details are
+system-specific.  Also, wakeup-enabled devices will usually stay partly
+functional in order to wake the system.
+
+When the system leaves that low-power state, the device's driver is asked to
+resume it by returning it to full power.  The suspend and resume operations
+always go together, and both are multi-phase operations.
+
+For simple drivers, suspend might quiesce the device using class code
+and then turn its hardware as "off" as possible during suspend_noirq.  The
+matching resume calls would then completely reinitialize the hardware
+before reactivating its class I/O queues.
+
+More power-aware drivers might prepare the devices for triggering system wakeup
+events.
+
+
+Call Sequence Guarantees
+------------------------
+
+To ensure that bridges and similar links needing to talk to a device are
+available when the device is suspended or resumed, the device hierarchy is
+walked in a bottom-up order to suspend devices.  A top-down order is
+used to resume those devices.
+
+The ordering of the device hierarchy is defined by the order in which devices
+get registered:  a child can never be registered, probed or resumed before
+its parent; and can't be removed or suspended after that parent.
+
+The policy is that the device hierarchy should match hardware bus topology.
+[Or at least the control bus, for devices which use multiple busses.]
+In particular, this means that a device registration may fail if the parent of
+the device is suspending (i.e. has been chosen by the PM core as the next
+device to suspend) or has already suspended, as well as after all of the other
+devices have been suspended.  Device drivers must be prepared to cope with such
+situations.
+
+
+System Power Management Phases
+------------------------------
+
+Suspending or resuming the system is done in several phases.  Different phases
+are used for suspend-to-idle, shallow (standby), and deep ("suspend-to-RAM")
+sleep states and the hibernation state ("suspend-to-disk").  Each phase involves
+executing callbacks for every device before the next phase begins.  Not all
+buses or classes support all these callbacks and not all drivers use all the
+callbacks.  The various phases always run after tasks have been frozen and
+before they are unfrozen.  Furthermore, the ``*_noirq phases`` run at a time
+when IRQ handlers have been disabled (except for those marked with the
+IRQF_NO_SUSPEND flag).
+
+All phases use PM domain, bus, type, class or driver callbacks (that is, methods
+defined in ``dev->pm_domain->ops``, ``dev->bus->pm``, ``dev->type->pm``,
+``dev->class->pm`` or ``dev->driver->pm``).  These callbacks are regarded by the
+PM core as mutually exclusive.  Moreover, PM domain callbacks always take
+precedence over all of the other callbacks and, for example, type callbacks take
+precedence over bus, class and driver callbacks.  To be precise, the following
+rules are used to determine which callback to execute in the given phase:
+
+    1.	If ``dev->pm_domain`` is present, the PM core will choose the callback
+	provided by ``dev->pm_domain->ops`` for execution.
+
+    2.	Otherwise, if both ``dev->type`` and ``dev->type->pm`` are present, the
+	callback provided by ``dev->type->pm`` will be chosen for execution.
+
+    3.	Otherwise, if both ``dev->class`` and ``dev->class->pm`` are present,
+	the callback provided by ``dev->class->pm`` will be chosen for
+	execution.
+
+    4.	Otherwise, if both ``dev->bus`` and ``dev->bus->pm`` are present, the
+	callback provided by ``dev->bus->pm`` will be chosen for execution.
+
+This allows PM domains and device types to override callbacks provided by bus
+types or device classes if necessary.
+
+The PM domain, type, class and bus callbacks may in turn invoke device- or
+driver-specific methods stored in ``dev->driver->pm``, but they don't have to do
+that.
+
+If the subsystem callback chosen for execution is not present, the PM core will
+execute the corresponding method from the ``dev->driver->pm`` set instead if
+there is one.
+
+
+Entering System Suspend
+-----------------------
+
+When the system goes into the freeze, standby or memory sleep state,
+the phases are: ``prepare``, ``suspend``, ``suspend_late``, ``suspend_noirq``.
+
+    1.	The ``prepare`` phase is meant to prevent races by preventing new
+	devices from being registered; the PM core would never know that all the
+	children of a device had been suspended if new children could be
+	registered at will.  [By contrast, from the PM core's perspective,
+	devices may be unregistered at any time.]  Unlike the other
+	suspend-related phases, during the ``prepare`` phase the device
+	hierarchy is traversed top-down.
+
+	After the ``->prepare`` callback method returns, no new children may be
+	registered below the device.  The method may also prepare the device or
+	driver in some way for the upcoming system power transition, but it
+	should not put the device into a low-power state.
+
+	For devices supporting runtime power management, the return value of the
+	prepare callback can be used to indicate to the PM core that it may
+	safely leave the device in runtime suspend (if runtime-suspended
+	already), provided that all of the device's descendants are also left in
+	runtime suspend.  Namely, if the prepare callback returns a positive
+	number and that happens for all of the descendants of the device too,
+	and all of them (including the device itself) are runtime-suspended, the
+	PM core will skip the ``suspend``, ``suspend_late`` and
+	``suspend_noirq`` phases as well as all of the corresponding phases of
+	the subsequent device resume for all of these devices.	In that case,
+	the ``->complete`` callback will be invoked directly after the
+	``->prepare`` callback and is entirely responsible for putting the
+	device into a consistent state as appropriate.
+
+	Note that this direct-complete procedure applies even if the device is
+	disabled for runtime PM; only the runtime-PM status matters.  It follows
+	that if a device has system-sleep callbacks but does not support runtime
+	PM, then its prepare callback must never return a positive value.  This
+	is because all such devices are initially set to runtime-suspended with
+	runtime PM disabled.
+
+    2.	The ``->suspend`` methods should quiesce the device to stop it from
+	performing I/O.  They also may save the device registers and put it into
+	the appropriate low-power state, depending on the bus type the device is
+	on, and they may enable wakeup events.
+
+    3.	For a number of devices it is convenient to split suspend into the
+	"quiesce device" and "save device state" phases, in which cases
+	``suspend_late`` is meant to do the latter.  It is always executed after
+	runtime power management has been disabled for the device in question.
+
+    4.	The ``suspend_noirq`` phase occurs after IRQ handlers have been disabled,
+	which means that the driver's interrupt handler will not be called while
+	the callback method is running.  The ``->suspend_noirq`` methods should
+	save the values of the device's registers that weren't saved previously
+	and finally put the device into the appropriate low-power state.
+
+	The majority of subsystems and device drivers need not implement this
+	callback.  However, bus types allowing devices to share interrupt
+	vectors, like PCI, generally need it; otherwise a driver might encounter
+	an error during the suspend phase by fielding a shared interrupt
+	generated by some other device after its own device had been set to low
+	power.
+
+At the end of these phases, drivers should have stopped all I/O transactions
+(DMA, IRQs), saved enough state that they can re-initialize or restore previous
+state (as needed by the hardware), and placed the device into a low-power state.
+On many platforms they will gate off one or more clock sources; sometimes they
+will also switch off power supplies or reduce voltages.  [Drivers supporting
+runtime PM may already have performed some or all of these steps.]
+
+If :c:func:`device_may_wakeup(dev)` returns ``true``, the device should be
+prepared for generating hardware wakeup signals to trigger a system wakeup event
+when the system is in the sleep state.  For example, :c:func:`enable_irq_wake()`
+might identify GPIO signals hooked up to a switch or other external hardware,
+and :c:func:`pci_enable_wake()` does something similar for the PCI PME signal.
+
+If any of these callbacks returns an error, the system won't enter the desired
+low-power state.  Instead, the PM core will unwind its actions by resuming all
+the devices that were suspended.
+
+
+Leaving System Suspend
+----------------------
+
+When resuming from freeze, standby or memory sleep, the phases are:
+``resume_noirq``, ``resume_early``, ``resume``, ``complete``.
+
+    1.	The ``->resume_noirq`` callback methods should perform any actions
+	needed before the driver's interrupt handlers are invoked.  This
+	generally means undoing the actions of the ``suspend_noirq`` phase.  If
+	the bus type permits devices to share interrupt vectors, like PCI, the
+	method should bring the device and its driver into a state in which the
+	driver can recognize if the device is the source of incoming interrupts,
+	if any, and handle them correctly.
+
+	For example, the PCI bus type's ``->pm.resume_noirq()`` puts the device
+	into the full-power state (D0 in the PCI terminology) and restores the
+	standard configuration registers of the device.  Then it calls the
+	device driver's ``->pm.resume_noirq()`` method to perform device-specific
+	actions.
+
+    2.	The ``->resume_early`` methods should prepare devices for the execution
+	of the resume methods.  This generally involves undoing the actions of
+	the preceding ``suspend_late`` phase.
+
+    3.	The ``->resume`` methods should bring the device back to its operating
+	state, so that it can perform normal I/O.  This generally involves
+	undoing the actions of the ``suspend`` phase.
+
+    4.	The ``complete`` phase should undo the actions of the ``prepare`` phase.
+        For this reason, unlike the other resume-related phases, during the
+        ``complete`` phase the device hierarchy is traversed bottom-up.
+
+	Note, however, that new children may be registered below the device as
+	soon as the ``->resume`` callbacks occur; it's not necessary to wait
+	until the ``complete`` phase with that.
+
+	Moreover, if the preceding ``->prepare`` callback returned a positive
+	number, the device may have been left in runtime suspend throughout the
+	whole system suspend and resume (the ``suspend``, ``suspend_late``,
+	``suspend_noirq`` phases of system suspend and the ``resume_noirq``,
+	``resume_early``, ``resume`` phases of system resume may have been
+	skipped for it).  In that case, the ``->complete`` callback is entirely
+	responsible for putting the device into a consistent state after system
+	suspend if necessary.  [For example, it may need to queue up a runtime
+	resume request for the device for this purpose.]  To check if that is
+	the case, the ``->complete`` callback can consult the device's
+	``power.direct_complete`` flag.  Namely, if that flag is set when the
+	``->complete`` callback is being run, it has been called directly after
+	the preceding ``->prepare`` and special actions may be required
+	to make the device work correctly afterward.
+
+At the end of these phases, drivers should be as functional as they were before
+suspending: I/O can be performed using DMA and IRQs, and the relevant clocks are
+gated on.
+
+However, the details here may again be platform-specific.  For example,
+some systems support multiple "run" states, and the mode in effect at
+the end of resume might not be the one which preceded suspension.
+That means availability of certain clocks or power supplies changed,
+which could easily affect how a driver works.
+
+Drivers need to be able to handle hardware which has been reset since all of the
+suspend methods were called, for example by complete reinitialization.
+This may be the hardest part, and the one most protected by NDA'd documents
+and chip errata.  It's simplest if the hardware state hasn't changed since
+the suspend was carried out, but that can only be guaranteed if the target
+system sleep entered was suspend-to-idle.  For the other system sleep states
+that may not be the case (and usually isn't for ACPI-defined system sleep
+states, like S3).
+
+Drivers must also be prepared to notice that the device has been removed
+while the system was powered down, whenever that's physically possible.
+PCMCIA, MMC, USB, Firewire, SCSI, and even IDE are common examples of busses
+where common Linux platforms will see such removal.  Details of how drivers
+will notice and handle such removals are currently bus-specific, and often
+involve a separate thread.
+
+These callbacks may return an error value, but the PM core will ignore such
+errors since there's nothing it can do about them other than printing them in
+the system log.
+
+
+Entering Hibernation
+--------------------
+
+Hibernating the system is more complicated than putting it into sleep states,
+because it involves creating and saving a system image.  Therefore there are
+more phases for hibernation, with a different set of callbacks.  These phases
+always run after tasks have been frozen and enough memory has been freed.
+
+The general procedure for hibernation is to quiesce all devices ("freeze"),
+create an image of the system memory while everything is stable, reactivate all
+devices ("thaw"), write the image to permanent storage, and finally shut down
+the system ("power off").  The phases used to accomplish this are: ``prepare``,
+``freeze``, ``freeze_late``, ``freeze_noirq``, ``thaw_noirq``, ``thaw_early``,
+``thaw``, ``complete``, ``prepare``, ``poweroff``, ``poweroff_late``,
+``poweroff_noirq``.
+
+    1.	The ``prepare`` phase is discussed in the "Entering System Suspend"
+	section above.
+
+    2.	The ``->freeze`` methods should quiesce the device so that it doesn't
+	generate IRQs or DMA, and they may need to save the values of device
+	registers.  However the device does not have to be put in a low-power
+	state, and to save time it's best not to do so.  Also, the device should
+	not be prepared to generate wakeup events.
+
+    3.	The ``freeze_late`` phase is analogous to the ``suspend_late`` phase
+	described earlier, except that the device should not be put into a
+	low-power state and should not be allowed to generate wakeup events.
+
+    4.	The ``freeze_noirq`` phase is analogous to the ``suspend_noirq`` phase
+	discussed earlier, except again that the device should not be put into
+	a low-power state and should not be allowed to generate wakeup events.
+
+At this point the system image is created.  All devices should be inactive and
+the contents of memory should remain undisturbed while this happens, so that the
+image forms an atomic snapshot of the system state.
+
+    5.	The ``thaw_noirq`` phase is analogous to the ``resume_noirq`` phase
+	discussed earlier.  The main difference is that its methods can assume
+	the device is in the same state as at the end of the ``freeze_noirq``
+	phase.
+
+    6.	The ``thaw_early`` phase is analogous to the ``resume_early`` phase
+	described above.  Its methods should undo the actions of the preceding
+	``freeze_late``, if necessary.
+
+    7.	The ``thaw`` phase is analogous to the ``resume`` phase discussed
+	earlier.  Its methods should bring the device back to an operating
+	state, so that it can be used for saving the image if necessary.
+
+    8.	The ``complete`` phase is discussed in the "Leaving System Suspend"
+	section above.
+
+At this point the system image is saved, and the devices then need to be
+prepared for the upcoming system shutdown.  This is much like suspending them
+before putting the system into the suspend-to-idle, shallow or deep sleep state,
+and the phases are similar.
+
+    9.	The ``prepare`` phase is discussed above.
+
+    10.	The ``poweroff`` phase is analogous to the ``suspend`` phase.
+
+    11.	The ``poweroff_late`` phase is analogous to the ``suspend_late`` phase.
+
+    12.	The ``poweroff_noirq`` phase is analogous to the ``suspend_noirq`` phase.
+
+The ``->poweroff``, ``->poweroff_late`` and ``->poweroff_noirq`` callbacks
+should do essentially the same things as the ``->suspend``, ``->suspend_late``
+and ``->suspend_noirq`` callbacks, respectively.  The only notable difference is
+that they need not store the device register values, because the registers
+should already have been stored during the ``freeze``, ``freeze_late`` or
+``freeze_noirq`` phases.
+
+
+Leaving Hibernation
+-------------------
+
+Resuming from hibernation is, again, more complicated than resuming from a sleep
+state in which the contents of main memory are preserved, because it requires
+a system image to be loaded into memory and the pre-hibernation memory contents
+to be restored before control can be passed back to the image kernel.
+
+Although in principle the image might be loaded into memory and the
+pre-hibernation memory contents restored by the boot loader, in practice this
+can't be done because boot loaders aren't smart enough and there is no
+established protocol for passing the necessary information.  So instead, the
+boot loader loads a fresh instance of the kernel, called "the restore kernel",
+into memory and passes control to it in the usual way.  Then the restore kernel
+reads the system image, restores the pre-hibernation memory contents, and passes
+control to the image kernel.  Thus two different kernel instances are involved
+in resuming from hibernation.  In fact, the restore kernel may be completely
+different from the image kernel: a different configuration and even a different
+version.  This has important consequences for device drivers and their
+subsystems.
+
+To be able to load the system image into memory, the restore kernel needs to
+include at least a subset of device drivers allowing it to access the storage
+medium containing the image, although it doesn't need to include all of the
+drivers present in the image kernel.  After the image has been loaded, the
+devices managed by the boot kernel need to be prepared for passing control back
+to the image kernel.  This is very similar to the initial steps involved in
+creating a system image, and it is accomplished in the same way, using
+``prepare``, ``freeze``, and ``freeze_noirq`` phases.  However, the devices
+affected by these phases are only those having drivers in the restore kernel;
+other devices will still be in whatever state the boot loader left them.
+
+Should the restoration of the pre-hibernation memory contents fail, the restore
+kernel would go through the "thawing" procedure described above, using the
+``thaw_noirq``, ``thaw_early``, ``thaw``, and ``complete`` phases, and then
+continue running normally.  This happens only rarely.  Most often the
+pre-hibernation memory contents are restored successfully and control is passed
+to the image kernel, which then becomes responsible for bringing the system back
+to the working state.
+
+To achieve this, the image kernel must restore the devices' pre-hibernation
+functionality.  The operation is much like waking up from a sleep state (with
+the memory contents preserved), although it involves different phases:
+``restore_noirq``, ``restore_early``, ``restore``, ``complete``.
+
+    1.	The ``restore_noirq`` phase is analogous to the ``resume_noirq`` phase.
+
+    2.	The ``restore_early`` phase is analogous to the ``resume_early`` phase.
+
+    3.	The ``restore`` phase is analogous to the ``resume`` phase.
+
+    4.	The ``complete`` phase is discussed above.
+
+The main difference from ``resume[_early|_noirq]`` is that
+``restore[_early|_noirq]`` must assume the device has been accessed and
+reconfigured by the boot loader or the restore kernel.  Consequently, the state
+of the device may be different from the state remembered from the ``freeze``,
+``freeze_late`` and ``freeze_noirq`` phases.  The device may even need to be
+reset and completely re-initialized.  In many cases this difference doesn't
+matter, so the ``->resume[_early|_noirq]`` and ``->restore[_early|_norq]``
+method pointers can be set to the same routines.  Nevertheless, different
+callback pointers are used in case there is a situation where it actually does
+matter.
+
+
+Power Management Notifiers
+==========================
+
+There are some operations that cannot be carried out by the power management
+callbacks discussed above, because the callbacks occur too late or too early.
+To handle these cases, subsystems and device drivers may register power
+management notifiers that are called before tasks are frozen and after they have
+been thawed.  Generally speaking, the PM notifiers are suitable for performing
+actions that either require user space to be available, or at least won't
+interfere with user space.
+
+For details refer to :file:`Documentation/power/notifiers.txt`.
+
+
+Device Low-Power (suspend) States
+=================================
+
+Device low-power states aren't standard.  One device might only handle
+"on" and "off", while another might support a dozen different versions of
+"on" (how many engines are active?), plus a state that gets back to "on"
+faster than from a full "off".
+
+Some buses define rules about what different suspend states mean.  PCI
+gives one example: after the suspend sequence completes, a non-legacy
+PCI device may not perform DMA or issue IRQs, and any wakeup events it
+issues would be issued through the PME# bus signal.  Plus, there are
+several PCI-standard device states, some of which are optional.
+
+In contrast, integrated system-on-chip processors often use IRQs as the
+wakeup event sources (so drivers would call :c:func:`enable_irq_wake`) and
+might be able to treat DMA completion as a wakeup event (sometimes DMA can stay
+active too, it'd only be the CPU and some peripherals that sleep).
+
+Some details here may be platform-specific.  Systems may have devices that
+can be fully active in certain sleep states, such as an LCD display that's
+refreshed using DMA while most of the system is sleeping lightly ... and
+its frame buffer might even be updated by a DSP or other non-Linux CPU while
+the Linux control processor stays idle.
+
+Moreover, the specific actions taken may depend on the target system state.
+One target system state might allow a given device to be very operational;
+another might require a hard shut down with re-initialization on resume.
+And two different target systems might use the same device in different
+ways; the aforementioned LCD might be active in one product's "standby",
+but a different product using the same SOC might work differently.
+
+
+Device Power Management Domains
+===============================
+
+Sometimes devices share reference clocks or other power resources.  In those
+cases it generally is not possible to put devices into low-power states
+individually.  Instead, a set of devices sharing a power resource can be put
+into a low-power state together at the same time by turning off the shared
+power resource.  Of course, they also need to be put into the full-power state
+together, by turning the shared power resource on.  A set of devices with this
+property is often referred to as a power domain. A power domain may also be
+nested inside another power domain. The nested domain is referred to as the
+sub-domain of the parent domain.
+
+Support for power domains is provided through the :c:member:`pm_domain` field of
+|struct| :c:type:`device`.  This field is a pointer to an object of
+type |struct| :c:type:`dev_pm_domain`, defined in :file:`include/linux/pm.h``,
+providing a set of power management callbacks analogous to the subsystem-level
+and device driver callbacks that are executed for the given device during all
+power transitions, instead of the respective subsystem-level callbacks.
+Specifically, if a device's :c:member:`pm_domain` pointer is not NULL, the
+``->suspend()`` callback from the object pointed to by it will be executed
+instead of its subsystem's (e.g. bus type's) ``->suspend()`` callback and
+analogously for all of the remaining callbacks.  In other words, power
+management domain callbacks, if defined for the given device, always take
+precedence over the callbacks provided by the device's subsystem (e.g. bus type).
+
+The support for device power management domains is only relevant to platforms
+needing to use the same device driver power management callbacks in many
+different power domain configurations and wanting to avoid incorporating the
+support for power domains into subsystem-level callbacks, for example by
+modifying the platform bus type.  Other platforms need not implement it or take
+it into account in any way.
+
+Devices may be defined as IRQ-safe which indicates to the PM core that their
+runtime PM callbacks may be invoked with disabled interrupts (see
+:file:`Documentation/power/runtime_pm.txt` for more information).  If an
+IRQ-safe device belongs to a PM domain, the runtime PM of the domain will be
+disallowed, unless the domain itself is defined as IRQ-safe. However, it
+makes sense to define a PM domain as IRQ-safe only if all the devices in it
+are IRQ-safe. Moreover, if an IRQ-safe domain has a parent domain, the runtime
+PM of the parent is only allowed if the parent itself is IRQ-safe too with the
+additional restriction that all child domains of an IRQ-safe parent must also
+be IRQ-safe.
+
+
+Runtime Power Management
+========================
+
+Many devices are able to dynamically power down while the system is still
+running. This feature is useful for devices that are not being used, and
+can offer significant power savings on a running system.  These devices
+often support a range of runtime power states, which might use names such
+as "off", "sleep", "idle", "active", and so on.  Those states will in some
+cases (like PCI) be partially constrained by the bus the device uses, and will
+usually include hardware states that are also used in system sleep states.
+
+A system-wide power transition can be started while some devices are in low
+power states due to runtime power management.  The system sleep PM callbacks
+should recognize such situations and react to them appropriately, but the
+necessary actions are subsystem-specific.
+
+In some cases the decision may be made at the subsystem level while in other
+cases the device driver may be left to decide.  In some cases it may be
+desirable to leave a suspended device in that state during a system-wide power
+transition, but in other cases the device must be put back into the full-power
+state temporarily, for example so that its system wakeup capability can be
+disabled.  This all depends on the hardware and the design of the subsystem and
+device driver in question.
+
+During system-wide resume from a sleep state it's easiest to put devices into
+the full-power state, as explained in :file:`Documentation/power/runtime_pm.txt`.
+Refer to that document for more information regarding this particular issue as
+well as for information on the device runtime power management framework in
+general.
diff --git a/Documentation/driver-api/pm/index.rst b/Documentation/driver-api/pm/index.rst
new file mode 100644
index 000000000000..f7b3ce9e9ba0
--- /dev/null
+++ b/Documentation/driver-api/pm/index.rst
@@ -0,0 +1,15 @@
+=======================
+Device Power Management
+=======================
+
+.. toctree::
+
+   devices
+   types
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/driver-api/pm/types.rst b/Documentation/driver-api/pm/types.rst
new file mode 100644
index 000000000000..3ebdecc54104
--- /dev/null
+++ b/Documentation/driver-api/pm/types.rst
@@ -0,0 +1,5 @@
+==================================
+Device Power Management Data Types
+==================================
+
+.. kernel-doc:: include/linux/pm.h
diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt
deleted file mode 100644
index 73ddea39a9ce..000000000000
--- a/Documentation/power/devices.txt
+++ /dev/null
@@ -1,716 +0,0 @@
-Device Power Management
-
-Copyright (c) 2010-2011 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
-Copyright (c) 2010 Alan Stern <stern@rowland.harvard.edu>
-Copyright (c) 2014 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
-
-
-Most of the code in Linux is device drivers, so most of the Linux power
-management (PM) code is also driver-specific.  Most drivers will do very
-little; others, especially for platforms with small batteries (like cell
-phones), will do a lot.
-
-This writeup gives an overview of how drivers interact with system-wide
-power management goals, emphasizing the models and interfaces that are
-shared by everything that hooks up to the driver model core.  Read it as
-background for the domain-specific work you'd do with any specific driver.
-
-
-Two Models for Device Power Management
-======================================
-Drivers will use one or both of these models to put devices into low-power
-states:
-
-    System Sleep model:
-	Drivers can enter low-power states as part of entering system-wide
-	low-power states like "suspend" (also known as "suspend-to-RAM"), or
-	(mostly for systems with disks) "hibernation" (also known as
-	"suspend-to-disk").
-
-	This is something that device, bus, and class drivers collaborate on
-	by implementing various role-specific suspend and resume methods to
-	cleanly power down hardware and software subsystems, then reactivate
-	them without loss of data.
-
-	Some drivers can manage hardware wakeup events, which make the system
-	leave the low-power state.  This feature may be enabled or disabled
-	using the relevant /sys/devices/.../power/wakeup file (for Ethernet
-	drivers the ioctl interface used by ethtool may also be used for this
-	purpose); enabling it may cost some power usage, but let the whole
-	system enter low-power states more often.
-
-    Runtime Power Management model:
-	Devices may also be put into low-power states while the system is
-	running, independently of other power management activity in principle.
-	However, devices are not generally independent of each other (for
-	example, a parent device cannot be suspended unless all of its child
-	devices have been suspended).  Moreover, depending on the bus type the
-	device is on, it may be necessary to carry out some bus-specific
-	operations on the device for this purpose.  Devices put into low power
-	states at run time may require special handling during system-wide power
-	transitions (suspend or hibernation).
-
-	For these reasons not only the device driver itself, but also the
-	appropriate subsystem (bus type, device type or device class) driver and
-	the PM core are involved in runtime power management.  As in the system
-	sleep power management case, they need to collaborate by implementing
-	various role-specific suspend and resume methods, so that the hardware
-	is cleanly powered down and reactivated without data or service loss.
-
-There's not a lot to be said about those low-power states except that they are
-very system-specific, and often device-specific.  Also, that if enough devices
-have been put into low-power states (at runtime), the effect may be very similar
-to entering some system-wide low-power state (system sleep) ... and that
-synergies exist, so that several drivers using runtime PM might put the system
-into a state where even deeper power saving options are available.
-
-Most suspended devices will have quiesced all I/O: no more DMA or IRQs (except
-for wakeup events), no more data read or written, and requests from upstream
-drivers are no longer accepted.  A given bus or platform may have different
-requirements though.
-
-Examples of hardware wakeup events include an alarm from a real time clock,
-network wake-on-LAN packets, keyboard or mouse activity, and media insertion
-or removal (for PCMCIA, MMC/SD, USB, and so on).
-
-
-Interfaces for Entering System Sleep States
-===========================================
-There are programming interfaces provided for subsystems (bus type, device type,
-device class) and device drivers to allow them to participate in the power
-management of devices they are concerned with.  These interfaces cover both
-system sleep and runtime power management.
-
-
-Device Power Management Operations
-----------------------------------
-Device power management operations, at the subsystem level as well as at the
-device driver level, are implemented by defining and populating objects of type
-struct dev_pm_ops:
-
-struct dev_pm_ops {
-	int (*prepare)(struct device *dev);
-	void (*complete)(struct device *dev);
-	int (*suspend)(struct device *dev);
-	int (*resume)(struct device *dev);
-	int (*freeze)(struct device *dev);
-	int (*thaw)(struct device *dev);
-	int (*poweroff)(struct device *dev);
-	int (*restore)(struct device *dev);
-	int (*suspend_late)(struct device *dev);
-	int (*resume_early)(struct device *dev);
-	int (*freeze_late)(struct device *dev);
-	int (*thaw_early)(struct device *dev);
-	int (*poweroff_late)(struct device *dev);
-	int (*restore_early)(struct device *dev);
-	int (*suspend_noirq)(struct device *dev);
-	int (*resume_noirq)(struct device *dev);
-	int (*freeze_noirq)(struct device *dev);
-	int (*thaw_noirq)(struct device *dev);
-	int (*poweroff_noirq)(struct device *dev);
-	int (*restore_noirq)(struct device *dev);
-	int (*runtime_suspend)(struct device *dev);
-	int (*runtime_resume)(struct device *dev);
-	int (*runtime_idle)(struct device *dev);
-};
-
-This structure is defined in include/linux/pm.h and the methods included in it
-are also described in that file.  Their roles will be explained in what follows.
-For now, it should be sufficient to remember that the last three methods are
-specific to runtime power management while the remaining ones are used during
-system-wide power transitions.
-
-There also is a deprecated "old" or "legacy" interface for power management
-operations available at least for some subsystems.  This approach does not use
-struct dev_pm_ops objects and it is suitable only for implementing system sleep
-power management methods.  Therefore it is not described in this document, so
-please refer directly to the source code for more information about it.
-
-
-Subsystem-Level Methods
------------------------
-The core methods to suspend and resume devices reside in struct dev_pm_ops
-pointed to by the ops member of struct dev_pm_domain, or by the pm member of
-struct bus_type, struct device_type and struct class.  They are mostly of
-interest to the people writing infrastructure for platforms and buses, like PCI
-or USB, or device type and device class drivers.  They also are relevant to the
-writers of device drivers whose subsystems (PM domains, device types, device
-classes and bus types) don't provide all power management methods.
-
-Bus drivers implement these methods as appropriate for the hardware and the
-drivers using it; PCI works differently from USB, and so on.  Not many people
-write subsystem-level drivers; most driver code is a "device driver" that builds
-on top of bus-specific framework code.
-
-For more information on these driver calls, see the description later;
-they are called in phases for every device, respecting the parent-child
-sequencing in the driver model tree.
-
-
-/sys/devices/.../power/wakeup files
------------------------------------
-All device objects in the driver model contain fields that control the handling
-of system wakeup events (hardware signals that can force the system out of a
-sleep state).  These fields are initialized by bus or device driver code using
-device_set_wakeup_capable() and device_set_wakeup_enable(), defined in
-include/linux/pm_wakeup.h.
-
-The "power.can_wakeup" flag just records whether the device (and its driver) can
-physically support wakeup events.  The device_set_wakeup_capable() routine
-affects this flag.  The "power.wakeup" field is a pointer to an object of type
-struct wakeup_source used for controlling whether or not the device should use
-its system wakeup mechanism and for notifying the PM core of system wakeup
-events signaled by the device.  This object is only present for wakeup-capable
-devices (i.e. devices whose "can_wakeup" flags are set) and is created (or
-removed) by device_set_wakeup_capable().
-
-Whether or not a device is capable of issuing wakeup events is a hardware
-matter, and the kernel is responsible for keeping track of it.  By contrast,
-whether or not a wakeup-capable device should issue wakeup events is a policy
-decision, and it is managed by user space through a sysfs attribute: the
-"power/wakeup" file.  User space can write the strings "enabled" or "disabled"
-to it to indicate whether or not, respectively, the device is supposed to signal
-system wakeup.  This file is only present if the "power.wakeup" object exists
-for the given device and is created (or removed) along with that object, by
-device_set_wakeup_capable().  Reads from the file will return the corresponding
-string.
-
-The "power/wakeup" file is supposed to contain the "disabled" string initially
-for the majority of devices; the major exceptions are power buttons, keyboards,
-and Ethernet adapters whose WoL (wake-on-LAN) feature has been set up with
-ethtool.  It should also default to "enabled" for devices that don't generate
-wakeup requests on their own but merely forward wakeup requests from one bus to
-another (like PCI Express ports).
-
-The device_may_wakeup() routine returns true only if the "power.wakeup" object
-exists and the corresponding "power/wakeup" file contains the string "enabled".
-This information is used by subsystems, like the PCI bus type code, to see
-whether or not to enable the devices' wakeup mechanisms.  If device wakeup
-mechanisms are enabled or disabled directly by drivers, they also should use
-device_may_wakeup() to decide what to do during a system sleep transition.
-Device drivers, however, are not supposed to call device_set_wakeup_enable()
-directly in any case.
-
-It ought to be noted that system wakeup is conceptually different from "remote
-wakeup" used by runtime power management, although it may be supported by the
-same physical mechanism.  Remote wakeup is a feature allowing devices in
-low-power states to trigger specific interrupts to signal conditions in which
-they should be put into the full-power state.  Those interrupts may or may not
-be used to signal system wakeup events, depending on the hardware design.  On
-some systems it is impossible to trigger them from system sleep states.  In any
-case, remote wakeup should always be enabled for runtime power management for
-all devices and drivers that support it.
-
-/sys/devices/.../power/control files
-------------------------------------
-Each device in the driver model has a flag to control whether it is subject to
-runtime power management.  This flag, called runtime_auto, is initialized by the
-bus type (or generally subsystem) code using pm_runtime_allow() or
-pm_runtime_forbid(); the default is to allow runtime power management.
-
-The setting can be adjusted by user space by writing either "on" or "auto" to
-the device's power/control sysfs file.  Writing "auto" calls pm_runtime_allow(),
-setting the flag and allowing the device to be runtime power-managed by its
-driver.  Writing "on" calls pm_runtime_forbid(), clearing the flag, returning
-the device to full power if it was in a low-power state, and preventing the
-device from being runtime power-managed.  User space can check the current value
-of the runtime_auto flag by reading the file.
-
-The device's runtime_auto flag has no effect on the handling of system-wide
-power transitions.  In particular, the device can (and in the majority of cases
-should and will) be put into a low-power state during a system-wide transition
-to a sleep state even though its runtime_auto flag is clear.
-
-For more information about the runtime power management framework, refer to
-Documentation/power/runtime_pm.txt.
-
-
-Calling Drivers to Enter and Leave System Sleep States
-======================================================
-When the system goes into a sleep state, each device's driver is asked to
-suspend the device by putting it into a state compatible with the target
-system state.  That's usually some version of "off", but the details are
-system-specific.  Also, wakeup-enabled devices will usually stay partly
-functional in order to wake the system.
-
-When the system leaves that low-power state, the device's driver is asked to
-resume it by returning it to full power.  The suspend and resume operations
-always go together, and both are multi-phase operations.
-
-For simple drivers, suspend might quiesce the device using class code
-and then turn its hardware as "off" as possible during suspend_noirq.  The
-matching resume calls would then completely reinitialize the hardware
-before reactivating its class I/O queues.
-
-More power-aware drivers might prepare the devices for triggering system wakeup
-events.
-
-
-Call Sequence Guarantees
-------------------------
-To ensure that bridges and similar links needing to talk to a device are
-available when the device is suspended or resumed, the device tree is
-walked in a bottom-up order to suspend devices.  A top-down order is
-used to resume those devices.
-
-The ordering of the device tree is defined by the order in which devices
-get registered:  a child can never be registered, probed or resumed before
-its parent; and can't be removed or suspended after that parent.
-
-The policy is that the device tree should match hardware bus topology.
-(Or at least the control bus, for devices which use multiple busses.)
-In particular, this means that a device registration may fail if the parent of
-the device is suspending (i.e. has been chosen by the PM core as the next
-device to suspend) or has already suspended, as well as after all of the other
-devices have been suspended.  Device drivers must be prepared to cope with such
-situations.
-
-
-System Power Management Phases
-------------------------------
-Suspending or resuming the system is done in several phases.  Different phases
-are used for freeze, standby, and memory sleep states ("suspend-to-RAM") and the
-hibernation state ("suspend-to-disk").  Each phase involves executing callbacks
-for every device before the next phase begins.  Not all busses or classes
-support all these callbacks and not all drivers use all the callbacks.  The
-various phases always run after tasks have been frozen and before they are
-unfrozen.  Furthermore, the *_noirq phases run at a time when IRQ handlers have
-been disabled (except for those marked with the IRQF_NO_SUSPEND flag).
-
-All phases use PM domain, bus, type, class or driver callbacks (that is, methods
-defined in dev->pm_domain->ops, dev->bus->pm, dev->type->pm, dev->class->pm or
-dev->driver->pm).  These callbacks are regarded by the PM core as mutually
-exclusive.  Moreover, PM domain callbacks always take precedence over all of the
-other callbacks and, for example, type callbacks take precedence over bus, class
-and driver callbacks.  To be precise, the following rules are used to determine
-which callback to execute in the given phase:
-
-    1.	If dev->pm_domain is present, the PM core will choose the callback
-	included in dev->pm_domain->ops for execution
-
-    2.	Otherwise, if both dev->type and dev->type->pm are present, the callback
-	included in dev->type->pm will be chosen for execution.
-
-    3.	Otherwise, if both dev->class and dev->class->pm are present, the
-	callback included in dev->class->pm will be chosen for execution.
-
-    4.	Otherwise, if both dev->bus and dev->bus->pm are present, the callback
-	included in dev->bus->pm will be chosen for execution.
-
-This allows PM domains and device types to override callbacks provided by bus
-types or device classes if necessary.
-
-The PM domain, type, class and bus callbacks may in turn invoke device- or
-driver-specific methods stored in dev->driver->pm, but they don't have to do
-that.
-
-If the subsystem callback chosen for execution is not present, the PM core will
-execute the corresponding method from dev->driver->pm instead if there is one.
-
-
-Entering System Suspend
------------------------
-When the system goes into the freeze, standby or memory sleep state,
-the phases are:
-
-		prepare, suspend, suspend_late, suspend_noirq.
-
-    1.	The prepare phase is meant to prevent races by preventing new devices
-	from being registered; the PM core would never know that all the
-	children of a device had been suspended if new children could be
-	registered at will.  (By contrast, devices may be unregistered at any
-	time.)  Unlike the other suspend-related phases, during the prepare
-	phase the device tree is traversed top-down.
-
-	After the prepare callback method returns, no new children may be
-	registered below the device.  The method may also prepare the device or
-	driver in some way for the upcoming system power transition, but it
-	should not put the device into a low-power state.
-
-	For devices supporting runtime power management, the return value of the
-	prepare callback can be used to indicate to the PM core that it may
-	safely leave the device in runtime suspend (if runtime-suspended
-	already), provided that all of the device's descendants are also left in
-	runtime suspend.  Namely, if the prepare callback returns a positive
-	number and that happens for all of the descendants of the device too,
-	and all of them (including the device itself) are runtime-suspended, the
-	PM core will skip the suspend, suspend_late and	suspend_noirq suspend
-	phases as well as the resume_noirq, resume_early and resume phases of
-	the following system resume for all of these devices.	In that case,
-	the complete callback will be called directly after the prepare callback
-	and is entirely responsible for bringing the device back to the
-	functional state as appropriate.
-
-	Note that this direct-complete procedure applies even if the device is
-	disabled for runtime PM; only the runtime-PM status matters.  It follows
-	that if a device has system-sleep callbacks but does not support runtime
-	PM, then its prepare callback must never return a positive value.  This
-	is because all devices are initially set to runtime-suspended with
-	runtime PM disabled.
-
-    2.	The suspend methods should quiesce the device to stop it from performing
-	I/O.  They also may save the device registers and put it into the
-	appropriate low-power state, depending on the bus type the device is on,
-	and they may enable wakeup events.
-
-    3	For a number of devices it is convenient to split suspend into the
-	"quiesce device" and "save device state" phases, in which cases
-	suspend_late is meant to do the latter.  It is always executed after
-	runtime power management has been disabled for all devices.
-
-    4.	The suspend_noirq phase occurs after IRQ handlers have been disabled,
-	which means that the driver's interrupt handler will not be called while
-	the callback method is running.  The methods should save the values of
-	the device's registers that weren't saved previously and finally put the
-	device into the appropriate low-power state.
-
-	The majority of subsystems and device drivers need not implement this
-	callback.  However, bus types allowing devices to share interrupt
-	vectors, like PCI, generally need it; otherwise a driver might encounter
-	an error during the suspend phase by fielding a shared interrupt
-	generated by some other device after its own device had been set to low
-	power.
-
-At the end of these phases, drivers should have stopped all I/O transactions
-(DMA, IRQs), saved enough state that they can re-initialize or restore previous
-state (as needed by the hardware), and placed the device into a low-power state.
-On many platforms they will gate off one or more clock sources; sometimes they
-will also switch off power supplies or reduce voltages.  (Drivers supporting
-runtime PM may already have performed some or all of these steps.)
-
-If device_may_wakeup(dev) returns true, the device should be prepared for
-generating hardware wakeup signals to trigger a system wakeup event when the
-system is in the sleep state.  For example, enable_irq_wake() might identify
-GPIO signals hooked up to a switch or other external hardware, and
-pci_enable_wake() does something similar for the PCI PME signal.
-
-If any of these callbacks returns an error, the system won't enter the desired
-low-power state.  Instead the PM core will unwind its actions by resuming all
-the devices that were suspended.
-
-
-Leaving System Suspend
-----------------------
-When resuming from freeze, standby or memory sleep, the phases are:
-
-		resume_noirq, resume_early, resume, complete.
-
-    1.	The resume_noirq callback methods should perform any actions needed
-	before the driver's interrupt handlers are invoked.  This generally
-	means undoing the actions of the suspend_noirq phase.  If the bus type
-	permits devices to share interrupt vectors, like PCI, the method should
-	bring the device and its driver into a state in which the driver can
-	recognize if the device is the source of incoming interrupts, if any,
-	and handle them correctly.
-
-	For example, the PCI bus type's ->pm.resume_noirq() puts the device into
-	the full-power state (D0 in the PCI terminology) and restores the
-	standard configuration registers of the device.  Then it calls the
-	device driver's ->pm.resume_noirq() method to perform device-specific
-	actions.
-
-    2.	The resume_early methods should prepare devices for the execution of
-	the resume methods.  This generally involves undoing the actions of the
-	preceding suspend_late phase.
-
-    3	The resume methods should bring the device back to its operating
-	state, so that it can perform normal I/O.  This generally involves
-	undoing the actions of the suspend phase.
-
-    4.	The complete phase should undo the actions of the prepare phase.  Note,
-	however, that new children may be registered below the device as soon as
-	the resume callbacks occur; it's not necessary to wait until the
-	complete phase.
-
-	Moreover, if the preceding prepare callback returned a positive number,
-	the device may have been left in runtime suspend throughout the whole
-	system suspend and resume (the suspend, suspend_late, suspend_noirq
-	phases of system suspend and the resume_noirq, resume_early, resume
-	phases of system resume may have been skipped for it).  In that case,
-	the complete callback is entirely responsible for bringing the device
-	back to the functional state after system suspend if necessary.  [For
-	example, it may need to queue up a runtime resume request for the device
-	for this purpose.]  To check if that is the case, the complete callback
-	can consult the device's power.direct_complete flag.  Namely, if that
-	flag is set when the complete callback is being run, it has been called
-	directly after the preceding prepare and special action may be required
-	to make the device work correctly afterward.
-
-At the end of these phases, drivers should be as functional as they were before
-suspending: I/O can be performed using DMA and IRQs, and the relevant clocks are
-gated on.
-
-However, the details here may again be platform-specific.  For example,
-some systems support multiple "run" states, and the mode in effect at
-the end of resume might not be the one which preceded suspension.
-That means availability of certain clocks or power supplies changed,
-which could easily affect how a driver works.
-
-Drivers need to be able to handle hardware which has been reset since the
-suspend methods were called, for example by complete reinitialization.
-This may be the hardest part, and the one most protected by NDA'd documents
-and chip errata.  It's simplest if the hardware state hasn't changed since
-the suspend was carried out, but that can't be guaranteed (in fact, it usually
-is not the case).
-
-Drivers must also be prepared to notice that the device has been removed
-while the system was powered down, whenever that's physically possible.
-PCMCIA, MMC, USB, Firewire, SCSI, and even IDE are common examples of busses
-where common Linux platforms will see such removal.  Details of how drivers
-will notice and handle such removals are currently bus-specific, and often
-involve a separate thread.
-
-These callbacks may return an error value, but the PM core will ignore such
-errors since there's nothing it can do about them other than printing them in
-the system log.
-
-
-Entering Hibernation
---------------------
-Hibernating the system is more complicated than putting it into the other
-sleep states, because it involves creating and saving a system image.
-Therefore there are more phases for hibernation, with a different set of
-callbacks.  These phases always run after tasks have been frozen and memory has
-been freed.
-
-The general procedure for hibernation is to quiesce all devices (freeze), create
-an image of the system memory while everything is stable, reactivate all
-devices (thaw), write the image to permanent storage, and finally shut down the
-system (poweroff).  The phases used to accomplish this are:
-
-	prepare, freeze, freeze_late, freeze_noirq, thaw_noirq, thaw_early,
-	thaw, complete, prepare, poweroff, poweroff_late, poweroff_noirq
-
-    1.	The prepare phase is discussed in the "Entering System Suspend" section
-	above.
-
-    2.	The freeze methods should quiesce the device so that it doesn't generate
-	IRQs or DMA, and they may need to save the values of device registers.
-	However the device does not have to be put in a low-power state, and to
-	save time it's best not to do so.  Also, the device should not be
-	prepared to generate wakeup events.
-
-    3.	The freeze_late phase is analogous to the suspend_late phase described
-	above, except that the device should not be put in a low-power state and
-	should not be allowed to generate wakeup events by it.
-
-    4.	The freeze_noirq phase is analogous to the suspend_noirq phase discussed
-	above, except again that the device should not be put in a low-power
-	state and should not be allowed to generate wakeup events.
-
-At this point the system image is created.  All devices should be inactive and
-the contents of memory should remain undisturbed while this happens, so that the
-image forms an atomic snapshot of the system state.
-
-    5.	The thaw_noirq phase is analogous to the resume_noirq phase discussed
-	above.  The main difference is that its methods can assume the device is
-	in the same state as at the end of the freeze_noirq phase.
-
-    6.	The thaw_early phase is analogous to the resume_early phase described
-	above.  Its methods should undo the actions of the preceding
-	freeze_late, if necessary.
-
-    7.	The thaw phase is analogous to the resume phase discussed above.  Its
-	methods should bring the device back to an operating state, so that it
-	can be used for saving the image if necessary.
-
-    8.	The complete phase is discussed in the "Leaving System Suspend" section
-	above.
-
-At this point the system image is saved, and the devices then need to be
-prepared for the upcoming system shutdown.  This is much like suspending them
-before putting the system into the freeze, standby or memory sleep state,
-and the phases are similar.
-
-    9.	The prepare phase is discussed above.
-
-    10.	The poweroff phase is analogous to the suspend phase.
-
-    11.	The poweroff_late phase is analogous to the suspend_late phase.
-
-    12.	The poweroff_noirq phase is analogous to the suspend_noirq phase.
-
-The poweroff, poweroff_late and poweroff_noirq callbacks should do essentially
-the same things as the suspend, suspend_late and suspend_noirq callbacks,
-respectively.  The only notable difference is that they need not store the
-device register values, because the registers should already have been stored
-during the freeze, freeze_late or freeze_noirq phases.
-
-
-Leaving Hibernation
--------------------
-Resuming from hibernation is, again, more complicated than resuming from a sleep
-state in which the contents of main memory are preserved, because it requires
-a system image to be loaded into memory and the pre-hibernation memory contents
-to be restored before control can be passed back to the image kernel.
-
-Although in principle, the image might be loaded into memory and the
-pre-hibernation memory contents restored by the boot loader, in practice this
-can't be done because boot loaders aren't smart enough and there is no
-established protocol for passing the necessary information.  So instead, the
-boot loader loads a fresh instance of the kernel, called the boot kernel, into
-memory and passes control to it in the usual way.  Then the boot kernel reads
-the system image, restores the pre-hibernation memory contents, and passes
-control to the image kernel.  Thus two different kernels are involved in
-resuming from hibernation.  In fact, the boot kernel may be completely different
-from the image kernel: a different configuration and even a different version.
-This has important consequences for device drivers and their subsystems.
-
-To be able to load the system image into memory, the boot kernel needs to
-include at least a subset of device drivers allowing it to access the storage
-medium containing the image, although it doesn't need to include all of the
-drivers present in the image kernel.  After the image has been loaded, the
-devices managed by the boot kernel need to be prepared for passing control back
-to the image kernel.  This is very similar to the initial steps involved in
-creating a system image, and it is accomplished in the same way, using prepare,
-freeze, and freeze_noirq phases.  However the devices affected by these phases
-are only those having drivers in the boot kernel; other devices will still be in
-whatever state the boot loader left them.
-
-Should the restoration of the pre-hibernation memory contents fail, the boot
-kernel would go through the "thawing" procedure described above, using the
-thaw_noirq, thaw, and complete phases, and then continue running normally.  This
-happens only rarely.  Most often the pre-hibernation memory contents are
-restored successfully and control is passed to the image kernel, which then
-becomes responsible for bringing the system back to the working state.
-
-To achieve this, the image kernel must restore the devices' pre-hibernation
-functionality.  The operation is much like waking up from the memory sleep
-state, although it involves different phases:
-
-	restore_noirq, restore_early, restore, complete
-
-    1.	The restore_noirq phase is analogous to the resume_noirq phase.
-
-    2.	The restore_early phase is analogous to the resume_early phase.
-
-    3.	The restore phase is analogous to the resume phase.
-
-    4.	The complete phase is discussed above.
-
-The main difference from resume[_early|_noirq] is that restore[_early|_noirq]
-must assume the device has been accessed and reconfigured by the boot loader or
-the boot kernel.  Consequently the state of the device may be different from the
-state remembered from the freeze, freeze_late and freeze_noirq phases.  The
-device may even need to be reset and completely re-initialized.  In many cases
-this difference doesn't matter, so the resume[_early|_noirq] and
-restore[_early|_norq] method pointers can be set to the same routines.
-Nevertheless, different callback pointers are used in case there is a situation
-where it actually does matter.
-
-
-Device Power Management Domains
--------------------------------
-Sometimes devices share reference clocks or other power resources.  In those
-cases it generally is not possible to put devices into low-power states
-individually.  Instead, a set of devices sharing a power resource can be put
-into a low-power state together at the same time by turning off the shared
-power resource.  Of course, they also need to be put into the full-power state
-together, by turning the shared power resource on.  A set of devices with this
-property is often referred to as a power domain. A power domain may also be
-nested inside another power domain. The nested domain is referred to as the
-sub-domain of the parent domain.
-
-Support for power domains is provided through the pm_domain field of struct
-device.  This field is a pointer to an object of type struct dev_pm_domain,
-defined in include/linux/pm.h, providing a set of power management callbacks
-analogous to the subsystem-level and device driver callbacks that are executed
-for the given device during all power transitions, instead of the respective
-subsystem-level callbacks.  Specifically, if a device's pm_domain pointer is
-not NULL, the ->suspend() callback from the object pointed to by it will be
-executed instead of its subsystem's (e.g. bus type's) ->suspend() callback and
-analogously for all of the remaining callbacks.  In other words, power
-management domain callbacks, if defined for the given device, always take
-precedence over the callbacks provided by the device's subsystem (e.g. bus
-type).
-
-The support for device power management domains is only relevant to platforms
-needing to use the same device driver power management callbacks in many
-different power domain configurations and wanting to avoid incorporating the
-support for power domains into subsystem-level callbacks, for example by
-modifying the platform bus type.  Other platforms need not implement it or take
-it into account in any way.
-
-Devices may be defined as IRQ-safe which indicates to the PM core that their
-runtime PM callbacks may be invoked with disabled interrupts (see
-Documentation/power/runtime_pm.txt for more information).  If an IRQ-safe
-device belongs to a PM domain, the runtime PM of the domain will be
-disallowed, unless the domain itself is defined as IRQ-safe. However, it
-makes sense to define a PM domain as IRQ-safe only if all the devices in it
-are IRQ-safe. Moreover, if an IRQ-safe domain has a parent domain, the runtime
-PM of the parent is only allowed if the parent itself is IRQ-safe too with the
-additional restriction that all child domains of an IRQ-safe parent must also
-be IRQ-safe.
-
-Device Low Power (suspend) States
----------------------------------
-Device low-power states aren't standard.  One device might only handle
-"on" and "off", while another might support a dozen different versions of
-"on" (how many engines are active?), plus a state that gets back to "on"
-faster than from a full "off".
-
-Some busses define rules about what different suspend states mean.  PCI
-gives one example:  after the suspend sequence completes, a non-legacy
-PCI device may not perform DMA or issue IRQs, and any wakeup events it
-issues would be issued through the PME# bus signal.  Plus, there are
-several PCI-standard device states, some of which are optional.
-
-In contrast, integrated system-on-chip processors often use IRQs as the
-wakeup event sources (so drivers would call enable_irq_wake) and might
-be able to treat DMA completion as a wakeup event (sometimes DMA can stay
-active too, it'd only be the CPU and some peripherals that sleep).
-
-Some details here may be platform-specific.  Systems may have devices that
-can be fully active in certain sleep states, such as an LCD display that's
-refreshed using DMA while most of the system is sleeping lightly ... and
-its frame buffer might even be updated by a DSP or other non-Linux CPU while
-the Linux control processor stays idle.
-
-Moreover, the specific actions taken may depend on the target system state.
-One target system state might allow a given device to be very operational;
-another might require a hard shut down with re-initialization on resume.
-And two different target systems might use the same device in different
-ways; the aforementioned LCD might be active in one product's "standby",
-but a different product using the same SOC might work differently.
-
-
-Power Management Notifiers
---------------------------
-There are some operations that cannot be carried out by the power management
-callbacks discussed above, because the callbacks occur too late or too early.
-To handle these cases, subsystems and device drivers may register power
-management notifiers that are called before tasks are frozen and after they have
-been thawed.  Generally speaking, the PM notifiers are suitable for performing
-actions that either require user space to be available, or at least won't
-interfere with user space.
-
-For details refer to Documentation/power/notifiers.txt.
-
-
-Runtime Power Management
-========================
-Many devices are able to dynamically power down while the system is still
-running. This feature is useful for devices that are not being used, and
-can offer significant power savings on a running system.  These devices
-often support a range of runtime power states, which might use names such
-as "off", "sleep", "idle", "active", and so on.  Those states will in some
-cases (like PCI) be partially constrained by the bus the device uses, and will
-usually include hardware states that are also used in system sleep states.
-
-A system-wide power transition can be started while some devices are in low
-power states due to runtime power management.  The system sleep PM callbacks
-should recognize such situations and react to them appropriately, but the
-necessary actions are subsystem-specific.
-
-In some cases the decision may be made at the subsystem level while in other
-cases the device driver may be left to decide.  In some cases it may be
-desirable to leave a suspended device in that state during a system-wide power
-transition, but in other cases the device must be put back into the full-power
-state temporarily, for example so that its system wakeup capability can be
-disabled.  This all depends on the hardware and the design of the subsystem and
-device driver in question.
-
-During system-wide resume from a sleep state it's easiest to put devices into
-the full-power state, as explained in Documentation/power/runtime_pm.txt.  Refer
-to that document for more information regarding this particular issue as well as
-for information on the device runtime power management framework in general.
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 10867b11d503..a0894bc52bb4 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -276,9 +276,6 @@ typedef struct pm_message {
  * example, if it detects that a child was unplugged while the system was
  * asleep).
  *
- * Refer to Documentation/power/devices.txt for more information about the role
- * of the above callbacks in the system suspend process.
- *
  * There also are callbacks related to runtime power management of devices.
  * Again, as a rule these callbacks are executed by the PM core for subsystems
  * (PM domains, device types, classes and bus types) and the subsystem-level
-- 
cgit v1.2.3


From 88e30752dd47f0a6398cd014af82332f1b9873ea Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Mon, 30 Jan 2017 09:00:07 -0800
Subject: rpmsg: qcom: smd: Return positively when not enabled

Remoteproc treats the error codes returned from the stubbed SMD API as
errors, but the fact that SMD is not enabled should not affect
remoteproc's ability to start the remote processors.

Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 include/linux/rpmsg/qcom_smd.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rpmsg/qcom_smd.h b/include/linux/rpmsg/qcom_smd.h
index e674b2e3074b..8ec8b6439b25 100644
--- a/include/linux/rpmsg/qcom_smd.h
+++ b/include/linux/rpmsg/qcom_smd.h
@@ -18,14 +18,12 @@ static inline struct qcom_smd_edge *
 qcom_smd_register_edge(struct device *parent,
 		       struct device_node *node)
 {
-	return ERR_PTR(-ENXIO);
+	return NULL;
 }
 
 static inline int qcom_smd_unregister_edge(struct qcom_smd_edge *edge)
 {
-	/* This shouldn't be possible */
-	WARN_ON(1);
-	return -ENXIO;
+	return 0;
 }
 
 #endif
-- 
cgit v1.2.3


From 1f318a8bafcfba9f0d623f4870c4e890fd22e659 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 1 Feb 2017 18:00:14 +0100
Subject: modules: mark __inittest/__exittest as __maybe_unused

clang warns about unused inline functions by default:

arch/arm/crypto/aes-cipher-glue.c:68:1: warning: unused function '__inittest' [-Wunused-function]
arch/arm/crypto/aes-cipher-glue.c:69:1: warning: unused function '__exittest' [-Wunused-function]

As these appear in every single module, let's just disable the warnings by marking the
two functions as __maybe_unused.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Miroslav Benes <mbenes@suse.cz>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Jessica Yu <jeyu@redhat.com>
---
 include/linux/module.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index ef599379f9a4..4e09f469bd7d 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -127,13 +127,13 @@ extern void cleanup_module(void);
 
 /* Each module must use one module_init(). */
 #define module_init(initfn)					\
-	static inline initcall_t __inittest(void)		\
+	static inline initcall_t __maybe_unused __inittest(void)		\
 	{ return initfn; }					\
 	int init_module(void) __attribute__((alias(#initfn)));
 
 /* This is only required if you want to be unloadable. */
 #define module_exit(exitfn)					\
-	static inline exitcall_t __exittest(void)		\
+	static inline exitcall_t __maybe_unused __exittest(void)		\
 	{ return exitfn; }					\
 	void cleanup_module(void) __attribute__((alias(#exitfn)));
 
-- 
cgit v1.2.3


From b6a05c823fc573a65efc4466f174abf05f922e0f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 30 Jan 2017 13:18:58 +0100
Subject: scsi: remove eh_timed_out methods in the transport template

Instead define the timeout behavior purely based on the host_template
eh_timed_out method and wire up the existing transport implementations
in the host templates.  This also clears up the confusion that the
transport template method overrides the host template one, so some
drivers have to re-override the transport template one.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Tyrel Datwyler <tyreld@linux.vnet.ibm.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ata/libata-eh.c                  |  1 +
 drivers/ata/libata-transport.c           |  1 -
 drivers/ata/libata.h                     |  1 -
 drivers/infiniband/ulp/iser/iscsi_iser.c |  1 +
 drivers/infiniband/ulp/srp/ib_srp.c      |  1 +
 drivers/message/fusion/mptfc.c           |  1 +
 drivers/message/fusion/mptsas.c          |  2 +-
 drivers/s390/scsi/zfcp_scsi.c            |  1 +
 drivers/scsi/be2iscsi/be_main.c          |  1 +
 drivers/scsi/bfa/bfad_im.c               |  2 ++
 drivers/scsi/bnx2fc/bnx2fc_fcoe.c        |  1 +
 drivers/scsi/bnx2i/bnx2i_iscsi.c         |  1 +
 drivers/scsi/csiostor/csio_scsi.c        |  2 ++
 drivers/scsi/cxgbi/cxgb3i/cxgb3i.c       |  1 +
 drivers/scsi/cxgbi/cxgb4i/cxgb4i.c       |  1 +
 drivers/scsi/fcoe/fcoe.c                 |  1 +
 drivers/scsi/fnic/fnic_main.c            |  1 +
 drivers/scsi/ibmvscsi/ibmvfc.c           |  1 +
 drivers/scsi/ibmvscsi/ibmvscsi.c         |  1 +
 drivers/scsi/iscsi_tcp.c                 |  1 +
 drivers/scsi/libiscsi.c                  |  5 ++---
 drivers/scsi/lpfc/lpfc_scsi.c            |  2 ++
 drivers/scsi/qedi/qedi_iscsi.c           |  1 +
 drivers/scsi/qla2xxx/qla_os.c            |  1 +
 drivers/scsi/scsi_error.c                |  4 +---
 drivers/scsi/scsi_transport_fc.c         |  9 ++++-----
 drivers/scsi/scsi_transport_srp.c        |  5 ++---
 drivers/scsi/storvsc_drv.c               |  5 -----
 include/linux/libata.h                   |  2 ++
 include/scsi/libiscsi.h                  |  1 +
 include/scsi/scsi_transport.h            | 11 -----------
 include/scsi/scsi_transport_fc.h         |  1 +
 include/scsi/scsi_transport_srp.h        |  1 +
 33 files changed, 38 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 0e1ec37070d1..50ee10db160f 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -549,6 +549,7 @@ enum blk_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
 	DPRINTK("EXIT, ret=%d\n", ret);
 	return ret;
 }
+EXPORT_SYMBOL(ata_scsi_timed_out);
 
 static void ata_eh_unload(struct ata_port *ap)
 {
diff --git a/drivers/ata/libata-transport.c b/drivers/ata/libata-transport.c
index 7ef16c085058..46698232e6bf 100644
--- a/drivers/ata/libata-transport.c
+++ b/drivers/ata/libata-transport.c
@@ -716,7 +716,6 @@ struct scsi_transport_template *ata_attach_transport(void)
 		return NULL;
 
 	i->t.eh_strategy_handler	= ata_scsi_error;
-	i->t.eh_timed_out		= ata_scsi_timed_out;
 	i->t.user_scan			= ata_scsi_user_scan;
 
 	i->t.host_attrs.ac.attrs = &i->port_attrs[0];
diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h
index 8f3a5596dd67..06d479d1f302 100644
--- a/drivers/ata/libata.h
+++ b/drivers/ata/libata.h
@@ -159,7 +159,6 @@ extern unsigned long ata_internal_cmd_timeout(struct ata_device *dev, u8 cmd);
 extern void ata_internal_cmd_timed_out(struct ata_device *dev, u8 cmd);
 extern void ata_eh_acquire(struct ata_port *ap);
 extern void ata_eh_release(struct ata_port *ap);
-extern enum blk_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd);
 extern void ata_scsi_error(struct Scsi_Host *host);
 extern void ata_eh_fastdrain_timerfn(unsigned long arg);
 extern void ata_qc_schedule_eh(struct ata_queued_cmd *qc);
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index 9104e6b8cac9..86fed1956d7d 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -997,6 +997,7 @@ static struct scsi_host_template iscsi_iser_sht = {
 	.change_queue_depth	= scsi_change_queue_depth,
 	.sg_tablesize           = ISCSI_ISER_DEF_SG_TABLESIZE,
 	.cmd_per_lun            = ISER_DEF_CMD_PER_LUN,
+	.eh_timed_out		= iscsi_eh_cmd_timed_out,
 	.eh_abort_handler       = iscsi_eh_abort,
 	.eh_device_reset_handler= iscsi_eh_device_reset,
 	.eh_target_reset_handler = iscsi_eh_recover_target,
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 8ddc07123193..7866cb0a8f00 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -2864,6 +2864,7 @@ static struct scsi_host_template srp_template = {
 	.info				= srp_target_info,
 	.queuecommand			= srp_queuecommand,
 	.change_queue_depth             = srp_change_queue_depth,
+	.eh_timed_out			= srp_timed_out,
 	.eh_abort_handler		= srp_abort,
 	.eh_device_reset_handler	= srp_reset_device,
 	.eh_host_reset_handler		= srp_reset_host,
diff --git a/drivers/message/fusion/mptfc.c b/drivers/message/fusion/mptfc.c
index add6a3a6ef0d..98eafae78576 100644
--- a/drivers/message/fusion/mptfc.c
+++ b/drivers/message/fusion/mptfc.c
@@ -119,6 +119,7 @@ static struct scsi_host_template mptfc_driver_template = {
 	.target_destroy			= mptfc_target_destroy,
 	.slave_destroy			= mptscsih_slave_destroy,
 	.change_queue_depth 		= mptscsih_change_queue_depth,
+	.eh_timed_out			= fc_eh_timed_out,
 	.eh_abort_handler		= mptfc_abort,
 	.eh_device_reset_handler	= mptfc_dev_reset,
 	.eh_bus_reset_handler		= mptfc_bus_reset,
diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c
index 7ee1667acde4..4ce333643943 100644
--- a/drivers/message/fusion/mptsas.c
+++ b/drivers/message/fusion/mptsas.c
@@ -1983,6 +1983,7 @@ static struct scsi_host_template mptsas_driver_template = {
 	.target_destroy			= mptsas_target_destroy,
 	.slave_destroy			= mptscsih_slave_destroy,
 	.change_queue_depth 		= mptscsih_change_queue_depth,
+	.eh_timed_out			= mptsas_eh_timed_out,
 	.eh_abort_handler		= mptscsih_abort,
 	.eh_device_reset_handler	= mptscsih_dev_reset,
 	.eh_host_reset_handler		= mptscsih_host_reset,
@@ -5398,7 +5399,6 @@ mptsas_init(void)
 	    sas_attach_transport(&mptsas_transport_functions);
 	if (!mptsas_transport_template)
 		return -ENODEV;
-	mptsas_transport_template->eh_timed_out = mptsas_eh_timed_out;
 
 	mptsasDoneCtx = mpt_register(mptscsih_io_done, MPTSAS_DRIVER,
 	    "mptscsih_io_done");
diff --git a/drivers/s390/scsi/zfcp_scsi.c b/drivers/s390/scsi/zfcp_scsi.c
index 07ffdbb5107f..0678cf714c0e 100644
--- a/drivers/s390/scsi/zfcp_scsi.c
+++ b/drivers/s390/scsi/zfcp_scsi.c
@@ -330,6 +330,7 @@ static struct scsi_host_template zfcp_scsi_host_template = {
 	.module			 = THIS_MODULE,
 	.name			 = "zfcp",
 	.queuecommand		 = zfcp_scsi_queuecommand,
+	.eh_timed_out		 = fc_eh_timed_out,
 	.eh_abort_handler	 = zfcp_scsi_eh_abort_handler,
 	.eh_device_reset_handler = zfcp_scsi_eh_device_reset_handler,
 	.eh_target_reset_handler = zfcp_scsi_eh_target_reset_handler,
diff --git a/drivers/scsi/be2iscsi/be_main.c b/drivers/scsi/be2iscsi/be_main.c
index c9b9daa91091..32b2713cec93 100644
--- a/drivers/scsi/be2iscsi/be_main.c
+++ b/drivers/scsi/be2iscsi/be_main.c
@@ -389,6 +389,7 @@ static struct scsi_host_template beiscsi_sht = {
 	.change_queue_depth = scsi_change_queue_depth,
 	.slave_configure = beiscsi_slave_configure,
 	.target_alloc = iscsi_target_alloc,
+	.eh_timed_out = iscsi_eh_cmd_timed_out,
 	.eh_abort_handler = beiscsi_eh_abort,
 	.eh_device_reset_handler = beiscsi_eh_device_reset,
 	.eh_target_reset_handler = iscsi_eh_session_reset,
diff --git a/drivers/scsi/bfa/bfad_im.c b/drivers/scsi/bfa/bfad_im.c
index 02d806012fa1..7eb0eef18fdd 100644
--- a/drivers/scsi/bfa/bfad_im.c
+++ b/drivers/scsi/bfa/bfad_im.c
@@ -813,6 +813,7 @@ struct scsi_host_template bfad_im_scsi_host_template = {
 	.name = BFAD_DRIVER_NAME,
 	.info = bfad_im_info,
 	.queuecommand = bfad_im_queuecommand,
+	.eh_timed_out = fc_eh_timed_out,
 	.eh_abort_handler = bfad_im_abort_handler,
 	.eh_device_reset_handler = bfad_im_reset_lun_handler,
 	.eh_bus_reset_handler = bfad_im_reset_bus_handler,
@@ -835,6 +836,7 @@ struct scsi_host_template bfad_im_vport_template = {
 	.name = BFAD_DRIVER_NAME,
 	.info = bfad_im_info,
 	.queuecommand = bfad_im_queuecommand,
+	.eh_timed_out = fc_eh_timed_out,
 	.eh_abort_handler = bfad_im_abort_handler,
 	.eh_device_reset_handler = bfad_im_reset_lun_handler,
 	.eh_bus_reset_handler = bfad_im_reset_bus_handler,
diff --git a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c
index c639d5a02656..b1e39f985ec9 100644
--- a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c
+++ b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c
@@ -2947,6 +2947,7 @@ static struct scsi_host_template bnx2fc_shost_template = {
 	.module			= THIS_MODULE,
 	.name			= "QLogic Offload FCoE Initiator",
 	.queuecommand		= bnx2fc_queuecommand,
+	.eh_timed_out		= fc_eh_timed_out,
 	.eh_abort_handler	= bnx2fc_eh_abort,	  /* abts */
 	.eh_device_reset_handler = bnx2fc_eh_device_reset, /* lun reset */
 	.eh_target_reset_handler = bnx2fc_eh_target_reset, /* tgt reset */
diff --git a/drivers/scsi/bnx2i/bnx2i_iscsi.c b/drivers/scsi/bnx2i/bnx2i_iscsi.c
index 133901fd3e35..f32a66f89d25 100644
--- a/drivers/scsi/bnx2i/bnx2i_iscsi.c
+++ b/drivers/scsi/bnx2i/bnx2i_iscsi.c
@@ -2259,6 +2259,7 @@ static struct scsi_host_template bnx2i_host_template = {
 	.name			= "QLogic Offload iSCSI Initiator",
 	.proc_name		= "bnx2i",
 	.queuecommand		= iscsi_queuecommand,
+	.eh_timed_out		= iscsi_eh_cmd_timed_out,
 	.eh_abort_handler	= iscsi_eh_abort,
 	.eh_device_reset_handler = iscsi_eh_device_reset,
 	.eh_target_reset_handler = iscsi_eh_recover_target,
diff --git a/drivers/scsi/csiostor/csio_scsi.c b/drivers/scsi/csiostor/csio_scsi.c
index 89a52b941ea8..a1ff75f1384f 100644
--- a/drivers/scsi/csiostor/csio_scsi.c
+++ b/drivers/scsi/csiostor/csio_scsi.c
@@ -2270,6 +2270,7 @@ struct scsi_host_template csio_fcoe_shost_template = {
 	.name			= CSIO_DRV_DESC,
 	.proc_name		= KBUILD_MODNAME,
 	.queuecommand		= csio_queuecommand,
+	.eh_timed_out		= fc_eh_timed_out,
 	.eh_abort_handler	= csio_eh_abort_handler,
 	.eh_device_reset_handler = csio_eh_lun_reset_handler,
 	.slave_alloc		= csio_slave_alloc,
@@ -2289,6 +2290,7 @@ struct scsi_host_template csio_fcoe_shost_vport_template = {
 	.name			= CSIO_DRV_DESC,
 	.proc_name		= KBUILD_MODNAME,
 	.queuecommand		= csio_queuecommand,
+	.eh_timed_out		= fc_eh_timed_out,
 	.eh_abort_handler	= csio_eh_abort_handler,
 	.eh_device_reset_handler = csio_eh_lun_reset_handler,
 	.slave_alloc		= csio_slave_alloc,
diff --git a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c
index 33e83464e091..1880eb6c68f7 100644
--- a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c
+++ b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c
@@ -90,6 +90,7 @@ static struct scsi_host_template cxgb3i_host_template = {
 	.sg_tablesize	= SG_ALL,
 	.max_sectors	= 0xFFFF,
 	.cmd_per_lun	= ISCSI_DEF_CMD_PER_LUN,
+	.eh_timed_out	= iscsi_eh_cmd_timed_out,
 	.eh_abort_handler = iscsi_eh_abort,
 	.eh_device_reset_handler = iscsi_eh_device_reset,
 	.eh_target_reset_handler = iscsi_eh_recover_target,
diff --git a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
index 9a2fdc305cf2..3fb3f5708ff7 100644
--- a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
+++ b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
@@ -103,6 +103,7 @@ static struct scsi_host_template cxgb4i_host_template = {
 	.sg_tablesize	= SG_ALL,
 	.max_sectors	= 0xFFFF,
 	.cmd_per_lun	= ISCSI_DEF_CMD_PER_LUN,
+	.eh_timed_out	= iscsi_eh_cmd_timed_out,
 	.eh_abort_handler = iscsi_eh_abort,
 	.eh_device_reset_handler = iscsi_eh_device_reset,
 	.eh_target_reset_handler = iscsi_eh_recover_target,
diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
index 59150cad0353..86af57f7c11a 100644
--- a/drivers/scsi/fcoe/fcoe.c
+++ b/drivers/scsi/fcoe/fcoe.c
@@ -277,6 +277,7 @@ static struct scsi_host_template fcoe_shost_template = {
 	.name = "FCoE Driver",
 	.proc_name = FCOE_NAME,
 	.queuecommand = fc_queuecommand,
+	.eh_timed_out = fc_eh_timed_out,
 	.eh_abort_handler = fc_eh_abort,
 	.eh_device_reset_handler = fc_eh_device_reset,
 	.eh_host_reset_handler = fc_eh_host_reset,
diff --git a/drivers/scsi/fnic/fnic_main.c b/drivers/scsi/fnic/fnic_main.c
index 58ce9020d69c..ba58b7953263 100644
--- a/drivers/scsi/fnic/fnic_main.c
+++ b/drivers/scsi/fnic/fnic_main.c
@@ -106,6 +106,7 @@ static struct scsi_host_template fnic_host_template = {
 	.module = THIS_MODULE,
 	.name = DRV_NAME,
 	.queuecommand = fnic_queuecommand,
+	.eh_timed_out = fc_eh_timed_out,
 	.eh_abort_handler = fnic_abort_cmd,
 	.eh_device_reset_handler = fnic_device_reset,
 	.eh_host_reset_handler = fnic_host_reset,
diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index 78b72c28a55d..2c92dabb55f6 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -3090,6 +3090,7 @@ static struct scsi_host_template driver_template = {
 	.name = "IBM POWER Virtual FC Adapter",
 	.proc_name = IBMVFC_NAME,
 	.queuecommand = ibmvfc_queuecommand,
+	.eh_timed_out = fc_eh_timed_out,
 	.eh_abort_handler = ibmvfc_eh_abort_handler,
 	.eh_device_reset_handler = ibmvfc_eh_device_reset_handler,
 	.eh_target_reset_handler = ibmvfc_eh_target_reset_handler,
diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c
index 50cd01165e35..1deb0a9f14a6 100644
--- a/drivers/scsi/ibmvscsi/ibmvscsi.c
+++ b/drivers/scsi/ibmvscsi/ibmvscsi.c
@@ -2072,6 +2072,7 @@ static struct scsi_host_template driver_template = {
 	.name = "IBM POWER Virtual SCSI Adapter " IBMVSCSI_VERSION,
 	.proc_name = "ibmvscsi",
 	.queuecommand = ibmvscsi_queuecommand,
+	.eh_timed_out = srp_timed_out,
 	.eh_abort_handler = ibmvscsi_eh_abort_handler,
 	.eh_device_reset_handler = ibmvscsi_eh_device_reset_handler,
 	.eh_host_reset_handler = ibmvscsi_eh_host_reset_handler,
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index ace4f1f41b8e..4228aba1f654 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -967,6 +967,7 @@ static struct scsi_host_template iscsi_sw_tcp_sht = {
 	.sg_tablesize		= 4096,
 	.max_sectors		= 0xFFFF,
 	.cmd_per_lun		= ISCSI_DEF_CMD_PER_LUN,
+	.eh_timed_out		= iscsi_eh_cmd_timed_out,
 	.eh_abort_handler       = iscsi_eh_abort,
 	.eh_device_reset_handler= iscsi_eh_device_reset,
 	.eh_target_reset_handler = iscsi_eh_recover_target,
diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
index f9b6fba689ff..834d1212b6d5 100644
--- a/drivers/scsi/libiscsi.c
+++ b/drivers/scsi/libiscsi.c
@@ -1930,7 +1930,7 @@ static int iscsi_has_ping_timed_out(struct iscsi_conn *conn)
 		return 0;
 }
 
-static enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc)
+enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc)
 {
 	enum blk_eh_timer_return rc = BLK_EH_NOT_HANDLED;
 	struct iscsi_task *task = NULL, *running_task;
@@ -2063,6 +2063,7 @@ done:
 		     "timer reset" : "nh");
 	return rc;
 }
+EXPORT_SYMBOL_GPL(iscsi_eh_cmd_timed_out);
 
 static void iscsi_check_transport_timeouts(unsigned long data)
 {
@@ -2585,8 +2586,6 @@ int iscsi_host_add(struct Scsi_Host *shost, struct device *pdev)
 	if (!shost->cmd_per_lun)
 		shost->cmd_per_lun = ISCSI_DEF_CMD_PER_LUN;
 
-	if (!shost->transportt->eh_timed_out)
-		shost->transportt->eh_timed_out = iscsi_eh_cmd_timed_out;
 	return scsi_add_host(shost, pdev);
 }
 EXPORT_SYMBOL_GPL(iscsi_host_add);
diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c
index 19d349fc889f..1180a22beb43 100644
--- a/drivers/scsi/lpfc/lpfc_scsi.c
+++ b/drivers/scsi/lpfc/lpfc_scsi.c
@@ -5929,6 +5929,7 @@ struct scsi_host_template lpfc_template = {
 	.proc_name		= LPFC_DRIVER_NAME,
 	.info			= lpfc_info,
 	.queuecommand		= lpfc_queuecommand,
+	.eh_timed_out		= fc_eh_timed_out,
 	.eh_abort_handler	= lpfc_abort_handler,
 	.eh_device_reset_handler = lpfc_device_reset_handler,
 	.eh_target_reset_handler = lpfc_target_reset_handler,
@@ -5955,6 +5956,7 @@ struct scsi_host_template lpfc_vport_template = {
 	.proc_name		= LPFC_DRIVER_NAME,
 	.info			= lpfc_info,
 	.queuecommand		= lpfc_queuecommand,
+	.eh_timed_out		= fc_eh_timed_out,
 	.eh_abort_handler	= lpfc_abort_handler,
 	.eh_device_reset_handler = lpfc_device_reset_handler,
 	.eh_target_reset_handler = lpfc_target_reset_handler,
diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c
index d6a205433b66..8b25f0a1eb8c 100644
--- a/drivers/scsi/qedi/qedi_iscsi.c
+++ b/drivers/scsi/qedi/qedi_iscsi.c
@@ -48,6 +48,7 @@ struct scsi_host_template qedi_host_template = {
 	.name = "QLogic QEDI 25/40/100Gb iSCSI Initiator Driver",
 	.proc_name = QEDI_MODULE_NAME,
 	.queuecommand = iscsi_queuecommand,
+	.eh_timed_out = iscsi_eh_cmd_timed_out,
 	.eh_abort_handler = iscsi_eh_abort,
 	.eh_device_reset_handler = iscsi_eh_device_reset,
 	.eh_target_reset_handler = iscsi_eh_recover_target,
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index 8521cfe302e9..c5a9d6c295a0 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -262,6 +262,7 @@ struct scsi_host_template qla2xxx_driver_template = {
 	.name			= QLA2XXX_DRIVER_NAME,
 	.queuecommand		= qla2xxx_queuecommand,
 
+	.eh_timed_out		= fc_eh_timed_out,
 	.eh_abort_handler	= qla2xxx_eh_abort,
 	.eh_device_reset_handler = qla2xxx_eh_device_reset,
 	.eh_target_reset_handler = qla2xxx_eh_target_reset,
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 996e134d79fa..9d7bfbb02389 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -279,9 +279,7 @@ enum blk_eh_timer_return scsi_times_out(struct request *req)
 	if (host->eh_deadline != -1 && !host->last_reset)
 		host->last_reset = jiffies;
 
-	if (host->transportt->eh_timed_out)
-		rtn = host->transportt->eh_timed_out(scmd);
-	else if (host->hostt->eh_timed_out)
+	if (host->hostt->eh_timed_out)
 		rtn = host->hostt->eh_timed_out(scmd);
 
 	if (rtn == BLK_EH_NOT_HANDLED) {
diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 03577bde6ac5..9a6ea6fccb06 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -2055,7 +2055,7 @@ static int fc_vport_match(struct attribute_container *cont,
 
 
 /**
- * fc_timed_out - FC Transport I/O timeout intercept handler
+ * fc_eh_timed_out - FC Transport I/O timeout intercept handler
  * @scmd:	The SCSI command which timed out
  *
  * This routine protects against error handlers getting invoked while a
@@ -2076,8 +2076,8 @@ static int fc_vport_match(struct attribute_container *cont,
  * Notes:
  *	This routine assumes no locks are held on entry.
  */
-static enum blk_eh_timer_return
-fc_timed_out(struct scsi_cmnd *scmd)
+enum blk_eh_timer_return
+fc_eh_timed_out(struct scsi_cmnd *scmd)
 {
 	struct fc_rport *rport = starget_to_rport(scsi_target(scmd->device));
 
@@ -2086,6 +2086,7 @@ fc_timed_out(struct scsi_cmnd *scmd)
 
 	return BLK_EH_NOT_HANDLED;
 }
+EXPORT_SYMBOL(fc_eh_timed_out);
 
 /*
  * Called by fc_user_scan to locate an rport on the shost that
@@ -2211,8 +2212,6 @@ fc_attach_transport(struct fc_function_template *ft)
 	/* Transport uses the shost workq for scsi scanning */
 	i->t.create_work_queue = 1;
 
-	i->t.eh_timed_out = fc_timed_out;
-
 	i->t.user_scan = fc_user_scan;
 
 	/* target-mode drivers' functions */
diff --git a/drivers/scsi/scsi_transport_srp.c b/drivers/scsi/scsi_transport_srp.c
index b87a78673f65..75b57a1855b0 100644
--- a/drivers/scsi/scsi_transport_srp.c
+++ b/drivers/scsi/scsi_transport_srp.c
@@ -591,7 +591,7 @@ EXPORT_SYMBOL(srp_reconnect_rport);
  * Note: This function is called from soft-IRQ context and with the request
  * queue lock held.
  */
-static enum blk_eh_timer_return srp_timed_out(struct scsi_cmnd *scmd)
+enum blk_eh_timer_return srp_timed_out(struct scsi_cmnd *scmd)
 {
 	struct scsi_device *sdev = scmd->device;
 	struct Scsi_Host *shost = sdev->host;
@@ -603,6 +603,7 @@ static enum blk_eh_timer_return srp_timed_out(struct scsi_cmnd *scmd)
 		i->f->reset_timer_if_blocked && scsi_device_blocked(sdev) ?
 		BLK_EH_RESET_TIMER : BLK_EH_NOT_HANDLED;
 }
+EXPORT_SYMBOL(srp_timed_out);
 
 static void srp_rport_release(struct device *dev)
 {
@@ -820,8 +821,6 @@ srp_attach_transport(struct srp_function_template *ft)
 	if (!i)
 		return NULL;
 
-	i->t.eh_timed_out = srp_timed_out;
-
 	i->t.tsk_mgmt_response = srp_tsk_mgmt_response;
 	i->t.it_nexus_response = srp_it_nexus_response;
 
diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index ceadf6b4ebc1..585e54f6512c 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -1899,11 +1899,6 @@ static int __init storvsc_drv_init(void)
 	fc_transport_template = fc_attach_transport(&fc_transport_functions);
 	if (!fc_transport_template)
 		return -ENODEV;
-
-	/*
-	 * Install Hyper-V specific timeout handler.
-	 */
-	fc_transport_template->eh_timed_out = storvsc_eh_timed_out;
 #endif
 
 	ret = vmbus_driver_register(&storvsc_drv);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index c170be548b7f..46e18c0619c6 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1130,6 +1130,7 @@ extern int ata_sas_port_start(struct ata_port *ap);
 extern void ata_sas_port_stop(struct ata_port *ap);
 extern int ata_sas_slave_configure(struct scsi_device *, struct ata_port *);
 extern int ata_sas_queuecmd(struct scsi_cmnd *cmd, struct ata_port *ap);
+extern enum blk_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd);
 extern int sata_scr_valid(struct ata_link *link);
 extern int sata_scr_read(struct ata_link *link, int reg, u32 *val);
 extern int sata_scr_write(struct ata_link *link, int reg, u32 val);
@@ -1355,6 +1356,7 @@ extern struct device_attribute *ata_common_sdev_attrs[];
 	.proc_name		= drv_name,			\
 	.slave_configure	= ata_scsi_slave_config,	\
 	.slave_destroy		= ata_scsi_slave_destroy,	\
+	.eh_timed_out		= ata_scsi_timed_out,		\
 	.bios_param		= ata_std_bios_param,		\
 	.unlock_native_capacity	= ata_scsi_unlock_native_capacity, \
 	.sdev_attrs		= ata_common_sdev_attrs
diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h
index 4d1c46aac331..b0e275de6dec 100644
--- a/include/scsi/libiscsi.h
+++ b/include/scsi/libiscsi.h
@@ -383,6 +383,7 @@ extern int iscsi_eh_recover_target(struct scsi_cmnd *sc);
 extern int iscsi_eh_session_reset(struct scsi_cmnd *sc);
 extern int iscsi_eh_device_reset(struct scsi_cmnd *sc);
 extern int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc);
+extern enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc);
 
 /*
  * iSCSI host helpers.
diff --git a/include/scsi/scsi_transport.h b/include/scsi/scsi_transport.h
index 81292392adbc..6c3bb9f3dc0f 100644
--- a/include/scsi/scsi_transport.h
+++ b/include/scsi/scsi_transport.h
@@ -57,17 +57,6 @@ struct scsi_transport_template {
 	 */
 	void (* eh_strategy_handler)(struct Scsi_Host *);
 
-	/*
-	 * This is an optional routine that allows the transport to become
-	 * involved when a scsi io timer fires. The return value tells the
-	 * timer routine how to finish the io timeout handling:
-	 * EH_HANDLED:		I fixed the error, please complete the command
-	 * EH_RESET_TIMER:	I need more time, reset the timer and
-	 *			begin counting again
-	 * EH_NOT_HANDLED	Begin normal error recovery
-	 */
-	enum blk_eh_timer_return (*eh_timed_out)(struct scsi_cmnd *);
-
 	/*
 	 * Used as callback for the completion of i_t_nexus request
 	 * for target drivers.
diff --git a/include/scsi/scsi_transport_fc.h b/include/scsi/scsi_transport_fc.h
index 924c8e614b45..b21b8aa58c4d 100644
--- a/include/scsi/scsi_transport_fc.h
+++ b/include/scsi/scsi_transport_fc.h
@@ -808,6 +808,7 @@ struct fc_vport *fc_vport_create(struct Scsi_Host *shost, int channel,
 		struct fc_vport_identifiers *);
 int fc_vport_terminate(struct fc_vport *vport);
 int fc_block_scsi_eh(struct scsi_cmnd *cmnd);
+enum blk_eh_timer_return fc_eh_timed_out(struct scsi_cmnd *scmd);
 
 static inline struct Scsi_Host *fc_bsg_to_shost(struct bsg_job *job)
 {
diff --git a/include/scsi/scsi_transport_srp.h b/include/scsi/scsi_transport_srp.h
index d40d3ef25707..20d96797edc5 100644
--- a/include/scsi/scsi_transport_srp.h
+++ b/include/scsi/scsi_transport_srp.h
@@ -124,6 +124,7 @@ extern int srp_reconnect_rport(struct srp_rport *rport);
 extern void srp_start_tl_fail_timers(struct srp_rport *rport);
 extern void srp_remove_host(struct Scsi_Host *);
 extern void srp_stop_rport_timers(struct srp_rport *rport);
+enum blk_eh_timer_return srp_timed_out(struct scsi_cmnd *scmd);
 
 /**
  * srp_chkready() - evaluate the transport layer state before I/O
-- 
cgit v1.2.3


From 46f47e48008b63f5fd3a3bad8b79ba1a89fb625f Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 24 Jan 2017 10:58:06 -0800
Subject: fscrypt: split supp and notsupp declarations into their own headers

Previously, each filesystem configured without encryption support would
define all the public fscrypt functions to their notsupp_* stubs.  This
list of #defines had to be updated in every filesystem whenever a change
was made to the public fscrypt functions.  To make things more
maintainable now that we have three filesystems using fscrypt, split the
old header fscrypto.h into several new headers.  fscrypt_supp.h contains
the real declarations and is included by filesystems when configured
with encryption support, whereas fscrypt_notsupp.h contains the inline
stubs and is included by filesystems when configured without encryption
support.  fscrypt_common.h contains common declarations needed by both.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 MAINTAINERS                     |   2 +-
 fs/crypto/fscrypt_private.h     |   2 +-
 fs/ext4/ext4.h                  |  28 +---
 fs/ext4/page-io.c               |   1 -
 fs/f2fs/f2fs.h                  |  30 +---
 fs/ubifs/ubifs.h                |  28 +---
 include/linux/fscrypt_common.h  | 146 +++++++++++++++++
 include/linux/fscrypt_notsupp.h | 168 +++++++++++++++++++
 include/linux/fscrypt_supp.h    |  66 ++++++++
 include/linux/fscrypto.h        | 347 ----------------------------------------
 10 files changed, 397 insertions(+), 421 deletions(-)
 create mode 100644 include/linux/fscrypt_common.h
 create mode 100644 include/linux/fscrypt_notsupp.h
 create mode 100644 include/linux/fscrypt_supp.h
 delete mode 100644 include/linux/fscrypto.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index cfff2c9e3d94..4d20b03648fa 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5268,7 +5268,7 @@ M:	Jaegeuk Kim <jaegeuk@kernel.org>
 L:	linux-fsdevel@vger.kernel.org
 S:	Supported
 F:	fs/crypto/
-F:	include/linux/fscrypto.h
+F:	include/linux/fscrypt*.h
 
 F2FS FILE SYSTEM
 M:	Jaegeuk Kim <jaegeuk@kernel.org>
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 7bff7b4c7498..e8a229e1d45d 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -11,7 +11,7 @@
 #ifndef _FSCRYPT_PRIVATE_H
 #define _FSCRYPT_PRIVATE_H
 
-#include <linux/fscrypto.h>
+#include <linux/fscrypt_supp.h>
 
 #define FS_FNAME_CRYPTO_DIGEST_SIZE	32
 
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6bcb9622fdf9..a5c756b80b7d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -32,7 +32,11 @@
 #include <linux/percpu_counter.h>
 #include <linux/ratelimit.h>
 #include <crypto/hash.h>
-#include <linux/fscrypto.h>
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#include <linux/fscrypt_supp.h>
+#else
+#include <linux/fscrypt_notsupp.h>
+#endif
 #include <linux/falloc.h>
 #include <linux/percpu-rwsem.h>
 #ifdef __KERNEL__
@@ -2309,28 +2313,6 @@ static inline int ext4_fname_setup_filename(struct inode *dir,
 }
 static inline void ext4_fname_free_filename(struct ext4_filename *fname) { }
 
-#define fscrypt_set_d_op(i)
-#define fscrypt_get_ctx			fscrypt_notsupp_get_ctx
-#define fscrypt_release_ctx		fscrypt_notsupp_release_ctx
-#define fscrypt_encrypt_page		fscrypt_notsupp_encrypt_page
-#define fscrypt_decrypt_page		fscrypt_notsupp_decrypt_page
-#define fscrypt_decrypt_bio_pages	fscrypt_notsupp_decrypt_bio_pages
-#define fscrypt_pullback_bio_page	fscrypt_notsupp_pullback_bio_page
-#define fscrypt_restore_control_page	fscrypt_notsupp_restore_control_page
-#define fscrypt_zeroout_range		fscrypt_notsupp_zeroout_range
-#define fscrypt_ioctl_set_policy	fscrypt_notsupp_ioctl_set_policy
-#define fscrypt_ioctl_get_policy	fscrypt_notsupp_ioctl_get_policy
-#define fscrypt_has_permitted_context	fscrypt_notsupp_has_permitted_context
-#define fscrypt_inherit_context		fscrypt_notsupp_inherit_context
-#define fscrypt_get_encryption_info	fscrypt_notsupp_get_encryption_info
-#define fscrypt_put_encryption_info	fscrypt_notsupp_put_encryption_info
-#define fscrypt_setup_filename		fscrypt_notsupp_setup_filename
-#define fscrypt_free_filename		fscrypt_notsupp_free_filename
-#define fscrypt_fname_encrypted_size	fscrypt_notsupp_fname_encrypted_size
-#define fscrypt_fname_alloc_buffer	fscrypt_notsupp_fname_alloc_buffer
-#define fscrypt_fname_free_buffer	fscrypt_notsupp_fname_free_buffer
-#define fscrypt_fname_disk_to_usr	fscrypt_notsupp_fname_disk_to_usr
-#define fscrypt_fname_usr_to_disk	fscrypt_notsupp_fname_usr_to_disk
 #endif
 
 /* dir.c */
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index d83b0f3c5fe9..e55624c50898 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -24,7 +24,6 @@
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/backing-dev.h>
-#include <linux/fscrypto.h>
 
 #include "ext4_jbd2.h"
 #include "xattr.h"
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 93d38d854a41..069fc7277d8d 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -22,7 +22,11 @@
 #include <linux/vmalloc.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
-#include <linux/fscrypto.h>
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#include <linux/fscrypt_supp.h>
+#else
+#include <linux/fscrypt_notsupp.h>
+#endif
 #include <crypto/hash.h>
 
 #ifdef CONFIG_F2FS_CHECK_FS
@@ -2501,28 +2505,4 @@ static inline bool f2fs_may_encrypt(struct inode *inode)
 #endif
 }
 
-#ifndef CONFIG_F2FS_FS_ENCRYPTION
-#define fscrypt_set_d_op(i)
-#define fscrypt_get_ctx			fscrypt_notsupp_get_ctx
-#define fscrypt_release_ctx		fscrypt_notsupp_release_ctx
-#define fscrypt_encrypt_page		fscrypt_notsupp_encrypt_page
-#define fscrypt_decrypt_page		fscrypt_notsupp_decrypt_page
-#define fscrypt_decrypt_bio_pages	fscrypt_notsupp_decrypt_bio_pages
-#define fscrypt_pullback_bio_page	fscrypt_notsupp_pullback_bio_page
-#define fscrypt_restore_control_page	fscrypt_notsupp_restore_control_page
-#define fscrypt_zeroout_range		fscrypt_notsupp_zeroout_range
-#define fscrypt_ioctl_set_policy	fscrypt_notsupp_ioctl_set_policy
-#define fscrypt_ioctl_get_policy	fscrypt_notsupp_ioctl_get_policy
-#define fscrypt_has_permitted_context	fscrypt_notsupp_has_permitted_context
-#define fscrypt_inherit_context		fscrypt_notsupp_inherit_context
-#define fscrypt_get_encryption_info	fscrypt_notsupp_get_encryption_info
-#define fscrypt_put_encryption_info	fscrypt_notsupp_put_encryption_info
-#define fscrypt_setup_filename		fscrypt_notsupp_setup_filename
-#define fscrypt_free_filename		fscrypt_notsupp_free_filename
-#define fscrypt_fname_encrypted_size	fscrypt_notsupp_fname_encrypted_size
-#define fscrypt_fname_alloc_buffer	fscrypt_notsupp_fname_alloc_buffer
-#define fscrypt_fname_free_buffer	fscrypt_notsupp_fname_free_buffer
-#define fscrypt_fname_disk_to_usr	fscrypt_notsupp_fname_disk_to_usr
-#define fscrypt_fname_usr_to_disk	fscrypt_notsupp_fname_usr_to_disk
-#endif
 #endif
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index ca72382ce6cc..d9df379bacc6 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -38,7 +38,11 @@
 #include <linux/backing-dev.h>
 #include <linux/security.h>
 #include <linux/xattr.h>
-#include <linux/fscrypto.h>
+#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+#include <linux/fscrypt_supp.h>
+#else
+#include <linux/fscrypt_notsupp.h>
+#endif
 #include <linux/random.h>
 #include "ubifs-media.h"
 
@@ -1797,28 +1801,6 @@ int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len,
 #include "key.h"
 
 #ifndef CONFIG_UBIFS_FS_ENCRYPTION
-#define fscrypt_set_d_op(i)
-#define fscrypt_get_ctx                 fscrypt_notsupp_get_ctx
-#define fscrypt_release_ctx             fscrypt_notsupp_release_ctx
-#define fscrypt_encrypt_page            fscrypt_notsupp_encrypt_page
-#define fscrypt_decrypt_page            fscrypt_notsupp_decrypt_page
-#define fscrypt_decrypt_bio_pages       fscrypt_notsupp_decrypt_bio_pages
-#define fscrypt_pullback_bio_page       fscrypt_notsupp_pullback_bio_page
-#define fscrypt_restore_control_page    fscrypt_notsupp_restore_control_page
-#define fscrypt_zeroout_range           fscrypt_notsupp_zeroout_range
-#define fscrypt_ioctl_set_policy	fscrypt_notsupp_ioctl_set_policy
-#define fscrypt_ioctl_get_policy	fscrypt_notsupp_ioctl_get_policy
-#define fscrypt_has_permitted_context   fscrypt_notsupp_has_permitted_context
-#define fscrypt_inherit_context         fscrypt_notsupp_inherit_context
-#define fscrypt_get_encryption_info     fscrypt_notsupp_get_encryption_info
-#define fscrypt_put_encryption_info     fscrypt_notsupp_put_encryption_info
-#define fscrypt_setup_filename          fscrypt_notsupp_setup_filename
-#define fscrypt_free_filename           fscrypt_notsupp_free_filename
-#define fscrypt_fname_encrypted_size    fscrypt_notsupp_fname_encrypted_size
-#define fscrypt_fname_alloc_buffer      fscrypt_notsupp_fname_alloc_buffer
-#define fscrypt_fname_free_buffer       fscrypt_notsupp_fname_free_buffer
-#define fscrypt_fname_disk_to_usr       fscrypt_notsupp_fname_disk_to_usr
-#define fscrypt_fname_usr_to_disk       fscrypt_notsupp_fname_usr_to_disk
 static inline int ubifs_encrypt(const struct inode *inode,
 				struct ubifs_data_node *dn,
 				unsigned int in_len, unsigned int *out_len,
diff --git a/include/linux/fscrypt_common.h b/include/linux/fscrypt_common.h
new file mode 100644
index 000000000000..547f81592ba1
--- /dev/null
+++ b/include/linux/fscrypt_common.h
@@ -0,0 +1,146 @@
+/*
+ * fscrypt_common.h: common declarations for per-file encryption
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * Written by Michael Halcrow, 2015.
+ * Modified by Jaegeuk Kim, 2015.
+ */
+
+#ifndef _LINUX_FSCRYPT_COMMON_H
+#define _LINUX_FSCRYPT_COMMON_H
+
+#include <linux/key.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/bio.h>
+#include <linux/dcache.h>
+#include <crypto/skcipher.h>
+#include <uapi/linux/fs.h>
+
+#define FS_CRYPTO_BLOCK_SIZE		16
+
+struct fscrypt_info;
+
+struct fscrypt_ctx {
+	union {
+		struct {
+			struct page *bounce_page;	/* Ciphertext page */
+			struct page *control_page;	/* Original page  */
+		} w;
+		struct {
+			struct bio *bio;
+			struct work_struct work;
+		} r;
+		struct list_head free_list;	/* Free list */
+	};
+	u8 flags;				/* Flags */
+};
+
+/**
+ * For encrypted symlinks, the ciphertext length is stored at the beginning
+ * of the string in little-endian format.
+ */
+struct fscrypt_symlink_data {
+	__le16 len;
+	char encrypted_path[1];
+} __packed;
+
+/**
+ * This function is used to calculate the disk space required to
+ * store a filename of length l in encrypted symlink format.
+ */
+static inline u32 fscrypt_symlink_data_len(u32 l)
+{
+	if (l < FS_CRYPTO_BLOCK_SIZE)
+		l = FS_CRYPTO_BLOCK_SIZE;
+	return (l + sizeof(struct fscrypt_symlink_data) - 1);
+}
+
+struct fscrypt_str {
+	unsigned char *name;
+	u32 len;
+};
+
+struct fscrypt_name {
+	const struct qstr *usr_fname;
+	struct fscrypt_str disk_name;
+	u32 hash;
+	u32 minor_hash;
+	struct fscrypt_str crypto_buf;
+};
+
+#define FSTR_INIT(n, l)		{ .name = n, .len = l }
+#define FSTR_TO_QSTR(f)		QSTR_INIT((f)->name, (f)->len)
+#define fname_name(p)		((p)->disk_name.name)
+#define fname_len(p)		((p)->disk_name.len)
+
+/*
+ * fscrypt superblock flags
+ */
+#define FS_CFLG_OWN_PAGES (1U << 1)
+
+/*
+ * crypto opertions for filesystems
+ */
+struct fscrypt_operations {
+	unsigned int flags;
+	const char *key_prefix;
+	int (*get_context)(struct inode *, void *, size_t);
+	int (*prepare_context)(struct inode *);
+	int (*set_context)(struct inode *, const void *, size_t, void *);
+	int (*dummy_context)(struct inode *);
+	bool (*is_encrypted)(struct inode *);
+	bool (*empty_dir)(struct inode *);
+	unsigned (*max_namelen)(struct inode *);
+};
+
+static inline bool fscrypt_dummy_context_enabled(struct inode *inode)
+{
+	if (inode->i_sb->s_cop->dummy_context &&
+				inode->i_sb->s_cop->dummy_context(inode))
+		return true;
+	return false;
+}
+
+static inline bool fscrypt_valid_contents_enc_mode(u32 mode)
+{
+	return (mode == FS_ENCRYPTION_MODE_AES_256_XTS);
+}
+
+static inline bool fscrypt_valid_filenames_enc_mode(u32 mode)
+{
+	return (mode == FS_ENCRYPTION_MODE_AES_256_CTS);
+}
+
+static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
+{
+	if (str->len == 1 && str->name[0] == '.')
+		return true;
+
+	if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.')
+		return true;
+
+	return false;
+}
+
+static inline struct page *fscrypt_control_page(struct page *page)
+{
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+	return ((struct fscrypt_ctx *)page_private(page))->w.control_page;
+#else
+	WARN_ON_ONCE(1);
+	return ERR_PTR(-EINVAL);
+#endif
+}
+
+static inline int fscrypt_has_encryption_key(const struct inode *inode)
+{
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+	return (inode->i_crypt_info != NULL);
+#else
+	return 0;
+#endif
+}
+
+#endif	/* _LINUX_FSCRYPT_COMMON_H */
diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h
new file mode 100644
index 000000000000..3511ca798804
--- /dev/null
+++ b/include/linux/fscrypt_notsupp.h
@@ -0,0 +1,168 @@
+/*
+ * fscrypt_notsupp.h
+ *
+ * This stubs out the fscrypt functions for filesystems configured without
+ * encryption support.
+ */
+
+#ifndef _LINUX_FSCRYPT_NOTSUPP_H
+#define _LINUX_FSCRYPT_NOTSUPP_H
+
+#include <linux/fscrypt_common.h>
+
+/* crypto.c */
+static inline struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode,
+						  gfp_t gfp_flags)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void fscrypt_release_ctx(struct fscrypt_ctx *ctx)
+{
+	return;
+}
+
+static inline struct page *fscrypt_encrypt_page(const struct inode *inode,
+						struct page *page,
+						unsigned int len,
+						unsigned int offs,
+						u64 lblk_num, gfp_t gfp_flags)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline int fscrypt_decrypt_page(const struct inode *inode,
+				       struct page *page,
+				       unsigned int len, unsigned int offs,
+				       u64 lblk_num)
+{
+	return -EOPNOTSUPP;
+}
+
+
+static inline void fscrypt_restore_control_page(struct page *page)
+{
+	return;
+}
+
+static inline void fscrypt_set_d_op(struct dentry *dentry)
+{
+	return;
+}
+
+static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry)
+{
+	return;
+}
+
+/* policy.c */
+static inline int fscrypt_ioctl_set_policy(struct file *filp,
+					   const void __user *arg)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int fscrypt_has_permitted_context(struct inode *parent,
+						struct inode *child)
+{
+	return 0;
+}
+
+static inline int fscrypt_inherit_context(struct inode *parent,
+					  struct inode *child,
+					  void *fs_data, bool preload)
+{
+	return -EOPNOTSUPP;
+}
+
+/* keyinfo.c */
+static inline int fscrypt_get_encryption_info(struct inode *inode)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void fscrypt_put_encryption_info(struct inode *inode,
+					       struct fscrypt_info *ci)
+{
+	return;
+}
+
+ /* fname.c */
+static inline int fscrypt_setup_filename(struct inode *dir,
+					 const struct qstr *iname,
+					 int lookup, struct fscrypt_name *fname)
+{
+	if (dir->i_sb->s_cop->is_encrypted(dir))
+		return -EOPNOTSUPP;
+
+	memset(fname, 0, sizeof(struct fscrypt_name));
+	fname->usr_fname = iname;
+	fname->disk_name.name = (unsigned char *)iname->name;
+	fname->disk_name.len = iname->len;
+	return 0;
+}
+
+static inline void fscrypt_free_filename(struct fscrypt_name *fname)
+{
+	return;
+}
+
+static inline u32 fscrypt_fname_encrypted_size(const struct inode *inode,
+					       u32 ilen)
+{
+	/* never happens */
+	WARN_ON(1);
+	return 0;
+}
+
+static inline int fscrypt_fname_alloc_buffer(const struct inode *inode,
+					     u32 ilen,
+					     struct fscrypt_str *crypto_str)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str)
+{
+	return;
+}
+
+static inline int fscrypt_fname_disk_to_usr(struct inode *inode,
+					    u32 hash, u32 minor_hash,
+					    const struct fscrypt_str *iname,
+					    struct fscrypt_str *oname)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int fscrypt_fname_usr_to_disk(struct inode *inode,
+					    const struct qstr *iname,
+					    struct fscrypt_str *oname)
+{
+	return -EOPNOTSUPP;
+}
+
+/* bio.c */
+static inline void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx,
+					     struct bio *bio)
+{
+	return;
+}
+
+static inline void fscrypt_pullback_bio_page(struct page **page, bool restore)
+{
+	return;
+}
+
+static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
+					sector_t pblk, unsigned int len)
+{
+	return -EOPNOTSUPP;
+}
+
+#endif	/* _LINUX_FSCRYPT_NOTSUPP_H */
diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h
new file mode 100644
index 000000000000..a140f47e9b27
--- /dev/null
+++ b/include/linux/fscrypt_supp.h
@@ -0,0 +1,66 @@
+/*
+ * fscrypt_supp.h
+ *
+ * This is included by filesystems configured with encryption support.
+ */
+
+#ifndef _LINUX_FSCRYPT_SUPP_H
+#define _LINUX_FSCRYPT_SUPP_H
+
+#include <linux/fscrypt_common.h>
+
+/* crypto.c */
+extern struct kmem_cache *fscrypt_info_cachep;
+extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t);
+extern void fscrypt_release_ctx(struct fscrypt_ctx *);
+extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *,
+						unsigned int, unsigned int,
+						u64, gfp_t);
+extern int fscrypt_decrypt_page(const struct inode *, struct page *, unsigned int,
+				unsigned int, u64);
+extern void fscrypt_restore_control_page(struct page *);
+
+extern const struct dentry_operations fscrypt_d_ops;
+
+static inline void fscrypt_set_d_op(struct dentry *dentry)
+{
+	d_set_d_op(dentry, &fscrypt_d_ops);
+}
+
+static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY;
+	spin_unlock(&dentry->d_lock);
+}
+
+/* policy.c */
+extern int fscrypt_ioctl_set_policy(struct file *, const void __user *);
+extern int fscrypt_ioctl_get_policy(struct file *, void __user *);
+extern int fscrypt_has_permitted_context(struct inode *, struct inode *);
+extern int fscrypt_inherit_context(struct inode *, struct inode *,
+					void *, bool);
+/* keyinfo.c */
+extern int fscrypt_get_encryption_info(struct inode *);
+extern void fscrypt_put_encryption_info(struct inode *, struct fscrypt_info *);
+
+/* fname.c */
+extern int fscrypt_setup_filename(struct inode *, const struct qstr *,
+				int lookup, struct fscrypt_name *);
+extern void fscrypt_free_filename(struct fscrypt_name *);
+extern u32 fscrypt_fname_encrypted_size(const struct inode *, u32);
+extern int fscrypt_fname_alloc_buffer(const struct inode *, u32,
+				struct fscrypt_str *);
+extern void fscrypt_fname_free_buffer(struct fscrypt_str *);
+extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32,
+			const struct fscrypt_str *, struct fscrypt_str *);
+extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *,
+			struct fscrypt_str *);
+
+/* bio.c */
+extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *);
+extern void fscrypt_pullback_bio_page(struct page **, bool);
+extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t,
+				 unsigned int);
+
+#endif	/* _LINUX_FSCRYPT_SUPP_H */
diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h
deleted file mode 100644
index 715f17b3c6d7..000000000000
--- a/include/linux/fscrypto.h
+++ /dev/null
@@ -1,347 +0,0 @@
-/*
- * General per-file encryption definition
- *
- * Copyright (C) 2015, Google, Inc.
- *
- * Written by Michael Halcrow, 2015.
- * Modified by Jaegeuk Kim, 2015.
- */
-
-#ifndef _LINUX_FSCRYPTO_H
-#define _LINUX_FSCRYPTO_H
-
-#include <linux/key.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/bio.h>
-#include <linux/dcache.h>
-#include <crypto/skcipher.h>
-#include <uapi/linux/fs.h>
-
-#define FS_CRYPTO_BLOCK_SIZE		16
-
-struct fscrypt_info;
-
-struct fscrypt_ctx {
-	union {
-		struct {
-			struct page *bounce_page;	/* Ciphertext page */
-			struct page *control_page;	/* Original page  */
-		} w;
-		struct {
-			struct bio *bio;
-			struct work_struct work;
-		} r;
-		struct list_head free_list;	/* Free list */
-	};
-	u8 flags;				/* Flags */
-};
-
-/**
- * For encrypted symlinks, the ciphertext length is stored at the beginning
- * of the string in little-endian format.
- */
-struct fscrypt_symlink_data {
-	__le16 len;
-	char encrypted_path[1];
-} __packed;
-
-/**
- * This function is used to calculate the disk space required to
- * store a filename of length l in encrypted symlink format.
- */
-static inline u32 fscrypt_symlink_data_len(u32 l)
-{
-	if (l < FS_CRYPTO_BLOCK_SIZE)
-		l = FS_CRYPTO_BLOCK_SIZE;
-	return (l + sizeof(struct fscrypt_symlink_data) - 1);
-}
-
-struct fscrypt_str {
-	unsigned char *name;
-	u32 len;
-};
-
-struct fscrypt_name {
-	const struct qstr *usr_fname;
-	struct fscrypt_str disk_name;
-	u32 hash;
-	u32 minor_hash;
-	struct fscrypt_str crypto_buf;
-};
-
-#define FSTR_INIT(n, l)		{ .name = n, .len = l }
-#define FSTR_TO_QSTR(f)		QSTR_INIT((f)->name, (f)->len)
-#define fname_name(p)		((p)->disk_name.name)
-#define fname_len(p)		((p)->disk_name.len)
-
-/*
- * fscrypt superblock flags
- */
-#define FS_CFLG_OWN_PAGES (1U << 1)
-
-/*
- * crypto opertions for filesystems
- */
-struct fscrypt_operations {
-	unsigned int flags;
-	const char *key_prefix;
-	int (*get_context)(struct inode *, void *, size_t);
-	int (*prepare_context)(struct inode *);
-	int (*set_context)(struct inode *, const void *, size_t, void *);
-	int (*dummy_context)(struct inode *);
-	bool (*is_encrypted)(struct inode *);
-	bool (*empty_dir)(struct inode *);
-	unsigned (*max_namelen)(struct inode *);
-};
-
-static inline bool fscrypt_dummy_context_enabled(struct inode *inode)
-{
-	if (inode->i_sb->s_cop->dummy_context &&
-				inode->i_sb->s_cop->dummy_context(inode))
-		return true;
-	return false;
-}
-
-static inline bool fscrypt_valid_contents_enc_mode(u32 mode)
-{
-	return (mode == FS_ENCRYPTION_MODE_AES_256_XTS);
-}
-
-static inline bool fscrypt_valid_filenames_enc_mode(u32 mode)
-{
-	return (mode == FS_ENCRYPTION_MODE_AES_256_CTS);
-}
-
-static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
-{
-	if (str->len == 1 && str->name[0] == '.')
-		return true;
-
-	if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.')
-		return true;
-
-	return false;
-}
-
-static inline struct page *fscrypt_control_page(struct page *page)
-{
-#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
-	return ((struct fscrypt_ctx *)page_private(page))->w.control_page;
-#else
-	WARN_ON_ONCE(1);
-	return ERR_PTR(-EINVAL);
-#endif
-}
-
-static inline int fscrypt_has_encryption_key(const struct inode *inode)
-{
-#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
-	return (inode->i_crypt_info != NULL);
-#else
-	return 0;
-#endif
-}
-
-static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry)
-{
-#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
-	spin_lock(&dentry->d_lock);
-	dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY;
-	spin_unlock(&dentry->d_lock);
-#endif
-}
-
-#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
-extern const struct dentry_operations fscrypt_d_ops;
-#endif
-
-static inline void fscrypt_set_d_op(struct dentry *dentry)
-{
-#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
-	d_set_d_op(dentry, &fscrypt_d_ops);
-#endif
-}
-
-#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
-/* crypto.c */
-extern struct kmem_cache *fscrypt_info_cachep;
-extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t);
-extern void fscrypt_release_ctx(struct fscrypt_ctx *);
-extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *,
-						unsigned int, unsigned int,
-						u64, gfp_t);
-extern int fscrypt_decrypt_page(const struct inode *, struct page *, unsigned int,
-				unsigned int, u64);
-extern void fscrypt_restore_control_page(struct page *);
-
-/* policy.c */
-extern int fscrypt_ioctl_set_policy(struct file *, const void __user *);
-extern int fscrypt_ioctl_get_policy(struct file *, void __user *);
-extern int fscrypt_has_permitted_context(struct inode *, struct inode *);
-extern int fscrypt_inherit_context(struct inode *, struct inode *,
-					void *, bool);
-/* keyinfo.c */
-extern int fscrypt_get_encryption_info(struct inode *);
-extern void fscrypt_put_encryption_info(struct inode *, struct fscrypt_info *);
-
-/* fname.c */
-extern int fscrypt_setup_filename(struct inode *, const struct qstr *,
-				int lookup, struct fscrypt_name *);
-extern void fscrypt_free_filename(struct fscrypt_name *);
-extern u32 fscrypt_fname_encrypted_size(const struct inode *, u32);
-extern int fscrypt_fname_alloc_buffer(const struct inode *, u32,
-				struct fscrypt_str *);
-extern void fscrypt_fname_free_buffer(struct fscrypt_str *);
-extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32,
-			const struct fscrypt_str *, struct fscrypt_str *);
-extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *,
-			struct fscrypt_str *);
-
-/* bio.c */
-extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *);
-extern void fscrypt_pullback_bio_page(struct page **, bool);
-extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t,
-				 unsigned int);
-#endif
-
-/* crypto.c */
-static inline struct fscrypt_ctx *fscrypt_notsupp_get_ctx(const struct inode *i,
-							gfp_t f)
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
-static inline void fscrypt_notsupp_release_ctx(struct fscrypt_ctx *c)
-{
-	return;
-}
-
-static inline struct page *fscrypt_notsupp_encrypt_page(const struct inode *i,
-						struct page *p,
-						unsigned int len,
-						unsigned int offs,
-						u64 lblk_num, gfp_t f)
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
-static inline int fscrypt_notsupp_decrypt_page(const struct inode *i, struct page *p,
-						unsigned int len, unsigned int offs,
-						u64 lblk_num)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline void fscrypt_notsupp_decrypt_bio_pages(struct fscrypt_ctx *c,
-						struct bio *b)
-{
-	return;
-}
-
-static inline void fscrypt_notsupp_pullback_bio_page(struct page **p, bool b)
-{
-	return;
-}
-
-static inline void fscrypt_notsupp_restore_control_page(struct page *p)
-{
-	return;
-}
-
-static inline int fscrypt_notsupp_zeroout_range(const struct inode *i, pgoff_t p,
-					sector_t s, unsigned int f)
-{
-	return -EOPNOTSUPP;
-}
-
-/* policy.c */
-static inline int fscrypt_notsupp_ioctl_set_policy(struct file *f,
-				const void __user *arg)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline int fscrypt_notsupp_ioctl_get_policy(struct file *f,
-				void __user *arg)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline int fscrypt_notsupp_has_permitted_context(struct inode *p,
-				struct inode *i)
-{
-	return 0;
-}
-
-static inline int fscrypt_notsupp_inherit_context(struct inode *p,
-				struct inode *i, void *v, bool b)
-{
-	return -EOPNOTSUPP;
-}
-
-/* keyinfo.c */
-static inline int fscrypt_notsupp_get_encryption_info(struct inode *i)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline void fscrypt_notsupp_put_encryption_info(struct inode *i,
-					struct fscrypt_info *f)
-{
-	return;
-}
-
- /* fname.c */
-static inline int fscrypt_notsupp_setup_filename(struct inode *dir,
-			const struct qstr *iname,
-			int lookup, struct fscrypt_name *fname)
-{
-	if (dir->i_sb->s_cop->is_encrypted(dir))
-		return -EOPNOTSUPP;
-
-	memset(fname, 0, sizeof(struct fscrypt_name));
-	fname->usr_fname = iname;
-	fname->disk_name.name = (unsigned char *)iname->name;
-	fname->disk_name.len = iname->len;
-	return 0;
-}
-
-static inline void fscrypt_notsupp_free_filename(struct fscrypt_name *fname)
-{
-	return;
-}
-
-static inline u32 fscrypt_notsupp_fname_encrypted_size(struct inode *i, u32 s)
-{
-	/* never happens */
-	WARN_ON(1);
-	return 0;
-}
-
-static inline int fscrypt_notsupp_fname_alloc_buffer(struct inode *inode,
-				u32 ilen, struct fscrypt_str *crypto_str)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline void fscrypt_notsupp_fname_free_buffer(struct fscrypt_str *c)
-{
-	return;
-}
-
-static inline int fscrypt_notsupp_fname_disk_to_usr(struct inode *inode,
-			u32 hash, u32 minor_hash,
-			const struct fscrypt_str *iname,
-			struct fscrypt_str *oname)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline int fscrypt_notsupp_fname_usr_to_disk(struct inode *inode,
-			const struct qstr *iname,
-			struct fscrypt_str *oname)
-{
-	return -EOPNOTSUPP;
-}
-#endif	/* _LINUX_FSCRYPTO_H */
-- 
cgit v1.2.3


From e58910cdc9f43cda2e52fcdf2fddbdc74e80b2f7 Mon Sep 17 00:00:00 2001
From: Josh Boyer <jwboyer@fedoraproject.org>
Date: Mon, 6 Feb 2017 11:22:42 +0000
Subject: efi: Add SHIM and image security database GUID definitions

Add the definitions for shim and image security database, both of which
are used widely in various Linux distros.

Signed-off-by: Josh Boyer <jwboyer@fedoraproject.org>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-efi@vger.kernel.org
Link: http://lkml.kernel.org/r/1486380166-31868-4-git-send-email-ard.biesheuvel@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/efi.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/efi.h b/include/linux/efi.h
index 85e9fdaa8d07..d00538a65899 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -611,6 +611,9 @@ void efi_native_runtime_setup(void);
 #define EFI_CONSOLE_OUT_DEVICE_GUID		EFI_GUID(0xd3b36f2c, 0xd551, 0x11d4,  0x9a, 0x46, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d)
 #define APPLE_PROPERTIES_PROTOCOL_GUID		EFI_GUID(0x91bd12fe, 0xf6c3, 0x44fb,  0xa5, 0xb7, 0x51, 0x22, 0xab, 0x30, 0x3a, 0xe0)
 
+#define EFI_IMAGE_SECURITY_DATABASE_GUID	EFI_GUID(0xd719b2cb, 0x3d3a, 0x4596,  0xa3, 0xbc, 0xda, 0xd0, 0x0e, 0x67, 0x65, 0x6f)
+#define EFI_SHIM_LOCK_GUID			EFI_GUID(0x605dab50, 0xe046, 0x4300,  0xab, 0xb6, 0x3d, 0xd8, 0x10, 0xdd, 0x8b, 0x23)
+
 /*
  * This GUID is used to pass to the kernel proper the struct screen_info
  * structure that was populated by the stub based on the GOP protocol instance
-- 
cgit v1.2.3


From de8cb458625c164bb3f93c4e415e479afce8fa9d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 6 Feb 2017 11:22:43 +0000
Subject: efi: Get and store the secure boot status

Get the firmware's secure-boot status in the kernel boot wrapper and stash
it somewhere that the main kernel image can find.

The efi_get_secureboot() function is extracted from the ARM stub and (a)
generalised so that it can be called from x86 and (b) made to use
efi_call_runtime() so that it can be run in mixed-mode.

For x86, it is stored in boot_params and can be overridden by the boot
loader or kexec.  This allows secure-boot mode to be passed on to a new
kernel.

Suggested-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-efi@vger.kernel.org
Link: http://lkml.kernel.org/r/1486380166-31868-5-git-send-email-ard.biesheuvel@linaro.org
[ Small readability edits. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/x86/zero-page.txt           |  2 +
 arch/x86/boot/compressed/eboot.c          |  7 ++++
 arch/x86/include/uapi/asm/bootparam.h     |  3 +-
 arch/x86/kernel/asm-offsets.c             |  1 +
 drivers/firmware/efi/libstub/Makefile     |  2 +-
 drivers/firmware/efi/libstub/arm-stub.c   | 63 +++----------------------------
 drivers/firmware/efi/libstub/secureboot.c | 61 ++++++++++++++++++++++++++++++
 include/linux/efi.h                       |  8 ++++
 8 files changed, 88 insertions(+), 59 deletions(-)
 create mode 100644 drivers/firmware/efi/libstub/secureboot.c

(limited to 'include/linux')

diff --git a/Documentation/x86/zero-page.txt b/Documentation/x86/zero-page.txt
index 95a4d34af3fd..b8527c6b7646 100644
--- a/Documentation/x86/zero-page.txt
+++ b/Documentation/x86/zero-page.txt
@@ -31,6 +31,8 @@ Offset	Proto	Name		Meaning
 1E9/001	ALL	eddbuf_entries	Number of entries in eddbuf (below)
 1EA/001	ALL	edd_mbr_sig_buf_entries	Number of entries in edd_mbr_sig_buffer
 				(below)
+1EB/001	ALL     kbd_status      Numlock is enabled
+1EC/001	ALL     secure_boot	Secure boot is enabled in the firmware
 1EF/001	ALL	sentinel	Used to detect broken bootloaders
 290/040	ALL	edd_mbr_sig_buffer EDD MBR signatures
 2D0/A00	ALL	e820_map	E820 memory map table
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index f99978db6b6f..801c7a158e55 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -988,6 +988,13 @@ struct boot_params *efi_main(struct efi_config *c,
 	else
 		setup_boot_services32(efi_early);
 
+	/*
+	 * If the boot loader gave us a value for secure_boot then we use that,
+	 * otherwise we ask the BIOS.
+	 */
+	if (boot_params->secure_boot == efi_secureboot_mode_unset)
+		boot_params->secure_boot = efi_get_secureboot(sys_table);
+
 	setup_graphics(boot_params);
 
 	setup_efi_pci(boot_params);
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index b10bf319ed20..5138dacf8bb8 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -135,7 +135,8 @@ struct boot_params {
 	__u8  eddbuf_entries;				/* 0x1e9 */
 	__u8  edd_mbr_sig_buf_entries;			/* 0x1ea */
 	__u8  kbd_status;				/* 0x1eb */
-	__u8  _pad5[3];					/* 0x1ec */
+	__u8  secure_boot;				/* 0x1ec */
+	__u8  _pad5[2];					/* 0x1ed */
 	/*
 	 * The sentinel is set to a nonzero value (0xff) in header.S.
 	 *
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index c62e015b126c..de827d6ac8c2 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -81,6 +81,7 @@ void common(void) {
 
 	BLANK();
 	OFFSET(BP_scratch, boot_params, scratch);
+	OFFSET(BP_secure_boot, boot_params, secure_boot);
 	OFFSET(BP_loadflags, boot_params, hdr.loadflags);
 	OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
 	OFFSET(BP_version, boot_params, hdr.version);
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index 33e0e2f1a730..f7425960f6a5 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -28,7 +28,7 @@ OBJECT_FILES_NON_STANDARD	:= y
 # Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
 KCOV_INSTRUMENT			:= n
 
-lib-y				:= efi-stub-helper.o gop.o
+lib-y				:= efi-stub-helper.o gop.o secureboot.o
 
 # include the stub's generic dependencies from lib/ when building for ARM/arm64
 arm-deps := fdt_rw.c fdt_ro.c fdt_wip.c fdt.c fdt_empty_tree.c fdt_sw.c sort.c
diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c
index 6fca48c9e054..d4056c6be1ec 100644
--- a/drivers/firmware/efi/libstub/arm-stub.c
+++ b/drivers/firmware/efi/libstub/arm-stub.c
@@ -20,52 +20,6 @@
 
 bool __nokaslr;
 
-static int efi_get_secureboot(efi_system_table_t *sys_table_arg)
-{
-	static efi_char16_t const sb_var_name[] = {
-		'S', 'e', 'c', 'u', 'r', 'e', 'B', 'o', 'o', 't', 0 };
-	static efi_char16_t const sm_var_name[] = {
-		'S', 'e', 't', 'u', 'p', 'M', 'o', 'd', 'e', 0 };
-
-	efi_guid_t var_guid = EFI_GLOBAL_VARIABLE_GUID;
-	efi_get_variable_t *f_getvar = sys_table_arg->runtime->get_variable;
-	u8 val;
-	unsigned long size = sizeof(val);
-	efi_status_t status;
-
-	status = f_getvar((efi_char16_t *)sb_var_name, (efi_guid_t *)&var_guid,
-			  NULL, &size, &val);
-
-	if (status != EFI_SUCCESS)
-		goto out_efi_err;
-
-	if (val == 0)
-		return 0;
-
-	status = f_getvar((efi_char16_t *)sm_var_name, (efi_guid_t *)&var_guid,
-			  NULL, &size, &val);
-
-	if (status != EFI_SUCCESS)
-		goto out_efi_err;
-
-	if (val == 1)
-		return 0;
-
-	return 1;
-
-out_efi_err:
-	switch (status) {
-	case EFI_NOT_FOUND:
-		return 0;
-	case EFI_DEVICE_ERROR:
-		return -EIO;
-	case EFI_SECURITY_VIOLATION:
-		return -EACCES;
-	default:
-		return -EINVAL;
-	}
-}
-
 efi_status_t efi_open_volume(efi_system_table_t *sys_table_arg,
 			     void *__image, void **__fh)
 {
@@ -157,7 +111,7 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
 	efi_guid_t loaded_image_proto = LOADED_IMAGE_PROTOCOL_GUID;
 	unsigned long reserve_addr = 0;
 	unsigned long reserve_size = 0;
-	int secure_boot = 0;
+	enum efi_secureboot_mode secure_boot;
 	struct screen_info *si;
 
 	/* Check if we were booted by the EFI firmware */
@@ -227,19 +181,14 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
 		pr_efi_err(sys_table, "Failed to parse EFI cmdline options\n");
 
 	secure_boot = efi_get_secureboot(sys_table);
-	if (secure_boot > 0)
-		pr_efi(sys_table, "UEFI Secure Boot is enabled.\n");
-
-	if (secure_boot < 0) {
-		pr_efi_err(sys_table,
-			"could not determine UEFI Secure Boot status.\n");
-	}
 
 	/*
-	 * Unauthenticated device tree data is a security hazard, so
-	 * ignore 'dtb=' unless UEFI Secure Boot is disabled.
+	 * Unauthenticated device tree data is a security hazard, so ignore
+	 * 'dtb=' unless UEFI Secure Boot is disabled.  We assume that secure
+	 * boot is enabled if we can't determine its state.
 	 */
-	if (secure_boot != 0 && strstr(cmdline_ptr, "dtb=")) {
+	if (secure_boot != efi_secureboot_mode_disabled &&
+	    strstr(cmdline_ptr, "dtb=")) {
 		pr_efi(sys_table, "Ignoring DTB from command line.\n");
 	} else {
 		status = handle_cmdline_files(sys_table, image, cmdline_ptr,
diff --git a/drivers/firmware/efi/libstub/secureboot.c b/drivers/firmware/efi/libstub/secureboot.c
new file mode 100644
index 000000000000..b20b8b460d77
--- /dev/null
+++ b/drivers/firmware/efi/libstub/secureboot.c
@@ -0,0 +1,61 @@
+/*
+ * Secure boot handling.
+ *
+ * Copyright (C) 2013,2014 Linaro Limited
+ *     Roy Franz <roy.franz@linaro.org
+ * Copyright (C) 2013 Red Hat, Inc.
+ *     Mark Salter <msalter@redhat.com>
+ *
+ * This file is part of the Linux kernel, and is made available under the
+ * terms of the GNU General Public License version 2.
+ */
+#include <linux/efi.h>
+#include <asm/efi.h>
+
+/* BIOS variables */
+static const efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID;
+static const efi_char16_t const efi_SecureBoot_name[] = {
+	'S', 'e', 'c', 'u', 'r', 'e', 'B', 'o', 'o', 't', 0
+};
+static const efi_char16_t const efi_SetupMode_name[] = {
+	'S', 'e', 't', 'u', 'p', 'M', 'o', 'd', 'e', 0
+};
+
+#define get_efi_var(name, vendor, ...) \
+	efi_call_runtime(get_variable, \
+			 (efi_char16_t *)(name), (efi_guid_t *)(vendor), \
+			 __VA_ARGS__);
+
+/*
+ * Determine whether we're in secure boot mode.
+ */
+enum efi_secureboot_mode efi_get_secureboot(efi_system_table_t *sys_table_arg)
+{
+	u8 secboot, setupmode;
+	unsigned long size;
+	efi_status_t status;
+
+	size = sizeof(secboot);
+	status = get_efi_var(efi_SecureBoot_name, &efi_variable_guid,
+			     NULL, &size, &secboot);
+	if (status != EFI_SUCCESS)
+		goto out_efi_err;
+
+	size = sizeof(setupmode);
+	status = get_efi_var(efi_SetupMode_name, &efi_variable_guid,
+			     NULL, &size, &setupmode);
+	if (status != EFI_SUCCESS)
+		goto out_efi_err;
+
+	if (secboot == 0 || setupmode == 1)
+		return efi_secureboot_mode_disabled;
+
+	pr_efi(sys_table_arg, "UEFI Secure Boot is enabled.\n");
+	return efi_secureboot_mode_enabled;
+
+out_efi_err:
+	pr_efi_err(sys_table_arg, "Could not determine UEFI Secure Boot status.\n");
+	if (status == EFI_NOT_FOUND)
+		return efi_secureboot_mode_disabled;
+	return efi_secureboot_mode_unknown;
+}
diff --git a/include/linux/efi.h b/include/linux/efi.h
index d00538a65899..94d34e0be24f 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1480,6 +1480,14 @@ efi_status_t efi_setup_gop(efi_system_table_t *sys_table_arg,
 bool efi_runtime_disabled(void);
 extern void efi_call_virt_check_flags(unsigned long flags, const char *call);
 
+enum efi_secureboot_mode {
+	efi_secureboot_mode_unset,
+	efi_secureboot_mode_unknown,
+	efi_secureboot_mode_disabled,
+	efi_secureboot_mode_enabled,
+};
+enum efi_secureboot_mode efi_get_secureboot(efi_system_table_t *sys_table);
+
 /*
  * Arch code can implement the following three template macros, avoiding
  * reptition for the void/non-void return cases of {__,}efi_call_virt():
-- 
cgit v1.2.3


From 4025819d328cd0efc53ee22e01b800631944b7f3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 1 Feb 2017 11:58:42 +0100
Subject: delayacct: Include <uapi/linux/taskstats.h>

include/linux/delayacct.h relies on 'struct taskstats' but does
not include the header that defines it.

This worked so far because files that included <linux/taskstats.h>
also happened to include other headers that included
uapi/linux/taskstats.h.

Fix it.

Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/delayacct.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 6cee17c22313..00e60f79a9cc 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -17,6 +17,7 @@
 #ifndef _LINUX_DELAYACCT_H
 #define _LINUX_DELAYACCT_H
 
+#include <uapi/linux/taskstats.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 
-- 
cgit v1.2.3


From bec84da8d1da6677c458e6eedd8e814eea91b9fc Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Thu, 2 Feb 2017 17:41:25 -0800
Subject: device property: allow to constify properties

There is no reason why statically defined properties should be modifiable,
so let's make device_add_properties() and the rest of pset_*() functions to
take const pointers to properties.

This will allow us to mark properties as const/__initconst at definition
sites.

Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/property.c  | 41 +++++++++++++++++++++--------------------
 include/linux/property.h |  2 +-
 2 files changed, 22 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/property.c b/drivers/base/property.c
index 43a36d68c3fd..e9fa75645d69 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -21,7 +21,7 @@
 
 struct property_set {
 	struct fwnode_handle fwnode;
-	struct property_entry *properties;
+	const struct property_entry *properties;
 };
 
 static inline bool is_pset_node(struct fwnode_handle *fwnode)
@@ -35,10 +35,10 @@ static inline struct property_set *to_pset_node(struct fwnode_handle *fwnode)
 		container_of(fwnode, struct property_set, fwnode) : NULL;
 }
 
-static struct property_entry *pset_prop_get(struct property_set *pset,
-					    const char *name)
+static const struct property_entry *pset_prop_get(struct property_set *pset,
+						  const char *name)
 {
-	struct property_entry *prop;
+	const struct property_entry *prop;
 
 	if (!pset || !pset->properties)
 		return NULL;
@@ -50,11 +50,11 @@ static struct property_entry *pset_prop_get(struct property_set *pset,
 	return NULL;
 }
 
-static void *pset_prop_find(struct property_set *pset, const char *propname,
-			    size_t length)
+static const void *pset_prop_find(struct property_set *pset,
+				  const char *propname, size_t length)
 {
-	struct property_entry *prop;
-	void *pointer;
+	const struct property_entry *prop;
+	const void *pointer;
 
 	prop = pset_prop_get(pset, propname);
 	if (!prop)
@@ -74,7 +74,7 @@ static int pset_prop_read_u8_array(struct property_set *pset,
 				   const char *propname,
 				   u8 *values, size_t nval)
 {
-	void *pointer;
+	const void *pointer;
 	size_t length = nval * sizeof(*values);
 
 	pointer = pset_prop_find(pset, propname, length);
@@ -89,7 +89,7 @@ static int pset_prop_read_u16_array(struct property_set *pset,
 				    const char *propname,
 				    u16 *values, size_t nval)
 {
-	void *pointer;
+	const void *pointer;
 	size_t length = nval * sizeof(*values);
 
 	pointer = pset_prop_find(pset, propname, length);
@@ -104,7 +104,7 @@ static int pset_prop_read_u32_array(struct property_set *pset,
 				    const char *propname,
 				    u32 *values, size_t nval)
 {
-	void *pointer;
+	const void *pointer;
 	size_t length = nval * sizeof(*values);
 
 	pointer = pset_prop_find(pset, propname, length);
@@ -119,7 +119,7 @@ static int pset_prop_read_u64_array(struct property_set *pset,
 				    const char *propname,
 				    u64 *values, size_t nval)
 {
-	void *pointer;
+	const void *pointer;
 	size_t length = nval * sizeof(*values);
 
 	pointer = pset_prop_find(pset, propname, length);
@@ -133,7 +133,7 @@ static int pset_prop_read_u64_array(struct property_set *pset,
 static int pset_prop_count_elems_of_size(struct property_set *pset,
 					 const char *propname, size_t length)
 {
-	struct property_entry *prop;
+	const struct property_entry *prop;
 
 	prop = pset_prop_get(pset, propname);
 	if (!prop)
@@ -146,7 +146,7 @@ static int pset_prop_read_string_array(struct property_set *pset,
 				       const char *propname,
 				       const char **strings, size_t nval)
 {
-	void *pointer;
+	const void *pointer;
 	size_t length = nval * sizeof(*strings);
 
 	pointer = pset_prop_find(pset, propname, length);
@@ -160,8 +160,8 @@ static int pset_prop_read_string_array(struct property_set *pset,
 static int pset_prop_read_string(struct property_set *pset,
 				 const char *propname, const char **strings)
 {
-	struct property_entry *prop;
-	const char **pointer;
+	const struct property_entry *prop;
+	const char * const *pointer;
 
 	prop = pset_prop_get(pset, propname);
 	if (!prop)
@@ -776,7 +776,7 @@ static int pset_copy_entry(struct property_entry *dst,
  */
 static struct property_set *pset_copy_set(const struct property_set *pset)
 {
-	const struct property_entry *entry;
+	struct property_entry *props;
 	struct property_set *p;
 	size_t i, n = 0;
 
@@ -787,14 +787,14 @@ static struct property_set *pset_copy_set(const struct property_set *pset)
 	while (pset->properties[n].name)
 		n++;
 
-	p->properties = kcalloc(n + 1, sizeof(*entry), GFP_KERNEL);
+	p->properties = props = kcalloc(n + 1, sizeof(*props), GFP_KERNEL);
 	if (!p->properties) {
 		kfree(p);
 		return ERR_PTR(-ENOMEM);
 	}
 
 	for (i = 0; i < n; i++) {
-		int ret = pset_copy_entry(&p->properties[i],
+		int ret = pset_copy_entry(&props[i],
 					  &pset->properties[i]);
 		if (ret) {
 			pset_free_set(p);
@@ -847,7 +847,8 @@ EXPORT_SYMBOL_GPL(device_remove_properties);
  * @dev as its secondary firmware node. The function takes a copy of
  * @properties.
  */
-int device_add_properties(struct device *dev, struct property_entry *properties)
+int device_add_properties(struct device *dev,
+			  const struct property_entry *properties)
 {
 	struct property_set *p, pset;
 
diff --git a/include/linux/property.h b/include/linux/property.h
index 856e50b2140c..d37a4498b3ac 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -242,7 +242,7 @@ struct property_entry {
 }
 
 int device_add_properties(struct device *dev,
-			  struct property_entry *properties);
+			  const struct property_entry *properties);
 void device_remove_properties(struct device *dev);
 
 bool device_dma_supported(struct device *dev);
-- 
cgit v1.2.3


From 9426998ce6f8616c48c2834cafbe5616da3f5abd Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Thu, 2 Feb 2017 17:41:26 -0800
Subject: device property: constify property arrays values

Data that is fed into property arrays should not be modified, so let's mark
relevant pointers as const. This will allow us making source arrays as
const/__initconst.

Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/property.c  | 10 +++++-----
 include/linux/property.h | 12 ++++++------
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/property.c b/drivers/base/property.c
index e9fa75645d69..31b942a29fdc 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -718,7 +718,8 @@ static void pset_free_set(struct property_set *pset)
 static int pset_copy_entry(struct property_entry *dst,
 			   const struct property_entry *src)
 {
-	const char **d, **s;
+	const char * const *s;
+	char **d;
 	size_t i, nval;
 
 	dst->name = kstrdup(src->name, GFP_KERNEL);
@@ -731,12 +732,11 @@ static int pset_copy_entry(struct property_entry *dst,
 
 		if (src->is_string) {
 			nval = src->length / sizeof(const char *);
-			dst->pointer.str = kcalloc(nval, sizeof(const char *),
-						   GFP_KERNEL);
-			if (!dst->pointer.str)
+			d = kcalloc(nval, sizeof(const char *), GFP_KERNEL);
+			if (!d)
 				return -ENOMEM;
 
-			d = dst->pointer.str;
+			dst->pointer.raw_data = d;
 			s = src->pointer.str;
 			for (i = 0; i < nval; i++) {
 				d[i] = kstrdup(s[i], GFP_KERNEL);
diff --git a/include/linux/property.h b/include/linux/property.h
index d37a4498b3ac..7a0a1cce5165 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -160,12 +160,12 @@ struct property_entry {
 	bool is_string;
 	union {
 		union {
-			void *raw_data;
-			u8 *u8_data;
-			u16 *u16_data;
-			u32 *u32_data;
-			u64 *u64_data;
-			const char **str;
+			const void *raw_data;
+			const u8 *u8_data;
+			const u16 *u16_data;
+			const u32 *u32_data;
+			const u64 *u64_data;
+			const char * const *str;
 		} pointer;
 		union {
 			unsigned long long raw_data;
-- 
cgit v1.2.3


From 2d479e1fa2d09c5a9518a75a5d21ef2713117946 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Thu, 2 Feb 2017 17:41:27 -0800
Subject: device property: export code duplicating array of property entries

When augmenting ACPI-enumerated devices with additional property data based
on DMI info, a module has often several potential property sets, with only
one being active on a given box. In order to save memory it should be
possible to mark everything and __initdata or __initconst, execute DMI
match early, and duplicate relevant properties. Then kernel will discard
the rest of them.

Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/property.c  | 194 +++++++++++++++++++++++++++++++----------------
 include/linux/property.h |   5 ++
 2 files changed, 134 insertions(+), 65 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/property.c b/drivers/base/property.c
index 31b942a29fdc..c458c63e353f 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -682,77 +682,64 @@ out:
 }
 EXPORT_SYMBOL_GPL(fwnode_property_match_string);
 
-/**
- * pset_free_set - releases memory allocated for copied property set
- * @pset: Property set to release
- *
- * Function takes previously copied property set and releases all the
- * memory allocated to it.
- */
-static void pset_free_set(struct property_set *pset)
+static int property_copy_string_array(struct property_entry *dst,
+				      const struct property_entry *src)
 {
-	const struct property_entry *prop;
-	size_t i, nval;
+	char **d;
+	size_t nval = src->length / sizeof(*d);
+	int i;
 
-	if (!pset)
-		return;
+	d = kcalloc(nval, sizeof(*d), GFP_KERNEL);
+	if (!d)
+		return -ENOMEM;
 
-	for (prop = pset->properties; prop->name; prop++) {
-		if (prop->is_array) {
-			if (prop->is_string && prop->pointer.str) {
-				nval = prop->length / sizeof(const char *);
-				for (i = 0; i < nval; i++)
-					kfree(prop->pointer.str[i]);
-			}
-			kfree(prop->pointer.raw_data);
-		} else if (prop->is_string) {
-			kfree(prop->value.str);
+	for (i = 0; i < nval; i++) {
+		d[i] = kstrdup(src->pointer.str[i], GFP_KERNEL);
+		if (!d[i] && src->pointer.str[i]) {
+			while (--i >= 0)
+				kfree(d[i]);
+			kfree(d);
+			return -ENOMEM;
 		}
-		kfree(prop->name);
 	}
 
-	kfree(pset->properties);
-	kfree(pset);
+	dst->pointer.raw_data = d;
+	return 0;
 }
 
-static int pset_copy_entry(struct property_entry *dst,
-			   const struct property_entry *src)
+static int property_entry_copy_data(struct property_entry *dst,
+				    const struct property_entry *src)
 {
-	const char * const *s;
-	char **d;
-	size_t i, nval;
+	int error;
 
 	dst->name = kstrdup(src->name, GFP_KERNEL);
 	if (!dst->name)
 		return -ENOMEM;
 
 	if (src->is_array) {
-		if (!src->length)
-			return -ENODATA;
+		if (!src->length) {
+			error = -ENODATA;
+			goto out_free_name;
+		}
 
 		if (src->is_string) {
-			nval = src->length / sizeof(const char *);
-			d = kcalloc(nval, sizeof(const char *), GFP_KERNEL);
-			if (!d)
-				return -ENOMEM;
-
-			dst->pointer.raw_data = d;
-			s = src->pointer.str;
-			for (i = 0; i < nval; i++) {
-				d[i] = kstrdup(s[i], GFP_KERNEL);
-				if (!d[i] && s[i])
-					return -ENOMEM;
-			}
+			error = property_copy_string_array(dst, src);
+			if (error)
+				goto out_free_name;
 		} else {
 			dst->pointer.raw_data = kmemdup(src->pointer.raw_data,
 							src->length, GFP_KERNEL);
-			if (!dst->pointer.raw_data)
-				return -ENOMEM;
+			if (!dst->pointer.raw_data) {
+				error = -ENOMEM;
+				goto out_free_name;
+			}
 		}
 	} else if (src->is_string) {
 		dst->value.str = kstrdup(src->value.str, GFP_KERNEL);
-		if (!dst->value.str && src->value.str)
-			return -ENOMEM;
+		if (!dst->value.str && src->value.str) {
+			error = -ENOMEM;
+			goto out_free_name;
+		}
 	} else {
 		dst->value.raw_data = src->value.raw_data;
 	}
@@ -762,6 +749,95 @@ static int pset_copy_entry(struct property_entry *dst,
 	dst->is_string = src->is_string;
 
 	return 0;
+
+out_free_name:
+	kfree(dst->name);
+	return error;
+}
+
+static void property_entry_free_data(const struct property_entry *p)
+{
+	size_t i, nval;
+
+	if (p->is_array) {
+		if (p->is_string && p->pointer.str) {
+			nval = p->length / sizeof(const char *);
+			for (i = 0; i < nval; i++)
+				kfree(p->pointer.str[i]);
+		}
+		kfree(p->pointer.raw_data);
+	} else if (p->is_string) {
+		kfree(p->value.str);
+	}
+	kfree(p->name);
+}
+
+/**
+ * property_entries_dup - duplicate array of properties
+ * @properties: array of properties to copy
+ *
+ * This function creates a deep copy of the given NULL-terminated array
+ * of property entries.
+ */
+struct property_entry *
+property_entries_dup(const struct property_entry *properties)
+{
+	struct property_entry *p;
+	int i, n = 0;
+
+	while (properties[n].name)
+		n++;
+
+	p = kcalloc(n + 1, sizeof(*p), GFP_KERNEL);
+	if (!p)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = 0; i < n; i++) {
+		int ret = property_entry_copy_data(&p[i], &properties[i]);
+		if (ret) {
+			while (--i >= 0)
+				property_entry_free_data(&p[i]);
+			kfree(p);
+			return ERR_PTR(ret);
+		}
+	}
+
+	return p;
+}
+EXPORT_SYMBOL_GPL(property_entries_dup);
+
+/**
+ * property_entries_free - free previously allocated array of properties
+ * @properties: array of properties to destroy
+ *
+ * This function frees given NULL-terminated array of property entries,
+ * along with their data.
+ */
+void property_entries_free(const struct property_entry *properties)
+{
+	const struct property_entry *p;
+
+	for (p = properties; p->name; p++)
+		property_entry_free_data(p);
+
+	kfree(properties);
+}
+EXPORT_SYMBOL_GPL(property_entries_free);
+
+/**
+ * pset_free_set - releases memory allocated for copied property set
+ * @pset: Property set to release
+ *
+ * Function takes previously copied property set and releases all the
+ * memory allocated to it.
+ */
+static void pset_free_set(struct property_set *pset)
+{
+	if (!pset)
+		return;
+
+	property_entries_free(pset->properties);
+	kfree(pset);
 }
 
 /**
@@ -776,32 +852,20 @@ static int pset_copy_entry(struct property_entry *dst,
  */
 static struct property_set *pset_copy_set(const struct property_set *pset)
 {
-	struct property_entry *props;
+	struct property_entry *properties;
 	struct property_set *p;
-	size_t i, n = 0;
 
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (!p)
 		return ERR_PTR(-ENOMEM);
 
-	while (pset->properties[n].name)
-		n++;
-
-	p->properties = props = kcalloc(n + 1, sizeof(*props), GFP_KERNEL);
-	if (!p->properties) {
+	properties = property_entries_dup(pset->properties);
+	if (IS_ERR(properties)) {
 		kfree(p);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	for (i = 0; i < n; i++) {
-		int ret = pset_copy_entry(&props[i],
-					  &pset->properties[i]);
-		if (ret) {
-			pset_free_set(p);
-			return ERR_PTR(ret);
-		}
+		return ERR_CAST(properties);
 	}
 
+	p->properties = properties;
 	return p;
 }
 
diff --git a/include/linux/property.h b/include/linux/property.h
index 7a0a1cce5165..64e3a9c6d95f 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -241,6 +241,11 @@ struct property_entry {
 	.name = _name_,				\
 }
 
+struct property_entry *
+property_entries_dup(const struct property_entry *properties);
+
+void property_entries_free(const struct property_entry *properties);
+
 int device_add_properties(struct device *dev,
 			  const struct property_entry *properties);
 void device_remove_properties(struct device *dev);
-- 
cgit v1.2.3


From febf2407418a2d6c042fcd77b206040449cb9a70 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Mon, 6 Feb 2017 18:01:51 +0100
Subject: x86/ACPI: keep x86_cpu_to_acpiid mapping valid on CPU hotplug

We may or may not have all possible CPUs in MADT on boot but in any
case we're overwriting x86_cpu_to_acpiid mapping with U32_MAX when
acpi_register_lapic() is called again on the CPU hotplug path:

acpi_processor_hotadd_init()
  -> acpi_map_cpu()
    -> acpi_register_lapic()

As we have the required acpi_id information in acpi_processor_hotadd_init()
propagate it to acpi_map_cpu() to always keep x86_cpu_to_acpiid
mapping valid.

Reported-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/ia64/kernel/acpi.c       | 3 ++-
 arch/x86/kernel/acpi/boot.c   | 5 +++--
 drivers/acpi/acpi_processor.c | 4 ++--
 include/linux/acpi.h          | 3 ++-
 4 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index 9273e034b730..7508c306aa9e 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -887,7 +887,8 @@ static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)
 }
 
 /* wrapper to silence section mismatch warning */
-int __ref acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu)
+int __ref acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id,
+		       int *pcpu)
 {
 	return _acpi_map_lsapic(handle, physid, pcpu);
 }
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 64422f850e95..04bc5f3f9aa1 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -723,11 +723,12 @@ int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
 	return 0;
 }
 
-int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu)
+int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id,
+		 int *pcpu)
 {
 	int cpu;
 
-	cpu = acpi_register_lapic(physid, U32_MAX, ACPI_MADT_ENABLED);
+	cpu = acpi_register_lapic(physid, acpi_id, ACPI_MADT_ENABLED);
 	if (cpu < 0) {
 		pr_info(PREFIX "Unable to map lapic to logical cpu number\n");
 		return cpu;
diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c
index 3de3b6b8f0f1..4467a8089ab8 100644
--- a/drivers/acpi/acpi_processor.c
+++ b/drivers/acpi/acpi_processor.c
@@ -165,7 +165,7 @@ static int acpi_processor_errata(void)
 
 #ifdef CONFIG_ACPI_HOTPLUG_CPU
 int __weak acpi_map_cpu(acpi_handle handle,
-		phys_cpuid_t physid, int *pcpu)
+		phys_cpuid_t physid, u32 acpi_id, int *pcpu)
 {
 	return -ENODEV;
 }
@@ -203,7 +203,7 @@ static int acpi_processor_hotadd_init(struct acpi_processor *pr)
 	cpu_maps_update_begin();
 	cpu_hotplug_begin();
 
-	ret = acpi_map_cpu(pr->handle, pr->phys_id, &pr->id);
+	ret = acpi_map_cpu(pr->handle, pr->phys_id, pr->acpi_id, &pr->id);
 	if (ret)
 		goto out;
 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 5b36974ed60a..6ab47e92c65a 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -291,7 +291,8 @@ bool acpi_processor_validate_proc_id(int proc_id);
 
 #ifdef CONFIG_ACPI_HOTPLUG_CPU
 /* Arch dependent functions for cpu hotplug support */
-int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu);
+int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id,
+		 int *pcpu);
 int acpi_unmap_cpu(int cpu);
 int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid);
 #endif /* CONFIG_ACPI_HOTPLUG_CPU */
-- 
cgit v1.2.3


From af7bd4dc13093bf1477f370722bbab24cf457b91 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Tue, 17 Jan 2017 06:34:52 +0200
Subject: vfs: create vfs helper vfs_tmpfile()

Factor out some common vfs bits from do_tmpfile()
to be used by overlayfs for concurrent copy up.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 fs/namei.c         | 68 ++++++++++++++++++++++++++++++++++--------------------
 include/linux/fs.h |  3 +++
 2 files changed, 46 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index ad74877e1442..7d87699c3e2e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3353,13 +3353,50 @@ out:
 	return error;
 }
 
+struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag)
+{
+	static const struct qstr name = QSTR_INIT("/", 1);
+	struct dentry *child = NULL;
+	struct inode *dir = dentry->d_inode;
+	struct inode *inode;
+	int error;
+
+	/* we want directory to be writable */
+	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+	if (error)
+		goto out_err;
+	error = -EOPNOTSUPP;
+	if (!dir->i_op->tmpfile)
+		goto out_err;
+	error = -ENOMEM;
+	child = d_alloc(dentry, &name);
+	if (unlikely(!child))
+		goto out_err;
+	error = dir->i_op->tmpfile(dir, child, mode);
+	if (error)
+		goto out_err;
+	error = -ENOENT;
+	inode = child->d_inode;
+	if (unlikely(!inode))
+		goto out_err;
+	if (!(open_flag & O_EXCL)) {
+		spin_lock(&inode->i_lock);
+		inode->i_state |= I_LINKABLE;
+		spin_unlock(&inode->i_lock);
+	}
+	return child;
+
+out_err:
+	dput(child);
+	return ERR_PTR(error);
+}
+EXPORT_SYMBOL(vfs_tmpfile);
+
 static int do_tmpfile(struct nameidata *nd, unsigned flags,
 		const struct open_flags *op,
 		struct file *file, int *opened)
 {
-	static const struct qstr name = QSTR_INIT("/", 1);
 	struct dentry *child;
-	struct inode *dir;
 	struct path path;
 	int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
 	if (unlikely(error))
@@ -3367,25 +3404,12 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
 	error = mnt_want_write(path.mnt);
 	if (unlikely(error))
 		goto out;
-	dir = path.dentry->d_inode;
-	/* we want directory to be writable */
-	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
-	if (error)
-		goto out2;
-	if (!dir->i_op->tmpfile) {
-		error = -EOPNOTSUPP;
-		goto out2;
-	}
-	child = d_alloc(path.dentry, &name);
-	if (unlikely(!child)) {
-		error = -ENOMEM;
+	child = vfs_tmpfile(path.dentry, op->mode, op->open_flag);
+	error = PTR_ERR(child);
+	if (unlikely(IS_ERR(child)))
 		goto out2;
-	}
 	dput(path.dentry);
 	path.dentry = child;
-	error = dir->i_op->tmpfile(dir, child, op->mode);
-	if (error)
-		goto out2;
 	audit_inode(nd->name, child, 0);
 	/* Don't check for other permissions, the inode was just created */
 	error = may_open(&path, 0, op->open_flag);
@@ -3396,14 +3420,8 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
 	if (error)
 		goto out2;
 	error = open_check_o_direct(file);
-	if (error) {
+	if (error)
 		fput(file);
-	} else if (!(op->open_flag & O_EXCL)) {
-		struct inode *inode = file_inode(file);
-		spin_lock(&inode->i_lock);
-		inode->i_state |= I_LINKABLE;
-		spin_unlock(&inode->i_lock);
-	}
 out2:
 	mnt_drop_write(path.mnt);
 out:
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2ba074328894..4a7f3cc9edab 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1561,6 +1561,9 @@ extern int vfs_unlink(struct inode *, struct dentry *, struct inode **);
 extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int);
 extern int vfs_whiteout(struct inode *, struct dentry *);
 
+extern struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode,
+				  int open_flag);
+
 /*
  * VFS file helper functions.
  */
-- 
cgit v1.2.3


From bfe219d373cadab761373aeea4c70406bc27ea2c Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Tue, 31 Jan 2017 10:34:57 +0200
Subject: vfs: wrap write f_ops with file_{start,end}_write()

Before calling write f_ops, call file_start_write() instead
of sb_start_write().

Replace {sb,file}_start_write() for {copy,clone}_file_range() and
for fallocate().

Beyond correct semantics, this avoids freeze protection to sb when
operating on special inodes, such as fallocate() on a blockdev.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 fs/open.c          |  4 ++--
 fs/read_write.c    |  4 ++--
 include/linux/fs.h | 26 +++++++++++++-------------
 3 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/fs/open.c b/fs/open.c
index f9118f4113e7..949cef29c3bb 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -314,7 +314,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	if (!file->f_op->fallocate)
 		return -EOPNOTSUPP;
 
-	sb_start_write(inode->i_sb);
+	file_start_write(file);
 	ret = file->f_op->fallocate(file, mode, offset, len);
 
 	/*
@@ -327,7 +327,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	if (ret == 0)
 		fsnotify_modify(file);
 
-	sb_end_write(inode->i_sb);
+	file_end_write(file);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(vfs_fallocate);
diff --git a/fs/read_write.c b/fs/read_write.c
index 511178d7723b..820a3f06c46a 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1543,7 +1543,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 	if (len == 0)
 		return 0;
 
-	sb_start_write(inode_out->i_sb);
+	file_start_write(file_out);
 
 	/*
 	 * Try cloning first, this is supported by more file systems, and
@@ -1579,7 +1579,7 @@ done:
 	inc_syscr(current);
 	inc_syscw(current);
 
-	sb_end_write(inode_out->i_sb);
+	file_end_write(file_out);
 
 	return ret;
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4a7f3cc9edab..78c81e6f5d76 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1741,19 +1741,6 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
 extern int vfs_dedupe_file_range(struct file *file,
 				 struct file_dedupe_range *same);
 
-static inline int do_clone_file_range(struct file *file_in, loff_t pos_in,
-				      struct file *file_out, loff_t pos_out,
-				      u64 len)
-{
-	int ret;
-
-	sb_start_write(file_inode(file_out)->i_sb);
-	ret = vfs_clone_file_range(file_in, pos_in, file_out, pos_out, len);
-	sb_end_write(file_inode(file_out)->i_sb);
-
-	return ret;
-}
-
 struct super_operations {
    	struct inode *(*alloc_inode)(struct super_block *sb);
 	void (*destroy_inode)(struct inode *);
@@ -2564,6 +2551,19 @@ static inline void file_end_write(struct file *file)
 	__sb_end_write(file_inode(file)->i_sb, SB_FREEZE_WRITE);
 }
 
+static inline int do_clone_file_range(struct file *file_in, loff_t pos_in,
+				      struct file *file_out, loff_t pos_out,
+				      u64 len)
+{
+	int ret;
+
+	file_start_write(file_out);
+	ret = vfs_clone_file_range(file_in, pos_in, file_out, pos_out, len);
+	file_end_write(file_out);
+
+	return ret;
+}
+
 /*
  * get_write_access() gets write permission for a file.
  * put_write_access() releases this write permission.
-- 
cgit v1.2.3


From 9fe7bfce8b3e112e8e08c40deb72ee7e24c6f072 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 2 Feb 2017 19:16:01 -0800
Subject: virtio_net: refactor freeze/restore logic into virtnet reset logic

For XDP we will need to reset the queues to allow for buffer headroom
to be configured. In order to do this we need to essentially run the
freeze()/restore() code path. Unfortunately the locking requirements
between the freeze/restore and reset paths are different however so
we can not simply reuse the code.

This patch refactors the code path and adds a reset helper routine.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c | 75 +++++++++++++++++++++++++++++-------------------
 drivers/virtio/virtio.c  | 42 +++++++++++++++------------
 include/linux/virtio.h   |  4 +++
 3 files changed, 73 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 3b3c5c571b6d..4cc3da1e4070 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1661,6 +1661,49 @@ static const struct ethtool_ops virtnet_ethtool_ops = {
 	.set_settings = virtnet_set_settings,
 };
 
+static void virtnet_freeze_down(struct virtio_device *vdev)
+{
+	struct virtnet_info *vi = vdev->priv;
+	int i;
+
+	/* Make sure no work handler is accessing the device */
+	flush_work(&vi->config_work);
+
+	netif_device_detach(vi->dev);
+	cancel_delayed_work_sync(&vi->refill);
+
+	if (netif_running(vi->dev)) {
+		for (i = 0; i < vi->max_queue_pairs; i++)
+			napi_disable(&vi->rq[i].napi);
+	}
+}
+
+static int init_vqs(struct virtnet_info *vi);
+
+static int virtnet_restore_up(struct virtio_device *vdev)
+{
+	struct virtnet_info *vi = vdev->priv;
+	int err, i;
+
+	err = init_vqs(vi);
+	if (err)
+		return err;
+
+	virtio_device_ready(vdev);
+
+	if (netif_running(vi->dev)) {
+		for (i = 0; i < vi->curr_queue_pairs; i++)
+			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
+				schedule_delayed_work(&vi->refill, 0);
+
+		for (i = 0; i < vi->max_queue_pairs; i++)
+			virtnet_napi_enable(&vi->rq[i]);
+	}
+
+	netif_device_attach(vi->dev);
+	return err;
+}
+
 static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
 {
 	unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
@@ -2353,21 +2396,9 @@ static void virtnet_remove(struct virtio_device *vdev)
 static int virtnet_freeze(struct virtio_device *vdev)
 {
 	struct virtnet_info *vi = vdev->priv;
-	int i;
 
 	virtnet_cpu_notif_remove(vi);
-
-	/* Make sure no work handler is accessing the device */
-	flush_work(&vi->config_work);
-
-	netif_device_detach(vi->dev);
-	cancel_delayed_work_sync(&vi->refill);
-
-	if (netif_running(vi->dev)) {
-		for (i = 0; i < vi->max_queue_pairs; i++)
-			napi_disable(&vi->rq[i].napi);
-	}
-
+	virtnet_freeze_down(vdev);
 	remove_vq_common(vi);
 
 	return 0;
@@ -2376,25 +2407,11 @@ static int virtnet_freeze(struct virtio_device *vdev)
 static int virtnet_restore(struct virtio_device *vdev)
 {
 	struct virtnet_info *vi = vdev->priv;
-	int err, i;
+	int err;
 
-	err = init_vqs(vi);
+	err = virtnet_restore_up(vdev);
 	if (err)
 		return err;
-
-	virtio_device_ready(vdev);
-
-	if (netif_running(vi->dev)) {
-		for (i = 0; i < vi->curr_queue_pairs; i++)
-			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
-				schedule_delayed_work(&vi->refill, 0);
-
-		for (i = 0; i < vi->max_queue_pairs; i++)
-			virtnet_napi_enable(&vi->rq[i]);
-	}
-
-	netif_device_attach(vi->dev);
-
 	virtnet_set_queues(vi, vi->curr_queue_pairs);
 
 	err = virtnet_cpu_notif_add(vi);
diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
index 7062bb0975a5..400d70b69379 100644
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -100,11 +100,6 @@ static int virtio_uevent(struct device *_dv, struct kobj_uevent_env *env)
 			      dev->id.device, dev->id.vendor);
 }
 
-static void add_status(struct virtio_device *dev, unsigned status)
-{
-	dev->config->set_status(dev, dev->config->get_status(dev) | status);
-}
-
 void virtio_check_driver_offered_feature(const struct virtio_device *vdev,
 					 unsigned int fbit)
 {
@@ -145,14 +140,15 @@ void virtio_config_changed(struct virtio_device *dev)
 }
 EXPORT_SYMBOL_GPL(virtio_config_changed);
 
-static void virtio_config_disable(struct virtio_device *dev)
+void virtio_config_disable(struct virtio_device *dev)
 {
 	spin_lock_irq(&dev->config_lock);
 	dev->config_enabled = false;
 	spin_unlock_irq(&dev->config_lock);
 }
+EXPORT_SYMBOL_GPL(virtio_config_disable);
 
-static void virtio_config_enable(struct virtio_device *dev)
+void virtio_config_enable(struct virtio_device *dev)
 {
 	spin_lock_irq(&dev->config_lock);
 	dev->config_enabled = true;
@@ -161,8 +157,15 @@ static void virtio_config_enable(struct virtio_device *dev)
 	dev->config_change_pending = false;
 	spin_unlock_irq(&dev->config_lock);
 }
+EXPORT_SYMBOL_GPL(virtio_config_enable);
+
+void virtio_add_status(struct virtio_device *dev, unsigned int status)
+{
+	dev->config->set_status(dev, dev->config->get_status(dev) | status);
+}
+EXPORT_SYMBOL_GPL(virtio_add_status);
 
-static int virtio_finalize_features(struct virtio_device *dev)
+int virtio_finalize_features(struct virtio_device *dev)
 {
 	int ret = dev->config->finalize_features(dev);
 	unsigned status;
@@ -173,7 +176,7 @@ static int virtio_finalize_features(struct virtio_device *dev)
 	if (!virtio_has_feature(dev, VIRTIO_F_VERSION_1))
 		return 0;
 
-	add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
+	virtio_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
 	status = dev->config->get_status(dev);
 	if (!(status & VIRTIO_CONFIG_S_FEATURES_OK)) {
 		dev_err(&dev->dev, "virtio: device refuses features: %x\n",
@@ -182,6 +185,7 @@ static int virtio_finalize_features(struct virtio_device *dev)
 	}
 	return 0;
 }
+EXPORT_SYMBOL_GPL(virtio_finalize_features);
 
 static int virtio_dev_probe(struct device *_d)
 {
@@ -193,7 +197,7 @@ static int virtio_dev_probe(struct device *_d)
 	u64 driver_features_legacy;
 
 	/* We have a driver! */
-	add_status(dev, VIRTIO_CONFIG_S_DRIVER);
+	virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER);
 
 	/* Figure out what features the device supports. */
 	device_features = dev->config->get_features(dev);
@@ -247,7 +251,7 @@ static int virtio_dev_probe(struct device *_d)
 
 	return 0;
 err:
-	add_status(dev, VIRTIO_CONFIG_S_FAILED);
+	virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
 	return err;
 
 }
@@ -265,7 +269,7 @@ static int virtio_dev_remove(struct device *_d)
 	WARN_ON_ONCE(dev->config->get_status(dev));
 
 	/* Acknowledge the device's existence again. */
-	add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
+	virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
 	return 0;
 }
 
@@ -316,7 +320,7 @@ int register_virtio_device(struct virtio_device *dev)
 	dev->config->reset(dev);
 
 	/* Acknowledge that we've seen the device. */
-	add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
+	virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
 
 	INIT_LIST_HEAD(&dev->vqs);
 
@@ -325,7 +329,7 @@ int register_virtio_device(struct virtio_device *dev)
 	err = device_register(&dev->dev);
 out:
 	if (err)
-		add_status(dev, VIRTIO_CONFIG_S_FAILED);
+		virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
 	return err;
 }
 EXPORT_SYMBOL_GPL(register_virtio_device);
@@ -365,18 +369,18 @@ int virtio_device_restore(struct virtio_device *dev)
 	dev->config->reset(dev);
 
 	/* Acknowledge that we've seen the device. */
-	add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
+	virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
 
 	/* Maybe driver failed before freeze.
 	 * Restore the failed status, for debugging. */
 	if (dev->failed)
-		add_status(dev, VIRTIO_CONFIG_S_FAILED);
+		virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
 
 	if (!drv)
 		return 0;
 
 	/* We have a driver! */
-	add_status(dev, VIRTIO_CONFIG_S_DRIVER);
+	virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER);
 
 	ret = virtio_finalize_features(dev);
 	if (ret)
@@ -389,14 +393,14 @@ int virtio_device_restore(struct virtio_device *dev)
 	}
 
 	/* Finally, tell the device we're all set */
-	add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
+	virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
 
 	virtio_config_enable(dev);
 
 	return 0;
 
 err:
-	add_status(dev, VIRTIO_CONFIG_S_FAILED);
+	virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(virtio_device_restore);
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index d5eb5479a425..04b0d3f95043 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -132,12 +132,16 @@ static inline struct virtio_device *dev_to_virtio(struct device *_dev)
 	return container_of(_dev, struct virtio_device, dev);
 }
 
+void virtio_add_status(struct virtio_device *dev, unsigned int status);
 int register_virtio_device(struct virtio_device *dev);
 void unregister_virtio_device(struct virtio_device *dev);
 
 void virtio_break_device(struct virtio_device *dev);
 
 void virtio_config_changed(struct virtio_device *dev);
+void virtio_config_disable(struct virtio_device *dev);
+void virtio_config_enable(struct virtio_device *dev);
+int virtio_finalize_features(struct virtio_device *dev);
 #ifdef CONFIG_PM_SLEEP
 int virtio_device_freeze(struct virtio_device *dev);
 int virtio_device_restore(struct virtio_device *dev);
-- 
cgit v1.2.3


From 55601a880690cdeccdb5923c2493f0e3736f8f6b Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 4 Feb 2017 20:02:49 +0100
Subject: net: phy: Add 2000base-x, 2500base-x and rxaui modes

The mv88e6390 ports 9 and 10 supports some additional PHY modes. Add
these modes to the PHY core so they can be used in the binding.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/devicetree/bindings/net/ethernet.txt | 3 +++
 include/linux/phy.h                                | 9 +++++++++
 2 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/net/ethernet.txt b/Documentation/devicetree/bindings/net/ethernet.txt
index 05150957ecfd..3a6916909d90 100644
--- a/Documentation/devicetree/bindings/net/ethernet.txt
+++ b/Documentation/devicetree/bindings/net/ethernet.txt
@@ -29,6 +29,9 @@ The following properties are common to the Ethernet controllers:
   * "smii"
   * "xgmii"
   * "trgmii"
+  * "2000base-x",
+  * "2500base-x",
+  * "rxaui"
 - phy-connection-type: the same as "phy-mode" property but described in ePAPR;
 - phy-handle: phandle, specifies a reference to a node representing a PHY
   device; this property is described in ePAPR and so preferred;
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 43474f39ef65..28ae9eafec19 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -81,6 +81,9 @@ typedef enum {
 	PHY_INTERFACE_MODE_MOCA,
 	PHY_INTERFACE_MODE_QSGMII,
 	PHY_INTERFACE_MODE_TRGMII,
+	PHY_INTERFACE_MODE_1000BASEX,
+	PHY_INTERFACE_MODE_2500BASEX,
+	PHY_INTERFACE_MODE_RXAUI,
 	PHY_INTERFACE_MODE_MAX,
 } phy_interface_t;
 
@@ -141,6 +144,12 @@ static inline const char *phy_modes(phy_interface_t interface)
 		return "qsgmii";
 	case PHY_INTERFACE_MODE_TRGMII:
 		return "trgmii";
+	case PHY_INTERFACE_MODE_1000BASEX:
+		return "1000base-x";
+	case PHY_INTERFACE_MODE_2500BASEX:
+		return "2500base-x";
+	case PHY_INTERFACE_MODE_RXAUI:
+		return "rxaui";
 	default:
 		return "unknown";
 	}
-- 
cgit v1.2.3


From 648ea0134069cda7d4940f397bcc6901fb88752a Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Sat, 4 Feb 2017 13:02:44 -0800
Subject: net: phy: Allow pre-declaration of MDIO devices

Allow board support code to collect pre-declarations for MDIO devices by
registering them with mdiobus_register_board_info(). SPI and I2C buses
have a similar feature, we were missing this for MDIO devices, but this
is particularly useful for e.g: MDIO-connected switches which need to
provide their port layout (often board-specific) to a MDIO Ethernet
switch driver.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/Makefile         |  3 +-
 drivers/net/phy/mdio-boardinfo.c | 86 ++++++++++++++++++++++++++++++++++++++++
 drivers/net/phy/mdio-boardinfo.h | 19 +++++++++
 drivers/net/phy/mdio_bus.c       |  4 ++
 drivers/net/phy/mdio_device.c    | 11 +++++
 include/linux/mdio.h             |  3 ++
 include/linux/mod_devicetable.h  |  1 +
 include/linux/phy.h              | 19 +++++++++
 8 files changed, 145 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/phy/mdio-boardinfo.c
 create mode 100644 drivers/net/phy/mdio-boardinfo.h

(limited to 'include/linux')

diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile
index 356859ac7c18..407b0b601ea8 100644
--- a/drivers/net/phy/Makefile
+++ b/drivers/net/phy/Makefile
@@ -1,6 +1,7 @@
 # Makefile for Linux PHY drivers and MDIO bus drivers
 
-libphy-y			:= phy.o phy_device.o mdio_bus.o mdio_device.o
+libphy-y			:= phy.o phy_device.o mdio_bus.o mdio_device.o \
+				   mdio-boardinfo.o
 libphy-$(CONFIG_SWPHY)		+= swphy.o
 libphy-$(CONFIG_LED_TRIGGER_PHY)	+= phy_led_triggers.o
 
diff --git a/drivers/net/phy/mdio-boardinfo.c b/drivers/net/phy/mdio-boardinfo.c
new file mode 100644
index 000000000000..6b988f77da08
--- /dev/null
+++ b/drivers/net/phy/mdio-boardinfo.c
@@ -0,0 +1,86 @@
+/*
+ * mdio-boardinfo - Collect pre-declarations for MDIO devices
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+
+#include "mdio-boardinfo.h"
+
+static LIST_HEAD(mdio_board_list);
+static DEFINE_MUTEX(mdio_board_lock);
+
+/**
+ * mdiobus_setup_mdiodev_from_board_info - create and setup MDIO devices
+ * from pre-collected board specific MDIO information
+ * @mdiodev: MDIO device pointer
+ * Context: can sleep
+ */
+void mdiobus_setup_mdiodev_from_board_info(struct mii_bus *bus)
+{
+	struct mdio_board_entry *be;
+	struct mdio_device *mdiodev;
+	struct mdio_board_info *bi;
+	int ret;
+
+	mutex_lock(&mdio_board_lock);
+	list_for_each_entry(be, &mdio_board_list, list) {
+		bi = &be->board_info;
+
+		if (strcmp(bus->id, bi->bus_id))
+			continue;
+
+		mdiodev = mdio_device_create(bus, bi->mdio_addr);
+		if (IS_ERR(mdiodev))
+			continue;
+
+		strncpy(mdiodev->modalias, bi->modalias,
+			sizeof(mdiodev->modalias));
+		mdiodev->bus_match = mdio_device_bus_match;
+		mdiodev->dev.platform_data = (void *)bi->platform_data;
+
+		ret = mdio_device_register(mdiodev);
+		if (ret) {
+			mdio_device_free(mdiodev);
+			continue;
+		}
+	}
+	mutex_unlock(&mdio_board_lock);
+}
+
+/**
+ * mdio_register_board_info - register MDIO devices for a given board
+ * @info: array of devices descriptors
+ * @n: number of descriptors provided
+ * Context: can sleep
+ *
+ * The board info passed can be marked with __initdata but be pointers
+ * such as platform_data etc. are copied as-is
+ */
+int mdiobus_register_board_info(const struct mdio_board_info *info,
+				unsigned int n)
+{
+	struct mdio_board_entry *be;
+	unsigned int i;
+
+	be = kcalloc(n, sizeof(*be), GFP_KERNEL);
+	if (!be)
+		return -ENOMEM;
+
+	for (i = 0; i < n; i++, be++, info++) {
+		memcpy(&be->board_info, info, sizeof(*info));
+		mutex_lock(&mdio_board_lock);
+		list_add_tail(&be->list, &mdio_board_list);
+		mutex_unlock(&mdio_board_lock);
+	}
+
+	return 0;
+}
diff --git a/drivers/net/phy/mdio-boardinfo.h b/drivers/net/phy/mdio-boardinfo.h
new file mode 100644
index 000000000000..00f98163e90e
--- /dev/null
+++ b/drivers/net/phy/mdio-boardinfo.h
@@ -0,0 +1,19 @@
+/*
+ * mdio-boardinfo.h - board info interface internal to the mdio_bus
+ * component
+ */
+
+#ifndef __MDIO_BOARD_INFO_H
+#define __MDIO_BOARD_INFO_H
+
+#include <linux/phy.h>
+#include <linux/mutex.h>
+
+struct mdio_board_entry {
+	struct list_head	list;
+	struct mdio_board_info	board_info;
+};
+
+void mdiobus_setup_mdiodev_from_board_info(struct mii_bus *bus);
+
+#endif /* __MDIO_BOARD_INFO_H */
diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c
index 653d076eafe5..fa7d51f14869 100644
--- a/drivers/net/phy/mdio_bus.c
+++ b/drivers/net/phy/mdio_bus.c
@@ -41,6 +41,8 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/mdio.h>
 
+#include "mdio-boardinfo.h"
+
 int mdiobus_register_device(struct mdio_device *mdiodev)
 {
 	if (mdiodev->bus->mdio_map[mdiodev->addr])
@@ -343,6 +345,8 @@ int __mdiobus_register(struct mii_bus *bus, struct module *owner)
 		}
 	}
 
+	mdiobus_setup_mdiodev_from_board_info(bus);
+
 	bus->state = MDIOBUS_REGISTERED;
 	pr_info("%s: probed\n", bus->name);
 	return 0;
diff --git a/drivers/net/phy/mdio_device.c b/drivers/net/phy/mdio_device.c
index fc3aaaa36b1d..e24f28924af8 100644
--- a/drivers/net/phy/mdio_device.c
+++ b/drivers/net/phy/mdio_device.c
@@ -34,6 +34,17 @@ static void mdio_device_release(struct device *dev)
 	kfree(to_mdio_device(dev));
 }
 
+int mdio_device_bus_match(struct device *dev, struct device_driver *drv)
+{
+	struct mdio_device *mdiodev = to_mdio_device(dev);
+	struct mdio_driver *mdiodrv = to_mdio_driver(drv);
+
+	if (mdiodrv->mdiodrv.flags & MDIO_DEVICE_IS_PHY)
+		return 0;
+
+	return strcmp(mdiodev->modalias, drv->name) == 0;
+}
+
 struct mdio_device *mdio_device_create(struct mii_bus *bus, int addr)
 {
 	struct mdio_device *mdiodev;
diff --git a/include/linux/mdio.h b/include/linux/mdio.h
index 55a80d73cfc1..ca08ab16ecdc 100644
--- a/include/linux/mdio.h
+++ b/include/linux/mdio.h
@@ -10,6 +10,7 @@
 #define __LINUX_MDIO_H__
 
 #include <uapi/linux/mdio.h>
+#include <linux/mod_devicetable.h>
 
 struct mii_bus;
 
@@ -29,6 +30,7 @@ struct mdio_device {
 
 	const struct dev_pm_ops *pm_ops;
 	struct mii_bus *bus;
+	char modalias[MDIO_NAME_SIZE];
 
 	int (*bus_match)(struct device *dev, struct device_driver *drv);
 	void (*device_free)(struct mdio_device *mdiodev);
@@ -71,6 +73,7 @@ int mdio_device_register(struct mdio_device *mdiodev);
 void mdio_device_remove(struct mdio_device *mdiodev);
 int mdio_driver_register(struct mdio_driver *drv);
 void mdio_driver_unregister(struct mdio_driver *drv);
+int mdio_device_bus_match(struct device *dev, struct device_driver *drv);
 
 static inline bool mdio_phy_id_is_c45(int phy_id)
 {
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 8a57f0b1242d..8850fcaf50db 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -501,6 +501,7 @@ struct platform_device_id {
 	kernel_ulong_t driver_data;
 };
 
+#define MDIO_NAME_SIZE		32
 #define MDIO_MODULE_PREFIX	"mdio:"
 
 #define MDIO_ID_FMT "%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d"
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 28ae9eafec19..d9bdf53e0514 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -886,6 +886,25 @@ void mdio_bus_exit(void);
 
 extern struct bus_type mdio_bus_type;
 
+struct mdio_board_info {
+	const char	*bus_id;
+	char		modalias[MDIO_NAME_SIZE];
+	int		mdio_addr;
+	const void	*platform_data;
+};
+
+#if IS_ENABLED(CONFIG_PHYLIB)
+int mdiobus_register_board_info(const struct mdio_board_info *info,
+				unsigned int n);
+#else
+static inline int mdiobus_register_board_info(const struct mdio_board_info *i,
+					      unsigned int n)
+{
+	return 0;
+}
+#endif
+
+
 /**
  * module_phy_driver() - Helper macro for registering PHY drivers
  * @__phy_drivers: array of PHY drivers to register
-- 
cgit v1.2.3


From b08d46b01e995dd7b653b22d35bd1d958d6ee9b4 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Mon, 6 Feb 2017 13:01:16 -0800
Subject: net: phy: bcm7xxx: Add BCM74371 PHY ID

Add the BCM74371 PHY ID to the list of supported chips. This is a 28nm
technology Gigabit PHY SoC.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/bcm7xxx.c | 2 ++
 include/linux/brcmphy.h   | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/bcm7xxx.c b/drivers/net/phy/bcm7xxx.c
index aa01020ab1b9..d1c2614dad3a 100644
--- a/drivers/net/phy/bcm7xxx.c
+++ b/drivers/net/phy/bcm7xxx.c
@@ -453,6 +453,7 @@ static struct phy_driver bcm7xxx_driver[] = {
 	BCM7XXX_28NM_GPHY(PHY_ID_BCM7278, "Broadcom BCM7278"),
 	BCM7XXX_28NM_GPHY(PHY_ID_BCM7364, "Broadcom BCM7364"),
 	BCM7XXX_28NM_GPHY(PHY_ID_BCM7366, "Broadcom BCM7366"),
+	BCM7XXX_28NM_GPHY(PHY_ID_BCM74371, "Broadcom BCM74371"),
 	BCM7XXX_28NM_GPHY(PHY_ID_BCM7439, "Broadcom BCM7439"),
 	BCM7XXX_28NM_GPHY(PHY_ID_BCM7439_2, "Broadcom BCM7439 (2)"),
 	BCM7XXX_28NM_GPHY(PHY_ID_BCM7445, "Broadcom BCM7445"),
@@ -472,6 +473,7 @@ static struct mdio_device_id __maybe_unused bcm7xxx_tbl[] = {
 	{ PHY_ID_BCM7362, 0xfffffff0, },
 	{ PHY_ID_BCM7425, 0xfffffff0, },
 	{ PHY_ID_BCM7429, 0xfffffff0, },
+	{ PHY_ID_BCM74371, 0xfffffff0, },
 	{ PHY_ID_BCM7439, 0xfffffff0, },
 	{ PHY_ID_BCM7435, 0xfffffff0, },
 	{ PHY_ID_BCM7445, 0xfffffff0, },
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 5881d1fdc1fb..55e517130311 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -33,6 +33,7 @@
 #define PHY_ID_BCM7425			0x600d86b0
 #define PHY_ID_BCM7429			0x600d8730
 #define PHY_ID_BCM7435			0x600d8750
+#define PHY_ID_BCM74371			0xae0252e0
 #define PHY_ID_BCM7439			0x600d8480
 #define PHY_ID_BCM7439_2		0xae025080
 #define PHY_ID_BCM7445			0x600d8510
-- 
cgit v1.2.3


From 4ff0620354f2b39b9fe2a91c22c4de9d1fba0c8e Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Mon, 6 Feb 2017 23:14:12 +0200
Subject: net: add dst_pending_confirm flag to skbuff

Add new skbuff flag to allow protocols to confirm neighbour.
When same struct dst_entry can be used for many different
neighbours we can not use it for pending confirmations.

Add sock_confirm_neigh() helper to confirm the neighbour and
use it for IPv4, IPv6 and VRF before dst_neigh_output.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vrf.c      |  5 ++++-
 include/linux/skbuff.h | 12 ++++++++++++
 include/net/sock.h     | 14 ++++++++++++++
 net/ipv4/ip_output.c   |  5 ++++-
 net/ipv6/ip6_output.c  |  1 +
 5 files changed, 35 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 264fc1585b3c..630eafdb79e8 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -378,6 +378,7 @@ static int vrf_finish_output6(struct net *net, struct sock *sk,
 	if (unlikely(!neigh))
 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 	if (!IS_ERR(neigh)) {
+		sock_confirm_neigh(skb, neigh);
 		ret = dst_neigh_output(dst, neigh, skb);
 		rcu_read_unlock_bh();
 		return ret;
@@ -574,8 +575,10 @@ static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *s
 	neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
 	if (unlikely(!neigh))
 		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
-	if (!IS_ERR(neigh))
+	if (!IS_ERR(neigh)) {
+		sock_confirm_neigh(skb, neigh);
 		ret = dst_neigh_output(dst, neigh, skb);
+	}
 
 	rcu_read_unlock_bh();
 err:
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c6a78e1892b6..f1adddc1c5ac 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -612,6 +612,7 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
  *	@wifi_acked_valid: wifi_acked was set
  *	@wifi_acked: whether frame was acked on wifi or not
  *	@no_fcs:  Request NIC to treat last 4 bytes as Ethernet FCS
+ *	@dst_pending_confirm: need to confirm neighbour
   *	@napi_id: id of the NAPI struct this skb came from
  *	@secmark: security marking
  *	@mark: Generic packet mark
@@ -741,6 +742,7 @@ struct sk_buff {
 	__u8			csum_level:2;
 	__u8			csum_bad:1;
 
+	__u8			dst_pending_confirm:1;
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
 	__u8			ndisc_nodetype:2;
 #endif
@@ -3698,6 +3700,16 @@ static inline bool skb_rx_queue_recorded(const struct sk_buff *skb)
 	return skb->queue_mapping != 0;
 }
 
+static inline void skb_set_dst_pending_confirm(struct sk_buff *skb, u32 val)
+{
+	skb->dst_pending_confirm = val;
+}
+
+static inline bool skb_get_dst_pending_confirm(const struct sk_buff *skb)
+{
+	return skb->dst_pending_confirm != 0;
+}
+
 static inline struct sec_path *skb_sec_path(struct sk_buff *skb)
 {
 #ifdef CONFIG_XFRM
diff --git a/include/net/sock.h b/include/net/sock.h
index 85d856b94b4b..6f83e78eaa5a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1821,6 +1821,20 @@ static inline void sk_dst_confirm(struct sock *sk)
 		sk->sk_dst_pending_confirm = 1;
 }
 
+static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n)
+{
+	if (skb_get_dst_pending_confirm(skb)) {
+		struct sock *sk = skb->sk;
+		unsigned long now = jiffies;
+
+		/* avoid dirtying neighbour */
+		if (n->confirmed != now)
+			n->confirmed = now;
+		if (sk && sk->sk_dst_pending_confirm)
+			sk->sk_dst_pending_confirm = 0;
+	}
+}
+
 bool sk_mc_loop(struct sock *sk);
 
 static inline bool sk_can_gso(const struct sock *sk)
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index b67719f45953..c9fc32fa3272 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -222,7 +222,10 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
 	if (unlikely(!neigh))
 		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
 	if (!IS_ERR(neigh)) {
-		int res = dst_neigh_output(dst, neigh, skb);
+		int res;
+
+		sock_confirm_neigh(skb, neigh);
+		res = dst_neigh_output(dst, neigh, skb);
 
 		rcu_read_unlock_bh();
 		return res;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index b6a94ff0bbd0..14d99fbf102e 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -119,6 +119,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
 	if (unlikely(!neigh))
 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 	if (!IS_ERR(neigh)) {
+		sock_confirm_neigh(skb, neigh);
 		ret = dst_neigh_output(dst, neigh, skb);
 		rcu_read_unlock_bh();
 		return ret;
-- 
cgit v1.2.3


From 85c727b5948344c8d559e2fda8925e9ddd41c29a Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Tue, 7 Feb 2017 11:37:56 -0200
Subject: sctp: drop __packed from almost all SCTP structures

__packed is considered harmful as it potentially generates code that
doesn't perform well and its usage should be avoided as much as
possible.

This patch drops __packed from all SCTP structures except one, which is
sctp_signed_cookie. In there it's required, as per changelog on
commit 9834a2bb4970 ("[SCTP]: Fix sctp_cookie alignment in the packet.").

After this patch, no alignment changes neither in x86 or x86_64 and
no exceptions were noticed during testing on both archs.

Code size for SCTP module also didn't change with this patch.

Cc: David Miller <davem@davemloft.net>
Cc: David Laight <David.Laight@ACULAB.COM>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h       | 80 +++++++++++++++++++++++-----------------------
 include/net/sctp/structs.h |  2 +-
 2 files changed, 41 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index a9e790685af3..2408c6877ca0 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -62,7 +62,7 @@ typedef struct sctphdr {
 	__be16 dest;
 	__be32 vtag;
 	__le32 checksum;
-} __packed sctp_sctphdr_t;
+} sctp_sctphdr_t;
 
 static inline struct sctphdr *sctp_hdr(const struct sk_buff *skb)
 {
@@ -74,7 +74,7 @@ typedef struct sctp_chunkhdr {
 	__u8 type;
 	__u8 flags;
 	__be16 length;
-} __packed sctp_chunkhdr_t;
+} sctp_chunkhdr_t;
 
 
 /* Section 3.2.  Chunk Type Values.
@@ -165,7 +165,7 @@ enum { SCTP_CHUNK_FLAG_T = 0x01 };
 typedef struct sctp_paramhdr {
 	__be16 type;
 	__be16 length;
-} __packed sctp_paramhdr_t;
+} sctp_paramhdr_t;
 
 typedef enum {
 
@@ -233,12 +233,12 @@ typedef struct sctp_datahdr {
 	__be16 ssn;
 	__be32 ppid;
 	__u8  payload[0];
-} __packed sctp_datahdr_t;
+} sctp_datahdr_t;
 
 typedef struct sctp_data_chunk {
         sctp_chunkhdr_t chunk_hdr;
         sctp_datahdr_t  data_hdr;
-} __packed sctp_data_chunk_t;
+} sctp_data_chunk_t;
 
 /* DATA Chuck Specific Flags */
 enum {
@@ -264,78 +264,78 @@ typedef struct sctp_inithdr {
 	__be16 num_inbound_streams;
 	__be32 initial_tsn;
 	__u8  params[0];
-} __packed sctp_inithdr_t;
+} sctp_inithdr_t;
 
 typedef struct sctp_init_chunk {
 	sctp_chunkhdr_t chunk_hdr;
 	sctp_inithdr_t init_hdr;
-} __packed sctp_init_chunk_t;
+} sctp_init_chunk_t;
 
 
 /* Section 3.3.2.1. IPv4 Address Parameter (5) */
 typedef struct sctp_ipv4addr_param {
 	sctp_paramhdr_t param_hdr;
 	struct in_addr  addr;
-} __packed sctp_ipv4addr_param_t;
+} sctp_ipv4addr_param_t;
 
 /* Section 3.3.2.1. IPv6 Address Parameter (6) */
 typedef struct sctp_ipv6addr_param {
 	sctp_paramhdr_t param_hdr;
 	struct in6_addr addr;
-} __packed sctp_ipv6addr_param_t;
+} sctp_ipv6addr_param_t;
 
 /* Section 3.3.2.1 Cookie Preservative (9) */
 typedef struct sctp_cookie_preserve_param {
 	sctp_paramhdr_t param_hdr;
 	__be32          lifespan_increment;
-} __packed sctp_cookie_preserve_param_t;
+} sctp_cookie_preserve_param_t;
 
 /* Section 3.3.2.1 Host Name Address (11) */
 typedef struct sctp_hostname_param {
 	sctp_paramhdr_t param_hdr;
 	uint8_t hostname[0];
-} __packed sctp_hostname_param_t;
+} sctp_hostname_param_t;
 
 /* Section 3.3.2.1 Supported Address Types (12) */
 typedef struct sctp_supported_addrs_param {
 	sctp_paramhdr_t param_hdr;
 	__be16 types[0];
-} __packed sctp_supported_addrs_param_t;
+} sctp_supported_addrs_param_t;
 
 /* Appendix A. ECN Capable (32768) */
 typedef struct sctp_ecn_capable_param {
 	sctp_paramhdr_t param_hdr;
-} __packed sctp_ecn_capable_param_t;
+} sctp_ecn_capable_param_t;
 
 /* ADDIP Section 3.2.6 Adaptation Layer Indication */
 typedef struct sctp_adaptation_ind_param {
 	struct sctp_paramhdr param_hdr;
 	__be32 adaptation_ind;
-} __packed sctp_adaptation_ind_param_t;
+} sctp_adaptation_ind_param_t;
 
 /* ADDIP Section 4.2.7 Supported Extensions Parameter */
 typedef struct sctp_supported_ext_param {
 	struct sctp_paramhdr param_hdr;
 	__u8 chunks[0];
-} __packed sctp_supported_ext_param_t;
+} sctp_supported_ext_param_t;
 
 /* AUTH Section 3.1 Random */
 typedef struct sctp_random_param {
 	sctp_paramhdr_t param_hdr;
 	__u8 random_val[0];
-} __packed sctp_random_param_t;
+} sctp_random_param_t;
 
 /* AUTH Section 3.2 Chunk List */
 typedef struct sctp_chunks_param {
 	sctp_paramhdr_t param_hdr;
 	__u8 chunks[0];
-} __packed sctp_chunks_param_t;
+} sctp_chunks_param_t;
 
 /* AUTH Section 3.3 HMAC Algorithm */
 typedef struct sctp_hmac_algo_param {
 	sctp_paramhdr_t param_hdr;
 	__be16 hmac_ids[0];
-} __packed sctp_hmac_algo_param_t;
+} sctp_hmac_algo_param_t;
 
 /* RFC 2960.  Section 3.3.3 Initiation Acknowledgement (INIT ACK) (2):
  *   The INIT ACK chunk is used to acknowledge the initiation of an SCTP
@@ -347,13 +347,13 @@ typedef sctp_init_chunk_t sctp_initack_chunk_t;
 typedef struct sctp_cookie_param {
 	sctp_paramhdr_t p;
 	__u8 body[0];
-} __packed sctp_cookie_param_t;
+} sctp_cookie_param_t;
 
 /* Section 3.3.3.1 Unrecognized Parameters (8) */
 typedef struct sctp_unrecognized_param {
 	sctp_paramhdr_t param_hdr;
 	sctp_paramhdr_t unrecognized;
-} __packed sctp_unrecognized_param_t;
+} sctp_unrecognized_param_t;
 
 
@@ -368,7 +368,7 @@ typedef struct sctp_unrecognized_param {
 typedef struct sctp_gap_ack_block {
 	__be16 start;
 	__be16 end;
-} __packed sctp_gap_ack_block_t;
+} sctp_gap_ack_block_t;
 
 typedef __be32 sctp_dup_tsn_t;
 
@@ -383,12 +383,12 @@ typedef struct sctp_sackhdr {
 	__be16 num_gap_ack_blocks;
 	__be16 num_dup_tsns;
 	sctp_sack_variable_t variable[0];
-} __packed sctp_sackhdr_t;
+} sctp_sackhdr_t;
 
 typedef struct sctp_sack_chunk {
 	sctp_chunkhdr_t chunk_hdr;
 	sctp_sackhdr_t sack_hdr;
-} __packed sctp_sack_chunk_t;
+} sctp_sack_chunk_t;
 
 
 /* RFC 2960.  Section 3.3.5 Heartbeat Request (HEARTBEAT) (4):
@@ -400,12 +400,12 @@ typedef struct sctp_sack_chunk {
 
 typedef struct sctp_heartbeathdr {
 	sctp_paramhdr_t info;
-} __packed sctp_heartbeathdr_t;
+} sctp_heartbeathdr_t;
 
 typedef struct sctp_heartbeat_chunk {
 	sctp_chunkhdr_t chunk_hdr;
 	sctp_heartbeathdr_t hb_hdr;
-} __packed sctp_heartbeat_chunk_t;
+} sctp_heartbeat_chunk_t;
 
 
 /* For the abort and shutdown ACK we must carry the init tag in the
@@ -414,7 +414,7 @@ typedef struct sctp_heartbeat_chunk {
  */
 typedef struct sctp_abort_chunk {
         sctp_chunkhdr_t uh;
-} __packed sctp_abort_chunk_t;
+} sctp_abort_chunk_t;
 
 
 /* For the graceful shutdown we must carry the tag (in common header)
@@ -422,12 +422,12 @@ typedef struct sctp_abort_chunk {
  */
 typedef struct sctp_shutdownhdr {
 	__be32 cum_tsn_ack;
-} __packed sctp_shutdownhdr_t;
+} sctp_shutdownhdr_t;
 
 struct sctp_shutdown_chunk_t {
         sctp_chunkhdr_t    chunk_hdr;
         sctp_shutdownhdr_t shutdown_hdr;
-} __packed;
+};
 
 /* RFC 2960.  Section 3.3.10 Operation Error (ERROR) (9) */
 
@@ -435,12 +435,12 @@ typedef struct sctp_errhdr {
 	__be16 cause;
 	__be16 length;
 	__u8  variable[0];
-} __packed sctp_errhdr_t;
+} sctp_errhdr_t;
 
 typedef struct sctp_operr_chunk {
         sctp_chunkhdr_t chunk_hdr;
 	sctp_errhdr_t   err_hdr;
-} __packed sctp_operr_chunk_t;
+} sctp_operr_chunk_t;
 
 /* RFC 2960 3.3.10 - Operation Error
  *
@@ -530,7 +530,7 @@ typedef struct sctp_ecnehdr {
 typedef struct sctp_ecne_chunk {
 	sctp_chunkhdr_t chunk_hdr;
 	sctp_ecnehdr_t ence_hdr;
-} __packed sctp_ecne_chunk_t;
+} sctp_ecne_chunk_t;
 
 /* RFC 2960.  Appendix A.  Explicit Congestion Notification.
  *   Congestion Window Reduced (CWR) (13)
@@ -542,7 +542,7 @@ typedef struct sctp_cwrhdr {
 typedef struct sctp_cwr_chunk {
 	sctp_chunkhdr_t chunk_hdr;
 	sctp_cwrhdr_t cwr_hdr;
-} __packed sctp_cwr_chunk_t;
+} sctp_cwr_chunk_t;
 
 /* PR-SCTP
  * 3.2 Forward Cumulative TSN Chunk Definition (FORWARD TSN)
@@ -593,17 +593,17 @@ typedef struct sctp_cwr_chunk {
 struct sctp_fwdtsn_skip {
 	__be16 stream;
 	__be16 ssn;
-} __packed;
+};
 
 struct sctp_fwdtsn_hdr {
 	__be32 new_cum_tsn;
 	struct sctp_fwdtsn_skip skip[0];
-} __packed;
+};
 
 struct sctp_fwdtsn_chunk {
 	struct sctp_chunkhdr chunk_hdr;
 	struct sctp_fwdtsn_hdr fwdtsn_hdr;
-} __packed;
+};
 
 
 /* ADDIP
@@ -641,17 +641,17 @@ struct sctp_fwdtsn_chunk {
 typedef struct sctp_addip_param {
 	sctp_paramhdr_t	param_hdr;
 	__be32		crr_id;
-} __packed sctp_addip_param_t;
+} sctp_addip_param_t;
 
 typedef struct sctp_addiphdr {
 	__be32	serial;
 	__u8	params[0];
-} __packed sctp_addiphdr_t;
+} sctp_addiphdr_t;
 
 typedef struct sctp_addip_chunk {
 	sctp_chunkhdr_t chunk_hdr;
 	sctp_addiphdr_t addip_hdr;
-} __packed sctp_addip_chunk_t;
+} sctp_addip_chunk_t;
 
 /* AUTH
  * Section 4.1  Authentication Chunk (AUTH)
@@ -706,12 +706,12 @@ typedef struct sctp_authhdr {
 	__be16 shkey_id;
 	__be16 hmac_id;
 	__u8   hmac[0];
-} __packed sctp_authhdr_t;
+} sctp_authhdr_t;
 
 typedef struct sctp_auth_chunk {
 	sctp_chunkhdr_t chunk_hdr;
 	sctp_authhdr_t auth_hdr;
-} __packed sctp_auth_chunk_t;
+} sctp_auth_chunk_t;
 
 struct sctp_infox {
 	struct sctp_info *sctpinfo;
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 6a685049f67f..387c802bf248 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -374,7 +374,7 @@ typedef struct sctp_sender_hb_info {
 	union sctp_addr daddr;
 	unsigned long sent_at;
 	__u64 hb_nonce;
-} __packed sctp_sender_hb_info_t;
+} sctp_sender_hb_info_t;
 
 struct sctp_stream *sctp_stream_new(__u16 incnt, __u16 outcnt, gfp_t gfp);
 void sctp_stream_free(struct sctp_stream *stream);
-- 
cgit v1.2.3


From 376bc27150f180d9f5eddec6a14117780177589d Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Tue, 19 Apr 2016 08:56:27 +0200
Subject: clockevents: Add a clkevt-of mechanism like clksrc-of

The current code uses the CLOCKSOURCE_OF_DECLARE macro to fill the clksrc
table with a t-uple (name, init_function).

Unfortunately it ends up to the clockevent and the clocksource being
both initialized with this macro. It is not a problem by itself but there
is not a clear distinction between a clockevent and a clocksource in the
code initialization path. Somebody can argue there are the same IP block
and the same DT node. But conceptually from the software side, there are
two distincts entities and as is they should be initialized separetely.
Some drivers which do not have a clocksource end up by using the
CLOCKSOURCE_OF_DECLARE macro to declare a clockevent.

Another result is the fuzzy organization in the clocksource directory,
where the clockevents are implemented in the same file than the
clocksources or file labelled timer-something implementing a clocksource.

This patch provides another macro to specifically declare a clockevent in
the same way than the clocksource and gives the opportunity to write two
separate drivers, one for the clocksource and another for the clockevents.

Hopefully, that can help to do some housework in the directory, perhaps
split the drivers in to entities, for example:
	- clksrc-rockchip.c
	- clkevt-rockchip.c

Also, it gives the possibility to declare clocksources separately in the
DT and then use a clocksource from IP block while while clockevents are
used from another IP block.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/clocksource/Kconfig        |  7 +++++
 drivers/clocksource/Makefile       |  1 +
 drivers/clocksource/clkevt-probe.c | 56 ++++++++++++++++++++++++++++++++++++++
 include/linux/clockchips.h         |  9 ++++++
 4 files changed, 73 insertions(+)
 create mode 100644 drivers/clocksource/clkevt-probe.c

(limited to 'include/linux')

diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
index 4866f7aa32e6..3dc06f0d2a53 100644
--- a/drivers/clocksource/Kconfig
+++ b/drivers/clocksource/Kconfig
@@ -5,6 +5,10 @@ config CLKSRC_OF
 	bool
 	select CLKSRC_PROBE
 
+config CLKEVT_OF
+	bool
+	select CLKEVT_PROBE
+
 config CLKSRC_ACPI
 	bool
 	select CLKSRC_PROBE
@@ -12,6 +16,9 @@ config CLKSRC_ACPI
 config CLKSRC_PROBE
 	bool
 
+config CLKEVT_PROBE
+	bool
+
 config CLKSRC_I8253
 	bool
 
diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile
index a14111e1f087..4b7e9a279c3f 100644
--- a/drivers/clocksource/Makefile
+++ b/drivers/clocksource/Makefile
@@ -1,4 +1,5 @@
 obj-$(CONFIG_CLKSRC_PROBE)	+= clksrc-probe.o
+obj-$(CONFIG_CLKEVT_PROBE)	+= clkevt-probe.o
 obj-$(CONFIG_ATMEL_PIT)		+= timer-atmel-pit.o
 obj-$(CONFIG_ATMEL_ST)		+= timer-atmel-st.o
 obj-$(CONFIG_ATMEL_TCB_CLKSRC)	+= tcb_clksrc.o
diff --git a/drivers/clocksource/clkevt-probe.c b/drivers/clocksource/clkevt-probe.c
new file mode 100644
index 000000000000..8c30fec86094
--- /dev/null
+++ b/drivers/clocksource/clkevt-probe.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, Linaro Ltd.  All rights reserved.
+ * Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/init.h>
+#include <linux/of.h>
+#include <linux/clockchip.h>
+
+extern struct of_device_id __clkevt_of_table[];
+
+static const struct of_device_id __clkevt_of_table_sentinel
+	__used __section(__clkevt_of_table_end);
+
+int __init clockevent_probe(void)
+{
+	struct device_node *np;
+	const struct of_device_id *match;
+	of_init_fn_1_ret init_func;
+	int ret, clockevents = 0;
+
+	for_each_matching_node_and_match(np, __clkevt_of_table, &match) {
+		if (!of_device_is_available(np))
+			continue;
+
+		init_func = match->data;
+
+		ret = init_func(np);
+		if (ret) {
+			pr_warn("Failed to initialize '%s' (%d)\n",
+				np->name, ret);
+			continue;
+		}
+
+		clockevents++;
+	}
+
+	if (!clockevents) {
+		pr_crit("%s: no matching clockevent found\n", __func__);
+		return -ENODEV;
+	}
+
+	return 0;
+}
diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 0d442e34c349..5d3053c34fb3 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -224,4 +224,13 @@ static inline void tick_setup_hrtimer_broadcast(void) { }
 
 #endif /* !CONFIG_GENERIC_CLOCKEVENTS */
 
+#define CLOCKEVENT_OF_DECLARE(name, compat, fn) \
+	OF_DECLARE_1_RET(clkevt, name, compat, fn)
+
+#ifdef CONFIG_CLKEVT_PROBE
+extern int clockevent_probe(void);
+#els
+static inline int clockevent_probe(void) { return 0; }
+#endif
+
 #endif /* _LINUX_CLOCKCHIPS_H */
-- 
cgit v1.2.3


From 0f5bf6d0afe4be6e1391908ff2d6dc9730e91550 Mon Sep 17 00:00:00 2001
From: Laura Abbott <labbott@redhat.com>
Date: Mon, 6 Feb 2017 16:31:58 -0800
Subject: arch: Rename CONFIG_DEBUG_RODATA and CONFIG_DEBUG_MODULE_RONX

Both of these options are poorly named. The features they provide are
necessary for system security and should not be considered debug only.
Change the names to CONFIG_STRICT_KERNEL_RWX and
CONFIG_STRICT_MODULE_RWX to better describe what these options do.

Signed-off-by: Laura Abbott <labbott@redhat.com>
Acked-by: Jessica Yu <jeyu@redhat.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 Documentation/DocBook/kgdb.tmpl            | 8 ++++----
 Documentation/security/self-protection.txt | 4 ++--
 arch/Kconfig                               | 4 ++--
 arch/arm/configs/aspeed_g4_defconfig       | 4 ++--
 arch/arm/configs/aspeed_g5_defconfig       | 4 ++--
 arch/arm/include/asm/cacheflush.h          | 2 +-
 arch/arm/kernel/patch.c                    | 4 ++--
 arch/arm/kernel/vmlinux.lds.S              | 8 ++++----
 arch/arm/mm/Kconfig                        | 2 +-
 arch/arm/mm/init.c                         | 4 ++--
 arch/arm64/Kconfig.debug                   | 2 +-
 arch/arm64/kernel/insn.c                   | 2 +-
 arch/parisc/configs/712_defconfig          | 1 -
 arch/parisc/configs/c3000_defconfig        | 1 -
 arch/parisc/mm/init.c                      | 2 +-
 include/linux/filter.h                     | 4 ++--
 include/linux/init.h                       | 4 ++--
 include/linux/module.h                     | 2 +-
 init/main.c                                | 4 ++--
 kernel/configs/android-recommended.config  | 2 +-
 kernel/module.c                            | 6 +++---
 kernel/power/hibernate.c                   | 2 +-
 kernel/power/power.h                       | 4 ++--
 kernel/power/snapshot.c                    | 4 ++--
 24 files changed, 41 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/DocBook/kgdb.tmpl b/Documentation/DocBook/kgdb.tmpl
index f3abca7ec53d..856ac20bf367 100644
--- a/Documentation/DocBook/kgdb.tmpl
+++ b/Documentation/DocBook/kgdb.tmpl
@@ -115,12 +115,12 @@
     </para>
     <para>
     If the architecture that you are using supports the kernel option
-    CONFIG_DEBUG_RODATA, you should consider turning it off.  This
+    CONFIG_STRICT_KERNEL_RWX, you should consider turning it off.  This
     option will prevent the use of software breakpoints because it
     marks certain regions of the kernel's memory space as read-only.
     If kgdb supports it for the architecture you are using, you can
     use hardware breakpoints if you desire to run with the
-    CONFIG_DEBUG_RODATA option turned on, else you need to turn off
+    CONFIG_STRICT_KERNEL_RWX option turned on, else you need to turn off
     this option.
     </para>
     <para>
@@ -135,7 +135,7 @@
     <para>Here is an example set of .config symbols to enable or
     disable for kgdb:
     <itemizedlist>
-    <listitem><para># CONFIG_DEBUG_RODATA is not set</para></listitem>
+    <listitem><para># CONFIG_STRICT_KERNEL_RWX is not set</para></listitem>
     <listitem><para>CONFIG_FRAME_POINTER=y</para></listitem>
     <listitem><para>CONFIG_KGDB=y</para></listitem>
     <listitem><para>CONFIG_KGDB_SERIAL_CONSOLE=y</para></listitem>
@@ -166,7 +166,7 @@
     </para>
     <para>Here is an example set of .config symbols to enable/disable kdb:
     <itemizedlist>
-    <listitem><para># CONFIG_DEBUG_RODATA is not set</para></listitem>
+    <listitem><para># CONFIG_STRICT_KERNEL_RWX is not set</para></listitem>
     <listitem><para>CONFIG_FRAME_POINTER=y</para></listitem>
     <listitem><para>CONFIG_KGDB=y</para></listitem>
     <listitem><para>CONFIG_KGDB_SERIAL_CONSOLE=y</para></listitem>
diff --git a/Documentation/security/self-protection.txt b/Documentation/security/self-protection.txt
index f41dd00e8b98..141acfebe6ef 100644
--- a/Documentation/security/self-protection.txt
+++ b/Documentation/security/self-protection.txt
@@ -51,8 +51,8 @@ kernel, they are implemented in a way where the memory is temporarily
 made writable during the update, and then returned to the original
 permissions.)
 
-In support of this are (the poorly named) CONFIG_DEBUG_RODATA and
-CONFIG_DEBUG_SET_MODULE_RONX, which seek to make sure that code is not
+In support of this are CONFIG_STRICT_KERNEL_RWX and
+CONFIG_STRICT_MODULE_RWX, which seek to make sure that code is not
 writable, data is not executable, and read-only data is neither writable
 nor executable.
 
diff --git a/arch/Kconfig b/arch/Kconfig
index 3f8b8be3036f..33f5a555c32a 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -790,7 +790,7 @@ config ARCH_OPTIONAL_KERNEL_RWX_DEFAULT
 config ARCH_HAS_STRICT_KERNEL_RWX
 	def_bool n
 
-config DEBUG_RODATA
+config STRICT_KERNEL_RWX
 	bool "Make kernel text and rodata read-only" if ARCH_OPTIONAL_KERNEL_RWX
 	depends on ARCH_HAS_STRICT_KERNEL_RWX
 	default !ARCH_OPTIONAL_KERNEL_RWX || ARCH_OPTIONAL_KERNEL_RWX_DEFAULT
@@ -806,7 +806,7 @@ config DEBUG_RODATA
 config ARCH_HAS_STRICT_MODULE_RWX
 	def_bool n
 
-config DEBUG_SET_MODULE_RONX
+config STRICT_MODULE_RWX
 	bool "Set loadable kernel module data as NX and text as RO" if ARCH_OPTIONAL_KERNEL_RWX
 	depends on ARCH_HAS_STRICT_MODULE_RWX && MODULES
 	default !ARCH_OPTIONAL_KERNEL_RWX || ARCH_OPTIONAL_KERNEL_RWX_DEFAULT
diff --git a/arch/arm/configs/aspeed_g4_defconfig b/arch/arm/configs/aspeed_g4_defconfig
index ca39c04fec6b..05b99bc1c1ce 100644
--- a/arch/arm/configs/aspeed_g4_defconfig
+++ b/arch/arm/configs/aspeed_g4_defconfig
@@ -25,7 +25,6 @@ CONFIG_MODULE_UNLOAD=y
 # CONFIG_ARCH_MULTI_V7 is not set
 CONFIG_ARCH_ASPEED=y
 CONFIG_MACH_ASPEED_G4=y
-CONFIG_DEBUG_RODATA=y
 CONFIG_AEABI=y
 CONFIG_UACCESS_WITH_MEMCPY=y
 CONFIG_SECCOMP=y
@@ -79,7 +78,8 @@ CONFIG_DEBUG_LL_UART_8250=y
 CONFIG_DEBUG_UART_PHYS=0x1e784000
 CONFIG_DEBUG_UART_VIRT=0xe8784000
 CONFIG_EARLY_PRINTK=y
-CONFIG_DEBUG_SET_MODULE_RONX=y
+CONFIG_STRICT_MODULE_RWX=y
+CONFIG_STRICT_KERNEL_RWX=y
 # CONFIG_XZ_DEC_X86 is not set
 # CONFIG_XZ_DEC_POWERPC is not set
 # CONFIG_XZ_DEC_IA64 is not set
diff --git a/arch/arm/configs/aspeed_g5_defconfig b/arch/arm/configs/aspeed_g5_defconfig
index 4f366b0370e9..05a16d53d03c 100644
--- a/arch/arm/configs/aspeed_g5_defconfig
+++ b/arch/arm/configs/aspeed_g5_defconfig
@@ -26,7 +26,6 @@ CONFIG_ARCH_MULTI_V6=y
 # CONFIG_ARCH_MULTI_V7 is not set
 CONFIG_ARCH_ASPEED=y
 CONFIG_MACH_ASPEED_G5=y
-CONFIG_DEBUG_RODATA=y
 CONFIG_AEABI=y
 CONFIG_UACCESS_WITH_MEMCPY=y
 CONFIG_SECCOMP=y
@@ -81,7 +80,8 @@ CONFIG_DEBUG_LL_UART_8250=y
 CONFIG_DEBUG_UART_PHYS=0x1e784000
 CONFIG_DEBUG_UART_VIRT=0xe8784000
 CONFIG_EARLY_PRINTK=y
-CONFIG_DEBUG_SET_MODULE_RONX=y
+CONFIG_STRICT_MODULE_RWX=y
+CONFIG_STRICT_KERNEL_RWX=y
 # CONFIG_XZ_DEC_X86 is not set
 # CONFIG_XZ_DEC_POWERPC is not set
 # CONFIG_XZ_DEC_IA64 is not set
diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index bdd283bc5842..02454fa15d2c 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -490,7 +490,7 @@ static inline int set_memory_x(unsigned long addr, int numpages) { return 0; }
 static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; }
 #endif
 
-#ifdef CONFIG_DEBUG_RODATA
+#ifdef CONFIG_STRICT_KERNEL_RWX
 void set_kernel_text_rw(void);
 void set_kernel_text_ro(void);
 #else
diff --git a/arch/arm/kernel/patch.c b/arch/arm/kernel/patch.c
index 69bda1a5707e..020560b2dcb7 100644
--- a/arch/arm/kernel/patch.c
+++ b/arch/arm/kernel/patch.c
@@ -24,9 +24,9 @@ static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
 	bool module = !core_kernel_text(uintaddr);
 	struct page *page;
 
-	if (module && IS_ENABLED(CONFIG_DEBUG_SET_MODULE_RONX))
+	if (module && IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
 		page = vmalloc_to_page(addr);
-	else if (!module && IS_ENABLED(CONFIG_DEBUG_RODATA))
+	else if (!module && IS_ENABLED(CONFIG_STRICT_KERNEL_RWX))
 		page = virt_to_page(addr);
 	else
 		return addr;
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index f7f55df0bf7b..ce18007f9e4e 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -97,7 +97,7 @@ SECTIONS
 		HEAD_TEXT
 	}
 
-#ifdef CONFIG_DEBUG_RODATA
+#ifdef CONFIG_STRICT_KERNEL_RWX
 	. = ALIGN(1<<SECTION_SHIFT);
 #endif
 
@@ -158,7 +158,7 @@ SECTIONS
 
 	NOTES
 
-#ifdef CONFIG_DEBUG_RODATA
+#ifdef CONFIG_STRICT_KERNEL_RWX
 	. = ALIGN(1<<SECTION_SHIFT);
 #else
 	. = ALIGN(PAGE_SIZE);
@@ -230,7 +230,7 @@ SECTIONS
 	PERCPU_SECTION(L1_CACHE_BYTES)
 #endif
 
-#ifdef CONFIG_DEBUG_RODATA
+#ifdef CONFIG_STRICT_KERNEL_RWX
 	. = ALIGN(1<<SECTION_SHIFT);
 #else
 	. = ALIGN(THREAD_SIZE);
@@ -325,7 +325,7 @@ SECTIONS
 	STABS_DEBUG
 }
 
-#ifdef CONFIG_DEBUG_RODATA
+#ifdef CONFIG_STRICT_KERNEL_RWX
 /*
  * Without CONFIG_DEBUG_ALIGN_RODATA, __start_rodata_section_aligned will
  * be the first section-aligned location after __start_rodata. Otherwise,
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index 419a0355d4e4..35e3a56e5d86 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -1053,7 +1053,7 @@ config ARCH_SUPPORTS_BIG_ENDIAN
 
 config DEBUG_ALIGN_RODATA
 	bool "Make rodata strictly non-executable"
-	depends on DEBUG_RODATA
+	depends on STRICT_KERNEL_RWX
 	default y
 	help
 	  If this is set, rodata will be made explicitly non-executable. This
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 370581aeb871..4be0bee4c357 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -572,7 +572,7 @@ void __init mem_init(void)
 	}
 }
 
-#ifdef CONFIG_DEBUG_RODATA
+#ifdef CONFIG_STRICT_KERNEL_RWX
 struct section_perm {
 	const char *name;
 	unsigned long start;
@@ -741,7 +741,7 @@ void set_kernel_text_ro(void)
 
 #else
 static inline void fix_kernmem_perms(void) { }
-#endif /* CONFIG_DEBUG_RODATA */
+#endif /* CONFIG_STRICT_KERNEL_RWX */
 
 void free_tcmmem(void)
 {
diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug
index 939815e8d695..560a8d85a4f8 100644
--- a/arch/arm64/Kconfig.debug
+++ b/arch/arm64/Kconfig.debug
@@ -72,7 +72,7 @@ config DEBUG_WX
 	  If in doubt, say "Y".
 
 config DEBUG_ALIGN_RODATA
-	depends on DEBUG_RODATA
+	depends on STRICT_KERNEL_RWX
 	bool "Align linker sections up to SECTION_SIZE"
 	help
 	  If this option is enabled, sections that may potentially be marked as
diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c
index 94b62c1fa4df..67f9cb9e8512 100644
--- a/arch/arm64/kernel/insn.c
+++ b/arch/arm64/kernel/insn.c
@@ -93,7 +93,7 @@ static void __kprobes *patch_map(void *addr, int fixmap)
 	bool module = !core_kernel_text(uintaddr);
 	struct page *page;
 
-	if (module && IS_ENABLED(CONFIG_DEBUG_SET_MODULE_RONX))
+	if (module && IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
 		page = vmalloc_to_page(addr);
 	else if (!module)
 		page = pfn_to_page(PHYS_PFN(__pa(addr)));
diff --git a/arch/parisc/configs/712_defconfig b/arch/parisc/configs/712_defconfig
index db8f56bf3883..143d02652792 100644
--- a/arch/parisc/configs/712_defconfig
+++ b/arch/parisc/configs/712_defconfig
@@ -182,7 +182,6 @@ CONFIG_DEBUG_FS=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DEBUG_MUTEXES=y
 # CONFIG_RCU_CPU_STALL_DETECTOR is not set
-CONFIG_DEBUG_RODATA=y
 CONFIG_CRYPTO_NULL=m
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_HMAC=y
diff --git a/arch/parisc/configs/c3000_defconfig b/arch/parisc/configs/c3000_defconfig
index fb92b8920785..8e8f0e34f817 100644
--- a/arch/parisc/configs/c3000_defconfig
+++ b/arch/parisc/configs/c3000_defconfig
@@ -166,7 +166,6 @@ CONFIG_DEBUG_KERNEL=y
 CONFIG_DEBUG_MUTEXES=y
 # CONFIG_DEBUG_BUGVERBOSE is not set
 # CONFIG_RCU_CPU_STALL_DETECTOR is not set
-CONFIG_DEBUG_RODATA=y
 CONFIG_CRYPTO_NULL=m
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_MD5=m
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index e02ada312be8..a055e5b6b380 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -545,7 +545,7 @@ void free_initmem(void)
 }
 
 
-#ifdef CONFIG_DEBUG_RODATA
+#ifdef CONFIG_STRICT_KERNEL_RWX
 void mark_rodata_ro(void)
 {
 	/* rodata memory was already mapped with KERNEL_RO access rights by
diff --git a/include/linux/filter.h b/include/linux/filter.h
index a0934e6c9bab..c6dd53e88711 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -543,7 +543,7 @@ static inline bool bpf_prog_was_classic(const struct bpf_prog *prog)
 
 #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0]))
 
-#ifdef CONFIG_DEBUG_SET_MODULE_RONX
+#ifdef CONFIG_STRICT_MODULE_RWX
 static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
 {
 	set_memory_ro((unsigned long)fp, fp->pages);
@@ -561,7 +561,7 @@ static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
 static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)
 {
 }
-#endif /* CONFIG_DEBUG_SET_MODULE_RONX */
+#endif /* CONFIG_STRICT_MODULE_RWX */
 
 int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap);
 static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
diff --git a/include/linux/init.h b/include/linux/init.h
index 885c3e6d0f9d..79af0962fd52 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -126,10 +126,10 @@ void prepare_namespace(void);
 void __init load_default_modules(void);
 int __init init_rootfs(void);
 
-#if defined(CONFIG_DEBUG_RODATA) || defined(CONFIG_DEBUG_SET_MODULE_RONX)
+#if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_STRICT_MODULE_RWX)
 extern bool rodata_enabled;
 #endif
-#ifdef CONFIG_DEBUG_RODATA
+#ifdef CONFIG_STRICT_KERNEL_RWX
 void mark_rodata_ro(void);
 #endif
 
diff --git a/include/linux/module.h b/include/linux/module.h
index 7c84273d60b9..d5afd142818f 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -764,7 +764,7 @@ extern int module_sysfs_initialized;
 
 #define __MODULE_STRING(x) __stringify(x)
 
-#ifdef CONFIG_DEBUG_SET_MODULE_RONX
+#ifdef CONFIG_STRICT_MODULE_RWX
 extern void set_all_modules_text_rw(void);
 extern void set_all_modules_text_ro(void);
 extern void module_enable_ro(const struct module *mod, bool after_init);
diff --git a/init/main.c b/init/main.c
index b0c9d6facef9..0b7bae29eef6 100644
--- a/init/main.c
+++ b/init/main.c
@@ -925,7 +925,7 @@ static int try_to_run_init_process(const char *init_filename)
 
 static noinline void __init kernel_init_freeable(void);
 
-#if defined(CONFIG_DEBUG_RODATA) || defined(CONFIG_DEBUG_SET_MODULE_RONX)
+#if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_STRICT_MODULE_RWX)
 bool rodata_enabled __ro_after_init = true;
 static int __init set_debug_rodata(char *str)
 {
@@ -934,7 +934,7 @@ static int __init set_debug_rodata(char *str)
 __setup("rodata=", set_debug_rodata);
 #endif
 
-#ifdef CONFIG_DEBUG_RODATA
+#ifdef CONFIG_STRICT_KERNEL_RWX
 static void mark_readonly(void)
 {
 	if (rodata_enabled)
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
index 297756be369c..99127edc5204 100644
--- a/kernel/configs/android-recommended.config
+++ b/kernel/configs/android-recommended.config
@@ -11,7 +11,7 @@ CONFIG_BLK_DEV_LOOP=y
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_SIZE=8192
 CONFIG_COMPACTION=y
-CONFIG_DEBUG_RODATA=y
+CONFIG_STRICT_KERNEL_RWX=y
 CONFIG_DM_CRYPT=y
 CONFIG_DM_UEVENT=y
 CONFIG_DM_VERITY=y
diff --git a/kernel/module.c b/kernel/module.c
index 5088784c0cf9..e71478569273 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -74,9 +74,9 @@
 /*
  * Modules' sections will be aligned on page boundaries
  * to ensure complete separation of code and data, but
- * only when CONFIG_DEBUG_SET_MODULE_RONX=y
+ * only when CONFIG_STRICT_MODULE_RWX=y
  */
-#ifdef CONFIG_DEBUG_SET_MODULE_RONX
+#ifdef CONFIG_STRICT_MODULE_RWX
 # define debug_align(X) ALIGN(X, PAGE_SIZE)
 #else
 # define debug_align(X) (X)
@@ -1847,7 +1847,7 @@ static void mod_sysfs_teardown(struct module *mod)
 	mod_sysfs_fini(mod);
 }
 
-#ifdef CONFIG_DEBUG_SET_MODULE_RONX
+#ifdef CONFIG_STRICT_MODULE_RWX
 /*
  * LKM RO/NX protection: protect module's text/ro-data
  * from modification and any data from execution.
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index b26dbc48c75b..86385af1080f 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -1156,7 +1156,7 @@ static int __init hibernate_setup(char *str)
 	} else if (!strncmp(str, "no", 2)) {
 		noresume = 1;
 		nohibernate = 1;
-	} else if (IS_ENABLED(CONFIG_DEBUG_RODATA)
+	} else if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)
 		   && !strncmp(str, "protect_image", 13)) {
 		enable_restore_image_protection();
 	}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 1dfa0da827d3..7fdc40d31b7d 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -61,12 +61,12 @@ extern int hibernation_snapshot(int platform_mode);
 extern int hibernation_restore(int platform_mode);
 extern int hibernation_platform_enter(void);
 
-#ifdef CONFIG_DEBUG_RODATA
+#ifdef CONFIG_STRICT_KERNEL_RWX
 /* kernel/power/snapshot.c */
 extern void enable_restore_image_protection(void);
 #else
 static inline void enable_restore_image_protection(void) {}
-#endif /* CONFIG_DEBUG_RODATA */
+#endif /* CONFIG_STRICT_KERNEL_RWX */
 
 #else /* !CONFIG_HIBERNATION */
 
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 2d8e2b227db8..905d5bbd595f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -38,7 +38,7 @@
 
 #include "power.h"
 
-#ifdef CONFIG_DEBUG_RODATA
+#ifdef CONFIG_STRICT_KERNEL_RWX
 static bool hibernate_restore_protection;
 static bool hibernate_restore_protection_active;
 
@@ -73,7 +73,7 @@ static inline void hibernate_restore_protection_begin(void) {}
 static inline void hibernate_restore_protection_end(void) {}
 static inline void hibernate_restore_protect_page(void *page_address) {}
 static inline void hibernate_restore_unprotect_page(void *page_address) {}
-#endif /* CONFIG_DEBUG_RODATA */
+#endif /* CONFIG_STRICT_KERNEL_RWX */
 
 static int swsusp_page_is_free(struct page *);
 static void swsusp_set_page_forbidden(struct page *);
-- 
cgit v1.2.3


From 66cd794e3c30b8af3b6befe42a378557efb3114a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 7 Feb 2017 22:40:44 +0200
Subject: nl80211: add HT/VHT capabilities to AP parameters

For the benefit of drivers that rebuild IEs in firmware, parse the
IEs for HT/VHT capabilities and the respective membership selector
in the (extended) supported rates. This avoids duplicating the same
code into all drivers that need this information.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h |  3 ++-
 include/net/cfg80211.h    |  8 ++++++++
 net/wireless/nl80211.c    | 47 ++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 56 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 02768de209d6..0dd9498c694f 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1043,8 +1043,9 @@ struct ieee80211_mgmt {
 	} u;
 } __packed __aligned(2);
 
-/* Supported Rates value encodings in 802.11n-2009 7.3.2.2 */
+/* Supported rates membership selectors */
 #define BSS_MEMBERSHIP_SELECTOR_HT_PHY	127
+#define BSS_MEMBERSHIP_SELECTOR_VHT_PHY	126
 
 /* mgmt header + 1 byte category code */
 #define IEEE80211_MIN_ACTION_SIZE offsetof(struct ieee80211_mgmt, u.action.u)
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 870549480e9b..5cfd2806a078 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -748,6 +748,10 @@ struct cfg80211_bitrate_mask {
  * @pbss: If set, start as a PCP instead of AP. Relevant for DMG
  *	networks.
  * @beacon_rate: bitrate to be used for beacons
+ * @ht_cap: HT capabilities (or %NULL if HT isn't enabled)
+ * @vht_cap: VHT capabilities (or %NULL if VHT isn't enabled)
+ * @ht_required: stations must support HT
+ * @vht_required: stations must support VHT
  */
 struct cfg80211_ap_settings {
 	struct cfg80211_chan_def chandef;
@@ -768,6 +772,10 @@ struct cfg80211_ap_settings {
 	const struct cfg80211_acl_data *acl;
 	bool pbss;
 	struct cfg80211_bitrate_mask beacon_rate;
+
+	const struct ieee80211_ht_cap *ht_cap;
+	const struct ieee80211_vht_cap *vht_cap;
+	bool ht_required, vht_required;
 };
 
 /**
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index a7b4318f735d..c853746f47bc 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -3,7 +3,7 @@
  *
  * Copyright 2006-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
- * Copyright 2015-2016	Intel Deutschland GmbH
+ * Copyright 2015-2017	Intel Deutschland GmbH
  */
 
 #include <linux/if.h>
@@ -3743,6 +3743,49 @@ static int nl80211_parse_beacon(struct nlattr *attrs[],
 	return 0;
 }
 
+static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params,
+					    const u8 *rates)
+{
+	int i;
+
+	if (!rates)
+		return;
+
+	for (i = 0; i < rates[1]; i++) {
+		if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_HT_PHY)
+			params->ht_required = true;
+		if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_VHT_PHY)
+			params->vht_required = true;
+	}
+}
+
+/*
+ * Since the nl80211 API didn't include, from the beginning, attributes about
+ * HT/VHT requirements/capabilities, we parse them out of the IEs for the
+ * benefit of drivers that rebuild IEs in the firmware.
+ */
+static void nl80211_calculate_ap_params(struct cfg80211_ap_settings *params)
+{
+	const struct cfg80211_beacon_data *bcn = &params->beacon;
+	size_t ies_len = bcn->beacon_ies_len;
+	const u8 *ies = bcn->beacon_ies;
+	const u8 *rates;
+	const u8 *cap;
+
+	rates = cfg80211_find_ie(WLAN_EID_SUPP_RATES, ies, ies_len);
+	nl80211_check_ap_rate_selectors(params, rates);
+
+	rates = cfg80211_find_ie(WLAN_EID_EXT_SUPP_RATES, ies, ies_len);
+	nl80211_check_ap_rate_selectors(params, rates);
+
+	cap = cfg80211_find_ie(WLAN_EID_HT_CAPABILITY, ies, ies_len);
+	if (cap && cap[1] >= sizeof(*params->ht_cap))
+		params->ht_cap = (void *)(cap + 2);
+	cap = cfg80211_find_ie(WLAN_EID_VHT_CAPABILITY, ies, ies_len);
+	if (cap && cap[1] >= sizeof(*params->vht_cap))
+		params->vht_cap = (void *)(cap + 2);
+}
+
 static bool nl80211_get_ap_channel(struct cfg80211_registered_device *rdev,
 				   struct cfg80211_ap_settings *params)
 {
@@ -3971,6 +4014,8 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 			return PTR_ERR(params.acl);
 	}
 
+	nl80211_calculate_ap_params(&params);
+
 	wdev_lock(wdev);
 	err = rdev_start_ap(rdev, dev, &params);
 	if (!err) {
-- 
cgit v1.2.3


From f92bac3b141b8233e34ddf32d227e12bfba07b48 Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Date: Tue, 27 Dec 2016 23:16:05 +0900
Subject: printk: rename nmi.c and exported api

A preparation patch for printk_safe work. No functional change.
- rename nmi.c to print_safe.c
- add `printk_safe' prefix to some (which used both by printk-safe
  and printk-nmi) of the exported functions.

Link: http://lkml.kernel.org/r/20161227141611.940-3-sergey.senozhatsky@gmail.com
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Calvin Owens <calvinowens@fb.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Peter Hurley <peter@hurleysoftware.com>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/printk.h      |  12 +-
 init/Kconfig                |  16 +--
 init/main.c                 |   2 +-
 kernel/kexec_core.c         |   2 +-
 kernel/panic.c              |   4 +-
 kernel/printk/Makefile      |   2 +-
 kernel/printk/nmi.c         | 291 --------------------------------------------
 kernel/printk/printk_safe.c | 291 ++++++++++++++++++++++++++++++++++++++++++++
 lib/nmi_backtrace.c         |   2 +-
 9 files changed, 312 insertions(+), 310 deletions(-)
 delete mode 100644 kernel/printk/nmi.c
 create mode 100644 kernel/printk/printk_safe.c

(limited to 'include/linux')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 3472cc6b7a60..37e933eeffb2 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -147,17 +147,17 @@ void early_printk(const char *s, ...) { }
 #endif
 
 #ifdef CONFIG_PRINTK_NMI
-extern void printk_nmi_init(void);
+extern void printk_safe_init(void);
 extern void printk_nmi_enter(void);
 extern void printk_nmi_exit(void);
-extern void printk_nmi_flush(void);
-extern void printk_nmi_flush_on_panic(void);
+extern void printk_safe_flush(void);
+extern void printk_safe_flush_on_panic(void);
 #else
-static inline void printk_nmi_init(void) { }
+static inline void printk_safe_init(void) { }
 static inline void printk_nmi_enter(void) { }
 static inline void printk_nmi_exit(void) { }
-static inline void printk_nmi_flush(void) { }
-static inline void printk_nmi_flush_on_panic(void) { }
+static inline void printk_safe_flush(void) { }
+static inline void printk_safe_flush_on_panic(void) { }
 #endif /* PRINTK_NMI */
 
 #ifdef CONFIG_PRINTK
diff --git a/init/Kconfig b/init/Kconfig
index 223b734abccd..760b7d0bc9d7 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -875,17 +875,19 @@ config LOG_CPU_MAX_BUF_SHIFT
 		     13 =>   8 KB for each CPU
 		     12 =>   4 KB for each CPU
 
-config NMI_LOG_BUF_SHIFT
-	int "Temporary per-CPU NMI log buffer size (12 => 4KB, 13 => 8KB)"
+config PRINTK_SAFE_LOG_BUF_SHIFT
+	int "Temporary per-CPU printk log buffer size (12 => 4KB, 13 => 8KB)"
 	range 10 21
 	default 13
-	depends on PRINTK_NMI
+	depends on PRINTK
 	help
-	  Select the size of a per-CPU buffer where NMI messages are temporary
-	  stored. They are copied to the main log buffer in a safe context
-	  to avoid a deadlock. The value defines the size as a power of 2.
+	  Select the size of an alternate printk per-CPU buffer where messages
+	  printed from usafe contexts are temporary stored. One example would
+	  be NMI messages, another one - printk recursion. The messages are
+	  copied to the main log buffer in a safe context to avoid a deadlock.
+	  The value defines the size as a power of 2.
 
-	  NMI messages are rare and limited. The largest one is when
+	  Those messages are rare and limited. The largest one is when
 	  a backtrace is printed. It usually fits into 4KB. Select
 	  8KB if you want to be on the safe side.
 
diff --git a/init/main.c b/init/main.c
index b0c9d6facef9..b4ca17d9bdeb 100644
--- a/init/main.c
+++ b/init/main.c
@@ -580,7 +580,7 @@ asmlinkage __visible void __init start_kernel(void)
 	timekeeping_init();
 	time_init();
 	sched_clock_postinit();
-	printk_nmi_init();
+	printk_safe_init();
 	perf_event_init();
 	profile_init();
 	call_function_init();
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 5617cc412444..14bb9eb76665 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -916,7 +916,7 @@ void crash_kexec(struct pt_regs *regs)
 	old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
 	if (old_cpu == PANIC_CPU_INVALID) {
 		/* This is the 1st CPU which comes here, so go ahead. */
-		printk_nmi_flush_on_panic();
+		printk_safe_flush_on_panic();
 		__crash_kexec(regs);
 
 		/*
diff --git a/kernel/panic.c b/kernel/panic.c
index c51edaa04fce..8c8efcd310e7 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -188,7 +188,7 @@ void panic(const char *fmt, ...)
 	 * Bypass the panic_cpu check and call __crash_kexec directly.
 	 */
 	if (!_crash_kexec_post_notifiers) {
-		printk_nmi_flush_on_panic();
+		printk_safe_flush_on_panic();
 		__crash_kexec(NULL);
 
 		/*
@@ -213,7 +213,7 @@ void panic(const char *fmt, ...)
 	atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
 
 	/* Call flush even twice. It tries harder with a single online CPU */
-	printk_nmi_flush_on_panic();
+	printk_safe_flush_on_panic();
 	kmsg_dump(KMSG_DUMP_PANIC);
 
 	/*
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index abb0042a427b..607928119f26 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,3 +1,3 @@
 obj-y	= printk.o
-obj-$(CONFIG_PRINTK_NMI)		+= nmi.o
+obj-$(CONFIG_PRINTK_NMI)		+= printk_safe.o
 obj-$(CONFIG_A11Y_BRAILLE_CONSOLE)	+= braille.o
diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c
deleted file mode 100644
index f011aaef583c..000000000000
--- a/kernel/printk/nmi.c
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
- * nmi.c - Safe printk in NMI context
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/preempt.h>
-#include <linux/spinlock.h>
-#include <linux/debug_locks.h>
-#include <linux/smp.h>
-#include <linux/cpumask.h>
-#include <linux/irq_work.h>
-#include <linux/printk.h>
-
-#include "internal.h"
-
-/*
- * printk() could not take logbuf_lock in NMI context. Instead,
- * it uses an alternative implementation that temporary stores
- * the strings into a per-CPU buffer. The content of the buffer
- * is later flushed into the main ring buffer via IRQ work.
- *
- * The alternative implementation is chosen transparently
- * via @printk_func per-CPU variable.
- *
- * The implementation allows to flush the strings also from another CPU.
- * There are situations when we want to make sure that all buffers
- * were handled or when IRQs are blocked.
- */
-DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
-static int printk_nmi_irq_ready;
-atomic_t nmi_message_lost;
-
-#define NMI_LOG_BUF_LEN ((1 << CONFIG_NMI_LOG_BUF_SHIFT) -		\
-			 sizeof(atomic_t) - sizeof(struct irq_work))
-
-struct nmi_seq_buf {
-	atomic_t		len;	/* length of written data */
-	struct irq_work		work;	/* IRQ work that flushes the buffer */
-	unsigned char		buffer[NMI_LOG_BUF_LEN];
-};
-static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
-
-/*
- * Safe printk() for NMI context. It uses a per-CPU buffer to
- * store the message. NMIs are not nested, so there is always only
- * one writer running. But the buffer might get flushed from another
- * CPU, so we need to be careful.
- */
-static int vprintk_nmi(const char *fmt, va_list args)
-{
-	struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
-	int add = 0;
-	size_t len;
-
-again:
-	len = atomic_read(&s->len);
-
-	/* The trailing '\0' is not counted into len. */
-	if (len >= sizeof(s->buffer) - 1) {
-		atomic_inc(&nmi_message_lost);
-		return 0;
-	}
-
-	/*
-	 * Make sure that all old data have been read before the buffer was
-	 * reseted. This is not needed when we just append data.
-	 */
-	if (!len)
-		smp_rmb();
-
-	add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
-
-	/*
-	 * Do it once again if the buffer has been flushed in the meantime.
-	 * Note that atomic_cmpxchg() is an implicit memory barrier that
-	 * makes sure that the data were written before updating s->len.
-	 */
-	if (atomic_cmpxchg(&s->len, len, len + add) != len)
-		goto again;
-
-	/* Get flushed in a more safe context. */
-	if (add && printk_nmi_irq_ready) {
-		/* Make sure that IRQ work is really initialized. */
-		smp_rmb();
-		irq_work_queue(&s->work);
-	}
-
-	return add;
-}
-
-static void printk_nmi_flush_line(const char *text, int len)
-{
-	/*
-	 * The buffers are flushed in NMI only on panic.  The messages must
-	 * go only into the ring buffer at this stage.  Consoles will get
-	 * explicitly called later when a crashdump is not generated.
-	 */
-	if (in_nmi())
-		printk_deferred("%.*s", len, text);
-	else
-		printk("%.*s", len, text);
-
-}
-
-/* printk part of the temporary buffer line by line */
-static int printk_nmi_flush_buffer(const char *start, size_t len)
-{
-	const char *c, *end;
-	bool header;
-
-	c = start;
-	end = start + len;
-	header = true;
-
-	/* Print line by line. */
-	while (c < end) {
-		if (*c == '\n') {
-			printk_nmi_flush_line(start, c - start + 1);
-			start = ++c;
-			header = true;
-			continue;
-		}
-
-		/* Handle continuous lines or missing new line. */
-		if ((c + 1 < end) && printk_get_level(c)) {
-			if (header) {
-				c = printk_skip_level(c);
-				continue;
-			}
-
-			printk_nmi_flush_line(start, c - start);
-			start = c++;
-			header = true;
-			continue;
-		}
-
-		header = false;
-		c++;
-	}
-
-	/* Check if there was a partial line. Ignore pure header. */
-	if (start < end && !header) {
-		static const char newline[] = KERN_CONT "\n";
-
-		printk_nmi_flush_line(start, end - start);
-		printk_nmi_flush_line(newline, strlen(newline));
-	}
-
-	return len;
-}
-
-/*
- * Flush data from the associated per_CPU buffer. The function
- * can be called either via IRQ work or independently.
- */
-static void __printk_nmi_flush(struct irq_work *work)
-{
-	static raw_spinlock_t read_lock =
-		__RAW_SPIN_LOCK_INITIALIZER(read_lock);
-	struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);
-	unsigned long flags;
-	size_t len;
-	int i;
-
-	/*
-	 * The lock has two functions. First, one reader has to flush all
-	 * available message to make the lockless synchronization with
-	 * writers easier. Second, we do not want to mix messages from
-	 * different CPUs. This is especially important when printing
-	 * a backtrace.
-	 */
-	raw_spin_lock_irqsave(&read_lock, flags);
-
-	i = 0;
-more:
-	len = atomic_read(&s->len);
-
-	/*
-	 * This is just a paranoid check that nobody has manipulated
-	 * the buffer an unexpected way. If we printed something then
-	 * @len must only increase. Also it should never overflow the
-	 * buffer size.
-	 */
-	if ((i && i >= len) || len > sizeof(s->buffer)) {
-		const char *msg = "printk_nmi_flush: internal error\n";
-
-		printk_nmi_flush_line(msg, strlen(msg));
-		len = 0;
-	}
-
-	if (!len)
-		goto out; /* Someone else has already flushed the buffer. */
-
-	/* Make sure that data has been written up to the @len */
-	smp_rmb();
-	i += printk_nmi_flush_buffer(s->buffer + i, len - i);
-
-	/*
-	 * Check that nothing has got added in the meantime and truncate
-	 * the buffer. Note that atomic_cmpxchg() is an implicit memory
-	 * barrier that makes sure that the data were copied before
-	 * updating s->len.
-	 */
-	if (atomic_cmpxchg(&s->len, len, 0) != len)
-		goto more;
-
-out:
-	raw_spin_unlock_irqrestore(&read_lock, flags);
-}
-
-/**
- * printk_nmi_flush - flush all per-cpu nmi buffers.
- *
- * The buffers are flushed automatically via IRQ work. This function
- * is useful only when someone wants to be sure that all buffers have
- * been flushed at some point.
- */
-void printk_nmi_flush(void)
-{
-	int cpu;
-
-	for_each_possible_cpu(cpu)
-		__printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work);
-}
-
-/**
- * printk_nmi_flush_on_panic - flush all per-cpu nmi buffers when the system
- *	goes down.
- *
- * Similar to printk_nmi_flush() but it can be called even in NMI context when
- * the system goes down. It does the best effort to get NMI messages into
- * the main ring buffer.
- *
- * Note that it could try harder when there is only one CPU online.
- */
-void printk_nmi_flush_on_panic(void)
-{
-	/*
-	 * Make sure that we could access the main ring buffer.
-	 * Do not risk a double release when more CPUs are up.
-	 */
-	if (in_nmi() && raw_spin_is_locked(&logbuf_lock)) {
-		if (num_online_cpus() > 1)
-			return;
-
-		debug_locks_off();
-		raw_spin_lock_init(&logbuf_lock);
-	}
-
-	printk_nmi_flush();
-}
-
-void __init printk_nmi_init(void)
-{
-	int cpu;
-
-	for_each_possible_cpu(cpu) {
-		struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu);
-
-		init_irq_work(&s->work, __printk_nmi_flush);
-	}
-
-	/* Make sure that IRQ works are initialized before enabling. */
-	smp_wmb();
-	printk_nmi_irq_ready = 1;
-
-	/* Flush pending messages that did not have scheduled IRQ works. */
-	printk_nmi_flush();
-}
-
-void printk_nmi_enter(void)
-{
-	this_cpu_write(printk_func, vprintk_nmi);
-}
-
-void printk_nmi_exit(void)
-{
-	this_cpu_write(printk_func, vprintk_default);
-}
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
new file mode 100644
index 000000000000..fc80359dcd78
--- /dev/null
+++ b/kernel/printk/printk_safe.c
@@ -0,0 +1,291 @@
+/*
+ * printk_safe.c - Safe printk in NMI context
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/preempt.h>
+#include <linux/spinlock.h>
+#include <linux/debug_locks.h>
+#include <linux/smp.h>
+#include <linux/cpumask.h>
+#include <linux/irq_work.h>
+#include <linux/printk.h>
+
+#include "internal.h"
+
+/*
+ * printk() could not take logbuf_lock in NMI context. Instead,
+ * it uses an alternative implementation that temporary stores
+ * the strings into a per-CPU buffer. The content of the buffer
+ * is later flushed into the main ring buffer via IRQ work.
+ *
+ * The alternative implementation is chosen transparently
+ * via @printk_func per-CPU variable.
+ *
+ * The implementation allows to flush the strings also from another CPU.
+ * There are situations when we want to make sure that all buffers
+ * were handled or when IRQs are blocked.
+ */
+DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
+static int printk_safe_irq_ready;
+atomic_t nmi_message_lost;
+
+#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) -	\
+			 sizeof(atomic_t) - sizeof(struct irq_work))
+
+struct printk_safe_seq_buf {
+	atomic_t		len;	/* length of written data */
+	struct irq_work		work;	/* IRQ work that flushes the buffer */
+	unsigned char		buffer[SAFE_LOG_BUF_LEN];
+};
+static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq);
+
+/*
+ * Safe printk() for NMI context. It uses a per-CPU buffer to
+ * store the message. NMIs are not nested, so there is always only
+ * one writer running. But the buffer might get flushed from another
+ * CPU, so we need to be careful.
+ */
+static int vprintk_nmi(const char *fmt, va_list args)
+{
+	struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
+	int add = 0;
+	size_t len;
+
+again:
+	len = atomic_read(&s->len);
+
+	/* The trailing '\0' is not counted into len. */
+	if (len >= sizeof(s->buffer) - 1) {
+		atomic_inc(&nmi_message_lost);
+		return 0;
+	}
+
+	/*
+	 * Make sure that all old data have been read before the buffer was
+	 * reseted. This is not needed when we just append data.
+	 */
+	if (!len)
+		smp_rmb();
+
+	add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
+
+	/*
+	 * Do it once again if the buffer has been flushed in the meantime.
+	 * Note that atomic_cmpxchg() is an implicit memory barrier that
+	 * makes sure that the data were written before updating s->len.
+	 */
+	if (atomic_cmpxchg(&s->len, len, len + add) != len)
+		goto again;
+
+	/* Get flushed in a more safe context. */
+	if (add && printk_safe_irq_ready) {
+		/* Make sure that IRQ work is really initialized. */
+		smp_rmb();
+		irq_work_queue(&s->work);
+	}
+
+	return add;
+}
+
+static void printk_safe_flush_line(const char *text, int len)
+{
+	/*
+	 * The buffers are flushed in NMI only on panic.  The messages must
+	 * go only into the ring buffer at this stage.  Consoles will get
+	 * explicitly called later when a crashdump is not generated.
+	 */
+	if (in_nmi())
+		printk_deferred("%.*s", len, text);
+	else
+		printk("%.*s", len, text);
+}
+
+/* printk part of the temporary buffer line by line */
+static int printk_safe_flush_buffer(const char *start, size_t len)
+{
+	const char *c, *end;
+	bool header;
+
+	c = start;
+	end = start + len;
+	header = true;
+
+	/* Print line by line. */
+	while (c < end) {
+		if (*c == '\n') {
+			printk_safe_flush_line(start, c - start + 1);
+			start = ++c;
+			header = true;
+			continue;
+		}
+
+		/* Handle continuous lines or missing new line. */
+		if ((c + 1 < end) && printk_get_level(c)) {
+			if (header) {
+				c = printk_skip_level(c);
+				continue;
+			}
+
+			printk_safe_flush_line(start, c - start);
+			start = c++;
+			header = true;
+			continue;
+		}
+
+		header = false;
+		c++;
+	}
+
+	/* Check if there was a partial line. Ignore pure header. */
+	if (start < end && !header) {
+		static const char newline[] = KERN_CONT "\n";
+
+		printk_safe_flush_line(start, end - start);
+		printk_safe_flush_line(newline, strlen(newline));
+	}
+
+	return len;
+}
+
+/*
+ * Flush data from the associated per_CPU buffer. The function
+ * can be called either via IRQ work or independently.
+ */
+static void __printk_safe_flush(struct irq_work *work)
+{
+	static raw_spinlock_t read_lock =
+		__RAW_SPIN_LOCK_INITIALIZER(read_lock);
+	struct printk_safe_seq_buf *s =
+		container_of(work, struct printk_safe_seq_buf, work);
+	unsigned long flags;
+	size_t len;
+	int i;
+
+	/*
+	 * The lock has two functions. First, one reader has to flush all
+	 * available message to make the lockless synchronization with
+	 * writers easier. Second, we do not want to mix messages from
+	 * different CPUs. This is especially important when printing
+	 * a backtrace.
+	 */
+	raw_spin_lock_irqsave(&read_lock, flags);
+
+	i = 0;
+more:
+	len = atomic_read(&s->len);
+
+	/*
+	 * This is just a paranoid check that nobody has manipulated
+	 * the buffer an unexpected way. If we printed something then
+	 * @len must only increase. Also it should never overflow the
+	 * buffer size.
+	 */
+	if ((i && i >= len) || len > sizeof(s->buffer)) {
+		const char *msg = "printk_safe_flush: internal error\n";
+
+		printk_safe_flush_line(msg, strlen(msg));
+		len = 0;
+	}
+
+	if (!len)
+		goto out; /* Someone else has already flushed the buffer. */
+
+	/* Make sure that data has been written up to the @len */
+	smp_rmb();
+	i += printk_safe_flush_buffer(s->buffer + i, len - i);
+
+	/*
+	 * Check that nothing has got added in the meantime and truncate
+	 * the buffer. Note that atomic_cmpxchg() is an implicit memory
+	 * barrier that makes sure that the data were copied before
+	 * updating s->len.
+	 */
+	if (atomic_cmpxchg(&s->len, len, 0) != len)
+		goto more;
+
+out:
+	raw_spin_unlock_irqrestore(&read_lock, flags);
+}
+
+/**
+ * printk_safe_flush - flush all per-cpu nmi buffers.
+ *
+ * The buffers are flushed automatically via IRQ work. This function
+ * is useful only when someone wants to be sure that all buffers have
+ * been flushed at some point.
+ */
+void printk_safe_flush(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		__printk_safe_flush(&per_cpu(nmi_print_seq, cpu).work);
+}
+
+/**
+ * printk_safe_flush_on_panic - flush all per-cpu nmi buffers when the system
+ *	goes down.
+ *
+ * Similar to printk_safe_flush() but it can be called even in NMI context when
+ * the system goes down. It does the best effort to get NMI messages into
+ * the main ring buffer.
+ *
+ * Note that it could try harder when there is only one CPU online.
+ */
+void printk_safe_flush_on_panic(void)
+{
+	/*
+	 * Make sure that we could access the main ring buffer.
+	 * Do not risk a double release when more CPUs are up.
+	 */
+	if (in_nmi() && raw_spin_is_locked(&logbuf_lock)) {
+		if (num_online_cpus() > 1)
+			return;
+
+		debug_locks_off();
+		raw_spin_lock_init(&logbuf_lock);
+	}
+
+	printk_safe_flush();
+}
+
+void __init printk_safe_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct printk_safe_seq_buf *s = &per_cpu(nmi_print_seq, cpu);
+
+		init_irq_work(&s->work, __printk_safe_flush);
+	}
+
+	/* Make sure that IRQ works are initialized before enabling. */
+	smp_wmb();
+	printk_safe_irq_ready = 1;
+
+	/* Flush pending messages that did not have scheduled IRQ works. */
+	printk_safe_flush();
+}
+
+void printk_nmi_enter(void)
+{
+	this_cpu_write(printk_func, vprintk_nmi);
+}
+
+void printk_nmi_exit(void)
+{
+	this_cpu_write(printk_func, vprintk_default);
+}
diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c
index 75554754eadf..5f7999eacad5 100644
--- a/lib/nmi_backtrace.c
+++ b/lib/nmi_backtrace.c
@@ -77,7 +77,7 @@ void nmi_trigger_cpumask_backtrace(const cpumask_t *mask,
 	 * Force flush any remote buffers that might be stuck in IRQ context
 	 * and therefore could not run their irq_work.
 	 */
-	printk_nmi_flush();
+	printk_safe_flush();
 
 	clear_bit_unlock(0, &backtrace_flag);
 	put_cpu();
-- 
cgit v1.2.3


From 099f1c84c0052ec1b27f1c3942eed5830d86bdbb Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Date: Tue, 27 Dec 2016 23:16:06 +0900
Subject: printk: introduce per-cpu safe_print seq buffer

This patch extends the idea of NMI per-cpu buffers to regions
that may cause recursive printk() calls and possible deadlocks.
Namely, printk() can't handle printk calls from schedule code
or printk() calls from lock debugging code (spin_dump() for instance);
because those may be called with `sem->lock' already taken or any
other `critical' locks (p->pi_lock, etc.). An example of deadlock
can be

 vprintk_emit()
  console_unlock()
   up()                        << raw_spin_lock_irqsave(&sem->lock, flags);
    wake_up_process()
     try_to_wake_up()
      ttwu_queue()
       ttwu_activate()
        activate_task()
         enqueue_task()
          enqueue_task_fair()
           cfs_rq_of()
            task_of()
             WARN_ON_ONCE(!entity_is_task(se))
              vprintk_emit()
               console_trylock()
                down_trylock()
                 raw_spin_lock_irqsave(&sem->lock, flags)
                 ^^^^ deadlock

and some other cases.

Just like in NMI implementation, the solution uses a per-cpu
`printk_func' pointer to 'redirect' printk() calls to a 'safe'
callback, that store messages in a per-cpu buffer and flushes
them back to logbuf buffer later.

Usage example:

 printk()
  printk_safe_enter_irqsave(flags)
  //
  //  any printk() call from here will endup in vprintk_safe(),
  //  that stores messages in a special per-CPU buffer.
  //
  printk_safe_exit_irqrestore(flags)

The 'redirection' mechanism, though, has been reworked, as suggested
by Petr Mladek. Instead of using a per-cpu @print_func callback we now
keep a per-cpu printk-context variable and call either default or nmi
vprintk function depending on its value. printk_nmi_entrer/exit and
printk_safe_enter/exit, thus, just set/celar corresponding bits in
printk-context functions.

The patch only adds printk_safe support, we don't use it yet.

Link: http://lkml.kernel.org/r/20161227141611.940-4-sergey.senozhatsky@gmail.com
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Calvin Owens <calvinowens@fb.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Peter Hurley <peter@hurleysoftware.com>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/printk.h      |  21 +++++---
 kernel/printk/Makefile      |   2 +-
 kernel/printk/internal.h    |  76 ++++++++++++++++++--------
 kernel/printk/printk.c      |   3 --
 kernel/printk/printk_safe.c | 128 +++++++++++++++++++++++++++++++++++---------
 5 files changed, 172 insertions(+), 58 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 37e933eeffb2..571257e0f53d 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -147,17 +147,11 @@ void early_printk(const char *s, ...) { }
 #endif
 
 #ifdef CONFIG_PRINTK_NMI
-extern void printk_safe_init(void);
 extern void printk_nmi_enter(void);
 extern void printk_nmi_exit(void);
-extern void printk_safe_flush(void);
-extern void printk_safe_flush_on_panic(void);
 #else
-static inline void printk_safe_init(void) { }
 static inline void printk_nmi_enter(void) { }
 static inline void printk_nmi_exit(void) { }
-static inline void printk_safe_flush(void) { }
-static inline void printk_safe_flush_on_panic(void) { }
 #endif /* PRINTK_NMI */
 
 #ifdef CONFIG_PRINTK
@@ -209,6 +203,9 @@ void __init setup_log_buf(int early);
 __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...);
 void dump_stack_print_info(const char *log_lvl);
 void show_regs_print_info(const char *log_lvl);
+extern void printk_safe_init(void);
+extern void printk_safe_flush(void);
+extern void printk_safe_flush_on_panic(void);
 #else
 static inline __printf(1, 0)
 int vprintk(const char *s, va_list args)
@@ -268,6 +265,18 @@ static inline void dump_stack_print_info(const char *log_lvl)
 static inline void show_regs_print_info(const char *log_lvl)
 {
 }
+
+static inline void printk_safe_init(void)
+{
+}
+
+static inline void printk_safe_flush(void)
+{
+}
+
+static inline void printk_safe_flush_on_panic(void)
+{
+}
 #endif
 
 extern asmlinkage void dump_stack(void) __cold;
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index 607928119f26..4a2ffc39eb95 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,3 +1,3 @@
 obj-y	= printk.o
-obj-$(CONFIG_PRINTK_NMI)		+= printk_safe.o
+obj-$(CONFIG_PRINTK)	+= printk_safe.o
 obj-$(CONFIG_A11Y_BRAILLE_CONSOLE)	+= braille.o
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 7fd2838fa417..c65e36509f3b 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -16,26 +16,8 @@
  */
 #include <linux/percpu.h>
 
-typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args);
-
-int __printf(1, 0) vprintk_default(const char *fmt, va_list args);
-
 #ifdef CONFIG_PRINTK_NMI
 
-extern raw_spinlock_t logbuf_lock;
-
-/*
- * printk() could not take logbuf_lock in NMI context. Instead,
- * it temporary stores the strings into a per-CPU buffer.
- * The alternative implementation is chosen transparently
- * via per-CPU variable.
- */
-DECLARE_PER_CPU(printk_func_t, printk_func);
-static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
-{
-	return this_cpu_read(printk_func)(fmt, args);
-}
-
 extern atomic_t nmi_message_lost;
 static inline int get_nmi_message_lost(void)
 {
@@ -44,14 +26,62 @@ static inline int get_nmi_message_lost(void)
 
 #else /* CONFIG_PRINTK_NMI */
 
-static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
-{
-	return vprintk_default(fmt, args);
-}
-
 static inline int get_nmi_message_lost(void)
 {
 	return 0;
 }
 
 #endif /* CONFIG_PRINTK_NMI */
+
+#ifdef CONFIG_PRINTK
+
+#define PRINTK_SAFE_CONTEXT_MASK	0x7fffffff
+#define PRINTK_NMI_CONTEXT_MASK	0x80000000
+
+extern raw_spinlock_t logbuf_lock;
+
+__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
+__printf(1, 0) int vprintk_func(const char *fmt, va_list args);
+void __printk_safe_enter(void);
+void __printk_safe_exit(void);
+
+#define printk_safe_enter_irqsave(flags)	\
+	do {					\
+		local_irq_save(flags);		\
+		__printk_safe_enter();		\
+	} while (0)
+
+#define printk_safe_exit_irqrestore(flags)	\
+	do {					\
+		__printk_safe_exit();		\
+		local_irq_restore(flags);	\
+	} while (0)
+
+#define printk_safe_enter_irq()		\
+	do {					\
+		local_irq_disable();		\
+		__printk_safe_enter();		\
+	} while (0)
+
+#define printk_safe_exit_irq()			\
+	do {					\
+		__printk_safe_exit();		\
+		local_irq_enable();		\
+	} while (0)
+
+#else
+
+__printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; }
+
+/*
+ * In !PRINTK builds we still export logbuf_lock spin_lock, console_sem
+ * semaphore and some of console functions (console_unlock()/etc.), so
+ * printk-safe must preserve the existing local IRQ guarantees.
+ */
+#define printk_safe_enter_irqsave(flags) local_irq_save(flags)
+#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)
+
+#define printk_safe_enter_irq() local_irq_disable()
+#define printk_safe_exit_irq() local_irq_enable()
+
+#endif /* CONFIG_PRINTK */
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 106843a83b63..f73046f7a6df 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1902,9 +1902,6 @@ static size_t msg_print_text(const struct printk_log *msg,
 			     bool syslog, char *buf, size_t size) { return 0; }
 static bool suppress_message_printing(int level) { return false; }
 
-/* Still needs to be defined for users */
-DEFINE_PER_CPU(printk_func_t, printk_func);
-
 #endif /* CONFIG_PRINTK */
 
 #ifdef CONFIG_EARLY_PRINTK
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index fc80359dcd78..efc89a4e9df5 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -1,5 +1,5 @@
 /*
- * printk_safe.c - Safe printk in NMI context
+ * printk_safe.c - Safe printk for printk-deadlock-prone contexts
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
@@ -32,13 +32,13 @@
  * is later flushed into the main ring buffer via IRQ work.
  *
  * The alternative implementation is chosen transparently
- * via @printk_func per-CPU variable.
+ * by examinig current printk() context mask stored in @printk_context
+ * per-CPU variable.
  *
  * The implementation allows to flush the strings also from another CPU.
  * There are situations when we want to make sure that all buffers
  * were handled or when IRQs are blocked.
  */
-DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
 static int printk_safe_irq_ready;
 atomic_t nmi_message_lost;
 
@@ -50,18 +50,28 @@ struct printk_safe_seq_buf {
 	struct irq_work		work;	/* IRQ work that flushes the buffer */
 	unsigned char		buffer[SAFE_LOG_BUF_LEN];
 };
+
+static DEFINE_PER_CPU(struct printk_safe_seq_buf, safe_print_seq);
+static DEFINE_PER_CPU(int, printk_context);
+
+#ifdef CONFIG_PRINTK_NMI
 static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq);
+#endif
 
 /*
- * Safe printk() for NMI context. It uses a per-CPU buffer to
- * store the message. NMIs are not nested, so there is always only
- * one writer running. But the buffer might get flushed from another
- * CPU, so we need to be careful.
+ * Add a message to per-CPU context-dependent buffer. NMI and printk-safe
+ * have dedicated buffers, because otherwise printk-safe preempted by
+ * NMI-printk would have overwritten the NMI messages.
+ *
+ * The messages are fushed from irq work (or from panic()), possibly,
+ * from other CPU, concurrently with printk_safe_log_store(). Should this
+ * happen, printk_safe_log_store() will notice the buffer->len mismatch
+ * and repeat the write.
  */
-static int vprintk_nmi(const char *fmt, va_list args)
+static int printk_safe_log_store(struct printk_safe_seq_buf *s,
+				 const char *fmt, va_list args)
 {
-	struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
-	int add = 0;
+	int add;
 	size_t len;
 
 again:
@@ -74,8 +84,8 @@ again:
 	}
 
 	/*
-	 * Make sure that all old data have been read before the buffer was
-	 * reseted. This is not needed when we just append data.
+	 * Make sure that all old data have been read before the buffer
+	 * was reset. This is not needed when we just append data.
 	 */
 	if (!len)
 		smp_rmb();
@@ -161,7 +171,7 @@ static int printk_safe_flush_buffer(const char *start, size_t len)
 }
 
 /*
- * Flush data from the associated per_CPU buffer. The function
+ * Flush data from the associated per-CPU buffer. The function
  * can be called either via IRQ work or independently.
  */
 static void __printk_safe_flush(struct irq_work *work)
@@ -231,8 +241,12 @@ void printk_safe_flush(void)
 {
 	int cpu;
 
-	for_each_possible_cpu(cpu)
+	for_each_possible_cpu(cpu) {
+#ifdef CONFIG_PRINTK_NMI
 		__printk_safe_flush(&per_cpu(nmi_print_seq, cpu).work);
+#endif
+		__printk_safe_flush(&per_cpu(safe_print_seq, cpu).work);
+	}
 }
 
 /**
@@ -262,14 +276,88 @@ void printk_safe_flush_on_panic(void)
 	printk_safe_flush();
 }
 
+#ifdef CONFIG_PRINTK_NMI
+/*
+ * Safe printk() for NMI context. It uses a per-CPU buffer to
+ * store the message. NMIs are not nested, so there is always only
+ * one writer running. But the buffer might get flushed from another
+ * CPU, so we need to be careful.
+ */
+static int vprintk_nmi(const char *fmt, va_list args)
+{
+	struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
+
+	return printk_safe_log_store(s, fmt, args);
+}
+
+void printk_nmi_enter(void)
+{
+	this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK);
+}
+
+void printk_nmi_exit(void)
+{
+	this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK);
+}
+
+#else
+
+static int vprintk_nmi(const char *fmt, va_list args)
+{
+	return 0;
+}
+
+#endif /* CONFIG_PRINTK_NMI */
+
+/*
+ * Lock-less printk(), to avoid deadlocks should the printk() recurse
+ * into itself. It uses a per-CPU buffer to store the message, just like
+ * NMI.
+ */
+static int vprintk_safe(const char *fmt, va_list args)
+{
+	struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq);
+
+	return printk_safe_log_store(s, fmt, args);
+}
+
+/* Can be preempted by NMI. */
+void __printk_safe_enter(void)
+{
+	this_cpu_inc(printk_context);
+}
+
+/* Can be preempted by NMI. */
+void __printk_safe_exit(void)
+{
+	this_cpu_dec(printk_context);
+}
+
+__printf(1, 0) int vprintk_func(const char *fmt, va_list args)
+{
+	if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
+		return vprintk_nmi(fmt, args);
+
+	if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK)
+		return vprintk_safe(fmt, args);
+
+	return vprintk_default(fmt, args);
+}
+
 void __init printk_safe_init(void)
 {
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
-		struct printk_safe_seq_buf *s = &per_cpu(nmi_print_seq, cpu);
+		struct printk_safe_seq_buf *s;
+
+		s = &per_cpu(safe_print_seq, cpu);
+		init_irq_work(&s->work, __printk_safe_flush);
 
+#ifdef CONFIG_PRINTK_NMI
+		s = &per_cpu(nmi_print_seq, cpu);
 		init_irq_work(&s->work, __printk_safe_flush);
+#endif
 	}
 
 	/* Make sure that IRQ works are initialized before enabling. */
@@ -279,13 +367,3 @@ void __init printk_safe_init(void)
 	/* Flush pending messages that did not have scheduled IRQ works. */
 	printk_safe_flush();
 }
-
-void printk_nmi_enter(void)
-{
-	this_cpu_write(printk_func, vprintk_nmi);
-}
-
-void printk_nmi_exit(void)
-{
-	this_cpu_write(printk_func, vprintk_default);
-}
-- 
cgit v1.2.3


From b44c4d3f1ee77e7947c525d6a907c0057fa238d2 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Fri, 20 Jan 2017 11:14:14 +0100
Subject: mfd: cros-ec: Update cros_ec_commands.h for buttons and switches

Add the defines for the new buttons and switches connected to the CrosEC.

Signed-off-by: Douglas Anderson <dianders@chromium.org>
Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/cros_ec_commands.h | 73 +++++++++++++++++++++++++++++++++++-
 1 file changed, 71 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index 1683003603f3..004dcf3560fb 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -1839,18 +1839,69 @@ struct ec_response_tmp006_get_raw {
  *
  * Returns raw data for keyboard cols; see ec_response_mkbp_info.cols for
  * expected response size.
+ *
+ * NOTE: This has been superseded by EC_CMD_MKBP_GET_NEXT_EVENT.  If you wish
+ * to obtain the instantaneous state, use EC_CMD_MKBP_INFO with the type
+ * EC_MKBP_INFO_CURRENT and event EC_MKBP_EVENT_KEY_MATRIX.
  */
 #define EC_CMD_MKBP_STATE 0x60
 
-/* Provide information about the matrix : number of rows and columns */
+/*
+ * Provide information about various MKBP things.  See enum ec_mkbp_info_type.
+ */
 #define EC_CMD_MKBP_INFO 0x61
 
 struct ec_response_mkbp_info {
 	uint32_t rows;
 	uint32_t cols;
-	uint8_t switches;
+	/* Formerly "switches", which was 0. */
+	uint8_t reserved;
 } __packed;
 
+struct ec_params_mkbp_info {
+	uint8_t info_type;
+	uint8_t event_type;
+} __packed;
+
+enum ec_mkbp_info_type {
+	/*
+	 * Info about the keyboard matrix: number of rows and columns.
+	 *
+	 * Returns struct ec_response_mkbp_info.
+	 */
+	EC_MKBP_INFO_KBD = 0,
+
+	/*
+	 * For buttons and switches, info about which specifically are
+	 * supported.  event_type must be set to one of the values in enum
+	 * ec_mkbp_event.
+	 *
+	 * For EC_MKBP_EVENT_BUTTON and EC_MKBP_EVENT_SWITCH, returns a 4 byte
+	 * bitmask indicating which buttons or switches are present.  See the
+	 * bit inidices below.
+	 */
+	EC_MKBP_INFO_SUPPORTED = 1,
+
+	/*
+	 * Instantaneous state of buttons and switches.
+	 *
+	 * event_type must be set to one of the values in enum ec_mkbp_event.
+	 *
+	 * For EC_MKBP_EVENT_KEY_MATRIX, returns uint8_t key_matrix[13]
+	 * indicating the current state of the keyboard matrix.
+	 *
+	 * For EC_MKBP_EVENT_HOST_EVENT, return uint32_t host_event, the raw
+	 * event state.
+	 *
+	 * For EC_MKBP_EVENT_BUTTON, returns uint32_t buttons, indicating the
+	 * state of supported buttons.
+	 *
+	 * For EC_MKBP_EVENT_SWITCH, returns uint32_t switches, indicating the
+	 * state of supported switches.
+	 */
+	EC_MKBP_INFO_CURRENT = 2,
+};
+
 /* Simulate key press */
 #define EC_CMD_MKBP_SIMULATE_KEY 0x62
 
@@ -1983,6 +2034,12 @@ enum ec_mkbp_event {
 	/* New Sensor FIFO data. The event data is fifo_info structure. */
 	EC_MKBP_EVENT_SENSOR_FIFO = 2,
 
+	/* The state of the non-matrixed buttons have changed. */
+	EC_MKBP_EVENT_BUTTON = 3,
+
+	/* The state of the switches have changed. */
+	EC_MKBP_EVENT_SWITCH = 4,
+
 	/* Number of MKBP events */
 	EC_MKBP_EVENT_COUNT,
 };
@@ -1992,6 +2049,9 @@ union ec_response_get_next_data {
 
 	/* Unaligned */
 	uint32_t  host_event;
+
+	uint32_t   buttons;
+	uint32_t   switches;
 } __packed;
 
 struct ec_response_get_next_event {
@@ -2000,6 +2060,15 @@ struct ec_response_get_next_event {
 	union ec_response_get_next_data data;
 } __packed;
 
+/* Bit indices for buttons and switches.*/
+/* Buttons */
+#define EC_MKBP_POWER_BUTTON	0
+#define EC_MKBP_VOL_UP		1
+#define EC_MKBP_VOL_DOWN	2
+
+/* Switches */
+#define EC_MKBP_LID_OPEN	0
+
 /*****************************************************************************/
 /* Temperature sensor commands */
 
-- 
cgit v1.2.3


From d3e1b617ae20c459627f501b4bc55b1ea91f662b Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Thu, 2 Feb 2017 17:41:28 -0800
Subject: i2c: allow specify device properties in i2c_board_info

With many drivers converting to using generic device properties, it is
useful to provide array of device properties when instantiating new i2c
client via i2c_board_info and have them automatically added to the device
in question.

Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Wolfram Sang <wsa@the-dreams.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/i2c/i2c-core.c | 16 +++++++++++++++-
 include/linux/i2c.h    |  3 +++
 2 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index 583e95042a21..f72292156faf 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -1335,15 +1335,29 @@ i2c_new_device(struct i2c_adapter *adap, struct i2c_board_info const *info)
 	client->dev.fwnode = info->fwnode;
 
 	i2c_dev_set_name(adap, client);
+
+	if (info->properties) {
+		status = device_add_properties(&client->dev, info->properties);
+		if (status) {
+			dev_err(&adap->dev,
+				"Failed to add properties to client %s: %d\n",
+				client->name, status);
+			goto out_err;
+		}
+	}
+
 	status = device_register(&client->dev);
 	if (status)
-		goto out_err;
+		goto out_free_props;
 
 	dev_dbg(&adap->dev, "client [%s] registered with bus id %s\n",
 		client->name, dev_name(&client->dev));
 
 	return client;
 
+out_free_props:
+	if (info->properties)
+		device_remove_properties(&client->dev);
 out_err:
 	dev_err(&adap->dev,
 		"Failed to register i2c client %s at 0x%02x (%d)\n",
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 4b45ec46161f..7b23a3316dcb 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -51,6 +51,7 @@ enum i2c_slave_event;
 typedef int (*i2c_slave_cb_t)(struct i2c_client *, enum i2c_slave_event, u8 *);
 
 struct module;
+struct property_entry;
 
 #if defined(CONFIG_I2C) || defined(CONFIG_I2C_MODULE)
 /*
@@ -299,6 +300,7 @@ static inline int i2c_slave_event(struct i2c_client *client,
  * @archdata: copied into i2c_client.dev.archdata
  * @of_node: pointer to OpenFirmware device node
  * @fwnode: device node supplied by the platform firmware
+ * @properties: additional device properties for the device
  * @irq: stored in i2c_client.irq
  *
  * I2C doesn't actually support hardware probing, although controllers and
@@ -320,6 +322,7 @@ struct i2c_board_info {
 	struct dev_archdata	*archdata;
 	struct device_node *of_node;
 	struct fwnode_handle *fwnode;
+	const struct property_entry *properties;
 	int		irq;
 };
 
-- 
cgit v1.2.3


From 4f46de9d2eda184b0e49e32bb2a92c6a06b336f5 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Tue, 20 Dec 2016 15:50:14 +0000
Subject: irqchip/gic-v3-its: Drop deprecated GITS_BASER_TYPE_CPU

During the development of the GICv3/v4 architecture, it was
envisaged to have a CPU table, though the use for it was
never completely clear (the collection table serves that role
pretty well). It ended being dropped before the specification
was published, though it lived on in the driver.

In order to avoid people scratching their head too much, let's do
the same in the kernel.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 drivers/irqchip/irq-gic-v3-its.c   | 2 +-
 include/linux/irqchip/arm-gic-v3.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 4e01103c6a06..fff024b89a2a 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -822,7 +822,7 @@ static int __init its_alloc_lpi_tables(void)
 static const char *its_base_type_string[] = {
 	[GITS_BASER_TYPE_DEVICE]	= "Devices",
 	[GITS_BASER_TYPE_VCPU]		= "Virtual CPUs",
-	[GITS_BASER_TYPE_CPU]		= "Physical CPUs",
+	[GITS_BASER_TYPE_RESERVED3]	= "Reserved (3)",
 	[GITS_BASER_TYPE_COLLECTION]	= "Interrupt Collections",
 	[GITS_BASER_TYPE_RESERVED5] 	= "Reserved (5)",
 	[GITS_BASER_TYPE_RESERVED6] 	= "Reserved (6)",
diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index e808f8ae6f14..b6e7629f8bb0 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -306,7 +306,7 @@
 #define GITS_BASER_TYPE_NONE		0
 #define GITS_BASER_TYPE_DEVICE		1
 #define GITS_BASER_TYPE_VCPU		2
-#define GITS_BASER_TYPE_CPU		3
+#define GITS_BASER_TYPE_RESERVED3	3
 #define GITS_BASER_TYPE_COLLECTION	4
 #define GITS_BASER_TYPE_RESERVED5	5
 #define GITS_BASER_TYPE_RESERVED6	6
-- 
cgit v1.2.3


From 6a25ad3a9f9832f24df7987227122b8359f05c8e Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Tue, 20 Dec 2016 15:52:26 +0000
Subject: irqchip/gic-v3-its: Rename MAPVI to MAPTI

Back in the days when the GICv3/v4 architecture was drafted,
the command to an event to an LPI number was called MAPVI.
Later on, and to avoid confusion with the GICv4 command VMAPI,
it was renamed MAPTI. We've carried the old name for a long
time, but it gets in the way of people reading the code in
the light of the public architecture specification.

Just repaint all the references and kill the old definition.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 drivers/irqchip/irq-gic-v3-its.c   | 28 ++++++++++++++--------------
 include/linux/irqchip/arm-gic-v3.h |  2 --
 2 files changed, 14 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index fff024b89a2a..fcbba10c0ed9 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -161,7 +161,7 @@ struct its_cmd_desc {
 			struct its_device *dev;
 			u32 phys_id;
 			u32 event_id;
-		} its_mapvi_cmd;
+		} its_mapti_cmd;
 
 		struct {
 			struct its_device *dev;
@@ -287,18 +287,18 @@ static struct its_collection *its_build_mapc_cmd(struct its_cmd_block *cmd,
 	return desc->its_mapc_cmd.col;
 }
 
-static struct its_collection *its_build_mapvi_cmd(struct its_cmd_block *cmd,
+static struct its_collection *its_build_mapti_cmd(struct its_cmd_block *cmd,
 						  struct its_cmd_desc *desc)
 {
 	struct its_collection *col;
 
-	col = dev_event_to_col(desc->its_mapvi_cmd.dev,
-			       desc->its_mapvi_cmd.event_id);
+	col = dev_event_to_col(desc->its_mapti_cmd.dev,
+			       desc->its_mapti_cmd.event_id);
 
-	its_encode_cmd(cmd, GITS_CMD_MAPVI);
-	its_encode_devid(cmd, desc->its_mapvi_cmd.dev->device_id);
-	its_encode_event_id(cmd, desc->its_mapvi_cmd.event_id);
-	its_encode_phys_id(cmd, desc->its_mapvi_cmd.phys_id);
+	its_encode_cmd(cmd, GITS_CMD_MAPTI);
+	its_encode_devid(cmd, desc->its_mapti_cmd.dev->device_id);
+	its_encode_event_id(cmd, desc->its_mapti_cmd.event_id);
+	its_encode_phys_id(cmd, desc->its_mapti_cmd.phys_id);
 	its_encode_collection(cmd, col->col_id);
 
 	its_fixup_cmd(cmd);
@@ -529,15 +529,15 @@ static void its_send_mapc(struct its_node *its, struct its_collection *col,
 	its_send_single_command(its, its_build_mapc_cmd, &desc);
 }
 
-static void its_send_mapvi(struct its_device *dev, u32 irq_id, u32 id)
+static void its_send_mapti(struct its_device *dev, u32 irq_id, u32 id)
 {
 	struct its_cmd_desc desc;
 
-	desc.its_mapvi_cmd.dev = dev;
-	desc.its_mapvi_cmd.phys_id = irq_id;
-	desc.its_mapvi_cmd.event_id = id;
+	desc.its_mapti_cmd.dev = dev;
+	desc.its_mapti_cmd.phys_id = irq_id;
+	desc.its_mapti_cmd.event_id = id;
 
-	its_send_single_command(dev->its, its_build_mapvi_cmd, &desc);
+	its_send_single_command(dev->its, its_build_mapti_cmd, &desc);
 }
 
 static void its_send_movi(struct its_device *dev,
@@ -1496,7 +1496,7 @@ static void its_irq_domain_activate(struct irq_domain *domain,
 	its_dev->event_map.col_map[event] = cpumask_first(cpu_mask);
 
 	/* Map the GIC IRQ and event to the device */
-	its_send_mapvi(its_dev, d->hwirq, event);
+	its_send_mapti(its_dev, d->hwirq, event);
 }
 
 static void its_irq_domain_deactivate(struct irq_domain *domain,
diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index b6e7629f8bb0..400ed7ad2bff 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -320,8 +320,6 @@
 #define GITS_CMD_MAPD			0x08
 #define GITS_CMD_MAPC			0x09
 #define GITS_CMD_MAPTI			0x0a
-/* older GIC documentation used MAPVI for this command */
-#define GITS_CMD_MAPVI			GITS_CMD_MAPTI
 #define GITS_CMD_MAPI			0x0b
 #define GITS_CMD_MOVI			0x01
 #define GITS_CMD_DISCARD		0x0f
-- 
cgit v1.2.3


From e3c484b183d3bbef0f0df527e10359163f0bb1db Mon Sep 17 00:00:00 2001
From: Alim Akhtar <alim.akhtar@samsung.com>
Date: Wed, 4 Jan 2017 13:54:20 +0530
Subject: irqchip/gic-v3: Remove duplicate definition of GICD_TYPER_LPIS

GICD_TYPER_LPIS macro is defined twice in this file. This patch removes the
duplicate entry.

Fixes: f5c1434c217f ("irqchip: GICv3: rework redistributor structure")
Signed-off-by: Alim Akhtar <alim.akhtar@samsung.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/linux/irqchip/arm-gic-v3.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index 400ed7ad2bff..725e86b506f3 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -73,7 +73,6 @@
 
 #define GICD_TYPER_ID_BITS(typer)	((((typer) >> 19) & 0x1f) + 1)
 #define GICD_TYPER_IRQS(typer)		((((typer) & 0x1f) + 1) * 32)
-#define GICD_TYPER_LPIS			(1U << 17)
 
 #define GICD_IROUTER_SPI_MODE_ONE	(0U << 31)
 #define GICD_IROUTER_SPI_MODE_ANY	(1U << 31)
-- 
cgit v1.2.3


From 3046ec674d441562c6bb3e4284cd866743042ef3 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Wed, 8 Feb 2017 14:54:12 +0000
Subject: ARM: smccc: Update HVC comment to describe new quirk parameter

Commit 680a0873e193 ("arm: kernel: Add SMC structure parameter") added
a new "quirk" parameter to the SMC and HVC SMCCC backends, but only
updated the comment for the SMC version. This patch adds the new
paramater to the comment describing the HVC version too.

Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 include/linux/arm-smccc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index b67934164401..4c5bca38c653 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -109,6 +109,7 @@ asmlinkage void __arm_smccc_smc(unsigned long a0, unsigned long a1,
  * __arm_smccc_hvc() - make HVC calls
  * @a0-a7: arguments passed in registers 0 to 7
  * @res: result values from registers 0 to 3
+ * @quirk: points to an arm_smccc_quirk, or NULL when no quirks are required.
  *
  * This function is used to make HVC calls following SMC Calling
  * Convention.  The content of the supplied param are copied to registers 0
-- 
cgit v1.2.3


From be5e5099183301fb7920f8f6b66bd3ac1f820a97 Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <rafal@milecki.pl>
Date: Mon, 16 Jan 2017 17:28:18 +0100
Subject: mtd: bcm47xxsflash: use platform_(set|get)_drvdata
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We have generic place & helpers for storing platform driver data so
there is no reason for using custom priv pointer.

This allows cleaning up struct bcma_sflash from unneeded fields.

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Acked-by: Kalle Valo <kvalo@codeaurora.org>
Acked-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Signed-off-by: Brian Norris <computersforpeace@gmail.com>
---
 drivers/mtd/devices/bcm47xxsflash.c         | 6 +++---
 include/linux/bcma/bcma_driver_chipcommon.h | 3 ---
 2 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/devices/bcm47xxsflash.c b/drivers/mtd/devices/bcm47xxsflash.c
index 514be04c0b6c..4decd8c0360a 100644
--- a/drivers/mtd/devices/bcm47xxsflash.c
+++ b/drivers/mtd/devices/bcm47xxsflash.c
@@ -284,7 +284,6 @@ static int bcm47xxsflash_bcma_probe(struct platform_device *pdev)
 	b47s = devm_kzalloc(dev, sizeof(*b47s), GFP_KERNEL);
 	if (!b47s)
 		return -ENOMEM;
-	sflash->priv = b47s;
 
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (!res) {
@@ -334,6 +333,8 @@ static int bcm47xxsflash_bcma_probe(struct platform_device *pdev)
 	b47s->size = sflash->size;
 	bcm47xxsflash_fill_mtd(b47s, &pdev->dev);
 
+	platform_set_drvdata(pdev, b47s);
+
 	err = mtd_device_parse_register(&b47s->mtd, probes, NULL, NULL, 0);
 	if (err) {
 		pr_err("Failed to register MTD device: %d\n", err);
@@ -349,8 +350,7 @@ static int bcm47xxsflash_bcma_probe(struct platform_device *pdev)
 
 static int bcm47xxsflash_bcma_remove(struct platform_device *pdev)
 {
-	struct bcma_sflash *sflash = dev_get_platdata(&pdev->dev);
-	struct bcm47xxsflash *b47s = sflash->priv;
+	struct bcm47xxsflash *b47s = platform_get_drvdata(pdev);
 
 	mtd_device_unregister(&b47s->mtd);
 	iounmap(b47s->window);
diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h
index b20e3d56253f..2f1c690a3e66 100644
--- a/include/linux/bcma/bcma_driver_chipcommon.h
+++ b/include/linux/bcma/bcma_driver_chipcommon.h
@@ -593,9 +593,6 @@ struct bcma_sflash {
 	u32 blocksize;
 	u16 numblocks;
 	u32 size;
-
-	struct mtd_info *mtd;
-	void *priv;
 };
 #endif
 
-- 
cgit v1.2.3


From 21bdbb7102edeaebb5ec4ef530c8f442f7562c96 Mon Sep 17 00:00:00 2001
From: Neil Leeder <nleeder@codeaurora.org>
Date: Tue, 7 Feb 2017 13:14:04 -0500
Subject: perf: add qcom l2 cache perf events driver

Adds perf events support for L2 cache PMU.

The L2 cache PMU driver is named 'l2cache_0' and can be used
with perf events to profile L2 events such as cache hits
and misses on Qualcomm Technologies processors.

Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Neil Leeder <nleeder@codeaurora.org>
[will: minimise nesting in l2_cache_associate_cpu_with_cluster]
[will: use kstrtoul for unsigned long, remove redunant .owner setting]
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 Documentation/perf/qcom_l2_pmu.txt |   38 ++
 drivers/perf/Kconfig               |    9 +
 drivers/perf/Makefile              |    1 +
 drivers/perf/qcom_l2_pmu.c         | 1013 ++++++++++++++++++++++++++++++++++++
 include/linux/cpuhotplug.h         |    1 +
 5 files changed, 1062 insertions(+)
 create mode 100644 Documentation/perf/qcom_l2_pmu.txt
 create mode 100644 drivers/perf/qcom_l2_pmu.c

(limited to 'include/linux')

diff --git a/Documentation/perf/qcom_l2_pmu.txt b/Documentation/perf/qcom_l2_pmu.txt
new file mode 100644
index 000000000000..b25b97659ab9
--- /dev/null
+++ b/Documentation/perf/qcom_l2_pmu.txt
@@ -0,0 +1,38 @@
+Qualcomm Technologies Level-2 Cache Performance Monitoring Unit (PMU)
+=====================================================================
+
+This driver supports the L2 cache clusters found in Qualcomm Technologies
+Centriq SoCs. There are multiple physical L2 cache clusters, each with their
+own PMU. Each cluster has one or more CPUs associated with it.
+
+There is one logical L2 PMU exposed, which aggregates the results from
+the physical PMUs.
+
+The driver provides a description of its available events and configuration
+options in sysfs, see /sys/devices/l2cache_0.
+
+The "format" directory describes the format of the events.
+
+Events can be envisioned as a 2-dimensional array. Each column represents
+a group of events. There are 8 groups. Only one entry from each
+group can be in use at a time. If multiple events from the same group
+are specified, the conflicting events cannot be counted at the same time.
+
+Events are specified as 0xCCG, where CC is 2 hex digits specifying
+the code (array row) and G specifies the group (column) 0-7.
+
+In addition there is a cycle counter event specified by the value 0xFE
+which is outside the above scheme.
+
+The driver provides a "cpumask" sysfs attribute which contains a mask
+consisting of one CPU per cluster which will be used to handle all the PMU
+events on that cluster.
+
+Examples for use with perf:
+
+  perf stat -e l2cache_0/config=0x001/,l2cache_0/config=0x042/ -a sleep 1
+
+  perf stat -e l2cache_0/config=0xfe/ -C 2 sleep 1
+
+The driver does not support sampling, therefore "perf record" will
+not work. Per-task perf sessions are not supported.
diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index 4d5c5f9f0dbd..93651907874f 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -12,6 +12,15 @@ config ARM_PMU
 	  Say y if you want to use CPU performance monitors on ARM-based
 	  systems.
 
+config QCOM_L2_PMU
+	bool "Qualcomm Technologies L2-cache PMU"
+	depends on ARCH_QCOM && ARM64 && PERF_EVENTS && ACPI
+	  help
+	  Provides support for the L2 cache performance monitor unit (PMU)
+	  in Qualcomm Technologies processors.
+	  Adds the L2 cache PMU into the perf events subsystem for
+	  monitoring L2 cache events.
+
 config XGENE_PMU
         depends on PERF_EVENTS && ARCH_XGENE
         bool "APM X-Gene SoC PMU"
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index b116e982810b..ef24833c94a8 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -1,2 +1,3 @@
 obj-$(CONFIG_ARM_PMU) += arm_pmu.o
+obj-$(CONFIG_QCOM_L2_PMU)	+= qcom_l2_pmu.o
 obj-$(CONFIG_XGENE_PMU) += xgene_pmu.o
diff --git a/drivers/perf/qcom_l2_pmu.c b/drivers/perf/qcom_l2_pmu.c
new file mode 100644
index 000000000000..c259848228b4
--- /dev/null
+++ b/drivers/perf/qcom_l2_pmu.c
@@ -0,0 +1,1013 @@
+/* Copyright (c) 2015-2017 The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#include <linux/acpi.h>
+#include <linux/bitops.h>
+#include <linux/bug.h>
+#include <linux/cpuhotplug.h>
+#include <linux/cpumask.h>
+#include <linux/device.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/percpu.h>
+#include <linux/perf_event.h>
+#include <linux/platform_device.h>
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+#include <linux/sysfs.h>
+#include <linux/types.h>
+
+#include <asm/barrier.h>
+#include <asm/local64.h>
+#include <asm/sysreg.h>
+
+#define MAX_L2_CTRS             9
+
+#define L2PMCR_NUM_EV_SHIFT     11
+#define L2PMCR_NUM_EV_MASK      0x1F
+
+#define L2PMCR                  0x400
+#define L2PMCNTENCLR            0x403
+#define L2PMCNTENSET            0x404
+#define L2PMINTENCLR            0x405
+#define L2PMINTENSET            0x406
+#define L2PMOVSCLR              0x407
+#define L2PMOVSSET              0x408
+#define L2PMCCNTCR              0x409
+#define L2PMCCNTR               0x40A
+#define L2PMCCNTSR              0x40C
+#define L2PMRESR                0x410
+#define IA_L2PMXEVCNTCR_BASE    0x420
+#define IA_L2PMXEVCNTR_BASE     0x421
+#define IA_L2PMXEVFILTER_BASE   0x423
+#define IA_L2PMXEVTYPER_BASE    0x424
+
+#define IA_L2_REG_OFFSET        0x10
+
+#define L2PMXEVFILTER_SUFILTER_ALL      0x000E0000
+#define L2PMXEVFILTER_ORGFILTER_IDINDEP 0x00000004
+#define L2PMXEVFILTER_ORGFILTER_ALL     0x00000003
+
+#define L2EVTYPER_REG_SHIFT     3
+
+#define L2PMRESR_GROUP_BITS     8
+#define L2PMRESR_GROUP_MASK     GENMASK(7, 0)
+
+#define L2CYCLE_CTR_BIT         31
+#define L2CYCLE_CTR_RAW_CODE    0xFE
+
+#define L2PMCR_RESET_ALL        0x6
+#define L2PMCR_COUNTERS_ENABLE  0x1
+#define L2PMCR_COUNTERS_DISABLE 0x0
+
+#define L2PMRESR_EN             BIT_ULL(63)
+
+#define L2_EVT_MASK             0x00000FFF
+#define L2_EVT_CODE_MASK        0x00000FF0
+#define L2_EVT_GRP_MASK         0x0000000F
+#define L2_EVT_CODE_SHIFT       4
+#define L2_EVT_GRP_SHIFT        0
+
+#define L2_EVT_CODE(event)   (((event) & L2_EVT_CODE_MASK) >> L2_EVT_CODE_SHIFT)
+#define L2_EVT_GROUP(event)  (((event) & L2_EVT_GRP_MASK) >> L2_EVT_GRP_SHIFT)
+
+#define L2_EVT_GROUP_MAX        7
+
+#define L2_COUNTER_RELOAD       BIT_ULL(31)
+#define L2_CYCLE_COUNTER_RELOAD BIT_ULL(63)
+
+#define L2CPUSRSELR_EL1         sys_reg(3, 3, 15, 0, 6)
+#define L2CPUSRDR_EL1           sys_reg(3, 3, 15, 0, 7)
+
+#define reg_idx(reg, i)         (((i) * IA_L2_REG_OFFSET) + reg##_BASE)
+
+static DEFINE_RAW_SPINLOCK(l2_access_lock);
+
+/**
+ * set_l2_indirect_reg: write value to an L2 register
+ * @reg: Address of L2 register.
+ * @value: Value to be written to register.
+ *
+ * Use architecturally required barriers for ordering between system register
+ * accesses
+ */
+static void set_l2_indirect_reg(u64 reg, u64 val)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&l2_access_lock, flags);
+	write_sysreg_s(reg, L2CPUSRSELR_EL1);
+	isb();
+	write_sysreg_s(val, L2CPUSRDR_EL1);
+	isb();
+	raw_spin_unlock_irqrestore(&l2_access_lock, flags);
+}
+
+/**
+ * get_l2_indirect_reg: read an L2 register value
+ * @reg: Address of L2 register.
+ *
+ * Use architecturally required barriers for ordering between system register
+ * accesses
+ */
+static u64 get_l2_indirect_reg(u64 reg)
+{
+	u64 val;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&l2_access_lock, flags);
+	write_sysreg_s(reg, L2CPUSRSELR_EL1);
+	isb();
+	val = read_sysreg_s(L2CPUSRDR_EL1);
+	raw_spin_unlock_irqrestore(&l2_access_lock, flags);
+
+	return val;
+}
+
+struct cluster_pmu;
+
+/*
+ * Aggregate PMU. Implements the core pmu functions and manages
+ * the hardware PMUs.
+ */
+struct l2cache_pmu {
+	struct hlist_node node;
+	u32 num_pmus;
+	struct pmu pmu;
+	int num_counters;
+	cpumask_t cpumask;
+	struct platform_device *pdev;
+	struct cluster_pmu * __percpu *pmu_cluster;
+	struct list_head clusters;
+};
+
+/*
+ * The cache is made up of one or more clusters, each cluster has its own PMU.
+ * Each cluster is associated with one or more CPUs.
+ * This structure represents one of the hardware PMUs.
+ *
+ * Events can be envisioned as a 2-dimensional array. Each column represents
+ * a group of events. There are 8 groups. Only one entry from each
+ * group can be in use at a time.
+ *
+ * Events are specified as 0xCCG, where CC is 2 hex digits specifying
+ * the code (array row) and G specifies the group (column).
+ *
+ * In addition there is a cycle counter event specified by L2CYCLE_CTR_RAW_CODE
+ * which is outside the above scheme.
+ */
+struct cluster_pmu {
+	struct list_head next;
+	struct perf_event *events[MAX_L2_CTRS];
+	struct l2cache_pmu *l2cache_pmu;
+	DECLARE_BITMAP(used_counters, MAX_L2_CTRS);
+	DECLARE_BITMAP(used_groups, L2_EVT_GROUP_MAX + 1);
+	int irq;
+	int cluster_id;
+	/* The CPU that is used for collecting events on this cluster */
+	int on_cpu;
+	/* All the CPUs associated with this cluster */
+	cpumask_t cluster_cpus;
+	spinlock_t pmu_lock;
+};
+
+#define to_l2cache_pmu(p) (container_of(p, struct l2cache_pmu, pmu))
+
+static u32 l2_cycle_ctr_idx;
+static u32 l2_counter_present_mask;
+
+static inline u32 idx_to_reg_bit(u32 idx)
+{
+	if (idx == l2_cycle_ctr_idx)
+		return BIT(L2CYCLE_CTR_BIT);
+
+	return BIT(idx);
+}
+
+static inline struct cluster_pmu *get_cluster_pmu(
+	struct l2cache_pmu *l2cache_pmu, int cpu)
+{
+	return *per_cpu_ptr(l2cache_pmu->pmu_cluster, cpu);
+}
+
+static void cluster_pmu_reset(void)
+{
+	/* Reset all counters */
+	set_l2_indirect_reg(L2PMCR, L2PMCR_RESET_ALL);
+	set_l2_indirect_reg(L2PMCNTENCLR, l2_counter_present_mask);
+	set_l2_indirect_reg(L2PMINTENCLR, l2_counter_present_mask);
+	set_l2_indirect_reg(L2PMOVSCLR, l2_counter_present_mask);
+}
+
+static inline void cluster_pmu_enable(void)
+{
+	set_l2_indirect_reg(L2PMCR, L2PMCR_COUNTERS_ENABLE);
+}
+
+static inline void cluster_pmu_disable(void)
+{
+	set_l2_indirect_reg(L2PMCR, L2PMCR_COUNTERS_DISABLE);
+}
+
+static inline void cluster_pmu_counter_set_value(u32 idx, u64 value)
+{
+	if (idx == l2_cycle_ctr_idx)
+		set_l2_indirect_reg(L2PMCCNTR, value);
+	else
+		set_l2_indirect_reg(reg_idx(IA_L2PMXEVCNTR, idx), value);
+}
+
+static inline u64 cluster_pmu_counter_get_value(u32 idx)
+{
+	u64 value;
+
+	if (idx == l2_cycle_ctr_idx)
+		value = get_l2_indirect_reg(L2PMCCNTR);
+	else
+		value = get_l2_indirect_reg(reg_idx(IA_L2PMXEVCNTR, idx));
+
+	return value;
+}
+
+static inline void cluster_pmu_counter_enable(u32 idx)
+{
+	set_l2_indirect_reg(L2PMCNTENSET, idx_to_reg_bit(idx));
+}
+
+static inline void cluster_pmu_counter_disable(u32 idx)
+{
+	set_l2_indirect_reg(L2PMCNTENCLR, idx_to_reg_bit(idx));
+}
+
+static inline void cluster_pmu_counter_enable_interrupt(u32 idx)
+{
+	set_l2_indirect_reg(L2PMINTENSET, idx_to_reg_bit(idx));
+}
+
+static inline void cluster_pmu_counter_disable_interrupt(u32 idx)
+{
+	set_l2_indirect_reg(L2PMINTENCLR, idx_to_reg_bit(idx));
+}
+
+static inline void cluster_pmu_set_evccntcr(u32 val)
+{
+	set_l2_indirect_reg(L2PMCCNTCR, val);
+}
+
+static inline void cluster_pmu_set_evcntcr(u32 ctr, u32 val)
+{
+	set_l2_indirect_reg(reg_idx(IA_L2PMXEVCNTCR, ctr), val);
+}
+
+static inline void cluster_pmu_set_evtyper(u32 ctr, u32 val)
+{
+	set_l2_indirect_reg(reg_idx(IA_L2PMXEVTYPER, ctr), val);
+}
+
+static void cluster_pmu_set_resr(struct cluster_pmu *cluster,
+			       u32 event_group, u32 event_cc)
+{
+	u64 field;
+	u64 resr_val;
+	u32 shift;
+	unsigned long flags;
+
+	shift = L2PMRESR_GROUP_BITS * event_group;
+	field = ((u64)(event_cc & L2PMRESR_GROUP_MASK) << shift);
+
+	spin_lock_irqsave(&cluster->pmu_lock, flags);
+
+	resr_val = get_l2_indirect_reg(L2PMRESR);
+	resr_val &= ~(L2PMRESR_GROUP_MASK << shift);
+	resr_val |= field;
+	resr_val |= L2PMRESR_EN;
+	set_l2_indirect_reg(L2PMRESR, resr_val);
+
+	spin_unlock_irqrestore(&cluster->pmu_lock, flags);
+}
+
+/*
+ * Hardware allows filtering of events based on the originating
+ * CPU. Turn this off by setting filter bits to allow events from
+ * all CPUS, subunits and ID independent events in this cluster.
+ */
+static inline void cluster_pmu_set_evfilter_sys_mode(u32 ctr)
+{
+	u32 val =  L2PMXEVFILTER_SUFILTER_ALL |
+		   L2PMXEVFILTER_ORGFILTER_IDINDEP |
+		   L2PMXEVFILTER_ORGFILTER_ALL;
+
+	set_l2_indirect_reg(reg_idx(IA_L2PMXEVFILTER, ctr), val);
+}
+
+static inline u32 cluster_pmu_getreset_ovsr(void)
+{
+	u32 result = get_l2_indirect_reg(L2PMOVSSET);
+
+	set_l2_indirect_reg(L2PMOVSCLR, result);
+	return result;
+}
+
+static inline bool cluster_pmu_has_overflowed(u32 ovsr)
+{
+	return !!(ovsr & l2_counter_present_mask);
+}
+
+static inline bool cluster_pmu_counter_has_overflowed(u32 ovsr, u32 idx)
+{
+	return !!(ovsr & idx_to_reg_bit(idx));
+}
+
+static void l2_cache_event_update(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	u64 delta, prev, now;
+	u32 idx = hwc->idx;
+
+	do {
+		prev = local64_read(&hwc->prev_count);
+		now = cluster_pmu_counter_get_value(idx);
+	} while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
+
+	/*
+	 * The cycle counter is 64-bit, but all other counters are
+	 * 32-bit, and we must handle 32-bit overflow explicitly.
+	 */
+	delta = now - prev;
+	if (idx != l2_cycle_ctr_idx)
+		delta &= 0xffffffff;
+
+	local64_add(delta, &event->count);
+}
+
+static void l2_cache_cluster_set_period(struct cluster_pmu *cluster,
+				       struct hw_perf_event *hwc)
+{
+	u32 idx = hwc->idx;
+	u64 new;
+
+	/*
+	 * We limit the max period to half the max counter value so
+	 * that even in the case of extreme interrupt latency the
+	 * counter will (hopefully) not wrap past its initial value.
+	 */
+	if (idx == l2_cycle_ctr_idx)
+		new = L2_CYCLE_COUNTER_RELOAD;
+	else
+		new = L2_COUNTER_RELOAD;
+
+	local64_set(&hwc->prev_count, new);
+	cluster_pmu_counter_set_value(idx, new);
+}
+
+static int l2_cache_get_event_idx(struct cluster_pmu *cluster,
+				   struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	int idx;
+	int num_ctrs = cluster->l2cache_pmu->num_counters - 1;
+	unsigned int group;
+
+	if (hwc->config_base == L2CYCLE_CTR_RAW_CODE) {
+		if (test_and_set_bit(l2_cycle_ctr_idx, cluster->used_counters))
+			return -EAGAIN;
+
+		return l2_cycle_ctr_idx;
+	}
+
+	idx = find_first_zero_bit(cluster->used_counters, num_ctrs);
+	if (idx == num_ctrs)
+		/* The counters are all in use. */
+		return -EAGAIN;
+
+	/*
+	 * Check for column exclusion: event column already in use by another
+	 * event. This is for events which are not in the same group.
+	 * Conflicting events in the same group are detected in event_init.
+	 */
+	group = L2_EVT_GROUP(hwc->config_base);
+	if (test_bit(group, cluster->used_groups))
+		return -EAGAIN;
+
+	set_bit(idx, cluster->used_counters);
+	set_bit(group, cluster->used_groups);
+
+	return idx;
+}
+
+static void l2_cache_clear_event_idx(struct cluster_pmu *cluster,
+				      struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+
+	clear_bit(idx, cluster->used_counters);
+	if (hwc->config_base != L2CYCLE_CTR_RAW_CODE)
+		clear_bit(L2_EVT_GROUP(hwc->config_base), cluster->used_groups);
+}
+
+static irqreturn_t l2_cache_handle_irq(int irq_num, void *data)
+{
+	struct cluster_pmu *cluster = data;
+	int num_counters = cluster->l2cache_pmu->num_counters;
+	u32 ovsr;
+	int idx;
+
+	ovsr = cluster_pmu_getreset_ovsr();
+	if (!cluster_pmu_has_overflowed(ovsr))
+		return IRQ_NONE;
+
+	for_each_set_bit(idx, cluster->used_counters, num_counters) {
+		struct perf_event *event = cluster->events[idx];
+		struct hw_perf_event *hwc;
+
+		if (WARN_ON_ONCE(!event))
+			continue;
+
+		if (!cluster_pmu_counter_has_overflowed(ovsr, idx))
+			continue;
+
+		l2_cache_event_update(event);
+		hwc = &event->hw;
+
+		l2_cache_cluster_set_period(cluster, hwc);
+	}
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Implementation of abstract pmu functionality required by
+ * the core perf events code.
+ */
+
+static void l2_cache_pmu_enable(struct pmu *pmu)
+{
+	/*
+	 * Although there is only one PMU (per socket) controlling multiple
+	 * physical PMUs (per cluster), because we do not support per-task mode
+	 * each event is associated with a CPU. Each event has pmu_enable
+	 * called on its CPU, so here it is only necessary to enable the
+	 * counters for the current CPU.
+	 */
+
+	cluster_pmu_enable();
+}
+
+static void l2_cache_pmu_disable(struct pmu *pmu)
+{
+	cluster_pmu_disable();
+}
+
+static int l2_cache_event_init(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct cluster_pmu *cluster;
+	struct perf_event *sibling;
+	struct l2cache_pmu *l2cache_pmu;
+
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	l2cache_pmu = to_l2cache_pmu(event->pmu);
+
+	if (hwc->sample_period) {
+		dev_dbg_ratelimited(&l2cache_pmu->pdev->dev,
+				    "Sampling not supported\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (event->cpu < 0) {
+		dev_dbg_ratelimited(&l2cache_pmu->pdev->dev,
+				    "Per-task mode not supported\n");
+		return -EOPNOTSUPP;
+	}
+
+	/* We cannot filter accurately so we just don't allow it. */
+	if (event->attr.exclude_user || event->attr.exclude_kernel ||
+	    event->attr.exclude_hv || event->attr.exclude_idle) {
+		dev_dbg_ratelimited(&l2cache_pmu->pdev->dev,
+				    "Can't exclude execution levels\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (((L2_EVT_GROUP(event->attr.config) > L2_EVT_GROUP_MAX) ||
+	     ((event->attr.config & ~L2_EVT_MASK) != 0)) &&
+	    (event->attr.config != L2CYCLE_CTR_RAW_CODE)) {
+		dev_dbg_ratelimited(&l2cache_pmu->pdev->dev,
+				    "Invalid config %llx\n",
+				    event->attr.config);
+		return -EINVAL;
+	}
+
+	/* Don't allow groups with mixed PMUs, except for s/w events */
+	if (event->group_leader->pmu != event->pmu &&
+	    !is_software_event(event->group_leader)) {
+		dev_dbg_ratelimited(&l2cache_pmu->pdev->dev,
+			 "Can't create mixed PMU group\n");
+		return -EINVAL;
+	}
+
+	list_for_each_entry(sibling, &event->group_leader->sibling_list,
+			    group_entry)
+		if (sibling->pmu != event->pmu &&
+		    !is_software_event(sibling)) {
+			dev_dbg_ratelimited(&l2cache_pmu->pdev->dev,
+				 "Can't create mixed PMU group\n");
+			return -EINVAL;
+		}
+
+	cluster = get_cluster_pmu(l2cache_pmu, event->cpu);
+	if (!cluster) {
+		/* CPU has not been initialised */
+		dev_dbg_ratelimited(&l2cache_pmu->pdev->dev,
+			"CPU%d not associated with L2 cluster\n", event->cpu);
+		return -EINVAL;
+	}
+
+	/* Ensure all events in a group are on the same cpu */
+	if ((event->group_leader != event) &&
+	    (cluster->on_cpu != event->group_leader->cpu)) {
+		dev_dbg_ratelimited(&l2cache_pmu->pdev->dev,
+			 "Can't create group on CPUs %d and %d",
+			 event->cpu, event->group_leader->cpu);
+		return -EINVAL;
+	}
+
+	if ((event != event->group_leader) &&
+	    (L2_EVT_GROUP(event->group_leader->attr.config) ==
+	     L2_EVT_GROUP(event->attr.config))) {
+		dev_dbg_ratelimited(&l2cache_pmu->pdev->dev,
+			 "Column exclusion: conflicting events %llx %llx\n",
+		       event->group_leader->attr.config,
+		       event->attr.config);
+		return -EINVAL;
+	}
+
+	list_for_each_entry(sibling, &event->group_leader->sibling_list,
+			    group_entry) {
+		if ((sibling != event) &&
+		    (L2_EVT_GROUP(sibling->attr.config) ==
+		     L2_EVT_GROUP(event->attr.config))) {
+			dev_dbg_ratelimited(&l2cache_pmu->pdev->dev,
+			     "Column exclusion: conflicting events %llx %llx\n",
+					    sibling->attr.config,
+					    event->attr.config);
+			return -EINVAL;
+		}
+	}
+
+	hwc->idx = -1;
+	hwc->config_base = event->attr.config;
+
+	/*
+	 * Ensure all events are on the same cpu so all events are in the
+	 * same cpu context, to avoid races on pmu_enable etc.
+	 */
+	event->cpu = cluster->on_cpu;
+
+	return 0;
+}
+
+static void l2_cache_event_start(struct perf_event *event, int flags)
+{
+	struct cluster_pmu *cluster;
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+	u32 config;
+	u32 event_cc, event_group;
+
+	hwc->state = 0;
+
+	cluster = get_cluster_pmu(to_l2cache_pmu(event->pmu), event->cpu);
+
+	l2_cache_cluster_set_period(cluster, hwc);
+
+	if (hwc->config_base == L2CYCLE_CTR_RAW_CODE) {
+		cluster_pmu_set_evccntcr(0);
+	} else {
+		config = hwc->config_base;
+		event_cc    = L2_EVT_CODE(config);
+		event_group = L2_EVT_GROUP(config);
+
+		cluster_pmu_set_evcntcr(idx, 0);
+		cluster_pmu_set_evtyper(idx, event_group);
+		cluster_pmu_set_resr(cluster, event_group, event_cc);
+		cluster_pmu_set_evfilter_sys_mode(idx);
+	}
+
+	cluster_pmu_counter_enable_interrupt(idx);
+	cluster_pmu_counter_enable(idx);
+}
+
+static void l2_cache_event_stop(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+
+	if (hwc->state & PERF_HES_STOPPED)
+		return;
+
+	cluster_pmu_counter_disable_interrupt(idx);
+	cluster_pmu_counter_disable(idx);
+
+	if (flags & PERF_EF_UPDATE)
+		l2_cache_event_update(event);
+	hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+}
+
+static int l2_cache_event_add(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	int idx;
+	int err = 0;
+	struct cluster_pmu *cluster;
+
+	cluster = get_cluster_pmu(to_l2cache_pmu(event->pmu), event->cpu);
+
+	idx = l2_cache_get_event_idx(cluster, event);
+	if (idx < 0)
+		return idx;
+
+	hwc->idx = idx;
+	hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+	cluster->events[idx] = event;
+	local64_set(&hwc->prev_count, 0);
+
+	if (flags & PERF_EF_START)
+		l2_cache_event_start(event, flags);
+
+	/* Propagate changes to the userspace mapping. */
+	perf_event_update_userpage(event);
+
+	return err;
+}
+
+static void l2_cache_event_del(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct cluster_pmu *cluster;
+	int idx = hwc->idx;
+
+	cluster = get_cluster_pmu(to_l2cache_pmu(event->pmu), event->cpu);
+
+	l2_cache_event_stop(event, flags | PERF_EF_UPDATE);
+	cluster->events[idx] = NULL;
+	l2_cache_clear_event_idx(cluster, event);
+
+	perf_event_update_userpage(event);
+}
+
+static void l2_cache_event_read(struct perf_event *event)
+{
+	l2_cache_event_update(event);
+}
+
+static ssize_t l2_cache_pmu_cpumask_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf)
+{
+	struct l2cache_pmu *l2cache_pmu = to_l2cache_pmu(dev_get_drvdata(dev));
+
+	return cpumap_print_to_pagebuf(true, buf, &l2cache_pmu->cpumask);
+}
+
+static struct device_attribute l2_cache_pmu_cpumask_attr =
+		__ATTR(cpumask, S_IRUGO, l2_cache_pmu_cpumask_show, NULL);
+
+static struct attribute *l2_cache_pmu_cpumask_attrs[] = {
+	&l2_cache_pmu_cpumask_attr.attr,
+	NULL,
+};
+
+static struct attribute_group l2_cache_pmu_cpumask_group = {
+	.attrs = l2_cache_pmu_cpumask_attrs,
+};
+
+/* CCG format for perf RAW codes. */
+PMU_FORMAT_ATTR(l2_code,   "config:4-11");
+PMU_FORMAT_ATTR(l2_group,  "config:0-3");
+static struct attribute *l2_cache_pmu_formats[] = {
+	&format_attr_l2_code.attr,
+	&format_attr_l2_group.attr,
+	NULL,
+};
+
+static struct attribute_group l2_cache_pmu_format_group = {
+	.name = "format",
+	.attrs = l2_cache_pmu_formats,
+};
+
+static const struct attribute_group *l2_cache_pmu_attr_grps[] = {
+	&l2_cache_pmu_format_group,
+	&l2_cache_pmu_cpumask_group,
+	NULL,
+};
+
+/*
+ * Generic device handlers
+ */
+
+static const struct acpi_device_id l2_cache_pmu_acpi_match[] = {
+	{ "QCOM8130", },
+	{ }
+};
+
+static int get_num_counters(void)
+{
+	int val;
+
+	val = get_l2_indirect_reg(L2PMCR);
+
+	/*
+	 * Read number of counters from L2PMCR and add 1
+	 * for the cycle counter.
+	 */
+	return ((val >> L2PMCR_NUM_EV_SHIFT) & L2PMCR_NUM_EV_MASK) + 1;
+}
+
+static struct cluster_pmu *l2_cache_associate_cpu_with_cluster(
+	struct l2cache_pmu *l2cache_pmu, int cpu)
+{
+	u64 mpidr;
+	int cpu_cluster_id;
+	struct cluster_pmu *cluster = NULL;
+
+	/*
+	 * This assumes that the cluster_id is in MPIDR[aff1] for
+	 * single-threaded cores, and MPIDR[aff2] for multi-threaded
+	 * cores. This logic will have to be updated if this changes.
+	 */
+	mpidr = read_cpuid_mpidr();
+	if (mpidr & MPIDR_MT_BITMASK)
+		cpu_cluster_id = MPIDR_AFFINITY_LEVEL(mpidr, 2);
+	else
+		cpu_cluster_id = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+
+	list_for_each_entry(cluster, &l2cache_pmu->clusters, next) {
+		if (cluster->cluster_id != cpu_cluster_id)
+			continue;
+
+		dev_info(&l2cache_pmu->pdev->dev,
+			 "CPU%d associated with cluster %d\n", cpu,
+			 cluster->cluster_id);
+		cpumask_set_cpu(cpu, &cluster->cluster_cpus);
+		*per_cpu_ptr(l2cache_pmu->pmu_cluster, cpu) = cluster;
+		break;
+	}
+
+	return cluster;
+}
+
+static int l2cache_pmu_online_cpu(unsigned int cpu, struct hlist_node *node)
+{
+	struct cluster_pmu *cluster;
+	struct l2cache_pmu *l2cache_pmu;
+
+	l2cache_pmu = hlist_entry_safe(node, struct l2cache_pmu, node);
+	cluster = get_cluster_pmu(l2cache_pmu, cpu);
+	if (!cluster) {
+		/* First time this CPU has come online */
+		cluster = l2_cache_associate_cpu_with_cluster(l2cache_pmu, cpu);
+		if (!cluster) {
+			/* Only if broken firmware doesn't list every cluster */
+			WARN_ONCE(1, "No L2 cache cluster for CPU%d\n", cpu);
+			return 0;
+		}
+	}
+
+	/* If another CPU is managing this cluster, we're done */
+	if (cluster->on_cpu != -1)
+		return 0;
+
+	/*
+	 * All CPUs on this cluster were down, use this one.
+	 * Reset to put it into sane state.
+	 */
+	cluster->on_cpu = cpu;
+	cpumask_set_cpu(cpu, &l2cache_pmu->cpumask);
+	cluster_pmu_reset();
+
+	WARN_ON(irq_set_affinity(cluster->irq, cpumask_of(cpu)));
+	enable_irq(cluster->irq);
+
+	return 0;
+}
+
+static int l2cache_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
+{
+	struct cluster_pmu *cluster;
+	struct l2cache_pmu *l2cache_pmu;
+	cpumask_t cluster_online_cpus;
+	unsigned int target;
+
+	l2cache_pmu = hlist_entry_safe(node, struct l2cache_pmu, node);
+	cluster = get_cluster_pmu(l2cache_pmu, cpu);
+	if (!cluster)
+		return 0;
+
+	/* If this CPU is not managing the cluster, we're done */
+	if (cluster->on_cpu != cpu)
+		return 0;
+
+	/* Give up ownership of cluster */
+	cpumask_clear_cpu(cpu, &l2cache_pmu->cpumask);
+	cluster->on_cpu = -1;
+
+	/* Any other CPU for this cluster which is still online */
+	cpumask_and(&cluster_online_cpus, &cluster->cluster_cpus,
+		    cpu_online_mask);
+	target = cpumask_any_but(&cluster_online_cpus, cpu);
+	if (target >= nr_cpu_ids) {
+		disable_irq(cluster->irq);
+		return 0;
+	}
+
+	perf_pmu_migrate_context(&l2cache_pmu->pmu, cpu, target);
+	cluster->on_cpu = target;
+	cpumask_set_cpu(target, &l2cache_pmu->cpumask);
+	WARN_ON(irq_set_affinity(cluster->irq, cpumask_of(target)));
+
+	return 0;
+}
+
+static int l2_cache_pmu_probe_cluster(struct device *dev, void *data)
+{
+	struct platform_device *pdev = to_platform_device(dev->parent);
+	struct platform_device *sdev = to_platform_device(dev);
+	struct l2cache_pmu *l2cache_pmu = data;
+	struct cluster_pmu *cluster;
+	struct acpi_device *device;
+	unsigned long fw_cluster_id;
+	int err;
+	int irq;
+
+	if (acpi_bus_get_device(ACPI_HANDLE(dev), &device))
+		return -ENODEV;
+
+	if (kstrtoul(device->pnp.unique_id, 10, &fw_cluster_id) < 0) {
+		dev_err(&pdev->dev, "unable to read ACPI uid\n");
+		return -ENODEV;
+	}
+
+	cluster = devm_kzalloc(&pdev->dev, sizeof(*cluster), GFP_KERNEL);
+	if (!cluster)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&cluster->next);
+	list_add(&cluster->next, &l2cache_pmu->clusters);
+	cluster->cluster_id = fw_cluster_id;
+
+	irq = platform_get_irq(sdev, 0);
+	if (irq < 0) {
+		dev_err(&pdev->dev,
+			"Failed to get valid irq for cluster %ld\n",
+			fw_cluster_id);
+		return irq;
+	}
+	irq_set_status_flags(irq, IRQ_NOAUTOEN);
+	cluster->irq = irq;
+
+	cluster->l2cache_pmu = l2cache_pmu;
+	cluster->on_cpu = -1;
+
+	err = devm_request_irq(&pdev->dev, irq, l2_cache_handle_irq,
+			       IRQF_NOBALANCING | IRQF_NO_THREAD,
+			       "l2-cache-pmu", cluster);
+	if (err) {
+		dev_err(&pdev->dev,
+			"Unable to request IRQ%d for L2 PMU counters\n", irq);
+		return err;
+	}
+
+	dev_info(&pdev->dev,
+		"Registered L2 cache PMU cluster %ld\n", fw_cluster_id);
+
+	spin_lock_init(&cluster->pmu_lock);
+
+	l2cache_pmu->num_pmus++;
+
+	return 0;
+}
+
+static int l2_cache_pmu_probe(struct platform_device *pdev)
+{
+	int err;
+	struct l2cache_pmu *l2cache_pmu;
+
+	l2cache_pmu =
+		devm_kzalloc(&pdev->dev, sizeof(*l2cache_pmu), GFP_KERNEL);
+	if (!l2cache_pmu)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&l2cache_pmu->clusters);
+
+	platform_set_drvdata(pdev, l2cache_pmu);
+	l2cache_pmu->pmu = (struct pmu) {
+		/* suffix is instance id for future use with multiple sockets */
+		.name		= "l2cache_0",
+		.task_ctx_nr    = perf_invalid_context,
+		.pmu_enable	= l2_cache_pmu_enable,
+		.pmu_disable	= l2_cache_pmu_disable,
+		.event_init	= l2_cache_event_init,
+		.add		= l2_cache_event_add,
+		.del		= l2_cache_event_del,
+		.start		= l2_cache_event_start,
+		.stop		= l2_cache_event_stop,
+		.read		= l2_cache_event_read,
+		.attr_groups	= l2_cache_pmu_attr_grps,
+	};
+
+	l2cache_pmu->num_counters = get_num_counters();
+	l2cache_pmu->pdev = pdev;
+	l2cache_pmu->pmu_cluster = devm_alloc_percpu(&pdev->dev,
+						     struct cluster_pmu *);
+	if (!l2cache_pmu->pmu_cluster)
+		return -ENOMEM;
+
+	l2_cycle_ctr_idx = l2cache_pmu->num_counters - 1;
+	l2_counter_present_mask = GENMASK(l2cache_pmu->num_counters - 2, 0) |
+		BIT(L2CYCLE_CTR_BIT);
+
+	cpumask_clear(&l2cache_pmu->cpumask);
+
+	/* Read cluster info and initialize each cluster */
+	err = device_for_each_child(&pdev->dev, l2cache_pmu,
+				    l2_cache_pmu_probe_cluster);
+	if (err)
+		return err;
+
+	if (l2cache_pmu->num_pmus == 0) {
+		dev_err(&pdev->dev, "No hardware L2 cache PMUs found\n");
+		return -ENODEV;
+	}
+
+	err = cpuhp_state_add_instance(CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE,
+				       &l2cache_pmu->node);
+	if (err) {
+		dev_err(&pdev->dev, "Error %d registering hotplug", err);
+		return err;
+	}
+
+	err = perf_pmu_register(&l2cache_pmu->pmu, l2cache_pmu->pmu.name, -1);
+	if (err) {
+		dev_err(&pdev->dev, "Error %d registering L2 cache PMU\n", err);
+		goto out_unregister;
+	}
+
+	dev_info(&pdev->dev, "Registered L2 cache PMU using %d HW PMUs\n",
+		 l2cache_pmu->num_pmus);
+
+	return err;
+
+out_unregister:
+	cpuhp_state_remove_instance(CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE,
+				    &l2cache_pmu->node);
+	return err;
+}
+
+static int l2_cache_pmu_remove(struct platform_device *pdev)
+{
+	struct l2cache_pmu *l2cache_pmu =
+		to_l2cache_pmu(platform_get_drvdata(pdev));
+
+	perf_pmu_unregister(&l2cache_pmu->pmu);
+	cpuhp_state_remove_instance(CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE,
+				    &l2cache_pmu->node);
+	return 0;
+}
+
+static struct platform_driver l2_cache_pmu_driver = {
+	.driver = {
+		.name = "qcom-l2cache-pmu",
+		.acpi_match_table = ACPI_PTR(l2_cache_pmu_acpi_match),
+	},
+	.probe = l2_cache_pmu_probe,
+	.remove = l2_cache_pmu_remove,
+};
+
+static int __init register_l2_cache_pmu_driver(void)
+{
+	int err;
+
+	err = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE,
+				      "AP_PERF_ARM_QCOM_L2_ONLINE",
+				      l2cache_pmu_online_cpu,
+				      l2cache_pmu_offline_cpu);
+	if (err)
+		return err;
+
+	return platform_driver_register(&l2_cache_pmu_driver);
+}
+device_initcall(register_l2_cache_pmu_driver);
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 20bfefbe7594..1b7b2075b9cd 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -138,6 +138,7 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_ARM_CCI_ONLINE,
 	CPUHP_AP_PERF_ARM_CCN_ONLINE,
 	CPUHP_AP_PERF_ARM_L2X0_ONLINE,
+	CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE,
 	CPUHP_AP_WORKQUEUE_ONLINE,
 	CPUHP_AP_RCUTREE_ONLINE,
 	CPUHP_AP_ONLINE_DYN,
-- 
cgit v1.2.3


From cbaf58032efca401834518b905f528ac912449e4 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 7 Feb 2017 11:58:15 -0500
Subject: svcrdma: Another sendto chunk list parsing update

Commit 5fdca6531434 ("svcrdma: Renovate sendto chunk list parsing")
missed a spot. svc_rdma_xdr_get_reply_hdr_len() also assumes the
Write list has only one Write chunk. There's no harm in making this
code more general.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/rpc_rdma.h        |  9 +++++++
 include/linux/sunrpc/svc_rdma.h        |  2 +-
 net/sunrpc/xprtrdma/svc_rdma_marshal.c | 49 ++++++++++++++++++----------------
 net/sunrpc/xprtrdma/svc_rdma_sendto.c  |  3 ++-
 4 files changed, 38 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h
index cfda6adcf33c..245fc59b7324 100644
--- a/include/linux/sunrpc/rpc_rdma.h
+++ b/include/linux/sunrpc/rpc_rdma.h
@@ -109,6 +109,15 @@ struct rpcrdma_msg {
 	} rm_body;
 };
 
+/*
+ * XDR sizes, in quads
+ */
+enum {
+	rpcrdma_fixed_maxsz	= 4,
+	rpcrdma_segment_maxsz	= 4,
+	rpcrdma_readchunk_maxsz	= 2 + rpcrdma_segment_maxsz,
+};
+
 /*
  * Smallest RPC/RDMA header: rm_xid through rm_type, then rm_nochunks
  */
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 757fb963696c..551c51816352 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -218,7 +218,7 @@ extern void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *,
 					     struct rpcrdma_msg *,
 					     struct rpcrdma_msg *,
 					     enum rpcrdma_proc);
-extern int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *);
+extern unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp);
 
 /* svc_rdma_recvfrom.c */
 extern int svc_rdma_recvfrom(struct svc_rqst *);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
index 0ba9887f3e22..4e7203439e2f 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -260,32 +260,35 @@ int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
 	return (int)((unsigned long)va - (unsigned long)startp);
 }
 
-int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp)
+/**
+ * svc_rdma_xdr_get_reply_hdr_length - Get length of Reply transport header
+ * @rdma_resp: buffer containing Reply transport header
+ *
+ * Returns length of transport header, in bytes.
+ */
+unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp)
 {
-	struct rpcrdma_write_array *wr_ary;
+	unsigned int nsegs;
+	__be32 *p;
 
-	/* There is no read-list in a reply */
+	p = rdma_resp;
 
-	/* skip write list */
-	wr_ary = (struct rpcrdma_write_array *)
-		&rmsgp->rm_body.rm_chunks[1];
-	if (wr_ary->wc_discrim)
-		wr_ary = (struct rpcrdma_write_array *)
-			&wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)].
-			wc_target.rs_length;
-	else
-		wr_ary = (struct rpcrdma_write_array *)
-			&wr_ary->wc_nchunks;
-
-	/* skip reply array */
-	if (wr_ary->wc_discrim)
-		wr_ary = (struct rpcrdma_write_array *)
-			&wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)];
-	else
-		wr_ary = (struct rpcrdma_write_array *)
-			&wr_ary->wc_nchunks;
-
-	return (unsigned long) wr_ary - (unsigned long) rmsgp;
+	/* RPC-over-RDMA V1 replies never have a Read list. */
+	p += rpcrdma_fixed_maxsz + 1;
+
+	/* Skip Write list. */
+	while (*p++ != xdr_zero) {
+		nsegs = be32_to_cpup(p++);
+		p += nsegs * rpcrdma_segment_maxsz;
+	}
+
+	/* Skip Reply chunk. */
+	if (*p++ != xdr_zero) {
+		nsegs = be32_to_cpup(p++);
+		p += nsegs * rpcrdma_segment_maxsz;
+	}
+
+	return (unsigned long)p - (unsigned long)rdma_resp;
 }
 
 void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index ad4d286a83c5..ba76f1617965 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -476,7 +476,8 @@ static int send_reply(struct svcxprt_rdma *rdma,
 
 	/* Prepare the SGE for the RPCRDMA Header */
 	ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
-	ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
+	ctxt->sge[0].length =
+	    svc_rdma_xdr_get_reply_hdr_len((__be32 *)rdma_resp);
 	ctxt->sge[0].addr =
 	    ib_dma_map_page(rdma->sc_cm_id->device, page, 0,
 			    ctxt->sge[0].length, DMA_TO_DEVICE);
-- 
cgit v1.2.3


From 98fc21d3bfd55a36ce9eb7b32d1ce146f0d1696d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 7 Feb 2017 11:58:23 -0500
Subject: svcrdma: Clean up RPC-over-RDMA Reply header encoder

Replace C structure-based XDR decoding with pointer arithmetic.
Pointer arithmetic is considered more portable, and is used
throughout the kernel's existing XDR encoders. The gcc optimizer
generates similar assembler code either way.

Byte-swapping before a memory store on x86 typically results in an
instruction pipeline stall. Avoid byte-swapping when encoding a new
header.

svcrdma currently doesn't alter a connection's credit grant value
after the connection has been accepted, so it is effectively a
constant. Cache the byte-swapped value in a separate field.

Christoph suggested pulling the header encoding logic into the only
function that uses it.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc_rdma.h          |  7 ++-----
 net/sunrpc/xprtrdma/svc_rdma_marshal.c   | 18 +-----------------
 net/sunrpc/xprtrdma/svc_rdma_sendto.c    | 19 ++++++++++++-------
 net/sunrpc/xprtrdma/svc_rdma_transport.c |  1 +
 4 files changed, 16 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 551c51816352..25ecf92cae96 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -141,7 +141,8 @@ struct svcxprt_rdma {
 	atomic_t             sc_sq_avail;	/* SQEs ready to be consumed */
 	unsigned int	     sc_sq_depth;	/* Depth of SQ */
 	unsigned int	     sc_rq_depth;	/* Depth of RQ */
-	u32		     sc_max_requests;	/* Forward credits */
+	__be32		     sc_fc_credits;	/* Forward credits */
+	u32		     sc_max_requests;	/* Max requests */
 	u32		     sc_max_bc_requests;/* Backward credits */
 	int                  sc_max_req_size;	/* Size of each RQ WR buf */
 
@@ -214,10 +215,6 @@ extern void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *, int);
 extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int);
 extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int,
 					    __be32, __be64, u32);
-extern void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *,
-					     struct rpcrdma_msg *,
-					     struct rpcrdma_msg *,
-					     enum rpcrdma_proc);
 extern unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp);
 
 /* svc_rdma_recvfrom.c */
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
index 4e7203439e2f..c7d15b1126ce 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -249,7 +249,7 @@ int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
 
 	*va++ = rmsgp->rm_xid;
 	*va++ = rmsgp->rm_vers;
-	*va++ = cpu_to_be32(xprt->sc_max_requests);
+	*va++ = xprt->sc_fc_credits;
 	*va++ = rdma_error;
 	*va++ = cpu_to_be32(err);
 	if (err == ERR_VERS) {
@@ -329,19 +329,3 @@ void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
 	seg->rs_offset = rs_offset;
 	seg->rs_length = cpu_to_be32(write_len);
 }
-
-void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt,
-				  struct rpcrdma_msg *rdma_argp,
-				  struct rpcrdma_msg *rdma_resp,
-				  enum rpcrdma_proc rdma_type)
-{
-	rdma_resp->rm_xid = rdma_argp->rm_xid;
-	rdma_resp->rm_vers = rdma_argp->rm_vers;
-	rdma_resp->rm_credit = cpu_to_be32(xprt->sc_max_requests);
-	rdma_resp->rm_type = cpu_to_be32(rdma_type);
-
-	/* Encode <nul> chunks lists */
-	rdma_resp->rm_body.rm_chunks[0] = xdr_zero;
-	rdma_resp->rm_body.rm_chunks[1] = xdr_zero;
-	rdma_resp->rm_body.rm_chunks[2] = xdr_zero;
-}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index ba76f1617965..515221b16d09 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -560,12 +560,12 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 	struct rpcrdma_msg *rdma_argp;
 	struct rpcrdma_msg *rdma_resp;
 	struct rpcrdma_write_array *wr_ary, *rp_ary;
-	enum rpcrdma_proc reply_type;
 	int ret;
 	int inline_bytes;
 	struct page *res_page;
 	struct svc_rdma_req_map *vec;
 	u32 inv_rkey;
+	__be32 *p;
 
 	dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
 
@@ -597,12 +597,17 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 	if (!res_page)
 		goto err0;
 	rdma_resp = page_address(res_page);
-	if (rp_ary)
-		reply_type = RDMA_NOMSG;
-	else
-		reply_type = RDMA_MSG;
-	svc_rdma_xdr_encode_reply_header(rdma, rdma_argp,
-					 rdma_resp, reply_type);
+
+	p = &rdma_resp->rm_xid;
+	*p++ = rdma_argp->rm_xid;
+	*p++ = rdma_argp->rm_vers;
+	*p++ = rdma->sc_fc_credits;
+	*p++ = rp_ary ? rdma_nomsg : rdma_msg;
+
+	/* Start with empty chunks */
+	*p++ = xdr_zero;
+	*p++ = xdr_zero;
+	*p   = xdr_zero;
 
 	/* Send any write-chunk data and build resp write-list */
 	if (wr_ary) {
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index ca2799af05a6..174928fe5f83 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -1002,6 +1002,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	newxprt->sc_max_req_size = svcrdma_max_req_size;
 	newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr,
 					 svcrdma_max_requests);
+	newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests);
 	newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr,
 					    svcrdma_max_bc_requests);
 	newxprt->sc_rq_depth = newxprt->sc_max_requests +
-- 
cgit v1.2.3


From aba7d14ba18c93a2ab37d50b057a885964ef285c Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 7 Feb 2017 11:58:48 -0500
Subject: svcrdma: Remove unused sc_dto_q field

Clean up. Commit be99bb11400c ("svcrdma: Use new CQ API for
RPC-over-RDMA server send CQs") removed code that used the sc_dto_q
field, but neglected to remove sc_dto_q at the same time.

Fixes: be99bb11400c ("svcrdma: Use new CQ API for RPC-over- ...")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc_rdma.h          | 1 -
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 1 -
 2 files changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 25ecf92cae96..f77a7bc1612c 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -172,7 +172,6 @@ struct svcxprt_rdma {
 
 	wait_queue_head_t    sc_send_wait;	/* SQ exhaustion waitlist */
 	unsigned long	     sc_flags;
-	struct list_head     sc_dto_q;		/* DTO tasklet I/O pending Q */
 	struct list_head     sc_read_complete_q;
 	struct work_struct   sc_work;
 };
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 174928fe5f83..023eaa01157f 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -557,7 +557,6 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
 		return NULL;
 	svc_xprt_init(&init_net, &svc_rdma_class, &cma_xprt->sc_xprt, serv);
 	INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
-	INIT_LIST_HEAD(&cma_xprt->sc_dto_q);
 	INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
 	INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
 	INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
-- 
cgit v1.2.3


From a3ab867fa64f9aedb3b01d570db5b43d2fc355fc Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 7 Feb 2017 11:58:56 -0500
Subject: svcrdma: Combine list fields in struct svc_rdma_op_ctxt

Clean up: The free list and the dto_q list fields are never used at
the same time. Reduce the size of struct svc_rdma_op_ctxt by
combining these fields.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc_rdma.h          |  3 +--
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c  | 14 ++++++--------
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 33 +++++++++++++++-----------------
 3 files changed, 22 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index f77a7bc1612c..b105f73e3ca2 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -70,7 +70,7 @@ extern atomic_t rdma_stat_sq_prod;
  * completes.
  */
 struct svc_rdma_op_ctxt {
-	struct list_head free;
+	struct list_head list;
 	struct svc_rdma_op_ctxt *read_hdr;
 	struct svc_rdma_fastreg_mr *frmr;
 	int hdr_count;
@@ -78,7 +78,6 @@ struct svc_rdma_op_ctxt {
 	struct ib_cqe cqe;
 	struct ib_cqe reg_cqe;
 	struct ib_cqe inv_cqe;
-	struct list_head dto_q;
 	u32 byte_len;
 	u32 position;
 	struct svcxprt_rdma *xprt;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 172b537f8cfc..b9ccd73631a9 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -608,18 +608,16 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 
 	spin_lock_bh(&rdma_xprt->sc_rq_dto_lock);
 	if (!list_empty(&rdma_xprt->sc_read_complete_q)) {
-		ctxt = list_entry(rdma_xprt->sc_read_complete_q.next,
-				  struct svc_rdma_op_ctxt,
-				  dto_q);
-		list_del_init(&ctxt->dto_q);
+		ctxt = list_first_entry(&rdma_xprt->sc_read_complete_q,
+					struct svc_rdma_op_ctxt, list);
+		list_del(&ctxt->list);
 		spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
 		rdma_read_complete(rqstp, ctxt);
 		goto complete;
 	} else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
-		ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
-				  struct svc_rdma_op_ctxt,
-				  dto_q);
-		list_del_init(&ctxt->dto_q);
+		ctxt = list_first_entry(&rdma_xprt->sc_rq_dto_q,
+					struct svc_rdma_op_ctxt, list);
+		list_del(&ctxt->list);
 	} else {
 		atomic_inc(&rdma_stat_rq_starve);
 		clear_bit(XPT_DATA, &xprt->xpt_flags);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 023eaa01157f..87b8b5a10324 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -157,8 +157,7 @@ static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt,
 	ctxt = kmalloc(sizeof(*ctxt), flags);
 	if (ctxt) {
 		ctxt->xprt = xprt;
-		INIT_LIST_HEAD(&ctxt->free);
-		INIT_LIST_HEAD(&ctxt->dto_q);
+		INIT_LIST_HEAD(&ctxt->list);
 	}
 	return ctxt;
 }
@@ -180,7 +179,7 @@ static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt)
 			dprintk("svcrdma: No memory for RDMA ctxt\n");
 			return false;
 		}
-		list_add(&ctxt->free, &xprt->sc_ctxts);
+		list_add(&ctxt->list, &xprt->sc_ctxts);
 	}
 	return true;
 }
@@ -195,8 +194,8 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
 		goto out_empty;
 
 	ctxt = list_first_entry(&xprt->sc_ctxts,
-				struct svc_rdma_op_ctxt, free);
-	list_del_init(&ctxt->free);
+				struct svc_rdma_op_ctxt, list);
+	list_del(&ctxt->list);
 	spin_unlock_bh(&xprt->sc_ctxt_lock);
 
 out:
@@ -256,7 +255,7 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
 
 	spin_lock_bh(&xprt->sc_ctxt_lock);
 	xprt->sc_ctxt_used--;
-	list_add(&ctxt->free, &xprt->sc_ctxts);
+	list_add(&ctxt->list, &xprt->sc_ctxts);
 	spin_unlock_bh(&xprt->sc_ctxt_lock);
 }
 
@@ -266,8 +265,8 @@ static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
 		struct svc_rdma_op_ctxt *ctxt;
 
 		ctxt = list_first_entry(&xprt->sc_ctxts,
-					struct svc_rdma_op_ctxt, free);
-		list_del(&ctxt->free);
+					struct svc_rdma_op_ctxt, list);
+		list_del(&ctxt->list);
 		kfree(ctxt);
 	}
 }
@@ -404,7 +403,7 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
 	/* All wc fields are now known to be valid */
 	ctxt->byte_len = wc->byte_len;
 	spin_lock(&xprt->sc_rq_dto_lock);
-	list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q);
+	list_add_tail(&ctxt->list, &xprt->sc_rq_dto_q);
 	spin_unlock(&xprt->sc_rq_dto_lock);
 
 	set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
@@ -525,7 +524,7 @@ void svc_rdma_wc_read(struct ib_cq *cq, struct ib_wc *wc)
 
 		read_hdr = ctxt->read_hdr;
 		spin_lock(&xprt->sc_rq_dto_lock);
-		list_add_tail(&read_hdr->dto_q,
+		list_add_tail(&read_hdr->list,
 			      &xprt->sc_read_complete_q);
 		spin_unlock(&xprt->sc_rq_dto_lock);
 
@@ -1213,20 +1212,18 @@ static void __svc_rdma_free(struct work_struct *work)
 	 */
 	while (!list_empty(&rdma->sc_read_complete_q)) {
 		struct svc_rdma_op_ctxt *ctxt;
-		ctxt = list_entry(rdma->sc_read_complete_q.next,
-				  struct svc_rdma_op_ctxt,
-				  dto_q);
-		list_del_init(&ctxt->dto_q);
+		ctxt = list_first_entry(&rdma->sc_read_complete_q,
+					struct svc_rdma_op_ctxt, list);
+		list_del(&ctxt->list);
 		svc_rdma_put_context(ctxt, 1);
 	}
 
 	/* Destroy queued, but not processed recv completions */
 	while (!list_empty(&rdma->sc_rq_dto_q)) {
 		struct svc_rdma_op_ctxt *ctxt;
-		ctxt = list_entry(rdma->sc_rq_dto_q.next,
-				  struct svc_rdma_op_ctxt,
-				  dto_q);
-		list_del_init(&ctxt->dto_q);
+		ctxt = list_first_entry(&rdma->sc_rq_dto_q,
+					struct svc_rdma_op_ctxt, list);
+		list_del(&ctxt->list);
 		svc_rdma_put_context(ctxt, 1);
 	}
 
-- 
cgit v1.2.3


From 6a2cac549b368960c9cd6a993f2f2cc6d720e935 Mon Sep 17 00:00:00 2001
From: LABBE Corentin <clabbe.montjoie@gmail.com>
Date: Wed, 8 Feb 2017 09:31:07 +0100
Subject: net: stmmac: Remove the bus_setup function pointer

The bus_setup function pointer is not used at all, this patch remove it.

Signed-off-by: Corentin Labbe <clabbe.montjoie@gmail.com>
Acked-by: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 4 ----
 include/linux/stmmac.h                            | 1 -
 2 files changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index bd83bf9ef326..1ef602820110 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -1682,10 +1682,6 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp)
 	/* Copy the MAC addr into the HW  */
 	priv->hw->mac->set_umac_addr(priv->hw, dev->dev_addr, 0);
 
-	/* If required, perform hw setup of the bus. */
-	if (priv->plat->bus_setup)
-		priv->plat->bus_setup(priv->ioaddr);
-
 	/* PS and related bits will be programmed according to the speed */
 	if (priv->hw->pcs) {
 		int speed = priv->plat->mac_port_sel_speed;
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index d76033d6726d..fc273e9d5f67 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -134,7 +134,6 @@ struct plat_stmmacenet_data {
 	int tx_fifo_size;
 	int rx_fifo_size;
 	void (*fix_mac_speed)(void *priv, unsigned int speed);
-	void (*bus_setup)(void __iomem *ioaddr);
 	int (*init)(struct platform_device *pdev, void *priv);
 	void (*exit)(struct platform_device *pdev, void *priv);
 	void *bsp_priv;
-- 
cgit v1.2.3


From 34fe7c05400663e01e23cddd1fea68bb7a2b3d29 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 8 Feb 2017 14:46:48 +0100
Subject: block: enumify ELEVATOR_*_MERGE

Switch these constants to an enum, and make let the compiler ensure that
all callers of blk_try_merge and elv_merge handle all potential values.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c         | 76 +++++++++++++++++++++++++-----------------------
 block/blk-merge.c        |  2 +-
 block/blk-mq-sched.c     | 35 +++++++++++-----------
 block/blk-mq.c           | 32 +++++++++-----------
 block/blk.h              |  2 +-
 block/cfq-iosched.c      |  4 +--
 block/deadline-iosched.c | 12 +++-----
 block/elevator.c         | 10 ++++---
 block/mq-deadline.c      |  2 +-
 include/linux/elevator.h | 28 ++++++++++--------
 10 files changed, 102 insertions(+), 101 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 1d263311353a..00e053c704a1 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1511,12 +1511,11 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
 {
 	struct blk_plug *plug;
 	struct request *rq;
-	bool ret = false;
 	struct list_head *plug_list;
 
 	plug = current->plug;
 	if (!plug)
-		goto out;
+		return false;
 	*request_count = 0;
 
 	if (q->mq_ops)
@@ -1525,7 +1524,7 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
 		plug_list = &plug->list;
 
 	list_for_each_entry_reverse(rq, plug_list, queuelist) {
-		int el_ret;
+		bool merged = false;
 
 		if (rq->q == q) {
 			(*request_count)++;
@@ -1541,19 +1540,22 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
 		if (rq->q != q || !blk_rq_merge_ok(rq, bio))
 			continue;
 
-		el_ret = blk_try_merge(rq, bio);
-		if (el_ret == ELEVATOR_BACK_MERGE) {
-			ret = bio_attempt_back_merge(q, rq, bio);
-			if (ret)
-				break;
-		} else if (el_ret == ELEVATOR_FRONT_MERGE) {
-			ret = bio_attempt_front_merge(q, rq, bio);
-			if (ret)
-				break;
+		switch (blk_try_merge(rq, bio)) {
+		case ELEVATOR_BACK_MERGE:
+			merged = bio_attempt_back_merge(q, rq, bio);
+			break;
+		case ELEVATOR_FRONT_MERGE:
+			merged = bio_attempt_front_merge(q, rq, bio);
+			break;
+		default:
+			break;
 		}
+
+		if (merged)
+			return true;
 	}
-out:
-	return ret;
+
+	return false;
 }
 
 unsigned int blk_plug_queued_count(struct request_queue *q)
@@ -1595,7 +1597,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
 {
 	struct blk_plug *plug;
-	int el_ret, where = ELEVATOR_INSERT_SORT;
+	int where = ELEVATOR_INSERT_SORT;
 	struct request *req, *free;
 	unsigned int request_count = 0;
 	unsigned int wb_acct;
@@ -1633,27 +1635,29 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
 
 	spin_lock_irq(q->queue_lock);
 
-	el_ret = elv_merge(q, &req, bio);
-	if (el_ret == ELEVATOR_BACK_MERGE) {
-		if (bio_attempt_back_merge(q, req, bio)) {
-			elv_bio_merged(q, req, bio);
-			free = attempt_back_merge(q, req);
-			if (!free)
-				elv_merged_request(q, req, el_ret);
-			else
-				__blk_put_request(q, free);
-			goto out_unlock;
-		}
-	} else if (el_ret == ELEVATOR_FRONT_MERGE) {
-		if (bio_attempt_front_merge(q, req, bio)) {
-			elv_bio_merged(q, req, bio);
-			free = attempt_front_merge(q, req);
-			if (!free)
-				elv_merged_request(q, req, el_ret);
-			else
-				__blk_put_request(q, free);
-			goto out_unlock;
-		}
+	switch (elv_merge(q, &req, bio)) {
+	case ELEVATOR_BACK_MERGE:
+		if (!bio_attempt_back_merge(q, req, bio))
+			break;
+		elv_bio_merged(q, req, bio);
+		free = attempt_back_merge(q, req);
+		if (free)
+			__blk_put_request(q, free);
+		else
+			elv_merged_request(q, req, ELEVATOR_BACK_MERGE);
+		goto out_unlock;
+	case ELEVATOR_FRONT_MERGE:
+		if (!bio_attempt_front_merge(q, req, bio))
+			break;
+		elv_bio_merged(q, req, bio);
+		free = attempt_front_merge(q, req);
+		if (free)
+			__blk_put_request(q, free);
+		else
+			elv_merged_request(q, req, ELEVATOR_FRONT_MERGE);
+		goto out_unlock;
+	default:
+		break;
 	}
 
 get_rq:
diff --git a/block/blk-merge.c b/block/blk-merge.c
index c956d9e7aafd..6cbd90ad5f90 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -801,7 +801,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
 	return true;
 }
 
-int blk_try_merge(struct request *rq, struct bio *bio)
+enum elv_merge blk_try_merge(struct request *rq, struct bio *bio)
 {
 	if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector)
 		return ELEVATOR_BACK_MERGE;
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index ee455e7cf9d8..72d0d8361175 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -238,30 +238,29 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
 			    struct request **merged_request)
 {
 	struct request *rq;
-	int ret;
 
-	ret = elv_merge(q, &rq, bio);
-	if (ret == ELEVATOR_BACK_MERGE) {
+	switch (elv_merge(q, &rq, bio)) {
+	case ELEVATOR_BACK_MERGE:
 		if (!blk_mq_sched_allow_merge(q, rq, bio))
 			return false;
-		if (bio_attempt_back_merge(q, rq, bio)) {
-			*merged_request = attempt_back_merge(q, rq);
-			if (!*merged_request)
-				elv_merged_request(q, rq, ret);
-			return true;
-		}
-	} else if (ret == ELEVATOR_FRONT_MERGE) {
+		if (!bio_attempt_back_merge(q, rq, bio))
+			return false;
+		*merged_request = attempt_back_merge(q, rq);
+		if (!*merged_request)
+			elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
+		return true;
+	case ELEVATOR_FRONT_MERGE:
 		if (!blk_mq_sched_allow_merge(q, rq, bio))
 			return false;
-		if (bio_attempt_front_merge(q, rq, bio)) {
-			*merged_request = attempt_front_merge(q, rq);
-			if (!*merged_request)
-				elv_merged_request(q, rq, ret);
-			return true;
-		}
+		if (!bio_attempt_front_merge(q, rq, bio))
+			return false;
+		*merged_request = attempt_front_merge(q, rq);
+		if (!*merged_request)
+			elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
+		return true;
+	default:
+		return false;
 	}
-
-	return false;
 }
 EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index e9957df05700..dd9722df4afe 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -763,7 +763,7 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
 	int checked = 8;
 
 	list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
-		int el_ret;
+		bool merged = false;
 
 		if (!checked--)
 			break;
@@ -771,26 +771,22 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
 		if (!blk_rq_merge_ok(rq, bio))
 			continue;
 
-		el_ret = blk_try_merge(rq, bio);
-		if (el_ret == ELEVATOR_NO_MERGE)
-			continue;
-
-		if (!blk_mq_sched_allow_merge(q, rq, bio))
+		switch (blk_try_merge(rq, bio)) {
+		case ELEVATOR_BACK_MERGE:
+			if (blk_mq_sched_allow_merge(q, rq, bio))
+				merged = bio_attempt_back_merge(q, rq, bio);
 			break;
-
-		if (el_ret == ELEVATOR_BACK_MERGE) {
-			if (bio_attempt_back_merge(q, rq, bio)) {
-				ctx->rq_merged++;
-				return true;
-			}
-			break;
-		} else if (el_ret == ELEVATOR_FRONT_MERGE) {
-			if (bio_attempt_front_merge(q, rq, bio)) {
-				ctx->rq_merged++;
-				return true;
-			}
+		case ELEVATOR_FRONT_MERGE:
+			if (blk_mq_sched_allow_merge(q, rq, bio))
+				merged = bio_attempt_front_merge(q, rq, bio);
 			break;
+		default:
+			continue;
 		}
+
+		if (merged)
+			ctx->rq_merged++;
+		return merged;
 	}
 
 	return false;
diff --git a/block/blk.h b/block/blk.h
index 3e08703902a9..ae82f2ac4019 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -215,7 +215,7 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
 void blk_recalc_rq_segments(struct request *rq);
 void blk_rq_set_mixed_merge(struct request *rq);
 bool blk_rq_merge_ok(struct request *rq, struct bio *bio);
-int blk_try_merge(struct request *rq, struct bio *bio);
+enum elv_merge blk_try_merge(struct request *rq, struct bio *bio);
 
 void blk_queue_congestion_threshold(struct request_queue *q);
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index f0f29ee731e1..921262770636 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2528,7 +2528,7 @@ static void cfq_remove_request(struct request *rq)
 	}
 }
 
-static int cfq_merge(struct request_queue *q, struct request **req,
+static enum elv_merge cfq_merge(struct request_queue *q, struct request **req,
 		     struct bio *bio)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
@@ -2544,7 +2544,7 @@ static int cfq_merge(struct request_queue *q, struct request **req,
 }
 
 static void cfq_merged_request(struct request_queue *q, struct request *req,
-			       int type)
+			       enum elv_merge type)
 {
 	if (type == ELEVATOR_FRONT_MERGE) {
 		struct cfq_queue *cfqq = RQ_CFQQ(req);
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 05fc0ea25a98..c68f6bbc0dcd 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -120,12 +120,11 @@ static void deadline_remove_request(struct request_queue *q, struct request *rq)
 	deadline_del_rq_rb(dd, rq);
 }
 
-static int
+static enum elv_merge
 deadline_merge(struct request_queue *q, struct request **req, struct bio *bio)
 {
 	struct deadline_data *dd = q->elevator->elevator_data;
 	struct request *__rq;
-	int ret;
 
 	/*
 	 * check for front merge
@@ -138,20 +137,17 @@ deadline_merge(struct request_queue *q, struct request **req, struct bio *bio)
 			BUG_ON(sector != blk_rq_pos(__rq));
 
 			if (elv_bio_merge_ok(__rq, bio)) {
-				ret = ELEVATOR_FRONT_MERGE;
-				goto out;
+				*req = __rq;
+				return ELEVATOR_FRONT_MERGE;
 			}
 		}
 	}
 
 	return ELEVATOR_NO_MERGE;
-out:
-	*req = __rq;
-	return ret;
 }
 
 static void deadline_merged_request(struct request_queue *q,
-				    struct request *req, int type)
+				    struct request *req, enum elv_merge type)
 {
 	struct deadline_data *dd = q->elevator->elevator_data;
 
diff --git a/block/elevator.c b/block/elevator.c
index 7e4f5880dd64..27ff1ed5a6fa 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -427,11 +427,11 @@ void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
 }
 EXPORT_SYMBOL(elv_dispatch_add_tail);
 
-int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
+enum elv_merge elv_merge(struct request_queue *q, struct request **req,
+		struct bio *bio)
 {
 	struct elevator_queue *e = q->elevator;
 	struct request *__rq;
-	int ret;
 
 	/*
 	 * Levels of merges:
@@ -446,7 +446,8 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
 	 * First try one-hit cache.
 	 */
 	if (q->last_merge && elv_bio_merge_ok(q->last_merge, bio)) {
-		ret = blk_try_merge(q->last_merge, bio);
+		enum elv_merge ret = blk_try_merge(q->last_merge, bio);
+
 		if (ret != ELEVATOR_NO_MERGE) {
 			*req = q->last_merge;
 			return ret;
@@ -514,7 +515,8 @@ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
 	return ret;
 }
 
-void elv_merged_request(struct request_queue *q, struct request *rq, int type)
+void elv_merged_request(struct request_queue *q, struct request *rq,
+		enum elv_merge type)
 {
 	struct elevator_queue *e = q->elevator;
 
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index d68d9c273a66..236121633ca0 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -121,7 +121,7 @@ static void deadline_remove_request(struct request_queue *q, struct request *rq)
 }
 
 static void dd_request_merged(struct request_queue *q, struct request *req,
-			      int type)
+			      enum elv_merge type)
 {
 	struct deadline_data *dd = q->elevator->elevator_data;
 
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index b5825c4f06f7..b38b4e651ea6 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -9,12 +9,21 @@
 struct io_cq;
 struct elevator_type;
 
-typedef int (elevator_merge_fn) (struct request_queue *, struct request **,
+/*
+ * Return values from elevator merger
+ */
+enum elv_merge {
+	ELEVATOR_NO_MERGE	= 0,
+	ELEVATOR_FRONT_MERGE	= 1,
+	ELEVATOR_BACK_MERGE	= 2,
+};
+
+typedef enum elv_merge (elevator_merge_fn) (struct request_queue *, struct request **,
 				 struct bio *);
 
 typedef void (elevator_merge_req_fn) (struct request_queue *, struct request *, struct request *);
 
-typedef void (elevator_merged_fn) (struct request_queue *, struct request *, int);
+typedef void (elevator_merged_fn) (struct request_queue *, struct request *, enum elv_merge);
 
 typedef int (elevator_allow_bio_merge_fn) (struct request_queue *,
 					   struct request *, struct bio *);
@@ -87,7 +96,7 @@ struct elevator_mq_ops {
 	bool (*allow_merge)(struct request_queue *, struct request *, struct bio *);
 	bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *);
 	int (*request_merge)(struct request_queue *q, struct request **, struct bio *);
-	void (*request_merged)(struct request_queue *, struct request *, int);
+	void (*request_merged)(struct request_queue *, struct request *, enum elv_merge);
 	void (*requests_merged)(struct request_queue *, struct request *, struct request *);
 	struct request *(*get_request)(struct request_queue *, unsigned int, struct blk_mq_alloc_data *);
 	void (*put_request)(struct request *);
@@ -166,10 +175,12 @@ extern void elv_dispatch_sort(struct request_queue *, struct request *);
 extern void elv_dispatch_add_tail(struct request_queue *, struct request *);
 extern void elv_add_request(struct request_queue *, struct request *, int);
 extern void __elv_add_request(struct request_queue *, struct request *, int);
-extern int elv_merge(struct request_queue *, struct request **, struct bio *);
+extern enum elv_merge elv_merge(struct request_queue *, struct request **,
+		struct bio *);
 extern void elv_merge_requests(struct request_queue *, struct request *,
 			       struct request *);
-extern void elv_merged_request(struct request_queue *, struct request *, int);
+extern void elv_merged_request(struct request_queue *, struct request *,
+		enum elv_merge);
 extern void elv_bio_merged(struct request_queue *q, struct request *,
 				struct bio *);
 extern bool elv_attempt_insert_merge(struct request_queue *, struct request *);
@@ -218,13 +229,6 @@ extern void elv_rb_add(struct rb_root *, struct request *);
 extern void elv_rb_del(struct rb_root *, struct request *);
 extern struct request *elv_rb_find(struct rb_root *, sector_t);
 
-/*
- * Return values from elevator merger
- */
-#define ELEVATOR_NO_MERGE	0
-#define ELEVATOR_FRONT_MERGE	1
-#define ELEVATOR_BACK_MERGE	2
-
 /*
  * Insertion selection
  */
-- 
cgit v1.2.3


From 1e739730c5b9ea80a2f25e9cf6e1025d47e3d8ed Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 8 Feb 2017 14:46:49 +0100
Subject: block: optionally merge discontiguous discard bios into a single
 request

Add a new merge strategy that merges discard bios into a request until the
maximum number of discard ranges (or the maximum discard size) is reached
from the plug merging code.  I/O scheduler merging is not wired up yet
but might also be useful, although not for fast devices like NVMe which
are the only user for now.

Note that for now we don't support limiting the size of each discard range,
but if needed that can be added later.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c         | 27 +++++++++++++++++++++++++++
 block/blk-merge.c        |  5 ++++-
 block/blk-mq.c           |  3 +++
 block/blk-settings.c     | 20 ++++++++++++++++++++
 block/blk-sysfs.c        | 12 ++++++++++++
 block/blk.h              |  2 ++
 include/linux/blkdev.h   | 26 ++++++++++++++++++++++++++
 include/linux/elevator.h |  1 +
 8 files changed, 95 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 00e053c704a1..c0e4d41d3d33 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1483,6 +1483,30 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
 	return true;
 }
 
+bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
+		struct bio *bio)
+{
+	unsigned short segments = blk_rq_nr_discard_segments(req);
+
+	if (segments >= queue_max_discard_segments(q))
+		goto no_merge;
+	if (blk_rq_sectors(req) + bio_sectors(bio) >
+	    blk_rq_get_max_sectors(req, blk_rq_pos(req)))
+		goto no_merge;
+
+	req->biotail->bi_next = bio;
+	req->biotail = bio;
+	req->__data_len += bio->bi_iter.bi_size;
+	req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
+	req->nr_phys_segments = segments + 1;
+
+	blk_account_io_start(req, false);
+	return true;
+no_merge:
+	req_set_nomerge(q, req);
+	return false;
+}
+
 /**
  * blk_attempt_plug_merge - try to merge with %current's plugged list
  * @q: request_queue new bio is being queued at
@@ -1547,6 +1571,9 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
 		case ELEVATOR_FRONT_MERGE:
 			merged = bio_attempt_front_merge(q, rq, bio);
 			break;
+		case ELEVATOR_DISCARD_MERGE:
+			merged = bio_attempt_discard_merge(q, rq, bio);
+			break;
 		default:
 			break;
 		}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 6cbd90ad5f90..2afa262425d1 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -803,7 +803,10 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
 
 enum elv_merge blk_try_merge(struct request *rq, struct bio *bio)
 {
-	if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector)
+	if (req_op(rq) == REQ_OP_DISCARD &&
+	    queue_max_discard_segments(rq->q) > 1)
+		return ELEVATOR_DISCARD_MERGE;
+	else if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector)
 		return ELEVATOR_BACK_MERGE;
 	else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector)
 		return ELEVATOR_FRONT_MERGE;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index dd9722df4afe..7412191aee57 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -780,6 +780,9 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
 			if (blk_mq_sched_allow_merge(q, rq, bio))
 				merged = bio_attempt_front_merge(q, rq, bio);
 			break;
+		case ELEVATOR_DISCARD_MERGE:
+			merged = bio_attempt_discard_merge(q, rq, bio);
+			break;
 		default:
 			continue;
 		}
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 6eb19bcbf3cb..1e7174ffc9d4 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -88,6 +88,7 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
 void blk_set_default_limits(struct queue_limits *lim)
 {
 	lim->max_segments = BLK_MAX_SEGMENTS;
+	lim->max_discard_segments = 1;
 	lim->max_integrity_segments = 0;
 	lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
 	lim->virt_boundary_mask = 0;
@@ -128,6 +129,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
 	/* Inherit limits from component devices */
 	lim->discard_zeroes_data = 1;
 	lim->max_segments = USHRT_MAX;
+	lim->max_discard_segments = 1;
 	lim->max_hw_sectors = UINT_MAX;
 	lim->max_segment_size = UINT_MAX;
 	lim->max_sectors = UINT_MAX;
@@ -336,6 +338,22 @@ void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments
 }
 EXPORT_SYMBOL(blk_queue_max_segments);
 
+/**
+ * blk_queue_max_discard_segments - set max segments for discard requests
+ * @q:  the request queue for the device
+ * @max_segments:  max number of segments
+ *
+ * Description:
+ *    Enables a low level driver to set an upper limit on the number of
+ *    segments in a discard request.
+ **/
+void blk_queue_max_discard_segments(struct request_queue *q,
+		unsigned short max_segments)
+{
+	q->limits.max_discard_segments = max_segments;
+}
+EXPORT_SYMBOL_GPL(blk_queue_max_discard_segments);
+
 /**
  * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg
  * @q:  the request queue for the device
@@ -553,6 +571,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 					    b->virt_boundary_mask);
 
 	t->max_segments = min_not_zero(t->max_segments, b->max_segments);
+	t->max_discard_segments = min_not_zero(t->max_discard_segments,
+					       b->max_discard_segments);
 	t->max_integrity_segments = min_not_zero(t->max_integrity_segments,
 						 b->max_integrity_segments);
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 48032c4759a7..070d81bae1d5 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -121,6 +121,12 @@ static ssize_t queue_max_segments_show(struct request_queue *q, char *page)
 	return queue_var_show(queue_max_segments(q), (page));
 }
 
+static ssize_t queue_max_discard_segments_show(struct request_queue *q,
+		char *page)
+{
+	return queue_var_show(queue_max_discard_segments(q), (page));
+}
+
 static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page)
 {
 	return queue_var_show(q->limits.max_integrity_segments, (page));
@@ -545,6 +551,11 @@ static struct queue_sysfs_entry queue_max_segments_entry = {
 	.show = queue_max_segments_show,
 };
 
+static struct queue_sysfs_entry queue_max_discard_segments_entry = {
+	.attr = {.name = "max_discard_segments", .mode = S_IRUGO },
+	.show = queue_max_discard_segments_show,
+};
+
 static struct queue_sysfs_entry queue_max_integrity_segments_entry = {
 	.attr = {.name = "max_integrity_segments", .mode = S_IRUGO },
 	.show = queue_max_integrity_segments_show,
@@ -697,6 +708,7 @@ static struct attribute *default_attrs[] = {
 	&queue_max_hw_sectors_entry.attr,
 	&queue_max_sectors_entry.attr,
 	&queue_max_segments_entry.attr,
+	&queue_max_discard_segments_entry.attr,
 	&queue_max_integrity_segments_entry.attr,
 	&queue_max_segment_size_entry.attr,
 	&queue_iosched_entry.attr,
diff --git a/block/blk.h b/block/blk.h
index ae82f2ac4019..d1ea4bd9b9a3 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -100,6 +100,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
 			     struct bio *bio);
 bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
 			    struct bio *bio);
+bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
+		struct bio *bio);
 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
 			    unsigned int *request_count,
 			    struct request **same_queue_rq);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e0bac14347e6..aecca0e7d9ca 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -331,6 +331,7 @@ struct queue_limits {
 	unsigned short		logical_block_size;
 	unsigned short		max_segments;
 	unsigned short		max_integrity_segments;
+	unsigned short		max_discard_segments;
 
 	unsigned char		misaligned;
 	unsigned char		discard_misaligned;
@@ -1146,6 +1147,8 @@ extern void blk_queue_bounce_limit(struct request_queue *, u64);
 extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_max_segments(struct request_queue *, unsigned short);
+extern void blk_queue_max_discard_segments(struct request_queue *,
+		unsigned short);
 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
 extern void blk_queue_max_discard_sectors(struct request_queue *q,
 		unsigned int max_discard_sectors);
@@ -1189,6 +1192,15 @@ extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
 extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
 extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);
 
+/*
+ * Number of physical segments as sent to the device.
+ *
+ * Normally this is the number of discontiguous data segments sent by the
+ * submitter.  But for data-less command like discard we might have no
+ * actual data segments submitted, but the driver might have to add it's
+ * own special payload.  In that case we still return 1 here so that this
+ * special payload will be mapped.
+ */
 static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
 {
 	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
@@ -1196,6 +1208,15 @@ static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
 	return rq->nr_phys_segments;
 }
 
+/*
+ * Number of discard segments (or ranges) the driver needs to fill in.
+ * Each discard bio merged into a request is counted as one segment.
+ */
+static inline unsigned short blk_rq_nr_discard_segments(struct request *rq)
+{
+	return max_t(unsigned short, rq->nr_phys_segments, 1);
+}
+
 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
 extern long nr_blockdev_pages(void);
@@ -1384,6 +1405,11 @@ static inline unsigned short queue_max_segments(struct request_queue *q)
 	return q->limits.max_segments;
 }
 
+static inline unsigned short queue_max_discard_segments(struct request_queue *q)
+{
+	return q->limits.max_discard_segments;
+}
+
 static inline unsigned int queue_max_segment_size(struct request_queue *q)
 {
 	return q->limits.max_segment_size;
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index b38b4e651ea6..8265b6330cc2 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -16,6 +16,7 @@ enum elv_merge {
 	ELEVATOR_NO_MERGE	= 0,
 	ELEVATOR_FRONT_MERGE	= 1,
 	ELEVATOR_BACK_MERGE	= 2,
+	ELEVATOR_DISCARD_MERGE	= 3,
 };
 
 typedef enum elv_merge (elevator_merge_fn) (struct request_queue *, struct request **,
-- 
cgit v1.2.3


From b35ba01ea6979125e9c23fb322517748278f15e6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 8 Feb 2017 14:46:50 +0100
Subject: nvme: support ranged discard requests

NVMe supports up to 256 ranges per DSM command, so wire up support
for ranged discards up to that limit.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/core.c | 30 +++++++++++++++++++++++-------
 include/linux/nvme.h     |  2 ++
 2 files changed, 25 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 1640a5c8abbb..2701c21d1719 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -238,26 +238,38 @@ static inline void nvme_setup_flush(struct nvme_ns *ns,
 static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 		struct nvme_command *cmnd)
 {
+	unsigned short segments = blk_rq_nr_discard_segments(req), n = 0;
 	struct nvme_dsm_range *range;
-	unsigned int nr_bytes = blk_rq_bytes(req);
+	struct bio *bio;
 
-	range = kmalloc(sizeof(*range), GFP_ATOMIC);
+	range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC);
 	if (!range)
 		return BLK_MQ_RQ_QUEUE_BUSY;
 
-	range->cattr = cpu_to_le32(0);
-	range->nlb = cpu_to_le32(nr_bytes >> ns->lba_shift);
-	range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
+	__rq_for_each_bio(bio, req) {
+		u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector);
+		u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
+
+		range[n].cattr = cpu_to_le32(0);
+		range[n].nlb = cpu_to_le32(nlb);
+		range[n].slba = cpu_to_le64(slba);
+		n++;
+	}
+
+	if (WARN_ON_ONCE(n != segments)) {
+		kfree(range);
+		return BLK_MQ_RQ_QUEUE_ERROR;
+	}
 
 	memset(cmnd, 0, sizeof(*cmnd));
 	cmnd->dsm.opcode = nvme_cmd_dsm;
 	cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
-	cmnd->dsm.nr = 0;
+	cmnd->dsm.nr = segments - 1;
 	cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
 
 	req->special_vec.bv_page = virt_to_page(range);
 	req->special_vec.bv_offset = offset_in_page(range);
-	req->special_vec.bv_len = sizeof(*range);
+	req->special_vec.bv_len = sizeof(*range) * segments;
 	req->rq_flags |= RQF_SPECIAL_PAYLOAD;
 
 	return BLK_MQ_RQ_QUEUE_OK;
@@ -871,6 +883,9 @@ static void nvme_config_discard(struct nvme_ns *ns)
 	struct nvme_ctrl *ctrl = ns->ctrl;
 	u32 logical_block_size = queue_logical_block_size(ns->queue);
 
+	BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
+			NVME_DSM_MAX_RANGES);
+
 	if (ctrl->quirks & NVME_QUIRK_DISCARD_ZEROES)
 		ns->queue->limits.discard_zeroes_data = 1;
 	else
@@ -879,6 +894,7 @@ static void nvme_config_discard(struct nvme_ns *ns)
 	ns->queue->limits.discard_alignment = logical_block_size;
 	ns->queue->limits.discard_granularity = logical_block_size;
 	blk_queue_max_discard_sectors(ns->queue, UINT_MAX);
+	blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES);
 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
 }
 
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 3d1c6f1b15c9..3e2ed49c3ad8 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -553,6 +553,8 @@ enum {
 	NVME_DSMGMT_AD		= 1 << 2,
 };
 
+#define NVME_DSM_MAX_RANGES	256
+
 struct nvme_dsm_range {
 	__le32			cattr;
 	__le32			nlb;
-- 
cgit v1.2.3


From d6fc8821c2d2aba4cc18447a467f543e46e7367d Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Wed, 8 Feb 2017 09:54:42 +0800
Subject: SUNRPC/Cache: Always treat the invalid cache as unexpired

When the first time pynfs runs after rpc/nfsd startup, always get the warning,

"Got error: Connection closed"

I found the problem is caused by,
1. A new startup of nfsd, rpc.mountd, etc,
2. A rpc request from client (pynfs test, or normal mounting),
3. An ip_map cache is created but invalid, so upcall to rpc.mountd,
4. rpc.mountd process the ip_map upcall, before write the valid data to nfsd,
   do auth_reload(), and check_useipaddr(),
5. For the first time, old_use_ipaddr = -1, it causes rpc.mountd do write_flush    that doing cache_clean,
6. The ip_map cache will be treat as expired and clean,
7. When rpc.mountd write the valid data to nfsd, a new ip_map is created
   and updated, the cache_check of old ip_map(doing the upcall) will
   return -ETIMEDOUT.
8. RPC layer return SVC_CLOSE and close the xprt after commit 4d712ef1db05
   "svcauth_gss: Close connection when dropping an incoming message"

NeilBrown suggest in another email,

"If CACHE_VALID is not set, then there is no data in the cache item,
 so there is nothing to expire. So it would be nice if cache items that
 don't have CACHE_VALID are never treated as expired."

v3, change the order of the two patches
v2, change the checking of CACHE_PENDING to CACHE_VALID

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/cache.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index 9dcf2c8fe1c5..70738504d647 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -204,8 +204,11 @@ static inline void cache_put(struct cache_head *h, struct cache_detail *cd)
 	kref_put(&h->ref, cd->cache_put);
 }
 
-static inline int cache_is_expired(struct cache_detail *detail, struct cache_head *h)
+static inline bool cache_is_expired(struct cache_detail *detail, struct cache_head *h)
 {
+	if (!test_bit(CACHE_VALID, &h->flags))
+		return false;
+
 	return  (h->expiry_time < seconds_since_boot()) ||
 		(detail->flush_time >= h->last_refresh);
 }
-- 
cgit v1.2.3


From 6080ef6e7c0a0592cbcca11200d879faf65e27d4 Mon Sep 17 00:00:00 2001
From: Jeff Westfahl <jeff.westfahl@ni.com>
Date: Tue, 10 Jan 2017 13:30:17 -0600
Subject: mtd: introduce function max_bad_blocks

If implemented, 'max_bad_blocks' returns the maximum number of bad
blocks to reserve for a MTD. An implementation for NAND is coming soon.

Signed-off-by: Jeff Westfahl <jeff.westfahl@ni.com>
Signed-off-by: Zach Brown <zach.brown@ni.com>
Acked-by: Boris Brezillon <boris.brezillon@free-electron.com>
Acked-by: Brian Norris <computersforpeace@gmail.com>
Signed-off-by: Brian Norris <computersforpeace@gmail.com>
---
 drivers/mtd/mtdpart.c   | 10 ++++++++++
 include/linux/mtd/mtd.h | 13 +++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c
index fccdd49bb964..08925bb68fd6 100644
--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -349,6 +349,14 @@ static const struct mtd_ooblayout_ops part_ooblayout_ops = {
 	.free = part_ooblayout_free,
 };
 
+static int part_max_bad_blocks(struct mtd_info *mtd, loff_t ofs, size_t len)
+{
+	struct mtd_part *part = mtd_to_part(mtd);
+
+	return part->master->_max_bad_blocks(part->master,
+					     ofs + part->offset, len);
+}
+
 static inline void free_partition(struct mtd_part *p)
 {
 	kfree(p->mtd.name);
@@ -475,6 +483,8 @@ static struct mtd_part *allocate_partition(struct mtd_info *master,
 		slave->mtd._block_isbad = part_block_isbad;
 	if (master->_block_markbad)
 		slave->mtd._block_markbad = part_block_markbad;
+	if (master->_max_bad_blocks)
+		slave->mtd._max_bad_blocks = part_max_bad_blocks;
 
 	if (master->_get_device)
 		slave->mtd._get_device = part_get_device;
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 13f8052b9ff9..5bb42c6dacdc 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -322,6 +322,7 @@ struct mtd_info {
 	int (*_block_isreserved) (struct mtd_info *mtd, loff_t ofs);
 	int (*_block_isbad) (struct mtd_info *mtd, loff_t ofs);
 	int (*_block_markbad) (struct mtd_info *mtd, loff_t ofs);
+	int (*_max_bad_blocks) (struct mtd_info *mtd, loff_t ofs, size_t len);
 	int (*_suspend) (struct mtd_info *mtd);
 	void (*_resume) (struct mtd_info *mtd);
 	void (*_reboot) (struct mtd_info *mtd);
@@ -397,6 +398,18 @@ static inline int mtd_oobavail(struct mtd_info *mtd, struct mtd_oob_ops *ops)
 	return ops->mode == MTD_OPS_AUTO_OOB ? mtd->oobavail : mtd->oobsize;
 }
 
+static inline int mtd_max_bad_blocks(struct mtd_info *mtd,
+				     loff_t ofs, size_t len)
+{
+	if (!mtd->_max_bad_blocks)
+		return -ENOTSUPP;
+
+	if (mtd->size < (len + ofs) || ofs < 0)
+		return -EINVAL;
+
+	return mtd->_max_bad_blocks(mtd, ofs, len);
+}
+
 int mtd_wunit_to_pairing_info(struct mtd_info *mtd, int wunit,
 			      struct mtd_pairing_info *info);
 int mtd_pairing_info_to_wunit(struct mtd_info *mtd,
-- 
cgit v1.2.3


From 863d7d9c2e0cbbde6cd15f87c4431dd806f2d917 Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Tue, 7 Feb 2017 21:47:16 +0800
Subject: sunrpc/nfs: cleanup procfs/pipefs entry in cache_detail

Record flush/channel/content entries is useless, remove them.

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/cache_lib.c           |  3 +--
 include/linux/sunrpc/cache.h | 15 +++-----------
 net/sunrpc/cache.c           | 49 +++++++++++++++-----------------------------
 3 files changed, 21 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index 6de15709d024..2ae676f93e6b 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -141,8 +141,7 @@ int nfs_cache_register_net(struct net *net, struct cache_detail *cd)
 
 void nfs_cache_unregister_sb(struct super_block *sb, struct cache_detail *cd)
 {
-	if (cd->u.pipefs.dir)
-		sunrpc_cache_unregister_pipefs(cd);
+	sunrpc_cache_unregister_pipefs(cd);
 }
 
 void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd)
diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index 62a60eeacb0a..bb5c9c80f12e 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -63,15 +63,6 @@ struct cache_head {
 
 #define	CACHE_NEW_EXPIRY 120	/* keep new things pending confirmation for 120 seconds */
 
-struct cache_detail_procfs {
-	struct proc_dir_entry	*proc_ent;
-	struct proc_dir_entry   *flush_ent, *channel_ent, *content_ent;
-};
-
-struct cache_detail_pipefs {
-	struct dentry *dir;
-};
-
 struct cache_detail {
 	struct module *		owner;
 	int			hash_size;
@@ -123,9 +114,9 @@ struct cache_detail {
 	time_t			last_warn;		/* when we last warned about no readers */
 
 	union {
-		struct cache_detail_procfs procfs;
-		struct cache_detail_pipefs pipefs;
-	} u;
+		struct proc_dir_entry	*procfs;
+		struct dentry		*pipefs;
+	};
 	struct net		*net;
 };
 
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 8147e8d56eb2..688ef8cbac62 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -1600,21 +1600,12 @@ static const struct file_operations cache_flush_operations_procfs = {
 	.llseek		= no_llseek,
 };
 
-static void remove_cache_proc_entries(struct cache_detail *cd, struct net *net)
+static void remove_cache_proc_entries(struct cache_detail *cd)
 {
-	struct sunrpc_net *sn;
-
-	if (cd->u.procfs.proc_ent == NULL)
-		return;
-	if (cd->u.procfs.flush_ent)
-		remove_proc_entry("flush", cd->u.procfs.proc_ent);
-	if (cd->u.procfs.channel_ent)
-		remove_proc_entry("channel", cd->u.procfs.proc_ent);
-	if (cd->u.procfs.content_ent)
-		remove_proc_entry("content", cd->u.procfs.proc_ent);
-	cd->u.procfs.proc_ent = NULL;
-	sn = net_generic(net, sunrpc_net_id);
-	remove_proc_entry(cd->name, sn->proc_net_rpc);
+	if (cd->procfs) {
+		proc_remove(cd->procfs);
+		cd->procfs = NULL;
+	}
 }
 
 #ifdef CONFIG_PROC_FS
@@ -1624,38 +1615,30 @@ static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
 	struct sunrpc_net *sn;
 
 	sn = net_generic(net, sunrpc_net_id);
-	cd->u.procfs.proc_ent = proc_mkdir(cd->name, sn->proc_net_rpc);
-	if (cd->u.procfs.proc_ent == NULL)
+	cd->procfs = proc_mkdir(cd->name, sn->proc_net_rpc);
+	if (cd->procfs == NULL)
 		goto out_nomem;
-	cd->u.procfs.channel_ent = NULL;
-	cd->u.procfs.content_ent = NULL;
 
 	p = proc_create_data("flush", S_IFREG|S_IRUSR|S_IWUSR,
-			     cd->u.procfs.proc_ent,
-			     &cache_flush_operations_procfs, cd);
-	cd->u.procfs.flush_ent = p;
+			     cd->procfs, &cache_flush_operations_procfs, cd);
 	if (p == NULL)
 		goto out_nomem;
 
 	if (cd->cache_request || cd->cache_parse) {
 		p = proc_create_data("channel", S_IFREG|S_IRUSR|S_IWUSR,
-				     cd->u.procfs.proc_ent,
-				     &cache_file_operations_procfs, cd);
-		cd->u.procfs.channel_ent = p;
+				cd->procfs, &cache_file_operations_procfs, cd);
 		if (p == NULL)
 			goto out_nomem;
 	}
 	if (cd->cache_show) {
 		p = proc_create_data("content", S_IFREG|S_IRUSR,
-				cd->u.procfs.proc_ent,
-				&content_file_operations_procfs, cd);
-		cd->u.procfs.content_ent = p;
+				cd->procfs, &content_file_operations_procfs, cd);
 		if (p == NULL)
 			goto out_nomem;
 	}
 	return 0;
 out_nomem:
-	remove_cache_proc_entries(cd, net);
+	remove_cache_proc_entries(cd);
 	return -ENOMEM;
 }
 #else /* CONFIG_PROC_FS */
@@ -1684,7 +1667,7 @@ EXPORT_SYMBOL_GPL(cache_register_net);
 
 void cache_unregister_net(struct cache_detail *cd, struct net *net)
 {
-	remove_cache_proc_entries(cd, net);
+	remove_cache_proc_entries(cd);
 	sunrpc_destroy_cache_detail(cd);
 }
 EXPORT_SYMBOL_GPL(cache_unregister_net);
@@ -1843,15 +1826,17 @@ int sunrpc_cache_register_pipefs(struct dentry *parent,
 	struct dentry *dir = rpc_create_cache_dir(parent, name, umode, cd);
 	if (IS_ERR(dir))
 		return PTR_ERR(dir);
-	cd->u.pipefs.dir = dir;
+	cd->pipefs = dir;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(sunrpc_cache_register_pipefs);
 
 void sunrpc_cache_unregister_pipefs(struct cache_detail *cd)
 {
-	rpc_remove_cache_dir(cd->u.pipefs.dir);
-	cd->u.pipefs.dir = NULL;
+	if (cd->pipefs) {
+		rpc_remove_cache_dir(cd->pipefs);
+		cd->pipefs = NULL;
+	}
 }
 EXPORT_SYMBOL_GPL(sunrpc_cache_unregister_pipefs);
 
-- 
cgit v1.2.3


From 5786461bd8ea81433d81297215ee814a9a33ce7a Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Tue, 7 Feb 2017 21:48:11 +0800
Subject: sunrpc: rename NFS_NGROUPS to UNX_NGROUPS for auth unix

NFS_NGROUPS has been move to sunrpc, rename to UNX_NGROUPS.

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/auth.h |  1 +
 net/sunrpc/auth_unix.c      | 18 ++++++++----------
 net/sunrpc/svcauth_unix.c   |  4 ++--
 3 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index b1bc62ba20a2..39c85fb1f0ed 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -32,6 +32,7 @@
  */
 #define UNX_MAXNODENAME	__NEW_UTS_LEN
 #define UNX_CALLSLACK	(21 + XDR_QUADLEN(UNX_MAXNODENAME))
+#define UNX_NGROUPS	16
 
 struct rpcsec_gss_info;
 
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 306fc0f54596..82337e1ec9cd 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -14,12 +14,10 @@
 #include <linux/sunrpc/auth.h>
 #include <linux/user_namespace.h>
 
-#define NFS_NGROUPS	16
-
 struct unx_cred {
 	struct rpc_cred		uc_base;
 	kgid_t			uc_gid;
-	kgid_t			uc_gids[NFS_NGROUPS];
+	kgid_t			uc_gids[UNX_NGROUPS];
 };
 #define uc_uid			uc_base.cr_uid
 
@@ -82,13 +80,13 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t
 
 	if (acred->group_info != NULL)
 		groups = acred->group_info->ngroups;
-	if (groups > NFS_NGROUPS)
-		groups = NFS_NGROUPS;
+	if (groups > UNX_NGROUPS)
+		groups = UNX_NGROUPS;
 
 	cred->uc_gid = acred->gid;
 	for (i = 0; i < groups; i++)
 		cred->uc_gids[i] = acred->group_info->gid[i];
-	if (i < NFS_NGROUPS)
+	if (i < UNX_NGROUPS)
 		cred->uc_gids[i] = INVALID_GID;
 
 	return &cred->uc_base;
@@ -132,12 +130,12 @@ unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int flags)
 
 	if (acred->group_info != NULL)
 		groups = acred->group_info->ngroups;
-	if (groups > NFS_NGROUPS)
-		groups = NFS_NGROUPS;
+	if (groups > UNX_NGROUPS)
+		groups = UNX_NGROUPS;
 	for (i = 0; i < groups ; i++)
 		if (!gid_eq(cred->uc_gids[i], acred->group_info->gid[i]))
 			return 0;
-	if (groups < NFS_NGROUPS && gid_valid(cred->uc_gids[groups]))
+	if (groups < UNX_NGROUPS && gid_valid(cred->uc_gids[groups]))
 		return 0;
 	return 1;
 }
@@ -166,7 +164,7 @@ unx_marshal(struct rpc_task *task, __be32 *p)
 	*p++ = htonl((u32) from_kuid(&init_user_ns, cred->uc_uid));
 	*p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gid));
 	hold = p++;
-	for (i = 0; i < 16 && gid_valid(cred->uc_gids[i]); i++)
+	for (i = 0; i < UNX_NGROUPS && gid_valid(cred->uc_gids[i]); i++)
 		*p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gids[i]));
 	*hold = htonl(p - hold - 1);		/* gid array length */
 	*base = htonl((p - base - 1) << 2);	/* cred length */
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 64af4f034de6..f81eaa8e0888 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -403,7 +403,7 @@ svcauth_unix_info_release(struct svc_xprt *xpt)
 /****************************************************************************
  * auth.unix.gid cache
  * simple cache to map a UID to a list of GIDs
- * because AUTH_UNIX aka AUTH_SYS has a max of 16
+ * because AUTH_UNIX aka AUTH_SYS has a max of UNX_NGROUPS
  */
 #define	GID_HASHBITS	8
 #define	GID_HASHMAX	(1<<GID_HASHBITS)
@@ -810,7 +810,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
 	cred->cr_uid = make_kuid(&init_user_ns, svc_getnl(argv)); /* uid */
 	cred->cr_gid = make_kgid(&init_user_ns, svc_getnl(argv)); /* gid */
 	slen = svc_getnl(argv);			/* gids length */
-	if (slen > 16 || (len -= (slen + 2)*4) < 0)
+	if (slen > UNX_NGROUPS || (len -= (slen + 2)*4) < 0)
 		goto badcred;
 	cred->cr_group_info = groups_alloc(slen);
 	if (cred->cr_group_info == NULL)
-- 
cgit v1.2.3


From af4926e5619f8a33463f7202f27ba091893ed2ad Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Tue, 7 Feb 2017 21:48:49 +0800
Subject: sunrpc: remove dead codes of cr_magic in rpc_cred

Don't found any place using the cr_magic.

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/auth.h | 5 -----
 net/sunrpc/auth.c           | 3 ---
 net/sunrpc/auth_null.c      | 3 ---
 3 files changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 39c85fb1f0ed..8fd3504946ad 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -64,9 +64,6 @@ struct rpc_cred {
 	struct rcu_head		cr_rcu;
 	struct rpc_auth *	cr_auth;
 	const struct rpc_credops *cr_ops;
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-	unsigned long		cr_magic;	/* 0x0f4aa4f0 */
-#endif
 	unsigned long		cr_expire;	/* when to gc */
 	unsigned long		cr_flags;	/* various flags */
 	atomic_t		cr_count;	/* ref count */
@@ -80,8 +77,6 @@ struct rpc_cred {
 #define RPCAUTH_CRED_HASHED	2
 #define RPCAUTH_CRED_NEGATIVE	3
 
-#define RPCAUTH_CRED_MAGIC	0x0f4aa4f0
-
 /* rpc_auth au_flags */
 #define RPCAUTH_AUTH_NO_CRKEY_TIMEOUT	0x0001 /* underlying cred has no key timeout */
 
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 8b344d0eb6ed..a1ee933e3029 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -648,9 +648,6 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred,
 	cred->cr_auth = auth;
 	cred->cr_ops = ops;
 	cred->cr_expire = jiffies;
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-	cred->cr_magic = RPCAUTH_CRED_MAGIC;
-#endif
 	cred->cr_uid = acred->uid;
 }
 EXPORT_SYMBOL_GPL(rpcauth_init_cred);
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index 4d17376b2acb..5f3d527dff65 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -139,7 +139,4 @@ struct rpc_cred null_cred = {
 	.cr_ops		= &null_credops,
 	.cr_count	= ATOMIC_INIT(1),
 	.cr_flags	= 1UL << RPCAUTH_CRED_UPTODATE,
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-	.cr_magic	= RPCAUTH_CRED_MAGIC,
-#endif
 };
-- 
cgit v1.2.3


From ceb374eb06848212e47fb884964a3ae5e79615b5 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@ni.com>
Date: Tue, 10 Jan 2017 13:30:19 -0600
Subject: mtd: nand: Add max_bb_per_die and blocks_per_die fields to nand_chip

The fields max_bb_per_die and blocks_per_die are useful determining the
number of bad blocks a MTD needs to allocate. How they are set will
depend on if the chip is ONFI, JEDEC or a full-id entry in the nand_ids
table.

Signed-off-by: Zach Brown <zach.brown@ni.com>
Acked-by: Boris Brezillon <boris.brezillon@free-electron.com>
Acked-by: Brian Norris <computersforpeace@gmail.com>
Signed-off-by: Brian Norris <computersforpeace@gmail.com>
---
 include/linux/mtd/nand.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index c5f3a012ae62..4e1b441c3a7e 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -801,6 +801,9 @@ nand_get_sdr_timings(const struct nand_data_interface *conf)
  *			supported, 0 otherwise.
  * @jedec_params:	[INTERN] holds the JEDEC parameter page when JEDEC is
  *			supported, 0 otherwise.
+ * @max_bb_per_die:	[INTERN] the max number of bad blocks each die of a
+ *			this nand device will encounter their life times.
+ * @blocks_per_die:	[INTERN] The number of PEBs in a die
  * @read_retries:	[INTERN] the number of read retry modes supported
  * @onfi_set_features:	[REPLACEABLE] set the features for ONFI nand
  * @onfi_get_features:	[REPLACEABLE] get the features for ONFI nand
@@ -883,6 +886,8 @@ struct nand_chip {
 		struct nand_onfi_params	onfi_params;
 		struct nand_jedec_params jedec_params;
 	};
+	u16 max_bb_per_die;
+	u32 blocks_per_die;
 
 	struct nand_data_interface *data_interface;
 
-- 
cgit v1.2.3


From 16d0bf1882109da84b4b90d7ba4fc7ba18db0d67 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Wed, 8 Feb 2017 19:37:55 +0100
Subject: PM / Domains: Provide dummy governors if CONFIG_PM_GENERIC_DOMAINS=n

This allows to compile-test drivers that refer to governors (always by
reference) when CONFIG_PM_GENERIC_DOMAINS=n.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm_domain.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 81ece61075df..5339ed5bd6f9 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -182,6 +182,9 @@ static inline int pm_genpd_remove(struct generic_pm_domain *genpd)
 {
 	return -ENOTSUPP;
 }
+
+#define simple_qos_governor		(*(struct dev_power_governor *)(NULL))
+#define pm_domain_always_on_gov		(*(struct dev_power_governor *)(NULL))
 #endif
 
 static inline int pm_genpd_add_device(struct generic_pm_domain *genpd,
-- 
cgit v1.2.3


From 72f2ff0deb870145a5a2d24cd75b4f9936159a62 Mon Sep 17 00:00:00 2001
From: Dongdong Liu <liudongdong3@huawei.com>
Date: Fri, 3 Feb 2017 15:02:07 -0600
Subject: PCI: Disable MSI for HiSilicon Hip06/Hip07 Root Ports

The PCIe Root Port in Hip06/Hip07 SoCs advertises an MSI capability, but it
cannot generate MSIs.  It can transfer MSI/MSI-X from downstream devices,
but does not support MSI/MSI-X itself.

Add a quirk to prevent use of MSI/MSI-X by the Root Port.

[bhelgaas: changelog, sort vendor ID #define, drop device ID #define]
Signed-off-by: Dongdong Liu <liudongdong3@huawei.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Gabriele Paoloni <gabriele.paoloni@huawei.com>
Reviewed-by: Zhou Wang <wangzhou1@hisilicon.com>
---
 drivers/pci/quirks.c    | 1 +
 include/linux/pci_ids.h | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 1800befa8b8b..c49ac99bda4b 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -1634,6 +1634,7 @@ static void quirk_pcie_mch(struct pci_dev *pdev)
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_E7520_MCH,	quirk_pcie_mch);
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_E7320_MCH,	quirk_pcie_mch);
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_E7525_MCH,	quirk_pcie_mch);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_HUAWEI,	0x1610,	quirk_pcie_mch);
 
 
 /*
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 73dda0edcb97..a4f77feecbb0 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2516,6 +2516,8 @@
 #define PCI_DEVICE_ID_KORENIX_JETCARDF2	0x1700
 #define PCI_DEVICE_ID_KORENIX_JETCARDF3	0x17ff
 
+#define PCI_VENDOR_ID_HUAWEI         	0x19e5
+
 #define PCI_VENDOR_ID_NETRONOME		0x19ee
 #define PCI_DEVICE_ID_NETRONOME_NFP3200	0x3200
 #define PCI_DEVICE_ID_NETRONOME_NFP3240	0x3240
-- 
cgit v1.2.3


From e553f539f2af39db5e3b2c273cc1a22d34be49ad Mon Sep 17 00:00:00 2001
From: Frank Rowand <frank.rowand@am.sony.com>
Date: Mon, 17 Oct 2016 12:17:43 -0700
Subject: of: make of_device_make_bus_id() static

of_device_make_bus_id() was changed to non-static by commit c66012253800
("of/device: Make of_device_make_bus_id() usable by other code") more than
6 years ago, but there are no users of it outside of platform.c.  Make the
function static again.

Signed-off-by: Frank Rowand <frank.rowand@am.sony.com>
Signed-off-by: Rob Herring <robh@kernel.org>
---
 drivers/of/platform.c     | 2 +-
 include/linux/of_device.h | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/platform.c b/drivers/of/platform.c
index b8064bc2b6eb..5dfcc967dd05 100644
--- a/drivers/of/platform.c
+++ b/drivers/of/platform.c
@@ -76,7 +76,7 @@ EXPORT_SYMBOL(of_find_device_by_node);
  * derive a unique name. If it cannot, then it will prepend names from
  * parent nodes until a unique name can be derived.
  */
-void of_device_make_bus_id(struct device *dev)
+static void of_device_make_bus_id(struct device *dev)
 {
 	struct device_node *node = dev->of_node;
 	const __be32 *reg;
diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index cc7dd687a89d..5c2aeed17dd4 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -13,7 +13,6 @@ struct device;
 #ifdef CONFIG_OF
 extern const struct of_device_id *of_match_device(
 	const struct of_device_id *matches, const struct device *dev);
-extern void of_device_make_bus_id(struct device *dev);
 
 /**
  * of_driver_match_device - Tell if a driver's of_match_table matches a device.
-- 
cgit v1.2.3


From d23bb113952db88b03a71c9533e5a40e444e18d3 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Wed, 8 Feb 2017 11:17:52 -0500
Subject: SUNRPC: Remove unused function rpc_get_timeout()

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/clnt.h |  1 -
 net/sunrpc/clnt.c           | 15 ---------------
 2 files changed, 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 333ad11b3dd9..33f216edb434 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -182,7 +182,6 @@ int		rpc_protocol(struct rpc_clnt *);
 struct net *	rpc_net_ns(struct rpc_clnt *);
 size_t		rpc_max_payload(struct rpc_clnt *);
 size_t		rpc_max_bc_payload(struct rpc_clnt *);
-unsigned long	rpc_get_timeout(struct rpc_clnt *clnt);
 void		rpc_force_rebind(struct rpc_clnt *);
 size_t		rpc_peeraddr(struct rpc_clnt *, struct sockaddr *, size_t);
 const char	*rpc_peeraddr2str(struct rpc_clnt *, enum rpc_display_format_t);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 1dc9f3bac099..2838a1fab460 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1452,21 +1452,6 @@ size_t rpc_max_bc_payload(struct rpc_clnt *clnt)
 }
 EXPORT_SYMBOL_GPL(rpc_max_bc_payload);
 
-/**
- * rpc_get_timeout - Get timeout for transport in units of HZ
- * @clnt: RPC client to query
- */
-unsigned long rpc_get_timeout(struct rpc_clnt *clnt)
-{
-	unsigned long ret;
-
-	rcu_read_lock();
-	ret = rcu_dereference(clnt->cl_xprt)->timeout->to_initval;
-	rcu_read_unlock();
-	return ret;
-}
-EXPORT_SYMBOL_GPL(rpc_get_timeout);
-
 /**
  * rpc_force_rebind - force transport to check that remote port is unchanged
  * @clnt: client to rebind
-- 
cgit v1.2.3


From 7196dbb02ea05835b9ee56910ee82cb55422c7f1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Wed, 8 Feb 2017 11:17:54 -0500
Subject: SUNRPC: Allow changing of the TCP timeout parameters on the fly

When the NFSv4 server tells us the lease period, we usually want
to adjust down the timeout parameters on the TCP connection to
ensure that we don't miss lease renewals due to a faulty connection.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/xprt.h     |  4 ++++
 include/linux/sunrpc/xprtsock.h |  3 +++
 net/sunrpc/clnt.c               | 30 ++++++++++++++++++------
 net/sunrpc/xprtsock.c           | 51 +++++++++++++++++++++++++++++++++++++----
 4 files changed, 77 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index a5da60b24d83..eab1c749e192 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -137,6 +137,9 @@ struct rpc_xprt_ops {
 	void		(*release_request)(struct rpc_task *task);
 	void		(*close)(struct rpc_xprt *xprt);
 	void		(*destroy)(struct rpc_xprt *xprt);
+	void		(*set_connect_timeout)(struct rpc_xprt *xprt,
+					unsigned long connect_timeout,
+					unsigned long reconnect_timeout);
 	void		(*print_stats)(struct rpc_xprt *xprt, struct seq_file *seq);
 	int		(*enable_swap)(struct rpc_xprt *xprt);
 	void		(*disable_swap)(struct rpc_xprt *xprt);
@@ -221,6 +224,7 @@ struct rpc_xprt {
 	struct timer_list	timer;
 	unsigned long		last_used,
 				idle_timeout,
+				connect_timeout,
 				max_reconnect_timeout;
 
 	/*
diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h
index bef3fb0abb8f..c9959d7e3579 100644
--- a/include/linux/sunrpc/xprtsock.h
+++ b/include/linux/sunrpc/xprtsock.h
@@ -55,6 +55,8 @@ struct sock_xprt {
 	size_t			rcvsize,
 				sndsize;
 
+	struct rpc_timeout	tcp_timeout;
+
 	/*
 	 * Saved socket callback addresses
 	 */
@@ -81,6 +83,7 @@ struct sock_xprt {
 
 #define XPRT_SOCK_CONNECTING	1U
 #define XPRT_SOCK_DATA_READY	(2)
+#define XPRT_SOCK_UPD_TIMEOUT	(3)
 
 #endif /* __KERNEL__ */
 
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 2838a1fab460..b5bc0c589f6a 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -2684,6 +2684,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
 {
 	struct rpc_xprt_switch *xps;
 	struct rpc_xprt *xprt;
+	unsigned long connect_timeout;
 	unsigned long reconnect_timeout;
 	unsigned char resvport;
 	int ret = 0;
@@ -2696,6 +2697,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
 		return -EAGAIN;
 	}
 	resvport = xprt->resvport;
+	connect_timeout = xprt->connect_timeout;
 	reconnect_timeout = xprt->max_reconnect_timeout;
 	rcu_read_unlock();
 
@@ -2705,7 +2707,10 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
 		goto out_put_switch;
 	}
 	xprt->resvport = resvport;
-	xprt->max_reconnect_timeout = reconnect_timeout;
+	if (xprt->ops->set_connect_timeout != NULL)
+		xprt->ops->set_connect_timeout(xprt,
+				connect_timeout,
+				reconnect_timeout);
 
 	rpc_xprt_switch_set_roundrobin(xps);
 	if (setup) {
@@ -2722,24 +2727,35 @@ out_put_switch:
 }
 EXPORT_SYMBOL_GPL(rpc_clnt_add_xprt);
 
+struct connect_timeout_data {
+	unsigned long connect_timeout;
+	unsigned long reconnect_timeout;
+};
+
 static int
-rpc_xprt_cap_max_reconnect_timeout(struct rpc_clnt *clnt,
+rpc_xprt_set_connect_timeout(struct rpc_clnt *clnt,
 		struct rpc_xprt *xprt,
 		void *data)
 {
-	unsigned long timeout = *((unsigned long *)data);
+	struct connect_timeout_data *timeo = data;
 
-	if (timeout < xprt->max_reconnect_timeout)
-		xprt->max_reconnect_timeout = timeout;
+	if (xprt->ops->set_connect_timeout)
+		xprt->ops->set_connect_timeout(xprt,
+				timeo->connect_timeout,
+				timeo->reconnect_timeout);
 	return 0;
 }
 
 void
 rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo)
 {
+	struct connect_timeout_data timeout = {
+		.connect_timeout = timeo,
+		.reconnect_timeout = timeo,
+	};
 	rpc_clnt_iterate_for_each_xprt(clnt,
-			rpc_xprt_cap_max_reconnect_timeout,
-			&timeo);
+			rpc_xprt_set_connect_timeout,
+			&timeout);
 }
 EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout);
 
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index c8ac649a51cb..810e9b59be16 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -52,6 +52,8 @@
 #include "sunrpc.h"
 
 static void xs_close(struct rpc_xprt *xprt);
+static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
+		struct socket *sock);
 
 /*
  * xprtsock tunables
@@ -666,6 +668,9 @@ static int xs_tcp_send_request(struct rpc_task *task)
 	if (task->tk_flags & RPC_TASK_SENT)
 		zerocopy = false;
 
+	if (test_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state))
+		xs_tcp_set_socket_timeouts(xprt, transport->sock);
+
 	/* Continue transmitting the packet/record. We must be careful
 	 * to cope with writespace callbacks arriving _after_ we have
 	 * called sendmsg(). */
@@ -2238,11 +2243,20 @@ static void xs_tcp_shutdown(struct rpc_xprt *xprt)
 static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
 		struct socket *sock)
 {
-	unsigned int keepidle = DIV_ROUND_UP(xprt->timeout->to_initval, HZ);
-	unsigned int keepcnt = xprt->timeout->to_retries + 1;
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+	unsigned int keepidle;
+	unsigned int keepcnt;
 	unsigned int opt_on = 1;
 	unsigned int timeo;
 
+	spin_lock_bh(&xprt->transport_lock);
+	keepidle = DIV_ROUND_UP(xprt->timeout->to_initval, HZ);
+	keepcnt = xprt->timeout->to_retries + 1;
+	timeo = jiffies_to_msecs(xprt->timeout->to_initval) *
+		(xprt->timeout->to_retries + 1);
+	clear_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state);
+	spin_unlock_bh(&xprt->transport_lock);
+
 	/* TCP Keepalive options */
 	kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
 			(char *)&opt_on, sizeof(opt_on));
@@ -2254,12 +2268,38 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
 			(char *)&keepcnt, sizeof(keepcnt));
 
 	/* TCP user timeout (see RFC5482) */
-	timeo = jiffies_to_msecs(xprt->timeout->to_initval) *
-		(xprt->timeout->to_retries + 1);
 	kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
 			(char *)&timeo, sizeof(timeo));
 }
 
+static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt,
+		unsigned long connect_timeout,
+		unsigned long reconnect_timeout)
+{
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+	struct rpc_timeout to;
+	unsigned long initval;
+
+	spin_lock_bh(&xprt->transport_lock);
+	if (reconnect_timeout < xprt->max_reconnect_timeout)
+		xprt->max_reconnect_timeout = reconnect_timeout;
+	if (connect_timeout < xprt->connect_timeout) {
+		memcpy(&to, xprt->timeout, sizeof(to));
+		initval = DIV_ROUND_UP(connect_timeout, to.to_retries + 1);
+		/* Arbitrary lower limit */
+		if (initval <  XS_TCP_INIT_REEST_TO << 1)
+			initval = XS_TCP_INIT_REEST_TO << 1;
+		to.to_initval = initval;
+		to.to_maxval = initval;
+		memcpy(&transport->tcp_timeout, &to,
+				sizeof(transport->tcp_timeout));
+		xprt->timeout = &transport->tcp_timeout;
+		xprt->connect_timeout = connect_timeout;
+	}
+	set_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state);
+	spin_unlock_bh(&xprt->transport_lock);
+}
+
 static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 {
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
@@ -2728,6 +2768,7 @@ static struct rpc_xprt_ops xs_tcp_ops = {
 	.set_retrans_timeout	= xprt_set_retrans_timeout_def,
 	.close			= xs_tcp_shutdown,
 	.destroy		= xs_destroy,
+	.set_connect_timeout	= xs_tcp_set_connect_timeout,
 	.print_stats		= xs_tcp_print_stats,
 	.enable_swap		= xs_enable_swap,
 	.disable_swap		= xs_disable_swap,
@@ -3014,6 +3055,8 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
 	xprt->timeout = &xs_tcp_default_timeout;
 
 	xprt->max_reconnect_timeout = xprt->timeout->to_maxval;
+	xprt->connect_timeout = xprt->timeout->to_initval *
+		(xprt->timeout->to_retries + 1);
 
 	INIT_WORK(&transport->recv_worker, xs_tcp_data_receive_workfn);
 	INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket);
-- 
cgit v1.2.3


From 26ae102f2cfd0215daa57dc790aa3bfe534403a9 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Wed, 8 Feb 2017 11:17:55 -0500
Subject: NFSv4: Set the connection timeout to match the lease period

Set the timeout for TCP connections to be 1 lease period to ensure
that we don't lose our lease due to a faulty TCP connection.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/nfs4renewd.c         |  2 +-
 include/linux/sunrpc/clnt.h |  5 +++--
 net/sunrpc/clnt.c           | 10 ++++++----
 3 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 82e77198d17e..1f8c2ae43a8d 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -153,7 +153,7 @@ void nfs4_set_lease_period(struct nfs_client *clp,
 	spin_unlock(&clp->cl_lock);
 
 	/* Cap maximum reconnect timeout at 1/2 lease period */
-	rpc_cap_max_reconnect_timeout(clp->cl_rpcclient, lease >> 1);
+	rpc_set_connect_timeout(clp->cl_rpcclient, lease, lease >> 1);
 }
 
 /*
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 33f216edb434..6095ecba0dde 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -201,8 +201,9 @@ int		rpc_clnt_add_xprt(struct rpc_clnt *, struct xprt_create *,
 				struct rpc_xprt *,
 				void *),
 			void *data);
-void		rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt,
-			unsigned long timeo);
+void		rpc_set_connect_timeout(struct rpc_clnt *clnt,
+			unsigned long connect_timeout,
+			unsigned long reconnect_timeout);
 
 int		rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *,
 			struct rpc_xprt_switch *,
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index b5bc0c589f6a..52da3ce54bb5 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -2747,17 +2747,19 @@ rpc_xprt_set_connect_timeout(struct rpc_clnt *clnt,
 }
 
 void
-rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo)
+rpc_set_connect_timeout(struct rpc_clnt *clnt,
+		unsigned long connect_timeout,
+		unsigned long reconnect_timeout)
 {
 	struct connect_timeout_data timeout = {
-		.connect_timeout = timeo,
-		.reconnect_timeout = timeo,
+		.connect_timeout = connect_timeout,
+		.reconnect_timeout = reconnect_timeout,
 	};
 	rpc_clnt_iterate_for_each_xprt(clnt,
 			rpc_xprt_set_connect_timeout,
 			&timeout);
 }
-EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout);
+EXPORT_SYMBOL_GPL(rpc_set_connect_timeout);
 
 void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt)
 {
-- 
cgit v1.2.3


From 90858794c96028ec1294fda12488a19f3a2b1d4a Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Sun, 24 Jul 2016 20:51:44 -0400
Subject: module.h: remove extable.h include now users have migrated

With hopefully most/all users of module.h that were looking for
exception table functions moved over to the new extable.h header,
we can remove the back-compat include that let us transition
without introducing build regressions.

Cc: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Jessica Yu <jeyu@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
---
 include/linux/module.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index 7c84273d60b9..2e6df4c41c86 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -18,7 +18,6 @@
 #include <linux/moduleparam.h>
 #include <linux/jump_label.h>
 #include <linux/export.h>
-#include <linux/extable.h>	/* only as arch move module.h -> extable.h */
 #include <linux/rbtree_latch.h>
 
 #include <linux/percpu.h>
-- 
cgit v1.2.3


From 0764c604c8128f17fd740ff8b1701d0a1301eb7e Mon Sep 17 00:00:00 2001
From: Dave Gerlach <d-gerlach@ti.com>
Date: Fri, 3 Feb 2017 11:29:26 -0600
Subject: PM / OPP: Expose _of_get_opp_desc_node as dev_pm_opp API

Rename _of_get_opp_desc_node to dev_pm_opp_of_get_opp_desc_node and add it
to include/linux/pm_opp.h to allow other drivers, such as platform OPP
and cpufreq drivers, to make use of it.

Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Dave Gerlach <d-gerlach@ti.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/opp/of.c | 9 +++++----
 include/linux/pm_opp.h      | 6 ++++++
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/opp/of.c b/drivers/base/power/opp/of.c
index 083b79fa0c62..779428676f63 100644
--- a/drivers/base/power/opp/of.c
+++ b/drivers/base/power/opp/of.c
@@ -243,7 +243,7 @@ void dev_pm_opp_of_remove_table(struct device *dev)
 EXPORT_SYMBOL_GPL(dev_pm_opp_of_remove_table);
 
 /* Returns opp descriptor node for a device, caller must do of_node_put() */
-static struct device_node *_of_get_opp_desc_node(struct device *dev)
+struct device_node *dev_pm_opp_of_get_opp_desc_node(struct device *dev)
 {
 	/*
 	 * There should be only ONE phandle present in "operating-points-v2"
@@ -252,6 +252,7 @@ static struct device_node *_of_get_opp_desc_node(struct device *dev)
 
 	return of_parse_phandle(dev->of_node, "operating-points-v2", 0);
 }
+EXPORT_SYMBOL_GPL(dev_pm_opp_of_get_opp_desc_node);
 
 /**
  * _opp_add_static_v2() - Allocate static OPPs (As per 'v2' DT bindings)
@@ -478,7 +479,7 @@ int dev_pm_opp_of_add_table(struct device *dev)
 	 * OPPs have two version of bindings now. The older one is deprecated,
 	 * try for the new binding first.
 	 */
-	opp_np = _of_get_opp_desc_node(dev);
+	opp_np = dev_pm_opp_of_get_opp_desc_node(dev);
 	if (!opp_np) {
 		/*
 		 * Try old-deprecated bindings for backward compatibility with
@@ -570,7 +571,7 @@ int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev,
 	int cpu, ret = 0;
 
 	/* Get OPP descriptor node */
-	np = _of_get_opp_desc_node(cpu_dev);
+	np = dev_pm_opp_of_get_opp_desc_node(cpu_dev);
 	if (!np) {
 		dev_dbg(cpu_dev, "%s: Couldn't find opp node.\n", __func__);
 		return -ENOENT;
@@ -595,7 +596,7 @@ int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev,
 		}
 
 		/* Get OPP descriptor node */
-		tmp_np = _of_get_opp_desc_node(tcpu_dev);
+		tmp_np = dev_pm_opp_of_get_opp_desc_node(tcpu_dev);
 		if (!tmp_np) {
 			dev_err(tcpu_dev, "%s: Couldn't find opp node.\n",
 				__func__);
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 731d548657aa..a6685b3dde26 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -288,6 +288,7 @@ void dev_pm_opp_of_remove_table(struct device *dev);
 int dev_pm_opp_of_cpumask_add_table(const struct cpumask *cpumask);
 void dev_pm_opp_of_cpumask_remove_table(const struct cpumask *cpumask);
 int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask);
+struct device_node *dev_pm_opp_of_get_opp_desc_node(struct device *dev);
 #else
 static inline int dev_pm_opp_of_add_table(struct device *dev)
 {
@@ -311,6 +312,11 @@ static inline int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev, struct
 {
 	return -ENOTSUPP;
 }
+
+static inline struct device_node *dev_pm_opp_of_get_opp_desc_node(struct device *dev)
+{
+	return NULL;
+}
 #endif
 
 #endif		/* __LINUX_OPP_H__ */
-- 
cgit v1.2.3


From 9faf1c0fd5a943e498dbc0c86ff42e965b347d08 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 9 Feb 2017 01:18:15 +0800
Subject: sctp: drop unnecessary __packed from some stream reconf structures

commit 85c727b59483 ("sctp: drop __packed from almost all SCTP structures")
has removed __packed from almost all SCTP structures. But there still are
three structures where it should be dropped.

This patch is to remove it from some stream reconf structures.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index 2408c6877ca0..d74fca3f3141 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -721,7 +721,7 @@ struct sctp_infox {
 struct sctp_reconf_chunk {
 	sctp_chunkhdr_t chunk_hdr;
 	__u8 params[0];
-} __packed;
+};
 
 struct sctp_strreset_outreq {
 	sctp_paramhdr_t param_hdr;
@@ -729,12 +729,12 @@ struct sctp_strreset_outreq {
 	__u32 response_seq;
 	__u32 send_reset_at_tsn;
 	__u16 list_of_streams[0];
-} __packed;
+};
 
 struct sctp_strreset_inreq {
 	sctp_paramhdr_t param_hdr;
 	__u32 request_seq;
 	__u16 list_of_streams[0];
-} __packed;
+};
 
 #endif /* __LINUX_SCTP_H__ */
-- 
cgit v1.2.3


From c56480a1e90261842f54f3a5a9ebc12d827f0c3e Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 9 Feb 2017 01:18:17 +0800
Subject: sctp: add support for generating stream reconf ssn/tsn reset request
 chunk

This patch is to define SSN/TSN Reset Request Parameter described
in rfc6525 section 4.3.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h     |  5 +++++
 include/net/sctp/sm.h    |  2 ++
 net/sctp/sm_make_chunk.c | 29 +++++++++++++++++++++++++++++
 3 files changed, 36 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index d74fca3f3141..71c0d41d9a59 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -737,4 +737,9 @@ struct sctp_strreset_inreq {
 	__u16 list_of_streams[0];
 };
 
+struct sctp_strreset_tsnreq {
+	sctp_paramhdr_t param_hdr;
+	__u32 request_seq;
+};
+
 #endif /* __LINUX_SCTP_H__ */
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 430ed139fbbb..ac37c1782e23 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -265,6 +265,8 @@ struct sctp_chunk *sctp_make_strreset_req(
 				const struct sctp_association *asoc,
 				__u16 stream_num, __u16 *stream_list,
 				bool out, bool in);
+struct sctp_chunk *sctp_make_strreset_tsnreq(
+				const struct sctp_association *asoc);
 void sctp_chunk_assign_tsn(struct sctp_chunk *);
 void sctp_chunk_assign_ssn(struct sctp_chunk *);
 
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index c7d3249f88ec..749842aead33 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -3658,3 +3658,32 @@ struct sctp_chunk *sctp_make_strreset_req(
 
 	return retval;
 }
+
+/* RE-CONFIG 4.3 (SSN/TSN RESET ALL)
+ *   0                   1                   2                   3
+ *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |     Parameter Type = 15       |      Parameter Length = 8     |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |         Re-configuration Request Sequence Number              |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+struct sctp_chunk *sctp_make_strreset_tsnreq(
+				const struct sctp_association *asoc)
+{
+	struct sctp_strreset_tsnreq tsnreq;
+	__u16 length = sizeof(tsnreq);
+	struct sctp_chunk *retval;
+
+	retval = sctp_make_reconf(asoc, length);
+	if (!retval)
+		return NULL;
+
+	tsnreq.param_hdr.type = SCTP_PARAM_RESET_TSN_REQUEST;
+	tsnreq.param_hdr.length = htons(length);
+	tsnreq.request_seq = htonl(asoc->strreset_outseq);
+
+	sctp_addto_chunk(retval, sizeof(tsnreq), &tsnreq);
+
+	return retval;
+}
-- 
cgit v1.2.3


From 78098117f8bfad4f2104c3f7b6b69071af95a246 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 9 Feb 2017 01:18:19 +0800
Subject: sctp: add support for generating stream reconf add incoming/outgoing
 streams request chunk

This patch is to define Add Incoming/Outgoing Streams Request
Parameter described in rfc6525 section 4.5 and 4.6. They can
be in one same chunk trunk as rfc6525 section 3.1-7 describes,
so make them in one function.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h     |  7 +++++++
 include/net/sctp/sm.h    |  3 +++
 net/sctp/sm_make_chunk.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index 71c0d41d9a59..b055788de0cf 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -742,4 +742,11 @@ struct sctp_strreset_tsnreq {
 	__u32 request_seq;
 };
 
+struct sctp_strreset_addstrm {
+	sctp_paramhdr_t param_hdr;
+	__u32 request_seq;
+	__u16 number_of_streams;
+	__u16 reserved;
+};
+
 #endif /* __LINUX_SCTP_H__ */
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index ac37c1782e23..3675fde3a26e 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -267,6 +267,9 @@ struct sctp_chunk *sctp_make_strreset_req(
 				bool out, bool in);
 struct sctp_chunk *sctp_make_strreset_tsnreq(
 				const struct sctp_association *asoc);
+struct sctp_chunk *sctp_make_strreset_addstrm(
+				const struct sctp_association *asoc,
+				__u16 out, __u16 in);
 void sctp_chunk_assign_tsn(struct sctp_chunk *);
 void sctp_chunk_assign_ssn(struct sctp_chunk *);
 
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 749842aead33..7f8dbf2c6cee 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -3687,3 +3687,49 @@ struct sctp_chunk *sctp_make_strreset_tsnreq(
 
 	return retval;
 }
+
+/* RE-CONFIG 4.5/4.6 (ADD STREAM)
+ *   0                   1                   2                   3
+ *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |     Parameter Type = 17       |      Parameter Length = 12    |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |          Re-configuration Request Sequence Number             |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |      Number of new streams    |         Reserved              |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+struct sctp_chunk *sctp_make_strreset_addstrm(
+				const struct sctp_association *asoc,
+				__u16 out, __u16 in)
+{
+	struct sctp_strreset_addstrm addstrm;
+	__u16 size = sizeof(addstrm);
+	struct sctp_chunk *retval;
+
+	retval = sctp_make_reconf(asoc, (!!out + !!in) * size);
+	if (!retval)
+		return NULL;
+
+	if (out) {
+		addstrm.param_hdr.type = SCTP_PARAM_RESET_ADD_OUT_STREAMS;
+		addstrm.param_hdr.length = htons(size);
+		addstrm.number_of_streams = htons(out);
+		addstrm.request_seq = htonl(asoc->strreset_outseq);
+		addstrm.reserved = 0;
+
+		sctp_addto_chunk(retval, size, &addstrm);
+	}
+
+	if (in) {
+		addstrm.param_hdr.type = SCTP_PARAM_RESET_ADD_IN_STREAMS;
+		addstrm.param_hdr.length = htons(size);
+		addstrm.number_of_streams = htons(in);
+		addstrm.request_seq = htonl(asoc->strreset_outseq + !!out);
+		addstrm.reserved = 0;
+
+		sctp_addto_chunk(retval, size, &addstrm);
+	}
+
+	return retval;
+}
-- 
cgit v1.2.3


From 2fd260f03b6a365bad48522f3948463928f91c2c Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Tue, 24 Jan 2017 19:35:56 +0200
Subject: PCI/AER: Remove unused .link_reset() callback

No hardware seems to actually call .link_reset(), and no driver implements
it as more than a nop stub.

Drop mentions of the callback from everywhere.  It's dropped from the
documentation as well, but the doc really needs to be updated to reflect
reality better (e.g., on PCIe, slot reset is the link reset).  This will be
done in a later patch.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 Documentation/PCI/pci-error-recovery.txt | 24 +++---------------------
 drivers/infiniband/hw/hfi1/pcie.c        | 10 ----------
 drivers/infiniband/hw/qib/qib_pcie.c     |  8 --------
 drivers/media/pci/ngene/ngene-cards.c    |  7 -------
 drivers/misc/genwqe/card_base.c          |  1 -
 include/linux/pci.h                      |  3 ---
 6 files changed, 3 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/PCI/pci-error-recovery.txt b/Documentation/PCI/pci-error-recovery.txt
index ac26869c7db4..da3b2176d5da 100644
--- a/Documentation/PCI/pci-error-recovery.txt
+++ b/Documentation/PCI/pci-error-recovery.txt
@@ -78,7 +78,6 @@ struct pci_error_handlers
 {
 	int (*error_detected)(struct pci_dev *dev, enum pci_channel_state);
 	int (*mmio_enabled)(struct pci_dev *dev);
-	int (*link_reset)(struct pci_dev *dev);
 	int (*slot_reset)(struct pci_dev *dev);
 	void (*resume)(struct pci_dev *dev);
 };
@@ -104,8 +103,7 @@ if it implements any, it must implement error_detected(). If a callback
 is not implemented, the corresponding feature is considered unsupported.
 For example, if mmio_enabled() and resume() aren't there, then it
 is assumed that the driver is not doing any direct recovery and requires
-a slot reset. If link_reset() is not implemented, the card is assumed to
-not care about link resets. Typically a driver will want to know about
+a slot reset.  Typically a driver will want to know about
 a slot_reset().
 
 The actual steps taken by a platform to recover from a PCI error
@@ -232,25 +230,9 @@ proceeds to STEP 4 (Slot Reset)
 
 STEP 3: Link Reset
 ------------------
-The platform resets the link, and then calls the link_reset() callback
-on all affected device drivers.  This is a PCI-Express specific state
+The platform resets the link.  This is a PCI-Express specific step
 and is done whenever a non-fatal error has been detected that can be
-"solved" by resetting the link. This call informs the driver of the
-reset and the driver should check to see if the device appears to be
-in working condition.
-
-The driver is not supposed to restart normal driver I/O operations
-at this point.  It should limit itself to "probing" the device to
-check its recoverability status. If all is right, then the platform
-will call resume() once all drivers have ack'd link_reset().
-
-	Result codes:
-		(identical to STEP 3 (MMIO Enabled)
-
-The platform then proceeds to either STEP 4 (Slot Reset) or STEP 5
-(Resume Operations).
-
->>> The current powerpc implementation does not implement this callback.
+"solved" by resetting the link.
 
 STEP 4: Slot Reset
 ------------------
diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c
index 4ac8f330c5cb..ebd941fc8a92 100644
--- a/drivers/infiniband/hw/hfi1/pcie.c
+++ b/drivers/infiniband/hw/hfi1/pcie.c
@@ -598,15 +598,6 @@ pci_slot_reset(struct pci_dev *pdev)
 	return PCI_ERS_RESULT_CAN_RECOVER;
 }
 
-static pci_ers_result_t
-pci_link_reset(struct pci_dev *pdev)
-{
-	struct hfi1_devdata *dd = pci_get_drvdata(pdev);
-
-	dd_dev_info(dd, "HFI1 link_reset function called, ignored\n");
-	return PCI_ERS_RESULT_CAN_RECOVER;
-}
-
 static void
 pci_resume(struct pci_dev *pdev)
 {
@@ -625,7 +616,6 @@ pci_resume(struct pci_dev *pdev)
 const struct pci_error_handlers hfi1_pci_err_handler = {
 	.error_detected = pci_error_detected,
 	.mmio_enabled = pci_mmio_enabled,
-	.link_reset = pci_link_reset,
 	.slot_reset = pci_slot_reset,
 	.resume = pci_resume,
 };
diff --git a/drivers/infiniband/hw/qib/qib_pcie.c b/drivers/infiniband/hw/qib/qib_pcie.c
index 6abe1c621aa4..c379b8342a09 100644
--- a/drivers/infiniband/hw/qib/qib_pcie.c
+++ b/drivers/infiniband/hw/qib/qib_pcie.c
@@ -682,13 +682,6 @@ qib_pci_slot_reset(struct pci_dev *pdev)
 	return PCI_ERS_RESULT_CAN_RECOVER;
 }
 
-static pci_ers_result_t
-qib_pci_link_reset(struct pci_dev *pdev)
-{
-	qib_devinfo(pdev, "QIB link_reset function called, ignored\n");
-	return PCI_ERS_RESULT_CAN_RECOVER;
-}
-
 static void
 qib_pci_resume(struct pci_dev *pdev)
 {
@@ -707,7 +700,6 @@ qib_pci_resume(struct pci_dev *pdev)
 const struct pci_error_handlers qib_pci_err_handler = {
 	.error_detected = qib_pci_error_detected,
 	.mmio_enabled = qib_pci_mmio_enabled,
-	.link_reset = qib_pci_link_reset,
 	.slot_reset = qib_pci_slot_reset,
 	.resume = qib_pci_resume,
 };
diff --git a/drivers/media/pci/ngene/ngene-cards.c b/drivers/media/pci/ngene/ngene-cards.c
index 423e8c889310..8438c1c8acde 100644
--- a/drivers/media/pci/ngene/ngene-cards.c
+++ b/drivers/media/pci/ngene/ngene-cards.c
@@ -781,12 +781,6 @@ static pci_ers_result_t ngene_error_detected(struct pci_dev *dev,
 	return PCI_ERS_RESULT_CAN_RECOVER;
 }
 
-static pci_ers_result_t ngene_link_reset(struct pci_dev *dev)
-{
-	printk(KERN_INFO DEVICE_NAME ": link reset\n");
-	return 0;
-}
-
 static pci_ers_result_t ngene_slot_reset(struct pci_dev *dev)
 {
 	printk(KERN_INFO DEVICE_NAME ": slot reset\n");
@@ -800,7 +794,6 @@ static void ngene_resume(struct pci_dev *dev)
 
 static const struct pci_error_handlers ngene_errors = {
 	.error_detected = ngene_error_detected,
-	.link_reset = ngene_link_reset,
 	.slot_reset = ngene_slot_reset,
 	.resume = ngene_resume,
 };
diff --git a/drivers/misc/genwqe/card_base.c b/drivers/misc/genwqe/card_base.c
index 6c1f49a85023..4fd21e86ad56 100644
--- a/drivers/misc/genwqe/card_base.c
+++ b/drivers/misc/genwqe/card_base.c
@@ -1336,7 +1336,6 @@ static int genwqe_sriov_configure(struct pci_dev *dev, int numvfs)
 static struct pci_error_handlers genwqe_err_handler = {
 	.error_detected = genwqe_err_error_detected,
 	.mmio_enabled	= genwqe_err_result_none,
-	.link_reset	= genwqe_err_result_none,
 	.slot_reset	= genwqe_err_slot_reset,
 	.resume		= genwqe_err_resume,
 };
diff --git a/include/linux/pci.h b/include/linux/pci.h
index e2d1a124216a..2c0158b4dbed 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -678,9 +678,6 @@ struct pci_error_handlers {
 	/* MMIO has been re-enabled, but not DMA */
 	pci_ers_result_t (*mmio_enabled)(struct pci_dev *dev);
 
-	/* PCI Express link has been reset */
-	pci_ers_result_t (*link_reset)(struct pci_dev *dev);
-
 	/* PCI slot has been reset */
 	pci_ers_result_t (*slot_reset)(struct pci_dev *dev);
 
-- 
cgit v1.2.3


From 42e9401bd1467d22c4dc4d2c637347b874e6a80b Mon Sep 17 00:00:00 2001
From: Sascha Hauer <s.hauer@pengutronix.de>
Date: Thu, 9 Feb 2017 11:50:24 +0100
Subject: mtd: Add partition device node to mtd partition devices
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The user visible change here is that mtd partitions get an of_node link
in sysfs.

Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Brian Norris <computersforpeace@gmail.com>
---
 drivers/mtd/mtdpart.c          | 1 +
 drivers/mtd/ofpart.c           | 1 +
 include/linux/mtd/partitions.h | 1 +
 3 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c
index 08925bb68fd6..ea5e5307f667 100644
--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -432,6 +432,7 @@ static struct mtd_part *allocate_partition(struct mtd_info *master,
 	slave->mtd.dev.parent = IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER) ?
 				&master->dev :
 				master->dev.parent;
+	slave->mtd.dev.of_node = part->of_node;
 
 	slave->mtd._read = part_read;
 	slave->mtd._write = part_write;
diff --git a/drivers/mtd/ofpart.c b/drivers/mtd/ofpart.c
index ede407d6e106..464470122493 100644
--- a/drivers/mtd/ofpart.c
+++ b/drivers/mtd/ofpart.c
@@ -108,6 +108,7 @@ static int parse_ofpart_partitions(struct mtd_info *master,
 
 		parts[i].offset = of_read_number(reg, a_cells);
 		parts[i].size = of_read_number(reg + a_cells, s_cells);
+		parts[i].of_node = pp;
 
 		partname = of_get_property(pp, "label", &len);
 		if (!partname)
diff --git a/include/linux/mtd/partitions.h b/include/linux/mtd/partitions.h
index 70736e1e6c8f..06df1e06b6e0 100644
--- a/include/linux/mtd/partitions.h
+++ b/include/linux/mtd/partitions.h
@@ -41,6 +41,7 @@ struct mtd_partition {
 	uint64_t size;			/* partition size */
 	uint64_t offset;		/* offset within the master MTD space */
 	uint32_t mask_flags;		/* master MTD flags to mask out for this partition */
+	struct device_node *of_node;
 };
 
 #define MTDPART_OFS_RETAIN	(-3)
-- 
cgit v1.2.3


From f405df5de3170c00e5c54f8b7cf4766044a032ba Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 14 Nov 2016 18:06:19 +0100
Subject: refcount_t: Introduce a special purpose refcount type

Provide refcount_t, an atomic_t like primitive built just for
refcounting.

It provides saturation semantics such that overflow becomes impossible
and thereby 'spurious' use-after-free is avoided.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/refcount.h | 294 +++++++++++++++++++++++++++++++++++++++++++++++
 lib/Kconfig.debug        |  13 +++
 2 files changed, 307 insertions(+)
 create mode 100644 include/linux/refcount.h

(limited to 'include/linux')

diff --git a/include/linux/refcount.h b/include/linux/refcount.h
new file mode 100644
index 000000000000..600aadf9cca4
--- /dev/null
+++ b/include/linux/refcount.h
@@ -0,0 +1,294 @@
+#ifndef _LINUX_REFCOUNT_H
+#define _LINUX_REFCOUNT_H
+
+/*
+ * Variant of atomic_t specialized for reference counts.
+ *
+ * The interface matches the atomic_t interface (to aid in porting) but only
+ * provides the few functions one should use for reference counting.
+ *
+ * It differs in that the counter saturates at UINT_MAX and will not move once
+ * there. This avoids wrapping the counter and causing 'spurious'
+ * use-after-free issues.
+ *
+ * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
+ * and provide only what is strictly required for refcounts.
+ *
+ * The increments are fully relaxed; these will not provide ordering. The
+ * rationale is that whatever is used to obtain the object we're increasing the
+ * reference count on will provide the ordering. For locked data structures,
+ * its the lock acquire, for RCU/lockless data structures its the dependent
+ * load.
+ *
+ * Do note that inc_not_zero() provides a control dependency which will order
+ * future stores against the inc, this ensures we'll never modify the object
+ * if we did not in fact acquire a reference.
+ *
+ * The decrements will provide release order, such that all the prior loads and
+ * stores will be issued before, it also provides a control dependency, which
+ * will order us against the subsequent free().
+ *
+ * The control dependency is against the load of the cmpxchg (ll/sc) that
+ * succeeded. This means the stores aren't fully ordered, but this is fine
+ * because the 1->0 transition indicates no concurrency.
+ *
+ * Note that the allocator is responsible for ordering things between free()
+ * and alloc().
+ *
+ */
+
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+
+#ifdef CONFIG_DEBUG_REFCOUNT
+#define REFCOUNT_WARN(cond, str) WARN_ON(cond)
+#define __refcount_check	__must_check
+#else
+#define REFCOUNT_WARN(cond, str) (void)(cond)
+#define __refcount_check
+#endif
+
+typedef struct refcount_struct {
+	atomic_t refs;
+} refcount_t;
+
+#define REFCOUNT_INIT(n)	{ .refs = ATOMIC_INIT(n), }
+
+static inline void refcount_set(refcount_t *r, unsigned int n)
+{
+	atomic_set(&r->refs, n);
+}
+
+static inline unsigned int refcount_read(const refcount_t *r)
+{
+	return atomic_read(&r->refs);
+}
+
+static inline __refcount_check
+bool refcount_add_not_zero(unsigned int i, refcount_t *r)
+{
+	unsigned int old, new, val = atomic_read(&r->refs);
+
+	for (;;) {
+		if (!val)
+			return false;
+
+		if (unlikely(val == UINT_MAX))
+			return true;
+
+		new = val + i;
+		if (new < val)
+			new = UINT_MAX;
+		old = atomic_cmpxchg_relaxed(&r->refs, val, new);
+		if (old == val)
+			break;
+
+		val = old;
+	}
+
+	REFCOUNT_WARN(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n");
+
+	return true;
+}
+
+static inline void refcount_add(unsigned int i, refcount_t *r)
+{
+	REFCOUNT_WARN(!refcount_add_not_zero(i, r), "refcount_t: addition on 0; use-after-free.\n");
+}
+
+/*
+ * Similar to atomic_inc_not_zero(), will saturate at UINT_MAX and WARN.
+ *
+ * Provides no memory ordering, it is assumed the caller has guaranteed the
+ * object memory to be stable (RCU, etc.). It does provide a control dependency
+ * and thereby orders future stores. See the comment on top.
+ */
+static inline __refcount_check
+bool refcount_inc_not_zero(refcount_t *r)
+{
+	unsigned int old, new, val = atomic_read(&r->refs);
+
+	for (;;) {
+		new = val + 1;
+
+		if (!val)
+			return false;
+
+		if (unlikely(!new))
+			return true;
+
+		old = atomic_cmpxchg_relaxed(&r->refs, val, new);
+		if (old == val)
+			break;
+
+		val = old;
+	}
+
+	REFCOUNT_WARN(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n");
+
+	return true;
+}
+
+/*
+ * Similar to atomic_inc(), will saturate at UINT_MAX and WARN.
+ *
+ * Provides no memory ordering, it is assumed the caller already has a
+ * reference on the object, will WARN when this is not so.
+ */
+static inline void refcount_inc(refcount_t *r)
+{
+	REFCOUNT_WARN(!refcount_inc_not_zero(r), "refcount_t: increment on 0; use-after-free.\n");
+}
+
+/*
+ * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to
+ * decrement when saturated at UINT_MAX.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides a control dependency such that free() must come after.
+ * See the comment on top.
+ */
+static inline __refcount_check
+bool refcount_sub_and_test(unsigned int i, refcount_t *r)
+{
+	unsigned int old, new, val = atomic_read(&r->refs);
+
+	for (;;) {
+		if (unlikely(val == UINT_MAX))
+			return false;
+
+		new = val - i;
+		if (new > val) {
+			REFCOUNT_WARN(new > val, "refcount_t: underflow; use-after-free.\n");
+			return false;
+		}
+
+		old = atomic_cmpxchg_release(&r->refs, val, new);
+		if (old == val)
+			break;
+
+		val = old;
+	}
+
+	return !new;
+}
+
+static inline __refcount_check
+bool refcount_dec_and_test(refcount_t *r)
+{
+	return refcount_sub_and_test(1, r);
+}
+
+/*
+ * Similar to atomic_dec(), it will WARN on underflow and fail to decrement
+ * when saturated at UINT_MAX.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before.
+ */
+static inline
+void refcount_dec(refcount_t *r)
+{
+	REFCOUNT_WARN(refcount_dec_and_test(r), "refcount_t: decrement hit 0; leaking memory.\n");
+}
+
+/*
+ * No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the
+ * success thereof.
+ *
+ * Like all decrement operations, it provides release memory order and provides
+ * a control dependency.
+ *
+ * It can be used like a try-delete operator; this explicit case is provided
+ * and not cmpxchg in generic, because that would allow implementing unsafe
+ * operations.
+ */
+static inline __refcount_check
+bool refcount_dec_if_one(refcount_t *r)
+{
+	return atomic_cmpxchg_release(&r->refs, 1, 0) == 1;
+}
+
+/*
+ * No atomic_t counterpart, it decrements unless the value is 1, in which case
+ * it will return false.
+ *
+ * Was often done like: atomic_add_unless(&var, -1, 1)
+ */
+static inline __refcount_check
+bool refcount_dec_not_one(refcount_t *r)
+{
+	unsigned int old, new, val = atomic_read(&r->refs);
+
+	for (;;) {
+		if (unlikely(val == UINT_MAX))
+			return true;
+
+		if (val == 1)
+			return false;
+
+		new = val - 1;
+		if (new > val) {
+			REFCOUNT_WARN(new > val, "refcount_t: underflow; use-after-free.\n");
+			return true;
+		}
+
+		old = atomic_cmpxchg_release(&r->refs, val, new);
+		if (old == val)
+			break;
+
+		val = old;
+	}
+
+	return true;
+}
+
+/*
+ * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail
+ * to decrement when saturated at UINT_MAX.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides a control dependency such that free() must come after.
+ * See the comment on top.
+ */
+static inline __refcount_check
+bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock)
+{
+	if (refcount_dec_not_one(r))
+		return false;
+
+	mutex_lock(lock);
+	if (!refcount_dec_and_test(r)) {
+		mutex_unlock(lock);
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to
+ * decrement when saturated at UINT_MAX.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides a control dependency such that free() must come after.
+ * See the comment on top.
+ */
+static inline __refcount_check
+bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock)
+{
+	if (refcount_dec_not_one(r))
+		return false;
+
+	spin_lock(lock);
+	if (!refcount_dec_and_test(r)) {
+		spin_unlock(lock);
+		return false;
+	}
+
+	return true;
+}
+
+#endif /* _LINUX_REFCOUNT_H */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 5b37821632a2..8f1f0e609891 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -716,6 +716,19 @@ source "lib/Kconfig.kmemcheck"
 
 source "lib/Kconfig.kasan"
 
+config DEBUG_REFCOUNT
+	bool "Verbose refcount checks"
+	help
+	  Say Y here if you want reference counters (refcount_t and kref) to
+	  generate WARNs on dubious usage. Without this refcount_t will still
+	  be a saturating counter and avoid Use-After-Free by turning it into
+	  a resource leak Denial-Of-Service.
+
+	  Use of this option will increase kernel text size but will alert the
+	  admin of potential abuse.
+
+	  If in doubt, say "N".
+
 endmenu # "Memory Debugging"
 
 config ARCH_HAS_KCOV
-- 
cgit v1.2.3


From 10383aea2f445bce9b2a2b308def08134b438c8e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 14 Nov 2016 18:06:19 +0100
Subject: kref: Implement 'struct kref' using refcount_t

Use the refcount_t 'atomic' type to implement 'struct kref', this makes kref
more robust by bringing saturation semantics.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/kref.h | 29 +++++++++++------------------
 1 file changed, 11 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kref.h b/include/linux/kref.h
index 9db9685fed84..f4156f88f557 100644
--- a/include/linux/kref.h
+++ b/include/linux/kref.h
@@ -15,17 +15,14 @@
 #ifndef _KREF_H_
 #define _KREF_H_
 
-#include <linux/bug.h>
-#include <linux/atomic.h>
-#include <linux/kernel.h>
-#include <linux/mutex.h>
 #include <linux/spinlock.h>
+#include <linux/refcount.h>
 
 struct kref {
-	atomic_t refcount;
+	refcount_t refcount;
 };
 
-#define KREF_INIT(n)	{ .refcount = ATOMIC_INIT(n), }
+#define KREF_INIT(n)	{ .refcount = REFCOUNT_INIT(n), }
 
 /**
  * kref_init - initialize object.
@@ -33,12 +30,12 @@ struct kref {
  */
 static inline void kref_init(struct kref *kref)
 {
-	atomic_set(&kref->refcount, 1);
+	refcount_set(&kref->refcount, 1);
 }
 
-static inline int kref_read(const struct kref *kref)
+static inline unsigned int kref_read(const struct kref *kref)
 {
-	return atomic_read(&kref->refcount);
+	return refcount_read(&kref->refcount);
 }
 
 /**
@@ -47,11 +44,7 @@ static inline int kref_read(const struct kref *kref)
  */
 static inline void kref_get(struct kref *kref)
 {
-	/* If refcount was 0 before incrementing then we have a race
-	 * condition when this kref is freeing by some other thread right now.
-	 * In this case one should use kref_get_unless_zero()
-	 */
-	WARN_ON_ONCE(atomic_inc_return(&kref->refcount) < 2);
+	refcount_inc(&kref->refcount);
 }
 
 /**
@@ -75,7 +68,7 @@ static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref)
 {
 	WARN_ON(release == NULL);
 
-	if (atomic_dec_and_test(&kref->refcount)) {
+	if (refcount_dec_and_test(&kref->refcount)) {
 		release(kref);
 		return 1;
 	}
@@ -88,7 +81,7 @@ static inline int kref_put_mutex(struct kref *kref,
 {
 	WARN_ON(release == NULL);
 
-	if (atomic_dec_and_mutex_lock(&kref->refcount, lock)) {
+	if (refcount_dec_and_mutex_lock(&kref->refcount, lock)) {
 		release(kref);
 		return 1;
 	}
@@ -101,7 +94,7 @@ static inline int kref_put_lock(struct kref *kref,
 {
 	WARN_ON(release == NULL);
 
-	if (atomic_dec_and_lock(&kref->refcount, lock)) {
+	if (refcount_dec_and_lock(&kref->refcount, lock)) {
 		release(kref);
 		return 1;
 	}
@@ -126,6 +119,6 @@ static inline int kref_put_lock(struct kref *kref,
  */
 static inline int __must_check kref_get_unless_zero(struct kref *kref)
 {
-	return atomic_add_unless(&kref->refcount, 1, 0);
+	return refcount_inc_not_zero(&kref->refcount);
 }
 #endif /* _KREF_H_ */
-- 
cgit v1.2.3


From 6ce77bfd6cedbff61eabf8837dc0901bb671cc86 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Thu, 26 Jan 2017 11:40:57 +0200
Subject: perf/core: Allow kernel filters on CPU events

While supporting file-based address filters for CPU events requires some
extra context switch handling, kernel address filters are easy, since the
kernel mapping is preserved across address spaces. It is also useful as
it permits tracing scheduling paths of the kernel.

This patch allows setting up kernel filters for CPU events.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Will Deacon <will.deacon@arm.com>
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20170126094057.13805-4-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h |  2 ++
 kernel/events/core.c       | 42 ++++++++++++++++++++++++++++--------------
 2 files changed, 30 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 5c58e93c130c..000fdb211c7d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -482,6 +482,7 @@ struct perf_addr_filter {
  * @list:	list of filters for this event
  * @lock:	spinlock that serializes accesses to the @list and event's
  *		(and its children's) filter generations.
+ * @nr_file_filters:	number of file-based filters
  *
  * A child event will use parent's @list (and therefore @lock), so they are
  * bundled together; see perf_event_addr_filters().
@@ -489,6 +490,7 @@ struct perf_addr_filter {
 struct perf_addr_filters_head {
 	struct list_head	list;
 	raw_spinlock_t		lock;
+	unsigned int		nr_file_filters;
 };
 
 /**
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1730995c31ec..a8664247b3e4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8090,6 +8090,9 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
 	if (task == TASK_TOMBSTONE)
 		return;
 
+	if (!ifh->nr_file_filters)
+		return;
+
 	mm = get_task_mm(event->ctx->task);
 	if (!mm)
 		goto restart;
@@ -8268,6 +8271,18 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
 				if (!filename)
 					goto fail;
 
+				/*
+				 * For now, we only support file-based filters
+				 * in per-task events; doing so for CPU-wide
+				 * events requires additional context switching
+				 * trickery, since same object code will be
+				 * mapped at different virtual addresses in
+				 * different processes.
+				 */
+				ret = -EOPNOTSUPP;
+				if (!event->ctx->task)
+					goto fail_free_name;
+
 				/* look up the path and grab its inode */
 				ret = kern_path(filename, LOOKUP_FOLLOW, &path);
 				if (ret)
@@ -8283,6 +8298,8 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
 				    !S_ISREG(filter->inode->i_mode))
 					/* free_filters_list() will iput() */
 					goto fail;
+
+				event->addr_filters.nr_file_filters++;
 			}
 
 			/* ready to consume more filters */
@@ -8322,24 +8339,13 @@ perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
 	if (WARN_ON_ONCE(event->parent))
 		return -EINVAL;
 
-	/*
-	 * For now, we only support filtering in per-task events; doing so
-	 * for CPU-wide events requires additional context switching trickery,
-	 * since same object code will be mapped at different virtual
-	 * addresses in different processes.
-	 */
-	if (!event->ctx->task)
-		return -EOPNOTSUPP;
-
 	ret = perf_event_parse_addr_filter(event, filter_str, &filters);
 	if (ret)
-		return ret;
+		goto fail_clear_files;
 
 	ret = event->pmu->addr_filters_validate(&filters);
-	if (ret) {
-		free_filters_list(&filters);
-		return ret;
-	}
+	if (ret)
+		goto fail_free_filters;
 
 	/* remove existing filters, if any */
 	perf_addr_filters_splice(event, &filters);
@@ -8347,6 +8353,14 @@ perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
 	/* install new filters */
 	perf_event_for_each_child(event, perf_event_addr_filters_apply);
 
+	return ret;
+
+fail_free_filters:
+	free_filters_list(&filters);
+
+fail_clear_files:
+	event->addr_filters.nr_file_filters = 0;
+
 	return ret;
 }
 
-- 
cgit v1.2.3


From dfb4357da6ddbdf57d583ba64361c9d792b0e0b1 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 8 Feb 2017 11:26:59 -0800
Subject: time: Remove CONFIG_TIMER_STATS

Currently CONFIG_TIMER_STATS exposes process information across namespaces:

kernel/time/timer_list.c print_timer():

        SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);

/proc/timer_list:

 #11: <0000000000000000>, hrtimer_wakeup, S:01, do_nanosleep, cron/2570

Given that the tracer can give the same information, this patch entirely
removes CONFIG_TIMER_STATS.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Kees Cook <keescook@chromium.org>
Acked-by: John Stultz <john.stultz@linaro.org>
Cc: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: linux-doc@vger.kernel.org
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Xing Gao <xgao01@email.wm.edu>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Jessica Frazelle <me@jessfraz.com>
Cc: kernel-hardening@lists.openwall.com
Cc: Nicolas Iooss <nicolas.iooss_linux@m4x.org>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Michal Marek <mmarek@suse.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Olof Johansson <olof@lixom.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-api@vger.kernel.org
Cc: Arjan van de Ven <arjan@linux.intel.com>
Link: http://lkml.kernel.org/r/20170208192659.GA32582@beast
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 Documentation/timers/timer_stats.txt |  73 ------
 include/linux/hrtimer.h              |  11 -
 include/linux/timer.h                |  45 ----
 kernel/kthread.c                     |   1 -
 kernel/time/Makefile                 |   1 -
 kernel/time/hrtimer.c                |  38 ----
 kernel/time/timer.c                  |  48 +---
 kernel/time/timer_list.c             |  10 -
 kernel/time/timer_stats.c            | 425 -----------------------------------
 kernel/workqueue.c                   |   2 -
 lib/Kconfig.debug                    |  14 --
 11 files changed, 2 insertions(+), 666 deletions(-)
 delete mode 100644 Documentation/timers/timer_stats.txt
 delete mode 100644 kernel/time/timer_stats.c

(limited to 'include/linux')

diff --git a/Documentation/timers/timer_stats.txt b/Documentation/timers/timer_stats.txt
deleted file mode 100644
index de835ee97455..000000000000
--- a/Documentation/timers/timer_stats.txt
+++ /dev/null
@@ -1,73 +0,0 @@
-timer_stats - timer usage statistics
-------------------------------------
-
-timer_stats is a debugging facility to make the timer (ab)usage in a Linux
-system visible to kernel and userspace developers. If enabled in the config
-but not used it has almost zero runtime overhead, and a relatively small
-data structure overhead. Even if collection is enabled runtime all the
-locking is per-CPU and lookup is hashed.
-
-timer_stats should be used by kernel and userspace developers to verify that
-their code does not make unduly use of timers. This helps to avoid unnecessary
-wakeups, which should be avoided to optimize power consumption.
-
-It can be enabled by CONFIG_TIMER_STATS in the "Kernel hacking" configuration
-section.
-
-timer_stats collects information about the timer events which are fired in a
-Linux system over a sample period:
-
-- the pid of the task(process) which initialized the timer
-- the name of the process which initialized the timer
-- the function where the timer was initialized
-- the callback function which is associated to the timer
-- the number of events (callbacks)
-
-timer_stats adds an entry to /proc: /proc/timer_stats
-
-This entry is used to control the statistics functionality and to read out the
-sampled information.
-
-The timer_stats functionality is inactive on bootup.
-
-To activate a sample period issue:
-# echo 1 >/proc/timer_stats
-
-To stop a sample period issue:
-# echo 0 >/proc/timer_stats
-
-The statistics can be retrieved by:
-# cat /proc/timer_stats
-
-While sampling is enabled, each readout from /proc/timer_stats will see
-newly updated statistics. Once sampling is disabled, the sampled information
-is kept until a new sample period is started. This allows multiple readouts.
-
-Sample output of /proc/timer_stats:
-
-Timerstats sample period: 3.888770 s
-  12,     0 swapper          hrtimer_stop_sched_tick (hrtimer_sched_tick)
-  15,     1 swapper          hcd_submit_urb (rh_timer_func)
-   4,   959 kedac            schedule_timeout (process_timeout)
-   1,     0 swapper          page_writeback_init (wb_timer_fn)
-  28,     0 swapper          hrtimer_stop_sched_tick (hrtimer_sched_tick)
-  22,  2948 IRQ 4            tty_flip_buffer_push (delayed_work_timer_fn)
-   3,  3100 bash             schedule_timeout (process_timeout)
-   1,     1 swapper          queue_delayed_work_on (delayed_work_timer_fn)
-   1,     1 swapper          queue_delayed_work_on (delayed_work_timer_fn)
-   1,     1 swapper          neigh_table_init_no_netlink (neigh_periodic_timer)
-   1,  2292 ip               __netdev_watchdog_up (dev_watchdog)
-   1,    23 events/1         do_cache_clean (delayed_work_timer_fn)
-90 total events, 30.0 events/sec
-
-The first column is the number of events, the second column the pid, the third
-column is the name of the process. The forth column shows the function which
-initialized the timer and in parenthesis the callback function which was
-executed on expiry.
-
-    Thomas, Ingo
-
-Added flag to indicate 'deferrable timer' in /proc/timer_stats. A deferrable
-timer will appear as follows
-  10D,     1 swapper          queue_delayed_work_on (delayed_work_timer_fn)
-
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index cdab81ba29f8..e52b427223ba 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -88,12 +88,6 @@ enum hrtimer_restart {
  * @base:	pointer to the timer base (per cpu and per clock)
  * @state:	state information (See bit values above)
  * @is_rel:	Set if the timer was armed relative
- * @start_pid:  timer statistics field to store the pid of the task which
- *		started the timer
- * @start_site:	timer statistics field to store the site where the timer
- *		was started
- * @start_comm: timer statistics field to store the name of the process which
- *		started the timer
  *
  * The hrtimer structure must be initialized by hrtimer_init()
  */
@@ -104,11 +98,6 @@ struct hrtimer {
 	struct hrtimer_clock_base	*base;
 	u8				state;
 	u8				is_rel;
-#ifdef CONFIG_TIMER_STATS
-	int				start_pid;
-	void				*start_site;
-	char				start_comm[16];
-#endif
 };
 
 /**
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 51d601f192d4..5a209b84fd9e 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -20,11 +20,6 @@ struct timer_list {
 	unsigned long		data;
 	u32			flags;
 
-#ifdef CONFIG_TIMER_STATS
-	int			start_pid;
-	void			*start_site;
-	char			start_comm[16];
-#endif
 #ifdef CONFIG_LOCKDEP
 	struct lockdep_map	lockdep_map;
 #endif
@@ -197,46 +192,6 @@ extern int mod_timer_pending(struct timer_list *timer, unsigned long expires);
  */
 #define NEXT_TIMER_MAX_DELTA	((1UL << 30) - 1)
 
-/*
- * Timer-statistics info:
- */
-#ifdef CONFIG_TIMER_STATS
-
-extern int timer_stats_active;
-
-extern void init_timer_stats(void);
-
-extern void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
-				     void *timerf, char *comm, u32 flags);
-
-extern void __timer_stats_timer_set_start_info(struct timer_list *timer,
-					       void *addr);
-
-static inline void timer_stats_timer_set_start_info(struct timer_list *timer)
-{
-	if (likely(!timer_stats_active))
-		return;
-	__timer_stats_timer_set_start_info(timer, __builtin_return_address(0));
-}
-
-static inline void timer_stats_timer_clear_start_info(struct timer_list *timer)
-{
-	timer->start_site = NULL;
-}
-#else
-static inline void init_timer_stats(void)
-{
-}
-
-static inline void timer_stats_timer_set_start_info(struct timer_list *timer)
-{
-}
-
-static inline void timer_stats_timer_clear_start_info(struct timer_list *timer)
-{
-}
-#endif
-
 extern void add_timer(struct timer_list *timer);
 
 extern int try_to_del_timer_sync(struct timer_list *timer);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 2318fba86277..8461a4372e8a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -850,7 +850,6 @@ void __kthread_queue_delayed_work(struct kthread_worker *worker,
 
 	list_add(&work->node, &worker->delayed_work_list);
 	work->worker = worker;
-	timer_stats_timer_set_start_info(&dwork->timer);
 	timer->expires = jiffies + delay;
 	add_timer(timer);
 }
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 976840d29a71..938dbf33ef49 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -15,6 +15,5 @@ ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y)
 endif
 obj-$(CONFIG_GENERIC_SCHED_CLOCK)		+= sched_clock.o
 obj-$(CONFIG_TICK_ONESHOT)			+= tick-oneshot.o tick-sched.o
-obj-$(CONFIG_TIMER_STATS)			+= timer_stats.o
 obj-$(CONFIG_DEBUG_FS)				+= timekeeping_debug.o
 obj-$(CONFIG_TEST_UDELAY)			+= test_udelay.o
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index c6ecedd3b839..edabde646e58 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -766,34 +766,6 @@ void hrtimers_resume(void)
 	clock_was_set_delayed();
 }
 
-static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
-{
-#ifdef CONFIG_TIMER_STATS
-	if (timer->start_site)
-		return;
-	timer->start_site = __builtin_return_address(0);
-	memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
-	timer->start_pid = current->pid;
-#endif
-}
-
-static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer)
-{
-#ifdef CONFIG_TIMER_STATS
-	timer->start_site = NULL;
-#endif
-}
-
-static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
-{
-#ifdef CONFIG_TIMER_STATS
-	if (likely(!timer_stats_active))
-		return;
-	timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
-				 timer->function, timer->start_comm, 0);
-#endif
-}
-
 /*
  * Counterpart to lock_hrtimer_base above:
  */
@@ -932,7 +904,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest
 		 * rare case and less expensive than a smp call.
 		 */
 		debug_deactivate(timer);
-		timer_stats_hrtimer_clear_start_info(timer);
 		reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
 
 		if (!restart)
@@ -990,8 +961,6 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 	/* Switch the timer base, if necessary: */
 	new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
 
-	timer_stats_hrtimer_set_start_info(timer);
-
 	leftmost = enqueue_hrtimer(timer, new_base);
 	if (!leftmost)
 		goto unlock;
@@ -1128,12 +1097,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 	base = hrtimer_clockid_to_base(clock_id);
 	timer->base = &cpu_base->clock_base[base];
 	timerqueue_init(&timer->node);
-
-#ifdef CONFIG_TIMER_STATS
-	timer->start_site = NULL;
-	timer->start_pid = -1;
-	memset(timer->start_comm, 0, TASK_COMM_LEN);
-#endif
 }
 
 /**
@@ -1217,7 +1180,6 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 	raw_write_seqcount_barrier(&cpu_base->seq);
 
 	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
-	timer_stats_account_hrtimer(timer);
 	fn = timer->function;
 
 	/*
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index ec33a6933eae..82a6bfa0c307 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -571,38 +571,6 @@ internal_add_timer(struct timer_base *base, struct timer_list *timer)
 	trigger_dyntick_cpu(base, timer);
 }
 
-#ifdef CONFIG_TIMER_STATS
-void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
-{
-	if (timer->start_site)
-		return;
-
-	timer->start_site = addr;
-	memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
-	timer->start_pid = current->pid;
-}
-
-static void timer_stats_account_timer(struct timer_list *timer)
-{
-	void *site;
-
-	/*
-	 * start_site can be concurrently reset by
-	 * timer_stats_timer_clear_start_info()
-	 */
-	site = READ_ONCE(timer->start_site);
-	if (likely(!site))
-		return;
-
-	timer_stats_update_stats(timer, timer->start_pid, site,
-				 timer->function, timer->start_comm,
-				 timer->flags);
-}
-
-#else
-static void timer_stats_account_timer(struct timer_list *timer) {}
-#endif
-
 #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
 
 static struct debug_obj_descr timer_debug_descr;
@@ -789,11 +757,6 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,
 {
 	timer->entry.pprev = NULL;
 	timer->flags = flags | raw_smp_processor_id();
-#ifdef CONFIG_TIMER_STATS
-	timer->start_site = NULL;
-	timer->start_pid = -1;
-	memset(timer->start_comm, 0, TASK_COMM_LEN);
-#endif
 	lockdep_init_map(&timer->lockdep_map, name, key, 0);
 }
 
@@ -1001,8 +964,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 		base = lock_timer_base(timer, &flags);
 	}
 
-	timer_stats_timer_set_start_info(timer);
-
 	ret = detach_if_pending(timer, base, false);
 	if (!ret && pending_only)
 		goto out_unlock;
@@ -1130,7 +1091,6 @@ void add_timer_on(struct timer_list *timer, int cpu)
 	struct timer_base *new_base, *base;
 	unsigned long flags;
 
-	timer_stats_timer_set_start_info(timer);
 	BUG_ON(timer_pending(timer) || !timer->function);
 
 	new_base = get_timer_cpu_base(timer->flags, cpu);
@@ -1176,7 +1136,6 @@ int del_timer(struct timer_list *timer)
 
 	debug_assert_init(timer);
 
-	timer_stats_timer_clear_start_info(timer);
 	if (timer_pending(timer)) {
 		base = lock_timer_base(timer, &flags);
 		ret = detach_if_pending(timer, base, true);
@@ -1204,10 +1163,9 @@ int try_to_del_timer_sync(struct timer_list *timer)
 
 	base = lock_timer_base(timer, &flags);
 
-	if (base->running_timer != timer) {
-		timer_stats_timer_clear_start_info(timer);
+	if (base->running_timer != timer)
 		ret = detach_if_pending(timer, base, true);
-	}
+
 	spin_unlock_irqrestore(&base->lock, flags);
 
 	return ret;
@@ -1331,7 +1289,6 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
 		unsigned long data;
 
 		timer = hlist_entry(head->first, struct timer_list, entry);
-		timer_stats_account_timer(timer);
 
 		base->running_timer = timer;
 		detach_timer(timer, true);
@@ -1868,7 +1825,6 @@ static void __init init_timer_cpus(void)
 void __init init_timers(void)
 {
 	init_timer_cpus();
-	init_timer_stats();
 	open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
 }
 
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index afe6cd1944fc..387a3a5aa388 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -62,21 +62,11 @@ static void
 print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
 	    int idx, u64 now)
 {
-#ifdef CONFIG_TIMER_STATS
-	char tmp[TASK_COMM_LEN + 1];
-#endif
 	SEQ_printf(m, " #%d: ", idx);
 	print_name_offset(m, taddr);
 	SEQ_printf(m, ", ");
 	print_name_offset(m, timer->function);
 	SEQ_printf(m, ", S:%02x", timer->state);
-#ifdef CONFIG_TIMER_STATS
-	SEQ_printf(m, ", ");
-	print_name_offset(m, timer->start_site);
-	memcpy(tmp, timer->start_comm, TASK_COMM_LEN);
-	tmp[TASK_COMM_LEN] = 0;
-	SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
-#endif
 	SEQ_printf(m, "\n");
 	SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n",
 		(unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)),
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
deleted file mode 100644
index afddded947df..000000000000
--- a/kernel/time/timer_stats.c
+++ /dev/null
@@ -1,425 +0,0 @@
-/*
- * kernel/time/timer_stats.c
- *
- * Collect timer usage statistics.
- *
- * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
- * Copyright(C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
- *
- * timer_stats is based on timer_top, a similar functionality which was part of
- * Con Kolivas dyntick patch set. It was developed by Daniel Petrini at the
- * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based
- * on dynamic allocation of the statistics entries and linear search based
- * lookup combined with a global lock, rather than the static array, hash
- * and per-CPU locking which is used by timer_stats. It was written for the
- * pre hrtimer kernel code and therefore did not take hrtimers into account.
- * Nevertheless it provided the base for the timer_stats implementation and
- * was a helpful source of inspiration. Kudos to Daniel and the Nokia folks
- * for this effort.
- *
- * timer_top.c is
- *	Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus
- *	Written by Daniel Petrini <d.pensator@gmail.com>
- *	timer_top.c was released under the GNU General Public License version 2
- *
- * We export the addresses and counting of timer functions being called,
- * the pid and cmdline from the owner process if applicable.
- *
- * Start/stop data collection:
- * # echo [1|0] >/proc/timer_stats
- *
- * Display the information collected so far:
- * # cat /proc/timer_stats
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/proc_fs.h>
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/sched.h>
-#include <linux/seq_file.h>
-#include <linux/kallsyms.h>
-
-#include <linux/uaccess.h>
-
-/*
- * This is our basic unit of interest: a timer expiry event identified
- * by the timer, its start/expire functions and the PID of the task that
- * started the timer. We count the number of times an event happens:
- */
-struct entry {
-	/*
-	 * Hash list:
-	 */
-	struct entry		*next;
-
-	/*
-	 * Hash keys:
-	 */
-	void			*timer;
-	void			*start_func;
-	void			*expire_func;
-	pid_t			pid;
-
-	/*
-	 * Number of timeout events:
-	 */
-	unsigned long		count;
-	u32			flags;
-
-	/*
-	 * We save the command-line string to preserve
-	 * this information past task exit:
-	 */
-	char			comm[TASK_COMM_LEN + 1];
-
-} ____cacheline_aligned_in_smp;
-
-/*
- * Spinlock protecting the tables - not taken during lookup:
- */
-static DEFINE_RAW_SPINLOCK(table_lock);
-
-/*
- * Per-CPU lookup locks for fast hash lookup:
- */
-static DEFINE_PER_CPU(raw_spinlock_t, tstats_lookup_lock);
-
-/*
- * Mutex to serialize state changes with show-stats activities:
- */
-static DEFINE_MUTEX(show_mutex);
-
-/*
- * Collection status, active/inactive:
- */
-int __read_mostly timer_stats_active;
-
-/*
- * Beginning/end timestamps of measurement:
- */
-static ktime_t time_start, time_stop;
-
-/*
- * tstat entry structs only get allocated while collection is
- * active and never freed during that time - this simplifies
- * things quite a bit.
- *
- * They get freed when a new collection period is started.
- */
-#define MAX_ENTRIES_BITS	10
-#define MAX_ENTRIES		(1UL << MAX_ENTRIES_BITS)
-
-static unsigned long nr_entries;
-static struct entry entries[MAX_ENTRIES];
-
-static atomic_t overflow_count;
-
-/*
- * The entries are in a hash-table, for fast lookup:
- */
-#define TSTAT_HASH_BITS		(MAX_ENTRIES_BITS - 1)
-#define TSTAT_HASH_SIZE		(1UL << TSTAT_HASH_BITS)
-#define TSTAT_HASH_MASK		(TSTAT_HASH_SIZE - 1)
-
-#define __tstat_hashfn(entry)						\
-	(((unsigned long)(entry)->timer       ^				\
-	  (unsigned long)(entry)->start_func  ^				\
-	  (unsigned long)(entry)->expire_func ^				\
-	  (unsigned long)(entry)->pid		) & TSTAT_HASH_MASK)
-
-#define tstat_hashentry(entry)	(tstat_hash_table + __tstat_hashfn(entry))
-
-static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly;
-
-static void reset_entries(void)
-{
-	nr_entries = 0;
-	memset(entries, 0, sizeof(entries));
-	memset(tstat_hash_table, 0, sizeof(tstat_hash_table));
-	atomic_set(&overflow_count, 0);
-}
-
-static struct entry *alloc_entry(void)
-{
-	if (nr_entries >= MAX_ENTRIES)
-		return NULL;
-
-	return entries + nr_entries++;
-}
-
-static int match_entries(struct entry *entry1, struct entry *entry2)
-{
-	return entry1->timer       == entry2->timer	  &&
-	       entry1->start_func  == entry2->start_func  &&
-	       entry1->expire_func == entry2->expire_func &&
-	       entry1->pid	   == entry2->pid;
-}
-
-/*
- * Look up whether an entry matching this item is present
- * in the hash already. Must be called with irqs off and the
- * lookup lock held:
- */
-static struct entry *tstat_lookup(struct entry *entry, char *comm)
-{
-	struct entry **head, *curr, *prev;
-
-	head = tstat_hashentry(entry);
-	curr = *head;
-
-	/*
-	 * The fastpath is when the entry is already hashed,
-	 * we do this with the lookup lock held, but with the
-	 * table lock not held:
-	 */
-	while (curr) {
-		if (match_entries(curr, entry))
-			return curr;
-
-		curr = curr->next;
-	}
-	/*
-	 * Slowpath: allocate, set up and link a new hash entry:
-	 */
-	prev = NULL;
-	curr = *head;
-
-	raw_spin_lock(&table_lock);
-	/*
-	 * Make sure we have not raced with another CPU:
-	 */
-	while (curr) {
-		if (match_entries(curr, entry))
-			goto out_unlock;
-
-		prev = curr;
-		curr = curr->next;
-	}
-
-	curr = alloc_entry();
-	if (curr) {
-		*curr = *entry;
-		curr->count = 0;
-		curr->next = NULL;
-		memcpy(curr->comm, comm, TASK_COMM_LEN);
-
-		smp_mb(); /* Ensure that curr is initialized before insert */
-
-		if (prev)
-			prev->next = curr;
-		else
-			*head = curr;
-	}
- out_unlock:
-	raw_spin_unlock(&table_lock);
-
-	return curr;
-}
-
-/**
- * timer_stats_update_stats - Update the statistics for a timer.
- * @timer:	pointer to either a timer_list or a hrtimer
- * @pid:	the pid of the task which set up the timer
- * @startf:	pointer to the function which did the timer setup
- * @timerf:	pointer to the timer callback function of the timer
- * @comm:	name of the process which set up the timer
- * @tflags:	The flags field of the timer
- *
- * When the timer is already registered, then the event counter is
- * incremented. Otherwise the timer is registered in a free slot.
- */
-void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
-			      void *timerf, char *comm, u32 tflags)
-{
-	/*
-	 * It doesn't matter which lock we take:
-	 */
-	raw_spinlock_t *lock;
-	struct entry *entry, input;
-	unsigned long flags;
-
-	if (likely(!timer_stats_active))
-		return;
-
-	lock = &per_cpu(tstats_lookup_lock, raw_smp_processor_id());
-
-	input.timer = timer;
-	input.start_func = startf;
-	input.expire_func = timerf;
-	input.pid = pid;
-	input.flags = tflags;
-
-	raw_spin_lock_irqsave(lock, flags);
-	if (!timer_stats_active)
-		goto out_unlock;
-
-	entry = tstat_lookup(&input, comm);
-	if (likely(entry))
-		entry->count++;
-	else
-		atomic_inc(&overflow_count);
-
- out_unlock:
-	raw_spin_unlock_irqrestore(lock, flags);
-}
-
-static void print_name_offset(struct seq_file *m, unsigned long addr)
-{
-	char symname[KSYM_NAME_LEN];
-
-	if (lookup_symbol_name(addr, symname) < 0)
-		seq_printf(m, "<%p>", (void *)addr);
-	else
-		seq_printf(m, "%s", symname);
-}
-
-static int tstats_show(struct seq_file *m, void *v)
-{
-	struct timespec64 period;
-	struct entry *entry;
-	unsigned long ms;
-	long events = 0;
-	ktime_t time;
-	int i;
-
-	mutex_lock(&show_mutex);
-	/*
-	 * If still active then calculate up to now:
-	 */
-	if (timer_stats_active)
-		time_stop = ktime_get();
-
-	time = ktime_sub(time_stop, time_start);
-
-	period = ktime_to_timespec64(time);
-	ms = period.tv_nsec / 1000000;
-
-	seq_puts(m, "Timer Stats Version: v0.3\n");
-	seq_printf(m, "Sample period: %ld.%03ld s\n", (long)period.tv_sec, ms);
-	if (atomic_read(&overflow_count))
-		seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count));
-	seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive");
-
-	for (i = 0; i < nr_entries; i++) {
-		entry = entries + i;
-		if (entry->flags & TIMER_DEFERRABLE) {
-			seq_printf(m, "%4luD, %5d %-16s ",
-				entry->count, entry->pid, entry->comm);
-		} else {
-			seq_printf(m, " %4lu, %5d %-16s ",
-				entry->count, entry->pid, entry->comm);
-		}
-
-		print_name_offset(m, (unsigned long)entry->start_func);
-		seq_puts(m, " (");
-		print_name_offset(m, (unsigned long)entry->expire_func);
-		seq_puts(m, ")\n");
-
-		events += entry->count;
-	}
-
-	ms += period.tv_sec * 1000;
-	if (!ms)
-		ms = 1;
-
-	if (events && period.tv_sec)
-		seq_printf(m, "%ld total events, %ld.%03ld events/sec\n",
-			   events, events * 1000 / ms,
-			   (events * 1000000 / ms) % 1000);
-	else
-		seq_printf(m, "%ld total events\n", events);
-
-	mutex_unlock(&show_mutex);
-
-	return 0;
-}
-
-/*
- * After a state change, make sure all concurrent lookup/update
- * activities have stopped:
- */
-static void sync_access(void)
-{
-	unsigned long flags;
-	int cpu;
-
-	for_each_online_cpu(cpu) {
-		raw_spinlock_t *lock = &per_cpu(tstats_lookup_lock, cpu);
-
-		raw_spin_lock_irqsave(lock, flags);
-		/* nothing */
-		raw_spin_unlock_irqrestore(lock, flags);
-	}
-}
-
-static ssize_t tstats_write(struct file *file, const char __user *buf,
-			    size_t count, loff_t *offs)
-{
-	char ctl[2];
-
-	if (count != 2 || *offs)
-		return -EINVAL;
-
-	if (copy_from_user(ctl, buf, count))
-		return -EFAULT;
-
-	mutex_lock(&show_mutex);
-	switch (ctl[0]) {
-	case '0':
-		if (timer_stats_active) {
-			timer_stats_active = 0;
-			time_stop = ktime_get();
-			sync_access();
-		}
-		break;
-	case '1':
-		if (!timer_stats_active) {
-			reset_entries();
-			time_start = ktime_get();
-			smp_mb();
-			timer_stats_active = 1;
-		}
-		break;
-	default:
-		count = -EINVAL;
-	}
-	mutex_unlock(&show_mutex);
-
-	return count;
-}
-
-static int tstats_open(struct inode *inode, struct file *filp)
-{
-	return single_open(filp, tstats_show, NULL);
-}
-
-static const struct file_operations tstats_fops = {
-	.open		= tstats_open,
-	.read		= seq_read,
-	.write		= tstats_write,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-void __init init_timer_stats(void)
-{
-	int cpu;
-
-	for_each_possible_cpu(cpu)
-		raw_spin_lock_init(&per_cpu(tstats_lookup_lock, cpu));
-}
-
-static int __init init_tstats_procfs(void)
-{
-	struct proc_dir_entry *pe;
-
-	pe = proc_create("timer_stats", 0644, NULL, &tstats_fops);
-	if (!pe)
-		return -ENOMEM;
-	return 0;
-}
-__initcall(init_tstats_procfs);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1d9fb6543a66..072cbc9b175d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1523,8 +1523,6 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
 		return;
 	}
 
-	timer_stats_timer_set_start_info(&dwork->timer);
-
 	dwork->wq = wq;
 	dwork->cpu = cpu;
 	timer->expires = jiffies + delay;
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index eb9e9a7870fa..132af338d6dd 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -980,20 +980,6 @@ config DEBUG_TIMEKEEPING
 
 	  If unsure, say N.
 
-config TIMER_STATS
-	bool "Collect kernel timers statistics"
-	depends on DEBUG_KERNEL && PROC_FS
-	help
-	  If you say Y here, additional code will be inserted into the
-	  timer routines to collect statistics about kernel timers being
-	  reprogrammed. The statistics can be read from /proc/timer_stats.
-	  The statistics collection is started by writing 1 to /proc/timer_stats,
-	  writing 0 stops it. This feature is useful to collect information
-	  about timer usage patterns in kernel and userspace. This feature
-	  is lightweight if enabled in the kernel config but not activated
-	  (it defaults to deactivated on bootup and will only be activated
-	  if some application like powertop activates it explicitly).
-
 config DEBUG_PREEMPT
 	bool "Debug preemptible kernel"
 	depends on DEBUG_KERNEL && PREEMPT && TRACE_IRQFLAGS_SUPPORT
-- 
cgit v1.2.3


From 534766dfef999f7e7349bbd91cd19c1673792af3 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Tue, 31 Jan 2017 16:58:42 +0100
Subject: iommu: Rename iommu_get_instance()

Rename the function to iommu_ops_from_fwnode(), because that
is what the function actually does. The new name is much
more descriptive about what the function does.

Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/acpi/arm64/iort.c | 2 +-
 drivers/iommu/iommu.c     | 2 +-
 include/linux/iommu.h     | 4 ++--
 include/linux/of_iommu.h  | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index e0d2e6e6e40c..3752521c62ab 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -536,7 +536,7 @@ static const struct iommu_ops *iort_iommu_xlate(struct device *dev,
 		if (!iort_fwnode)
 			return NULL;
 
-		ops = iommu_get_instance(iort_fwnode);
+		ops = iommu_ops_from_fwnode(iort_fwnode);
 		if (!ops)
 			return NULL;
 
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index aed906a3e3db..2bb61e806a52 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1664,7 +1664,7 @@ void iommu_register_instance(struct fwnode_handle *fwnode,
 	spin_unlock(&iommu_instance_lock);
 }
 
-const struct iommu_ops *iommu_get_instance(struct fwnode_handle *fwnode)
+const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode)
 {
 	struct iommu_instance *instance;
 	const struct iommu_ops *ops = NULL;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 0ff5111f6959..085e1f0e6c07 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -354,7 +354,7 @@ void iommu_fwspec_free(struct device *dev);
 int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids);
 void iommu_register_instance(struct fwnode_handle *fwnode,
 			     const struct iommu_ops *ops);
-const struct iommu_ops *iommu_get_instance(struct fwnode_handle *fwnode);
+const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode);
 
 #else /* CONFIG_IOMMU_API */
 
@@ -590,7 +590,7 @@ static inline void iommu_register_instance(struct fwnode_handle *fwnode,
 }
 
 static inline
-const struct iommu_ops *iommu_get_instance(struct fwnode_handle *fwnode)
+const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode)
 {
 	return NULL;
 }
diff --git a/include/linux/of_iommu.h b/include/linux/of_iommu.h
index 6a7fc5051099..66fcbc949899 100644
--- a/include/linux/of_iommu.h
+++ b/include/linux/of_iommu.h
@@ -39,7 +39,7 @@ static inline void of_iommu_set_ops(struct device_node *np,
 
 static inline const struct iommu_ops *of_iommu_get_ops(struct device_node *np)
 {
-	return iommu_get_instance(&np->fwnode);
+	return iommu_ops_from_fwnode(&np->fwnode);
 }
 
 extern struct of_device_id __iommu_of_table;
-- 
cgit v1.2.3


From b0119e870837dcd15a207b4701542ebac5d19b45 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Wed, 1 Feb 2017 13:23:08 +0100
Subject: iommu: Introduce new 'struct iommu_device'

This struct represents one hardware iommu in the iommu core
code. For now it only has the iommu-ops associated with it,
but that will be extended soon.

The register/unregister interface is also added, as well as
making use of it in the Intel and AMD IOMMU drivers.

Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/amd_iommu.c       |  4 ++--
 drivers/iommu/amd_iommu_init.c  |  5 +++++
 drivers/iommu/amd_iommu_types.h |  3 +++
 drivers/iommu/dmar.c            |  9 +++++++++
 drivers/iommu/intel-iommu.c     |  4 ++--
 drivers/iommu/iommu.c           | 19 +++++++++++++++++++
 include/linux/intel-iommu.h     |  2 ++
 include/linux/iommu.h           | 35 +++++++++++++++++++++++++++++++++++
 8 files changed, 77 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 019e02707cd5..689d88fa29a6 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -112,7 +112,7 @@ static struct timer_list queue_timer;
  * Domain for untranslated devices - only allocated
  * if iommu=pt passed on kernel cmd line.
  */
-static const struct iommu_ops amd_iommu_ops;
+const struct iommu_ops amd_iommu_ops;
 
 static ATOMIC_NOTIFIER_HEAD(ppr_notifier);
 int amd_iommu_max_glx_val = -1;
@@ -3217,7 +3217,7 @@ static void amd_iommu_apply_dm_region(struct device *dev,
 	WARN_ON_ONCE(reserve_iova(&dma_dom->iovad, start, end) == NULL);
 }
 
-static const struct iommu_ops amd_iommu_ops = {
+const struct iommu_ops amd_iommu_ops = {
 	.capable = amd_iommu_capable,
 	.domain_alloc = amd_iommu_domain_alloc,
 	.domain_free  = amd_iommu_domain_free,
diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 6799cf9713f7..b7ccfb21b271 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -94,6 +94,8 @@
  * out of it.
  */
 
+extern const struct iommu_ops amd_iommu_ops;
+
 /*
  * structure describing one IOMMU in the ACPI table. Typically followed by one
  * or more ivhd_entrys.
@@ -1639,6 +1641,9 @@ static int iommu_init_pci(struct amd_iommu *iommu)
 					       amd_iommu_groups, "ivhd%d",
 					       iommu->index);
 
+	iommu_device_set_ops(&iommu->iommu, &amd_iommu_ops);
+	iommu_device_register(&iommu->iommu);
+
 	return pci_enable_device(iommu->dev);
 }
 
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index 0d91785ebdc3..0683505a498a 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -538,6 +538,9 @@ struct amd_iommu {
 	/* IOMMU sysfs device */
 	struct device *iommu_dev;
 
+	/* Handle for IOMMU core code */
+	struct iommu_device iommu;
+
 	/*
 	 * We can't rely on the BIOS to restore all values on reinit, so we
 	 * need to stash them
diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index a88576d50740..83fee0e8cf43 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -74,6 +74,8 @@ static unsigned long dmar_seq_ids[BITS_TO_LONGS(DMAR_UNITS_SUPPORTED)];
 static int alloc_iommu(struct dmar_drhd_unit *drhd);
 static void free_iommu(struct intel_iommu *iommu);
 
+extern const struct iommu_ops intel_iommu_ops;
+
 static void dmar_register_drhd_unit(struct dmar_drhd_unit *drhd)
 {
 	/*
@@ -1084,6 +1086,12 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd)
 			err = PTR_ERR(iommu->iommu_dev);
 			goto err_unmap;
 		}
+
+		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
+
+		err = iommu_device_register(&iommu->iommu);
+		if (err)
+			goto err_unmap;
 	}
 
 	drhd->iommu = iommu;
@@ -1102,6 +1110,7 @@ error:
 static void free_iommu(struct intel_iommu *iommu)
 {
 	iommu_device_destroy(iommu->iommu_dev);
+	iommu_device_unregister(&iommu->iommu);
 
 	if (iommu->irq) {
 		if (iommu->pr_irq) {
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index c66c273dfd8a..e6e8f5bf3a8f 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -547,7 +547,7 @@ EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 static DEFINE_SPINLOCK(device_domain_lock);
 static LIST_HEAD(device_domain_list);
 
-static const struct iommu_ops intel_iommu_ops;
+const struct iommu_ops intel_iommu_ops;
 
 static bool translation_pre_enabled(struct intel_iommu *iommu)
 {
@@ -5292,7 +5292,7 @@ struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
 }
 #endif /* CONFIG_INTEL_IOMMU_SVM */
 
-static const struct iommu_ops intel_iommu_ops = {
+const struct iommu_ops intel_iommu_ops = {
 	.capable	= intel_iommu_capable,
 	.domain_alloc	= intel_iommu_domain_alloc,
 	.domain_free	= intel_iommu_domain_free,
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index cc569b1b66a2..1dfd70ea27e4 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -77,6 +77,25 @@ struct iommu_group_attribute iommu_group_attr_##_name =		\
 #define to_iommu_group(_kobj)		\
 	container_of(_kobj, struct iommu_group, kobj)
 
+static LIST_HEAD(iommu_device_list);
+static DEFINE_SPINLOCK(iommu_device_lock);
+
+int iommu_device_register(struct iommu_device *iommu)
+{
+	spin_lock(&iommu_device_lock);
+	list_add_tail(&iommu->list, &iommu_device_list);
+	spin_unlock(&iommu_device_lock);
+
+	return 0;
+}
+
+void iommu_device_unregister(struct iommu_device *iommu)
+{
+	spin_lock(&iommu_device_lock);
+	list_del(&iommu->list);
+	spin_unlock(&iommu_device_lock);
+}
+
 static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus,
 						 unsigned type);
 static int __iommu_attach_device(struct iommu_domain *domain,
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index d49e26c6cdc7..99a65a397861 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -29,6 +29,7 @@
 #include <linux/dma_remapping.h>
 #include <linux/mmu_notifier.h>
 #include <linux/list.h>
+#include <linux/iommu.h>
 #include <asm/cacheflush.h>
 #include <asm/iommu.h>
 
@@ -440,6 +441,7 @@ struct intel_iommu {
 	struct irq_domain *ir_msi_domain;
 #endif
 	struct device	*iommu_dev; /* IOMMU-sysfs device */
+	struct iommu_device iommu;  /* IOMMU core code handle */
 	int		node;
 	u32		flags;      /* Software defined flags */
 };
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 085e1f0e6c07..900ddd212364 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -204,6 +204,26 @@ struct iommu_ops {
 	unsigned long pgsize_bitmap;
 };
 
+/**
+ * struct iommu_device - IOMMU core representation of one IOMMU hardware
+ *			 instance
+ * @list: Used by the iommu-core to keep a list of registered iommus
+ * @ops: iommu-ops for talking to this iommu
+ */
+struct iommu_device {
+	struct list_head list;
+	const struct iommu_ops *ops;
+};
+
+int  iommu_device_register(struct iommu_device *iommu);
+void iommu_device_unregister(struct iommu_device *iommu);
+
+static inline void iommu_device_set_ops(struct iommu_device *iommu,
+					const struct iommu_ops *ops)
+{
+	iommu->ops = ops;
+}
+
 #define IOMMU_GROUP_NOTIFY_ADD_DEVICE		1 /* Device added */
 #define IOMMU_GROUP_NOTIFY_DEL_DEVICE		2 /* Pre Device removed */
 #define IOMMU_GROUP_NOTIFY_BIND_DRIVER		3 /* Pre Driver bind */
@@ -361,6 +381,7 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode);
 struct iommu_ops {};
 struct iommu_group {};
 struct iommu_fwspec {};
+struct iommu_device {};
 
 static inline bool iommu_present(struct bus_type *bus)
 {
@@ -558,6 +579,20 @@ static inline void iommu_device_destroy(struct device *dev)
 {
 }
 
+static inline int  iommu_device_register(struct iommu_device *iommu)
+{
+	return -ENODEV;
+}
+
+static inline void iommu_device_set_ops(struct iommu_device *iommu,
+					const struct iommu_ops *ops)
+{
+}
+
+static inline void iommu_device_unregister(struct iommu_device *iommu)
+{
+}
+
 static inline int iommu_device_link(struct device *dev, struct device *link)
 {
 	return -EINVAL;
-- 
cgit v1.2.3


From 39ab9555c24110671f8dc671311a26e5c985b592 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Wed, 1 Feb 2017 16:56:46 +0100
Subject: iommu: Add sysfs bindings for struct iommu_device

There is currently support for iommu sysfs bindings, but
those need to be implemented in the IOMMU drivers. Add a
more generic version of this by adding a struct device to
struct iommu_device and use that for the sysfs bindings.

Also convert the AMD and Intel IOMMU driver to make use of
it.

Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/amd_iommu.c       | 14 ++++++++-----
 drivers/iommu/amd_iommu_init.c  |  6 ++----
 drivers/iommu/amd_iommu_types.h |  3 ---
 drivers/iommu/dmar.c            | 13 +++++-------
 drivers/iommu/intel-iommu.c     | 15 ++++++++------
 drivers/iommu/iommu-sysfs.c     | 45 +++++++++++++++++------------------------
 include/linux/intel-iommu.h     |  1 -
 include/linux/iommu.h           | 33 ++++++++++++++++--------------
 8 files changed, 61 insertions(+), 69 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 689d88fa29a6..4fee2fdbef3e 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -445,6 +445,7 @@ static void init_iommu_group(struct device *dev)
 static int iommu_init_device(struct device *dev)
 {
 	struct iommu_dev_data *dev_data;
+	struct amd_iommu *iommu;
 	int devid;
 
 	if (dev->archdata.iommu)
@@ -454,6 +455,8 @@ static int iommu_init_device(struct device *dev)
 	if (devid < 0)
 		return devid;
 
+	iommu = amd_iommu_rlookup_table[devid];
+
 	dev_data = find_dev_data(devid);
 	if (!dev_data)
 		return -ENOMEM;
@@ -469,8 +472,7 @@ static int iommu_init_device(struct device *dev)
 
 	dev->archdata.iommu = dev_data;
 
-	iommu_device_link(amd_iommu_rlookup_table[dev_data->devid]->iommu_dev,
-			  dev);
+	iommu_device_link(&iommu->iommu.dev, dev);
 
 	return 0;
 }
@@ -495,13 +497,16 @@ static void iommu_ignore_device(struct device *dev)
 
 static void iommu_uninit_device(struct device *dev)
 {
-	int devid;
 	struct iommu_dev_data *dev_data;
+	struct amd_iommu *iommu;
+	int devid;
 
 	devid = get_device_id(dev);
 	if (devid < 0)
 		return;
 
+	iommu = amd_iommu_rlookup_table[devid];
+
 	dev_data = search_dev_data(devid);
 	if (!dev_data)
 		return;
@@ -509,8 +514,7 @@ static void iommu_uninit_device(struct device *dev)
 	if (dev_data->domain)
 		detach_device(dev);
 
-	iommu_device_unlink(amd_iommu_rlookup_table[dev_data->devid]->iommu_dev,
-			    dev);
+	iommu_device_unlink(&iommu->iommu.dev, dev);
 
 	iommu_group_remove_device(dev);
 
diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index b7ccfb21b271..6b9e66122528 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -1637,10 +1637,8 @@ static int iommu_init_pci(struct amd_iommu *iommu)
 	amd_iommu_erratum_746_workaround(iommu);
 	amd_iommu_ats_write_check_workaround(iommu);
 
-	iommu->iommu_dev = iommu_device_create(&iommu->dev->dev, iommu,
-					       amd_iommu_groups, "ivhd%d",
-					       iommu->index);
-
+	iommu_device_sysfs_add(&iommu->iommu, &iommu->dev->dev,
+			       amd_iommu_groups, "ivhd%d", iommu->index);
 	iommu_device_set_ops(&iommu->iommu, &amd_iommu_ops);
 	iommu_device_register(&iommu->iommu);
 
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index 0683505a498a..af00f381a7b1 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -535,9 +535,6 @@ struct amd_iommu {
 	/* if one, we need to send a completion wait command */
 	bool need_sync;
 
-	/* IOMMU sysfs device */
-	struct device *iommu_dev;
-
 	/* Handle for IOMMU core code */
 	struct iommu_device iommu;
 
diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index 83fee0e8cf43..fc13146f8d16 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -1078,14 +1078,11 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd)
 	raw_spin_lock_init(&iommu->register_lock);
 
 	if (intel_iommu_enabled) {
-		iommu->iommu_dev = iommu_device_create(NULL, iommu,
-						       intel_iommu_groups,
-						       "%s", iommu->name);
-
-		if (IS_ERR(iommu->iommu_dev)) {
-			err = PTR_ERR(iommu->iommu_dev);
+		err = iommu_device_sysfs_add(&iommu->iommu, NULL,
+					     intel_iommu_groups,
+					     "%s", iommu->name);
+		if (err)
 			goto err_unmap;
-		}
 
 		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
 
@@ -1109,7 +1106,7 @@ error:
 
 static void free_iommu(struct intel_iommu *iommu)
 {
-	iommu_device_destroy(iommu->iommu_dev);
+	iommu_device_sysfs_remove(&iommu->iommu);
 	iommu_device_unregister(&iommu->iommu);
 
 	if (iommu->irq) {
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index e6e8f5bf3a8f..316730c63af6 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -4834,10 +4834,13 @@ int __init intel_iommu_init(void)
 
 	init_iommu_pm_ops();
 
-	for_each_active_iommu(iommu, drhd)
-		iommu->iommu_dev = iommu_device_create(NULL, iommu,
-						       intel_iommu_groups,
-						       "%s", iommu->name);
+	for_each_active_iommu(iommu, drhd) {
+		iommu_device_sysfs_add(&iommu->iommu, NULL,
+				       intel_iommu_groups,
+				       "%s", iommu->name);
+		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
+		iommu_device_register(&iommu->iommu);
+	}
 
 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
 	bus_register_notifier(&pci_bus_type, &device_nb);
@@ -5159,7 +5162,7 @@ static int intel_iommu_add_device(struct device *dev)
 	if (!iommu)
 		return -ENODEV;
 
-	iommu_device_link(iommu->iommu_dev, dev);
+	iommu_device_link(&iommu->iommu.dev, dev);
 
 	group = iommu_group_get_for_dev(dev);
 
@@ -5181,7 +5184,7 @@ static void intel_iommu_remove_device(struct device *dev)
 
 	iommu_group_remove_device(dev);
 
-	iommu_device_unlink(iommu->iommu_dev, dev);
+	iommu_device_unlink(&iommu->iommu.dev, dev);
 }
 
 #ifdef CONFIG_INTEL_IOMMU_SVM
diff --git a/drivers/iommu/iommu-sysfs.c b/drivers/iommu/iommu-sysfs.c
index 39b2d9127dbf..bb87d35e471d 100644
--- a/drivers/iommu/iommu-sysfs.c
+++ b/drivers/iommu/iommu-sysfs.c
@@ -50,54 +50,45 @@ static int __init iommu_dev_init(void)
 postcore_initcall(iommu_dev_init);
 
 /*
- * Create an IOMMU device and return a pointer to it.  IOMMU specific
- * attributes can be provided as an attribute group, allowing a unique
- * namespace per IOMMU type.
+ * Init the struct device for the IOMMU. IOMMU specific attributes can
+ * be provided as an attribute group, allowing a unique namespace per
+ * IOMMU type.
  */
-struct device *iommu_device_create(struct device *parent, void *drvdata,
-				   const struct attribute_group **groups,
-				   const char *fmt, ...)
+int iommu_device_sysfs_add(struct iommu_device *iommu,
+			   struct device *parent,
+			   const struct attribute_group **groups,
+			   const char *fmt, ...)
 {
-	struct device *dev;
 	va_list vargs;
 	int ret;
 
-	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
-	if (!dev)
-		return ERR_PTR(-ENOMEM);
+	device_initialize(&iommu->dev);
 
-	device_initialize(dev);
-
-	dev->class = &iommu_class;
-	dev->parent = parent;
-	dev->groups = groups;
-	dev_set_drvdata(dev, drvdata);
+	iommu->dev.class = &iommu_class;
+	iommu->dev.parent = parent;
+	iommu->dev.groups = groups;
 
 	va_start(vargs, fmt);
-	ret = kobject_set_name_vargs(&dev->kobj, fmt, vargs);
+	ret = kobject_set_name_vargs(&iommu->dev.kobj, fmt, vargs);
 	va_end(vargs);
 	if (ret)
 		goto error;
 
-	ret = device_add(dev);
+	ret = device_add(&iommu->dev);
 	if (ret)
 		goto error;
 
-	return dev;
+	return 0;
 
 error:
-	put_device(dev);
-	return ERR_PTR(ret);
+	put_device(&iommu->dev);
+	return ret;
 }
 
-void iommu_device_destroy(struct device *dev)
+void iommu_device_sysfs_remove(struct iommu_device *iommu)
 {
-	if (!dev || IS_ERR(dev))
-		return;
-
-	device_unregister(dev);
+	device_unregister(&iommu->dev);
 }
-
 /*
  * IOMMU drivers can indicate a device is managed by a given IOMMU using
  * this interface.  A link to the device will be created in the "devices"
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 99a65a397861..3ba9b536387b 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -440,7 +440,6 @@ struct intel_iommu {
 	struct irq_domain *ir_domain;
 	struct irq_domain *ir_msi_domain;
 #endif
-	struct device	*iommu_dev; /* IOMMU-sysfs device */
 	struct iommu_device iommu;  /* IOMMU core code handle */
 	int		node;
 	u32		flags;      /* Software defined flags */
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 900ddd212364..c578ca135bed 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -209,14 +209,21 @@ struct iommu_ops {
  *			 instance
  * @list: Used by the iommu-core to keep a list of registered iommus
  * @ops: iommu-ops for talking to this iommu
+ * @dev: struct device for sysfs handling
  */
 struct iommu_device {
 	struct list_head list;
 	const struct iommu_ops *ops;
+	struct device dev;
 };
 
 int  iommu_device_register(struct iommu_device *iommu);
 void iommu_device_unregister(struct iommu_device *iommu);
+int  iommu_device_sysfs_add(struct iommu_device *iommu,
+			    struct device *parent,
+			    const struct attribute_group **groups,
+			    const char *fmt, ...) __printf(4, 5);
+void iommu_device_sysfs_remove(struct iommu_device *iommu);
 
 static inline void iommu_device_set_ops(struct iommu_device *iommu,
 					const struct iommu_ops *ops)
@@ -287,10 +294,6 @@ extern int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr,
 				 void *data);
 extern int iommu_domain_set_attr(struct iommu_domain *domain, enum iommu_attr,
 				 void *data);
-struct device *iommu_device_create(struct device *parent, void *drvdata,
-				   const struct attribute_group **groups,
-				   const char *fmt, ...) __printf(4, 5);
-void iommu_device_destroy(struct device *dev);
 int iommu_device_link(struct device *dev, struct device *link);
 void iommu_device_unlink(struct device *dev, struct device *link);
 
@@ -567,29 +570,29 @@ static inline int iommu_domain_set_attr(struct iommu_domain *domain,
 	return -EINVAL;
 }
 
-static inline struct device *iommu_device_create(struct device *parent,
-					void *drvdata,
-					const struct attribute_group **groups,
-					const char *fmt, ...)
+static inline int  iommu_device_register(struct iommu_device *iommu)
 {
-	return ERR_PTR(-ENODEV);
+	return -ENODEV;
 }
 
-static inline void iommu_device_destroy(struct device *dev)
+static inline void iommu_device_set_ops(struct iommu_device *iommu,
+					const struct iommu_ops *ops)
 {
 }
 
-static inline int  iommu_device_register(struct iommu_device *iommu)
+static inline void iommu_device_unregister(struct iommu_device *iommu)
 {
-	return -ENODEV;
 }
 
-static inline void iommu_device_set_ops(struct iommu_device *iommu,
-					const struct iommu_ops *ops)
+static inline int  iommu_device_sysfs_add(struct iommu_device *iommu,
+					  struct device *parent,
+					  const struct attribute_group **groups,
+					  const char *fmt, ...)
 {
+	return -ENODEV;
 }
 
-static inline void iommu_device_unregister(struct iommu_device *iommu)
+static inline void iommu_device_sysfs_remove(struct iommu_device *iommu)
 {
 }
 
-- 
cgit v1.2.3


From e3d10af1128b6bc394f21656ff13753130f3c107 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Wed, 1 Feb 2017 17:23:22 +0100
Subject: iommu: Make iommu_device_link/unlink take a struct iommu_device

This makes the interface more consistent with
iommu_device_sysfs_add/remove.

Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/amd_iommu.c   |  4 ++--
 drivers/iommu/intel-iommu.c |  4 ++--
 drivers/iommu/iommu-sysfs.c | 16 ++++++++--------
 include/linux/iommu.h       |  4 ++--
 4 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 4fee2fdbef3e..ba708c5bdaed 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -472,7 +472,7 @@ static int iommu_init_device(struct device *dev)
 
 	dev->archdata.iommu = dev_data;
 
-	iommu_device_link(&iommu->iommu.dev, dev);
+	iommu_device_link(&iommu->iommu, dev);
 
 	return 0;
 }
@@ -514,7 +514,7 @@ static void iommu_uninit_device(struct device *dev)
 	if (dev_data->domain)
 		detach_device(dev);
 
-	iommu_device_unlink(&iommu->iommu.dev, dev);
+	iommu_device_unlink(&iommu->iommu, dev);
 
 	iommu_group_remove_device(dev);
 
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 316730c63af6..cbe7c49b7565 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5162,7 +5162,7 @@ static int intel_iommu_add_device(struct device *dev)
 	if (!iommu)
 		return -ENODEV;
 
-	iommu_device_link(&iommu->iommu.dev, dev);
+	iommu_device_link(&iommu->iommu, dev);
 
 	group = iommu_group_get_for_dev(dev);
 
@@ -5184,7 +5184,7 @@ static void intel_iommu_remove_device(struct device *dev)
 
 	iommu_group_remove_device(dev);
 
-	iommu_device_unlink(&iommu->iommu.dev, dev);
+	iommu_device_unlink(&iommu->iommu, dev);
 }
 
 #ifdef CONFIG_INTEL_IOMMU_SVM
diff --git a/drivers/iommu/iommu-sysfs.c b/drivers/iommu/iommu-sysfs.c
index bb87d35e471d..c58351ed61c1 100644
--- a/drivers/iommu/iommu-sysfs.c
+++ b/drivers/iommu/iommu-sysfs.c
@@ -95,31 +95,31 @@ void iommu_device_sysfs_remove(struct iommu_device *iommu)
  * directory of the IOMMU device in sysfs and an "iommu" link will be
  * created under the linked device, pointing back at the IOMMU device.
  */
-int iommu_device_link(struct device *dev, struct device *link)
+int iommu_device_link(struct iommu_device *iommu, struct device *link)
 {
 	int ret;
 
-	if (!dev || IS_ERR(dev))
+	if (!iommu || IS_ERR(iommu))
 		return -ENODEV;
 
-	ret = sysfs_add_link_to_group(&dev->kobj, "devices",
+	ret = sysfs_add_link_to_group(&iommu->dev.kobj, "devices",
 				      &link->kobj, dev_name(link));
 	if (ret)
 		return ret;
 
-	ret = sysfs_create_link_nowarn(&link->kobj, &dev->kobj, "iommu");
+	ret = sysfs_create_link_nowarn(&link->kobj, &iommu->dev.kobj, "iommu");
 	if (ret)
-		sysfs_remove_link_from_group(&dev->kobj, "devices",
+		sysfs_remove_link_from_group(&iommu->dev.kobj, "devices",
 					     dev_name(link));
 
 	return ret;
 }
 
-void iommu_device_unlink(struct device *dev, struct device *link)
+void iommu_device_unlink(struct iommu_device *iommu, struct device *link)
 {
-	if (!dev || IS_ERR(dev))
+	if (!iommu || IS_ERR(iommu))
 		return;
 
 	sysfs_remove_link(&link->kobj, "iommu");
-	sysfs_remove_link_from_group(&dev->kobj, "devices", dev_name(link));
+	sysfs_remove_link_from_group(&iommu->dev.kobj, "devices", dev_name(link));
 }
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index c578ca135bed..bae3cfc8b4a3 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -224,6 +224,8 @@ int  iommu_device_sysfs_add(struct iommu_device *iommu,
 			    const struct attribute_group **groups,
 			    const char *fmt, ...) __printf(4, 5);
 void iommu_device_sysfs_remove(struct iommu_device *iommu);
+int  iommu_device_link(struct iommu_device   *iommu, struct device *link);
+void iommu_device_unlink(struct iommu_device *iommu, struct device *link);
 
 static inline void iommu_device_set_ops(struct iommu_device *iommu,
 					const struct iommu_ops *ops)
@@ -294,8 +296,6 @@ extern int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr,
 				 void *data);
 extern int iommu_domain_set_attr(struct iommu_domain *domain, enum iommu_attr,
 				 void *data);
-int iommu_device_link(struct device *dev, struct device *link);
-void iommu_device_unlink(struct device *dev, struct device *link);
 
 /* Window handling function prototypes */
 extern int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr,
-- 
cgit v1.2.3


From c73e1ac8b2bc6ab18d9f9a96b17ee7388b49a0c0 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Tue, 7 Feb 2017 18:18:46 +0100
Subject: iommu: Add iommu_device_set_fwnode() interface

Allow to store a fwnode in 'struct iommu_device';

Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index bae3cfc8b4a3..626c935edee1 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -214,6 +214,7 @@ struct iommu_ops {
 struct iommu_device {
 	struct list_head list;
 	const struct iommu_ops *ops;
+	struct fwnode_handle *fwnode;
 	struct device dev;
 };
 
@@ -233,6 +234,12 @@ static inline void iommu_device_set_ops(struct iommu_device *iommu,
 	iommu->ops = ops;
 }
 
+static inline void iommu_device_set_fwnode(struct iommu_device *iommu,
+					   struct fwnode_handle *fwnode)
+{
+	iommu->fwnode = fwnode;
+}
+
 #define IOMMU_GROUP_NOTIFY_ADD_DEVICE		1 /* Device added */
 #define IOMMU_GROUP_NOTIFY_DEL_DEVICE		2 /* Pre Device removed */
 #define IOMMU_GROUP_NOTIFY_BIND_DRIVER		3 /* Pre Driver bind */
@@ -580,6 +587,11 @@ static inline void iommu_device_set_ops(struct iommu_device *iommu,
 {
 }
 
+static inline void iommu_device_set_fwnode(struct iommu_device *iommu,
+					   struct fwnode_handle *fwnode)
+{
+}
+
 static inline void iommu_device_unregister(struct iommu_device *iommu)
 {
 }
-- 
cgit v1.2.3


From e99ca98f1d7190c16601b00d0c96212d7c00577d Mon Sep 17 00:00:00 2001
From: Ricardo Ribalda <ricardo.ribalda@gmail.com>
Date: Fri, 2 Dec 2016 12:31:44 +0100
Subject: mtd: spi-nor: Add support for S3AN spi-nor devices

Xilinx Spartan-3AN FPGAs contain an In-System Flash where they keep
their configuration data and (optionally) some user data.

The protocol of this flash follows most of the spi-nor standard. With
the following differences:

- Page size might not be a power of two.
- The address calculation (default addressing mode).
- The spi nor commands used.

Protocol is described on Xilinx User Guide UG333

Signed-off-by: Ricardo Ribalda Delgado <ricardo.ribalda@gmail.com>
Cc: Boris Brezillon <boris.brezillon@free-electrons.com>
Cc: Brian Norris <computersforpeace@gmail.com>
Cc: Marek Vasut <marek.vasut@gmail.com>
Reviewed-by: Marek Vasut <marek.vasut@gmail.com>
Signed-off-by: Cyrille Pitchen <cyrille.pitchen@atmel.com>
---
 drivers/mtd/spi-nor/spi-nor.c | 154 ++++++++++++++++++++++++++++++++++++++++--
 include/linux/mtd/spi-nor.h   |  12 ++++
 2 files changed, 161 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c
index da7cd69d4857..be1d41d21d7d 100644
--- a/drivers/mtd/spi-nor/spi-nor.c
+++ b/drivers/mtd/spi-nor/spi-nor.c
@@ -75,6 +75,12 @@ struct flash_info {
 					 * bit. Must be used with
 					 * SPI_NOR_HAS_LOCK.
 					 */
+#define	SPI_S3AN		BIT(10)	/*
+					 * Xilinx Spartan 3AN In-System Flash
+					 * (MFR cannot be used for probing
+					 * because it has the same value as
+					 * ATMEL flashes)
+					 */
 };
 
 #define JEDEC_MFR(info)	((info)->id[0])
@@ -217,6 +223,21 @@ static inline int set_4byte(struct spi_nor *nor, const struct flash_info *info,
 		return nor->write_reg(nor, SPINOR_OP_BRWR, nor->cmd_buf, 1);
 	}
 }
+
+static int s3an_sr_ready(struct spi_nor *nor)
+{
+	int ret;
+	u8 val;
+
+	ret = nor->read_reg(nor, SPINOR_OP_XRDSR, &val, 1);
+	if (ret < 0) {
+		dev_err(nor->dev, "error %d reading XRDSR\n", (int) ret);
+		return ret;
+	}
+
+	return !!(val & XSR_RDY);
+}
+
 static inline int spi_nor_sr_ready(struct spi_nor *nor)
 {
 	int sr = read_sr(nor);
@@ -238,7 +259,11 @@ static inline int spi_nor_fsr_ready(struct spi_nor *nor)
 static int spi_nor_ready(struct spi_nor *nor)
 {
 	int sr, fsr;
-	sr = spi_nor_sr_ready(nor);
+
+	if (nor->flags & SNOR_F_READY_XSR_RDY)
+		sr = s3an_sr_ready(nor);
+	else
+		sr = spi_nor_sr_ready(nor);
 	if (sr < 0)
 		return sr;
 	fsr = nor->flags & SNOR_F_USE_FSR ? spi_nor_fsr_ready(nor) : 1;
@@ -319,6 +344,24 @@ static void spi_nor_unlock_and_unprep(struct spi_nor *nor, enum spi_nor_ops ops)
 	mutex_unlock(&nor->lock);
 }
 
+/*
+ * This code converts an address to the Default Address Mode, that has non
+ * power of two page sizes. We must support this mode because it is the default
+ * mode supported by Xilinx tools, it can access the whole flash area and
+ * changing over to the Power-of-two mode is irreversible and corrupts the
+ * original data.
+ * Addr can safely be unsigned int, the biggest S3AN device is smaller than
+ * 4 MiB.
+ */
+static loff_t spi_nor_s3an_addr_convert(struct spi_nor *nor, unsigned int addr)
+{
+	unsigned int offset = addr;
+
+	offset %= nor->page_size;
+
+	return ((addr - offset) << 1) | offset;
+}
+
 /*
  * Initiate the erasure of a single sector
  */
@@ -327,6 +370,9 @@ static int spi_nor_erase_sector(struct spi_nor *nor, u32 addr)
 	u8 buf[SPI_NOR_MAX_ADDR_WIDTH];
 	int i;
 
+	if (nor->flags & SNOR_F_S3AN_ADDR_DEFAULT)
+		addr = spi_nor_s3an_addr_convert(nor, addr);
+
 	if (nor->erase)
 		return nor->erase(nor, addr);
 
@@ -368,7 +414,7 @@ static int spi_nor_erase(struct mtd_info *mtd, struct erase_info *instr)
 		return ret;
 
 	/* whole-chip erase? */
-	if (len == mtd->size) {
+	if (len == mtd->size && !(nor->flags & SNOR_F_NO_OP_CHIP_ERASE)) {
 		unsigned long timeout;
 
 		write_enable(nor);
@@ -782,6 +828,19 @@ static int spi_nor_is_locked(struct mtd_info *mtd, loff_t ofs, uint64_t len)
 		.addr_width = (_addr_width),				\
 		.flags = (_flags),
 
+#define S3AN_INFO(_jedec_id, _n_sectors, _page_size)			\
+		.id = {							\
+			((_jedec_id) >> 16) & 0xff,			\
+			((_jedec_id) >> 8) & 0xff,			\
+			(_jedec_id) & 0xff				\
+			},						\
+		.id_len = 3,						\
+		.sector_size = (8*_page_size),				\
+		.n_sectors = (_n_sectors),				\
+		.page_size = _page_size,				\
+		.addr_width = 3,					\
+		.flags = SPI_NOR_NO_FR | SPI_S3AN,
+
 /* NOTE: double check command sets and memory organization when you add
  * more nor chips.  This current list focusses on newer chips, which
  * have been converging on command sets which including JEDEC ID.
@@ -1014,6 +1073,13 @@ static const struct flash_info spi_nor_ids[] = {
 	{ "cat25c09", CAT25_INFO( 128, 8, 32, 2, SPI_NOR_NO_ERASE | SPI_NOR_NO_FR) },
 	{ "cat25c17", CAT25_INFO( 256, 8, 32, 2, SPI_NOR_NO_ERASE | SPI_NOR_NO_FR) },
 	{ "cat25128", CAT25_INFO(2048, 8, 64, 2, SPI_NOR_NO_ERASE | SPI_NOR_NO_FR) },
+
+	/* Xilinx S3AN Internal Flash */
+	{ "3S50AN", S3AN_INFO(0x1f2200, 64, 264) },
+	{ "3S200AN", S3AN_INFO(0x1f2400, 256, 264) },
+	{ "3S400AN", S3AN_INFO(0x1f2400, 256, 264) },
+	{ "3S700AN", S3AN_INFO(0x1f2500, 512, 264) },
+	{ "3S1400AN", S3AN_INFO(0x1f2600, 512, 528) },
 	{ },
 };
 
@@ -1054,7 +1120,12 @@ static int spi_nor_read(struct mtd_info *mtd, loff_t from, size_t len,
 		return ret;
 
 	while (len) {
-		ret = nor->read(nor, from, len, buf);
+		loff_t addr = from;
+
+		if (nor->flags & SNOR_F_S3AN_ADDR_DEFAULT)
+			addr = spi_nor_s3an_addr_convert(nor, addr);
+
+		ret = nor->read(nor, addr, len, buf);
 		if (ret == 0) {
 			/* We shouldn't see 0-length reads */
 			ret = -EIO;
@@ -1175,8 +1246,23 @@ static int spi_nor_write(struct mtd_info *mtd, loff_t to, size_t len,
 
 	for (i = 0; i < len; ) {
 		ssize_t written;
+		loff_t addr = to + i;
 
-		page_offset = (to + i) & (nor->page_size - 1);
+		/*
+		 * If page_size is a power of two, the offset can be quickly
+		 * calculated with an AND operation. On the other cases we
+		 * need to do a modulus operation (more expensive).
+		 * Power of two numbers have only one bit set and we can use
+		 * the instruction hweight32 to detect if we need to do a
+		 * modulus (do_div()) or not.
+		 */
+		if (hweight32(nor->page_size) == 1) {
+			page_offset = addr & (nor->page_size - 1);
+		} else {
+			uint64_t aux = addr;
+
+			page_offset = do_div(aux, nor->page_size);
+		}
 		WARN_ONCE(page_offset,
 			  "Writing at offset %zu into a NOR page. Writing partial pages may decrease reliability and increase wear of NOR flash.",
 			  page_offset);
@@ -1184,8 +1270,11 @@ static int spi_nor_write(struct mtd_info *mtd, loff_t to, size_t len,
 		page_remain = min_t(size_t,
 				    nor->page_size - page_offset, len - i);
 
+		if (nor->flags & SNOR_F_S3AN_ADDR_DEFAULT)
+			addr = spi_nor_s3an_addr_convert(nor, addr);
+
 		write_enable(nor);
-		ret = nor->write(nor, to + i, page_remain, buf + i);
+		ret = nor->write(nor, addr, page_remain, buf + i);
 		if (ret < 0)
 			goto write_err;
 		written = ret;
@@ -1312,6 +1401,47 @@ static int spi_nor_check(struct spi_nor *nor)
 	return 0;
 }
 
+static int s3an_nor_scan(const struct flash_info *info, struct spi_nor *nor)
+{
+	int ret;
+	u8 val;
+
+	ret = nor->read_reg(nor, SPINOR_OP_XRDSR, &val, 1);
+	if (ret < 0) {
+		dev_err(nor->dev, "error %d reading XRDSR\n", (int) ret);
+		return ret;
+	}
+
+	nor->erase_opcode = SPINOR_OP_XSE;
+	nor->program_opcode = SPINOR_OP_XPP;
+	nor->read_opcode = SPINOR_OP_READ;
+	nor->flags |= SNOR_F_NO_OP_CHIP_ERASE;
+
+	/*
+	 * This flashes have a page size of 264 or 528 bytes (known as
+	 * Default addressing mode). It can be changed to a more standard
+	 * Power of two mode where the page size is 256/512. This comes
+	 * with a price: there is 3% less of space, the data is corrupted
+	 * and the page size cannot be changed back to default addressing
+	 * mode.
+	 *
+	 * The current addressing mode can be read from the XRDSR register
+	 * and should not be changed, because is a destructive operation.
+	 */
+	if (val & XSR_PAGESIZE) {
+		/* Flash in Power of 2 mode */
+		nor->page_size = (nor->page_size == 264) ? 256 : 512;
+		nor->mtd.writebufsize = nor->page_size;
+		nor->mtd.size = 8 * nor->page_size * info->n_sectors;
+		nor->mtd.erasesize = 8 * nor->page_size;
+	} else {
+		/* Flash in Default addressing mode */
+		nor->flags |= SNOR_F_S3AN_ADDR_DEFAULT;
+	}
+
+	return 0;
+}
+
 int spi_nor_scan(struct spi_nor *nor, const char *name, enum read_mode mode)
 {
 	const struct flash_info *info = NULL;
@@ -1359,6 +1489,14 @@ int spi_nor_scan(struct spi_nor *nor, const char *name, enum read_mode mode)
 
 	mutex_init(&nor->lock);
 
+	/*
+	 * Make sure the XSR_RDY flag is set before calling
+	 * spi_nor_wait_till_ready(). Xilinx S3AN share MFR
+	 * with Atmel spi-nor
+	 */
+	if (info->flags & SPI_S3AN)
+		nor->flags |=  SNOR_F_READY_XSR_RDY;
+
 	/*
 	 * Atmel, SST, Intel/Numonyx, and others serial NOR tend to power up
 	 * with the software protection bits set
@@ -1517,6 +1655,12 @@ int spi_nor_scan(struct spi_nor *nor, const char *name, enum read_mode mode)
 
 	nor->read_dummy = spi_nor_read_dummy_cycles(nor);
 
+	if (info->flags & SPI_S3AN) {
+		ret = s3an_nor_scan(info, nor);
+		if (ret)
+			return ret;
+	}
+
 	dev_info(dev, "%s (%lld Kbytes)\n", info->name,
 			(long long)mtd->size >> 10);
 
diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h
index c425c7b4c2a0..4950b2ef08c0 100644
--- a/include/linux/mtd/spi-nor.h
+++ b/include/linux/mtd/spi-nor.h
@@ -68,6 +68,15 @@
 #define SPINOR_OP_WRDI		0x04	/* Write disable */
 #define SPINOR_OP_AAI_WP	0xad	/* Auto address increment word program */
 
+/* Used for S3AN flashes only */
+#define SPINOR_OP_XSE		0x50	/* Sector erase */
+#define SPINOR_OP_XPP		0x82	/* Page program */
+#define SPINOR_OP_XRDSR		0xd7	/* Read status register */
+
+#define XSR_PAGESIZE		BIT(0)	/* Page size in Po2 or Linear */
+#define XSR_RDY			BIT(7)	/* Ready */
+
+
 /* Used for Macronix and Winbond flashes. */
 #define SPINOR_OP_EN4B		0xb7	/* Enter 4-byte mode */
 #define SPINOR_OP_EX4B		0xe9	/* Exit 4-byte mode */
@@ -119,6 +128,9 @@ enum spi_nor_ops {
 enum spi_nor_option_flags {
 	SNOR_F_USE_FSR		= BIT(0),
 	SNOR_F_HAS_SR_TB	= BIT(1),
+	SNOR_F_NO_OP_CHIP_ERASE	= BIT(2),
+	SNOR_F_S3AN_ADDR_DEFAULT = BIT(3),
+	SNOR_F_READY_XSR_RDY	= BIT(4),
 };
 
 /**
-- 
cgit v1.2.3


From 902cc69a0820252c84c6f7caed350882cea166ba Mon Sep 17 00:00:00 2001
From: Cyrille Pitchen <cyrille.pitchen@atmel.com>
Date: Thu, 27 Oct 2016 11:55:39 +0200
Subject: mtd: spi-nor: rename SPINOR_OP_* macros of the 4-byte address op
 codes

This patch renames the SPINOR_OP_* macros of the 4-byte address
instruction set so the new names all share a common pattern: the 4-byte
address name is built from the 3-byte address name appending the "_4B"
suffix.

The patch also introduces new op codes to support other SPI protocols such
as SPI 1-4-4 and SPI 1-2-2.

This is a transitional patch and will help a later patch of spi-nor.c
to automate the translation from the 3-byte address op codes into their
4-byte address version.

Signed-off-by: Cyrille Pitchen <cyrille.pitchen@atmel.com>
Acked-by: Mark Brown <broonie@kernel.org>
Acked-by: Marek Vasut <marek.vasut@gmail.com>
---
 drivers/mtd/devices/serial_flash_cmds.h |  7 -------
 drivers/mtd/devices/st_spi_fsm.c        | 28 ++++++++++++++--------------
 drivers/mtd/spi-nor/spi-nor.c           |  8 ++++----
 drivers/spi/spi-bcm-qspi.c              |  6 +++---
 include/linux/mtd/spi-nor.h             | 22 ++++++++++++++++------
 5 files changed, 37 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/devices/serial_flash_cmds.h b/drivers/mtd/devices/serial_flash_cmds.h
index f59a125295d0..8b81e15105dd 100644
--- a/drivers/mtd/devices/serial_flash_cmds.h
+++ b/drivers/mtd/devices/serial_flash_cmds.h
@@ -18,19 +18,12 @@
 #define SPINOR_OP_RDVCR		0x85
 
 /* JEDEC Standard - Serial Flash Discoverable Parmeters (SFDP) Commands */
-#define SPINOR_OP_READ_1_2_2	0xbb	/* DUAL I/O READ */
-#define SPINOR_OP_READ_1_4_4	0xeb	/* QUAD I/O READ */
-
 #define SPINOR_OP_WRITE		0x02	/* PAGE PROGRAM */
 #define SPINOR_OP_WRITE_1_1_2	0xa2	/* DUAL INPUT PROGRAM */
 #define SPINOR_OP_WRITE_1_2_2	0xd2	/* DUAL INPUT EXT PROGRAM */
 #define SPINOR_OP_WRITE_1_1_4	0x32	/* QUAD INPUT PROGRAM */
 #define SPINOR_OP_WRITE_1_4_4	0x12	/* QUAD INPUT EXT PROGRAM */
 
-/* READ commands with 32-bit addressing */
-#define SPINOR_OP_READ4_1_2_2	0xbc
-#define SPINOR_OP_READ4_1_4_4	0xec
-
 /* Configuration flags */
 #define FLASH_FLAG_SINGLE	0x000000ff
 #define FLASH_FLAG_READ_WRITE	0x00000001
diff --git a/drivers/mtd/devices/st_spi_fsm.c b/drivers/mtd/devices/st_spi_fsm.c
index 5454b4113589..804313a33f2b 100644
--- a/drivers/mtd/devices/st_spi_fsm.c
+++ b/drivers/mtd/devices/st_spi_fsm.c
@@ -507,13 +507,13 @@ static struct seq_rw_config n25q_read3_configs[] = {
  *	- 'FAST' variants configured for 8 dummy cycles (see note above.)
  */
 static struct seq_rw_config n25q_read4_configs[] = {
-	{FLASH_FLAG_READ_1_4_4, SPINOR_OP_READ4_1_4_4,	0, 4, 4, 0x00, 0, 8},
-	{FLASH_FLAG_READ_1_1_4, SPINOR_OP_READ4_1_1_4,	0, 1, 4, 0x00, 0, 8},
-	{FLASH_FLAG_READ_1_2_2, SPINOR_OP_READ4_1_2_2,	0, 2, 2, 0x00, 0, 8},
-	{FLASH_FLAG_READ_1_1_2, SPINOR_OP_READ4_1_1_2,	0, 1, 2, 0x00, 0, 8},
-	{FLASH_FLAG_READ_FAST,	SPINOR_OP_READ4_FAST,	0, 1, 1, 0x00, 0, 8},
-	{FLASH_FLAG_READ_WRITE, SPINOR_OP_READ4,	0, 1, 1, 0x00, 0, 0},
-	{0x00,			0,			0, 0, 0, 0x00, 0, 0},
+	{FLASH_FLAG_READ_1_4_4, SPINOR_OP_READ_1_4_4_4B, 0, 4, 4, 0x00, 0, 8},
+	{FLASH_FLAG_READ_1_1_4, SPINOR_OP_READ_1_1_4_4B, 0, 1, 4, 0x00, 0, 8},
+	{FLASH_FLAG_READ_1_2_2, SPINOR_OP_READ_1_2_2_4B, 0, 2, 2, 0x00, 0, 8},
+	{FLASH_FLAG_READ_1_1_2, SPINOR_OP_READ_1_1_2_4B, 0, 1, 2, 0x00, 0, 8},
+	{FLASH_FLAG_READ_FAST,	SPINOR_OP_READ_FAST_4B,  0, 1, 1, 0x00, 0, 8},
+	{FLASH_FLAG_READ_WRITE, SPINOR_OP_READ_4B,       0, 1, 1, 0x00, 0, 0},
+	{0x00,			0,                       0, 0, 0, 0x00, 0, 0},
 };
 
 /*
@@ -553,13 +553,13 @@ static int stfsm_mx25_en_32bit_addr_seq(struct stfsm_seq *seq)
  * entering a state that is incompatible with the SPIBoot Controller.
  */
 static struct seq_rw_config stfsm_s25fl_read4_configs[] = {
-	{FLASH_FLAG_READ_1_4_4,  SPINOR_OP_READ4_1_4_4,  0, 4, 4, 0x00, 2, 4},
-	{FLASH_FLAG_READ_1_1_4,  SPINOR_OP_READ4_1_1_4,  0, 1, 4, 0x00, 0, 8},
-	{FLASH_FLAG_READ_1_2_2,  SPINOR_OP_READ4_1_2_2,  0, 2, 2, 0x00, 4, 0},
-	{FLASH_FLAG_READ_1_1_2,  SPINOR_OP_READ4_1_1_2,  0, 1, 2, 0x00, 0, 8},
-	{FLASH_FLAG_READ_FAST,   SPINOR_OP_READ4_FAST,   0, 1, 1, 0x00, 0, 8},
-	{FLASH_FLAG_READ_WRITE,  SPINOR_OP_READ4,        0, 1, 1, 0x00, 0, 0},
-	{0x00,                   0,                      0, 0, 0, 0x00, 0, 0},
+	{FLASH_FLAG_READ_1_4_4,  SPINOR_OP_READ_1_4_4_4B,  0, 4, 4, 0x00, 2, 4},
+	{FLASH_FLAG_READ_1_1_4,  SPINOR_OP_READ_1_1_4_4B,  0, 1, 4, 0x00, 0, 8},
+	{FLASH_FLAG_READ_1_2_2,  SPINOR_OP_READ_1_2_2_4B,  0, 2, 2, 0x00, 4, 0},
+	{FLASH_FLAG_READ_1_1_2,  SPINOR_OP_READ_1_1_2_4B,  0, 1, 2, 0x00, 0, 8},
+	{FLASH_FLAG_READ_FAST,   SPINOR_OP_READ_FAST_4B,   0, 1, 1, 0x00, 0, 8},
+	{FLASH_FLAG_READ_WRITE,  SPINOR_OP_READ_4B,        0, 1, 1, 0x00, 0, 0},
+	{0x00,                   0,                        0, 0, 0, 0x00, 0, 0},
 };
 
 static struct seq_rw_config stfsm_s25fl_write4_configs[] = {
diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c
index 8610765e7aaa..ede42c2e47b5 100644
--- a/drivers/mtd/spi-nor/spi-nor.c
+++ b/drivers/mtd/spi-nor/spi-nor.c
@@ -1625,16 +1625,16 @@ int spi_nor_scan(struct spi_nor *nor, const char *name, enum read_mode mode)
 			/* Dedicated 4-byte command set */
 			switch (nor->flash_read) {
 			case SPI_NOR_QUAD:
-				nor->read_opcode = SPINOR_OP_READ4_1_1_4;
+				nor->read_opcode = SPINOR_OP_READ_1_1_4_4B;
 				break;
 			case SPI_NOR_DUAL:
-				nor->read_opcode = SPINOR_OP_READ4_1_1_2;
+				nor->read_opcode = SPINOR_OP_READ_1_1_2_4B;
 				break;
 			case SPI_NOR_FAST:
-				nor->read_opcode = SPINOR_OP_READ4_FAST;
+				nor->read_opcode = SPINOR_OP_READ_FAST_4B;
 				break;
 			case SPI_NOR_NORMAL:
-				nor->read_opcode = SPINOR_OP_READ4;
+				nor->read_opcode = SPINOR_OP_READ_4B;
 				break;
 			}
 			nor->program_opcode = SPINOR_OP_PP_4B;
diff --git a/drivers/spi/spi-bcm-qspi.c b/drivers/spi/spi-bcm-qspi.c
index 14f9dea3173f..d7843fd8c610 100644
--- a/drivers/spi/spi-bcm-qspi.c
+++ b/drivers/spi/spi-bcm-qspi.c
@@ -371,7 +371,7 @@ static int bcm_qspi_bspi_set_flex_mode(struct bcm_qspi *qspi, int width,
 			/* default mode, does not need flex_cmd */
 			flex_mode = 0;
 		else
-			command = SPINOR_OP_READ4_FAST;
+			command = SPINOR_OP_READ_FAST_4B;
 		break;
 	case SPI_NBITS_DUAL:
 		bpc = 0x00000001;
@@ -384,7 +384,7 @@ static int bcm_qspi_bspi_set_flex_mode(struct bcm_qspi *qspi, int width,
 		} else {
 			command = SPINOR_OP_READ_1_1_2;
 			if (spans_4byte)
-				command = SPINOR_OP_READ4_1_1_2;
+				command = SPINOR_OP_READ_1_1_2_4B;
 		}
 		break;
 	case SPI_NBITS_QUAD:
@@ -399,7 +399,7 @@ static int bcm_qspi_bspi_set_flex_mode(struct bcm_qspi *qspi, int width,
 		} else {
 			command = SPINOR_OP_READ_1_1_4;
 			if (spans_4byte)
-				command = SPINOR_OP_READ4_1_1_4;
+				command = SPINOR_OP_READ_1_1_4_4B;
 		}
 		break;
 	default:
diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h
index 4950b2ef08c0..f2a718030476 100644
--- a/include/linux/mtd/spi-nor.h
+++ b/include/linux/mtd/spi-nor.h
@@ -43,9 +43,13 @@
 #define SPINOR_OP_WRSR		0x01	/* Write status register 1 byte */
 #define SPINOR_OP_READ		0x03	/* Read data bytes (low frequency) */
 #define SPINOR_OP_READ_FAST	0x0b	/* Read data bytes (high frequency) */
-#define SPINOR_OP_READ_1_1_2	0x3b	/* Read data bytes (Dual SPI) */
-#define SPINOR_OP_READ_1_1_4	0x6b	/* Read data bytes (Quad SPI) */
+#define SPINOR_OP_READ_1_1_2	0x3b	/* Read data bytes (Dual Output SPI) */
+#define SPINOR_OP_READ_1_2_2	0xbb	/* Read data bytes (Dual I/O SPI) */
+#define SPINOR_OP_READ_1_1_4	0x6b	/* Read data bytes (Quad Output SPI) */
+#define SPINOR_OP_READ_1_4_4	0xeb	/* Read data bytes (Quad I/O SPI) */
 #define SPINOR_OP_PP		0x02	/* Page program (up to 256 bytes) */
+#define SPINOR_OP_PP_1_1_4	0x32	/* Quad page program */
+#define SPINOR_OP_PP_1_4_4	0x38	/* Quad page program */
 #define SPINOR_OP_BE_4K		0x20	/* Erase 4KiB block */
 #define SPINOR_OP_BE_4K_PMC	0xd7	/* Erase 4KiB block on PMC chips */
 #define SPINOR_OP_BE_32K	0x52	/* Erase 32KiB block */
@@ -56,11 +60,17 @@
 #define SPINOR_OP_RDFSR		0x70	/* Read flag status register */
 
 /* 4-byte address opcodes - used on Spansion and some Macronix flashes. */
-#define SPINOR_OP_READ4		0x13	/* Read data bytes (low frequency) */
-#define SPINOR_OP_READ4_FAST	0x0c	/* Read data bytes (high frequency) */
-#define SPINOR_OP_READ4_1_1_2	0x3c	/* Read data bytes (Dual SPI) */
-#define SPINOR_OP_READ4_1_1_4	0x6c	/* Read data bytes (Quad SPI) */
+#define SPINOR_OP_READ_4B	0x13	/* Read data bytes (low frequency) */
+#define SPINOR_OP_READ_FAST_4B	0x0c	/* Read data bytes (high frequency) */
+#define SPINOR_OP_READ_1_1_2_4B	0x3c	/* Read data bytes (Dual Output SPI) */
+#define SPINOR_OP_READ_1_2_2_4B	0xbc	/* Read data bytes (Dual I/O SPI) */
+#define SPINOR_OP_READ_1_1_4_4B	0x6c	/* Read data bytes (Quad Output SPI) */
+#define SPINOR_OP_READ_1_4_4_4B	0xec	/* Read data bytes (Quad I/O SPI) */
 #define SPINOR_OP_PP_4B		0x12	/* Page program (up to 256 bytes) */
+#define SPINOR_OP_PP_1_1_4_4B	0x34	/* Quad page program */
+#define SPINOR_OP_PP_1_4_4_4B	0x3e	/* Quad page program */
+#define SPINOR_OP_BE_4K_4B	0x21	/* Erase 4KiB block */
+#define SPINOR_OP_BE_32K_4B	0x5c	/* Erase 32KiB block */
 #define SPINOR_OP_SE_4B		0xdc	/* Sector erase (usually 64KiB) */
 
 /* Used for SST flashes only. */
-- 
cgit v1.2.3


From 2b5e77308f3356faad640b24af6dc5aa7233eb2d Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Date: Fri, 10 Feb 2017 13:23:23 +0100
Subject: irqdesc: Add a resource managed version of irq_alloc_descs()

Add a devres flavor of __devm_irq_alloc_descs() and corresponding
helper macros.

Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: linux-doc@vger.kernel.org
Cc: Jonathan Corbet <corbet@lwn.net>
Link: http://lkml.kernel.org/r/1486729403-21132-1-git-send-email-bgolaszewski@baylibre.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 Documentation/driver-model/devres.txt |  5 ++++
 include/linux/irq.h                   | 19 ++++++++++++
 kernel/irq/devres.c                   | 55 +++++++++++++++++++++++++++++++++++
 3 files changed, 79 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt
index ca9d1eb46bc0..bf34d5b3a733 100644
--- a/Documentation/driver-model/devres.txt
+++ b/Documentation/driver-model/devres.txt
@@ -306,6 +306,11 @@ IRQ
   devm_request_any_context_irq()
   devm_request_irq()
   devm_request_threaded_irq()
+  devm_irq_alloc_descs()
+  devm_irq_alloc_desc()
+  devm_irq_alloc_desc_at()
+  devm_irq_alloc_desc_from()
+  devm_irq_alloc_descs_from()
 
 LED
   devm_led_classdev_register()
diff --git a/include/linux/irq.h b/include/linux/irq.h
index e79875574b39..d915caecafa1 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -715,6 +715,10 @@ unsigned int arch_dynirq_lower_bound(unsigned int from);
 int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
 		      struct module *owner, const struct cpumask *affinity);
 
+int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from,
+			   unsigned int cnt, int node, struct module *owner,
+			   const struct cpumask *affinity);
+
 /* use macros to avoid needing export.h for THIS_MODULE */
 #define irq_alloc_descs(irq, from, cnt, node)	\
 	__irq_alloc_descs(irq, from, cnt, node, THIS_MODULE, NULL)
@@ -731,6 +735,21 @@ int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
 #define irq_alloc_descs_from(from, cnt, node)	\
 	irq_alloc_descs(-1, from, cnt, node)
 
+#define devm_irq_alloc_descs(dev, irq, from, cnt, node)		\
+	__devm_irq_alloc_descs(dev, irq, from, cnt, node, THIS_MODULE, NULL)
+
+#define devm_irq_alloc_desc(dev, node)				\
+	devm_irq_alloc_descs(dev, -1, 0, 1, node)
+
+#define devm_irq_alloc_desc_at(dev, at, node)			\
+	devm_irq_alloc_descs(dev, at, at, 1, node)
+
+#define devm_irq_alloc_desc_from(dev, from, node)		\
+	devm_irq_alloc_descs(dev, -1, from, 1, node)
+
+#define devm_irq_alloc_descs_from(dev, from, cnt, node)		\
+	devm_irq_alloc_descs(dev, -1, from, cnt, node)
+
 void irq_free_descs(unsigned int irq, unsigned int cnt);
 static inline void irq_free_desc(unsigned int irq)
 {
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 74d90a754268..18babef39361 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -2,6 +2,7 @@
 #include <linux/interrupt.h>
 #include <linux/device.h>
 #include <linux/gfp.h>
+#include <linux/irq.h>
 
 /*
  * Device resource management aware IRQ request/free implementation.
@@ -137,3 +138,57 @@ void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
 	free_irq(irq, dev_id);
 }
 EXPORT_SYMBOL(devm_free_irq);
+
+struct irq_desc_devres {
+	unsigned int from;
+	unsigned int cnt;
+};
+
+static void devm_irq_desc_release(struct device *dev, void *res)
+{
+	struct irq_desc_devres *this = res;
+
+	irq_free_descs(this->from, this->cnt);
+}
+
+/**
+ * __devm_irq_alloc_descs - Allocate and initialize a range of irq descriptors
+ *			    for a managed device
+ * @dev:	Device to allocate the descriptors for
+ * @irq:	Allocate for specific irq number if irq >= 0
+ * @from:	Start the search from this irq number
+ * @cnt:	Number of consecutive irqs to allocate
+ * @node:	Preferred node on which the irq descriptor should be allocated
+ * @owner:	Owning module (can be NULL)
+ * @affinity:	Optional pointer to an affinity mask array of size @cnt
+ *		which hints where the irq descriptors should be allocated
+ *		and which default affinities to use
+ *
+ * Returns the first irq number or error code.
+ *
+ * Note: Use the provided wrappers (devm_irq_alloc_desc*) for simplicity.
+ */
+int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from,
+			   unsigned int cnt, int node, struct module *owner,
+			   const struct cpumask *affinity)
+{
+	struct irq_desc_devres *dr;
+	int base;
+
+	dr = devres_alloc(devm_irq_desc_release, sizeof(*dr), GFP_KERNEL);
+	if (!dr)
+		return -ENOMEM;
+
+	base = __irq_alloc_descs(irq, from, cnt, node, owner, affinity);
+	if (base < 0) {
+		devres_free(dr);
+		return base;
+	}
+
+	dr->from = base;
+	dr->cnt = cnt;
+	devres_add(dev, dr);
+
+	return base;
+}
+EXPORT_SYMBOL_GPL(__devm_irq_alloc_descs);
-- 
cgit v1.2.3


From d0f6f5832603931b0a8da044fb9abe8289e201ee Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Thu, 2 Feb 2017 12:19:12 +0100
Subject: iommu: Remove iommu_register_instance interface

And also move its remaining functionality to
iommu_device_register() and 'struct iommu_device'.

Cc: Rob Herring <robh+dt@kernel.org>
Cc: Frank Rowand <frowand.list@gmail.com>
Cc: Matthias Brugger <matthias.bgg@gmail.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: devicetree@vger.kernel.org
Cc: linux-arm-kernel@lists.infradead.org
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/arm-smmu-v3.c  |  2 --
 drivers/iommu/arm-smmu.c     |  1 -
 drivers/iommu/exynos-iommu.c |  2 --
 drivers/iommu/iommu.c        | 37 ++++++-------------------------------
 drivers/iommu/msm_iommu.c    |  2 --
 drivers/iommu/mtk_iommu.c    |  1 -
 include/linux/iommu.h        |  7 -------
 include/linux/of_iommu.h     |  6 ------
 8 files changed, 6 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 32133e289ff6..53751379dee3 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -2702,8 +2702,6 @@ static int arm_smmu_device_probe(struct platform_device *pdev)
 
 	ret = iommu_device_register(&smmu->iommu);
 
-	iommu_register_instance(dev->fwnode, &arm_smmu_ops);
-
 #ifdef CONFIG_PCI
 	if (pci_bus_type.iommu_ops != &arm_smmu_ops) {
 		pci_request_acs();
diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index f4ce1e77344e..8fb4af2dfbfa 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -2121,7 +2121,6 @@ static int arm_smmu_device_probe(struct platform_device *pdev)
 		return err;
 	}
 
-	iommu_register_instance(dev->fwnode, &arm_smmu_ops);
 	platform_set_drvdata(pdev, smmu);
 	arm_smmu_device_reset(smmu);
 
diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c
index 64325d8ddd40..778ecccd7a29 100644
--- a/drivers/iommu/exynos-iommu.c
+++ b/drivers/iommu/exynos-iommu.c
@@ -642,8 +642,6 @@ static int __init exynos_sysmmu_probe(struct platform_device *pdev)
 
 	pm_runtime_enable(dev);
 
-	of_iommu_set_ops(dev->of_node, &exynos_iommu_ops);
-
 	return 0;
 }
 
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 1dfd70ea27e4..162d865e2e29 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1658,43 +1658,18 @@ out:
 	return ret;
 }
 
-struct iommu_instance {
-	struct list_head list;
-	struct fwnode_handle *fwnode;
-	const struct iommu_ops *ops;
-};
-static LIST_HEAD(iommu_instance_list);
-static DEFINE_SPINLOCK(iommu_instance_lock);
-
-void iommu_register_instance(struct fwnode_handle *fwnode,
-			     const struct iommu_ops *ops)
-{
-	struct iommu_instance *iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
-
-	if (WARN_ON(!iommu))
-		return;
-
-	of_node_get(to_of_node(fwnode));
-	INIT_LIST_HEAD(&iommu->list);
-	iommu->fwnode = fwnode;
-	iommu->ops = ops;
-	spin_lock(&iommu_instance_lock);
-	list_add_tail(&iommu->list, &iommu_instance_list);
-	spin_unlock(&iommu_instance_lock);
-}
-
 const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode)
 {
-	struct iommu_instance *instance;
 	const struct iommu_ops *ops = NULL;
+	struct iommu_device *iommu;
 
-	spin_lock(&iommu_instance_lock);
-	list_for_each_entry(instance, &iommu_instance_list, list)
-		if (instance->fwnode == fwnode) {
-			ops = instance->ops;
+	spin_lock(&iommu_device_lock);
+	list_for_each_entry(iommu, &iommu_device_list, list)
+		if (iommu->fwnode == fwnode) {
+			ops = iommu->ops;
 			break;
 		}
-	spin_unlock(&iommu_instance_lock);
+	spin_unlock(&iommu_device_lock);
 	return ops;
 }
 
diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
index 30795cbab5ef..d0448353d501 100644
--- a/drivers/iommu/msm_iommu.c
+++ b/drivers/iommu/msm_iommu.c
@@ -810,8 +810,6 @@ static int msm_iommu_probe(struct platform_device *pdev)
 		goto fail;
 	}
 
-	of_iommu_set_ops(pdev->dev.of_node, &msm_iommu_ops);
-
 	pr_info("device mapped at %p, irq %d with %d ctx banks\n",
 		iommu->base, iommu->irq, iommu->ncb);
 
diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c
index d484fa608db8..5d14cd15198d 100644
--- a/drivers/iommu/mtk_iommu.c
+++ b/drivers/iommu/mtk_iommu.c
@@ -681,7 +681,6 @@ static int mtk_iommu_init_fn(struct device_node *np)
 		return ret;
 	}
 
-	of_iommu_set_ops(np, &mtk_iommu_ops);
 	return 0;
 }
 
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 626c935edee1..9e82fc83765e 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -382,8 +382,6 @@ int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode,
 		      const struct iommu_ops *ops);
 void iommu_fwspec_free(struct device *dev);
 int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids);
-void iommu_register_instance(struct fwnode_handle *fwnode,
-			     const struct iommu_ops *ops);
 const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode);
 
 #else /* CONFIG_IOMMU_API */
@@ -634,11 +632,6 @@ static inline int iommu_fwspec_add_ids(struct device *dev, u32 *ids,
 	return -ENODEV;
 }
 
-static inline void iommu_register_instance(struct fwnode_handle *fwnode,
-					   const struct iommu_ops *ops)
-{
-}
-
 static inline
 const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode)
 {
diff --git a/include/linux/of_iommu.h b/include/linux/of_iommu.h
index 66fcbc949899..fc4add39361a 100644
--- a/include/linux/of_iommu.h
+++ b/include/linux/of_iommu.h
@@ -31,12 +31,6 @@ static inline const struct iommu_ops *of_iommu_configure(struct device *dev,
 
 #endif	/* CONFIG_OF_IOMMU */
 
-static inline void of_iommu_set_ops(struct device_node *np,
-				    const struct iommu_ops *ops)
-{
-	iommu_register_instance(&np->fwnode, ops);
-}
-
 static inline const struct iommu_ops *of_iommu_get_ops(struct device_node *np)
 {
 	return iommu_ops_from_fwnode(&np->fwnode);
-- 
cgit v1.2.3


From 0508ad1fff11a8b0acdf0333b5fe108d7bd5fce4 Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@ozlabs.org>
Date: Wed, 1 Feb 2017 10:53:41 -0600
Subject: drivers/fsi: Add empty fsi bus definitions

This change adds the initial (empty) fsi bus definition, and introduces
drivers/fsi/.

Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Signed-off-by: Chris Bostic <cbostic@us.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/Kconfig        |  2 ++
 drivers/Makefile       |  1 +
 drivers/fsi/Kconfig    | 12 ++++++++++++
 drivers/fsi/Makefile   |  2 ++
 drivers/fsi/fsi-core.c | 38 ++++++++++++++++++++++++++++++++++++++
 include/linux/fsi.h    | 22 ++++++++++++++++++++++
 6 files changed, 77 insertions(+)
 create mode 100644 drivers/fsi/Kconfig
 create mode 100644 drivers/fsi/Makefile
 create mode 100644 drivers/fsi/fsi-core.c
 create mode 100644 include/linux/fsi.h

(limited to 'include/linux')

diff --git a/drivers/Kconfig b/drivers/Kconfig
index e1e2066cecdb..117ca14ccf85 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -202,4 +202,6 @@ source "drivers/hwtracing/intel_th/Kconfig"
 
 source "drivers/fpga/Kconfig"
 
+source "drivers/fsi/Kconfig"
+
 endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index 060026a02f59..67ce51d62015 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -173,3 +173,4 @@ obj-$(CONFIG_STM)		+= hwtracing/stm/
 obj-$(CONFIG_ANDROID)		+= android/
 obj-$(CONFIG_NVMEM)		+= nvmem/
 obj-$(CONFIG_FPGA)		+= fpga/
+obj-$(CONFIG_FSI)		+= fsi/
diff --git a/drivers/fsi/Kconfig b/drivers/fsi/Kconfig
new file mode 100644
index 000000000000..04c1a0efa7a7
--- /dev/null
+++ b/drivers/fsi/Kconfig
@@ -0,0 +1,12 @@
+#
+# FSI subsystem
+#
+
+menu "FSI support"
+
+config FSI
+	tristate "FSI support"
+	---help---
+	  FSI - the FRU Support Interface - is a simple bus for low-level
+	  access to POWER-based hardware.
+endmenu
diff --git a/drivers/fsi/Makefile b/drivers/fsi/Makefile
new file mode 100644
index 000000000000..db0e5e7c1655
--- /dev/null
+++ b/drivers/fsi/Makefile
@@ -0,0 +1,2 @@
+
+obj-$(CONFIG_FSI) += fsi-core.o
diff --git a/drivers/fsi/fsi-core.c b/drivers/fsi/fsi-core.c
new file mode 100644
index 000000000000..3e45306359b3
--- /dev/null
+++ b/drivers/fsi/fsi-core.c
@@ -0,0 +1,38 @@
+/*
+ * FSI core driver
+ *
+ * Copyright (C) IBM Corporation 2016
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/device.h>
+#include <linux/fsi.h>
+#include <linux/module.h>
+
+/* FSI core & Linux bus type definitions */
+
+struct bus_type fsi_bus_type = {
+	.name		= "fsi",
+};
+EXPORT_SYMBOL_GPL(fsi_bus_type);
+
+static int fsi_init(void)
+{
+	return bus_register(&fsi_bus_type);
+}
+
+static void fsi_exit(void)
+{
+	bus_unregister(&fsi_bus_type);
+}
+
+module_init(fsi_init);
+module_exit(fsi_exit);
diff --git a/include/linux/fsi.h b/include/linux/fsi.h
new file mode 100644
index 000000000000..47aa181b6404
--- /dev/null
+++ b/include/linux/fsi.h
@@ -0,0 +1,22 @@
+/* FSI device & driver interfaces
+ *
+ * Copyright (C) IBM Corporation 2016
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef LINUX_FSI_H
+#define LINUX_FSI_H
+
+#include <linux/device.h>
+
+extern struct bus_type fsi_bus_type;
+
+#endif /* LINUX_FSI_H */
-- 
cgit v1.2.3


From fda07a6c94ac5c9bd73d7b0134c6cc6861375341 Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@ozlabs.org>
Date: Wed, 1 Feb 2017 10:53:42 -0600
Subject: drivers/fsi: Add device & driver definitions

Add structs for fsi devices & drivers, and struct device conversion
functions.

Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Signed-off-by: Chris Bostic <cbostic@us.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/fsi.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fsi.h b/include/linux/fsi.h
index 47aa181b6404..f73886a5af01 100644
--- a/include/linux/fsi.h
+++ b/include/linux/fsi.h
@@ -17,6 +17,17 @@
 
 #include <linux/device.h>
 
+struct fsi_device {
+	struct device dev;
+};
+
+struct fsi_driver {
+	struct device_driver drv;
+};
+
+#define to_fsi_dev(devp) container_of(devp, struct fsi_device, dev)
+#define to_fsi_drv(drvp) container_of(drvp, struct fsi_driver, drv)
+
 extern struct bus_type fsi_bus_type;
 
 #endif /* LINUX_FSI_H */
-- 
cgit v1.2.3


From dd37eed7db0f7607fa41887c11c1e428faa15d0c Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@ozlabs.org>
Date: Wed, 1 Feb 2017 10:53:43 -0600
Subject: drivers/fsi: add driver to device matches

Driver bind to devices based on the engine types & (optional) versions.

Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Signed-off-by: Chris Bostic <cbostic@us.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/fsi/fsi-core.c | 21 +++++++++++++++++++++
 include/linux/fsi.h    | 21 +++++++++++++++++++--
 2 files changed, 40 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/fsi/fsi-core.c b/drivers/fsi/fsi-core.c
index 3e45306359b3..3d55bd547178 100644
--- a/drivers/fsi/fsi-core.c
+++ b/drivers/fsi/fsi-core.c
@@ -19,8 +19,29 @@
 
 /* FSI core & Linux bus type definitions */
 
+static int fsi_bus_match(struct device *dev, struct device_driver *drv)
+{
+	struct fsi_device *fsi_dev = to_fsi_dev(dev);
+	struct fsi_driver *fsi_drv = to_fsi_drv(drv);
+	const struct fsi_device_id *id;
+
+	if (!fsi_drv->id_table)
+		return 0;
+
+	for (id = fsi_drv->id_table; id->engine_type; id++) {
+		if (id->engine_type != fsi_dev->engine_type)
+			continue;
+		if (id->version == FSI_VERSION_ANY ||
+				id->version == fsi_dev->version)
+			return 1;
+	}
+
+	return 0;
+}
+
 struct bus_type fsi_bus_type = {
 	.name		= "fsi",
+	.match		= fsi_bus_match,
 };
 EXPORT_SYMBOL_GPL(fsi_bus_type);
 
diff --git a/include/linux/fsi.h b/include/linux/fsi.h
index f73886a5af01..273cbf6400ea 100644
--- a/include/linux/fsi.h
+++ b/include/linux/fsi.h
@@ -18,11 +18,28 @@
 #include <linux/device.h>
 
 struct fsi_device {
-	struct device dev;
+	struct device		dev;
+	u8			engine_type;
+	u8			version;
 };
 
+struct fsi_device_id {
+	u8	engine_type;
+	u8	version;
+};
+
+#define FSI_VERSION_ANY		0
+
+#define FSI_DEVICE(t) \
+	.engine_type = (t), .version = FSI_VERSION_ANY,
+
+#define FSI_DEVICE_VERSIONED(t, v) \
+	.engine_type = (t), .version = (v),
+
+
 struct fsi_driver {
-	struct device_driver drv;
+	struct device_driver		drv;
+	const struct fsi_device_id	*id_table;
 };
 
 #define to_fsi_dev(devp) container_of(devp, struct fsi_device, dev)
-- 
cgit v1.2.3


From baa6d396635129d8a67793e884f3b2182c7354b3 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Date: Wed, 1 Feb 2017 12:48:44 -0700
Subject: fpga: Add scatterlist based programming

Requiring contiguous kernel memory is not a good idea, this is a limited
resource and allocation can fail under normal work loads.

This introduces a .write_sg op that supporting drivers can provide
to DMA directly from dis-contiguous memory and a new entry point
fpga_mgr_buf_load_sg that users can call to directly provide page
lists.

The full matrix of compatibility is provided, either the linear or sg
interface can be used by the user with a driver supporting either
interface.

A notable change for drivers is that the .write op can now be called
multiple times.

Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Acked-by: Alan Tull <atull@opensource.altera.com>
Acked-by: Moritz Fischer <moritz.fischer@ettus.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/fpga/fpga-mgr.txt |  19 +++-
 drivers/fpga/fpga-mgr.c         | 236 +++++++++++++++++++++++++++++++++++-----
 include/linux/fpga/fpga-mgr.h   |   5 +
 3 files changed, 227 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/fpga/fpga-mgr.txt b/Documentation/fpga/fpga-mgr.txt
index 86ee5078fd03..78f197fadfd1 100644
--- a/Documentation/fpga/fpga-mgr.txt
+++ b/Documentation/fpga/fpga-mgr.txt
@@ -22,7 +22,16 @@ To program the FPGA from a file or from a buffer:
 			      struct fpga_image_info *info,
 		              const char *buf, size_t count);
 
-Load the FPGA from an image which exists as a buffer in memory.
+Load the FPGA from an image which exists as a contiguous buffer in
+memory. Allocating contiguous kernel memory for the buffer should be avoided,
+users are encouraged to use the _sg interface instead of this.
+
+        int fpga_mgr_buf_load_sg(struct fpga_manager *mgr,
+				 struct fpga_image_info *info,
+				 struct sg_table *sgt);
+
+Load the FPGA from an image from non-contiguous in memory. Callers can
+construct a sg_table using alloc_page backed memory.
 
 	int fpga_mgr_firmware_load(struct fpga_manager *mgr,
 				   struct fpga_image_info *info,
@@ -166,7 +175,7 @@ success or negative error codes otherwise.
 
 The programming sequence is:
  1. .write_init
- 2. .write (may be called once or multiple times)
+ 2. .write or .write_sg (may be called once or multiple times)
  3. .write_complete
 
 The .write_init function will prepare the FPGA to receive the image data.  The
@@ -176,7 +185,11 @@ buffer up at least this much before starting.
 
 The .write function writes a buffer to the FPGA. The buffer may be contain the
 whole FPGA image or may be a smaller chunk of an FPGA image.  In the latter
-case, this function is called multiple times for successive chunks.
+case, this function is called multiple times for successive chunks. This interface
+is suitable for drivers which use PIO.
+
+The .write_sg version behaves the same as .write except the input is a sg_table
+scatter list. This interface is suitable for drivers which use DMA.
 
 The .write_complete function is called after all the image has been written
 to put the FPGA into operating mode.
diff --git a/drivers/fpga/fpga-mgr.c b/drivers/fpga/fpga-mgr.c
index f0a69d3e60a5..86d2cb203533 100644
--- a/drivers/fpga/fpga-mgr.c
+++ b/drivers/fpga/fpga-mgr.c
@@ -25,16 +25,106 @@
 #include <linux/of.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
 
 static DEFINE_IDA(fpga_mgr_ida);
 static struct class *fpga_mgr_class;
 
+/*
+ * Call the low level driver's write_init function.  This will do the
+ * device-specific things to get the FPGA into the state where it is ready to
+ * receive an FPGA image. The low level driver only gets to see the first
+ * initial_header_size bytes in the buffer.
+ */
+static int fpga_mgr_write_init_buf(struct fpga_manager *mgr,
+				   struct fpga_image_info *info,
+				   const char *buf, size_t count)
+{
+	int ret;
+
+	mgr->state = FPGA_MGR_STATE_WRITE_INIT;
+	if (!mgr->mops->initial_header_size)
+		ret = mgr->mops->write_init(mgr, info, NULL, 0);
+	else
+		ret = mgr->mops->write_init(
+		    mgr, info, buf, min(mgr->mops->initial_header_size, count));
+
+	if (ret) {
+		dev_err(&mgr->dev, "Error preparing FPGA for writing\n");
+		mgr->state = FPGA_MGR_STATE_WRITE_INIT_ERR;
+		return ret;
+	}
+
+	return 0;
+}
+
+static int fpga_mgr_write_init_sg(struct fpga_manager *mgr,
+				  struct fpga_image_info *info,
+				  struct sg_table *sgt)
+{
+	struct sg_mapping_iter miter;
+	size_t len;
+	char *buf;
+	int ret;
+
+	if (!mgr->mops->initial_header_size)
+		return fpga_mgr_write_init_buf(mgr, info, NULL, 0);
+
+	/*
+	 * First try to use miter to map the first fragment to access the
+	 * header, this is the typical path.
+	 */
+	sg_miter_start(&miter, sgt->sgl, sgt->nents, SG_MITER_FROM_SG);
+	if (sg_miter_next(&miter) &&
+	    miter.length >= mgr->mops->initial_header_size) {
+		ret = fpga_mgr_write_init_buf(mgr, info, miter.addr,
+					      miter.length);
+		sg_miter_stop(&miter);
+		return ret;
+	}
+	sg_miter_stop(&miter);
+
+	/* Otherwise copy the fragments into temporary memory. */
+	buf = kmalloc(mgr->mops->initial_header_size, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	len = sg_copy_to_buffer(sgt->sgl, sgt->nents, buf,
+				mgr->mops->initial_header_size);
+	ret = fpga_mgr_write_init_buf(mgr, info, buf, len);
+
+	kfree(buf);
+
+	return ret;
+}
+
+/*
+ * After all the FPGA image has been written, do the device specific steps to
+ * finish and set the FPGA into operating mode.
+ */
+static int fpga_mgr_write_complete(struct fpga_manager *mgr,
+				   struct fpga_image_info *info)
+{
+	int ret;
+
+	mgr->state = FPGA_MGR_STATE_WRITE_COMPLETE;
+	ret = mgr->mops->write_complete(mgr, info);
+	if (ret) {
+		dev_err(&mgr->dev, "Error after writing image data to FPGA\n");
+		mgr->state = FPGA_MGR_STATE_WRITE_COMPLETE_ERR;
+		return ret;
+	}
+	mgr->state = FPGA_MGR_STATE_OPERATING;
+
+	return 0;
+}
+
 /**
- * fpga_mgr_buf_load - load fpga from image in buffer
+ * fpga_mgr_buf_load_sg - load fpga from image in buffer from a scatter list
  * @mgr:	fpga manager
  * @info:	fpga image specific information
- * @buf:	buffer contain fpga image
- * @count:	byte count of buf
+ * @sgt:	scatterlist table
  *
  * Step the low level fpga manager through the device-specific steps of getting
  * an FPGA ready to be configured, writing the image to it, then doing whatever
@@ -42,54 +132,139 @@ static struct class *fpga_mgr_class;
  * mgr pointer from of_fpga_mgr_get() or fpga_mgr_get() and checked that it is
  * not an error code.
  *
+ * This is the preferred entry point for FPGA programming, it does not require
+ * any contiguous kernel memory.
+ *
  * Return: 0 on success, negative error code otherwise.
  */
-int fpga_mgr_buf_load(struct fpga_manager *mgr, struct fpga_image_info *info,
-		      const char *buf, size_t count)
+int fpga_mgr_buf_load_sg(struct fpga_manager *mgr, struct fpga_image_info *info,
+			 struct sg_table *sgt)
 {
-	struct device *dev = &mgr->dev;
 	int ret;
 
-	/*
-	 * Call the low level driver's write_init function.  This will do the
-	 * device-specific things to get the FPGA into the state where it is
-	 * ready to receive an FPGA image. The low level driver only gets to
-	 * see the first initial_header_size bytes in the buffer.
-	 */
-	mgr->state = FPGA_MGR_STATE_WRITE_INIT;
-	ret = mgr->mops->write_init(mgr, info, buf,
-				    min(mgr->mops->initial_header_size, count));
+	ret = fpga_mgr_write_init_sg(mgr, info, sgt);
+	if (ret)
+		return ret;
+
+	/* Write the FPGA image to the FPGA. */
+	mgr->state = FPGA_MGR_STATE_WRITE;
+	if (mgr->mops->write_sg) {
+		ret = mgr->mops->write_sg(mgr, sgt);
+	} else {
+		struct sg_mapping_iter miter;
+
+		sg_miter_start(&miter, sgt->sgl, sgt->nents, SG_MITER_FROM_SG);
+		while (sg_miter_next(&miter)) {
+			ret = mgr->mops->write(mgr, miter.addr, miter.length);
+			if (ret)
+				break;
+		}
+		sg_miter_stop(&miter);
+	}
+
 	if (ret) {
-		dev_err(dev, "Error preparing FPGA for writing\n");
-		mgr->state = FPGA_MGR_STATE_WRITE_INIT_ERR;
+		dev_err(&mgr->dev, "Error while writing image data to FPGA\n");
+		mgr->state = FPGA_MGR_STATE_WRITE_ERR;
 		return ret;
 	}
 
+	return fpga_mgr_write_complete(mgr, info);
+}
+EXPORT_SYMBOL_GPL(fpga_mgr_buf_load_sg);
+
+static int fpga_mgr_buf_load_mapped(struct fpga_manager *mgr,
+				    struct fpga_image_info *info,
+				    const char *buf, size_t count)
+{
+	int ret;
+
+	ret = fpga_mgr_write_init_buf(mgr, info, buf, count);
+	if (ret)
+		return ret;
+
 	/*
 	 * Write the FPGA image to the FPGA.
 	 */
 	mgr->state = FPGA_MGR_STATE_WRITE;
 	ret = mgr->mops->write(mgr, buf, count);
 	if (ret) {
-		dev_err(dev, "Error while writing image data to FPGA\n");
+		dev_err(&mgr->dev, "Error while writing image data to FPGA\n");
 		mgr->state = FPGA_MGR_STATE_WRITE_ERR;
 		return ret;
 	}
 
+	return fpga_mgr_write_complete(mgr, info);
+}
+
+/**
+ * fpga_mgr_buf_load - load fpga from image in buffer
+ * @mgr:	fpga manager
+ * @flags:	flags setting fpga confuration modes
+ * @buf:	buffer contain fpga image
+ * @count:	byte count of buf
+ *
+ * Step the low level fpga manager through the device-specific steps of getting
+ * an FPGA ready to be configured, writing the image to it, then doing whatever
+ * post-configuration steps necessary.  This code assumes the caller got the
+ * mgr pointer from of_fpga_mgr_get() and checked that it is not an error code.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+int fpga_mgr_buf_load(struct fpga_manager *mgr, struct fpga_image_info *info,
+		      const char *buf, size_t count)
+{
+	struct page **pages;
+	struct sg_table sgt;
+	const void *p;
+	int nr_pages;
+	int index;
+	int rc;
+
 	/*
-	 * After all the FPGA image has been written, do the device specific
-	 * steps to finish and set the FPGA into operating mode.
+	 * This is just a fast path if the caller has already created a
+	 * contiguous kernel buffer and the driver doesn't require SG, non-SG
+	 * drivers will still work on the slow path.
 	 */
-	mgr->state = FPGA_MGR_STATE_WRITE_COMPLETE;
-	ret = mgr->mops->write_complete(mgr, info);
-	if (ret) {
-		dev_err(dev, "Error after writing image data to FPGA\n");
-		mgr->state = FPGA_MGR_STATE_WRITE_COMPLETE_ERR;
-		return ret;
+	if (mgr->mops->write)
+		return fpga_mgr_buf_load_mapped(mgr, info, buf, count);
+
+	/*
+	 * Convert the linear kernel pointer into a sg_table of pages for use
+	 * by the driver.
+	 */
+	nr_pages = DIV_ROUND_UP((unsigned long)buf + count, PAGE_SIZE) -
+		   (unsigned long)buf / PAGE_SIZE;
+	pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	p = buf - offset_in_page(buf);
+	for (index = 0; index < nr_pages; index++) {
+		if (is_vmalloc_addr(p))
+			pages[index] = vmalloc_to_page(p);
+		else
+			pages[index] = kmap_to_page((void *)p);
+		if (!pages[index]) {
+			kfree(pages);
+			return -EFAULT;
+		}
+		p += PAGE_SIZE;
 	}
-	mgr->state = FPGA_MGR_STATE_OPERATING;
 
-	return 0;
+	/*
+	 * The temporary pages list is used to code share the merging algorithm
+	 * in sg_alloc_table_from_pages
+	 */
+	rc = sg_alloc_table_from_pages(&sgt, pages, index, offset_in_page(buf),
+				       count, GFP_KERNEL);
+	kfree(pages);
+	if (rc)
+		return rc;
+
+	rc = fpga_mgr_buf_load_sg(mgr, info, &sgt);
+	sg_free_table(&sgt);
+
+	return rc;
 }
 EXPORT_SYMBOL_GPL(fpga_mgr_buf_load);
 
@@ -291,8 +466,9 @@ int fpga_mgr_register(struct device *dev, const char *name,
 	struct fpga_manager *mgr;
 	int id, ret;
 
-	if (!mops || !mops->write_init || !mops->write ||
-	    !mops->write_complete || !mops->state) {
+	if (!mops || !mops->write_complete || !mops->state ||
+	    !mops->write_init || (!mops->write && !mops->write_sg) ||
+	    (mops->write && mops->write_sg)) {
 		dev_err(dev, "Attempt to register without fpga_manager_ops\n");
 		return -EINVAL;
 	}
diff --git a/include/linux/fpga/fpga-mgr.h b/include/linux/fpga/fpga-mgr.h
index 16551d5eac36..57beb5d09bfc 100644
--- a/include/linux/fpga/fpga-mgr.h
+++ b/include/linux/fpga/fpga-mgr.h
@@ -22,6 +22,7 @@
 #define _LINUX_FPGA_MGR_H
 
 struct fpga_manager;
+struct sg_table;
 
 /**
  * enum fpga_mgr_states - fpga framework states
@@ -88,6 +89,7 @@ struct fpga_image_info {
  * @state: returns an enum value of the FPGA's state
  * @write_init: prepare the FPGA to receive confuration data
  * @write: write count bytes of configuration data to the FPGA
+ * @write_sg: write the scatter list of configuration data to the FPGA
  * @write_complete: set FPGA to operating state after writing is done
  * @fpga_remove: optional: Set FPGA into a specific state during driver remove
  *
@@ -102,6 +104,7 @@ struct fpga_manager_ops {
 			  struct fpga_image_info *info,
 			  const char *buf, size_t count);
 	int (*write)(struct fpga_manager *mgr, const char *buf, size_t count);
+	int (*write_sg)(struct fpga_manager *mgr, struct sg_table *sgt);
 	int (*write_complete)(struct fpga_manager *mgr,
 			      struct fpga_image_info *info);
 	void (*fpga_remove)(struct fpga_manager *mgr);
@@ -129,6 +132,8 @@ struct fpga_manager {
 
 int fpga_mgr_buf_load(struct fpga_manager *mgr, struct fpga_image_info *info,
 		      const char *buf, size_t count);
+int fpga_mgr_buf_load_sg(struct fpga_manager *mgr, struct fpga_image_info *info,
+			 struct sg_table *sgt);
 
 int fpga_mgr_firmware_load(struct fpga_manager *mgr,
 			   struct fpga_image_info *info,
-- 
cgit v1.2.3


From f6c4391553573d592212e6624cfba5e2752cd5c8 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Sun, 5 Feb 2017 17:20:33 -0700
Subject: vmbus: remove no longer used signal_policy

The explicit signal policy is no longer used. A different mechanism
will be added later when xmit_more is supported.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/hyperv.h | 18 ------------------
 1 file changed, 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index c37a4a145036..7795966af0f9 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -684,11 +684,6 @@ struct hv_input_signal_event_buffer {
 	struct hv_input_signal_event event;
 };
 
-enum hv_signal_policy {
-	HV_SIGNAL_POLICY_DEFAULT = 0,
-	HV_SIGNAL_POLICY_EXPLICIT,
-};
-
 enum hv_numa_policy {
 	HV_BALANCED = 0,
 	HV_LOCALIZED,
@@ -850,13 +845,6 @@ struct vmbus_channel {
 	 * link up channels based on their CPU affinity.
 	 */
 	struct list_head percpu_list;
-	/*
-	 * Host signaling policy: The default policy will be
-	 * based on the ring buffer state. We will also support
-	 * a policy where the client driver can have explicit
-	 * signaling control.
-	 */
-	enum hv_signal_policy  signal_policy;
 	/*
 	 * On the channel send side, many of the VMBUS
 	 * device drivers explicity serialize access to the
@@ -918,12 +906,6 @@ static inline bool is_hvsock_channel(const struct vmbus_channel *c)
 		  VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER);
 }
 
-static inline void set_channel_signal_state(struct vmbus_channel *c,
-					    enum hv_signal_policy policy)
-{
-	c->signal_policy = policy;
-}
-
 static inline void set_channel_affinity_state(struct vmbus_channel *c,
 					      enum hv_numa_policy policy)
 {
-- 
cgit v1.2.3


From 3454323c954775098871559b5c23d877c3e23f77 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Sun, 5 Feb 2017 17:20:34 -0700
Subject: vmbus: remove unused kickq argument to sendpacket

Since sendpacket no longer uses kickq argument remove it.
Remove it no longer used xmit_more in sendpacket in netvsc as well.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/channel.c        | 19 +++++++++----------
 drivers/net/hyperv/netvsc.c | 21 +++------------------
 include/linux/hyperv.h      |  6 ++----
 3 files changed, 14 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index e26285cde8e0..789c75f6df26 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -643,8 +643,8 @@ void vmbus_close(struct vmbus_channel *channel)
 EXPORT_SYMBOL_GPL(vmbus_close);
 
 int vmbus_sendpacket_ctl(struct vmbus_channel *channel, void *buffer,
-			   u32 bufferlen, u64 requestid,
-			   enum vmbus_packet_type type, u32 flags, bool kick_q)
+			 u32 bufferlen, u64 requestid,
+			 enum vmbus_packet_type type, u32 flags)
 {
 	struct vmpacket_descriptor desc;
 	u32 packetlen = sizeof(struct vmpacket_descriptor) + bufferlen;
@@ -693,7 +693,7 @@ int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer,
 			   enum vmbus_packet_type type, u32 flags)
 {
 	return vmbus_sendpacket_ctl(channel, buffer, bufferlen, requestid,
-				    type, flags, true);
+				    type, flags);
 }
 EXPORT_SYMBOL(vmbus_sendpacket);
 
@@ -705,11 +705,9 @@ EXPORT_SYMBOL(vmbus_sendpacket);
  * explicitly.
  */
 int vmbus_sendpacket_pagebuffer_ctl(struct vmbus_channel *channel,
-				     struct hv_page_buffer pagebuffers[],
-				     u32 pagecount, void *buffer, u32 bufferlen,
-				     u64 requestid,
-				     u32 flags,
-				     bool kick_q)
+				    struct hv_page_buffer pagebuffers[],
+				    u32 pagecount, void *buffer, u32 bufferlen,
+				    u64 requestid, u32 flags)
 {
 	int i;
 	struct vmbus_channel_packet_page_buffer desc;
@@ -769,9 +767,10 @@ int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel,
 				     u64 requestid)
 {
 	u32 flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED;
+
 	return vmbus_sendpacket_pagebuffer_ctl(channel, pagebuffers, pagecount,
-					       buffer, bufferlen, requestid,
-					       flags, true);
+					       buffer, bufferlen,
+					       requestid, flags);
 
 }
 EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer);
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 86e5749226ef..372603023053 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -723,8 +723,6 @@ static u32 netvsc_copy_to_send_buf(struct netvsc_device *net_device,
 	char *dest = start + (section_index * net_device->send_section_size)
 		     + pend_size;
 	int i;
-	bool is_data_pkt = (skb != NULL) ? true : false;
-	bool xmit_more = (skb != NULL) ? skb->xmit_more : false;
 	u32 msg_size = 0;
 	u32 padding = 0;
 	u32 remain = packet->total_data_buflen % net_device->pkt_align;
@@ -732,7 +730,7 @@ static u32 netvsc_copy_to_send_buf(struct netvsc_device *net_device,
 		packet->page_buf_cnt;
 
 	/* Add padding */
-	if (is_data_pkt && xmit_more && remain &&
+	if (skb && skb->xmit_more && remain &&
 	    !packet->cp_partial) {
 		padding = net_device->pkt_align - remain;
 		rndis_msg->msg_len += padding;
@@ -772,7 +770,6 @@ static inline int netvsc_send_pkt(
 	int ret;
 	struct hv_page_buffer *pgbuf;
 	u32 ring_avail = hv_ringbuf_avail_percent(&out_channel->outbound);
-	bool xmit_more = (skb != NULL) ? skb->xmit_more : false;
 
 	nvmsg.hdr.msg_type = NVSP_MSG1_TYPE_SEND_RNDIS_PKT;
 	if (skb != NULL) {
@@ -796,16 +793,6 @@ static inline int netvsc_send_pkt(
 	if (out_channel->rescind)
 		return -ENODEV;
 
-	/*
-	 * It is possible that once we successfully place this packet
-	 * on the ringbuffer, we may stop the queue. In that case, we want
-	 * to notify the host independent of the xmit_more flag. We don't
-	 * need to be precise here; in the worst case we may signal the host
-	 * unnecessarily.
-	 */
-	if (ring_avail < (RING_AVAIL_PERCENT_LOWATER + 1))
-		xmit_more = false;
-
 	if (packet->page_buf_cnt) {
 		pgbuf = packet->cp_partial ? (*pb) +
 			packet->rmsg_pgcnt : (*pb);
@@ -815,15 +802,13 @@ static inline int netvsc_send_pkt(
 						      &nvmsg,
 						      sizeof(struct nvsp_message),
 						      req_id,
-						      VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED,
-						      !xmit_more);
+						      VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
 	} else {
 		ret = vmbus_sendpacket_ctl(out_channel, &nvmsg,
 					   sizeof(struct nvsp_message),
 					   req_id,
 					   VM_PKT_DATA_INBAND,
-					   VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED,
-					   !xmit_more);
+					   VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
 	}
 
 	if (ret == 0) {
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 7795966af0f9..e208e6437f5b 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1037,8 +1037,7 @@ extern int vmbus_sendpacket_ctl(struct vmbus_channel *channel,
 				  u32 bufferLen,
 				  u64 requestid,
 				  enum vmbus_packet_type type,
-				  u32 flags,
-				  bool kick_q);
+				  u32 flags);
 
 extern int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel,
 					    struct hv_page_buffer pagebuffers[],
@@ -1053,8 +1052,7 @@ extern int vmbus_sendpacket_pagebuffer_ctl(struct vmbus_channel *channel,
 					   void *buffer,
 					   u32 bufferlen,
 					   u64 requestid,
-					   u32 flags,
-					   bool kick_q);
+					   u32 flags);
 
 extern int vmbus_sendpacket_multipagebuffer(struct vmbus_channel *channel,
 					struct hv_multipage_buffer *mpb,
-- 
cgit v1.2.3


From f1ba82616c3368e1ae9e64ef29cf3edc1be0860d Mon Sep 17 00:00:00 2001
From: Paolo Valente <paolo.valente@linaro.org>
Date: Tue, 7 Feb 2017 18:24:43 +0100
Subject: blk-mq: pass bio to blk_mq_sched_get_rq_priv

bio is used in bfq-mq's get_rq_priv, to get the request group. We could
pass directly the group here, but I thought that passing the bio was
more general, giving the possibility to get other pieces of information
if needed.

Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-sched.c     | 8 +++++---
 block/blk-mq-sched.h     | 5 +++--
 include/linux/elevator.h | 2 +-
 3 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 72d0d8361175..97fe904f0a04 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -68,7 +68,9 @@ error:
 EXPORT_SYMBOL_GPL(blk_mq_sched_init_hctx_data);
 
 static void __blk_mq_sched_assign_ioc(struct request_queue *q,
-				      struct request *rq, struct io_context *ioc)
+				      struct request *rq,
+				      struct bio *bio,
+				      struct io_context *ioc)
 {
 	struct io_cq *icq;
 
@@ -83,7 +85,7 @@ static void __blk_mq_sched_assign_ioc(struct request_queue *q,
 	}
 
 	rq->elv.icq = icq;
-	if (!blk_mq_sched_get_rq_priv(q, rq)) {
+	if (!blk_mq_sched_get_rq_priv(q, rq, bio)) {
 		rq->rq_flags |= RQF_ELVPRIV;
 		get_io_context(icq->ioc);
 		return;
@@ -99,7 +101,7 @@ static void blk_mq_sched_assign_ioc(struct request_queue *q,
 
 	ioc = rq_ioc(bio);
 	if (ioc)
-		__blk_mq_sched_assign_ioc(q, rq, ioc);
+		__blk_mq_sched_assign_ioc(q, rq, bio, ioc);
 }
 
 struct request *blk_mq_sched_get_request(struct request_queue *q,
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 5954859c8670..7b5f3b95c78e 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -49,12 +49,13 @@ blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
 }
 
 static inline int blk_mq_sched_get_rq_priv(struct request_queue *q,
-					   struct request *rq)
+					   struct request *rq,
+					   struct bio *bio)
 {
 	struct elevator_queue *e = q->elevator;
 
 	if (e && e->type->ops.mq.get_rq_priv)
-		return e->type->ops.mq.get_rq_priv(q, rq);
+		return e->type->ops.mq.get_rq_priv(q, rq, bio);
 
 	return 0;
 }
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 8265b6330cc2..aebecc4ed088 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -109,7 +109,7 @@ struct elevator_mq_ops {
 	void (*requeue_request)(struct request *);
 	struct request *(*former_request)(struct request_queue *, struct request *);
 	struct request *(*next_request)(struct request_queue *, struct request *);
-	int (*get_rq_priv)(struct request_queue *, struct request *);
+	int (*get_rq_priv)(struct request_queue *, struct request *, struct bio *);
 	void (*put_rq_priv)(struct request_queue *, struct request *);
 	void (*init_icq)(struct io_cq *);
 	void (*exit_icq)(struct io_cq *);
-- 
cgit v1.2.3


From ff548773106ec7f8031bc6172e0234bd2a02c19c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 10 Feb 2017 16:34:07 +0000
Subject: afs: Move UUID struct to linux/uuid.h

Move the afs_uuid struct to linux/uuid.h, rename it to uuid_v1 and change
the u16/u32 fields to __be16/__be32 instead so that the structure can be
cast to a 16-octet network-order buffer.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Arnd Bergmann <arnd@arndb.de
---
 fs/afs/cmservice.c   | 28 ++++++++++++++--------------
 fs/afs/internal.h    | 27 ++-------------------------
 fs/afs/main.c        | 25 +++++++++++++------------
 include/linux/uuid.h | 24 ++++++++++++++++++++++++
 4 files changed, 53 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index e349a3316303..2edbdcbf6432 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -351,7 +351,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
 {
 	struct sockaddr_rxrpc srx;
 	struct afs_server *server;
-	struct afs_uuid *r;
+	struct uuid_v1 *r;
 	unsigned loop;
 	__be32 *b;
 	int ret;
@@ -381,15 +381,15 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
 		}
 
 		_debug("unmarshall UUID");
-		call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL);
+		call->request = kmalloc(sizeof(struct uuid_v1), GFP_KERNEL);
 		if (!call->request)
 			return -ENOMEM;
 
 		b = call->buffer;
 		r = call->request;
-		r->time_low			= ntohl(b[0]);
-		r->time_mid			= ntohl(b[1]);
-		r->time_hi_and_version		= ntohl(b[2]);
+		r->time_low			= b[0];
+		r->time_mid			= htons(ntohl(b[1]));
+		r->time_hi_and_version		= htons(ntohl(b[2]));
 		r->clock_seq_hi_and_reserved 	= ntohl(b[3]);
 		r->clock_seq_low		= ntohl(b[4]);
 
@@ -454,7 +454,7 @@ static int afs_deliver_cb_probe(struct afs_call *call)
 static void SRXAFSCB_ProbeUuid(struct work_struct *work)
 {
 	struct afs_call *call = container_of(work, struct afs_call, work);
-	struct afs_uuid *r = call->request;
+	struct uuid_v1 *r = call->request;
 
 	struct {
 		__be32	match;
@@ -477,7 +477,7 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work)
  */
 static int afs_deliver_cb_probe_uuid(struct afs_call *call)
 {
-	struct afs_uuid *r;
+	struct uuid_v1 *r;
 	unsigned loop;
 	__be32 *b;
 	int ret;
@@ -503,15 +503,15 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call)
 		}
 
 		_debug("unmarshall UUID");
-		call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL);
+		call->request = kmalloc(sizeof(struct uuid_v1), GFP_KERNEL);
 		if (!call->request)
 			return -ENOMEM;
 
 		b = call->buffer;
 		r = call->request;
-		r->time_low			= ntohl(b[0]);
-		r->time_mid			= ntohl(b[1]);
-		r->time_hi_and_version		= ntohl(b[2]);
+		r->time_low			= b[0];
+		r->time_mid			= htons(ntohl(b[1]));
+		r->time_hi_and_version		= htons(ntohl(b[2]));
 		r->clock_seq_hi_and_reserved 	= ntohl(b[3]);
 		r->clock_seq_low		= ntohl(b[4]);
 
@@ -569,9 +569,9 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work)
 	memset(&reply, 0, sizeof(reply));
 	reply.ia.nifs = htonl(nifs);
 
-	reply.ia.uuid[0] = htonl(afs_uuid.time_low);
-	reply.ia.uuid[1] = htonl(afs_uuid.time_mid);
-	reply.ia.uuid[2] = htonl(afs_uuid.time_hi_and_version);
+	reply.ia.uuid[0] = afs_uuid.time_low;
+	reply.ia.uuid[1] = htonl(ntohs(afs_uuid.time_mid));
+	reply.ia.uuid[2] = htonl(ntohs(afs_uuid.time_hi_and_version));
 	reply.ia.uuid[3] = htonl((s8) afs_uuid.clock_seq_hi_and_reserved);
 	reply.ia.uuid[4] = htonl((s8) afs_uuid.clock_seq_low);
 	for (loop = 0; loop < 6; loop++)
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 65504e218d35..79061fa17168 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -19,6 +19,7 @@
 #include <linux/sched.h>
 #include <linux/fscache.h>
 #include <linux/backing-dev.h>
+#include <linux/uuid.h>
 #include <net/af_rxrpc.h>
 
 #include "afs.h"
@@ -407,30 +408,6 @@ struct afs_interface {
 	unsigned	mtu;		/* MTU of interface */
 };
 
-/*
- * UUID definition [internet draft]
- * - the timestamp is a 60-bit value, split 32/16/12, and goes in 100ns
- *   increments since midnight 15th October 1582
- *   - add AFS_UUID_TO_UNIX_TIME to convert unix time in 100ns units to UUID
- *     time
- * - the clock sequence is a 14-bit counter to avoid duplicate times
- */
-struct afs_uuid {
-	u32		time_low;			/* low part of timestamp */
-	u16		time_mid;			/* mid part of timestamp */
-	u16		time_hi_and_version;		/* high part of timestamp and version  */
-#define AFS_UUID_TO_UNIX_TIME	0x01b21dd213814000ULL
-#define AFS_UUID_TIMEHI_MASK	0x0fff
-#define AFS_UUID_VERSION_TIME	0x1000	/* time-based UUID */
-#define AFS_UUID_VERSION_NAME	0x3000	/* name-based UUID */
-#define AFS_UUID_VERSION_RANDOM	0x4000	/* (pseudo-)random generated UUID */
-	u8		clock_seq_hi_and_reserved;	/* clock seq hi and variant */
-#define AFS_UUID_CLOCKHI_MASK	0x3f
-#define AFS_UUID_VARIANT_STD	0x80
-	u8		clock_seq_low;			/* clock seq low */
-	u8		node[6];			/* spatially unique node ID (MAC addr) */
-};
-
 /*****************************************************************************/
 /*
  * cache.c
@@ -565,7 +542,7 @@ extern int afs_drop_inode(struct inode *);
  * main.c
  */
 extern struct workqueue_struct *afs_wq;
-extern struct afs_uuid afs_uuid;
+extern struct uuid_v1 afs_uuid;
 
 /*
  * misc.c
diff --git a/fs/afs/main.c b/fs/afs/main.c
index f8188feb03ad..a07c14df3fd1 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -31,7 +31,7 @@ static char *rootcell;
 module_param(rootcell, charp, 0);
 MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
 
-struct afs_uuid afs_uuid;
+struct uuid_v1 afs_uuid;
 struct workqueue_struct *afs_wq;
 
 /*
@@ -41,7 +41,7 @@ static int __init afs_get_client_UUID(void)
 {
 	struct timespec ts;
 	u64 uuidtime;
-	u16 clockseq;
+	u16 clockseq, hi_v;
 	int ret;
 
 	/* read the MAC address of one of the external interfaces and construct
@@ -53,22 +53,23 @@ static int __init afs_get_client_UUID(void)
 	getnstimeofday(&ts);
 	uuidtime = (u64) ts.tv_sec * 1000 * 1000 * 10;
 	uuidtime += ts.tv_nsec / 100;
-	uuidtime += AFS_UUID_TO_UNIX_TIME;
-	afs_uuid.time_low = uuidtime;
-	afs_uuid.time_mid = uuidtime >> 32;
-	afs_uuid.time_hi_and_version = (uuidtime >> 48) & AFS_UUID_TIMEHI_MASK;
-	afs_uuid.time_hi_and_version |= AFS_UUID_VERSION_TIME;
+	uuidtime += UUID_TO_UNIX_TIME;
+	afs_uuid.time_low = htonl(uuidtime);
+	afs_uuid.time_mid = htons(uuidtime >> 32);
+	hi_v = (uuidtime >> 48) & UUID_TIMEHI_MASK;
+	hi_v |= UUID_VERSION_TIME;
+	afs_uuid.time_hi_and_version = htons(hi_v);
 
 	get_random_bytes(&clockseq, 2);
 	afs_uuid.clock_seq_low = clockseq;
 	afs_uuid.clock_seq_hi_and_reserved =
-		(clockseq >> 8) & AFS_UUID_CLOCKHI_MASK;
-	afs_uuid.clock_seq_hi_and_reserved |= AFS_UUID_VARIANT_STD;
+		(clockseq >> 8) & UUID_CLOCKHI_MASK;
+	afs_uuid.clock_seq_hi_and_reserved |= UUID_VARIANT_STD;
 
 	_debug("AFS UUID: %08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x",
-	       afs_uuid.time_low,
-	       afs_uuid.time_mid,
-	       afs_uuid.time_hi_and_version,
+	       ntohl(afs_uuid.time_low),
+	       ntohs(afs_uuid.time_mid),
+	       ntohs(afs_uuid.time_hi_and_version),
 	       afs_uuid.clock_seq_hi_and_reserved,
 	       afs_uuid.clock_seq_low,
 	       afs_uuid.node[0], afs_uuid.node[1], afs_uuid.node[2],
diff --git a/include/linux/uuid.h b/include/linux/uuid.h
index 2d095fc60204..4dff73a89758 100644
--- a/include/linux/uuid.h
+++ b/include/linux/uuid.h
@@ -18,6 +18,30 @@
 
 #include <uapi/linux/uuid.h>
 
+/*
+ * V1 (time-based) UUID definition [RFC 4122].
+ * - the timestamp is a 60-bit value, split 32/16/12, and goes in 100ns
+ *   increments since midnight 15th October 1582
+ *   - add AFS_UUID_TO_UNIX_TIME to convert unix time in 100ns units to UUID
+ *     time
+ * - the clock sequence is a 14-bit counter to avoid duplicate times
+ */
+struct uuid_v1 {
+	__be32		time_low;			/* low part of timestamp */
+	__be16		time_mid;			/* mid part of timestamp */
+	__be16		time_hi_and_version;		/* high part of timestamp and version  */
+#define UUID_TO_UNIX_TIME	0x01b21dd213814000ULL
+#define UUID_TIMEHI_MASK	0x0fff
+#define UUID_VERSION_TIME	0x1000	/* time-based UUID */
+#define UUID_VERSION_NAME	0x3000	/* name-based UUID */
+#define UUID_VERSION_RANDOM	0x4000	/* (pseudo-)random generated UUID */
+	u8		clock_seq_hi_and_reserved;	/* clock seq hi and variant */
+#define UUID_CLOCKHI_MASK	0x3f
+#define UUID_VARIANT_STD	0x80
+	u8		clock_seq_low;			/* clock seq low */
+	u8		node[6];			/* spatially unique node ID (MAC addr) */
+};
+
 /*
  * The length of a UUID string ("aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee")
  * not including trailing NUL.
-- 
cgit v1.2.3


From 28309572aac4c632666053dc8bf9906a3594b8d2 Mon Sep 17 00:00:00 2001
From: Cédric Le Goater <clg@kaod.org>
Date: Thu, 9 Feb 2017 10:21:07 +0100
Subject: mtd: name the mtd device with an optional label property
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This can be used to easily identify a specific chip on a system with
multiple chips.

Suggested-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Signed-off-by: Cédric Le Goater <clg@kaod.org>
Reviewed-by: Marek Vasut <marek.vasut@gmail.com>
Signed-off-by: Brian Norris <computersforpeace@gmail.com>
---
 include/linux/mtd/mtd.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 5bb42c6dacdc..eebdc63cf6af 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -24,6 +24,7 @@
 #include <linux/uio.h>
 #include <linux/notifier.h>
 #include <linux/device.h>
+#include <linux/of.h>
 
 #include <mtd/mtd-abi.h>
 
@@ -386,6 +387,8 @@ static inline void mtd_set_of_node(struct mtd_info *mtd,
 				   struct device_node *np)
 {
 	mtd->dev.of_node = np;
+	if (!mtd->name)
+		of_property_read_string(np, "label", &mtd->name);
 }
 
 static inline struct device_node *mtd_get_of_node(struct mtd_info *mtd)
-- 
cgit v1.2.3


From ea6da4fd388a1eeab30d64da3eab3c5338714c74 Mon Sep 17 00:00:00 2001
From: Amir Vadai <amir@vadai.me>
Date: Tue, 7 Feb 2017 09:56:06 +0200
Subject: net/skbuff: Introduce skb_mac_offset()

Introduce skb_mac_offset() that could be used to get mac header offset.

Signed-off-by: Amir Vadai <amir@vadai.me>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f1adddc1c5ac..69ccd2636911 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2184,6 +2184,11 @@ static inline unsigned char *skb_mac_header(const struct sk_buff *skb)
 	return skb->head + skb->mac_header;
 }
 
+static inline int skb_mac_offset(const struct sk_buff *skb)
+{
+	return skb_mac_header(skb) - skb->data;
+}
+
 static inline int skb_mac_header_was_set(const struct sk_buff *skb)
 {
 	return skb->mac_header != (typeof(skb->mac_header))~0U;
-- 
cgit v1.2.3


From 47feb41888bc82ba5c9268c344775adc483d6de7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 8 Feb 2017 18:17:43 +0100
Subject: PCI/MSI: Remove unused pci_msi_create_default_irq_domain()

pci_msi_create_default_irq_domain() is never called in the whole tree, so
remove it as well as all the supporting code for a default PCI MSI domain.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/pci/msi.c   | 50 ++------------------------------------------------
 include/linux/msi.h |  3 ---
 2 files changed, 2 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 0f77b38f03dd..79f20e4cb7bf 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -32,30 +32,11 @@ int pci_msi_ignore_mask;
 #define msix_table_size(flags)	((flags & PCI_MSIX_FLAGS_QSIZE) + 1)
 
 #ifdef CONFIG_PCI_MSI_IRQ_DOMAIN
-static struct irq_domain *pci_msi_default_domain;
-static DEFINE_MUTEX(pci_msi_domain_lock);
-
-struct irq_domain * __weak arch_get_pci_msi_domain(struct pci_dev *dev)
-{
-	return pci_msi_default_domain;
-}
-
-static struct irq_domain *pci_msi_get_domain(struct pci_dev *dev)
-{
-	struct irq_domain *domain;
-
-	domain = dev_get_msi_domain(&dev->dev);
-	if (domain)
-		return domain;
-
-	return arch_get_pci_msi_domain(dev);
-}
-
 static int pci_msi_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 {
 	struct irq_domain *domain;
 
-	domain = pci_msi_get_domain(dev);
+	domain = dev_get_msi_domain(&dev->dev);
 	if (domain && irq_domain_is_hierarchy(domain))
 		return pci_msi_domain_alloc_irqs(domain, dev, nvec, type);
 
@@ -66,7 +47,7 @@ static void pci_msi_teardown_msi_irqs(struct pci_dev *dev)
 {
 	struct irq_domain *domain;
 
-	domain = pci_msi_get_domain(dev);
+	domain = dev_get_msi_domain(&dev->dev);
 	if (domain && irq_domain_is_hierarchy(domain))
 		pci_msi_domain_free_irqs(domain, dev);
 	else
@@ -1499,33 +1480,6 @@ void pci_msi_domain_free_irqs(struct irq_domain *domain, struct pci_dev *dev)
 	msi_domain_free_irqs(domain, &dev->dev);
 }
 
-/**
- * pci_msi_create_default_irq_domain - Create a default MSI interrupt domain
- * @fwnode:	Optional fwnode of the interrupt controller
- * @info:	MSI domain info
- * @parent:	Parent irq domain
- *
- * Returns: A domain pointer or NULL in case of failure. If successful
- * the default PCI/MSI irqdomain pointer is updated.
- */
-struct irq_domain *pci_msi_create_default_irq_domain(struct fwnode_handle *fwnode,
-		struct msi_domain_info *info, struct irq_domain *parent)
-{
-	struct irq_domain *domain;
-
-	mutex_lock(&pci_msi_domain_lock);
-	if (pci_msi_default_domain) {
-		pr_err("PCI: default irq domain for PCI MSI has already been created.\n");
-		domain = NULL;
-	} else {
-		domain = pci_msi_create_irq_domain(fwnode, info, parent);
-		pci_msi_default_domain = domain;
-	}
-	mutex_unlock(&pci_msi_domain_lock);
-
-	return domain;
-}
-
 static int get_msi_id_cb(struct pci_dev *pdev, u16 alias, void *data)
 {
 	u32 *pa = data;
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 0db320b7bb15..18b8566b3ce3 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -319,9 +319,6 @@ struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode,
 int pci_msi_domain_alloc_irqs(struct irq_domain *domain, struct pci_dev *dev,
 			      int nvec, int type);
 void pci_msi_domain_free_irqs(struct irq_domain *domain, struct pci_dev *dev);
-struct irq_domain *pci_msi_create_default_irq_domain(struct fwnode_handle *fwnode,
-		 struct msi_domain_info *info, struct irq_domain *parent);
-
 irq_hw_number_t pci_msi_domain_calc_hwirq(struct pci_dev *dev,
 					  struct msi_desc *desc);
 int pci_msi_domain_check_cap(struct irq_domain *domain,
-- 
cgit v1.2.3


From 699c4cec238731a4c466f73fe6e9e45ab6f49a41 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 8 Feb 2017 18:17:44 +0100
Subject: PCI/MSI: Remove pci_msi_domain_{alloc,free}_irqs()

Just call the msi_* version directly instead of having trivial wrappers for
one or two callsites.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/apic/msi.c |  2 +-
 drivers/pci/msi.c          | 30 ++----------------------------
 include/linux/msi.h        |  3 ---
 3 files changed, 3 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 015bbf30e3e3..c61aec7e65f4 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -82,7 +82,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 	if (domain == NULL)
 		return -ENOSYS;
 
-	return pci_msi_domain_alloc_irqs(domain, dev, nvec, type);
+	return msi_domain_alloc_irqs(domain, &dev->dev, nvec);
 }
 
 void native_teardown_msi_irq(unsigned int irq)
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 79f20e4cb7bf..b44ad7c21b29 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -38,7 +38,7 @@ static int pci_msi_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 
 	domain = dev_get_msi_domain(&dev->dev);
 	if (domain && irq_domain_is_hierarchy(domain))
-		return pci_msi_domain_alloc_irqs(domain, dev, nvec, type);
+		return msi_domain_alloc_irqs(domain, &dev->dev, nvec);
 
 	return arch_setup_msi_irqs(dev, nvec, type);
 }
@@ -49,7 +49,7 @@ static void pci_msi_teardown_msi_irqs(struct pci_dev *dev)
 
 	domain = dev_get_msi_domain(&dev->dev);
 	if (domain && irq_domain_is_hierarchy(domain))
-		pci_msi_domain_free_irqs(domain, dev);
+		msi_domain_free_irqs(domain, &dev->dev);
 	else
 		arch_teardown_msi_irqs(dev);
 }
@@ -1454,32 +1454,6 @@ struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode,
 }
 EXPORT_SYMBOL_GPL(pci_msi_create_irq_domain);
 
-/**
- * pci_msi_domain_alloc_irqs - Allocate interrupts for @dev in @domain
- * @domain:	The interrupt domain to allocate from
- * @dev:	The device for which to allocate
- * @nvec:	The number of interrupts to allocate
- * @type:	Unused to allow simpler migration from the arch_XXX interfaces
- *
- * Returns:
- * A virtual interrupt number or an error code in case of failure
- */
-int pci_msi_domain_alloc_irqs(struct irq_domain *domain, struct pci_dev *dev,
-			      int nvec, int type)
-{
-	return msi_domain_alloc_irqs(domain, &dev->dev, nvec);
-}
-
-/**
- * pci_msi_domain_free_irqs - Free interrupts for @dev in @domain
- * @domain:	The interrupt domain
- * @dev:	The device for which to free interrupts
- */
-void pci_msi_domain_free_irqs(struct irq_domain *domain, struct pci_dev *dev)
-{
-	msi_domain_free_irqs(domain, &dev->dev);
-}
-
 static int get_msi_id_cb(struct pci_dev *pdev, u16 alias, void *data)
 {
 	u32 *pa = data;
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 18b8566b3ce3..1b6f3ebbe876 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -316,9 +316,6 @@ void pci_msi_domain_write_msg(struct irq_data *irq_data, struct msi_msg *msg);
 struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode,
 					     struct msi_domain_info *info,
 					     struct irq_domain *parent);
-int pci_msi_domain_alloc_irqs(struct irq_domain *domain, struct pci_dev *dev,
-			      int nvec, int type);
-void pci_msi_domain_free_irqs(struct irq_domain *domain, struct pci_dev *dev);
 irq_hw_number_t pci_msi_domain_calc_hwirq(struct pci_dev *dev,
 					  struct msi_desc *desc);
 int pci_msi_domain_check_cap(struct irq_domain *domain,
-- 
cgit v1.2.3


From 1697599ee301a52cded6499a09bd609f7f63fd06 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 9 Feb 2017 09:17:27 -0800
Subject: bitfield.h: add FIELD_FIT() helper

Add a helper for checking at runtime that a value will fit inside
a specified field/mask.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_bpf.h |  2 --
 include/linux/bitfield.h                     | 13 +++++++++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_bpf.h b/drivers/net/ethernet/netronome/nfp/nfp_bpf.h
index 76a19f1796af..9513c80f7be5 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_bpf.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_bpf.h
@@ -39,8 +39,6 @@
 #include <linux/list.h>
 #include <linux/types.h>
 
-#define FIELD_FIT(mask, val)  (!((((u64)val) << __bf_shf(mask)) & ~(mask)))
-
 /* For branch fixup logic use up-most byte of branch instruction as scratch
  * area.  Remember to clear this before sending instructions to HW!
  */
diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h
index f6505d83069d..8b9d6fff002d 100644
--- a/include/linux/bitfield.h
+++ b/include/linux/bitfield.h
@@ -62,6 +62,19 @@
 					      (1ULL << __bf_shf(_mask))); \
 	})
 
+/**
+ * FIELD_FIT() - check if value fits in the field
+ * @_mask: shifted mask defining the field's length and position
+ * @_val:  value to test against the field
+ *
+ * Return: true if @_val can fit inside @_mask, false if @_val is too big.
+ */
+#define FIELD_FIT(_mask, _val)						\
+	({								\
+		__BF_FIELD_CHECK(_mask, 0ULL, _val, "FIELD_FIT: ");	\
+		!((((typeof(_mask))_val) << __bf_shf(_mask)) & ~(_mask)); \
+	})
+
 /**
  * FIELD_PREP() - prepare a bitfield element
  * @_mask: shifted mask defining the field's length and position
-- 
cgit v1.2.3


From a8e04698732736f59fefe72c675791a006b76e1d Mon Sep 17 00:00:00 2001
From: Sainath Grandhi <sainath.grandhi@intel.com>
Date: Fri, 10 Feb 2017 16:03:46 -0800
Subject: tap: Refactoring macvtap.c

macvtap module has code for tap/queue management and link management. This patch splits
the code into macvtap_main.c for link management and tap.c for tap/queue management.
Functionality in tap.c can be re-used for implementing tap on other virtual interfaces.

Signed-off-by: Sainath Grandhi <sainath.grandhi@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/Makefile       |    2 +
 drivers/net/macvtap.c      | 1374 --------------------------------------------
 drivers/net/macvtap_main.c |  218 +++++++
 drivers/net/tap.c          | 1186 ++++++++++++++++++++++++++++++++++++++
 include/linux/if_macvtap.h |   10 +
 5 files changed, 1416 insertions(+), 1374 deletions(-)
 delete mode 100644 drivers/net/macvtap.c
 create mode 100644 drivers/net/macvtap_main.c
 create mode 100644 drivers/net/tap.c
 create mode 100644 include/linux/if_macvtap.h

(limited to 'include/linux')

diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 7336cbd3ef5d..19b03a9fe0f6 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -29,6 +29,8 @@ obj-$(CONFIG_GTP) += gtp.o
 obj-$(CONFIG_NLMON) += nlmon.o
 obj-$(CONFIG_NET_VRF) += vrf.o
 
+macvtap-objs := macvtap_main.o tap.o
+
 #
 # Networking Drivers
 #
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
deleted file mode 100644
index c27011bbe30c..000000000000
--- a/drivers/net/macvtap.c
+++ /dev/null
@@ -1,1374 +0,0 @@
-#include <linux/etherdevice.h>
-#include <linux/if_macvlan.h>
-#include <linux/if_vlan.h>
-#include <linux/interrupt.h>
-#include <linux/nsproxy.h>
-#include <linux/compat.h>
-#include <linux/if_tun.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/cache.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/wait.h>
-#include <linux/cdev.h>
-#include <linux/idr.h>
-#include <linux/fs.h>
-#include <linux/uio.h>
-
-#include <net/net_namespace.h>
-#include <net/rtnetlink.h>
-#include <net/sock.h>
-#include <linux/virtio_net.h>
-#include <linux/skb_array.h>
-
-/*
- * A macvtap queue is the central object of this driver, it connects
- * an open character device to a macvlan interface. There can be
- * multiple queues on one interface, which map back to queues
- * implemented in hardware on the underlying device.
- *
- * macvtap_proto is used to allocate queues through the sock allocation
- * mechanism.
- *
- */
-struct macvtap_queue {
-	struct sock sk;
-	struct socket sock;
-	struct socket_wq wq;
-	int vnet_hdr_sz;
-	struct macvlan_dev __rcu *vlan;
-	struct file *file;
-	unsigned int flags;
-	u16 queue_index;
-	bool enabled;
-	struct list_head next;
-	struct skb_array skb_array;
-};
-
-#define MACVTAP_FEATURES (IFF_VNET_HDR | IFF_MULTI_QUEUE)
-
-#define MACVTAP_VNET_LE 0x80000000
-#define MACVTAP_VNET_BE 0x40000000
-
-#ifdef CONFIG_TUN_VNET_CROSS_LE
-static inline bool macvtap_legacy_is_little_endian(struct macvtap_queue *q)
-{
-	return q->flags & MACVTAP_VNET_BE ? false :
-		virtio_legacy_is_little_endian();
-}
-
-static long macvtap_get_vnet_be(struct macvtap_queue *q, int __user *sp)
-{
-	int s = !!(q->flags & MACVTAP_VNET_BE);
-
-	if (put_user(s, sp))
-		return -EFAULT;
-
-	return 0;
-}
-
-static long macvtap_set_vnet_be(struct macvtap_queue *q, int __user *sp)
-{
-	int s;
-
-	if (get_user(s, sp))
-		return -EFAULT;
-
-	if (s)
-		q->flags |= MACVTAP_VNET_BE;
-	else
-		q->flags &= ~MACVTAP_VNET_BE;
-
-	return 0;
-}
-#else
-static inline bool macvtap_legacy_is_little_endian(struct macvtap_queue *q)
-{
-	return virtio_legacy_is_little_endian();
-}
-
-static long macvtap_get_vnet_be(struct macvtap_queue *q, int __user *argp)
-{
-	return -EINVAL;
-}
-
-static long macvtap_set_vnet_be(struct macvtap_queue *q, int __user *argp)
-{
-	return -EINVAL;
-}
-#endif /* CONFIG_TUN_VNET_CROSS_LE */
-
-static inline bool macvtap_is_little_endian(struct macvtap_queue *q)
-{
-	return q->flags & MACVTAP_VNET_LE ||
-		macvtap_legacy_is_little_endian(q);
-}
-
-static inline u16 macvtap16_to_cpu(struct macvtap_queue *q, __virtio16 val)
-{
-	return __virtio16_to_cpu(macvtap_is_little_endian(q), val);
-}
-
-static inline __virtio16 cpu_to_macvtap16(struct macvtap_queue *q, u16 val)
-{
-	return __cpu_to_virtio16(macvtap_is_little_endian(q), val);
-}
-
-static struct proto macvtap_proto = {
-	.name = "macvtap",
-	.owner = THIS_MODULE,
-	.obj_size = sizeof (struct macvtap_queue),
-};
-
-/*
- * Variables for dealing with macvtaps device numbers.
- */
-static dev_t macvtap_major;
-#define MACVTAP_NUM_DEVS (1U << MINORBITS)
-static DEFINE_MUTEX(minor_lock);
-static DEFINE_IDR(minor_idr);
-
-#define GOODCOPY_LEN 128
-static const void *macvtap_net_namespace(struct device *d)
-{
-	struct net_device *dev = to_net_dev(d->parent);
-	return dev_net(dev);
-}
-
-static struct class macvtap_class = {
-	.name = "macvtap",
-	.owner = THIS_MODULE,
-	.ns_type = &net_ns_type_operations,
-	.namespace = macvtap_net_namespace,
-};
-static struct cdev macvtap_cdev;
-
-static const struct proto_ops macvtap_socket_ops;
-
-#define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \
-		      NETIF_F_TSO6 | NETIF_F_UFO)
-#define RX_OFFLOADS (NETIF_F_GRO | NETIF_F_LRO)
-#define TAP_FEATURES (NETIF_F_GSO | NETIF_F_SG | NETIF_F_FRAGLIST)
-
-static struct macvlan_dev *macvtap_get_vlan_rcu(const struct net_device *dev)
-{
-	return rcu_dereference(dev->rx_handler_data);
-}
-
-/*
- * RCU usage:
- * The macvtap_queue and the macvlan_dev are loosely coupled, the
- * pointers from one to the other can only be read while rcu_read_lock
- * or rtnl is held.
- *
- * Both the file and the macvlan_dev hold a reference on the macvtap_queue
- * through sock_hold(&q->sk). When the macvlan_dev goes away first,
- * q->vlan becomes inaccessible. When the files gets closed,
- * macvtap_get_queue() fails.
- *
- * There may still be references to the struct sock inside of the
- * queue from outbound SKBs, but these never reference back to the
- * file or the dev. The data structure is freed through __sk_free
- * when both our references and any pending SKBs are gone.
- */
-
-static int macvtap_enable_queue(struct net_device *dev, struct file *file,
-				struct macvtap_queue *q)
-{
-	struct macvlan_dev *vlan = netdev_priv(dev);
-	int err = -EINVAL;
-
-	ASSERT_RTNL();
-
-	if (q->enabled)
-		goto out;
-
-	err = 0;
-	rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
-	q->queue_index = vlan->numvtaps;
-	q->enabled = true;
-
-	vlan->numvtaps++;
-out:
-	return err;
-}
-
-/* Requires RTNL */
-static int macvtap_set_queue(struct net_device *dev, struct file *file,
-			     struct macvtap_queue *q)
-{
-	struct macvlan_dev *vlan = netdev_priv(dev);
-
-	if (vlan->numqueues == MAX_MACVTAP_QUEUES)
-		return -EBUSY;
-
-	rcu_assign_pointer(q->vlan, vlan);
-	rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
-	sock_hold(&q->sk);
-
-	q->file = file;
-	q->queue_index = vlan->numvtaps;
-	q->enabled = true;
-	file->private_data = q;
-	list_add_tail(&q->next, &vlan->queue_list);
-
-	vlan->numvtaps++;
-	vlan->numqueues++;
-
-	return 0;
-}
-
-static int macvtap_disable_queue(struct macvtap_queue *q)
-{
-	struct macvlan_dev *vlan;
-	struct macvtap_queue *nq;
-
-	ASSERT_RTNL();
-	if (!q->enabled)
-		return -EINVAL;
-
-	vlan = rtnl_dereference(q->vlan);
-
-	if (vlan) {
-		int index = q->queue_index;
-		BUG_ON(index >= vlan->numvtaps);
-		nq = rtnl_dereference(vlan->taps[vlan->numvtaps - 1]);
-		nq->queue_index = index;
-
-		rcu_assign_pointer(vlan->taps[index], nq);
-		RCU_INIT_POINTER(vlan->taps[vlan->numvtaps - 1], NULL);
-		q->enabled = false;
-
-		vlan->numvtaps--;
-	}
-
-	return 0;
-}
-
-/*
- * The file owning the queue got closed, give up both
- * the reference that the files holds as well as the
- * one from the macvlan_dev if that still exists.
- *
- * Using the spinlock makes sure that we don't get
- * to the queue again after destroying it.
- */
-static void macvtap_put_queue(struct macvtap_queue *q)
-{
-	struct macvlan_dev *vlan;
-
-	rtnl_lock();
-	vlan = rtnl_dereference(q->vlan);
-
-	if (vlan) {
-		if (q->enabled)
-			BUG_ON(macvtap_disable_queue(q));
-
-		vlan->numqueues--;
-		RCU_INIT_POINTER(q->vlan, NULL);
-		sock_put(&q->sk);
-		list_del_init(&q->next);
-	}
-
-	rtnl_unlock();
-
-	synchronize_rcu();
-	sock_put(&q->sk);
-}
-
-/*
- * Select a queue based on the rxq of the device on which this packet
- * arrived. If the incoming device is not mq, calculate a flow hash
- * to select a queue. If all fails, find the first available queue.
- * Cache vlan->numvtaps since it can become zero during the execution
- * of this function.
- */
-static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
-					       struct sk_buff *skb)
-{
-	struct macvlan_dev *vlan = netdev_priv(dev);
-	struct macvtap_queue *tap = NULL;
-	/* Access to taps array is protected by rcu, but access to numvtaps
-	 * isn't. Below we use it to lookup a queue, but treat it as a hint
-	 * and validate that the result isn't NULL - in case we are
-	 * racing against queue removal.
-	 */
-	int numvtaps = ACCESS_ONCE(vlan->numvtaps);
-	__u32 rxq;
-
-	if (!numvtaps)
-		goto out;
-
-	if (numvtaps == 1)
-		goto single;
-
-	/* Check if we can use flow to select a queue */
-	rxq = skb_get_hash(skb);
-	if (rxq) {
-		tap = rcu_dereference(vlan->taps[rxq % numvtaps]);
-		goto out;
-	}
-
-	if (likely(skb_rx_queue_recorded(skb))) {
-		rxq = skb_get_rx_queue(skb);
-
-		while (unlikely(rxq >= numvtaps))
-			rxq -= numvtaps;
-
-		tap = rcu_dereference(vlan->taps[rxq]);
-		goto out;
-	}
-
-single:
-	tap = rcu_dereference(vlan->taps[0]);
-out:
-	return tap;
-}
-
-/*
- * The net_device is going away, give up the reference
- * that it holds on all queues and safely set the pointer
- * from the queues to NULL.
- */
-static void macvtap_del_queues(struct net_device *dev)
-{
-	struct macvlan_dev *vlan = netdev_priv(dev);
-	struct macvtap_queue *q, *tmp;
-
-	ASSERT_RTNL();
-	list_for_each_entry_safe(q, tmp, &vlan->queue_list, next) {
-		list_del_init(&q->next);
-		RCU_INIT_POINTER(q->vlan, NULL);
-		if (q->enabled)
-			vlan->numvtaps--;
-		vlan->numqueues--;
-		sock_put(&q->sk);
-	}
-	BUG_ON(vlan->numvtaps);
-	BUG_ON(vlan->numqueues);
-	/* guarantee that any future macvtap_set_queue will fail */
-	vlan->numvtaps = MAX_MACVTAP_QUEUES;
-}
-
-static rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb)
-{
-	struct sk_buff *skb = *pskb;
-	struct net_device *dev = skb->dev;
-	struct macvlan_dev *vlan;
-	struct macvtap_queue *q;
-	netdev_features_t features = TAP_FEATURES;
-
-	vlan = macvtap_get_vlan_rcu(dev);
-	if (!vlan)
-		return RX_HANDLER_PASS;
-
-	q = macvtap_get_queue(dev, skb);
-	if (!q)
-		return RX_HANDLER_PASS;
-
-	if (__skb_array_full(&q->skb_array))
-		goto drop;
-
-	skb_push(skb, ETH_HLEN);
-
-	/* Apply the forward feature mask so that we perform segmentation
-	 * according to users wishes.  This only works if VNET_HDR is
-	 * enabled.
-	 */
-	if (q->flags & IFF_VNET_HDR)
-		features |= vlan->tap_features;
-	if (netif_needs_gso(skb, features)) {
-		struct sk_buff *segs = __skb_gso_segment(skb, features, false);
-
-		if (IS_ERR(segs))
-			goto drop;
-
-		if (!segs) {
-			if (skb_array_produce(&q->skb_array, skb))
-				goto drop;
-			goto wake_up;
-		}
-
-		consume_skb(skb);
-		while (segs) {
-			struct sk_buff *nskb = segs->next;
-
-			segs->next = NULL;
-			if (skb_array_produce(&q->skb_array, segs)) {
-				kfree_skb(segs);
-				kfree_skb_list(nskb);
-				break;
-			}
-			segs = nskb;
-		}
-	} else {
-		/* If we receive a partial checksum and the tap side
-		 * doesn't support checksum offload, compute the checksum.
-		 * Note: it doesn't matter which checksum feature to
-		 *        check, we either support them all or none.
-		 */
-		if (skb->ip_summed == CHECKSUM_PARTIAL &&
-		    !(features & NETIF_F_CSUM_MASK) &&
-		    skb_checksum_help(skb))
-			goto drop;
-		if (skb_array_produce(&q->skb_array, skb))
-			goto drop;
-	}
-
-wake_up:
-	wake_up_interruptible_poll(sk_sleep(&q->sk), POLLIN | POLLRDNORM | POLLRDBAND);
-	return RX_HANDLER_CONSUMED;
-
-drop:
-	/* Count errors/drops only here, thus don't care about args. */
-	macvlan_count_rx(vlan, 0, 0, 0);
-	kfree_skb(skb);
-	return RX_HANDLER_CONSUMED;
-}
-
-static int macvtap_get_minor(struct macvlan_dev *vlan)
-{
-	int retval = -ENOMEM;
-
-	mutex_lock(&minor_lock);
-	retval = idr_alloc(&minor_idr, vlan, 1, MACVTAP_NUM_DEVS, GFP_KERNEL);
-	if (retval >= 0) {
-		vlan->minor = retval;
-	} else if (retval == -ENOSPC) {
-		netdev_err(vlan->dev, "Too many macvtap devices\n");
-		retval = -EINVAL;
-	}
-	mutex_unlock(&minor_lock);
-	return retval < 0 ? retval : 0;
-}
-
-static void macvtap_free_minor(struct macvlan_dev *vlan)
-{
-	mutex_lock(&minor_lock);
-	if (vlan->minor) {
-		idr_remove(&minor_idr, vlan->minor);
-		vlan->minor = 0;
-	}
-	mutex_unlock(&minor_lock);
-}
-
-static struct net_device *dev_get_by_macvtap_minor(int minor)
-{
-	struct net_device *dev = NULL;
-	struct macvlan_dev *vlan;
-
-	mutex_lock(&minor_lock);
-	vlan = idr_find(&minor_idr, minor);
-	if (vlan) {
-		dev = vlan->dev;
-		dev_hold(dev);
-	}
-	mutex_unlock(&minor_lock);
-	return dev;
-}
-
-static int macvtap_newlink(struct net *src_net,
-			   struct net_device *dev,
-			   struct nlattr *tb[],
-			   struct nlattr *data[])
-{
-	struct macvlan_dev *vlan = netdev_priv(dev);
-	int err;
-
-	INIT_LIST_HEAD(&vlan->queue_list);
-
-	/* Since macvlan supports all offloads by default, make
-	 * tap support all offloads also.
-	 */
-	vlan->tap_features = TUN_OFFLOADS;
-
-	err = netdev_rx_handler_register(dev, macvtap_handle_frame, vlan);
-	if (err)
-		return err;
-
-	/* Don't put anything that may fail after macvlan_common_newlink
-	 * because we can't undo what it does.
-	 */
-	err = macvlan_common_newlink(src_net, dev, tb, data);
-	if (err) {
-		netdev_rx_handler_unregister(dev);
-		return err;
-	}
-
-	return 0;
-}
-
-static void macvtap_dellink(struct net_device *dev,
-			    struct list_head *head)
-{
-	netdev_rx_handler_unregister(dev);
-	macvtap_del_queues(dev);
-	macvlan_dellink(dev, head);
-}
-
-static void macvtap_setup(struct net_device *dev)
-{
-	macvlan_common_setup(dev);
-	dev->tx_queue_len = TUN_READQ_SIZE;
-}
-
-static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
-	.kind		= "macvtap",
-	.setup		= macvtap_setup,
-	.newlink	= macvtap_newlink,
-	.dellink	= macvtap_dellink,
-};
-
-
-static void macvtap_sock_write_space(struct sock *sk)
-{
-	wait_queue_head_t *wqueue;
-
-	if (!sock_writeable(sk) ||
-	    !test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags))
-		return;
-
-	wqueue = sk_sleep(sk);
-	if (wqueue && waitqueue_active(wqueue))
-		wake_up_interruptible_poll(wqueue, POLLOUT | POLLWRNORM | POLLWRBAND);
-}
-
-static void macvtap_sock_destruct(struct sock *sk)
-{
-	struct macvtap_queue *q = container_of(sk, struct macvtap_queue, sk);
-
-	skb_array_cleanup(&q->skb_array);
-}
-
-static int macvtap_open(struct inode *inode, struct file *file)
-{
-	struct net *net = current->nsproxy->net_ns;
-	struct net_device *dev;
-	struct macvtap_queue *q;
-	int err = -ENODEV;
-
-	rtnl_lock();
-	dev = dev_get_by_macvtap_minor(iminor(inode));
-	if (!dev)
-		goto err;
-
-	err = -ENOMEM;
-	q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
-					     &macvtap_proto, 0);
-	if (!q)
-		goto err;
-
-	RCU_INIT_POINTER(q->sock.wq, &q->wq);
-	init_waitqueue_head(&q->wq.wait);
-	q->sock.type = SOCK_RAW;
-	q->sock.state = SS_CONNECTED;
-	q->sock.file = file;
-	q->sock.ops = &macvtap_socket_ops;
-	sock_init_data(&q->sock, &q->sk);
-	q->sk.sk_write_space = macvtap_sock_write_space;
-	q->sk.sk_destruct = macvtap_sock_destruct;
-	q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
-	q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
-
-	/*
-	 * so far only KVM virtio_net uses macvtap, enable zero copy between
-	 * guest kernel and host kernel when lower device supports zerocopy
-	 *
-	 * The macvlan supports zerocopy iff the lower device supports zero
-	 * copy so we don't have to look at the lower device directly.
-	 */
-	if ((dev->features & NETIF_F_HIGHDMA) && (dev->features & NETIF_F_SG))
-		sock_set_flag(&q->sk, SOCK_ZEROCOPY);
-
-	err = -ENOMEM;
-	if (skb_array_init(&q->skb_array, dev->tx_queue_len, GFP_KERNEL))
-		goto err_array;
-
-	err = macvtap_set_queue(dev, file, q);
-	if (err)
-		goto err_queue;
-
-	dev_put(dev);
-
-	rtnl_unlock();
-	return err;
-
-err_queue:
-	skb_array_cleanup(&q->skb_array);
-err_array:
-	sock_put(&q->sk);
-err:
-	if (dev)
-		dev_put(dev);
-
-	rtnl_unlock();
-	return err;
-}
-
-static int macvtap_release(struct inode *inode, struct file *file)
-{
-	struct macvtap_queue *q = file->private_data;
-	macvtap_put_queue(q);
-	return 0;
-}
-
-static unsigned int macvtap_poll(struct file *file, poll_table * wait)
-{
-	struct macvtap_queue *q = file->private_data;
-	unsigned int mask = POLLERR;
-
-	if (!q)
-		goto out;
-
-	mask = 0;
-	poll_wait(file, &q->wq.wait, wait);
-
-	if (!skb_array_empty(&q->skb_array))
-		mask |= POLLIN | POLLRDNORM;
-
-	if (sock_writeable(&q->sk) ||
-	    (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &q->sock.flags) &&
-	     sock_writeable(&q->sk)))
-		mask |= POLLOUT | POLLWRNORM;
-
-out:
-	return mask;
-}
-
-static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad,
-						size_t len, size_t linear,
-						int noblock, int *err)
-{
-	struct sk_buff *skb;
-
-	/* Under a page?  Don't bother with paged skb. */
-	if (prepad + len < PAGE_SIZE || !linear)
-		linear = len;
-
-	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
-				   err, 0);
-	if (!skb)
-		return NULL;
-
-	skb_reserve(skb, prepad);
-	skb_put(skb, linear);
-	skb->data_len = len - linear;
-	skb->len += len - linear;
-
-	return skb;
-}
-
-/* Neighbour code has some assumptions on HH_DATA_MOD alignment */
-#define MACVTAP_RESERVE HH_DATA_OFF(ETH_HLEN)
-
-/* Get packet from user space buffer */
-static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
-				struct iov_iter *from, int noblock)
-{
-	int good_linear = SKB_MAX_HEAD(MACVTAP_RESERVE);
-	struct sk_buff *skb;
-	struct macvlan_dev *vlan;
-	unsigned long total_len = iov_iter_count(from);
-	unsigned long len = total_len;
-	int err;
-	struct virtio_net_hdr vnet_hdr = { 0 };
-	int vnet_hdr_len = 0;
-	int copylen = 0;
-	int depth;
-	bool zerocopy = false;
-	size_t linear;
-
-	if (q->flags & IFF_VNET_HDR) {
-		vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
-
-		err = -EINVAL;
-		if (len < vnet_hdr_len)
-			goto err;
-		len -= vnet_hdr_len;
-
-		err = -EFAULT;
-		if (!copy_from_iter_full(&vnet_hdr, sizeof(vnet_hdr), from))
-			goto err;
-		iov_iter_advance(from, vnet_hdr_len - sizeof(vnet_hdr));
-		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
-		     macvtap16_to_cpu(q, vnet_hdr.csum_start) +
-		     macvtap16_to_cpu(q, vnet_hdr.csum_offset) + 2 >
-			     macvtap16_to_cpu(q, vnet_hdr.hdr_len))
-			vnet_hdr.hdr_len = cpu_to_macvtap16(q,
-				 macvtap16_to_cpu(q, vnet_hdr.csum_start) +
-				 macvtap16_to_cpu(q, vnet_hdr.csum_offset) + 2);
-		err = -EINVAL;
-		if (macvtap16_to_cpu(q, vnet_hdr.hdr_len) > len)
-			goto err;
-	}
-
-	err = -EINVAL;
-	if (unlikely(len < ETH_HLEN))
-		goto err;
-
-	if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) {
-		struct iov_iter i;
-
-		copylen = vnet_hdr.hdr_len ?
-			macvtap16_to_cpu(q, vnet_hdr.hdr_len) : GOODCOPY_LEN;
-		if (copylen > good_linear)
-			copylen = good_linear;
-		else if (copylen < ETH_HLEN)
-			copylen = ETH_HLEN;
-		linear = copylen;
-		i = *from;
-		iov_iter_advance(&i, copylen);
-		if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS)
-			zerocopy = true;
-	}
-
-	if (!zerocopy) {
-		copylen = len;
-		linear = macvtap16_to_cpu(q, vnet_hdr.hdr_len);
-		if (linear > good_linear)
-			linear = good_linear;
-		else if (linear < ETH_HLEN)
-			linear = ETH_HLEN;
-	}
-
-	skb = macvtap_alloc_skb(&q->sk, MACVTAP_RESERVE, copylen,
-				linear, noblock, &err);
-	if (!skb)
-		goto err;
-
-	if (zerocopy)
-		err = zerocopy_sg_from_iter(skb, from);
-	else
-		err = skb_copy_datagram_from_iter(skb, 0, from, len);
-
-	if (err)
-		goto err_kfree;
-
-	skb_set_network_header(skb, ETH_HLEN);
-	skb_reset_mac_header(skb);
-	skb->protocol = eth_hdr(skb)->h_proto;
-
-	if (vnet_hdr_len) {
-		err = virtio_net_hdr_to_skb(skb, &vnet_hdr,
-					    macvtap_is_little_endian(q));
-		if (err)
-			goto err_kfree;
-	}
-
-	skb_probe_transport_header(skb, ETH_HLEN);
-
-	/* Move network header to the right position for VLAN tagged packets */
-	if ((skb->protocol == htons(ETH_P_8021Q) ||
-	     skb->protocol == htons(ETH_P_8021AD)) &&
-	    __vlan_get_protocol(skb, skb->protocol, &depth) != 0)
-		skb_set_network_header(skb, depth);
-
-	rcu_read_lock();
-	vlan = rcu_dereference(q->vlan);
-	/* copy skb_ubuf_info for callback when skb has no error */
-	if (zerocopy) {
-		skb_shinfo(skb)->destructor_arg = m->msg_control;
-		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
-		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
-	} else if (m && m->msg_control) {
-		struct ubuf_info *uarg = m->msg_control;
-		uarg->callback(uarg, false);
-	}
-
-	if (vlan) {
-		skb->dev = vlan->dev;
-		dev_queue_xmit(skb);
-	} else {
-		kfree_skb(skb);
-	}
-	rcu_read_unlock();
-
-	return total_len;
-
-err_kfree:
-	kfree_skb(skb);
-
-err:
-	rcu_read_lock();
-	vlan = rcu_dereference(q->vlan);
-	if (vlan)
-		this_cpu_inc(vlan->pcpu_stats->tx_dropped);
-	rcu_read_unlock();
-
-	return err;
-}
-
-static ssize_t macvtap_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
-	struct file *file = iocb->ki_filp;
-	struct macvtap_queue *q = file->private_data;
-
-	return macvtap_get_user(q, NULL, from, file->f_flags & O_NONBLOCK);
-}
-
-/* Put packet to the user space buffer */
-static ssize_t macvtap_put_user(struct macvtap_queue *q,
-				const struct sk_buff *skb,
-				struct iov_iter *iter)
-{
-	int ret;
-	int vnet_hdr_len = 0;
-	int vlan_offset = 0;
-	int total;
-
-	if (q->flags & IFF_VNET_HDR) {
-		struct virtio_net_hdr vnet_hdr;
-		vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
-		if (iov_iter_count(iter) < vnet_hdr_len)
-			return -EINVAL;
-
-		if (virtio_net_hdr_from_skb(skb, &vnet_hdr,
-					    macvtap_is_little_endian(q), true))
-			BUG();
-
-		if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter) !=
-		    sizeof(vnet_hdr))
-			return -EFAULT;
-
-		iov_iter_advance(iter, vnet_hdr_len - sizeof(vnet_hdr));
-	}
-	total = vnet_hdr_len;
-	total += skb->len;
-
-	if (skb_vlan_tag_present(skb)) {
-		struct {
-			__be16 h_vlan_proto;
-			__be16 h_vlan_TCI;
-		} veth;
-		veth.h_vlan_proto = skb->vlan_proto;
-		veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));
-
-		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
-		total += VLAN_HLEN;
-
-		ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
-		if (ret || !iov_iter_count(iter))
-			goto done;
-
-		ret = copy_to_iter(&veth, sizeof(veth), iter);
-		if (ret != sizeof(veth) || !iov_iter_count(iter))
-			goto done;
-	}
-
-	ret = skb_copy_datagram_iter(skb, vlan_offset, iter,
-				     skb->len - vlan_offset);
-
-done:
-	return ret ? ret : total;
-}
-
-static ssize_t macvtap_do_read(struct macvtap_queue *q,
-			       struct iov_iter *to,
-			       int noblock)
-{
-	DEFINE_WAIT(wait);
-	struct sk_buff *skb;
-	ssize_t ret = 0;
-
-	if (!iov_iter_count(to))
-		return 0;
-
-	while (1) {
-		if (!noblock)
-			prepare_to_wait(sk_sleep(&q->sk), &wait,
-					TASK_INTERRUPTIBLE);
-
-		/* Read frames from the queue */
-		skb = skb_array_consume(&q->skb_array);
-		if (skb)
-			break;
-		if (noblock) {
-			ret = -EAGAIN;
-			break;
-		}
-		if (signal_pending(current)) {
-			ret = -ERESTARTSYS;
-			break;
-		}
-		/* Nothing to read, let's sleep */
-		schedule();
-	}
-	if (!noblock)
-		finish_wait(sk_sleep(&q->sk), &wait);
-
-	if (skb) {
-		ret = macvtap_put_user(q, skb, to);
-		if (unlikely(ret < 0))
-			kfree_skb(skb);
-		else
-			consume_skb(skb);
-	}
-	return ret;
-}
-
-static ssize_t macvtap_read_iter(struct kiocb *iocb, struct iov_iter *to)
-{
-	struct file *file = iocb->ki_filp;
-	struct macvtap_queue *q = file->private_data;
-	ssize_t len = iov_iter_count(to), ret;
-
-	ret = macvtap_do_read(q, to, file->f_flags & O_NONBLOCK);
-	ret = min_t(ssize_t, ret, len);
-	if (ret > 0)
-		iocb->ki_pos = ret;
-	return ret;
-}
-
-static struct macvlan_dev *macvtap_get_vlan(struct macvtap_queue *q)
-{
-	struct macvlan_dev *vlan;
-
-	ASSERT_RTNL();
-	vlan = rtnl_dereference(q->vlan);
-	if (vlan)
-		dev_hold(vlan->dev);
-
-	return vlan;
-}
-
-static void macvtap_put_vlan(struct macvlan_dev *vlan)
-{
-	dev_put(vlan->dev);
-}
-
-static int macvtap_ioctl_set_queue(struct file *file, unsigned int flags)
-{
-	struct macvtap_queue *q = file->private_data;
-	struct macvlan_dev *vlan;
-	int ret;
-
-	vlan = macvtap_get_vlan(q);
-	if (!vlan)
-		return -EINVAL;
-
-	if (flags & IFF_ATTACH_QUEUE)
-		ret = macvtap_enable_queue(vlan->dev, file, q);
-	else if (flags & IFF_DETACH_QUEUE)
-		ret = macvtap_disable_queue(q);
-	else
-		ret = -EINVAL;
-
-	macvtap_put_vlan(vlan);
-	return ret;
-}
-
-static int set_offload(struct macvtap_queue *q, unsigned long arg)
-{
-	struct macvlan_dev *vlan;
-	netdev_features_t features;
-	netdev_features_t feature_mask = 0;
-
-	vlan = rtnl_dereference(q->vlan);
-	if (!vlan)
-		return -ENOLINK;
-
-	features = vlan->dev->features;
-
-	if (arg & TUN_F_CSUM) {
-		feature_mask = NETIF_F_HW_CSUM;
-
-		if (arg & (TUN_F_TSO4 | TUN_F_TSO6)) {
-			if (arg & TUN_F_TSO_ECN)
-				feature_mask |= NETIF_F_TSO_ECN;
-			if (arg & TUN_F_TSO4)
-				feature_mask |= NETIF_F_TSO;
-			if (arg & TUN_F_TSO6)
-				feature_mask |= NETIF_F_TSO6;
-		}
-
-		if (arg & TUN_F_UFO)
-			feature_mask |= NETIF_F_UFO;
-	}
-
-	/* tun/tap driver inverts the usage for TSO offloads, where
-	 * setting the TSO bit means that the userspace wants to
-	 * accept TSO frames and turning it off means that user space
-	 * does not support TSO.
-	 * For macvtap, we have to invert it to mean the same thing.
-	 * When user space turns off TSO, we turn off GSO/LRO so that
-	 * user-space will not receive TSO frames.
-	 */
-	if (feature_mask & (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_UFO))
-		features |= RX_OFFLOADS;
-	else
-		features &= ~RX_OFFLOADS;
-
-	/* tap_features are the same as features on tun/tap and
-	 * reflect user expectations.
-	 */
-	vlan->tap_features = feature_mask;
-	vlan->set_features = features;
-	netdev_update_features(vlan->dev);
-
-	return 0;
-}
-
-/*
- * provide compatibility with generic tun/tap interface
- */
-static long macvtap_ioctl(struct file *file, unsigned int cmd,
-			  unsigned long arg)
-{
-	struct macvtap_queue *q = file->private_data;
-	struct macvlan_dev *vlan;
-	void __user *argp = (void __user *)arg;
-	struct ifreq __user *ifr = argp;
-	unsigned int __user *up = argp;
-	unsigned short u;
-	int __user *sp = argp;
-	struct sockaddr sa;
-	int s;
-	int ret;
-
-	switch (cmd) {
-	case TUNSETIFF:
-		/* ignore the name, just look at flags */
-		if (get_user(u, &ifr->ifr_flags))
-			return -EFAULT;
-
-		ret = 0;
-		if ((u & ~MACVTAP_FEATURES) != (IFF_NO_PI | IFF_TAP))
-			ret = -EINVAL;
-		else
-			q->flags = (q->flags & ~MACVTAP_FEATURES) | u;
-
-		return ret;
-
-	case TUNGETIFF:
-		rtnl_lock();
-		vlan = macvtap_get_vlan(q);
-		if (!vlan) {
-			rtnl_unlock();
-			return -ENOLINK;
-		}
-
-		ret = 0;
-		u = q->flags;
-		if (copy_to_user(&ifr->ifr_name, vlan->dev->name, IFNAMSIZ) ||
-		    put_user(u, &ifr->ifr_flags))
-			ret = -EFAULT;
-		macvtap_put_vlan(vlan);
-		rtnl_unlock();
-		return ret;
-
-	case TUNSETQUEUE:
-		if (get_user(u, &ifr->ifr_flags))
-			return -EFAULT;
-		rtnl_lock();
-		ret = macvtap_ioctl_set_queue(file, u);
-		rtnl_unlock();
-		return ret;
-
-	case TUNGETFEATURES:
-		if (put_user(IFF_TAP | IFF_NO_PI | MACVTAP_FEATURES, up))
-			return -EFAULT;
-		return 0;
-
-	case TUNSETSNDBUF:
-		if (get_user(s, sp))
-			return -EFAULT;
-
-		q->sk.sk_sndbuf = s;
-		return 0;
-
-	case TUNGETVNETHDRSZ:
-		s = q->vnet_hdr_sz;
-		if (put_user(s, sp))
-			return -EFAULT;
-		return 0;
-
-	case TUNSETVNETHDRSZ:
-		if (get_user(s, sp))
-			return -EFAULT;
-		if (s < (int)sizeof(struct virtio_net_hdr))
-			return -EINVAL;
-
-		q->vnet_hdr_sz = s;
-		return 0;
-
-	case TUNGETVNETLE:
-		s = !!(q->flags & MACVTAP_VNET_LE);
-		if (put_user(s, sp))
-			return -EFAULT;
-		return 0;
-
-	case TUNSETVNETLE:
-		if (get_user(s, sp))
-			return -EFAULT;
-		if (s)
-			q->flags |= MACVTAP_VNET_LE;
-		else
-			q->flags &= ~MACVTAP_VNET_LE;
-		return 0;
-
-	case TUNGETVNETBE:
-		return macvtap_get_vnet_be(q, sp);
-
-	case TUNSETVNETBE:
-		return macvtap_set_vnet_be(q, sp);
-
-	case TUNSETOFFLOAD:
-		/* let the user check for future flags */
-		if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
-			    TUN_F_TSO_ECN | TUN_F_UFO))
-			return -EINVAL;
-
-		rtnl_lock();
-		ret = set_offload(q, arg);
-		rtnl_unlock();
-		return ret;
-
-	case SIOCGIFHWADDR:
-		rtnl_lock();
-		vlan = macvtap_get_vlan(q);
-		if (!vlan) {
-			rtnl_unlock();
-			return -ENOLINK;
-		}
-		ret = 0;
-		u = vlan->dev->type;
-		if (copy_to_user(&ifr->ifr_name, vlan->dev->name, IFNAMSIZ) ||
-		    copy_to_user(&ifr->ifr_hwaddr.sa_data, vlan->dev->dev_addr, ETH_ALEN) ||
-		    put_user(u, &ifr->ifr_hwaddr.sa_family))
-			ret = -EFAULT;
-		macvtap_put_vlan(vlan);
-		rtnl_unlock();
-		return ret;
-
-	case SIOCSIFHWADDR:
-		if (copy_from_user(&sa, &ifr->ifr_hwaddr, sizeof(sa)))
-			return -EFAULT;
-		rtnl_lock();
-		vlan = macvtap_get_vlan(q);
-		if (!vlan) {
-			rtnl_unlock();
-			return -ENOLINK;
-		}
-		ret = dev_set_mac_address(vlan->dev, &sa);
-		macvtap_put_vlan(vlan);
-		rtnl_unlock();
-		return ret;
-
-	default:
-		return -EINVAL;
-	}
-}
-
-#ifdef CONFIG_COMPAT
-static long macvtap_compat_ioctl(struct file *file, unsigned int cmd,
-				 unsigned long arg)
-{
-	return macvtap_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
-}
-#endif
-
-static const struct file_operations macvtap_fops = {
-	.owner		= THIS_MODULE,
-	.open		= macvtap_open,
-	.release	= macvtap_release,
-	.read_iter	= macvtap_read_iter,
-	.write_iter	= macvtap_write_iter,
-	.poll		= macvtap_poll,
-	.llseek		= no_llseek,
-	.unlocked_ioctl	= macvtap_ioctl,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl	= macvtap_compat_ioctl,
-#endif
-};
-
-static int macvtap_sendmsg(struct socket *sock, struct msghdr *m,
-			   size_t total_len)
-{
-	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
-	return macvtap_get_user(q, m, &m->msg_iter, m->msg_flags & MSG_DONTWAIT);
-}
-
-static int macvtap_recvmsg(struct socket *sock, struct msghdr *m,
-			   size_t total_len, int flags)
-{
-	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
-	int ret;
-	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
-		return -EINVAL;
-	ret = macvtap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT);
-	if (ret > total_len) {
-		m->msg_flags |= MSG_TRUNC;
-		ret = flags & MSG_TRUNC ? ret : total_len;
-	}
-	return ret;
-}
-
-static int macvtap_peek_len(struct socket *sock)
-{
-	struct macvtap_queue *q = container_of(sock, struct macvtap_queue,
-					       sock);
-	return skb_array_peek_len(&q->skb_array);
-}
-
-/* Ops structure to mimic raw sockets with tun */
-static const struct proto_ops macvtap_socket_ops = {
-	.sendmsg = macvtap_sendmsg,
-	.recvmsg = macvtap_recvmsg,
-	.peek_len = macvtap_peek_len,
-};
-
-/* Get an underlying socket object from tun file.  Returns error unless file is
- * attached to a device.  The returned object works like a packet socket, it
- * can be used for sock_sendmsg/sock_recvmsg.  The caller is responsible for
- * holding a reference to the file for as long as the socket is in use. */
-struct socket *macvtap_get_socket(struct file *file)
-{
-	struct macvtap_queue *q;
-	if (file->f_op != &macvtap_fops)
-		return ERR_PTR(-EINVAL);
-	q = file->private_data;
-	if (!q)
-		return ERR_PTR(-EBADFD);
-	return &q->sock;
-}
-EXPORT_SYMBOL_GPL(macvtap_get_socket);
-
-static int macvtap_queue_resize(struct macvlan_dev *vlan)
-{
-	struct net_device *dev = vlan->dev;
-	struct macvtap_queue *q;
-	struct skb_array **arrays;
-	int n = vlan->numqueues;
-	int ret, i = 0;
-
-	arrays = kmalloc(sizeof *arrays * n, GFP_KERNEL);
-	if (!arrays)
-		return -ENOMEM;
-
-	list_for_each_entry(q, &vlan->queue_list, next)
-		arrays[i++] = &q->skb_array;
-
-	ret = skb_array_resize_multiple(arrays, n,
-					dev->tx_queue_len, GFP_KERNEL);
-
-	kfree(arrays);
-	return ret;
-}
-
-static int macvtap_device_event(struct notifier_block *unused,
-				unsigned long event, void *ptr)
-{
-	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-	struct macvlan_dev *vlan;
-	struct device *classdev;
-	dev_t devt;
-	int err;
-	char tap_name[IFNAMSIZ];
-
-	if (dev->rtnl_link_ops != &macvtap_link_ops)
-		return NOTIFY_DONE;
-
-	snprintf(tap_name, IFNAMSIZ, "tap%d", dev->ifindex);
-	vlan = netdev_priv(dev);
-
-	switch (event) {
-	case NETDEV_REGISTER:
-		/* Create the device node here after the network device has
-		 * been registered but before register_netdevice has
-		 * finished running.
-		 */
-		err = macvtap_get_minor(vlan);
-		if (err)
-			return notifier_from_errno(err);
-
-		devt = MKDEV(MAJOR(macvtap_major), vlan->minor);
-		classdev = device_create(&macvtap_class, &dev->dev, devt,
-					 dev, tap_name);
-		if (IS_ERR(classdev)) {
-			macvtap_free_minor(vlan);
-			return notifier_from_errno(PTR_ERR(classdev));
-		}
-		err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj,
-					tap_name);
-		if (err)
-			return notifier_from_errno(err);
-		break;
-	case NETDEV_UNREGISTER:
-		/* vlan->minor == 0 if NETDEV_REGISTER above failed */
-		if (vlan->minor == 0)
-			break;
-		sysfs_remove_link(&dev->dev.kobj, tap_name);
-		devt = MKDEV(MAJOR(macvtap_major), vlan->minor);
-		device_destroy(&macvtap_class, devt);
-		macvtap_free_minor(vlan);
-		break;
-	case NETDEV_CHANGE_TX_QUEUE_LEN:
-		if (macvtap_queue_resize(vlan))
-			return NOTIFY_BAD;
-		break;
-	}
-
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block macvtap_notifier_block __read_mostly = {
-	.notifier_call	= macvtap_device_event,
-};
-
-static int macvtap_init(void)
-{
-	int err;
-
-	err = alloc_chrdev_region(&macvtap_major, 0,
-				MACVTAP_NUM_DEVS, "macvtap");
-	if (err)
-		goto out1;
-
-	cdev_init(&macvtap_cdev, &macvtap_fops);
-	err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS);
-	if (err)
-		goto out2;
-
-	err = class_register(&macvtap_class);
-	if (err)
-		goto out3;
-
-	err = register_netdevice_notifier(&macvtap_notifier_block);
-	if (err)
-		goto out4;
-
-	err = macvlan_link_register(&macvtap_link_ops);
-	if (err)
-		goto out5;
-
-	return 0;
-
-out5:
-	unregister_netdevice_notifier(&macvtap_notifier_block);
-out4:
-	class_unregister(&macvtap_class);
-out3:
-	cdev_del(&macvtap_cdev);
-out2:
-	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
-out1:
-	return err;
-}
-module_init(macvtap_init);
-
-static void macvtap_exit(void)
-{
-	rtnl_link_unregister(&macvtap_link_ops);
-	unregister_netdevice_notifier(&macvtap_notifier_block);
-	class_unregister(&macvtap_class);
-	cdev_del(&macvtap_cdev);
-	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
-	idr_destroy(&minor_idr);
-}
-module_exit(macvtap_exit);
-
-MODULE_ALIAS_RTNL_LINK("macvtap");
-MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
-MODULE_LICENSE("GPL");
diff --git a/drivers/net/macvtap_main.c b/drivers/net/macvtap_main.c
new file mode 100644
index 000000000000..96ffa60c5a36
--- /dev/null
+++ b/drivers/net/macvtap_main.c
@@ -0,0 +1,218 @@
+#include <linux/etherdevice.h>
+#include <linux/if_macvlan.h>
+#include <linux/if_macvtap.h>
+#include <linux/if_vlan.h>
+#include <linux/interrupt.h>
+#include <linux/nsproxy.h>
+#include <linux/compat.h>
+#include <linux/if_tun.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/cache.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/cdev.h>
+#include <linux/idr.h>
+#include <linux/fs.h>
+#include <linux/uio.h>
+
+#include <net/net_namespace.h>
+#include <net/rtnetlink.h>
+#include <net/sock.h>
+#include <linux/virtio_net.h>
+#include <linux/skb_array.h>
+
+/*
+ * Variables for dealing with macvtaps device numbers.
+ */
+static dev_t macvtap_major;
+#define MACVTAP_NUM_DEVS (1U << MINORBITS)
+
+static const void *macvtap_net_namespace(struct device *d)
+{
+	struct net_device *dev = to_net_dev(d->parent);
+	return dev_net(dev);
+}
+
+static struct class macvtap_class = {
+	.name = "macvtap",
+	.owner = THIS_MODULE,
+	.ns_type = &net_ns_type_operations,
+	.namespace = macvtap_net_namespace,
+};
+static struct cdev macvtap_cdev;
+
+#define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \
+		      NETIF_F_TSO6 | NETIF_F_UFO)
+
+static int macvtap_newlink(struct net *src_net,
+			   struct net_device *dev,
+			   struct nlattr *tb[],
+			   struct nlattr *data[])
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+	int err;
+
+	INIT_LIST_HEAD(&vlan->queue_list);
+
+	/* Since macvlan supports all offloads by default, make
+	 * tap support all offloads also.
+	 */
+	vlan->tap_features = TUN_OFFLOADS;
+
+	err = netdev_rx_handler_register(dev, macvtap_handle_frame, vlan);
+	if (err)
+		return err;
+
+	/* Don't put anything that may fail after macvlan_common_newlink
+	 * because we can't undo what it does.
+	 */
+	err = macvlan_common_newlink(src_net, dev, tb, data);
+	if (err) {
+		netdev_rx_handler_unregister(dev);
+		return err;
+	}
+
+	return 0;
+}
+
+static void macvtap_dellink(struct net_device *dev,
+			    struct list_head *head)
+{
+	netdev_rx_handler_unregister(dev);
+	macvtap_del_queues(dev);
+	macvlan_dellink(dev, head);
+}
+
+static void macvtap_setup(struct net_device *dev)
+{
+	macvlan_common_setup(dev);
+	dev->tx_queue_len = TUN_READQ_SIZE;
+}
+
+static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
+	.kind		= "macvtap",
+	.setup		= macvtap_setup,
+	.newlink	= macvtap_newlink,
+	.dellink	= macvtap_dellink,
+};
+
+static int macvtap_device_event(struct notifier_block *unused,
+				unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct macvlan_dev *vlan;
+	struct device *classdev;
+	dev_t devt;
+	int err;
+	char tap_name[IFNAMSIZ];
+
+	if (dev->rtnl_link_ops != &macvtap_link_ops)
+		return NOTIFY_DONE;
+
+	snprintf(tap_name, IFNAMSIZ, "tap%d", dev->ifindex);
+	vlan = netdev_priv(dev);
+
+	switch (event) {
+	case NETDEV_REGISTER:
+		/* Create the device node here after the network device has
+		 * been registered but before register_netdevice has
+		 * finished running.
+		 */
+		err = macvtap_get_minor(vlan);
+		if (err)
+			return notifier_from_errno(err);
+
+		devt = MKDEV(MAJOR(macvtap_major), vlan->minor);
+		classdev = device_create(&macvtap_class, &dev->dev, devt,
+					 dev, tap_name);
+		if (IS_ERR(classdev)) {
+			macvtap_free_minor(vlan);
+			return notifier_from_errno(PTR_ERR(classdev));
+		}
+		err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj,
+					tap_name);
+		if (err)
+			return notifier_from_errno(err);
+		break;
+	case NETDEV_UNREGISTER:
+		/* vlan->minor == 0 if NETDEV_REGISTER above failed */
+		if (vlan->minor == 0)
+			break;
+		sysfs_remove_link(&dev->dev.kobj, tap_name);
+		devt = MKDEV(MAJOR(macvtap_major), vlan->minor);
+		device_destroy(&macvtap_class, devt);
+		macvtap_free_minor(vlan);
+		break;
+	case NETDEV_CHANGE_TX_QUEUE_LEN:
+		if (macvtap_queue_resize(vlan))
+			return NOTIFY_BAD;
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block macvtap_notifier_block __read_mostly = {
+	.notifier_call	= macvtap_device_event,
+};
+
+extern struct file_operations macvtap_fops;
+static int macvtap_init(void)
+{
+	int err;
+
+	err = alloc_chrdev_region(&macvtap_major, 0,
+				MACVTAP_NUM_DEVS, "macvtap");
+	if (err)
+		goto out1;
+
+	cdev_init(&macvtap_cdev, &macvtap_fops);
+	err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS);
+	if (err)
+		goto out2;
+
+	err = class_register(&macvtap_class);
+	if (err)
+		goto out3;
+
+	err = register_netdevice_notifier(&macvtap_notifier_block);
+	if (err)
+		goto out4;
+
+	err = macvlan_link_register(&macvtap_link_ops);
+	if (err)
+		goto out5;
+
+	return 0;
+
+out5:
+	unregister_netdevice_notifier(&macvtap_notifier_block);
+out4:
+	class_unregister(&macvtap_class);
+out3:
+	cdev_del(&macvtap_cdev);
+out2:
+	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
+out1:
+	return err;
+}
+module_init(macvtap_init);
+
+extern struct idr minor_idr;
+static void macvtap_exit(void)
+{
+	rtnl_link_unregister(&macvtap_link_ops);
+	unregister_netdevice_notifier(&macvtap_notifier_block);
+	class_unregister(&macvtap_class);
+	cdev_del(&macvtap_cdev);
+	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
+	idr_destroy(&minor_idr);
+}
+module_exit(macvtap_exit);
+
+MODULE_ALIAS_RTNL_LINK("macvtap");
+MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/tap.c b/drivers/net/tap.c
new file mode 100644
index 000000000000..6f6228e4fd3f
--- /dev/null
+++ b/drivers/net/tap.c
@@ -0,0 +1,1186 @@
+#include <linux/etherdevice.h>
+#include <linux/if_macvlan.h>
+#include <linux/if_vlan.h>
+#include <linux/interrupt.h>
+#include <linux/nsproxy.h>
+#include <linux/compat.h>
+#include <linux/if_tun.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/cache.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/cdev.h>
+#include <linux/idr.h>
+#include <linux/fs.h>
+#include <linux/uio.h>
+
+#include <net/net_namespace.h>
+#include <net/rtnetlink.h>
+#include <net/sock.h>
+#include <linux/virtio_net.h>
+#include <linux/skb_array.h>
+
+/*
+ * A macvtap queue is the central object of this driver, it connects
+ * an open character device to a macvlan interface. There can be
+ * multiple queues on one interface, which map back to queues
+ * implemented in hardware on the underlying device.
+ *
+ * macvtap_proto is used to allocate queues through the sock allocation
+ * mechanism.
+ *
+ */
+struct macvtap_queue {
+	struct sock sk;
+	struct socket sock;
+	struct socket_wq wq;
+	int vnet_hdr_sz;
+	struct macvlan_dev __rcu *vlan;
+	struct file *file;
+	unsigned int flags;
+	u16 queue_index;
+	bool enabled;
+	struct list_head next;
+	struct skb_array skb_array;
+};
+
+#define MACVTAP_FEATURES (IFF_VNET_HDR | IFF_MULTI_QUEUE)
+
+#define MACVTAP_VNET_LE 0x80000000
+#define MACVTAP_VNET_BE 0x40000000
+
+#ifdef CONFIG_TUN_VNET_CROSS_LE
+static inline bool macvtap_legacy_is_little_endian(struct macvtap_queue *q)
+{
+	return q->flags & MACVTAP_VNET_BE ? false :
+		virtio_legacy_is_little_endian();
+}
+
+static long macvtap_get_vnet_be(struct macvtap_queue *q, int __user *sp)
+{
+	int s = !!(q->flags & MACVTAP_VNET_BE);
+
+	if (put_user(s, sp))
+		return -EFAULT;
+
+	return 0;
+}
+
+static long macvtap_set_vnet_be(struct macvtap_queue *q, int __user *sp)
+{
+	int s;
+
+	if (get_user(s, sp))
+		return -EFAULT;
+
+	if (s)
+		q->flags |= MACVTAP_VNET_BE;
+	else
+		q->flags &= ~MACVTAP_VNET_BE;
+
+	return 0;
+}
+#else
+static inline bool macvtap_legacy_is_little_endian(struct macvtap_queue *q)
+{
+	return virtio_legacy_is_little_endian();
+}
+
+static long macvtap_get_vnet_be(struct macvtap_queue *q, int __user *argp)
+{
+	return -EINVAL;
+}
+
+static long macvtap_set_vnet_be(struct macvtap_queue *q, int __user *argp)
+{
+	return -EINVAL;
+}
+#endif /* CONFIG_TUN_VNET_CROSS_LE */
+
+static inline bool macvtap_is_little_endian(struct macvtap_queue *q)
+{
+	return q->flags & MACVTAP_VNET_LE ||
+		macvtap_legacy_is_little_endian(q);
+}
+
+static inline u16 macvtap16_to_cpu(struct macvtap_queue *q, __virtio16 val)
+{
+	return __virtio16_to_cpu(macvtap_is_little_endian(q), val);
+}
+
+static inline __virtio16 cpu_to_macvtap16(struct macvtap_queue *q, u16 val)
+{
+	return __cpu_to_virtio16(macvtap_is_little_endian(q), val);
+}
+
+static struct proto macvtap_proto = {
+	.name = "macvtap",
+	.owner = THIS_MODULE,
+	.obj_size = sizeof (struct macvtap_queue),
+};
+
+#define MACVTAP_NUM_DEVS (1U << MINORBITS)
+static DEFINE_MUTEX(minor_lock);
+DEFINE_IDR(minor_idr);
+
+#define GOODCOPY_LEN 128
+
+static const struct proto_ops macvtap_socket_ops;
+
+#define RX_OFFLOADS (NETIF_F_GRO | NETIF_F_LRO)
+#define TAP_FEATURES (NETIF_F_GSO | NETIF_F_SG | NETIF_F_FRAGLIST)
+
+static struct macvlan_dev *macvtap_get_vlan_rcu(const struct net_device *dev)
+{
+	return rcu_dereference(dev->rx_handler_data);
+}
+
+/*
+ * RCU usage:
+ * The macvtap_queue and the macvlan_dev are loosely coupled, the
+ * pointers from one to the other can only be read while rcu_read_lock
+ * or rtnl is held.
+ *
+ * Both the file and the macvlan_dev hold a reference on the macvtap_queue
+ * through sock_hold(&q->sk). When the macvlan_dev goes away first,
+ * q->vlan becomes inaccessible. When the files gets closed,
+ * macvtap_get_queue() fails.
+ *
+ * There may still be references to the struct sock inside of the
+ * queue from outbound SKBs, but these never reference back to the
+ * file or the dev. The data structure is freed through __sk_free
+ * when both our references and any pending SKBs are gone.
+ */
+
+static int macvtap_enable_queue(struct net_device *dev, struct file *file,
+				struct macvtap_queue *q)
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+	int err = -EINVAL;
+
+	ASSERT_RTNL();
+
+	if (q->enabled)
+		goto out;
+
+	err = 0;
+	rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
+	q->queue_index = vlan->numvtaps;
+	q->enabled = true;
+
+	vlan->numvtaps++;
+out:
+	return err;
+}
+
+/* Requires RTNL */
+static int macvtap_set_queue(struct net_device *dev, struct file *file,
+			     struct macvtap_queue *q)
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+
+	if (vlan->numqueues == MAX_MACVTAP_QUEUES)
+		return -EBUSY;
+
+	rcu_assign_pointer(q->vlan, vlan);
+	rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
+	sock_hold(&q->sk);
+
+	q->file = file;
+	q->queue_index = vlan->numvtaps;
+	q->enabled = true;
+	file->private_data = q;
+	list_add_tail(&q->next, &vlan->queue_list);
+
+	vlan->numvtaps++;
+	vlan->numqueues++;
+
+	return 0;
+}
+
+static int macvtap_disable_queue(struct macvtap_queue *q)
+{
+	struct macvlan_dev *vlan;
+	struct macvtap_queue *nq;
+
+	ASSERT_RTNL();
+	if (!q->enabled)
+		return -EINVAL;
+
+	vlan = rtnl_dereference(q->vlan);
+
+	if (vlan) {
+		int index = q->queue_index;
+		BUG_ON(index >= vlan->numvtaps);
+		nq = rtnl_dereference(vlan->taps[vlan->numvtaps - 1]);
+		nq->queue_index = index;
+
+		rcu_assign_pointer(vlan->taps[index], nq);
+		RCU_INIT_POINTER(vlan->taps[vlan->numvtaps - 1], NULL);
+		q->enabled = false;
+
+		vlan->numvtaps--;
+	}
+
+	return 0;
+}
+
+/*
+ * The file owning the queue got closed, give up both
+ * the reference that the files holds as well as the
+ * one from the macvlan_dev if that still exists.
+ *
+ * Using the spinlock makes sure that we don't get
+ * to the queue again after destroying it.
+ */
+static void macvtap_put_queue(struct macvtap_queue *q)
+{
+	struct macvlan_dev *vlan;
+
+	rtnl_lock();
+	vlan = rtnl_dereference(q->vlan);
+
+	if (vlan) {
+		if (q->enabled)
+			BUG_ON(macvtap_disable_queue(q));
+
+		vlan->numqueues--;
+		RCU_INIT_POINTER(q->vlan, NULL);
+		sock_put(&q->sk);
+		list_del_init(&q->next);
+	}
+
+	rtnl_unlock();
+
+	synchronize_rcu();
+	sock_put(&q->sk);
+}
+
+/*
+ * Select a queue based on the rxq of the device on which this packet
+ * arrived. If the incoming device is not mq, calculate a flow hash
+ * to select a queue. If all fails, find the first available queue.
+ * Cache vlan->numvtaps since it can become zero during the execution
+ * of this function.
+ */
+static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
+					       struct sk_buff *skb)
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+	struct macvtap_queue *tap = NULL;
+	/* Access to taps array is protected by rcu, but access to numvtaps
+	 * isn't. Below we use it to lookup a queue, but treat it as a hint
+	 * and validate that the result isn't NULL - in case we are
+	 * racing against queue removal.
+	 */
+	int numvtaps = ACCESS_ONCE(vlan->numvtaps);
+	__u32 rxq;
+
+	if (!numvtaps)
+		goto out;
+
+	if (numvtaps == 1)
+		goto single;
+
+	/* Check if we can use flow to select a queue */
+	rxq = skb_get_hash(skb);
+	if (rxq) {
+		tap = rcu_dereference(vlan->taps[rxq % numvtaps]);
+		goto out;
+	}
+
+	if (likely(skb_rx_queue_recorded(skb))) {
+		rxq = skb_get_rx_queue(skb);
+
+		while (unlikely(rxq >= numvtaps))
+			rxq -= numvtaps;
+
+		tap = rcu_dereference(vlan->taps[rxq]);
+		goto out;
+	}
+
+single:
+	tap = rcu_dereference(vlan->taps[0]);
+out:
+	return tap;
+}
+
+/*
+ * The net_device is going away, give up the reference
+ * that it holds on all queues and safely set the pointer
+ * from the queues to NULL.
+ */
+void macvtap_del_queues(struct net_device *dev)
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+	struct macvtap_queue *q, *tmp;
+
+	ASSERT_RTNL();
+	list_for_each_entry_safe(q, tmp, &vlan->queue_list, next) {
+		list_del_init(&q->next);
+		RCU_INIT_POINTER(q->vlan, NULL);
+		if (q->enabled)
+			vlan->numvtaps--;
+		vlan->numqueues--;
+		sock_put(&q->sk);
+	}
+	BUG_ON(vlan->numvtaps);
+	BUG_ON(vlan->numqueues);
+	/* guarantee that any future macvtap_set_queue will fail */
+	vlan->numvtaps = MAX_MACVTAP_QUEUES;
+}
+
+rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb)
+{
+	struct sk_buff *skb = *pskb;
+	struct net_device *dev = skb->dev;
+	struct macvlan_dev *vlan;
+	struct macvtap_queue *q;
+	netdev_features_t features = TAP_FEATURES;
+
+	vlan = macvtap_get_vlan_rcu(dev);
+	if (!vlan)
+		return RX_HANDLER_PASS;
+
+	q = macvtap_get_queue(dev, skb);
+	if (!q)
+		return RX_HANDLER_PASS;
+
+	if (__skb_array_full(&q->skb_array))
+		goto drop;
+
+	skb_push(skb, ETH_HLEN);
+
+	/* Apply the forward feature mask so that we perform segmentation
+	 * according to users wishes.  This only works if VNET_HDR is
+	 * enabled.
+	 */
+	if (q->flags & IFF_VNET_HDR)
+		features |= vlan->tap_features;
+	if (netif_needs_gso(skb, features)) {
+		struct sk_buff *segs = __skb_gso_segment(skb, features, false);
+
+		if (IS_ERR(segs))
+			goto drop;
+
+		if (!segs) {
+			if (skb_array_produce(&q->skb_array, skb))
+				goto drop;
+			goto wake_up;
+		}
+
+		consume_skb(skb);
+		while (segs) {
+			struct sk_buff *nskb = segs->next;
+
+			segs->next = NULL;
+			if (skb_array_produce(&q->skb_array, segs)) {
+				kfree_skb(segs);
+				kfree_skb_list(nskb);
+				break;
+			}
+			segs = nskb;
+		}
+	} else {
+		/* If we receive a partial checksum and the tap side
+		 * doesn't support checksum offload, compute the checksum.
+		 * Note: it doesn't matter which checksum feature to
+		 *	  check, we either support them all or none.
+		 */
+		if (skb->ip_summed == CHECKSUM_PARTIAL &&
+		    !(features & NETIF_F_CSUM_MASK) &&
+		    skb_checksum_help(skb))
+			goto drop;
+		if (skb_array_produce(&q->skb_array, skb))
+			goto drop;
+	}
+
+wake_up:
+	wake_up_interruptible_poll(sk_sleep(&q->sk), POLLIN | POLLRDNORM | POLLRDBAND);
+	return RX_HANDLER_CONSUMED;
+
+drop:
+	/* Count errors/drops only here, thus don't care about args. */
+	macvlan_count_rx(vlan, 0, 0, 0);
+	kfree_skb(skb);
+	return RX_HANDLER_CONSUMED;
+}
+
+int macvtap_get_minor(struct macvlan_dev *vlan)
+{
+	int retval = -ENOMEM;
+
+	mutex_lock(&minor_lock);
+	retval = idr_alloc(&minor_idr, vlan, 1, MACVTAP_NUM_DEVS, GFP_KERNEL);
+	if (retval >= 0) {
+		vlan->minor = retval;
+	} else if (retval == -ENOSPC) {
+		netdev_err(vlan->dev, "Too many macvtap devices\n");
+		retval = -EINVAL;
+	}
+	mutex_unlock(&minor_lock);
+	return retval < 0 ? retval : 0;
+}
+
+void macvtap_free_minor(struct macvlan_dev *vlan)
+{
+	mutex_lock(&minor_lock);
+	if (vlan->minor) {
+		idr_remove(&minor_idr, vlan->minor);
+		vlan->minor = 0;
+	}
+	mutex_unlock(&minor_lock);
+}
+
+static struct net_device *dev_get_by_macvtap_minor(int minor)
+{
+	struct net_device *dev = NULL;
+	struct macvlan_dev *vlan;
+
+	mutex_lock(&minor_lock);
+	vlan = idr_find(&minor_idr, minor);
+	if (vlan) {
+		dev = vlan->dev;
+		dev_hold(dev);
+	}
+	mutex_unlock(&minor_lock);
+	return dev;
+}
+
+static void macvtap_sock_write_space(struct sock *sk)
+{
+	wait_queue_head_t *wqueue;
+
+	if (!sock_writeable(sk) ||
+	    !test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags))
+		return;
+
+	wqueue = sk_sleep(sk);
+	if (wqueue && waitqueue_active(wqueue))
+		wake_up_interruptible_poll(wqueue, POLLOUT | POLLWRNORM | POLLWRBAND);
+}
+
+static void macvtap_sock_destruct(struct sock *sk)
+{
+	struct macvtap_queue *q = container_of(sk, struct macvtap_queue, sk);
+
+	skb_array_cleanup(&q->skb_array);
+}
+
+static int macvtap_open(struct inode *inode, struct file *file)
+{
+	struct net *net = current->nsproxy->net_ns;
+	struct net_device *dev;
+	struct macvtap_queue *q;
+	int err = -ENODEV;
+
+	rtnl_lock();
+	dev = dev_get_by_macvtap_minor(iminor(inode));
+	if (!dev)
+		goto err;
+
+	err = -ENOMEM;
+	q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
+					     &macvtap_proto, 0);
+	if (!q)
+		goto err;
+
+	RCU_INIT_POINTER(q->sock.wq, &q->wq);
+	init_waitqueue_head(&q->wq.wait);
+	q->sock.type = SOCK_RAW;
+	q->sock.state = SS_CONNECTED;
+	q->sock.file = file;
+	q->sock.ops = &macvtap_socket_ops;
+	sock_init_data(&q->sock, &q->sk);
+	q->sk.sk_write_space = macvtap_sock_write_space;
+	q->sk.sk_destruct = macvtap_sock_destruct;
+	q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
+	q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
+
+	/*
+	 * so far only KVM virtio_net uses macvtap, enable zero copy between
+	 * guest kernel and host kernel when lower device supports zerocopy
+	 *
+	 * The macvlan supports zerocopy iff the lower device supports zero
+	 * copy so we don't have to look at the lower device directly.
+	 */
+	if ((dev->features & NETIF_F_HIGHDMA) && (dev->features & NETIF_F_SG))
+		sock_set_flag(&q->sk, SOCK_ZEROCOPY);
+
+	err = -ENOMEM;
+	if (skb_array_init(&q->skb_array, dev->tx_queue_len, GFP_KERNEL))
+		goto err_array;
+
+	err = macvtap_set_queue(dev, file, q);
+	if (err)
+		goto err_queue;
+
+	dev_put(dev);
+
+	rtnl_unlock();
+	return err;
+
+err_queue:
+	skb_array_cleanup(&q->skb_array);
+err_array:
+	sock_put(&q->sk);
+err:
+	if (dev)
+		dev_put(dev);
+
+	rtnl_unlock();
+	return err;
+}
+
+static int macvtap_release(struct inode *inode, struct file *file)
+{
+	struct macvtap_queue *q = file->private_data;
+	macvtap_put_queue(q);
+	return 0;
+}
+
+static unsigned int macvtap_poll(struct file *file, poll_table * wait)
+{
+	struct macvtap_queue *q = file->private_data;
+	unsigned int mask = POLLERR;
+
+	if (!q)
+		goto out;
+
+	mask = 0;
+	poll_wait(file, &q->wq.wait, wait);
+
+	if (!skb_array_empty(&q->skb_array))
+		mask |= POLLIN | POLLRDNORM;
+
+	if (sock_writeable(&q->sk) ||
+	    (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &q->sock.flags) &&
+	     sock_writeable(&q->sk)))
+		mask |= POLLOUT | POLLWRNORM;
+
+out:
+	return mask;
+}
+
+static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad,
+						size_t len, size_t linear,
+						int noblock, int *err)
+{
+	struct sk_buff *skb;
+
+	/* Under a page?  Don't bother with paged skb. */
+	if (prepad + len < PAGE_SIZE || !linear)
+		linear = len;
+
+	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
+				   err, 0);
+	if (!skb)
+		return NULL;
+
+	skb_reserve(skb, prepad);
+	skb_put(skb, linear);
+	skb->data_len = len - linear;
+	skb->len += len - linear;
+
+	return skb;
+}
+
+/* Neighbour code has some assumptions on HH_DATA_MOD alignment */
+#define MACVTAP_RESERVE HH_DATA_OFF(ETH_HLEN)
+
+/* Get packet from user space buffer */
+static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
+				struct iov_iter *from, int noblock)
+{
+	int good_linear = SKB_MAX_HEAD(MACVTAP_RESERVE);
+	struct sk_buff *skb;
+	struct macvlan_dev *vlan;
+	unsigned long total_len = iov_iter_count(from);
+	unsigned long len = total_len;
+	int err;
+	struct virtio_net_hdr vnet_hdr = { 0 };
+	int vnet_hdr_len = 0;
+	int copylen = 0;
+	int depth;
+	bool zerocopy = false;
+	size_t linear;
+
+	if (q->flags & IFF_VNET_HDR) {
+		vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
+
+		err = -EINVAL;
+		if (len < vnet_hdr_len)
+			goto err;
+		len -= vnet_hdr_len;
+
+		err = -EFAULT;
+		if (!copy_from_iter_full(&vnet_hdr, sizeof(vnet_hdr), from))
+			goto err;
+		iov_iter_advance(from, vnet_hdr_len - sizeof(vnet_hdr));
+		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
+		     macvtap16_to_cpu(q, vnet_hdr.csum_start) +
+		     macvtap16_to_cpu(q, vnet_hdr.csum_offset) + 2 >
+			     macvtap16_to_cpu(q, vnet_hdr.hdr_len))
+			vnet_hdr.hdr_len = cpu_to_macvtap16(q,
+				 macvtap16_to_cpu(q, vnet_hdr.csum_start) +
+				 macvtap16_to_cpu(q, vnet_hdr.csum_offset) + 2);
+		err = -EINVAL;
+		if (macvtap16_to_cpu(q, vnet_hdr.hdr_len) > len)
+			goto err;
+	}
+
+	err = -EINVAL;
+	if (unlikely(len < ETH_HLEN))
+		goto err;
+
+	if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) {
+		struct iov_iter i;
+
+		copylen = vnet_hdr.hdr_len ?
+			macvtap16_to_cpu(q, vnet_hdr.hdr_len) : GOODCOPY_LEN;
+		if (copylen > good_linear)
+			copylen = good_linear;
+		else if (copylen < ETH_HLEN)
+			copylen = ETH_HLEN;
+		linear = copylen;
+		i = *from;
+		iov_iter_advance(&i, copylen);
+		if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS)
+			zerocopy = true;
+	}
+
+	if (!zerocopy) {
+		copylen = len;
+		linear = macvtap16_to_cpu(q, vnet_hdr.hdr_len);
+		if (linear > good_linear)
+			linear = good_linear;
+		else if (linear < ETH_HLEN)
+			linear = ETH_HLEN;
+	}
+
+	skb = macvtap_alloc_skb(&q->sk, MACVTAP_RESERVE, copylen,
+				linear, noblock, &err);
+	if (!skb)
+		goto err;
+
+	if (zerocopy)
+		err = zerocopy_sg_from_iter(skb, from);
+	else
+		err = skb_copy_datagram_from_iter(skb, 0, from, len);
+
+	if (err)
+		goto err_kfree;
+
+	skb_set_network_header(skb, ETH_HLEN);
+	skb_reset_mac_header(skb);
+	skb->protocol = eth_hdr(skb)->h_proto;
+
+	if (vnet_hdr_len) {
+		err = virtio_net_hdr_to_skb(skb, &vnet_hdr,
+					    macvtap_is_little_endian(q));
+		if (err)
+			goto err_kfree;
+	}
+
+	skb_probe_transport_header(skb, ETH_HLEN);
+
+	/* Move network header to the right position for VLAN tagged packets */
+	if ((skb->protocol == htons(ETH_P_8021Q) ||
+	     skb->protocol == htons(ETH_P_8021AD)) &&
+	    __vlan_get_protocol(skb, skb->protocol, &depth) != 0)
+		skb_set_network_header(skb, depth);
+
+	rcu_read_lock();
+	vlan = rcu_dereference(q->vlan);
+	/* copy skb_ubuf_info for callback when skb has no error */
+	if (zerocopy) {
+		skb_shinfo(skb)->destructor_arg = m->msg_control;
+		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+	} else if (m && m->msg_control) {
+		struct ubuf_info *uarg = m->msg_control;
+		uarg->callback(uarg, false);
+	}
+
+	if (vlan) {
+		skb->dev = vlan->dev;
+		dev_queue_xmit(skb);
+	} else {
+		kfree_skb(skb);
+	}
+	rcu_read_unlock();
+
+	return total_len;
+
+err_kfree:
+	kfree_skb(skb);
+
+err:
+	rcu_read_lock();
+	vlan = rcu_dereference(q->vlan);
+	if (vlan)
+		this_cpu_inc(vlan->pcpu_stats->tx_dropped);
+	rcu_read_unlock();
+
+	return err;
+}
+
+static ssize_t macvtap_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct macvtap_queue *q = file->private_data;
+
+	return macvtap_get_user(q, NULL, from, file->f_flags & O_NONBLOCK);
+}
+
+/* Put packet to the user space buffer */
+static ssize_t macvtap_put_user(struct macvtap_queue *q,
+				const struct sk_buff *skb,
+				struct iov_iter *iter)
+{
+	int ret;
+	int vnet_hdr_len = 0;
+	int vlan_offset = 0;
+	int total;
+
+	if (q->flags & IFF_VNET_HDR) {
+		struct virtio_net_hdr vnet_hdr;
+		vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
+		if (iov_iter_count(iter) < vnet_hdr_len)
+			return -EINVAL;
+
+		if (virtio_net_hdr_from_skb(skb, &vnet_hdr,
+					    macvtap_is_little_endian(q), true))
+			BUG();
+
+		if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter) !=
+		    sizeof(vnet_hdr))
+			return -EFAULT;
+
+		iov_iter_advance(iter, vnet_hdr_len - sizeof(vnet_hdr));
+	}
+	total = vnet_hdr_len;
+	total += skb->len;
+
+	if (skb_vlan_tag_present(skb)) {
+		struct {
+			__be16 h_vlan_proto;
+			__be16 h_vlan_TCI;
+		} veth;
+		veth.h_vlan_proto = skb->vlan_proto;
+		veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));
+
+		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
+		total += VLAN_HLEN;
+
+		ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
+		if (ret || !iov_iter_count(iter))
+			goto done;
+
+		ret = copy_to_iter(&veth, sizeof(veth), iter);
+		if (ret != sizeof(veth) || !iov_iter_count(iter))
+			goto done;
+	}
+
+	ret = skb_copy_datagram_iter(skb, vlan_offset, iter,
+				     skb->len - vlan_offset);
+
+done:
+	return ret ? ret : total;
+}
+
+static ssize_t macvtap_do_read(struct macvtap_queue *q,
+			       struct iov_iter *to,
+			       int noblock)
+{
+	DEFINE_WAIT(wait);
+	struct sk_buff *skb;
+	ssize_t ret = 0;
+
+	if (!iov_iter_count(to))
+		return 0;
+
+	while (1) {
+		if (!noblock)
+			prepare_to_wait(sk_sleep(&q->sk), &wait,
+					TASK_INTERRUPTIBLE);
+
+		/* Read frames from the queue */
+		skb = skb_array_consume(&q->skb_array);
+		if (skb)
+			break;
+		if (noblock) {
+			ret = -EAGAIN;
+			break;
+		}
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+		/* Nothing to read, let's sleep */
+		schedule();
+	}
+	if (!noblock)
+		finish_wait(sk_sleep(&q->sk), &wait);
+
+	if (skb) {
+		ret = macvtap_put_user(q, skb, to);
+		if (unlikely(ret < 0))
+			kfree_skb(skb);
+		else
+			consume_skb(skb);
+	}
+	return ret;
+}
+
+static ssize_t macvtap_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct file *file = iocb->ki_filp;
+	struct macvtap_queue *q = file->private_data;
+	ssize_t len = iov_iter_count(to), ret;
+
+	ret = macvtap_do_read(q, to, file->f_flags & O_NONBLOCK);
+	ret = min_t(ssize_t, ret, len);
+	if (ret > 0)
+		iocb->ki_pos = ret;
+	return ret;
+}
+
+static struct macvlan_dev *macvtap_get_vlan(struct macvtap_queue *q)
+{
+	struct macvlan_dev *vlan;
+
+	ASSERT_RTNL();
+	vlan = rtnl_dereference(q->vlan);
+	if (vlan)
+		dev_hold(vlan->dev);
+
+	return vlan;
+}
+
+static void macvtap_put_vlan(struct macvlan_dev *vlan)
+{
+	dev_put(vlan->dev);
+}
+
+static int macvtap_ioctl_set_queue(struct file *file, unsigned int flags)
+{
+	struct macvtap_queue *q = file->private_data;
+	struct macvlan_dev *vlan;
+	int ret;
+
+	vlan = macvtap_get_vlan(q);
+	if (!vlan)
+		return -EINVAL;
+
+	if (flags & IFF_ATTACH_QUEUE)
+		ret = macvtap_enable_queue(vlan->dev, file, q);
+	else if (flags & IFF_DETACH_QUEUE)
+		ret = macvtap_disable_queue(q);
+	else
+		ret = -EINVAL;
+
+	macvtap_put_vlan(vlan);
+	return ret;
+}
+
+static int set_offload(struct macvtap_queue *q, unsigned long arg)
+{
+	struct macvlan_dev *vlan;
+	netdev_features_t features;
+	netdev_features_t feature_mask = 0;
+
+	vlan = rtnl_dereference(q->vlan);
+	if (!vlan)
+		return -ENOLINK;
+
+	features = vlan->dev->features;
+
+	if (arg & TUN_F_CSUM) {
+		feature_mask = NETIF_F_HW_CSUM;
+
+		if (arg & (TUN_F_TSO4 | TUN_F_TSO6)) {
+			if (arg & TUN_F_TSO_ECN)
+				feature_mask |= NETIF_F_TSO_ECN;
+			if (arg & TUN_F_TSO4)
+				feature_mask |= NETIF_F_TSO;
+			if (arg & TUN_F_TSO6)
+				feature_mask |= NETIF_F_TSO6;
+		}
+
+		if (arg & TUN_F_UFO)
+			feature_mask |= NETIF_F_UFO;
+	}
+
+	/* tun/tap driver inverts the usage for TSO offloads, where
+	 * setting the TSO bit means that the userspace wants to
+	 * accept TSO frames and turning it off means that user space
+	 * does not support TSO.
+	 * For macvtap, we have to invert it to mean the same thing.
+	 * When user space turns off TSO, we turn off GSO/LRO so that
+	 * user-space will not receive TSO frames.
+	 */
+	if (feature_mask & (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_UFO))
+		features |= RX_OFFLOADS;
+	else
+		features &= ~RX_OFFLOADS;
+
+	/* tap_features are the same as features on tun/tap and
+	 * reflect user expectations.
+	 */
+	vlan->tap_features = feature_mask;
+	vlan->set_features = features;
+	netdev_update_features(vlan->dev);
+
+	return 0;
+}
+
+/*
+ * provide compatibility with generic tun/tap interface
+ */
+static long macvtap_ioctl(struct file *file, unsigned int cmd,
+			  unsigned long arg)
+{
+	struct macvtap_queue *q = file->private_data;
+	struct macvlan_dev *vlan;
+	void __user *argp = (void __user *)arg;
+	struct ifreq __user *ifr = argp;
+	unsigned int __user *up = argp;
+	unsigned short u;
+	int __user *sp = argp;
+	struct sockaddr sa;
+	int s;
+	int ret;
+
+	switch (cmd) {
+	case TUNSETIFF:
+		/* ignore the name, just look at flags */
+		if (get_user(u, &ifr->ifr_flags))
+			return -EFAULT;
+
+		ret = 0;
+		if ((u & ~MACVTAP_FEATURES) != (IFF_NO_PI | IFF_TAP))
+			ret = -EINVAL;
+		else
+			q->flags = (q->flags & ~MACVTAP_FEATURES) | u;
+
+		return ret;
+
+	case TUNGETIFF:
+		rtnl_lock();
+		vlan = macvtap_get_vlan(q);
+		if (!vlan) {
+			rtnl_unlock();
+			return -ENOLINK;
+		}
+
+		ret = 0;
+		u = q->flags;
+		if (copy_to_user(&ifr->ifr_name, vlan->dev->name, IFNAMSIZ) ||
+		    put_user(u, &ifr->ifr_flags))
+			ret = -EFAULT;
+		macvtap_put_vlan(vlan);
+		rtnl_unlock();
+		return ret;
+
+	case TUNSETQUEUE:
+		if (get_user(u, &ifr->ifr_flags))
+			return -EFAULT;
+		rtnl_lock();
+		ret = macvtap_ioctl_set_queue(file, u);
+		rtnl_unlock();
+		return ret;
+
+	case TUNGETFEATURES:
+		if (put_user(IFF_TAP | IFF_NO_PI | MACVTAP_FEATURES, up))
+			return -EFAULT;
+		return 0;
+
+	case TUNSETSNDBUF:
+		if (get_user(s, sp))
+			return -EFAULT;
+
+		q->sk.sk_sndbuf = s;
+		return 0;
+
+	case TUNGETVNETHDRSZ:
+		s = q->vnet_hdr_sz;
+		if (put_user(s, sp))
+			return -EFAULT;
+		return 0;
+
+	case TUNSETVNETHDRSZ:
+		if (get_user(s, sp))
+			return -EFAULT;
+		if (s < (int)sizeof(struct virtio_net_hdr))
+			return -EINVAL;
+
+		q->vnet_hdr_sz = s;
+		return 0;
+
+	case TUNGETVNETLE:
+		s = !!(q->flags & MACVTAP_VNET_LE);
+		if (put_user(s, sp))
+			return -EFAULT;
+		return 0;
+
+	case TUNSETVNETLE:
+		if (get_user(s, sp))
+			return -EFAULT;
+		if (s)
+			q->flags |= MACVTAP_VNET_LE;
+		else
+			q->flags &= ~MACVTAP_VNET_LE;
+		return 0;
+
+	case TUNGETVNETBE:
+		return macvtap_get_vnet_be(q, sp);
+
+	case TUNSETVNETBE:
+		return macvtap_set_vnet_be(q, sp);
+
+	case TUNSETOFFLOAD:
+		/* let the user check for future flags */
+		if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
+			    TUN_F_TSO_ECN | TUN_F_UFO))
+			return -EINVAL;
+
+		rtnl_lock();
+		ret = set_offload(q, arg);
+		rtnl_unlock();
+		return ret;
+
+	case SIOCGIFHWADDR:
+		rtnl_lock();
+		vlan = macvtap_get_vlan(q);
+		if (!vlan) {
+			rtnl_unlock();
+			return -ENOLINK;
+		}
+		ret = 0;
+		u = vlan->dev->type;
+		if (copy_to_user(&ifr->ifr_name, vlan->dev->name, IFNAMSIZ) ||
+		    copy_to_user(&ifr->ifr_hwaddr.sa_data, vlan->dev->dev_addr, ETH_ALEN) ||
+		    put_user(u, &ifr->ifr_hwaddr.sa_family))
+			ret = -EFAULT;
+		macvtap_put_vlan(vlan);
+		rtnl_unlock();
+		return ret;
+
+	case SIOCSIFHWADDR:
+		if (copy_from_user(&sa, &ifr->ifr_hwaddr, sizeof(sa)))
+			return -EFAULT;
+		rtnl_lock();
+		vlan = macvtap_get_vlan(q);
+		if (!vlan) {
+			rtnl_unlock();
+			return -ENOLINK;
+		}
+		ret = dev_set_mac_address(vlan->dev, &sa);
+		macvtap_put_vlan(vlan);
+		rtnl_unlock();
+		return ret;
+
+	default:
+		return -EINVAL;
+	}
+}
+
+#ifdef CONFIG_COMPAT
+static long macvtap_compat_ioctl(struct file *file, unsigned int cmd,
+				 unsigned long arg)
+{
+	return macvtap_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
+
+const struct file_operations macvtap_fops = {
+	.owner		= THIS_MODULE,
+	.open		= macvtap_open,
+	.release	= macvtap_release,
+	.read_iter	= macvtap_read_iter,
+	.write_iter	= macvtap_write_iter,
+	.poll		= macvtap_poll,
+	.llseek		= no_llseek,
+	.unlocked_ioctl	= macvtap_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= macvtap_compat_ioctl,
+#endif
+};
+
+static int macvtap_sendmsg(struct socket *sock, struct msghdr *m,
+			   size_t total_len)
+{
+	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
+	return macvtap_get_user(q, m, &m->msg_iter, m->msg_flags & MSG_DONTWAIT);
+}
+
+static int macvtap_recvmsg(struct socket *sock, struct msghdr *m,
+			   size_t total_len, int flags)
+{
+	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
+	int ret;
+	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
+		return -EINVAL;
+	ret = macvtap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT);
+	if (ret > total_len) {
+		m->msg_flags |= MSG_TRUNC;
+		ret = flags & MSG_TRUNC ? ret : total_len;
+	}
+	return ret;
+}
+
+static int macvtap_peek_len(struct socket *sock)
+{
+	struct macvtap_queue *q = container_of(sock, struct macvtap_queue,
+					       sock);
+	return skb_array_peek_len(&q->skb_array);
+}
+
+/* Ops structure to mimic raw sockets with tun */
+static const struct proto_ops macvtap_socket_ops = {
+	.sendmsg = macvtap_sendmsg,
+	.recvmsg = macvtap_recvmsg,
+	.peek_len = macvtap_peek_len,
+};
+
+/* Get an underlying socket object from tun file.  Returns error unless file is
+ * attached to a device.  The returned object works like a packet socket, it
+ * can be used for sock_sendmsg/sock_recvmsg.  The caller is responsible for
+ * holding a reference to the file for as long as the socket is in use. */
+struct socket *macvtap_get_socket(struct file *file)
+{
+	struct macvtap_queue *q;
+	if (file->f_op != &macvtap_fops)
+		return ERR_PTR(-EINVAL);
+	q = file->private_data;
+	if (!q)
+		return ERR_PTR(-EBADFD);
+	return &q->sock;
+}
+EXPORT_SYMBOL_GPL(macvtap_get_socket);
+
+int macvtap_queue_resize(struct macvlan_dev *vlan)
+{
+	struct net_device *dev = vlan->dev;
+	struct macvtap_queue *q;
+	struct skb_array **arrays;
+	int n = vlan->numqueues;
+	int ret, i = 0;
+
+	arrays = kmalloc(sizeof *arrays * n, GFP_KERNEL);
+	if (!arrays)
+		return -ENOMEM;
+
+	list_for_each_entry(q, &vlan->queue_list, next)
+		arrays[i++] = &q->skb_array;
+
+	ret = skb_array_resize_multiple(arrays, n,
+					dev->tx_queue_len, GFP_KERNEL);
+
+	kfree(arrays);
+	return ret;
+}
diff --git a/include/linux/if_macvtap.h b/include/linux/if_macvtap.h
new file mode 100644
index 000000000000..c9bf84b75b27
--- /dev/null
+++ b/include/linux/if_macvtap.h
@@ -0,0 +1,10 @@
+#ifndef _LINUX_IF_MACVTAP_H_
+#define _LINUX_IF_MACVTAP_H_
+
+rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb);
+void macvtap_del_queues(struct net_device *dev);
+int macvtap_get_minor(struct macvlan_dev *vlan);
+void macvtap_free_minor(struct macvlan_dev *vlan);
+int macvtap_queue_resize(struct macvlan_dev *vlan);
+
+#endif /*_LINUX_IF_MACVTAP_H_*/
-- 
cgit v1.2.3


From 635b8c8ecdd27142d7fdab0df334b2e9201481cf Mon Sep 17 00:00:00 2001
From: Sainath Grandhi <sainath.grandhi@intel.com>
Date: Fri, 10 Feb 2017 16:03:47 -0800
Subject: tap: Renaming tap related APIs, data structures, macros

Renaming tap related APIs, data structures and macros in tap.c from macvtap_.* to tap_.*

Signed-off-by: Sainath Grandhi <sainath.grandhi@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macvtap_main.c |  18 +--
 drivers/net/tap.c          | 332 ++++++++++++++++++++++-----------------------
 drivers/vhost/net.c        |   3 +-
 include/linux/if_macvlan.h |  17 +--
 include/linux/if_macvtap.h |  10 --
 include/linux/if_tap.h     |  23 ++++
 6 files changed, 202 insertions(+), 201 deletions(-)
 delete mode 100644 include/linux/if_macvtap.h
 create mode 100644 include/linux/if_tap.h

(limited to 'include/linux')

diff --git a/drivers/net/macvtap_main.c b/drivers/net/macvtap_main.c
index 96ffa60c5a36..548f339a75bd 100644
--- a/drivers/net/macvtap_main.c
+++ b/drivers/net/macvtap_main.c
@@ -1,6 +1,6 @@
 #include <linux/etherdevice.h>
 #include <linux/if_macvlan.h>
-#include <linux/if_macvtap.h>
+#include <linux/if_tap.h>
 #include <linux/if_vlan.h>
 #include <linux/interrupt.h>
 #include <linux/nsproxy.h>
@@ -62,7 +62,7 @@ static int macvtap_newlink(struct net *src_net,
 	 */
 	vlan->tap_features = TUN_OFFLOADS;
 
-	err = netdev_rx_handler_register(dev, macvtap_handle_frame, vlan);
+	err = netdev_rx_handler_register(dev, tap_handle_frame, vlan);
 	if (err)
 		return err;
 
@@ -82,7 +82,7 @@ static void macvtap_dellink(struct net_device *dev,
 			    struct list_head *head)
 {
 	netdev_rx_handler_unregister(dev);
-	macvtap_del_queues(dev);
+	tap_del_queues(dev);
 	macvlan_dellink(dev, head);
 }
 
@@ -121,7 +121,7 @@ static int macvtap_device_event(struct notifier_block *unused,
 		 * been registered but before register_netdevice has
 		 * finished running.
 		 */
-		err = macvtap_get_minor(vlan);
+		err = tap_get_minor(vlan);
 		if (err)
 			return notifier_from_errno(err);
 
@@ -129,7 +129,7 @@ static int macvtap_device_event(struct notifier_block *unused,
 		classdev = device_create(&macvtap_class, &dev->dev, devt,
 					 dev, tap_name);
 		if (IS_ERR(classdev)) {
-			macvtap_free_minor(vlan);
+			tap_free_minor(vlan);
 			return notifier_from_errno(PTR_ERR(classdev));
 		}
 		err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj,
@@ -144,10 +144,10 @@ static int macvtap_device_event(struct notifier_block *unused,
 		sysfs_remove_link(&dev->dev.kobj, tap_name);
 		devt = MKDEV(MAJOR(macvtap_major), vlan->minor);
 		device_destroy(&macvtap_class, devt);
-		macvtap_free_minor(vlan);
+		tap_free_minor(vlan);
 		break;
 	case NETDEV_CHANGE_TX_QUEUE_LEN:
-		if (macvtap_queue_resize(vlan))
+		if (tap_queue_resize(vlan))
 			return NOTIFY_BAD;
 		break;
 	}
@@ -159,7 +159,7 @@ static struct notifier_block macvtap_notifier_block __read_mostly = {
 	.notifier_call	= macvtap_device_event,
 };
 
-extern struct file_operations macvtap_fops;
+extern struct file_operations tap_fops;
 static int macvtap_init(void)
 {
 	int err;
@@ -169,7 +169,7 @@ static int macvtap_init(void)
 	if (err)
 		goto out1;
 
-	cdev_init(&macvtap_cdev, &macvtap_fops);
+	cdev_init(&macvtap_cdev, &tap_fops);
 	err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS);
 	if (err)
 		goto out2;
diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 6f6228e4fd3f..15ca2d531d05 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -24,16 +24,16 @@
 #include <linux/skb_array.h>
 
 /*
- * A macvtap queue is the central object of this driver, it connects
+ * A tap queue is the central object of this driver, it connects
  * an open character device to a macvlan interface. There can be
  * multiple queues on one interface, which map back to queues
  * implemented in hardware on the underlying device.
  *
- * macvtap_proto is used to allocate queues through the sock allocation
+ * tap_proto is used to allocate queues through the sock allocation
  * mechanism.
  *
  */
-struct macvtap_queue {
+struct tap_queue {
 	struct sock sk;
 	struct socket sock;
 	struct socket_wq wq;
@@ -47,21 +47,21 @@ struct macvtap_queue {
 	struct skb_array skb_array;
 };
 
-#define MACVTAP_FEATURES (IFF_VNET_HDR | IFF_MULTI_QUEUE)
+#define TAP_IFFEATURES (IFF_VNET_HDR | IFF_MULTI_QUEUE)
 
-#define MACVTAP_VNET_LE 0x80000000
-#define MACVTAP_VNET_BE 0x40000000
+#define TAP_VNET_LE 0x80000000
+#define TAP_VNET_BE 0x40000000
 
 #ifdef CONFIG_TUN_VNET_CROSS_LE
-static inline bool macvtap_legacy_is_little_endian(struct macvtap_queue *q)
+static inline bool tap_legacy_is_little_endian(struct tap_queue *q)
 {
-	return q->flags & MACVTAP_VNET_BE ? false :
+	return q->flags & TAP_VNET_BE ? false :
 		virtio_legacy_is_little_endian();
 }
 
-static long macvtap_get_vnet_be(struct macvtap_queue *q, int __user *sp)
+static long tap_get_vnet_be(struct tap_queue *q, int __user *sp)
 {
-	int s = !!(q->flags & MACVTAP_VNET_BE);
+	int s = !!(q->flags & TAP_VNET_BE);
 
 	if (put_user(s, sp))
 		return -EFAULT;
@@ -69,7 +69,7 @@ static long macvtap_get_vnet_be(struct macvtap_queue *q, int __user *sp)
 	return 0;
 }
 
-static long macvtap_set_vnet_be(struct macvtap_queue *q, int __user *sp)
+static long tap_set_vnet_be(struct tap_queue *q, int __user *sp)
 {
 	int s;
 
@@ -77,77 +77,77 @@ static long macvtap_set_vnet_be(struct macvtap_queue *q, int __user *sp)
 		return -EFAULT;
 
 	if (s)
-		q->flags |= MACVTAP_VNET_BE;
+		q->flags |= TAP_VNET_BE;
 	else
-		q->flags &= ~MACVTAP_VNET_BE;
+		q->flags &= ~TAP_VNET_BE;
 
 	return 0;
 }
 #else
-static inline bool macvtap_legacy_is_little_endian(struct macvtap_queue *q)
+static inline bool tap_legacy_is_little_endian(struct tap_queue *q)
 {
 	return virtio_legacy_is_little_endian();
 }
 
-static long macvtap_get_vnet_be(struct macvtap_queue *q, int __user *argp)
+static long tap_get_vnet_be(struct tap_queue *q, int __user *argp)
 {
 	return -EINVAL;
 }
 
-static long macvtap_set_vnet_be(struct macvtap_queue *q, int __user *argp)
+static long tap_set_vnet_be(struct tap_queue *q, int __user *argp)
 {
 	return -EINVAL;
 }
 #endif /* CONFIG_TUN_VNET_CROSS_LE */
 
-static inline bool macvtap_is_little_endian(struct macvtap_queue *q)
+static inline bool tap_is_little_endian(struct tap_queue *q)
 {
-	return q->flags & MACVTAP_VNET_LE ||
-		macvtap_legacy_is_little_endian(q);
+	return q->flags & TAP_VNET_LE ||
+		tap_legacy_is_little_endian(q);
 }
 
-static inline u16 macvtap16_to_cpu(struct macvtap_queue *q, __virtio16 val)
+static inline u16 tap16_to_cpu(struct tap_queue *q, __virtio16 val)
 {
-	return __virtio16_to_cpu(macvtap_is_little_endian(q), val);
+	return __virtio16_to_cpu(tap_is_little_endian(q), val);
 }
 
-static inline __virtio16 cpu_to_macvtap16(struct macvtap_queue *q, u16 val)
+static inline __virtio16 cpu_to_tap16(struct tap_queue *q, u16 val)
 {
-	return __cpu_to_virtio16(macvtap_is_little_endian(q), val);
+	return __cpu_to_virtio16(tap_is_little_endian(q), val);
 }
 
-static struct proto macvtap_proto = {
-	.name = "macvtap",
+static struct proto tap_proto = {
+	.name = "tap",
 	.owner = THIS_MODULE,
-	.obj_size = sizeof (struct macvtap_queue),
+	.obj_size = sizeof(struct tap_queue),
 };
 
-#define MACVTAP_NUM_DEVS (1U << MINORBITS)
+#define TAP_NUM_DEVS (1U << MINORBITS)
 static DEFINE_MUTEX(minor_lock);
 DEFINE_IDR(minor_idr);
 
 #define GOODCOPY_LEN 128
 
-static const struct proto_ops macvtap_socket_ops;
+static const struct proto_ops tap_socket_ops;
 
 #define RX_OFFLOADS (NETIF_F_GRO | NETIF_F_LRO)
 #define TAP_FEATURES (NETIF_F_GSO | NETIF_F_SG | NETIF_F_FRAGLIST)
 
-static struct macvlan_dev *macvtap_get_vlan_rcu(const struct net_device *dev)
+static struct macvlan_dev *tap_get_vlan_rcu(const struct net_device *dev)
 {
 	return rcu_dereference(dev->rx_handler_data);
 }
 
 /*
  * RCU usage:
- * The macvtap_queue and the macvlan_dev are loosely coupled, the
+ * The tap_queue and the macvlan_dev are loosely coupled, the
  * pointers from one to the other can only be read while rcu_read_lock
  * or rtnl is held.
  *
- * Both the file and the macvlan_dev hold a reference on the macvtap_queue
+ * Both the file and the macvlan_dev hold a reference on the tap_queue
  * through sock_hold(&q->sk). When the macvlan_dev goes away first,
  * q->vlan becomes inaccessible. When the files gets closed,
- * macvtap_get_queue() fails.
+ * tap_get_queue() fails.
  *
  * There may still be references to the struct sock inside of the
  * queue from outbound SKBs, but these never reference back to the
@@ -155,8 +155,8 @@ static struct macvlan_dev *macvtap_get_vlan_rcu(const struct net_device *dev)
  * when both our references and any pending SKBs are gone.
  */
 
-static int macvtap_enable_queue(struct net_device *dev, struct file *file,
-				struct macvtap_queue *q)
+static int tap_enable_queue(struct net_device *dev, struct file *file,
+			    struct tap_queue *q)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
 	int err = -EINVAL;
@@ -177,12 +177,12 @@ out:
 }
 
 /* Requires RTNL */
-static int macvtap_set_queue(struct net_device *dev, struct file *file,
-			     struct macvtap_queue *q)
+static int tap_set_queue(struct net_device *dev, struct file *file,
+			 struct tap_queue *q)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
 
-	if (vlan->numqueues == MAX_MACVTAP_QUEUES)
+	if (vlan->numqueues == MAX_TAP_QUEUES)
 		return -EBUSY;
 
 	rcu_assign_pointer(q->vlan, vlan);
@@ -201,10 +201,10 @@ static int macvtap_set_queue(struct net_device *dev, struct file *file,
 	return 0;
 }
 
-static int macvtap_disable_queue(struct macvtap_queue *q)
+static int tap_disable_queue(struct tap_queue *q)
 {
 	struct macvlan_dev *vlan;
-	struct macvtap_queue *nq;
+	struct tap_queue *nq;
 
 	ASSERT_RTNL();
 	if (!q->enabled)
@@ -236,7 +236,7 @@ static int macvtap_disable_queue(struct macvtap_queue *q)
  * Using the spinlock makes sure that we don't get
  * to the queue again after destroying it.
  */
-static void macvtap_put_queue(struct macvtap_queue *q)
+static void tap_put_queue(struct tap_queue *q)
 {
 	struct macvlan_dev *vlan;
 
@@ -245,7 +245,7 @@ static void macvtap_put_queue(struct macvtap_queue *q)
 
 	if (vlan) {
 		if (q->enabled)
-			BUG_ON(macvtap_disable_queue(q));
+			BUG_ON(tap_disable_queue(q));
 
 		vlan->numqueues--;
 		RCU_INIT_POINTER(q->vlan, NULL);
@@ -266,11 +266,11 @@ static void macvtap_put_queue(struct macvtap_queue *q)
  * Cache vlan->numvtaps since it can become zero during the execution
  * of this function.
  */
-static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
-					       struct sk_buff *skb)
+static struct tap_queue *tap_get_queue(struct net_device *dev,
+				       struct sk_buff *skb)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
-	struct macvtap_queue *tap = NULL;
+	struct tap_queue *tap = NULL;
 	/* Access to taps array is protected by rcu, but access to numvtaps
 	 * isn't. Below we use it to lookup a queue, but treat it as a hint
 	 * and validate that the result isn't NULL - in case we are
@@ -313,10 +313,10 @@ out:
  * that it holds on all queues and safely set the pointer
  * from the queues to NULL.
  */
-void macvtap_del_queues(struct net_device *dev)
+void tap_del_queues(struct net_device *dev)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
-	struct macvtap_queue *q, *tmp;
+	struct tap_queue *q, *tmp;
 
 	ASSERT_RTNL();
 	list_for_each_entry_safe(q, tmp, &vlan->queue_list, next) {
@@ -329,23 +329,23 @@ void macvtap_del_queues(struct net_device *dev)
 	}
 	BUG_ON(vlan->numvtaps);
 	BUG_ON(vlan->numqueues);
-	/* guarantee that any future macvtap_set_queue will fail */
-	vlan->numvtaps = MAX_MACVTAP_QUEUES;
+	/* guarantee that any future tap_set_queue will fail */
+	vlan->numvtaps = MAX_TAP_QUEUES;
 }
 
-rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb)
+rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
 {
 	struct sk_buff *skb = *pskb;
 	struct net_device *dev = skb->dev;
 	struct macvlan_dev *vlan;
-	struct macvtap_queue *q;
+	struct tap_queue *q;
 	netdev_features_t features = TAP_FEATURES;
 
-	vlan = macvtap_get_vlan_rcu(dev);
+	vlan = tap_get_vlan_rcu(dev);
 	if (!vlan)
 		return RX_HANDLER_PASS;
 
-	q = macvtap_get_queue(dev, skb);
+	q = tap_get_queue(dev, skb);
 	if (!q)
 		return RX_HANDLER_PASS;
 
@@ -409,23 +409,23 @@ drop:
 	return RX_HANDLER_CONSUMED;
 }
 
-int macvtap_get_minor(struct macvlan_dev *vlan)
+int tap_get_minor(struct macvlan_dev *vlan)
 {
 	int retval = -ENOMEM;
 
 	mutex_lock(&minor_lock);
-	retval = idr_alloc(&minor_idr, vlan, 1, MACVTAP_NUM_DEVS, GFP_KERNEL);
+	retval = idr_alloc(&minor_idr, vlan, 1, TAP_NUM_DEVS, GFP_KERNEL);
 	if (retval >= 0) {
 		vlan->minor = retval;
 	} else if (retval == -ENOSPC) {
-		netdev_err(vlan->dev, "Too many macvtap devices\n");
+		netdev_err(vlan->dev, "Too many tap devices\n");
 		retval = -EINVAL;
 	}
 	mutex_unlock(&minor_lock);
 	return retval < 0 ? retval : 0;
 }
 
-void macvtap_free_minor(struct macvlan_dev *vlan)
+void tap_free_minor(struct macvlan_dev *vlan)
 {
 	mutex_lock(&minor_lock);
 	if (vlan->minor) {
@@ -435,7 +435,7 @@ void macvtap_free_minor(struct macvlan_dev *vlan)
 	mutex_unlock(&minor_lock);
 }
 
-static struct net_device *dev_get_by_macvtap_minor(int minor)
+static struct net_device *dev_get_by_tap_minor(int minor)
 {
 	struct net_device *dev = NULL;
 	struct macvlan_dev *vlan;
@@ -450,7 +450,7 @@ static struct net_device *dev_get_by_macvtap_minor(int minor)
 	return dev;
 }
 
-static void macvtap_sock_write_space(struct sock *sk)
+static void tap_sock_write_space(struct sock *sk)
 {
 	wait_queue_head_t *wqueue;
 
@@ -463,28 +463,28 @@ static void macvtap_sock_write_space(struct sock *sk)
 		wake_up_interruptible_poll(wqueue, POLLOUT | POLLWRNORM | POLLWRBAND);
 }
 
-static void macvtap_sock_destruct(struct sock *sk)
+static void tap_sock_destruct(struct sock *sk)
 {
-	struct macvtap_queue *q = container_of(sk, struct macvtap_queue, sk);
+	struct tap_queue *q = container_of(sk, struct tap_queue, sk);
 
 	skb_array_cleanup(&q->skb_array);
 }
 
-static int macvtap_open(struct inode *inode, struct file *file)
+static int tap_open(struct inode *inode, struct file *file)
 {
 	struct net *net = current->nsproxy->net_ns;
 	struct net_device *dev;
-	struct macvtap_queue *q;
+	struct tap_queue *q;
 	int err = -ENODEV;
 
 	rtnl_lock();
-	dev = dev_get_by_macvtap_minor(iminor(inode));
+	dev = dev_get_by_tap_minor(iminor(inode));
 	if (!dev)
 		goto err;
 
 	err = -ENOMEM;
-	q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
-					     &macvtap_proto, 0);
+	q = (struct tap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
+					     &tap_proto, 0);
 	if (!q)
 		goto err;
 
@@ -493,15 +493,15 @@ static int macvtap_open(struct inode *inode, struct file *file)
 	q->sock.type = SOCK_RAW;
 	q->sock.state = SS_CONNECTED;
 	q->sock.file = file;
-	q->sock.ops = &macvtap_socket_ops;
+	q->sock.ops = &tap_socket_ops;
 	sock_init_data(&q->sock, &q->sk);
-	q->sk.sk_write_space = macvtap_sock_write_space;
-	q->sk.sk_destruct = macvtap_sock_destruct;
+	q->sk.sk_write_space = tap_sock_write_space;
+	q->sk.sk_destruct = tap_sock_destruct;
 	q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
 	q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
 
 	/*
-	 * so far only KVM virtio_net uses macvtap, enable zero copy between
+	 * so far only KVM virtio_net uses tap, enable zero copy between
 	 * guest kernel and host kernel when lower device supports zerocopy
 	 *
 	 * The macvlan supports zerocopy iff the lower device supports zero
@@ -514,7 +514,7 @@ static int macvtap_open(struct inode *inode, struct file *file)
 	if (skb_array_init(&q->skb_array, dev->tx_queue_len, GFP_KERNEL))
 		goto err_array;
 
-	err = macvtap_set_queue(dev, file, q);
+	err = tap_set_queue(dev, file, q);
 	if (err)
 		goto err_queue;
 
@@ -535,16 +535,16 @@ err:
 	return err;
 }
 
-static int macvtap_release(struct inode *inode, struct file *file)
+static int tap_release(struct inode *inode, struct file *file)
 {
-	struct macvtap_queue *q = file->private_data;
-	macvtap_put_queue(q);
+	struct tap_queue *q = file->private_data;
+	tap_put_queue(q);
 	return 0;
 }
 
-static unsigned int macvtap_poll(struct file *file, poll_table * wait)
+static unsigned int tap_poll(struct file *file, poll_table *wait)
 {
-	struct macvtap_queue *q = file->private_data;
+	struct tap_queue *q = file->private_data;
 	unsigned int mask = POLLERR;
 
 	if (!q)
@@ -565,8 +565,8 @@ out:
 	return mask;
 }
 
-static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad,
-						size_t len, size_t linear,
+static inline struct sk_buff *tap_alloc_skb(struct sock *sk, size_t prepad,
+					    size_t len, size_t linear,
 						int noblock, int *err)
 {
 	struct sk_buff *skb;
@@ -589,13 +589,13 @@ static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad,
 }
 
 /* Neighbour code has some assumptions on HH_DATA_MOD alignment */
-#define MACVTAP_RESERVE HH_DATA_OFF(ETH_HLEN)
+#define TAP_RESERVE HH_DATA_OFF(ETH_HLEN)
 
 /* Get packet from user space buffer */
-static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
-				struct iov_iter *from, int noblock)
+static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
+			    struct iov_iter *from, int noblock)
 {
-	int good_linear = SKB_MAX_HEAD(MACVTAP_RESERVE);
+	int good_linear = SKB_MAX_HEAD(TAP_RESERVE);
 	struct sk_buff *skb;
 	struct macvlan_dev *vlan;
 	unsigned long total_len = iov_iter_count(from);
@@ -621,14 +621,14 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
 			goto err;
 		iov_iter_advance(from, vnet_hdr_len - sizeof(vnet_hdr));
 		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
-		     macvtap16_to_cpu(q, vnet_hdr.csum_start) +
-		     macvtap16_to_cpu(q, vnet_hdr.csum_offset) + 2 >
-			     macvtap16_to_cpu(q, vnet_hdr.hdr_len))
-			vnet_hdr.hdr_len = cpu_to_macvtap16(q,
-				 macvtap16_to_cpu(q, vnet_hdr.csum_start) +
-				 macvtap16_to_cpu(q, vnet_hdr.csum_offset) + 2);
+		     tap16_to_cpu(q, vnet_hdr.csum_start) +
+		     tap16_to_cpu(q, vnet_hdr.csum_offset) + 2 >
+			     tap16_to_cpu(q, vnet_hdr.hdr_len))
+			vnet_hdr.hdr_len = cpu_to_tap16(q,
+				 tap16_to_cpu(q, vnet_hdr.csum_start) +
+				 tap16_to_cpu(q, vnet_hdr.csum_offset) + 2);
 		err = -EINVAL;
-		if (macvtap16_to_cpu(q, vnet_hdr.hdr_len) > len)
+		if (tap16_to_cpu(q, vnet_hdr.hdr_len) > len)
 			goto err;
 	}
 
@@ -640,7 +640,7 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
 		struct iov_iter i;
 
 		copylen = vnet_hdr.hdr_len ?
-			macvtap16_to_cpu(q, vnet_hdr.hdr_len) : GOODCOPY_LEN;
+			tap16_to_cpu(q, vnet_hdr.hdr_len) : GOODCOPY_LEN;
 		if (copylen > good_linear)
 			copylen = good_linear;
 		else if (copylen < ETH_HLEN)
@@ -654,15 +654,15 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
 
 	if (!zerocopy) {
 		copylen = len;
-		linear = macvtap16_to_cpu(q, vnet_hdr.hdr_len);
+		linear = tap16_to_cpu(q, vnet_hdr.hdr_len);
 		if (linear > good_linear)
 			linear = good_linear;
 		else if (linear < ETH_HLEN)
 			linear = ETH_HLEN;
 	}
 
-	skb = macvtap_alloc_skb(&q->sk, MACVTAP_RESERVE, copylen,
-				linear, noblock, &err);
+	skb = tap_alloc_skb(&q->sk, TAP_RESERVE, copylen,
+			    linear, noblock, &err);
 	if (!skb)
 		goto err;
 
@@ -680,7 +680,7 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
 
 	if (vnet_hdr_len) {
 		err = virtio_net_hdr_to_skb(skb, &vnet_hdr,
-					    macvtap_is_little_endian(q));
+					    tap_is_little_endian(q));
 		if (err)
 			goto err_kfree;
 	}
@@ -728,18 +728,18 @@ err:
 	return err;
 }
 
-static ssize_t macvtap_write_iter(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t tap_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
-	struct macvtap_queue *q = file->private_data;
+	struct tap_queue *q = file->private_data;
 
-	return macvtap_get_user(q, NULL, from, file->f_flags & O_NONBLOCK);
+	return tap_get_user(q, NULL, from, file->f_flags & O_NONBLOCK);
 }
 
 /* Put packet to the user space buffer */
-static ssize_t macvtap_put_user(struct macvtap_queue *q,
-				const struct sk_buff *skb,
-				struct iov_iter *iter)
+static ssize_t tap_put_user(struct tap_queue *q,
+			    const struct sk_buff *skb,
+			    struct iov_iter *iter)
 {
 	int ret;
 	int vnet_hdr_len = 0;
@@ -753,7 +753,7 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
 			return -EINVAL;
 
 		if (virtio_net_hdr_from_skb(skb, &vnet_hdr,
-					    macvtap_is_little_endian(q), true))
+					    tap_is_little_endian(q), true))
 			BUG();
 
 		if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter) !=
@@ -792,9 +792,9 @@ done:
 	return ret ? ret : total;
 }
 
-static ssize_t macvtap_do_read(struct macvtap_queue *q,
-			       struct iov_iter *to,
-			       int noblock)
+static ssize_t tap_do_read(struct tap_queue *q,
+			   struct iov_iter *to,
+			   int noblock)
 {
 	DEFINE_WAIT(wait);
 	struct sk_buff *skb;
@@ -827,7 +827,7 @@ static ssize_t macvtap_do_read(struct macvtap_queue *q,
 		finish_wait(sk_sleep(&q->sk), &wait);
 
 	if (skb) {
-		ret = macvtap_put_user(q, skb, to);
+		ret = tap_put_user(q, skb, to);
 		if (unlikely(ret < 0))
 			kfree_skb(skb);
 		else
@@ -836,20 +836,20 @@ static ssize_t macvtap_do_read(struct macvtap_queue *q,
 	return ret;
 }
 
-static ssize_t macvtap_read_iter(struct kiocb *iocb, struct iov_iter *to)
+static ssize_t tap_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
 	struct file *file = iocb->ki_filp;
-	struct macvtap_queue *q = file->private_data;
+	struct tap_queue *q = file->private_data;
 	ssize_t len = iov_iter_count(to), ret;
 
-	ret = macvtap_do_read(q, to, file->f_flags & O_NONBLOCK);
+	ret = tap_do_read(q, to, file->f_flags & O_NONBLOCK);
 	ret = min_t(ssize_t, ret, len);
 	if (ret > 0)
 		iocb->ki_pos = ret;
 	return ret;
 }
 
-static struct macvlan_dev *macvtap_get_vlan(struct macvtap_queue *q)
+static struct macvlan_dev *tap_get_vlan(struct tap_queue *q)
 {
 	struct macvlan_dev *vlan;
 
@@ -861,33 +861,33 @@ static struct macvlan_dev *macvtap_get_vlan(struct macvtap_queue *q)
 	return vlan;
 }
 
-static void macvtap_put_vlan(struct macvlan_dev *vlan)
+static void tap_put_vlan(struct macvlan_dev *vlan)
 {
 	dev_put(vlan->dev);
 }
 
-static int macvtap_ioctl_set_queue(struct file *file, unsigned int flags)
+static int tap_ioctl_set_queue(struct file *file, unsigned int flags)
 {
-	struct macvtap_queue *q = file->private_data;
+	struct tap_queue *q = file->private_data;
 	struct macvlan_dev *vlan;
 	int ret;
 
-	vlan = macvtap_get_vlan(q);
+	vlan = tap_get_vlan(q);
 	if (!vlan)
 		return -EINVAL;
 
 	if (flags & IFF_ATTACH_QUEUE)
-		ret = macvtap_enable_queue(vlan->dev, file, q);
+		ret = tap_enable_queue(vlan->dev, file, q);
 	else if (flags & IFF_DETACH_QUEUE)
-		ret = macvtap_disable_queue(q);
+		ret = tap_disable_queue(q);
 	else
 		ret = -EINVAL;
 
-	macvtap_put_vlan(vlan);
+	tap_put_vlan(vlan);
 	return ret;
 }
 
-static int set_offload(struct macvtap_queue *q, unsigned long arg)
+static int set_offload(struct tap_queue *q, unsigned long arg)
 {
 	struct macvlan_dev *vlan;
 	netdev_features_t features;
@@ -919,7 +919,7 @@ static int set_offload(struct macvtap_queue *q, unsigned long arg)
 	 * setting the TSO bit means that the userspace wants to
 	 * accept TSO frames and turning it off means that user space
 	 * does not support TSO.
-	 * For macvtap, we have to invert it to mean the same thing.
+	 * For tap, we have to invert it to mean the same thing.
 	 * When user space turns off TSO, we turn off GSO/LRO so that
 	 * user-space will not receive TSO frames.
 	 */
@@ -941,10 +941,10 @@ static int set_offload(struct macvtap_queue *q, unsigned long arg)
 /*
  * provide compatibility with generic tun/tap interface
  */
-static long macvtap_ioctl(struct file *file, unsigned int cmd,
-			  unsigned long arg)
+static long tap_ioctl(struct file *file, unsigned int cmd,
+		      unsigned long arg)
 {
-	struct macvtap_queue *q = file->private_data;
+	struct tap_queue *q = file->private_data;
 	struct macvlan_dev *vlan;
 	void __user *argp = (void __user *)arg;
 	struct ifreq __user *ifr = argp;
@@ -962,16 +962,16 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
 			return -EFAULT;
 
 		ret = 0;
-		if ((u & ~MACVTAP_FEATURES) != (IFF_NO_PI | IFF_TAP))
+		if ((u & ~TAP_IFFEATURES) != (IFF_NO_PI | IFF_TAP))
 			ret = -EINVAL;
 		else
-			q->flags = (q->flags & ~MACVTAP_FEATURES) | u;
+			q->flags = (q->flags & ~TAP_IFFEATURES) | u;
 
 		return ret;
 
 	case TUNGETIFF:
 		rtnl_lock();
-		vlan = macvtap_get_vlan(q);
+		vlan = tap_get_vlan(q);
 		if (!vlan) {
 			rtnl_unlock();
 			return -ENOLINK;
@@ -982,7 +982,7 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
 		if (copy_to_user(&ifr->ifr_name, vlan->dev->name, IFNAMSIZ) ||
 		    put_user(u, &ifr->ifr_flags))
 			ret = -EFAULT;
-		macvtap_put_vlan(vlan);
+		tap_put_vlan(vlan);
 		rtnl_unlock();
 		return ret;
 
@@ -990,12 +990,12 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
 		if (get_user(u, &ifr->ifr_flags))
 			return -EFAULT;
 		rtnl_lock();
-		ret = macvtap_ioctl_set_queue(file, u);
+		ret = tap_ioctl_set_queue(file, u);
 		rtnl_unlock();
 		return ret;
 
 	case TUNGETFEATURES:
-		if (put_user(IFF_TAP | IFF_NO_PI | MACVTAP_FEATURES, up))
+		if (put_user(IFF_TAP | IFF_NO_PI | TAP_IFFEATURES, up))
 			return -EFAULT;
 		return 0;
 
@@ -1022,7 +1022,7 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
 		return 0;
 
 	case TUNGETVNETLE:
-		s = !!(q->flags & MACVTAP_VNET_LE);
+		s = !!(q->flags & TAP_VNET_LE);
 		if (put_user(s, sp))
 			return -EFAULT;
 		return 0;
@@ -1031,16 +1031,16 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
 		if (get_user(s, sp))
 			return -EFAULT;
 		if (s)
-			q->flags |= MACVTAP_VNET_LE;
+			q->flags |= TAP_VNET_LE;
 		else
-			q->flags &= ~MACVTAP_VNET_LE;
+			q->flags &= ~TAP_VNET_LE;
 		return 0;
 
 	case TUNGETVNETBE:
-		return macvtap_get_vnet_be(q, sp);
+		return tap_get_vnet_be(q, sp);
 
 	case TUNSETVNETBE:
-		return macvtap_set_vnet_be(q, sp);
+		return tap_set_vnet_be(q, sp);
 
 	case TUNSETOFFLOAD:
 		/* let the user check for future flags */
@@ -1055,7 +1055,7 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
 
 	case SIOCGIFHWADDR:
 		rtnl_lock();
-		vlan = macvtap_get_vlan(q);
+		vlan = tap_get_vlan(q);
 		if (!vlan) {
 			rtnl_unlock();
 			return -ENOLINK;
@@ -1066,7 +1066,7 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
 		    copy_to_user(&ifr->ifr_hwaddr.sa_data, vlan->dev->dev_addr, ETH_ALEN) ||
 		    put_user(u, &ifr->ifr_hwaddr.sa_family))
 			ret = -EFAULT;
-		macvtap_put_vlan(vlan);
+		tap_put_vlan(vlan);
 		rtnl_unlock();
 		return ret;
 
@@ -1074,13 +1074,13 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
 		if (copy_from_user(&sa, &ifr->ifr_hwaddr, sizeof(sa)))
 			return -EFAULT;
 		rtnl_lock();
-		vlan = macvtap_get_vlan(q);
+		vlan = tap_get_vlan(q);
 		if (!vlan) {
 			rtnl_unlock();
 			return -ENOLINK;
 		}
 		ret = dev_set_mac_address(vlan->dev, &sa);
-		macvtap_put_vlan(vlan);
+		tap_put_vlan(vlan);
 		rtnl_unlock();
 		return ret;
 
@@ -1090,42 +1090,42 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
 }
 
 #ifdef CONFIG_COMPAT
-static long macvtap_compat_ioctl(struct file *file, unsigned int cmd,
-				 unsigned long arg)
+static long tap_compat_ioctl(struct file *file, unsigned int cmd,
+			     unsigned long arg)
 {
-	return macvtap_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
+	return tap_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
 }
 #endif
 
-const struct file_operations macvtap_fops = {
+const struct file_operations tap_fops = {
 	.owner		= THIS_MODULE,
-	.open		= macvtap_open,
-	.release	= macvtap_release,
-	.read_iter	= macvtap_read_iter,
-	.write_iter	= macvtap_write_iter,
-	.poll		= macvtap_poll,
+	.open		= tap_open,
+	.release	= tap_release,
+	.read_iter	= tap_read_iter,
+	.write_iter	= tap_write_iter,
+	.poll		= tap_poll,
 	.llseek		= no_llseek,
-	.unlocked_ioctl	= macvtap_ioctl,
+	.unlocked_ioctl	= tap_ioctl,
 #ifdef CONFIG_COMPAT
-	.compat_ioctl	= macvtap_compat_ioctl,
+	.compat_ioctl	= tap_compat_ioctl,
 #endif
 };
 
-static int macvtap_sendmsg(struct socket *sock, struct msghdr *m,
-			   size_t total_len)
+static int tap_sendmsg(struct socket *sock, struct msghdr *m,
+		       size_t total_len)
 {
-	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
-	return macvtap_get_user(q, m, &m->msg_iter, m->msg_flags & MSG_DONTWAIT);
+	struct tap_queue *q = container_of(sock, struct tap_queue, sock);
+	return tap_get_user(q, m, &m->msg_iter, m->msg_flags & MSG_DONTWAIT);
 }
 
-static int macvtap_recvmsg(struct socket *sock, struct msghdr *m,
-			   size_t total_len, int flags)
+static int tap_recvmsg(struct socket *sock, struct msghdr *m,
+		       size_t total_len, int flags)
 {
-	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
+	struct tap_queue *q = container_of(sock, struct tap_queue, sock);
 	int ret;
 	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
 		return -EINVAL;
-	ret = macvtap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT);
+	ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT);
 	if (ret > total_len) {
 		m->msg_flags |= MSG_TRUNC;
 		ret = flags & MSG_TRUNC ? ret : total_len;
@@ -1133,40 +1133,40 @@ static int macvtap_recvmsg(struct socket *sock, struct msghdr *m,
 	return ret;
 }
 
-static int macvtap_peek_len(struct socket *sock)
+static int tap_peek_len(struct socket *sock)
 {
-	struct macvtap_queue *q = container_of(sock, struct macvtap_queue,
+	struct tap_queue *q = container_of(sock, struct tap_queue,
 					       sock);
 	return skb_array_peek_len(&q->skb_array);
 }
 
 /* Ops structure to mimic raw sockets with tun */
-static const struct proto_ops macvtap_socket_ops = {
-	.sendmsg = macvtap_sendmsg,
-	.recvmsg = macvtap_recvmsg,
-	.peek_len = macvtap_peek_len,
+static const struct proto_ops tap_socket_ops = {
+	.sendmsg = tap_sendmsg,
+	.recvmsg = tap_recvmsg,
+	.peek_len = tap_peek_len,
 };
 
 /* Get an underlying socket object from tun file.  Returns error unless file is
  * attached to a device.  The returned object works like a packet socket, it
  * can be used for sock_sendmsg/sock_recvmsg.  The caller is responsible for
  * holding a reference to the file for as long as the socket is in use. */
-struct socket *macvtap_get_socket(struct file *file)
+struct socket *tap_get_socket(struct file *file)
 {
-	struct macvtap_queue *q;
-	if (file->f_op != &macvtap_fops)
+	struct tap_queue *q;
+	if (file->f_op != &tap_fops)
 		return ERR_PTR(-EINVAL);
 	q = file->private_data;
 	if (!q)
 		return ERR_PTR(-EBADFD);
 	return &q->sock;
 }
-EXPORT_SYMBOL_GPL(macvtap_get_socket);
+EXPORT_SYMBOL_GPL(tap_get_socket);
 
-int macvtap_queue_resize(struct macvlan_dev *vlan)
+int tap_queue_resize(struct macvlan_dev *vlan)
 {
 	struct net_device *dev = vlan->dev;
-	struct macvtap_queue *q;
+	struct tap_queue *q;
 	struct skb_array **arrays;
 	int n = vlan->numqueues;
 	int ret, i = 0;
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index c42e9c305134..2fe35354f20e 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -24,6 +24,7 @@
 #include <linux/if_arp.h>
 #include <linux/if_tun.h>
 #include <linux/if_macvlan.h>
+#include <linux/if_tap.h>
 #include <linux/if_vlan.h>
 
 #include <net/sock.h>
@@ -960,7 +961,7 @@ static struct socket *get_tap_socket(int fd)
 	sock = tun_get_socket(file);
 	if (!IS_ERR(sock))
 		return sock;
-	sock = macvtap_get_socket(file);
+	sock = tap_get_socket(file);
 	if (IS_ERR(sock))
 		fput(file);
 	return sock;
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index a4ccc3122f93..c9ec1343d187 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -9,19 +9,6 @@
 #include <net/netlink.h>
 #include <linux/u64_stats_sync.h>
 
-#if IS_ENABLED(CONFIG_MACVTAP)
-struct socket *macvtap_get_socket(struct file *);
-#else
-#include <linux/err.h>
-#include <linux/errno.h>
-struct file;
-struct socket;
-static inline struct socket *macvtap_get_socket(struct file *f)
-{
-	return ERR_PTR(-EINVAL);
-}
-#endif /* CONFIG_MACVTAP */
-
 struct macvlan_port;
 struct macvtap_queue;
 
@@ -29,7 +16,7 @@ struct macvtap_queue;
  * Maximum times a macvtap device can be opened. This can be used to
  * configure the number of receive queue, e.g. for multiqueue virtio.
  */
-#define MAX_MACVTAP_QUEUES	256
+#define MAX_TAP_QUEUES	256
 
 #define MACVLAN_MC_FILTER_BITS	8
 #define MACVLAN_MC_FILTER_SZ	(1 << MACVLAN_MC_FILTER_BITS)
@@ -49,7 +36,7 @@ struct macvlan_dev {
 	enum macvlan_mode	mode;
 	u16			flags;
 	/* This array tracks active taps. */
-	struct macvtap_queue	__rcu *taps[MAX_MACVTAP_QUEUES];
+	struct tap_queue	__rcu *taps[MAX_TAP_QUEUES];
 	/* This list tracks all taps (both enabled and disabled) */
 	struct list_head	queue_list;
 	int			numvtaps;
diff --git a/include/linux/if_macvtap.h b/include/linux/if_macvtap.h
deleted file mode 100644
index c9bf84b75b27..000000000000
--- a/include/linux/if_macvtap.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef _LINUX_IF_MACVTAP_H_
-#define _LINUX_IF_MACVTAP_H_
-
-rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb);
-void macvtap_del_queues(struct net_device *dev);
-int macvtap_get_minor(struct macvlan_dev *vlan);
-void macvtap_free_minor(struct macvlan_dev *vlan);
-int macvtap_queue_resize(struct macvlan_dev *vlan);
-
-#endif /*_LINUX_IF_MACVTAP_H_*/
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
new file mode 100644
index 000000000000..97d27b8ebd55
--- /dev/null
+++ b/include/linux/if_tap.h
@@ -0,0 +1,23 @@
+#ifndef _LINUX_IF_TAP_H_
+#define _LINUX_IF_TAP_H_
+
+#if IS_ENABLED(CONFIG_MACVTAP)
+struct socket *tap_get_socket(struct file *);
+#else
+#include <linux/err.h>
+#include <linux/errno.h>
+struct file;
+struct socket;
+static inline struct socket *tap_get_socket(struct file *f)
+{
+	return ERR_PTR(-EINVAL);
+}
+#endif /* CONFIG_MACVTAP */
+
+rx_handler_result_t tap_handle_frame(struct sk_buff **pskb);
+void tap_del_queues(struct net_device *dev);
+int tap_get_minor(struct macvlan_dev *vlan);
+void tap_free_minor(struct macvlan_dev *vlan);
+int tap_queue_resize(struct macvlan_dev *vlan);
+
+#endif /*_LINUX_IF_TAP_H_*/
-- 
cgit v1.2.3


From ebc05ba7e8600b52a2a0c87a43105143368aca2a Mon Sep 17 00:00:00 2001
From: Sainath Grandhi <sainath.grandhi@intel.com>
Date: Fri, 10 Feb 2017 16:03:48 -0800
Subject: tap: Tap character device creation/destroy API

This patch provides tap device create/destroy APIs in tap.c.

Signed-off-by: Sainath Grandhi <sainath.grandhi@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macvtap_main.c | 30 +++++++---------------
 drivers/net/tap.c          | 62 ++++++++++++++++++++++++++++++++++++++--------
 include/linux/if_tap.h     |  3 +++
 3 files changed, 63 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/macvtap_main.c b/drivers/net/macvtap_main.c
index 548f339a75bd..215ab7abae89 100644
--- a/drivers/net/macvtap_main.c
+++ b/drivers/net/macvtap_main.c
@@ -28,7 +28,6 @@
  * Variables for dealing with macvtaps device numbers.
  */
 static dev_t macvtap_major;
-#define MACVTAP_NUM_DEVS (1U << MINORBITS)
 
 static const void *macvtap_net_namespace(struct device *d)
 {
@@ -159,57 +158,46 @@ static struct notifier_block macvtap_notifier_block __read_mostly = {
 	.notifier_call	= macvtap_device_event,
 };
 
-extern struct file_operations tap_fops;
 static int macvtap_init(void)
 {
 	int err;
 
-	err = alloc_chrdev_region(&macvtap_major, 0,
-				MACVTAP_NUM_DEVS, "macvtap");
-	if (err)
-		goto out1;
+	err = tap_create_cdev(&macvtap_cdev, &macvtap_major, "macvtap");
 
-	cdev_init(&macvtap_cdev, &tap_fops);
-	err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS);
 	if (err)
-		goto out2;
+		goto out1;
 
 	err = class_register(&macvtap_class);
 	if (err)
-		goto out3;
+		goto out2;
 
 	err = register_netdevice_notifier(&macvtap_notifier_block);
 	if (err)
-		goto out4;
+		goto out3;
 
 	err = macvlan_link_register(&macvtap_link_ops);
 	if (err)
-		goto out5;
+		goto out4;
 
 	return 0;
 
-out5:
-	unregister_netdevice_notifier(&macvtap_notifier_block);
 out4:
-	class_unregister(&macvtap_class);
+	unregister_netdevice_notifier(&macvtap_notifier_block);
 out3:
-	cdev_del(&macvtap_cdev);
+	class_unregister(&macvtap_class);
 out2:
-	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
+	tap_destroy_cdev(macvtap_major, &macvtap_cdev);
 out1:
 	return err;
 }
 module_init(macvtap_init);
 
-extern struct idr minor_idr;
 static void macvtap_exit(void)
 {
 	rtnl_link_unregister(&macvtap_link_ops);
 	unregister_netdevice_notifier(&macvtap_notifier_block);
 	class_unregister(&macvtap_class);
-	cdev_del(&macvtap_cdev);
-	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
-	idr_destroy(&minor_idr);
+	tap_destroy_cdev(macvtap_major, &macvtap_cdev);
 }
 module_exit(macvtap_exit);
 
diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 15ca2d531d05..04ba9782c2f3 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -123,8 +123,12 @@ static struct proto tap_proto = {
 };
 
 #define TAP_NUM_DEVS (1U << MINORBITS)
-static DEFINE_MUTEX(minor_lock);
-DEFINE_IDR(minor_idr);
+struct major_info {
+	dev_t major;
+	struct idr minor_idr;
+	struct mutex minor_lock;
+	const char *device_name;
+} macvtap_major;
 
 #define GOODCOPY_LEN 128
 
@@ -413,26 +417,26 @@ int tap_get_minor(struct macvlan_dev *vlan)
 {
 	int retval = -ENOMEM;
 
-	mutex_lock(&minor_lock);
-	retval = idr_alloc(&minor_idr, vlan, 1, TAP_NUM_DEVS, GFP_KERNEL);
+	mutex_lock(&macvtap_major.minor_lock);
+	retval = idr_alloc(&macvtap_major.minor_idr, vlan, 1, TAP_NUM_DEVS, GFP_KERNEL);
 	if (retval >= 0) {
 		vlan->minor = retval;
 	} else if (retval == -ENOSPC) {
 		netdev_err(vlan->dev, "Too many tap devices\n");
 		retval = -EINVAL;
 	}
-	mutex_unlock(&minor_lock);
+	mutex_unlock(&macvtap_major.minor_lock);
 	return retval < 0 ? retval : 0;
 }
 
 void tap_free_minor(struct macvlan_dev *vlan)
 {
-	mutex_lock(&minor_lock);
+	mutex_lock(&macvtap_major.minor_lock);
 	if (vlan->minor) {
-		idr_remove(&minor_idr, vlan->minor);
+		idr_remove(&macvtap_major.minor_idr, vlan->minor);
 		vlan->minor = 0;
 	}
-	mutex_unlock(&minor_lock);
+	mutex_unlock(&macvtap_major.minor_lock);
 }
 
 static struct net_device *dev_get_by_tap_minor(int minor)
@@ -440,13 +444,13 @@ static struct net_device *dev_get_by_tap_minor(int minor)
 	struct net_device *dev = NULL;
 	struct macvlan_dev *vlan;
 
-	mutex_lock(&minor_lock);
-	vlan = idr_find(&minor_idr, minor);
+	mutex_lock(&macvtap_major.minor_lock);
+	vlan = idr_find(&macvtap_major.minor_idr, minor);
 	if (vlan) {
 		dev = vlan->dev;
 		dev_hold(dev);
 	}
-	mutex_unlock(&minor_lock);
+	mutex_unlock(&macvtap_major.minor_lock);
 	return dev;
 }
 
@@ -1184,3 +1188,39 @@ int tap_queue_resize(struct macvlan_dev *vlan)
 	kfree(arrays);
 	return ret;
 }
+
+int tap_create_cdev(struct cdev *tap_cdev,
+		    dev_t *tap_major, const char *device_name)
+{
+	int err;
+
+	err = alloc_chrdev_region(tap_major, 0, TAP_NUM_DEVS, device_name);
+	if (err)
+		goto out1;
+
+	cdev_init(tap_cdev, &tap_fops);
+	err = cdev_add(tap_cdev, *tap_major, TAP_NUM_DEVS);
+	if (err)
+		goto out2;
+
+	macvtap_major.major = MAJOR(*tap_major);
+
+	idr_init(&macvtap_major.minor_idr);
+	mutex_init(&macvtap_major.minor_lock);
+
+	macvtap_major.device_name = device_name;
+
+	return 0;
+
+out2:
+	unregister_chrdev_region(*tap_major, TAP_NUM_DEVS);
+out1:
+	return err;
+}
+
+void tap_destroy_cdev(dev_t major, struct cdev *tap_cdev)
+{
+	cdev_del(tap_cdev);
+	unregister_chrdev_region(major, TAP_NUM_DEVS);
+	idr_destroy(&macvtap_major.minor_idr);
+}
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index 97d27b8ebd55..a2dfd9063a6c 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -19,5 +19,8 @@ void tap_del_queues(struct net_device *dev);
 int tap_get_minor(struct macvlan_dev *vlan);
 void tap_free_minor(struct macvlan_dev *vlan);
 int tap_queue_resize(struct macvlan_dev *vlan);
+int tap_create_cdev(struct cdev *tap_cdev,
+		    dev_t *tap_major, const char *device_name);
+void tap_destroy_cdev(dev_t major, struct cdev *tap_cdev);
 
 #endif /*_LINUX_IF_TAP_H_*/
-- 
cgit v1.2.3


From 6fe3faf86757eb7f078ff06b23b206f17dc4fb36 Mon Sep 17 00:00:00 2001
From: Sainath Grandhi <sainath.grandhi@intel.com>
Date: Fri, 10 Feb 2017 16:03:49 -0800
Subject: tap: Abstract type of virtual interface from tap implementation

macvlan object is re-structured to hold tap related elements in a separate
entity, tap_dev. Upon NETDEV_REGISTER device_event, tap_dev is registered with
idr and fetched again on tap_open. Few of the tap functions are modified to
accepted tap_dev as argument. tap_dev object includes callbacks to be used by
underlying virtual interface to take care of tx and rx accounting.

Signed-off-by: Sainath Grandhi <sainath.grandhi@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macvlan.c      |   2 +-
 drivers/net/macvtap_main.c |  71 +++++++++---
 drivers/net/tap.c          | 264 ++++++++++++++++++++-------------------------
 include/linux/if_tap.h     |  57 +++++++++-
 4 files changed, 229 insertions(+), 165 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index cbfc1be23a0e..9261722960a7 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -1525,7 +1525,6 @@ static const struct nla_policy macvlan_policy[IFLA_MACVLAN_MAX + 1] = {
 int macvlan_link_register(struct rtnl_link_ops *ops)
 {
 	/* common fields */
-	ops->priv_size		= sizeof(struct macvlan_dev);
 	ops->validate		= macvlan_validate;
 	ops->maxtype		= IFLA_MACVLAN_MAX;
 	ops->policy		= macvlan_policy;
@@ -1548,6 +1547,7 @@ static struct rtnl_link_ops macvlan_link_ops = {
 	.newlink	= macvlan_newlink,
 	.dellink	= macvlan_dellink,
 	.get_link_net	= macvlan_get_link_net,
+	.priv_size      = sizeof(struct macvlan_dev),
 };
 
 static int macvlan_device_event(struct notifier_block *unused,
diff --git a/drivers/net/macvtap_main.c b/drivers/net/macvtap_main.c
index 215ab7abae89..0238df62bf45 100644
--- a/drivers/net/macvtap_main.c
+++ b/drivers/net/macvtap_main.c
@@ -24,6 +24,11 @@
 #include <linux/virtio_net.h>
 #include <linux/skb_array.h>
 
+struct macvtap_dev {
+	struct macvlan_dev vlan;
+	struct tap_dev    tap;
+};
+
 /*
  * Variables for dealing with macvtaps device numbers.
  */
@@ -46,22 +51,55 @@ static struct cdev macvtap_cdev;
 #define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \
 		      NETIF_F_TSO6 | NETIF_F_UFO)
 
+static void macvtap_count_tx_dropped(struct tap_dev *tap)
+{
+	struct macvtap_dev *vlantap = container_of(tap, struct macvtap_dev, tap);
+	struct macvlan_dev *vlan = &vlantap->vlan;
+
+	this_cpu_inc(vlan->pcpu_stats->tx_dropped);
+}
+
+static void macvtap_count_rx_dropped(struct tap_dev *tap)
+{
+	struct macvtap_dev *vlantap = container_of(tap, struct macvtap_dev, tap);
+	struct macvlan_dev *vlan = &vlantap->vlan;
+
+	macvlan_count_rx(vlan, 0, 0, 0);
+}
+
+static void macvtap_update_features(struct tap_dev *tap,
+				    netdev_features_t features)
+{
+	struct macvtap_dev *vlantap = container_of(tap, struct macvtap_dev, tap);
+	struct macvlan_dev *vlan = &vlantap->vlan;
+
+	vlan->set_features = features;
+	netdev_update_features(vlan->dev);
+}
+
 static int macvtap_newlink(struct net *src_net,
 			   struct net_device *dev,
 			   struct nlattr *tb[],
 			   struct nlattr *data[])
 {
-	struct macvlan_dev *vlan = netdev_priv(dev);
+	struct macvtap_dev *vlantap = netdev_priv(dev);
 	int err;
 
-	INIT_LIST_HEAD(&vlan->queue_list);
+	INIT_LIST_HEAD(&vlantap->tap.queue_list);
 
 	/* Since macvlan supports all offloads by default, make
 	 * tap support all offloads also.
 	 */
-	vlan->tap_features = TUN_OFFLOADS;
+	vlantap->tap.tap_features = TUN_OFFLOADS;
 
-	err = netdev_rx_handler_register(dev, tap_handle_frame, vlan);
+	/* Register callbacks for rx/tx drops accounting and updating
+	 * net_device features
+	 */
+	vlantap->tap.count_tx_dropped = macvtap_count_tx_dropped;
+	vlantap->tap.count_rx_dropped = macvtap_count_rx_dropped;
+	vlantap->tap.update_features  = macvtap_update_features;
+
+	err = netdev_rx_handler_register(dev, tap_handle_frame, &vlantap->tap);
 	if (err)
 		return err;
 
@@ -74,14 +112,18 @@ static int macvtap_newlink(struct net *src_net,
 		return err;
 	}
 
+	vlantap->tap.dev = vlantap->vlan.dev;
+
 	return 0;
 }
 
 static void macvtap_dellink(struct net_device *dev,
 			    struct list_head *head)
 {
+	struct macvtap_dev *vlantap = netdev_priv(dev);
+
 	netdev_rx_handler_unregister(dev);
-	tap_del_queues(dev);
+	tap_del_queues(&vlantap->tap);
 	macvlan_dellink(dev, head);
 }
 
@@ -96,13 +138,14 @@ static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
 	.setup		= macvtap_setup,
 	.newlink	= macvtap_newlink,
 	.dellink	= macvtap_dellink,
+	.priv_size      = sizeof(struct macvtap_dev),
 };
 
 static int macvtap_device_event(struct notifier_block *unused,
 				unsigned long event, void *ptr)
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-	struct macvlan_dev *vlan;
+	struct macvtap_dev *vlantap;
 	struct device *classdev;
 	dev_t devt;
 	int err;
@@ -112,7 +155,7 @@ static int macvtap_device_event(struct notifier_block *unused,
 		return NOTIFY_DONE;
 
 	snprintf(tap_name, IFNAMSIZ, "tap%d", dev->ifindex);
-	vlan = netdev_priv(dev);
+	vlantap = netdev_priv(dev);
 
 	switch (event) {
 	case NETDEV_REGISTER:
@@ -120,15 +163,15 @@ static int macvtap_device_event(struct notifier_block *unused,
 		 * been registered but before register_netdevice has
 		 * finished running.
 		 */
-		err = tap_get_minor(vlan);
+		err = tap_get_minor(&vlantap->tap);
 		if (err)
 			return notifier_from_errno(err);
 
-		devt = MKDEV(MAJOR(macvtap_major), vlan->minor);
+		devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor);
 		classdev = device_create(&macvtap_class, &dev->dev, devt,
 					 dev, tap_name);
 		if (IS_ERR(classdev)) {
-			tap_free_minor(vlan);
+			tap_free_minor(&vlantap->tap);
 			return notifier_from_errno(PTR_ERR(classdev));
 		}
 		err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj,
@@ -138,15 +181,15 @@ static int macvtap_device_event(struct notifier_block *unused,
 		break;
 	case NETDEV_UNREGISTER:
 		/* vlan->minor == 0 if NETDEV_REGISTER above failed */
-		if (vlan->minor == 0)
+		if (vlantap->tap.minor == 0)
 			break;
 		sysfs_remove_link(&dev->dev.kobj, tap_name);
-		devt = MKDEV(MAJOR(macvtap_major), vlan->minor);
+		devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor);
 		device_destroy(&macvtap_class, devt);
-		tap_free_minor(vlan);
+		tap_free_minor(&vlantap->tap);
 		break;
 	case NETDEV_CHANGE_TX_QUEUE_LEN:
-		if (tap_queue_resize(vlan))
+		if (tap_queue_resize(&vlantap->tap))
 			return NOTIFY_BAD;
 		break;
 	}
diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 04ba9782c2f3..7d3e8b18f5e6 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -1,5 +1,5 @@
 #include <linux/etherdevice.h>
-#include <linux/if_macvlan.h>
+#include <linux/if_tap.h>
 #include <linux/if_vlan.h>
 #include <linux/interrupt.h>
 #include <linux/nsproxy.h>
@@ -23,30 +23,6 @@
 #include <linux/virtio_net.h>
 #include <linux/skb_array.h>
 
-/*
- * A tap queue is the central object of this driver, it connects
- * an open character device to a macvlan interface. There can be
- * multiple queues on one interface, which map back to queues
- * implemented in hardware on the underlying device.
- *
- * tap_proto is used to allocate queues through the sock allocation
- * mechanism.
- *
- */
-struct tap_queue {
-	struct sock sk;
-	struct socket sock;
-	struct socket_wq wq;
-	int vnet_hdr_sz;
-	struct macvlan_dev __rcu *vlan;
-	struct file *file;
-	unsigned int flags;
-	u16 queue_index;
-	bool enabled;
-	struct list_head next;
-	struct skb_array skb_array;
-};
-
 #define TAP_IFFEATURES (IFF_VNET_HDR | IFF_MULTI_QUEUE)
 
 #define TAP_VNET_LE 0x80000000
@@ -137,7 +113,7 @@ static const struct proto_ops tap_socket_ops;
 #define RX_OFFLOADS (NETIF_F_GRO | NETIF_F_LRO)
 #define TAP_FEATURES (NETIF_F_GSO | NETIF_F_SG | NETIF_F_FRAGLIST)
 
-static struct macvlan_dev *tap_get_vlan_rcu(const struct net_device *dev)
+static struct tap_dev *tap_dev_get_rcu(const struct net_device *dev)
 {
 	return rcu_dereference(dev->rx_handler_data);
 }
@@ -159,10 +135,9 @@ static struct macvlan_dev *tap_get_vlan_rcu(const struct net_device *dev)
  * when both our references and any pending SKBs are gone.
  */
 
-static int tap_enable_queue(struct net_device *dev, struct file *file,
+static int tap_enable_queue(struct tap_dev *tap, struct file *file,
 			    struct tap_queue *q)
 {
-	struct macvlan_dev *vlan = netdev_priv(dev);
 	int err = -EINVAL;
 
 	ASSERT_RTNL();
@@ -171,62 +146,60 @@ static int tap_enable_queue(struct net_device *dev, struct file *file,
 		goto out;
 
 	err = 0;
-	rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
-	q->queue_index = vlan->numvtaps;
+	rcu_assign_pointer(tap->taps[tap->numvtaps], q);
+	q->queue_index = tap->numvtaps;
 	q->enabled = true;
 
-	vlan->numvtaps++;
+	tap->numvtaps++;
 out:
 	return err;
 }
 
 /* Requires RTNL */
-static int tap_set_queue(struct net_device *dev, struct file *file,
+static int tap_set_queue(struct tap_dev *tap, struct file *file,
 			 struct tap_queue *q)
 {
-	struct macvlan_dev *vlan = netdev_priv(dev);
-
-	if (vlan->numqueues == MAX_TAP_QUEUES)
+	if (tap->numqueues == MAX_TAP_QUEUES)
 		return -EBUSY;
 
-	rcu_assign_pointer(q->vlan, vlan);
-	rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
+	rcu_assign_pointer(q->tap, tap);
+	rcu_assign_pointer(tap->taps[tap->numvtaps], q);
 	sock_hold(&q->sk);
 
 	q->file = file;
-	q->queue_index = vlan->numvtaps;
+	q->queue_index = tap->numvtaps;
 	q->enabled = true;
 	file->private_data = q;
-	list_add_tail(&q->next, &vlan->queue_list);
+	list_add_tail(&q->next, &tap->queue_list);
 
-	vlan->numvtaps++;
-	vlan->numqueues++;
+	tap->numvtaps++;
+	tap->numqueues++;
 
 	return 0;
 }
 
 static int tap_disable_queue(struct tap_queue *q)
 {
-	struct macvlan_dev *vlan;
+	struct tap_dev *tap;
 	struct tap_queue *nq;
 
 	ASSERT_RTNL();
 	if (!q->enabled)
 		return -EINVAL;
 
-	vlan = rtnl_dereference(q->vlan);
+	tap = rtnl_dereference(q->tap);
 
-	if (vlan) {
+	if (tap) {
 		int index = q->queue_index;
-		BUG_ON(index >= vlan->numvtaps);
-		nq = rtnl_dereference(vlan->taps[vlan->numvtaps - 1]);
+		BUG_ON(index >= tap->numvtaps);
+		nq = rtnl_dereference(tap->taps[tap->numvtaps - 1]);
 		nq->queue_index = index;
 
-		rcu_assign_pointer(vlan->taps[index], nq);
-		RCU_INIT_POINTER(vlan->taps[vlan->numvtaps - 1], NULL);
+		rcu_assign_pointer(tap->taps[index], nq);
+		RCU_INIT_POINTER(tap->taps[tap->numvtaps - 1], NULL);
 		q->enabled = false;
 
-		vlan->numvtaps--;
+		tap->numvtaps--;
 	}
 
 	return 0;
@@ -242,17 +215,17 @@ static int tap_disable_queue(struct tap_queue *q)
  */
 static void tap_put_queue(struct tap_queue *q)
 {
-	struct macvlan_dev *vlan;
+	struct tap_dev *tap;
 
 	rtnl_lock();
-	vlan = rtnl_dereference(q->vlan);
+	tap = rtnl_dereference(q->tap);
 
-	if (vlan) {
+	if (tap) {
 		if (q->enabled)
 			BUG_ON(tap_disable_queue(q));
 
-		vlan->numqueues--;
-		RCU_INIT_POINTER(q->vlan, NULL);
+		tap->numqueues--;
+		RCU_INIT_POINTER(q->tap, NULL);
 		sock_put(&q->sk);
 		list_del_init(&q->next);
 	}
@@ -270,17 +243,16 @@ static void tap_put_queue(struct tap_queue *q)
  * Cache vlan->numvtaps since it can become zero during the execution
  * of this function.
  */
-static struct tap_queue *tap_get_queue(struct net_device *dev,
+static struct tap_queue *tap_get_queue(struct tap_dev *tap,
 				       struct sk_buff *skb)
 {
-	struct macvlan_dev *vlan = netdev_priv(dev);
-	struct tap_queue *tap = NULL;
+	struct tap_queue *queue = NULL;
 	/* Access to taps array is protected by rcu, but access to numvtaps
 	 * isn't. Below we use it to lookup a queue, but treat it as a hint
 	 * and validate that the result isn't NULL - in case we are
 	 * racing against queue removal.
 	 */
-	int numvtaps = ACCESS_ONCE(vlan->numvtaps);
+	int numvtaps = ACCESS_ONCE(tap->numvtaps);
 	__u32 rxq;
 
 	if (!numvtaps)
@@ -292,7 +264,7 @@ static struct tap_queue *tap_get_queue(struct net_device *dev,
 	/* Check if we can use flow to select a queue */
 	rxq = skb_get_hash(skb);
 	if (rxq) {
-		tap = rcu_dereference(vlan->taps[rxq % numvtaps]);
+		queue = rcu_dereference(tap->taps[rxq % numvtaps]);
 		goto out;
 	}
 
@@ -302,14 +274,14 @@ static struct tap_queue *tap_get_queue(struct net_device *dev,
 		while (unlikely(rxq >= numvtaps))
 			rxq -= numvtaps;
 
-		tap = rcu_dereference(vlan->taps[rxq]);
+		queue = rcu_dereference(tap->taps[rxq]);
 		goto out;
 	}
 
 single:
-	tap = rcu_dereference(vlan->taps[0]);
+	queue = rcu_dereference(tap->taps[0]);
 out:
-	return tap;
+	return queue;
 }
 
 /*
@@ -317,39 +289,38 @@ out:
  * that it holds on all queues and safely set the pointer
  * from the queues to NULL.
  */
-void tap_del_queues(struct net_device *dev)
+void tap_del_queues(struct tap_dev *tap)
 {
-	struct macvlan_dev *vlan = netdev_priv(dev);
 	struct tap_queue *q, *tmp;
 
 	ASSERT_RTNL();
-	list_for_each_entry_safe(q, tmp, &vlan->queue_list, next) {
+	list_for_each_entry_safe(q, tmp, &tap->queue_list, next) {
 		list_del_init(&q->next);
-		RCU_INIT_POINTER(q->vlan, NULL);
+		RCU_INIT_POINTER(q->tap, NULL);
 		if (q->enabled)
-			vlan->numvtaps--;
-		vlan->numqueues--;
+			tap->numvtaps--;
+		tap->numqueues--;
 		sock_put(&q->sk);
 	}
-	BUG_ON(vlan->numvtaps);
-	BUG_ON(vlan->numqueues);
+	BUG_ON(tap->numvtaps);
+	BUG_ON(tap->numqueues);
 	/* guarantee that any future tap_set_queue will fail */
-	vlan->numvtaps = MAX_TAP_QUEUES;
+	tap->numvtaps = MAX_TAP_QUEUES;
 }
 
 rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
 {
 	struct sk_buff *skb = *pskb;
 	struct net_device *dev = skb->dev;
-	struct macvlan_dev *vlan;
+	struct tap_dev *tap;
 	struct tap_queue *q;
 	netdev_features_t features = TAP_FEATURES;
 
-	vlan = tap_get_vlan_rcu(dev);
-	if (!vlan)
+	tap = tap_dev_get_rcu(dev);
+	if (!tap)
 		return RX_HANDLER_PASS;
 
-	q = tap_get_queue(dev, skb);
+	q = tap_get_queue(tap, skb);
 	if (!q)
 		return RX_HANDLER_PASS;
 
@@ -363,7 +334,7 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
 	 * enabled.
 	 */
 	if (q->flags & IFF_VNET_HDR)
-		features |= vlan->tap_features;
+		features |= tap->tap_features;
 	if (netif_needs_gso(skb, features)) {
 		struct sk_buff *segs = __skb_gso_segment(skb, features, false);
 
@@ -408,50 +379,51 @@ wake_up:
 
 drop:
 	/* Count errors/drops only here, thus don't care about args. */
-	macvlan_count_rx(vlan, 0, 0, 0);
+	if (tap->count_rx_dropped)
+		tap->count_rx_dropped(tap);
 	kfree_skb(skb);
 	return RX_HANDLER_CONSUMED;
 }
 
-int tap_get_minor(struct macvlan_dev *vlan)
+int tap_get_minor(struct tap_dev *tap)
 {
 	int retval = -ENOMEM;
 
 	mutex_lock(&macvtap_major.minor_lock);
-	retval = idr_alloc(&macvtap_major.minor_idr, vlan, 1, TAP_NUM_DEVS, GFP_KERNEL);
+	retval = idr_alloc(&macvtap_major.minor_idr, tap, 1, TAP_NUM_DEVS, GFP_KERNEL);
 	if (retval >= 0) {
-		vlan->minor = retval;
+		tap->minor = retval;
 	} else if (retval == -ENOSPC) {
-		netdev_err(vlan->dev, "Too many tap devices\n");
+		netdev_err(tap->dev, "Too many tap devices\n");
 		retval = -EINVAL;
 	}
 	mutex_unlock(&macvtap_major.minor_lock);
 	return retval < 0 ? retval : 0;
 }
 
-void tap_free_minor(struct macvlan_dev *vlan)
+void tap_free_minor(struct tap_dev *tap)
 {
 	mutex_lock(&macvtap_major.minor_lock);
-	if (vlan->minor) {
-		idr_remove(&macvtap_major.minor_idr, vlan->minor);
-		vlan->minor = 0;
+	if (tap->minor) {
+		idr_remove(&macvtap_major.minor_idr, tap->minor);
+		tap->minor = 0;
 	}
 	mutex_unlock(&macvtap_major.minor_lock);
 }
 
-static struct net_device *dev_get_by_tap_minor(int minor)
+static struct tap_dev *dev_get_by_tap_minor(int minor)
 {
 	struct net_device *dev = NULL;
-	struct macvlan_dev *vlan;
+	struct tap_dev *tap;
 
 	mutex_lock(&macvtap_major.minor_lock);
-	vlan = idr_find(&macvtap_major.minor_idr, minor);
-	if (vlan) {
-		dev = vlan->dev;
+	tap = idr_find(&macvtap_major.minor_idr, minor);
+	if (tap) {
+		dev = tap->dev;
 		dev_hold(dev);
 	}
 	mutex_unlock(&macvtap_major.minor_lock);
-	return dev;
+	return tap;
 }
 
 static void tap_sock_write_space(struct sock *sk)
@@ -477,13 +449,13 @@ static void tap_sock_destruct(struct sock *sk)
 static int tap_open(struct inode *inode, struct file *file)
 {
 	struct net *net = current->nsproxy->net_ns;
-	struct net_device *dev;
+	struct tap_dev *tap;
 	struct tap_queue *q;
 	int err = -ENODEV;
 
 	rtnl_lock();
-	dev = dev_get_by_tap_minor(iminor(inode));
-	if (!dev)
+	tap = dev_get_by_tap_minor(iminor(inode));
+	if (!tap)
 		goto err;
 
 	err = -ENOMEM;
@@ -511,18 +483,18 @@ static int tap_open(struct inode *inode, struct file *file)
 	 * The macvlan supports zerocopy iff the lower device supports zero
 	 * copy so we don't have to look at the lower device directly.
 	 */
-	if ((dev->features & NETIF_F_HIGHDMA) && (dev->features & NETIF_F_SG))
+	if ((tap->dev->features & NETIF_F_HIGHDMA) && (tap->dev->features & NETIF_F_SG))
 		sock_set_flag(&q->sk, SOCK_ZEROCOPY);
 
 	err = -ENOMEM;
-	if (skb_array_init(&q->skb_array, dev->tx_queue_len, GFP_KERNEL))
+	if (skb_array_init(&q->skb_array, tap->dev->tx_queue_len, GFP_KERNEL))
 		goto err_array;
 
-	err = tap_set_queue(dev, file, q);
+	err = tap_set_queue(tap, file, q);
 	if (err)
 		goto err_queue;
 
-	dev_put(dev);
+	dev_put(tap->dev);
 
 	rtnl_unlock();
 	return err;
@@ -532,8 +504,8 @@ err_queue:
 err_array:
 	sock_put(&q->sk);
 err:
-	if (dev)
-		dev_put(dev);
+	if (tap)
+		dev_put(tap->dev);
 
 	rtnl_unlock();
 	return err;
@@ -601,7 +573,7 @@ static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
 {
 	int good_linear = SKB_MAX_HEAD(TAP_RESERVE);
 	struct sk_buff *skb;
-	struct macvlan_dev *vlan;
+	struct tap_dev *tap;
 	unsigned long total_len = iov_iter_count(from);
 	unsigned long len = total_len;
 	int err;
@@ -698,7 +670,7 @@ static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
 		skb_set_network_header(skb, depth);
 
 	rcu_read_lock();
-	vlan = rcu_dereference(q->vlan);
+	tap = rcu_dereference(q->tap);
 	/* copy skb_ubuf_info for callback when skb has no error */
 	if (zerocopy) {
 		skb_shinfo(skb)->destructor_arg = m->msg_control;
@@ -709,8 +681,8 @@ static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
 		uarg->callback(uarg, false);
 	}
 
-	if (vlan) {
-		skb->dev = vlan->dev;
+	if (tap) {
+		skb->dev = tap->dev;
 		dev_queue_xmit(skb);
 	} else {
 		kfree_skb(skb);
@@ -724,9 +696,9 @@ err_kfree:
 
 err:
 	rcu_read_lock();
-	vlan = rcu_dereference(q->vlan);
-	if (vlan)
-		this_cpu_inc(vlan->pcpu_stats->tx_dropped);
+	tap = rcu_dereference(q->tap);
+	if (tap && tap->count_tx_dropped)
+		tap->count_tx_dropped(tap);
 	rcu_read_unlock();
 
 	return err;
@@ -853,55 +825,55 @@ static ssize_t tap_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	return ret;
 }
 
-static struct macvlan_dev *tap_get_vlan(struct tap_queue *q)
+static struct tap_dev *tap_get_tap_dev(struct tap_queue *q)
 {
-	struct macvlan_dev *vlan;
+	struct tap_dev *tap;
 
 	ASSERT_RTNL();
-	vlan = rtnl_dereference(q->vlan);
-	if (vlan)
-		dev_hold(vlan->dev);
+	tap = rtnl_dereference(q->tap);
+	if (tap)
+		dev_hold(tap->dev);
 
-	return vlan;
+	return tap;
 }
 
-static void tap_put_vlan(struct macvlan_dev *vlan)
+static void tap_put_tap_dev(struct tap_dev *tap)
 {
-	dev_put(vlan->dev);
+	dev_put(tap->dev);
 }
 
 static int tap_ioctl_set_queue(struct file *file, unsigned int flags)
 {
 	struct tap_queue *q = file->private_data;
-	struct macvlan_dev *vlan;
+	struct tap_dev *tap;
 	int ret;
 
-	vlan = tap_get_vlan(q);
-	if (!vlan)
+	tap = tap_get_tap_dev(q);
+	if (!tap)
 		return -EINVAL;
 
 	if (flags & IFF_ATTACH_QUEUE)
-		ret = tap_enable_queue(vlan->dev, file, q);
+		ret = tap_enable_queue(tap, file, q);
 	else if (flags & IFF_DETACH_QUEUE)
 		ret = tap_disable_queue(q);
 	else
 		ret = -EINVAL;
 
-	tap_put_vlan(vlan);
+	tap_put_tap_dev(tap);
 	return ret;
 }
 
 static int set_offload(struct tap_queue *q, unsigned long arg)
 {
-	struct macvlan_dev *vlan;
+	struct tap_dev *tap;
 	netdev_features_t features;
 	netdev_features_t feature_mask = 0;
 
-	vlan = rtnl_dereference(q->vlan);
-	if (!vlan)
+	tap = rtnl_dereference(q->tap);
+	if (!tap)
 		return -ENOLINK;
 
-	features = vlan->dev->features;
+	features = tap->dev->features;
 
 	if (arg & TUN_F_CSUM) {
 		feature_mask = NETIF_F_HW_CSUM;
@@ -935,9 +907,9 @@ static int set_offload(struct tap_queue *q, unsigned long arg)
 	/* tap_features are the same as features on tun/tap and
 	 * reflect user expectations.
 	 */
-	vlan->tap_features = feature_mask;
-	vlan->set_features = features;
-	netdev_update_features(vlan->dev);
+	tap->tap_features = feature_mask;
+	if (tap->update_features)
+		tap->update_features(tap, features);
 
 	return 0;
 }
@@ -949,7 +921,7 @@ static long tap_ioctl(struct file *file, unsigned int cmd,
 		      unsigned long arg)
 {
 	struct tap_queue *q = file->private_data;
-	struct macvlan_dev *vlan;
+	struct tap_dev *tap;
 	void __user *argp = (void __user *)arg;
 	struct ifreq __user *ifr = argp;
 	unsigned int __user *up = argp;
@@ -975,18 +947,18 @@ static long tap_ioctl(struct file *file, unsigned int cmd,
 
 	case TUNGETIFF:
 		rtnl_lock();
-		vlan = tap_get_vlan(q);
-		if (!vlan) {
+		tap = tap_get_tap_dev(q);
+		if (!tap) {
 			rtnl_unlock();
 			return -ENOLINK;
 		}
 
 		ret = 0;
 		u = q->flags;
-		if (copy_to_user(&ifr->ifr_name, vlan->dev->name, IFNAMSIZ) ||
+		if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) ||
 		    put_user(u, &ifr->ifr_flags))
 			ret = -EFAULT;
-		tap_put_vlan(vlan);
+		tap_put_tap_dev(tap);
 		rtnl_unlock();
 		return ret;
 
@@ -1059,18 +1031,18 @@ static long tap_ioctl(struct file *file, unsigned int cmd,
 
 	case SIOCGIFHWADDR:
 		rtnl_lock();
-		vlan = tap_get_vlan(q);
-		if (!vlan) {
+		tap = tap_get_tap_dev(q);
+		if (!tap) {
 			rtnl_unlock();
 			return -ENOLINK;
 		}
 		ret = 0;
-		u = vlan->dev->type;
-		if (copy_to_user(&ifr->ifr_name, vlan->dev->name, IFNAMSIZ) ||
-		    copy_to_user(&ifr->ifr_hwaddr.sa_data, vlan->dev->dev_addr, ETH_ALEN) ||
+		u = tap->dev->type;
+		if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) ||
+		    copy_to_user(&ifr->ifr_hwaddr.sa_data, tap->dev->dev_addr, ETH_ALEN) ||
 		    put_user(u, &ifr->ifr_hwaddr.sa_family))
 			ret = -EFAULT;
-		tap_put_vlan(vlan);
+		tap_put_tap_dev(tap);
 		rtnl_unlock();
 		return ret;
 
@@ -1078,13 +1050,13 @@ static long tap_ioctl(struct file *file, unsigned int cmd,
 		if (copy_from_user(&sa, &ifr->ifr_hwaddr, sizeof(sa)))
 			return -EFAULT;
 		rtnl_lock();
-		vlan = tap_get_vlan(q);
-		if (!vlan) {
+		tap = tap_get_tap_dev(q);
+		if (!tap) {
 			rtnl_unlock();
 			return -ENOLINK;
 		}
-		ret = dev_set_mac_address(vlan->dev, &sa);
-		tap_put_vlan(vlan);
+		ret = dev_set_mac_address(tap->dev, &sa);
+		tap_put_tap_dev(tap);
 		rtnl_unlock();
 		return ret;
 
@@ -1167,19 +1139,19 @@ struct socket *tap_get_socket(struct file *file)
 }
 EXPORT_SYMBOL_GPL(tap_get_socket);
 
-int tap_queue_resize(struct macvlan_dev *vlan)
+int tap_queue_resize(struct tap_dev *tap)
 {
-	struct net_device *dev = vlan->dev;
+	struct net_device *dev = tap->dev;
 	struct tap_queue *q;
 	struct skb_array **arrays;
-	int n = vlan->numqueues;
+	int n = tap->numqueues;
 	int ret, i = 0;
 
 	arrays = kmalloc(sizeof *arrays * n, GFP_KERNEL);
 	if (!arrays)
 		return -ENOMEM;
 
-	list_for_each_entry(q, &vlan->queue_list, next)
+	list_for_each_entry(q, &tap->queue_list, next)
 		arrays[i++] = &q->skb_array;
 
 	ret = skb_array_resize_multiple(arrays, n,
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index a2dfd9063a6c..75031e5d0a65 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -14,11 +14,60 @@ static inline struct socket *tap_get_socket(struct file *f)
 }
 #endif /* CONFIG_MACVTAP */
 
+#include <net/sock.h>
+#include <linux/skb_array.h>
+
+#define MAX_TAP_QUEUES 256
+
+struct tap_queue;
+
+struct tap_dev {
+	struct net_device	*dev;
+	u16			flags;
+	/* This array tracks active taps. */
+	struct tap_queue    __rcu *taps[MAX_TAP_QUEUES];
+	/* This list tracks all taps (both enabled and disabled) */
+	struct list_head	queue_list;
+	int			numvtaps;
+	int			numqueues;
+	netdev_features_t	tap_features;
+	int			minor;
+
+	void (*update_features)(struct tap_dev *tap, netdev_features_t features);
+	void (*count_tx_dropped)(struct tap_dev *tap);
+	void (*count_rx_dropped)(struct tap_dev *tap);
+};
+
+/*
+ * A tap queue is the central object of tap module, it connects
+ * an open character device to virtual interface. There can be
+ * multiple queues on one interface, which map back to queues
+ * implemented in hardware on the underlying device.
+ *
+ * tap_proto is used to allocate queues through the sock allocation
+ * mechanism.
+ *
+ */
+
+struct tap_queue {
+	struct sock sk;
+	struct socket sock;
+	struct socket_wq wq;
+	int vnet_hdr_sz;
+	struct tap_dev __rcu *tap;
+	struct file *file;
+	unsigned int flags;
+	u16 queue_index;
+	bool enabled;
+	struct list_head next;
+	struct skb_array skb_array;
+};
+
 rx_handler_result_t tap_handle_frame(struct sk_buff **pskb);
-void tap_del_queues(struct net_device *dev);
-int tap_get_minor(struct macvlan_dev *vlan);
-void tap_free_minor(struct macvlan_dev *vlan);
-int tap_queue_resize(struct macvlan_dev *vlan);
+void tap_del_queues(struct tap_dev *tap);
+int tap_get_minor(struct tap_dev *tap);
+void tap_free_minor(struct tap_dev *tap);
+int tap_queue_resize(struct tap_dev *tap);
 int tap_create_cdev(struct cdev *tap_cdev,
 		    dev_t *tap_major, const char *device_name);
 void tap_destroy_cdev(dev_t major, struct cdev *tap_cdev);
-- 
cgit v1.2.3


From d9f1f61c0801a73ff36d416a7ede54229b231e1d Mon Sep 17 00:00:00 2001
From: Sainath Grandhi <sainath.grandhi@intel.com>
Date: Fri, 10 Feb 2017 16:03:50 -0800
Subject: tap: Extending tap device create/destroy APIs

Extending tap APIs get/free_minor and create/destroy_cdev to handle more than one
type of virtual interface.

Signed-off-by: Sainath Grandhi <sainath.grandhi@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macvtap_main.c |   6 +--
 drivers/net/tap.c          | 118 +++++++++++++++++++++++++++++++++++++--------
 include/linux/if_tap.h     |   4 +-
 3 files changed, 102 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/macvtap_main.c b/drivers/net/macvtap_main.c
index 0238df62bf45..a4bfc10b61dd 100644
--- a/drivers/net/macvtap_main.c
+++ b/drivers/net/macvtap_main.c
@@ -163,7 +163,7 @@ static int macvtap_device_event(struct notifier_block *unused,
 		 * been registered but before register_netdevice has
 		 * finished running.
 		 */
-		err = tap_get_minor(&vlantap->tap);
+		err = tap_get_minor(macvtap_major, &vlantap->tap);
 		if (err)
 			return notifier_from_errno(err);
 
@@ -171,7 +171,7 @@ static int macvtap_device_event(struct notifier_block *unused,
 		classdev = device_create(&macvtap_class, &dev->dev, devt,
 					 dev, tap_name);
 		if (IS_ERR(classdev)) {
-			tap_free_minor(&vlantap->tap);
+			tap_free_minor(macvtap_major, &vlantap->tap);
 			return notifier_from_errno(PTR_ERR(classdev));
 		}
 		err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj,
@@ -186,7 +186,7 @@ static int macvtap_device_event(struct notifier_block *unused,
 		sysfs_remove_link(&dev->dev.kobj, tap_name);
 		devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor);
 		device_destroy(&macvtap_class, devt);
-		tap_free_minor(&vlantap->tap);
+		tap_free_minor(macvtap_major, &vlantap->tap);
 		break;
 	case NETDEV_CHANGE_TX_QUEUE_LEN:
 		if (tap_queue_resize(&vlantap->tap))
diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 7d3e8b18f5e6..71bbf0b6327d 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -99,12 +99,17 @@ static struct proto tap_proto = {
 };
 
 #define TAP_NUM_DEVS (1U << MINORBITS)
+
+static LIST_HEAD(major_list);
+
 struct major_info {
+	struct rcu_head rcu;
 	dev_t major;
 	struct idr minor_idr;
 	struct mutex minor_lock;
 	const char *device_name;
-} macvtap_major;
+	struct list_head next;
+};
 
 #define GOODCOPY_LEN 128
 
@@ -385,44 +390,89 @@ drop:
 	return RX_HANDLER_CONSUMED;
 }
 
-int tap_get_minor(struct tap_dev *tap)
+static struct major_info *tap_get_major(int major)
+{
+	struct major_info *tap_major;
+
+	list_for_each_entry_rcu(tap_major, &major_list, next) {
+		if (tap_major->major == major)
+			return tap_major;
+	}
+
+	return NULL;
+}
+
+int tap_get_minor(dev_t major, struct tap_dev *tap)
 {
 	int retval = -ENOMEM;
+	struct major_info *tap_major;
+
+	rcu_read_lock();
+	tap_major = tap_get_major(MAJOR(major));
+	if (!tap_major) {
+		retval = -EINVAL;
+		goto unlock;
+	}
 
-	mutex_lock(&macvtap_major.minor_lock);
-	retval = idr_alloc(&macvtap_major.minor_idr, tap, 1, TAP_NUM_DEVS, GFP_KERNEL);
+	mutex_lock(&tap_major->minor_lock);
+	retval = idr_alloc(&tap_major->minor_idr, tap, 1, TAP_NUM_DEVS, GFP_KERNEL);
 	if (retval >= 0) {
 		tap->minor = retval;
 	} else if (retval == -ENOSPC) {
 		netdev_err(tap->dev, "Too many tap devices\n");
 		retval = -EINVAL;
 	}
-	mutex_unlock(&macvtap_major.minor_lock);
+	mutex_unlock(&tap_major->minor_lock);
+
+unlock:
+	rcu_read_unlock();
 	return retval < 0 ? retval : 0;
 }
 
-void tap_free_minor(struct tap_dev *tap)
+void tap_free_minor(dev_t major, struct tap_dev *tap)
 {
-	mutex_lock(&macvtap_major.minor_lock);
+	struct major_info *tap_major;
+
+	rcu_read_lock();
+	tap_major = tap_get_major(MAJOR(major));
+	if (!tap_major) {
+		goto unlock;
+	}
+
+	mutex_lock(&tap_major->minor_lock);
 	if (tap->minor) {
-		idr_remove(&macvtap_major.minor_idr, tap->minor);
+		idr_remove(&tap_major->minor_idr, tap->minor);
 		tap->minor = 0;
 	}
-	mutex_unlock(&macvtap_major.minor_lock);
+	mutex_unlock(&tap_major->minor_lock);
+
+unlock:
+	rcu_read_unlock();
 }
 
-static struct tap_dev *dev_get_by_tap_minor(int minor)
+static struct tap_dev *dev_get_by_tap_file(int major, int minor)
 {
 	struct net_device *dev = NULL;
 	struct tap_dev *tap;
+	struct major_info *tap_major;
 
-	mutex_lock(&macvtap_major.minor_lock);
-	tap = idr_find(&macvtap_major.minor_idr, minor);
+	rcu_read_lock();
+	tap_major = tap_get_major(major);
+	if (!tap_major) {
+		tap = NULL;
+		goto unlock;
+	}
+
+	mutex_lock(&tap_major->minor_lock);
+	tap = idr_find(&tap_major->minor_idr, minor);
 	if (tap) {
 		dev = tap->dev;
 		dev_hold(dev);
 	}
-	mutex_unlock(&macvtap_major.minor_lock);
+	mutex_unlock(&tap_major->minor_lock);
+
+unlock:
+	rcu_read_unlock();
 	return tap;
 }
 
@@ -454,7 +504,7 @@ static int tap_open(struct inode *inode, struct file *file)
 	int err = -ENODEV;
 
 	rtnl_lock();
-	tap = dev_get_by_tap_minor(iminor(inode));
+	tap = dev_get_by_tap_file(imajor(inode), iminor(inode));
 	if (!tap)
 		goto err;
 
@@ -1161,6 +1211,25 @@ int tap_queue_resize(struct tap_dev *tap)
 	return ret;
 }
 
+static int tap_list_add(dev_t major, const char *device_name)
+{
+	struct major_info *tap_major;
+
+	tap_major = kzalloc(sizeof(*tap_major), GFP_ATOMIC);
+	if (!tap_major)
+		return -ENOMEM;
+
+	tap_major->major = MAJOR(major);
+
+	idr_init(&tap_major->minor_idr);
+	mutex_init(&tap_major->minor_lock);
+
+	tap_major->device_name = device_name;
+
+	list_add_tail_rcu(&tap_major->next, &major_list);
+	return 0;
+}
+
 int tap_create_cdev(struct cdev *tap_cdev,
 		    dev_t *tap_major, const char *device_name)
 {
@@ -1175,15 +1244,14 @@ int tap_create_cdev(struct cdev *tap_cdev,
 	if (err)
 		goto out2;
 
-	macvtap_major.major = MAJOR(*tap_major);
-
-	idr_init(&macvtap_major.minor_idr);
-	mutex_init(&macvtap_major.minor_lock);
-
-	macvtap_major.device_name = device_name;
+	err =  tap_list_add(*tap_major, device_name);
+	if (err)
+		goto out3;
 
 	return 0;
 
+out3:
+	cdev_del(tap_cdev);
 out2:
 	unregister_chrdev_region(*tap_major, TAP_NUM_DEVS);
 out1:
@@ -1192,7 +1260,15 @@ out1:
 
 void tap_destroy_cdev(dev_t major, struct cdev *tap_cdev)
 {
+	struct major_info *tap_major, *tmp;
+
 	cdev_del(tap_cdev);
 	unregister_chrdev_region(major, TAP_NUM_DEVS);
-	idr_destroy(&macvtap_major.minor_idr);
+	list_for_each_entry_safe(tap_major, tmp, &major_list, next) {
+		if (tap_major->major == MAJOR(major)) {
+			idr_destroy(&tap_major->minor_idr);
+			list_del_rcu(&tap_major->next);
+			kfree_rcu(tap_major, rcu);
+		}
+	}
 }
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index 75031e5d0a65..362e71c16efb 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -65,8 +65,8 @@ struct tap_queue {
 
 rx_handler_result_t tap_handle_frame(struct sk_buff **pskb);
 void tap_del_queues(struct tap_dev *tap);
-int tap_get_minor(struct tap_dev *tap);
-void tap_free_minor(struct tap_dev *tap);
+int tap_get_minor(dev_t major, struct tap_dev *tap);
+void tap_free_minor(dev_t major, struct tap_dev *tap);
 int tap_queue_resize(struct tap_dev *tap);
 int tap_create_cdev(struct cdev *tap_cdev,
 		    dev_t *tap_major, const char *device_name);
-- 
cgit v1.2.3


From 9a393b5d5988ea4eaa3e0da138321abe0dc03a68 Mon Sep 17 00:00:00 2001
From: Sainath Grandhi <sainath.grandhi@intel.com>
Date: Fri, 10 Feb 2017 16:03:51 -0800
Subject: tap: tap as an independent module

This patch makes tap a separate module for other types of virtual interfaces, for example,
ipvlan to use.

Signed-off-by: Sainath Grandhi <sainath.grandhi@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/Kconfig        |   7 ++
 drivers/net/Makefile       |   3 +-
 drivers/net/macvtap.c      | 249 +++++++++++++++++++++++++++++++++++++++++++++
 drivers/net/macvtap_main.c | 249 ---------------------------------------------
 drivers/net/tap.c          |  11 ++
 drivers/vhost/Kconfig      |   2 +-
 include/linux/if_tap.h     |   4 +-
 7 files changed, 271 insertions(+), 254 deletions(-)
 create mode 100644 drivers/net/macvtap.c
 delete mode 100644 drivers/net/macvtap_main.c

(limited to 'include/linux')

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index a993cbeb9e0c..5763503fe4e6 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -135,6 +135,7 @@ config MACVTAP
 	tristate "MAC-VLAN based tap driver"
 	depends on MACVLAN
 	depends on INET
+	select TAP
 	help
 	  This adds a specialized tap character device driver that is based
 	  on the MAC-VLAN network interface, called macvtap. A macvtap device
@@ -287,6 +288,12 @@ config TUN
 
 	  If you don't know what to use this for, you don't need it.
 
+config TAP
+	tristate
+	---help---
+	  This option is selected by any driver implementing tap user space
+	  interface for a virtual interface to re-use core tap functionality.
+
 config TUN_VNET_CROSS_LE
 	bool "Support for cross-endian vnet headers on little-endian kernels"
 	default n
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 19b03a9fe0f6..7dd86ca02d0d 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -21,6 +21,7 @@ obj-$(CONFIG_PHYLIB) += phy/
 obj-$(CONFIG_RIONET) += rionet.o
 obj-$(CONFIG_NET_TEAM) += team/
 obj-$(CONFIG_TUN) += tun.o
+obj-$(CONFIG_TAP) += tap.o
 obj-$(CONFIG_VETH) += veth.o
 obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
 obj-$(CONFIG_VXLAN) += vxlan.o
@@ -29,8 +30,6 @@ obj-$(CONFIG_GTP) += gtp.o
 obj-$(CONFIG_NLMON) += nlmon.o
 obj-$(CONFIG_NET_VRF) += vrf.o
 
-macvtap-objs := macvtap_main.o tap.o
-
 #
 # Networking Drivers
 #
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
new file mode 100644
index 000000000000..a4bfc10b61dd
--- /dev/null
+++ b/drivers/net/macvtap.c
@@ -0,0 +1,249 @@
+#include <linux/etherdevice.h>
+#include <linux/if_macvlan.h>
+#include <linux/if_tap.h>
+#include <linux/if_vlan.h>
+#include <linux/interrupt.h>
+#include <linux/nsproxy.h>
+#include <linux/compat.h>
+#include <linux/if_tun.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/cache.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/cdev.h>
+#include <linux/idr.h>
+#include <linux/fs.h>
+#include <linux/uio.h>
+
+#include <net/net_namespace.h>
+#include <net/rtnetlink.h>
+#include <net/sock.h>
+#include <linux/virtio_net.h>
+#include <linux/skb_array.h>
+
+struct macvtap_dev {
+	struct macvlan_dev vlan;
+	struct tap_dev    tap;
+};
+
+/*
+ * Variables for dealing with macvtaps device numbers.
+ */
+static dev_t macvtap_major;
+
+static const void *macvtap_net_namespace(struct device *d)
+{
+	struct net_device *dev = to_net_dev(d->parent);
+	return dev_net(dev);
+}
+
+static struct class macvtap_class = {
+	.name = "macvtap",
+	.owner = THIS_MODULE,
+	.ns_type = &net_ns_type_operations,
+	.namespace = macvtap_net_namespace,
+};
+static struct cdev macvtap_cdev;
+
+#define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \
+		      NETIF_F_TSO6 | NETIF_F_UFO)
+
+static void macvtap_count_tx_dropped(struct tap_dev *tap)
+{
+	struct macvtap_dev *vlantap = container_of(tap, struct macvtap_dev, tap);
+	struct macvlan_dev *vlan = &vlantap->vlan;
+
+	this_cpu_inc(vlan->pcpu_stats->tx_dropped);
+}
+
+static void macvtap_count_rx_dropped(struct tap_dev *tap)
+{
+	struct macvtap_dev *vlantap = container_of(tap, struct macvtap_dev, tap);
+	struct macvlan_dev *vlan = &vlantap->vlan;
+
+	macvlan_count_rx(vlan, 0, 0, 0);
+}
+
+static void macvtap_update_features(struct tap_dev *tap,
+				    netdev_features_t features)
+{
+	struct macvtap_dev *vlantap = container_of(tap, struct macvtap_dev, tap);
+	struct macvlan_dev *vlan = &vlantap->vlan;
+
+	vlan->set_features = features;
+	netdev_update_features(vlan->dev);
+}
+
+static int macvtap_newlink(struct net *src_net,
+			   struct net_device *dev,
+			   struct nlattr *tb[],
+			   struct nlattr *data[])
+{
+	struct macvtap_dev *vlantap = netdev_priv(dev);
+	int err;
+
+	INIT_LIST_HEAD(&vlantap->tap.queue_list);
+
+	/* Since macvlan supports all offloads by default, make
+	 * tap support all offloads also.
+	 */
+	vlantap->tap.tap_features = TUN_OFFLOADS;
+
+	/* Register callbacks for rx/tx drops accounting and updating
+	 * net_device features
+	 */
+	vlantap->tap.count_tx_dropped = macvtap_count_tx_dropped;
+	vlantap->tap.count_rx_dropped = macvtap_count_rx_dropped;
+	vlantap->tap.update_features  = macvtap_update_features;
+
+	err = netdev_rx_handler_register(dev, tap_handle_frame, &vlantap->tap);
+	if (err)
+		return err;
+
+	/* Don't put anything that may fail after macvlan_common_newlink
+	 * because we can't undo what it does.
+	 */
+	err = macvlan_common_newlink(src_net, dev, tb, data);
+	if (err) {
+		netdev_rx_handler_unregister(dev);
+		return err;
+	}
+
+	vlantap->tap.dev = vlantap->vlan.dev;
+
+	return 0;
+}
+
+static void macvtap_dellink(struct net_device *dev,
+			    struct list_head *head)
+{
+	struct macvtap_dev *vlantap = netdev_priv(dev);
+
+	netdev_rx_handler_unregister(dev);
+	tap_del_queues(&vlantap->tap);
+	macvlan_dellink(dev, head);
+}
+
+static void macvtap_setup(struct net_device *dev)
+{
+	macvlan_common_setup(dev);
+	dev->tx_queue_len = TUN_READQ_SIZE;
+}
+
+static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
+	.kind		= "macvtap",
+	.setup		= macvtap_setup,
+	.newlink	= macvtap_newlink,
+	.dellink	= macvtap_dellink,
+	.priv_size      = sizeof(struct macvtap_dev),
+};
+
+static int macvtap_device_event(struct notifier_block *unused,
+				unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct macvtap_dev *vlantap;
+	struct device *classdev;
+	dev_t devt;
+	int err;
+	char tap_name[IFNAMSIZ];
+
+	if (dev->rtnl_link_ops != &macvtap_link_ops)
+		return NOTIFY_DONE;
+
+	snprintf(tap_name, IFNAMSIZ, "tap%d", dev->ifindex);
+	vlantap = netdev_priv(dev);
+
+	switch (event) {
+	case NETDEV_REGISTER:
+		/* Create the device node here after the network device has
+		 * been registered but before register_netdevice has
+		 * finished running.
+		 */
+		err = tap_get_minor(macvtap_major, &vlantap->tap);
+		if (err)
+			return notifier_from_errno(err);
+
+		devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor);
+		classdev = device_create(&macvtap_class, &dev->dev, devt,
+					 dev, tap_name);
+		if (IS_ERR(classdev)) {
+			tap_free_minor(macvtap_major, &vlantap->tap);
+			return notifier_from_errno(PTR_ERR(classdev));
+		}
+		err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj,
+					tap_name);
+		if (err)
+			return notifier_from_errno(err);
+		break;
+	case NETDEV_UNREGISTER:
+		/* vlan->minor == 0 if NETDEV_REGISTER above failed */
+		if (vlantap->tap.minor == 0)
+			break;
+		sysfs_remove_link(&dev->dev.kobj, tap_name);
+		devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor);
+		device_destroy(&macvtap_class, devt);
+		tap_free_minor(macvtap_major, &vlantap->tap);
+		break;
+	case NETDEV_CHANGE_TX_QUEUE_LEN:
+		if (tap_queue_resize(&vlantap->tap))
+			return NOTIFY_BAD;
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block macvtap_notifier_block __read_mostly = {
+	.notifier_call	= macvtap_device_event,
+};
+
+static int macvtap_init(void)
+{
+	int err;
+
+	err = tap_create_cdev(&macvtap_cdev, &macvtap_major, "macvtap");
+
+	if (err)
+		goto out1;
+
+	err = class_register(&macvtap_class);
+	if (err)
+		goto out2;
+
+	err = register_netdevice_notifier(&macvtap_notifier_block);
+	if (err)
+		goto out3;
+
+	err = macvlan_link_register(&macvtap_link_ops);
+	if (err)
+		goto out4;
+
+	return 0;
+
+out4:
+	unregister_netdevice_notifier(&macvtap_notifier_block);
+out3:
+	class_unregister(&macvtap_class);
+out2:
+	tap_destroy_cdev(macvtap_major, &macvtap_cdev);
+out1:
+	return err;
+}
+module_init(macvtap_init);
+
+static void macvtap_exit(void)
+{
+	rtnl_link_unregister(&macvtap_link_ops);
+	unregister_netdevice_notifier(&macvtap_notifier_block);
+	class_unregister(&macvtap_class);
+	tap_destroy_cdev(macvtap_major, &macvtap_cdev);
+}
+module_exit(macvtap_exit);
+
+MODULE_ALIAS_RTNL_LINK("macvtap");
+MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/macvtap_main.c b/drivers/net/macvtap_main.c
deleted file mode 100644
index a4bfc10b61dd..000000000000
--- a/drivers/net/macvtap_main.c
+++ /dev/null
@@ -1,249 +0,0 @@
-#include <linux/etherdevice.h>
-#include <linux/if_macvlan.h>
-#include <linux/if_tap.h>
-#include <linux/if_vlan.h>
-#include <linux/interrupt.h>
-#include <linux/nsproxy.h>
-#include <linux/compat.h>
-#include <linux/if_tun.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/cache.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/wait.h>
-#include <linux/cdev.h>
-#include <linux/idr.h>
-#include <linux/fs.h>
-#include <linux/uio.h>
-
-#include <net/net_namespace.h>
-#include <net/rtnetlink.h>
-#include <net/sock.h>
-#include <linux/virtio_net.h>
-#include <linux/skb_array.h>
-
-struct macvtap_dev {
-	struct macvlan_dev vlan;
-	struct tap_dev    tap;
-};
-
-/*
- * Variables for dealing with macvtaps device numbers.
- */
-static dev_t macvtap_major;
-
-static const void *macvtap_net_namespace(struct device *d)
-{
-	struct net_device *dev = to_net_dev(d->parent);
-	return dev_net(dev);
-}
-
-static struct class macvtap_class = {
-	.name = "macvtap",
-	.owner = THIS_MODULE,
-	.ns_type = &net_ns_type_operations,
-	.namespace = macvtap_net_namespace,
-};
-static struct cdev macvtap_cdev;
-
-#define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \
-		      NETIF_F_TSO6 | NETIF_F_UFO)
-
-static void macvtap_count_tx_dropped(struct tap_dev *tap)
-{
-	struct macvtap_dev *vlantap = container_of(tap, struct macvtap_dev, tap);
-	struct macvlan_dev *vlan = &vlantap->vlan;
-
-	this_cpu_inc(vlan->pcpu_stats->tx_dropped);
-}
-
-static void macvtap_count_rx_dropped(struct tap_dev *tap)
-{
-	struct macvtap_dev *vlantap = container_of(tap, struct macvtap_dev, tap);
-	struct macvlan_dev *vlan = &vlantap->vlan;
-
-	macvlan_count_rx(vlan, 0, 0, 0);
-}
-
-static void macvtap_update_features(struct tap_dev *tap,
-				    netdev_features_t features)
-{
-	struct macvtap_dev *vlantap = container_of(tap, struct macvtap_dev, tap);
-	struct macvlan_dev *vlan = &vlantap->vlan;
-
-	vlan->set_features = features;
-	netdev_update_features(vlan->dev);
-}
-
-static int macvtap_newlink(struct net *src_net,
-			   struct net_device *dev,
-			   struct nlattr *tb[],
-			   struct nlattr *data[])
-{
-	struct macvtap_dev *vlantap = netdev_priv(dev);
-	int err;
-
-	INIT_LIST_HEAD(&vlantap->tap.queue_list);
-
-	/* Since macvlan supports all offloads by default, make
-	 * tap support all offloads also.
-	 */
-	vlantap->tap.tap_features = TUN_OFFLOADS;
-
-	/* Register callbacks for rx/tx drops accounting and updating
-	 * net_device features
-	 */
-	vlantap->tap.count_tx_dropped = macvtap_count_tx_dropped;
-	vlantap->tap.count_rx_dropped = macvtap_count_rx_dropped;
-	vlantap->tap.update_features  = macvtap_update_features;
-
-	err = netdev_rx_handler_register(dev, tap_handle_frame, &vlantap->tap);
-	if (err)
-		return err;
-
-	/* Don't put anything that may fail after macvlan_common_newlink
-	 * because we can't undo what it does.
-	 */
-	err = macvlan_common_newlink(src_net, dev, tb, data);
-	if (err) {
-		netdev_rx_handler_unregister(dev);
-		return err;
-	}
-
-	vlantap->tap.dev = vlantap->vlan.dev;
-
-	return 0;
-}
-
-static void macvtap_dellink(struct net_device *dev,
-			    struct list_head *head)
-{
-	struct macvtap_dev *vlantap = netdev_priv(dev);
-
-	netdev_rx_handler_unregister(dev);
-	tap_del_queues(&vlantap->tap);
-	macvlan_dellink(dev, head);
-}
-
-static void macvtap_setup(struct net_device *dev)
-{
-	macvlan_common_setup(dev);
-	dev->tx_queue_len = TUN_READQ_SIZE;
-}
-
-static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
-	.kind		= "macvtap",
-	.setup		= macvtap_setup,
-	.newlink	= macvtap_newlink,
-	.dellink	= macvtap_dellink,
-	.priv_size      = sizeof(struct macvtap_dev),
-};
-
-static int macvtap_device_event(struct notifier_block *unused,
-				unsigned long event, void *ptr)
-{
-	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-	struct macvtap_dev *vlantap;
-	struct device *classdev;
-	dev_t devt;
-	int err;
-	char tap_name[IFNAMSIZ];
-
-	if (dev->rtnl_link_ops != &macvtap_link_ops)
-		return NOTIFY_DONE;
-
-	snprintf(tap_name, IFNAMSIZ, "tap%d", dev->ifindex);
-	vlantap = netdev_priv(dev);
-
-	switch (event) {
-	case NETDEV_REGISTER:
-		/* Create the device node here after the network device has
-		 * been registered but before register_netdevice has
-		 * finished running.
-		 */
-		err = tap_get_minor(macvtap_major, &vlantap->tap);
-		if (err)
-			return notifier_from_errno(err);
-
-		devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor);
-		classdev = device_create(&macvtap_class, &dev->dev, devt,
-					 dev, tap_name);
-		if (IS_ERR(classdev)) {
-			tap_free_minor(macvtap_major, &vlantap->tap);
-			return notifier_from_errno(PTR_ERR(classdev));
-		}
-		err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj,
-					tap_name);
-		if (err)
-			return notifier_from_errno(err);
-		break;
-	case NETDEV_UNREGISTER:
-		/* vlan->minor == 0 if NETDEV_REGISTER above failed */
-		if (vlantap->tap.minor == 0)
-			break;
-		sysfs_remove_link(&dev->dev.kobj, tap_name);
-		devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor);
-		device_destroy(&macvtap_class, devt);
-		tap_free_minor(macvtap_major, &vlantap->tap);
-		break;
-	case NETDEV_CHANGE_TX_QUEUE_LEN:
-		if (tap_queue_resize(&vlantap->tap))
-			return NOTIFY_BAD;
-		break;
-	}
-
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block macvtap_notifier_block __read_mostly = {
-	.notifier_call	= macvtap_device_event,
-};
-
-static int macvtap_init(void)
-{
-	int err;
-
-	err = tap_create_cdev(&macvtap_cdev, &macvtap_major, "macvtap");
-
-	if (err)
-		goto out1;
-
-	err = class_register(&macvtap_class);
-	if (err)
-		goto out2;
-
-	err = register_netdevice_notifier(&macvtap_notifier_block);
-	if (err)
-		goto out3;
-
-	err = macvlan_link_register(&macvtap_link_ops);
-	if (err)
-		goto out4;
-
-	return 0;
-
-out4:
-	unregister_netdevice_notifier(&macvtap_notifier_block);
-out3:
-	class_unregister(&macvtap_class);
-out2:
-	tap_destroy_cdev(macvtap_major, &macvtap_cdev);
-out1:
-	return err;
-}
-module_init(macvtap_init);
-
-static void macvtap_exit(void)
-{
-	rtnl_link_unregister(&macvtap_link_ops);
-	unregister_netdevice_notifier(&macvtap_notifier_block);
-	class_unregister(&macvtap_class);
-	tap_destroy_cdev(macvtap_major, &macvtap_cdev);
-}
-module_exit(macvtap_exit);
-
-MODULE_ALIAS_RTNL_LINK("macvtap");
-MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
-MODULE_LICENSE("GPL");
diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 71bbf0b6327d..35b55a2fa1a1 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -312,6 +312,7 @@ void tap_del_queues(struct tap_dev *tap)
 	/* guarantee that any future tap_set_queue will fail */
 	tap->numvtaps = MAX_TAP_QUEUES;
 }
+EXPORT_SYMBOL_GPL(tap_del_queues);
 
 rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
 {
@@ -389,6 +390,7 @@ drop:
 	kfree_skb(skb);
 	return RX_HANDLER_CONSUMED;
 }
+EXPORT_SYMBOL_GPL(tap_handle_frame);
 
 static struct major_info *tap_get_major(int major)
 {
@@ -428,6 +430,7 @@ unlock:
 	rcu_read_unlock();
 	return retval < 0 ? retval : 0;
 }
+EXPORT_SYMBOL_GPL(tap_get_minor);
 
 void tap_free_minor(dev_t major, struct tap_dev *tap)
 {
@@ -449,6 +452,7 @@ void tap_free_minor(dev_t major, struct tap_dev *tap)
 unlock:
 	rcu_read_unlock();
 }
+EXPORT_SYMBOL_GPL(tap_free_minor);
 
 static struct tap_dev *dev_get_by_tap_file(int major, int minor)
 {
@@ -1210,6 +1214,7 @@ int tap_queue_resize(struct tap_dev *tap)
 	kfree(arrays);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(tap_queue_resize);
 
 static int tap_list_add(dev_t major, const char *device_name)
 {
@@ -1257,6 +1262,7 @@ out2:
 out1:
 	return err;
 }
+EXPORT_SYMBOL_GPL(tap_create_cdev);
 
 void tap_destroy_cdev(dev_t major, struct cdev *tap_cdev)
 {
@@ -1272,3 +1278,8 @@ void tap_destroy_cdev(dev_t major, struct cdev *tap_cdev)
 		}
 	}
 }
+EXPORT_SYMBOL_GPL(tap_destroy_cdev);
+
+MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
+MODULE_AUTHOR("Sainath Grandhi <sainath.grandhi@intel.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 40764ecad9ce..cfdecea5078f 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -1,6 +1,6 @@
 config VHOST_NET
 	tristate "Host kernel accelerator for virtio net"
-	depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP)
+	depends on NET && EVENTFD && (TUN || !TUN) && (TAP || !TAP)
 	select VHOST
 	---help---
 	  This kernel module can be loaded in host kernel to accelerate
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index 362e71c16efb..3482c3c2037d 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -1,7 +1,7 @@
 #ifndef _LINUX_IF_TAP_H_
 #define _LINUX_IF_TAP_H_
 
-#if IS_ENABLED(CONFIG_MACVTAP)
+#if IS_ENABLED(CONFIG_TAP)
 struct socket *tap_get_socket(struct file *);
 #else
 #include <linux/err.h>
@@ -12,7 +12,7 @@ static inline struct socket *tap_get_socket(struct file *f)
 {
 	return ERR_PTR(-EINVAL);
 }
-#endif /* CONFIG_MACVTAP */
+#endif /* CONFIG_TAP */
 
 #include <net/sock.h>
 #include <linux/skb_array.h>
-- 
cgit v1.2.3


From 8c4d4e8b5626fec965fd5034e5bd5e57790f243f Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 10 Feb 2017 12:08:17 +0100
Subject: netfilter: nfnetlink: allow to check for generation ID

This patch allows userspace to specify the generation ID that has been
used to build an incremental batch update.

If userspace specifies the generation ID in the batch message as
attribute, then nfnetlink compares it to the current generation ID so
you make sure that you work against the right baseline. Otherwise, bail
out with ERESTART so userspace knows that its changeset is stale and
needs to respin. Userspace can do this transparently at the cost of
taking slightly more time to refresh caches and rework the changeset.

This check is optional, if there is no NFNL_BATCH_GENID attribute in the
batch begin message, then no check is performed.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nfnetlink.h      |  1 +
 include/uapi/linux/netfilter/nfnetlink.h | 12 ++++++++++++
 net/netfilter/nfnetlink.c                | 31 +++++++++++++++++++++++++++----
 3 files changed, 40 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index 1d82dd5e9a08..1b49209dd5c7 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -28,6 +28,7 @@ struct nfnetlink_subsystem {
 	const struct nfnl_callback *cb;	/* callback for individual types */
 	int (*commit)(struct net *net, struct sk_buff *skb);
 	int (*abort)(struct net *net, struct sk_buff *skb);
+	bool (*valid_genid)(struct net *net, u32 genid);
 };
 
 int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n);
diff --git a/include/uapi/linux/netfilter/nfnetlink.h b/include/uapi/linux/netfilter/nfnetlink.h
index 4bb8cb7730e7..a09906a30d77 100644
--- a/include/uapi/linux/netfilter/nfnetlink.h
+++ b/include/uapi/linux/netfilter/nfnetlink.h
@@ -65,4 +65,16 @@ struct nfgenmsg {
 #define NFNL_MSG_BATCH_BEGIN		NLMSG_MIN_TYPE
 #define NFNL_MSG_BATCH_END		NLMSG_MIN_TYPE+1
 
+/**
+ * enum nfnl_batch_attributes - nfnetlink batch netlink attributes
+ *
+ * @NFNL_BATCH_GENID: generation ID for this changeset (NLA_U32)
+ */
+enum nfnl_batch_attributes {
+        NFNL_BATCH_UNSPEC,
+        NFNL_BATCH_GENID,
+        __NFNL_BATCH_MAX
+};
+#define NFNL_BATCH_MAX			(__NFNL_BATCH_MAX - 1)
+
 #endif /* _UAPI_NFNETLINK_H */
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index ca645a3b1375..a2148d0bc50e 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -3,7 +3,7 @@
  *
  * (C) 2001 by Jay Schulist <jschlst@samba.org>,
  * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org>
- * (C) 2005,2007 by Pablo Neira Ayuso <pablo@netfilter.org>
+ * (C) 2005-2017 by Pablo Neira Ayuso <pablo@netfilter.org>
  *
  * Initial netfilter messages via netlink development funded and
  * generally made possible by Network Robots, Inc. (www.networkrobots.com)
@@ -273,7 +273,7 @@ enum {
 };
 
 static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
-				u16 subsys_id)
+				u16 subsys_id, u32 genid)
 {
 	struct sk_buff *oskb = skb;
 	struct net *net = sock_net(skb->sk);
@@ -315,6 +315,12 @@ replay:
 		return kfree_skb(skb);
 	}
 
+	if (genid && ss->valid_genid && !ss->valid_genid(net, genid)) {
+		nfnl_unlock(subsys_id);
+		netlink_ack(oskb, nlh, -ERESTART);
+		return kfree_skb(skb);
+	}
+
 	while (skb->len >= nlmsg_total_size(0)) {
 		int msglen, type;
 
@@ -436,11 +442,20 @@ done:
 	kfree_skb(skb);
 }
 
+static const struct nla_policy nfnl_batch_policy[NFNL_BATCH_MAX + 1] = {
+	[NFNL_BATCH_GENID]	= { .type = NLA_U32 },
+};
+
 static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
+	int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
+	struct nlattr *attr = (void *)nlh + min_len;
+	struct nlattr *cda[NFNL_BATCH_MAX + 1];
+	int attrlen = nlh->nlmsg_len - min_len;
 	struct nfgenmsg *nfgenmsg;
+	int msglen, err;
+	u32 gen_id = 0;
 	u16 res_id;
-	int msglen;
 
 	msglen = NLMSG_ALIGN(nlh->nlmsg_len);
 	if (msglen > skb->len)
@@ -450,6 +465,14 @@ static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh)
 	    skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg))
 		return;
 
+	err = nla_parse(cda, NFNL_BATCH_MAX, attr, attrlen, nfnl_batch_policy);
+	if (err < 0) {
+		netlink_ack(skb, nlh, err);
+		return;
+	}
+	if (cda[NFNL_BATCH_GENID])
+		gen_id = ntohl(nla_get_be32(cda[NFNL_BATCH_GENID]));
+
 	nfgenmsg = nlmsg_data(nlh);
 	skb_pull(skb, msglen);
 	/* Work around old nft using host byte order */
@@ -458,7 +481,7 @@ static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh)
 	else
 		res_id = ntohs(nfgenmsg->res_id);
 
-	nfnetlink_rcv_batch(skb, nlh, res_id);
+	nfnetlink_rcv_batch(skb, nlh, res_id, gen_id);
 }
 
 static void nfnetlink_rcv(struct sk_buff *skb)
-- 
cgit v1.2.3


From d0d89493bff8b7c7bf580197a44a1d49c50395b3 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Fri, 10 Feb 2017 15:18:07 -0800
Subject: Input: tsc2004/5 - switch to using generic device properties

Instead of supporting legacy platform data (of which we have no mainline
users) and OF-based properties, let's switch to generic device properties.
This will still allow legacy boards to use the driver (by defining property
sets and attaching them to the drivers) and will simplify probe and make
driver usable on ACPI-based systems as well.

Reviewed-By: Sebastian Reichel <sre@kernel.org>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/touchscreen/tsc200x-core.c | 93 +++++++++++---------------------
 include/linux/spi/tsc2005.h              | 34 ------------
 2 files changed, 30 insertions(+), 97 deletions(-)
 delete mode 100644 include/linux/spi/tsc2005.h

(limited to 'include/linux')

diff --git a/drivers/input/touchscreen/tsc200x-core.c b/drivers/input/touchscreen/tsc200x-core.c
index 1c14a38e3748..88ea5e1b72ae 100644
--- a/drivers/input/touchscreen/tsc200x-core.c
+++ b/drivers/input/touchscreen/tsc200x-core.c
@@ -27,7 +27,6 @@
 #include <linux/delay.h>
 #include <linux/pm.h>
 #include <linux/of.h>
-#include <linux/spi/tsc2005.h>
 #include <linux/regulator/consumer.h>
 #include <linux/regmap.h>
 #include <linux/gpio/consumer.h>
@@ -114,7 +113,6 @@ struct tsc200x {
 	struct regulator	*vio;
 
 	struct gpio_desc	*reset_gpio;
-	void			(*set_reset)(bool enable);
 	int			(*tsc200x_cmd)(struct device *dev, u8 cmd);
 	int			irq;
 };
@@ -227,12 +225,13 @@ static void tsc200x_stop_scan(struct tsc200x *ts)
 	ts->tsc200x_cmd(ts->dev, TSC200X_CMD_STOP);
 }
 
-static void tsc200x_set_reset(struct tsc200x *ts, bool enable)
+static void tsc200x_reset(struct tsc200x *ts)
 {
-	if (ts->reset_gpio)
-		gpiod_set_value_cansleep(ts->reset_gpio, enable);
-	else if (ts->set_reset)
-		ts->set_reset(enable);
+	if (ts->reset_gpio) {
+		gpiod_set_value_cansleep(ts->reset_gpio, 1);
+		usleep_range(100, 500); /* only 10us required */
+		gpiod_set_value_cansleep(ts->reset_gpio, 0);
+	}
 }
 
 /* must be called with ts->mutex held */
@@ -253,7 +252,7 @@ static void __tsc200x_enable(struct tsc200x *ts)
 {
 	tsc200x_start_scan(ts);
 
-	if (ts->esd_timeout && (ts->set_reset || ts->reset_gpio)) {
+	if (ts->esd_timeout && ts->reset_gpio) {
 		ts->last_valid_interrupt = jiffies;
 		schedule_delayed_work(&ts->esd_work,
 				round_jiffies_relative(
@@ -310,9 +309,7 @@ static ssize_t tsc200x_selftest_show(struct device *dev,
 	}
 
 	/* hardware reset */
-	tsc200x_set_reset(ts, false);
-	usleep_range(100, 500); /* only 10us required */
-	tsc200x_set_reset(ts, true);
+	tsc200x_reset(ts);
 
 	if (!success)
 		goto out;
@@ -354,7 +351,7 @@ static umode_t tsc200x_attr_is_visible(struct kobject *kobj,
 	umode_t mode = attr->mode;
 
 	if (attr == &dev_attr_selftest.attr) {
-		if (!ts->set_reset && !ts->reset_gpio)
+		if (!ts->reset_gpio)
 			mode = 0;
 	}
 
@@ -404,9 +401,7 @@ static void tsc200x_esd_work(struct work_struct *work)
 
 	tsc200x_update_pen_state(ts, 0, 0, 0);
 
-	tsc200x_set_reset(ts, false);
-	usleep_range(100, 500); /* only 10us required */
-	tsc200x_set_reset(ts, true);
+	tsc200x_reset(ts);
 
 	enable_irq(ts->irq);
 	tsc200x_start_scan(ts);
@@ -454,26 +449,12 @@ int tsc200x_probe(struct device *dev, int irq, const struct input_id *tsc_id,
 		  struct regmap *regmap,
 		  int (*tsc200x_cmd)(struct device *dev, u8 cmd))
 {
-	const struct tsc2005_platform_data *pdata = dev_get_platdata(dev);
-	struct device_node *np = dev->of_node;
-
 	struct tsc200x *ts;
 	struct input_dev *input_dev;
-	unsigned int max_x = MAX_12BIT;
-	unsigned int max_y = MAX_12BIT;
-	unsigned int max_p = MAX_12BIT;
-	unsigned int fudge_x = TSC200X_DEF_X_FUZZ;
-	unsigned int fudge_y = TSC200X_DEF_Y_FUZZ;
-	unsigned int fudge_p = TSC200X_DEF_P_FUZZ;
-	unsigned int x_plate_ohm = TSC200X_DEF_RESISTOR;
-	unsigned int esd_timeout;
+	u32 x_plate_ohm;
+	u32 esd_timeout;
 	int error;
 
-	if (!np && !pdata) {
-		dev_err(dev, "no platform data\n");
-		return -ENODEV;
-	}
-
 	if (irq <= 0) {
 		dev_err(dev, "no irq\n");
 		return -ENODEV;
@@ -487,23 +468,6 @@ int tsc200x_probe(struct device *dev, int irq, const struct input_id *tsc_id,
 		return -ENODEV;
 	}
 
-	if (pdata) {
-		fudge_x	= pdata->ts_x_fudge;
-		fudge_y	= pdata->ts_y_fudge;
-		fudge_p	= pdata->ts_pressure_fudge;
-		max_x	= pdata->ts_x_max;
-		max_y	= pdata->ts_y_max;
-		max_p	= pdata->ts_pressure_max;
-		x_plate_ohm = pdata->ts_x_plate_ohm;
-		esd_timeout = pdata->esd_timeout_ms;
-	} else {
-		x_plate_ohm = TSC200X_DEF_RESISTOR;
-		of_property_read_u32(np, "ti,x-plate-ohms", &x_plate_ohm);
-		esd_timeout = 0;
-		of_property_read_u32(np, "ti,esd-recovery-timeout-ms",
-								&esd_timeout);
-	}
-
 	ts = devm_kzalloc(dev, sizeof(*ts), GFP_KERNEL);
 	if (!ts)
 		return -ENOMEM;
@@ -517,8 +481,13 @@ int tsc200x_probe(struct device *dev, int irq, const struct input_id *tsc_id,
 	ts->idev = input_dev;
 	ts->regmap = regmap;
 	ts->tsc200x_cmd = tsc200x_cmd;
-	ts->x_plate_ohm = x_plate_ohm;
-	ts->esd_timeout = esd_timeout;
+
+	error = device_property_read_u32(dev, "ti,x-plate-ohms", &x_plate_ohm);
+	ts->x_plate_ohm = error ? TSC200X_DEF_RESISTOR : x_plate_ohm;
+
+	error = device_property_read_u32(dev, "ti,esd-recovery-timeout-ms",
+					 &esd_timeout);
+	ts->esd_timeout = error ? 0 : esd_timeout;
 
 	ts->reset_gpio = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_HIGH);
 	if (IS_ERR(ts->reset_gpio)) {
@@ -534,9 +503,6 @@ int tsc200x_probe(struct device *dev, int irq, const struct input_id *tsc_id,
 		return error;
 	}
 
-	if (!ts->reset_gpio && pdata)
-		ts->set_reset = pdata->set_reset;
-
 	mutex_init(&ts->mutex);
 
 	spin_lock_init(&ts->lock);
@@ -559,22 +525,23 @@ int tsc200x_probe(struct device *dev, int irq, const struct input_id *tsc_id,
 
 	input_dev->phys = ts->phys;
 	input_dev->id = *tsc_id;
-	input_dev->dev.parent = dev;
-	input_dev->evbit[0] = BIT(EV_ABS) | BIT(EV_KEY);
-	input_dev->keybit[BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH);
-
-	input_set_abs_params(input_dev, ABS_X, 0, max_x, fudge_x, 0);
-	input_set_abs_params(input_dev, ABS_Y, 0, max_y, fudge_y, 0);
-	input_set_abs_params(input_dev, ABS_PRESSURE, 0, max_p, fudge_p, 0);
-
-	if (np)
-		touchscreen_parse_properties(input_dev, false, NULL);
 
 	input_dev->open = tsc200x_open;
 	input_dev->close = tsc200x_close;
 
 	input_set_drvdata(input_dev, ts);
 
+	input_set_capability(input_dev, EV_KEY, BTN_TOUCH);
+
+	input_set_abs_params(input_dev, ABS_X,
+			     0, MAX_12BIT, TSC200X_DEF_X_FUZZ, 0);
+	input_set_abs_params(input_dev, ABS_Y,
+			     0, MAX_12BIT, TSC200X_DEF_Y_FUZZ, 0);
+	input_set_abs_params(input_dev, ABS_PRESSURE,
+			     0, MAX_12BIT, TSC200X_DEF_P_FUZZ, 0);
+
+	touchscreen_parse_properties(input_dev, false, NULL);
+
 	/* Ensure the touchscreen is off */
 	tsc200x_stop_scan(ts);
 
diff --git a/include/linux/spi/tsc2005.h b/include/linux/spi/tsc2005.h
deleted file mode 100644
index 563b3b1799a8..000000000000
--- a/include/linux/spi/tsc2005.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * This file is part of TSC2005 touchscreen driver
- *
- * Copyright (C) 2009-2010 Nokia Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- */
-
-#ifndef _LINUX_SPI_TSC2005_H
-#define _LINUX_SPI_TSC2005_H
-
-#include <linux/types.h>
-
-struct tsc2005_platform_data {
-	int		ts_pressure_max;
-	int		ts_pressure_fudge;
-	int		ts_x_max;
-	int		ts_x_fudge;
-	int		ts_y_max;
-	int		ts_y_fudge;
-	int		ts_x_plate_ohm;
-	unsigned int	esd_timeout_ms;
-	void		(*set_reset)(bool enable);
-};
-
-#endif
-- 
cgit v1.2.3


From 7f677633379b4abb3281cdbe7e7006f049305c03 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Fri, 10 Feb 2017 20:28:24 -0800
Subject: bpf: introduce BPF_F_ALLOW_OVERRIDE flag

If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command
to the given cgroup the descendent cgroup will be able to override
effective bpf program that was inherited from this cgroup.
By default it's not passed, therefore override is disallowed.

Examples:
1.
prog X attached to /A with default
prog Y fails to attach to /A/B and /A/B/C
Everything under /A runs prog X

2.
prog X attached to /A with allow_override.
prog Y fails to attach to /A/B with default (non-override)
prog M attached to /A/B with allow_override.
Everything under /A/B runs prog M only.

3.
prog X attached to /A with allow_override.
prog Y fails to attach to /A with default.
The user has to detach first to switch the mode.

In the future this behavior may be extended with a chain of
non-overridable programs.

Also fix the bug where detach from cgroup where nothing is attached
was not throwing error. Return ENOENT in such case.

Add several testcases and adjust libbpf.

Fixes: 3007098494be ("cgroup: add support for eBPF programs")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf-cgroup.h       | 13 ++++----
 include/uapi/linux/bpf.h         |  7 +++++
 kernel/bpf/cgroup.c              | 59 +++++++++++++++++++++++++++-------
 kernel/bpf/syscall.c             | 20 ++++++++----
 kernel/cgroup.c                  |  9 +++---
 samples/bpf/test_cgrp2_attach.c  |  2 +-
 samples/bpf/test_cgrp2_attach2.c | 68 +++++++++++++++++++++++++++++++++++++---
 samples/bpf/test_cgrp2_sock.c    |  2 +-
 samples/bpf/test_cgrp2_sock2.c   |  2 +-
 tools/lib/bpf/bpf.c              |  4 ++-
 tools/lib/bpf/bpf.h              |  3 +-
 11 files changed, 151 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 92bc89ae7e20..c970a25d2a49 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -21,20 +21,19 @@ struct cgroup_bpf {
 	 */
 	struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE];
 	struct bpf_prog __rcu *effective[MAX_BPF_ATTACH_TYPE];
+	bool disallow_override[MAX_BPF_ATTACH_TYPE];
 };
 
 void cgroup_bpf_put(struct cgroup *cgrp);
 void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent);
 
-void __cgroup_bpf_update(struct cgroup *cgrp,
-			 struct cgroup *parent,
-			 struct bpf_prog *prog,
-			 enum bpf_attach_type type);
+int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
+			struct bpf_prog *prog, enum bpf_attach_type type,
+			bool overridable);
 
 /* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */
-void cgroup_bpf_update(struct cgroup *cgrp,
-		       struct bpf_prog *prog,
-		       enum bpf_attach_type type);
+int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
+		      enum bpf_attach_type type, bool overridable);
 
 int __cgroup_bpf_run_filter_skb(struct sock *sk,
 				struct sk_buff *skb,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0eb0e87dbe9f..d2b0ac799d03 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -116,6 +116,12 @@ enum bpf_attach_type {
 
 #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
 
+/* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command
+ * to the given target_fd cgroup the descendent cgroup will be able to
+ * override effective bpf program that was inherited from this cgroup
+ */
+#define BPF_F_ALLOW_OVERRIDE	(1U << 0)
+
 #define BPF_PSEUDO_MAP_FD	1
 
 /* flags for BPF_MAP_UPDATE_ELEM command */
@@ -171,6 +177,7 @@ union bpf_attr {
 		__u32		target_fd;	/* container object to attach to */
 		__u32		attach_bpf_fd;	/* eBPF program to attach */
 		__u32		attach_type;
+		__u32		attach_flags;
 	};
 } __attribute__((aligned(8)));
 
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index a515f7b007c6..da0f53690295 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -52,6 +52,7 @@ void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
 		e = rcu_dereference_protected(parent->bpf.effective[type],
 					      lockdep_is_held(&cgroup_mutex));
 		rcu_assign_pointer(cgrp->bpf.effective[type], e);
+		cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type];
 	}
 }
 
@@ -82,30 +83,63 @@ void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
  *
  * Must be called with cgroup_mutex held.
  */
-void __cgroup_bpf_update(struct cgroup *cgrp,
-			 struct cgroup *parent,
-			 struct bpf_prog *prog,
-			 enum bpf_attach_type type)
+int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
+			struct bpf_prog *prog, enum bpf_attach_type type,
+			bool new_overridable)
 {
-	struct bpf_prog *old_prog, *effective;
+	struct bpf_prog *old_prog, *effective = NULL;
 	struct cgroup_subsys_state *pos;
+	bool overridable = true;
 
-	old_prog = xchg(cgrp->bpf.prog + type, prog);
+	if (parent) {
+		overridable = !parent->bpf.disallow_override[type];
+		effective = rcu_dereference_protected(parent->bpf.effective[type],
+						      lockdep_is_held(&cgroup_mutex));
+	}
+
+	if (prog && effective && !overridable)
+		/* if parent has non-overridable prog attached, disallow
+		 * attaching new programs to descendent cgroup
+		 */
+		return -EPERM;
+
+	if (prog && effective && overridable != new_overridable)
+		/* if parent has overridable prog attached, only
+		 * allow overridable programs in descendent cgroup
+		 */
+		return -EPERM;
 
-	effective = (!prog && parent) ?
-		rcu_dereference_protected(parent->bpf.effective[type],
-					  lockdep_is_held(&cgroup_mutex)) :
-		prog;
+	old_prog = cgrp->bpf.prog[type];
+
+	if (prog) {
+		overridable = new_overridable;
+		effective = prog;
+		if (old_prog &&
+		    cgrp->bpf.disallow_override[type] == new_overridable)
+			/* disallow attaching non-overridable on top
+			 * of existing overridable in this cgroup
+			 * and vice versa
+			 */
+			return -EPERM;
+	}
+
+	if (!prog && !old_prog)
+		/* report error when trying to detach and nothing is attached */
+		return -ENOENT;
+
+	cgrp->bpf.prog[type] = prog;
 
 	css_for_each_descendant_pre(pos, &cgrp->self) {
 		struct cgroup *desc = container_of(pos, struct cgroup, self);
 
 		/* skip the subtree if the descendant has its own program */
-		if (desc->bpf.prog[type] && desc != cgrp)
+		if (desc->bpf.prog[type] && desc != cgrp) {
 			pos = css_rightmost_descendant(pos);
-		else
+		} else {
 			rcu_assign_pointer(desc->bpf.effective[type],
 					   effective);
+			desc->bpf.disallow_override[type] = !overridable;
+		}
 	}
 
 	if (prog)
@@ -115,6 +149,7 @@ void __cgroup_bpf_update(struct cgroup *cgrp,
 		bpf_prog_put(old_prog);
 		static_branch_dec(&cgroup_bpf_enabled_key);
 	}
+	return 0;
 }
 
 /**
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 19b6129eab23..bbb016adbaeb 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -920,13 +920,14 @@ static int bpf_obj_get(const union bpf_attr *attr)
 
 #ifdef CONFIG_CGROUP_BPF
 
-#define BPF_PROG_ATTACH_LAST_FIELD attach_type
+#define BPF_PROG_ATTACH_LAST_FIELD attach_flags
 
 static int bpf_prog_attach(const union bpf_attr *attr)
 {
+	enum bpf_prog_type ptype;
 	struct bpf_prog *prog;
 	struct cgroup *cgrp;
-	enum bpf_prog_type ptype;
+	int ret;
 
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
@@ -934,6 +935,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	if (CHECK_ATTR(BPF_PROG_ATTACH))
 		return -EINVAL;
 
+	if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE)
+		return -EINVAL;
+
 	switch (attr->attach_type) {
 	case BPF_CGROUP_INET_INGRESS:
 	case BPF_CGROUP_INET_EGRESS:
@@ -956,10 +960,13 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 		return PTR_ERR(cgrp);
 	}
 
-	cgroup_bpf_update(cgrp, prog, attr->attach_type);
+	ret = cgroup_bpf_update(cgrp, prog, attr->attach_type,
+				attr->attach_flags & BPF_F_ALLOW_OVERRIDE);
+	if (ret)
+		bpf_prog_put(prog);
 	cgroup_put(cgrp);
 
-	return 0;
+	return ret;
 }
 
 #define BPF_PROG_DETACH_LAST_FIELD attach_type
@@ -967,6 +974,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 static int bpf_prog_detach(const union bpf_attr *attr)
 {
 	struct cgroup *cgrp;
+	int ret;
 
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
@@ -982,7 +990,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 		if (IS_ERR(cgrp))
 			return PTR_ERR(cgrp);
 
-		cgroup_bpf_update(cgrp, NULL, attr->attach_type);
+		ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false);
 		cgroup_put(cgrp);
 		break;
 
@@ -990,7 +998,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 		return -EINVAL;
 	}
 
-	return 0;
+	return ret;
 }
 #endif /* CONFIG_CGROUP_BPF */
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 688dd02af985..53bbca7c4859 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -6498,15 +6498,16 @@ static __init int cgroup_namespaces_init(void)
 subsys_initcall(cgroup_namespaces_init);
 
 #ifdef CONFIG_CGROUP_BPF
-void cgroup_bpf_update(struct cgroup *cgrp,
-		       struct bpf_prog *prog,
-		       enum bpf_attach_type type)
+int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
+		      enum bpf_attach_type type, bool overridable)
 {
 	struct cgroup *parent = cgroup_parent(cgrp);
+	int ret;
 
 	mutex_lock(&cgroup_mutex);
-	__cgroup_bpf_update(cgrp, parent, prog, type);
+	ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable);
 	mutex_unlock(&cgroup_mutex);
+	return ret;
 }
 #endif /* CONFIG_CGROUP_BPF */
 
diff --git a/samples/bpf/test_cgrp2_attach.c b/samples/bpf/test_cgrp2_attach.c
index 504058631ffc..4bfcaf93fcf3 100644
--- a/samples/bpf/test_cgrp2_attach.c
+++ b/samples/bpf/test_cgrp2_attach.c
@@ -104,7 +104,7 @@ static int attach_filter(int cg_fd, int type, int verdict)
 		return EXIT_FAILURE;
 	}
 
-	ret = bpf_prog_attach(prog_fd, cg_fd, type);
+	ret = bpf_prog_attach(prog_fd, cg_fd, type, 0);
 	if (ret < 0) {
 		printf("Failed to attach prog to cgroup: '%s'\n",
 		       strerror(errno));
diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c
index 6e69be37f87f..3049b1f26267 100644
--- a/samples/bpf/test_cgrp2_attach2.c
+++ b/samples/bpf/test_cgrp2_attach2.c
@@ -79,11 +79,12 @@ int main(int argc, char **argv)
 	if (join_cgroup(FOO))
 		goto err;
 
-	if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS)) {
+	if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, 1)) {
 		log_err("Attaching prog to /foo");
 		goto err;
 	}
 
+	printf("Attached DROP prog. This ping in cgroup /foo should fail...\n");
 	assert(system(PING_CMD) != 0);
 
 	/* Create cgroup /foo/bar, get fd, and join it */
@@ -94,24 +95,27 @@ int main(int argc, char **argv)
 	if (join_cgroup(BAR))
 		goto err;
 
+	printf("Attached DROP prog. This ping in cgroup /foo/bar should fail...\n");
 	assert(system(PING_CMD) != 0);
 
-	if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS)) {
+	if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) {
 		log_err("Attaching prog to /foo/bar");
 		goto err;
 	}
 
+	printf("Attached PASS prog. This ping in cgroup /foo/bar should pass...\n");
 	assert(system(PING_CMD) == 0);
 
-
 	if (bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS)) {
 		log_err("Detaching program from /foo/bar");
 		goto err;
 	}
 
+	printf("Detached PASS from /foo/bar while DROP is attached to /foo.\n"
+	       "This ping in cgroup /foo/bar should fail...\n");
 	assert(system(PING_CMD) != 0);
 
-	if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS)) {
+	if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) {
 		log_err("Attaching prog to /foo/bar");
 		goto err;
 	}
@@ -121,8 +125,60 @@ int main(int argc, char **argv)
 		goto err;
 	}
 
+	printf("Attached PASS from /foo/bar and detached DROP from /foo.\n"
+	       "This ping in cgroup /foo/bar should pass...\n");
 	assert(system(PING_CMD) == 0);
 
+	if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) {
+		log_err("Attaching prog to /foo/bar");
+		goto err;
+	}
+
+	if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 0)) {
+		errno = 0;
+		log_err("Unexpected success attaching prog to /foo/bar");
+		goto err;
+	}
+
+	if (bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS)) {
+		log_err("Detaching program from /foo/bar");
+		goto err;
+	}
+
+	if (!bpf_prog_detach(foo, BPF_CGROUP_INET_EGRESS)) {
+		errno = 0;
+		log_err("Unexpected success in double detach from /foo");
+		goto err;
+	}
+
+	if (bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, 0)) {
+		log_err("Attaching non-overridable prog to /foo");
+		goto err;
+	}
+
+	if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 0)) {
+		errno = 0;
+		log_err("Unexpected success attaching non-overridable prog to /foo/bar");
+		goto err;
+	}
+
+	if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) {
+		errno = 0;
+		log_err("Unexpected success attaching overridable prog to /foo/bar");
+		goto err;
+	}
+
+	if (!bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, 1)) {
+		errno = 0;
+		log_err("Unexpected success attaching overridable prog to /foo");
+		goto err;
+	}
+
+	if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, 0)) {
+		log_err("Attaching different non-overridable prog to /foo");
+		goto err;
+	}
+
 	goto out;
 
 err:
@@ -132,5 +188,9 @@ out:
 	close(foo);
 	close(bar);
 	cleanup_cgroup_environment();
+	if (!rc)
+		printf("PASS\n");
+	else
+		printf("FAIL\n");
 	return rc;
 }
diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c
index 0791b949cbe4..c3cfb23e23b5 100644
--- a/samples/bpf/test_cgrp2_sock.c
+++ b/samples/bpf/test_cgrp2_sock.c
@@ -75,7 +75,7 @@ int main(int argc, char **argv)
 		return EXIT_FAILURE;
 	}
 
-	ret = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET_SOCK_CREATE);
+	ret = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET_SOCK_CREATE, 0);
 	if (ret < 0) {
 		printf("Failed to attach prog to cgroup: '%s'\n",
 		       strerror(errno));
diff --git a/samples/bpf/test_cgrp2_sock2.c b/samples/bpf/test_cgrp2_sock2.c
index 455ef0d06e93..db036077b644 100644
--- a/samples/bpf/test_cgrp2_sock2.c
+++ b/samples/bpf/test_cgrp2_sock2.c
@@ -55,7 +55,7 @@ int main(int argc, char **argv)
 	}
 
 	ret = bpf_prog_attach(prog_fd[filter_id], cg_fd,
-			      BPF_CGROUP_INET_SOCK_CREATE);
+			      BPF_CGROUP_INET_SOCK_CREATE, 0);
 	if (ret < 0) {
 		printf("Failed to attach prog to cgroup: '%s'\n",
 		       strerror(errno));
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 3ddb58a36d3c..ae752fa4eaa7 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -168,7 +168,8 @@ int bpf_obj_get(const char *pathname)
 	return sys_bpf(BPF_OBJ_GET, &attr, sizeof(attr));
 }
 
-int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type)
+int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type,
+		    unsigned int flags)
 {
 	union bpf_attr attr;
 
@@ -176,6 +177,7 @@ int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type)
 	attr.target_fd	   = target_fd;
 	attr.attach_bpf_fd = prog_fd;
 	attr.attach_type   = type;
+	attr.attach_flags  = flags;
 
 	return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
 }
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index a2f9853dd882..4ac6c4b84100 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -41,7 +41,8 @@ int bpf_map_delete_elem(int fd, void *key);
 int bpf_map_get_next_key(int fd, void *key, void *next_key);
 int bpf_obj_pin(int fd, const char *pathname);
 int bpf_obj_get(const char *pathname);
-int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type);
+int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type,
+		    unsigned int flags);
 int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type);
 
 
-- 
cgit v1.2.3


From d6cffbbe9a7e51eb705182965a189457c17ba8a3 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Fri, 10 Feb 2017 10:35:02 +0300
Subject: proc/sysctl: prune stale dentries during unregistering

Currently unregistering sysctl table does not prune its dentries.
Stale dentries could slowdown sysctl operations significantly.

For example, command:

 # for i in {1..100000} ; do unshare -n -- sysctl -a &> /dev/null ; done
 creates a millions of stale denties around sysctls of loopback interface:

 # sysctl fs.dentry-state
 fs.dentry-state = 25812579  24724135        45      0       0       0

 All of them have matching names thus lookup have to scan though whole
 hash chain and call d_compare (proc_sys_compare) which checks them
 under system-wide spinlock (sysctl_lock).

 # time sysctl -a > /dev/null
 real    1m12.806s
 user    0m0.016s
 sys     1m12.400s

Currently only memory reclaimer could remove this garbage.
But without significant memory pressure this never happens.

This patch collects sysctl inodes into list on sysctl table header and
prunes all their dentries once that table unregisters.

Konstantin Khlebnikov <khlebnikov@yandex-team.ru> writes:
> On 10.02.2017 10:47, Al Viro wrote:
>> how about >> the matching stats *after* that patch?
>
> dcache size doesn't grow endlessly, so stats are fine
>
> # sysctl fs.dentry-state
> fs.dentry-state = 92712	58376	45	0	0	0
>
> # time sysctl -a &>/dev/null
>
> real	0m0.013s
> user	0m0.004s
> sys	0m0.008s

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/proc/inode.c        |  3 ++-
 fs/proc/internal.h     |  7 ++++--
 fs/proc/proc_sysctl.c  | 59 ++++++++++++++++++++++++++++++++++++--------------
 include/linux/sysctl.h |  1 +
 4 files changed, 51 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 842a5ff5b85c..7ad9ed7958af 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -43,10 +43,11 @@ static void proc_evict_inode(struct inode *inode)
 	de = PDE(inode);
 	if (de)
 		pde_put(de);
+
 	head = PROC_I(inode)->sysctl;
 	if (head) {
 		RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
-		sysctl_head_put(head);
+		proc_sys_evict_inode(inode, head);
 	}
 }
 
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index e2c3c461fa20..5d6960f5f1c0 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -65,6 +65,7 @@ struct proc_inode {
 	struct proc_dir_entry *pde;
 	struct ctl_table_header *sysctl;
 	struct ctl_table *sysctl_entry;
+	struct list_head sysctl_inodes;
 	const struct proc_ns_operations *ns_ops;
 	struct inode vfs_inode;
 };
@@ -237,10 +238,12 @@ extern void proc_thread_self_init(void);
  */
 #ifdef CONFIG_PROC_SYSCTL
 extern int proc_sys_init(void);
-extern void sysctl_head_put(struct ctl_table_header *);
+extern void proc_sys_evict_inode(struct inode *inode,
+				 struct ctl_table_header *head);
 #else
 static inline void proc_sys_init(void) { }
-static inline void sysctl_head_put(struct ctl_table_header *head) { }
+static inline void proc_sys_evict_inode(struct  inode *inode,
+					struct ctl_table_header *head) { }
 #endif
 
 /*
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index d4e37acd4821..8efb1e10b025 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -190,6 +190,7 @@ static void init_header(struct ctl_table_header *head,
 	head->set = set;
 	head->parent = NULL;
 	head->node = node;
+	INIT_LIST_HEAD(&head->inodes);
 	if (node) {
 		struct ctl_table *entry;
 		for (entry = table; entry->procname; entry++, node++)
@@ -259,6 +260,29 @@ static void unuse_table(struct ctl_table_header *p)
 			complete(p->unregistering);
 }
 
+/* called under sysctl_lock */
+static void proc_sys_prune_dcache(struct ctl_table_header *head)
+{
+	struct inode *inode, *prev = NULL;
+	struct proc_inode *ei;
+
+	list_for_each_entry(ei, &head->inodes, sysctl_inodes) {
+		inode = igrab(&ei->vfs_inode);
+		if (inode) {
+			spin_unlock(&sysctl_lock);
+			iput(prev);
+			prev = inode;
+			d_prune_aliases(inode);
+			spin_lock(&sysctl_lock);
+		}
+	}
+	if (prev) {
+		spin_unlock(&sysctl_lock);
+		iput(prev);
+		spin_lock(&sysctl_lock);
+	}
+}
+
 /* called under sysctl_lock, will reacquire if has to wait */
 static void start_unregistering(struct ctl_table_header *p)
 {
@@ -277,6 +301,11 @@ static void start_unregistering(struct ctl_table_header *p)
 		/* anything non-NULL; we'll never dereference it */
 		p->unregistering = ERR_PTR(-EINVAL);
 	}
+	/*
+	 * Prune dentries for unregistered sysctls: namespaced sysctls
+	 * can have duplicate names and contaminate dcache very badly.
+	 */
+	proc_sys_prune_dcache(p);
 	/*
 	 * do not remove from the list until nobody holds it; walking the
 	 * list in do_sysctl() relies on that.
@@ -284,21 +313,6 @@ static void start_unregistering(struct ctl_table_header *p)
 	erase_header(p);
 }
 
-static void sysctl_head_get(struct ctl_table_header *head)
-{
-	spin_lock(&sysctl_lock);
-	head->count++;
-	spin_unlock(&sysctl_lock);
-}
-
-void sysctl_head_put(struct ctl_table_header *head)
-{
-	spin_lock(&sysctl_lock);
-	if (!--head->count)
-		kfree_rcu(head, rcu);
-	spin_unlock(&sysctl_lock);
-}
-
 static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
 {
 	BUG_ON(!head);
@@ -440,11 +454,15 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
 
 	inode->i_ino = get_next_ino();
 
-	sysctl_head_get(head);
 	ei = PROC_I(inode);
 	ei->sysctl = head;
 	ei->sysctl_entry = table;
 
+	spin_lock(&sysctl_lock);
+	list_add(&ei->sysctl_inodes, &head->inodes);
+	head->count++;
+	spin_unlock(&sysctl_lock);
+
 	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
 	inode->i_mode = table->mode;
 	if (!S_ISDIR(table->mode)) {
@@ -466,6 +484,15 @@ out:
 	return inode;
 }
 
+void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
+{
+	spin_lock(&sysctl_lock);
+	list_del(&PROC_I(inode)->sysctl_inodes);
+	if (!--head->count)
+		kfree_rcu(head, rcu);
+	spin_unlock(&sysctl_lock);
+}
+
 static struct ctl_table_header *grab_header(struct inode *inode)
 {
 	struct ctl_table_header *head = PROC_I(inode)->sysctl;
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index adf4e51cf597..b7e82049fec7 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -143,6 +143,7 @@ struct ctl_table_header
 	struct ctl_table_set *set;
 	struct ctl_dir *parent;
 	struct ctl_node *node;
+	struct list_head inodes; /* head for proc_inode->sysctl_inodes */
 };
 
 struct ctl_dir {
-- 
cgit v1.2.3


From 6ccc3a33810e8ec09936fa990c13370d9f61606f Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Fri, 27 Jan 2017 11:52:35 +0100
Subject: input: cros_ec_keyb: Add Tablet Mode switch

Add switch to report tablet mode.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Signed-off-by: Enric Balletbo Serra <enric.balletbo@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/input/keyboard/cros_ec_keyb.c | 5 +++++
 include/linux/mfd/cros_ec_commands.h  | 1 +
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/input/keyboard/cros_ec_keyb.c b/drivers/input/keyboard/cros_ec_keyb.c
index ad74ebce0cdd..604c7ade8df2 100644
--- a/drivers/input/keyboard/cros_ec_keyb.c
+++ b/drivers/input/keyboard/cros_ec_keyb.c
@@ -111,6 +111,11 @@ static const struct cros_ec_bs_map cros_ec_keyb_bs[] = {
 		.bit		= EC_MKBP_LID_OPEN,
 		.inverted	= true,
 	},
+	{
+		.ev_type	= EV_SW,
+		.code		= SW_TABLET_MODE,
+		.bit		= EC_MKBP_TABLET_MODE,
+	},
 };
 
 /*
diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index 004dcf3560fb..da1c188562bc 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -2068,6 +2068,7 @@ struct ec_response_get_next_event {
 
 /* Switches */
 #define EC_MKBP_LID_OPEN	0
+#define EC_MKBP_TABLET_MODE	1
 
 /*****************************************************************************/
 /* Temperature sensor commands */
-- 
cgit v1.2.3


From 34a23327697d558a582e356573d7d36773996fbb Mon Sep 17 00:00:00 2001
From: Corentin Labbe <clabbe.montjoie@gmail.com>
Date: Mon, 5 Dec 2016 13:15:28 +0100
Subject: mfd: axp20x: Correct a typo in axp20x_device_remove documentation

The documentation of axp20x_device_remove() have a typo and use
axp20x_device_probe() as name. This patch fix this typo.

Signed-off-by: Corentin Labbe <clabbe.montjoie@gmail.com>
Acked-by: Chen-Yu Tsai <wens@csie.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/axp20x.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h
index 812806d6319b..39f1da18c917 100644
--- a/include/linux/mfd/axp20x.h
+++ b/include/linux/mfd/axp20x.h
@@ -582,7 +582,7 @@ int axp20x_match_device(struct axp20x_dev *axp20x);
 int axp20x_device_probe(struct axp20x_dev *axp20x);
 
 /**
- * axp20x_device_probe(): Remove a axp20x device
+ * axp20x_device_remove(): Remove a axp20x device
  *
  * @axp20x: axp20x device to remove
  *
-- 
cgit v1.2.3


From 0a5454c901aea0fef99f5ef7910c69c501817ae1 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Wed, 14 Dec 2016 14:52:05 +0100
Subject: mfd: axp20x: Use IRQF_TRIGGER_LOW on the axp288

The interrupt line of the entire family of axp2xx pmics is active-low,
for devicetree enumerated irqs, this is dealt with in the devicetree.

ACPI irq resources have a flag field for this too, I tried using this
on my CUBE iwork8 Air tablet, but it does not contain the right data.

The dstd shows the irq listed as either ActiveLow or ActiveHigh,
depending on the OSID variable, which seems to be set by the
"OS IMAGE ID" in the BIOS/EFI setup screen.

Since the acpi-resource info is no good, simply pass in IRQF_TRIGGER_LOW
on the axp288.

Together with the other axp288 fixes in this series, this fixes the axp288
irq contineously triggering.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Acked-by: Chen-Yu Tsai <wens@csie.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/axp20x.c       | 6 +++---
 include/linux/mfd/axp20x.h | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/axp20x.c b/drivers/mfd/axp20x.c
index ed918de84238..f124bc36f65d 100644
--- a/drivers/mfd/axp20x.c
+++ b/drivers/mfd/axp20x.c
@@ -802,6 +802,7 @@ int axp20x_match_device(struct axp20x_dev *axp20x)
 		axp20x->nr_cells = ARRAY_SIZE(axp288_cells);
 		axp20x->regmap_cfg = &axp288_regmap_config;
 		axp20x->regmap_irq_chip = &axp288_regmap_irq_chip;
+		axp20x->irq_flags = IRQF_TRIGGER_LOW;
 		break;
 	case AXP806_ID:
 		axp20x->nr_cells = ARRAY_SIZE(axp806_cells);
@@ -831,9 +832,8 @@ int axp20x_device_probe(struct axp20x_dev *axp20x)
 	int ret;
 
 	ret = regmap_add_irq_chip(axp20x->regmap, axp20x->irq,
-				  IRQF_ONESHOT | IRQF_SHARED, -1,
-				  axp20x->regmap_irq_chip,
-				  &axp20x->regmap_irqc);
+			  IRQF_ONESHOT | IRQF_SHARED | axp20x->irq_flags,
+			   -1, axp20x->regmap_irq_chip, &axp20x->regmap_irqc);
 	if (ret) {
 		dev_err(axp20x->dev, "failed to add irq chip: %d\n", ret);
 		return ret;
diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h
index 39f1da18c917..6d5dd3f96d33 100644
--- a/include/linux/mfd/axp20x.h
+++ b/include/linux/mfd/axp20x.h
@@ -523,6 +523,7 @@ enum axp809_irqs {
 struct axp20x_dev {
 	struct device			*dev;
 	int				irq;
+	unsigned long			irq_flags;
 	struct regmap			*regmap;
 	struct regmap_irq_chip_data	*regmap_irqc;
 	long				variant;
-- 
cgit v1.2.3


From 59f10f7e62a28b9b664a9b2efbc5bd10d8d751f0 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Wed, 14 Dec 2016 14:52:08 +0100
Subject: mfd: axp20x: Drop wrong AXP288_PMIC_ADC_EN define

The adc-enable register for the axp288 is 0x82, not 0x84.
0x82 is already defined as AXP20X_ADC_EN1 and that is what the
axp288_adc driver is actually using, so simply drop the wrong
AXP288_PMIC_ADC_EN define.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Acked-by: Chen-Yu Tsai <wens@csie.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/axp20x.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h
index 6d5dd3f96d33..35418ccb3bb7 100644
--- a/include/linux/mfd/axp20x.h
+++ b/include/linux/mfd/axp20x.h
@@ -238,7 +238,6 @@ enum {
 #define AXP288_PMIC_ADC_H               0x56
 #define AXP288_PMIC_ADC_L               0x57
 #define AXP288_ADC_TS_PIN_CTRL          0x84
-#define AXP288_PMIC_ADC_EN              0x84
 
 /* Fuel Gauge */
 #define AXP288_FG_RDC1_REG          0xba
-- 
cgit v1.2.3


From 178e8351cef3607821ec9a4e2fe3f414145fdf34 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Fri, 16 Dec 2016 21:09:05 +0100
Subject: mfd: axp20x: Add a few missing defines for AXP288 specific registers

Add defines for the AXP288_POWER_REASON and AXP288_RT_BATT_V_H and
AXP288_RT_BATT_V_L and AXP288_BC_* registers. While at it also move the
AXP288_TS_ADC_H-AXP288_GP_ADC_L defines, which for some reason where
in a different place, together with the rest of the AXP288 specific
defines.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Acked-by: Chen-Yu Tsai <wens@csie.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/axp20x.h | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h
index 35418ccb3bb7..0aa4ef7157b8 100644
--- a/include/linux/mfd/axp20x.h
+++ b/include/linux/mfd/axp20x.h
@@ -235,9 +235,20 @@ enum {
 #define AXP22X_BATLOW_THRES1		0xe6
 
 /* AXP288 specific registers */
+#define AXP288_POWER_REASON		0x02
+#define AXP288_BC_GLOBAL		0x2c
+#define AXP288_BC_VBUS_CNTL		0x2d
+#define AXP288_BC_USB_STAT		0x2e
+#define AXP288_BC_DET_STAT		0x2f
 #define AXP288_PMIC_ADC_H               0x56
 #define AXP288_PMIC_ADC_L               0x57
+#define AXP288_TS_ADC_H			0x58
+#define AXP288_TS_ADC_L			0x59
+#define AXP288_GP_ADC_H			0x5a
+#define AXP288_GP_ADC_L			0x5b
 #define AXP288_ADC_TS_PIN_CTRL          0x84
+#define AXP288_RT_BATT_V_H		0xa0
+#define AXP288_RT_BATT_V_L		0xa1
 
 /* Fuel Gauge */
 #define AXP288_FG_RDC1_REG          0xba
@@ -514,11 +525,6 @@ enum axp809_irqs {
 	AXP809_IRQ_GPIO0_INPUT,
 };
 
-#define AXP288_TS_ADC_H		0x58
-#define AXP288_TS_ADC_L		0x59
-#define AXP288_GP_ADC_H		0x5a
-#define AXP288_GP_ADC_L		0x5b
-
 struct axp20x_dev {
 	struct device			*dev;
 	int				irq;
-- 
cgit v1.2.3


From a042a7a4e1ce31380d1370a4d45a3f48d0972655 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Wed, 28 Dec 2016 21:55:47 +0000
Subject: mfd: abx500: Fix spelling mistake: "Celcius" -> "Celsius"

Trivial fix to spelling mistake in MFD headers.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/abx500.h           | 2 +-
 include/linux/mfd/abx500/ab8500-bm.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/abx500.h b/include/linux/mfd/abx500.h
index 552cc1d61cc7..44412c9d26e1 100644
--- a/include/linux/mfd/abx500.h
+++ b/include/linux/mfd/abx500.h
@@ -45,7 +45,7 @@ enum abx500_adc_therm {
  * struct abx500_res_to_temp - defines one point in a temp to res curve. To
  * be used in battery packs that combines the identification resistor with a
  * NTC resistor.
- * @temp:			battery pack temperature in Celcius
+ * @temp:			battery pack temperature in Celsius
  * @resist:			NTC resistor net total resistance
  */
 struct abx500_res_to_temp {
diff --git a/include/linux/mfd/abx500/ab8500-bm.h b/include/linux/mfd/abx500/ab8500-bm.h
index 12a5b396921e..e63681eb6c62 100644
--- a/include/linux/mfd/abx500/ab8500-bm.h
+++ b/include/linux/mfd/abx500/ab8500-bm.h
@@ -279,7 +279,7 @@ enum bup_vch_sel {
  * struct res_to_temp - defines one point in a temp to res curve. To
  * be used in battery packs that combines the identification resistor with a
  * NTC resistor.
- * @temp:			battery pack temperature in Celcius
+ * @temp:			battery pack temperature in Celsius
  * @resist:			NTC resistor net total resistance
  */
 struct res_to_temp {
@@ -290,7 +290,7 @@ struct res_to_temp {
 /**
  * struct batres_vs_temp - defines one point in a temp vs battery internal
  * resistance curve.
- * @temp:			battery pack temperature in Celcius
+ * @temp:			battery pack temperature in Celsius
  * @resist:			battery internal reistance in mOhm
  */
 struct batres_vs_temp {
-- 
cgit v1.2.3


From a9eb186e13144782232cc6fa731441be54baf505 Mon Sep 17 00:00:00 2001
From: Joseph Lo <josephl@nvidia.com>
Date: Fri, 16 Dec 2016 18:57:36 +0100
Subject: mfd: cros_ec: Prevent data transfer while device is suspended

The cros_ec driver is still active while the device is suspended.
Besides that, it also tries to transfer data even after the I2C host had
been suspended. This patch uses a simple flag to prevent this.

Signed-off-by: Joseph Lo <josephl@nvidia.com>
Signed-off-by: Thierry Escande <thierry.escande@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/cros_ec.c                   | 2 ++
 drivers/platform/chrome/cros_ec_proto.c | 5 +++++
 include/linux/mfd/cros_ec.h             | 2 ++
 3 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/cros_ec.c b/drivers/mfd/cros_ec.c
index abd83424b498..ad48633c9a47 100644
--- a/drivers/mfd/cros_ec.c
+++ b/drivers/mfd/cros_ec.c
@@ -165,6 +165,7 @@ int cros_ec_suspend(struct cros_ec_device *ec_dev)
 
 	disable_irq(ec_dev->irq);
 	ec_dev->was_wake_device = ec_dev->wake_enabled;
+	ec_dev->suspended = true;
 
 	return 0;
 }
@@ -179,6 +180,7 @@ static void cros_ec_drain_events(struct cros_ec_device *ec_dev)
 
 int cros_ec_resume(struct cros_ec_device *ec_dev)
 {
+	ec_dev->suspended = false;
 	enable_irq(ec_dev->irq);
 
 	/*
diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c
index 04053fe1e980..ed5dee744c74 100644
--- a/drivers/platform/chrome/cros_ec_proto.c
+++ b/drivers/platform/chrome/cros_ec_proto.c
@@ -447,6 +447,11 @@ static int get_next_event(struct cros_ec_device *ec_dev)
 	struct cros_ec_command *msg = (struct cros_ec_command *)&buffer;
 	int ret;
 
+	if (ec_dev->suspended) {
+		dev_dbg(ec_dev->dev, "Device suspended.\n");
+		return -EHOSTDOWN;
+	}
+
 	msg->version = 0;
 	msg->command = EC_CMD_GET_NEXT_EVENT;
 	msg->insize = sizeof(ec_dev->event_data);
diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h
index f62043a75f43..7a01c94496f1 100644
--- a/include/linux/mfd/cros_ec.h
+++ b/include/linux/mfd/cros_ec.h
@@ -103,6 +103,7 @@ struct cros_ec_command {
  * @din_size: size of din buffer to allocate (zero to use static din)
  * @dout_size: size of dout buffer to allocate (zero to use static dout)
  * @wake_enabled: true if this device can wake the system from sleep
+ * @suspended: true if this device had been suspended
  * @cmd_xfer: send command to EC and get response
  *     Returns the number of bytes received if the communication succeeded, but
  *     that doesn't mean the EC was happy with the command. The caller
@@ -136,6 +137,7 @@ struct cros_ec_device {
 	int din_size;
 	int dout_size;
 	bool wake_enabled;
+	bool suspended;
 	int (*cmd_xfer)(struct cros_ec_device *ec,
 			struct cros_ec_command *msg);
 	int (*pkt_xfer)(struct cros_ec_device *ec,
-- 
cgit v1.2.3


From f00c06fd98576face871e62bb3aa045c5f647661 Mon Sep 17 00:00:00 2001
From: Shawn Nematbakhsh <shawnn@chromium.org>
Date: Fri, 16 Dec 2016 18:57:37 +0100
Subject: mfd: cros_ec: Send suspend state notification to EC

Notify EC when going to or returning from suspend so that proper actions
related to wake events can be taken.

Signed-off-by: Shawn Nematbakhsh <shawnn@chromium.org>
Signed-off-by: Thierry Escande <thierry.escande@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/cros_ec.c                | 49 ++++++++++++++++++++++++++++++++++++
 include/linux/mfd/cros_ec_commands.h | 14 +++++++++++
 2 files changed, 63 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/cros_ec.c b/drivers/mfd/cros_ec.c
index ad48633c9a47..b8a50808bedb 100644
--- a/drivers/mfd/cros_ec.c
+++ b/drivers/mfd/cros_ec.c
@@ -23,6 +23,7 @@
 #include <linux/module.h>
 #include <linux/mfd/core.h>
 #include <linux/mfd/cros_ec.h>
+#include <linux/suspend.h>
 #include <asm/unaligned.h>
 
 #define CROS_EC_DEV_EC_INDEX 0
@@ -65,6 +66,24 @@ static irqreturn_t ec_irq_thread(int irq, void *data)
 	return IRQ_HANDLED;
 }
 
+static int cros_ec_sleep_event(struct cros_ec_device *ec_dev, u8 sleep_event)
+{
+	struct {
+		struct cros_ec_command msg;
+		struct ec_params_host_sleep_event req;
+	} __packed buf;
+
+	memset(&buf, 0, sizeof(buf));
+
+	buf.req.sleep_event = sleep_event;
+
+	buf.msg.command = EC_CMD_HOST_SLEEP_EVENT;
+	buf.msg.version = 0;
+	buf.msg.outsize = sizeof(buf.req);
+
+	return cros_ec_cmd_xfer(ec_dev, &buf.msg);
+}
+
 int cros_ec_register(struct cros_ec_device *ec_dev)
 {
 	struct device *dev = ec_dev->dev;
@@ -136,6 +155,15 @@ int cros_ec_register(struct cros_ec_device *ec_dev)
 		}
 	}
 
+	/*
+	 * Clear sleep event - this will fail harmlessly on platforms that
+	 * don't implement the sleep event host command.
+	 */
+	err = cros_ec_sleep_event(ec_dev, 0);
+	if (err < 0)
+		dev_dbg(ec_dev->dev, "Error %d clearing sleep event to ec",
+			err);
+
 	dev_info(dev, "Chrome EC device registered\n");
 
 	return 0;
@@ -159,6 +187,16 @@ EXPORT_SYMBOL(cros_ec_remove);
 int cros_ec_suspend(struct cros_ec_device *ec_dev)
 {
 	struct device *dev = ec_dev->dev;
+	int ret;
+	u8 sleep_event;
+
+	sleep_event = pm_suspend_via_firmware() ? HOST_SLEEP_EVENT_S3_RESUME :
+						  HOST_SLEEP_EVENT_S0IX_RESUME;
+
+	ret = cros_ec_sleep_event(ec_dev, sleep_event);
+	if (ret < 0)
+		dev_dbg(ec_dev->dev, "Error %d sending suspend event to ec",
+			ret);
 
 	if (device_may_wakeup(dev))
 		ec_dev->wake_enabled = !enable_irq_wake(ec_dev->irq);
@@ -180,9 +218,20 @@ static void cros_ec_drain_events(struct cros_ec_device *ec_dev)
 
 int cros_ec_resume(struct cros_ec_device *ec_dev)
 {
+	int ret;
+	u8 sleep_event;
+
 	ec_dev->suspended = false;
 	enable_irq(ec_dev->irq);
 
+	sleep_event = pm_suspend_via_firmware() ? HOST_SLEEP_EVENT_S3_RESUME :
+						  HOST_SLEEP_EVENT_S0IX_RESUME;
+
+	ret = cros_ec_sleep_event(ec_dev, sleep_event);
+	if (ret < 0)
+		dev_dbg(ec_dev->dev, "Error %d sending resume event to ec",
+			ret);
+
 	/*
 	 * In some cases, we need to distinguish between events that occur
 	 * during suspend if the EC is not a wake source. For example,
diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index da1c188562bc..34ecae4d612c 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -2547,6 +2547,20 @@ struct ec_params_ext_power_current_limit {
 	uint32_t limit; /* in mA */
 } __packed;
 
+/* Inform the EC when entering a sleep state */
+#define EC_CMD_HOST_SLEEP_EVENT 0xa9
+
+enum host_sleep_event {
+	HOST_SLEEP_EVENT_S3_SUSPEND   = 1,
+	HOST_SLEEP_EVENT_S3_RESUME    = 2,
+	HOST_SLEEP_EVENT_S0IX_SUSPEND = 3,
+	HOST_SLEEP_EVENT_S0IX_RESUME  = 4
+};
+
+struct ec_params_host_sleep_event {
+	uint8_t sleep_event;
+} __packed;
+
 /*****************************************************************************/
 /* Smart battery pass-through */
 
-- 
cgit v1.2.3


From 56e1d40d3beab2f247d48574bf51fc5daeebc285 Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Thu, 5 Jan 2017 16:44:39 -0800
Subject: mfd: cpcap: Add minimal support

Many Motorola phones like droid 4 are using a custom PMIC called CPCAP
or 6556002. We can support it's core features quite easily with regmap_spi
and regmap_irq.

The children of cpcap, such as regulators, ADC and USB, can be just regular
device drivers and defined in the dts file. They get probed as we call
of_platform_populate() at the end of our probe, and then the children
can just call dev_get_regmap(dev.parent, NULL) to get the regmap.

Cc: devicetree@vger.kernel.org
Cc: Marcel Partap <mpartap@gmx.net>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Scott <michael.scott@linaro.org>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 .../devicetree/bindings/mfd/motorola-cpcap.txt     |  31 +++
 drivers/mfd/Kconfig                                |  11 +
 drivers/mfd/Makefile                               |   1 +
 drivers/mfd/motorola-cpcap.c                       | 259 ++++++++++++++++++
 include/linux/mfd/motorola-cpcap.h                 | 292 +++++++++++++++++++++
 5 files changed, 594 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/mfd/motorola-cpcap.txt
 create mode 100644 drivers/mfd/motorola-cpcap.c
 create mode 100644 include/linux/mfd/motorola-cpcap.h

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/mfd/motorola-cpcap.txt b/Documentation/devicetree/bindings/mfd/motorola-cpcap.txt
new file mode 100644
index 000000000000..15bc885f9df4
--- /dev/null
+++ b/Documentation/devicetree/bindings/mfd/motorola-cpcap.txt
@@ -0,0 +1,31 @@
+Motorola CPCAP PMIC device tree binding
+
+Required properties:
+- compatible		: One or both of "motorola,cpcap" or "ste,6556002"
+- reg			: SPI chip select
+- interrupt-parent	: The parent interrupt controller
+- interrupts		: The interrupt line the device is connected to
+- interrupt-controller	: Marks the device node as an interrupt controller
+- #interrupt-cells	: The number of cells to describe an IRQ, should be 2
+- #address-cells	: Child device offset number of cells, should be 1
+- #size-cells		: Child device size number of cells, should be 0
+- spi-max-frequency	: Typically set to 3000000
+- spi-cs-high		: SPI chip select direction
+
+Example:
+
+&mcspi1 {
+	cpcap: pmic@0 {
+		compatible = "motorola,cpcap", "ste,6556002";
+		reg = <0>;	/* cs0 */
+		interrupt-parent = <&gpio1>;
+		interrupts = <7 IRQ_TYPE_EDGE_RISING>;
+		interrupt-controller;
+		#interrupt-cells = <2>;
+		#address-cells = <1>;
+		#size-cells = <0>;
+		spi-max-frequency = <3000000>;
+		spi-cs-high;
+	};
+};
+
diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index dbebe20c6d6d..496247a7f893 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -715,6 +715,17 @@ config EZX_PCAP
 	  This enables the PCAP ASIC present on EZX Phones. This is
 	  needed for MMC, TouchScreen, Sound, USB, etc..
 
+config MFD_CPCAP
+	tristate "Support for Motorola CPCAP"
+	depends on SPI
+	depends on OF || COMPILE_TEST
+	select REGMAP_SPI
+	select REGMAP_IRQ
+	help
+	  Say yes here if you want to include driver for CPCAP.
+	  It is used on many Motorola phones and tablets as a PMIC.
+	  At least Motorola Droid 4 is known to use CPCAP.
+
 config MFD_VIPERBOARD
         tristate "Nano River Technologies Viperboard"
 	select MFD_CORE
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 876ca8600c51..31ce07611a6f 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -97,6 +97,7 @@ obj-$(CONFIG_MFD_MC13XXX_I2C)	+= mc13xxx-i2c.o
 obj-$(CONFIG_MFD_CORE)		+= mfd-core.o
 
 obj-$(CONFIG_EZX_PCAP)		+= ezx-pcap.o
+obj-$(CONFIG_MFD_CPCAP)		+= motorola-cpcap.o
 
 obj-$(CONFIG_MCP)		+= mcp-core.o
 obj-$(CONFIG_MCP_SA11X0)	+= mcp-sa11x0.o
diff --git a/drivers/mfd/motorola-cpcap.c b/drivers/mfd/motorola-cpcap.c
new file mode 100644
index 000000000000..6aeada7d7ce5
--- /dev/null
+++ b/drivers/mfd/motorola-cpcap.c
@@ -0,0 +1,259 @@
+/*
+ * Motorola CPCAP PMIC core driver
+ *
+ * Copyright (C) 2016 Tony Lindgren <tony@atomide.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/regmap.h>
+#include <linux/sysfs.h>
+
+#include <linux/mfd/motorola-cpcap.h>
+#include <linux/spi/spi.h>
+
+#define CPCAP_NR_IRQ_REG_BANKS	6
+#define CPCAP_NR_IRQ_CHIPS	3
+
+struct cpcap_ddata {
+	struct spi_device *spi;
+	struct regmap_irq *irqs;
+	struct regmap_irq_chip_data *irqdata[CPCAP_NR_IRQ_CHIPS];
+	const struct regmap_config *regmap_conf;
+	struct regmap *regmap;
+};
+
+static int cpcap_check_revision(struct cpcap_ddata *cpcap)
+{
+	u16 vendor, rev;
+	int ret;
+
+	ret = cpcap_get_vendor(&cpcap->spi->dev, cpcap->regmap, &vendor);
+	if (ret)
+		return ret;
+
+	ret = cpcap_get_revision(&cpcap->spi->dev, cpcap->regmap, &rev);
+	if (ret)
+		return ret;
+
+	dev_info(&cpcap->spi->dev, "CPCAP vendor: %s rev: %i.%i (%x)\n",
+		 vendor == CPCAP_VENDOR_ST ? "ST" : "TI",
+		 CPCAP_REVISION_MAJOR(rev), CPCAP_REVISION_MINOR(rev),
+		 rev);
+
+	if (rev < CPCAP_REVISION_2_1) {
+		dev_info(&cpcap->spi->dev,
+			 "Please add old CPCAP revision support as needed\n");
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+/*
+ * First two irq chips are the two private macro interrupt chips, the third
+ * irq chip is for register banks 1 - 4 and is available for drivers to use.
+ */
+static struct regmap_irq_chip cpcap_irq_chip[CPCAP_NR_IRQ_CHIPS] = {
+	{
+		.name = "cpcap-m2",
+		.num_regs = 1,
+		.status_base = CPCAP_REG_MI1,
+		.ack_base = CPCAP_REG_MI1,
+		.mask_base = CPCAP_REG_MIM1,
+		.use_ack = true,
+	},
+	{
+		.name = "cpcap-m2",
+		.num_regs = 1,
+		.status_base = CPCAP_REG_MI2,
+		.ack_base = CPCAP_REG_MI2,
+		.mask_base = CPCAP_REG_MIM2,
+		.use_ack = true,
+	},
+	{
+		.name = "cpcap1-4",
+		.num_regs = 4,
+		.status_base = CPCAP_REG_INT1,
+		.ack_base = CPCAP_REG_INT1,
+		.mask_base = CPCAP_REG_INTM1,
+		.type_base = CPCAP_REG_INTS1,
+		.use_ack = true,
+	},
+};
+
+static void cpcap_init_one_regmap_irq(struct cpcap_ddata *cpcap,
+				      struct regmap_irq *rirq,
+				      int irq_base, int irq)
+{
+	unsigned int reg_offset;
+	unsigned int bit, mask;
+
+	reg_offset = irq - irq_base;
+	reg_offset /= cpcap->regmap_conf->val_bits;
+	reg_offset *= cpcap->regmap_conf->reg_stride;
+
+	bit = irq % cpcap->regmap_conf->val_bits;
+	mask = (1 << bit);
+
+	rirq->reg_offset = reg_offset;
+	rirq->mask = mask;
+}
+
+static int cpcap_init_irq_chip(struct cpcap_ddata *cpcap, int irq_chip,
+			       int irq_start, int nr_irqs)
+{
+	struct regmap_irq_chip *chip = &cpcap_irq_chip[irq_chip];
+	int i, ret;
+
+	for (i = irq_start; i < irq_start + nr_irqs; i++) {
+		struct regmap_irq *rirq = &cpcap->irqs[i];
+
+		cpcap_init_one_regmap_irq(cpcap, rirq, irq_start, i);
+	}
+	chip->irqs = &cpcap->irqs[irq_start];
+	chip->num_irqs = nr_irqs;
+	chip->irq_drv_data = cpcap;
+
+	ret = devm_regmap_add_irq_chip(&cpcap->spi->dev, cpcap->regmap,
+				       cpcap->spi->irq,
+				       IRQF_TRIGGER_RISING |
+				       IRQF_SHARED, -1,
+				       chip, &cpcap->irqdata[irq_chip]);
+	if (ret) {
+		dev_err(&cpcap->spi->dev, "could not add irq chip %i: %i\n",
+			irq_chip, ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int cpcap_init_irq(struct cpcap_ddata *cpcap)
+{
+	int ret;
+
+	cpcap->irqs = devm_kzalloc(&cpcap->spi->dev,
+				   sizeof(*cpcap->irqs) *
+				   CPCAP_NR_IRQ_REG_BANKS *
+				   cpcap->regmap_conf->val_bits,
+				   GFP_KERNEL);
+	if (!cpcap->irqs)
+		return -ENOMEM;
+
+	ret = cpcap_init_irq_chip(cpcap, 0, 0, 16);
+	if (ret)
+		return ret;
+
+	ret = cpcap_init_irq_chip(cpcap, 1, 16, 16);
+	if (ret)
+		return ret;
+
+	ret = cpcap_init_irq_chip(cpcap, 2, 32, 64);
+	if (ret)
+		return ret;
+
+	enable_irq_wake(cpcap->spi->irq);
+
+	return 0;
+}
+
+static const struct of_device_id cpcap_of_match[] = {
+	{ .compatible = "motorola,cpcap", },
+	{ .compatible = "st,6556002", },
+	{},
+};
+MODULE_DEVICE_TABLE(of, cpcap_of_match);
+
+static const struct regmap_config cpcap_regmap_config = {
+	.reg_bits = 16,
+	.reg_stride = 4,
+	.pad_bits = 0,
+	.val_bits = 16,
+	.write_flag_mask = 0x8000,
+	.max_register = CPCAP_REG_ST_TEST2,
+	.cache_type = REGCACHE_NONE,
+	.reg_format_endian = REGMAP_ENDIAN_LITTLE,
+	.val_format_endian = REGMAP_ENDIAN_LITTLE,
+};
+
+static int cpcap_probe(struct spi_device *spi)
+{
+	const struct of_device_id *match;
+	struct cpcap_ddata *cpcap;
+	int ret;
+
+	match = of_match_device(of_match_ptr(cpcap_of_match), &spi->dev);
+	if (!match)
+		return -ENODEV;
+
+	cpcap = devm_kzalloc(&spi->dev, sizeof(*cpcap), GFP_KERNEL);
+	if (!cpcap)
+		return -ENOMEM;
+
+	cpcap->spi = spi;
+	spi_set_drvdata(spi, cpcap);
+
+	spi->bits_per_word = 16;
+	spi->mode = SPI_MODE_0 | SPI_CS_HIGH;
+
+	ret = spi_setup(spi);
+	if (ret)
+		return ret;
+
+	cpcap->regmap_conf = &cpcap_regmap_config;
+	cpcap->regmap = devm_regmap_init_spi(spi, &cpcap_regmap_config);
+	if (IS_ERR(cpcap->regmap)) {
+		ret = PTR_ERR(cpcap->regmap);
+		dev_err(&cpcap->spi->dev, "Failed to initialize regmap: %d\n",
+			ret);
+
+		return ret;
+	}
+
+	ret = cpcap_check_revision(cpcap);
+	if (ret) {
+		dev_err(&cpcap->spi->dev, "Failed to detect CPCAP: %i\n", ret);
+		return ret;
+	}
+
+	ret = cpcap_init_irq(cpcap);
+	if (ret)
+		return ret;
+
+	return of_platform_populate(spi->dev.of_node, NULL, NULL,
+				    &cpcap->spi->dev);
+}
+
+static int cpcap_remove(struct spi_device *pdev)
+{
+	struct cpcap_ddata *cpcap = spi_get_drvdata(pdev);
+
+	of_platform_depopulate(&cpcap->spi->dev);
+
+	return 0;
+}
+
+static struct spi_driver cpcap_driver = {
+	.driver = {
+		.name = "cpcap-core",
+		.of_match_table = cpcap_of_match,
+	},
+	.probe = cpcap_probe,
+	.remove = cpcap_remove,
+};
+module_spi_driver(cpcap_driver);
+
+MODULE_ALIAS("platform:cpcap");
+MODULE_DESCRIPTION("CPCAP driver");
+MODULE_AUTHOR("Tony Lindgren <tony@atomide.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/mfd/motorola-cpcap.h b/include/linux/mfd/motorola-cpcap.h
new file mode 100644
index 000000000000..b4031c2b2214
--- /dev/null
+++ b/include/linux/mfd/motorola-cpcap.h
@@ -0,0 +1,292 @@
+/*
+ * The register defines are based on earlier cpcap.h in Motorola Linux kernel
+ * tree.
+ *
+ * Copyright (C) 2007-2009 Motorola, Inc.
+ *
+ * Rewritten for the real register offsets instead of enumeration
+ * to make the defines usable with Linux kernel regmap support
+ *
+ * Copyright (C) 2016 Tony Lindgren <tony@atomide.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define CPCAP_VENDOR_ST		0
+#define CPCAP_VENDOR_TI		1
+
+#define CPCAP_REVISION_MAJOR(r)	(((r) >> 4) + 1)
+#define CPCAP_REVISION_MINOR(r)	((r) & 0xf)
+
+#define CPCAP_REVISION_1_0	0x08
+#define CPCAP_REVISION_1_1	0x09
+#define CPCAP_REVISION_2_0	0x10
+#define CPCAP_REVISION_2_1	0x11
+
+/* CPCAP registers */
+#define CPCAP_REG_INT1		0x0000	/* Interrupt 1 */
+#define CPCAP_REG_INT2		0x0004	/* Interrupt 2 */
+#define CPCAP_REG_INT3		0x0008	/* Interrupt 3 */
+#define CPCAP_REG_INT4		0x000c	/* Interrupt 4 */
+#define CPCAP_REG_INTM1		0x0010	/* Interrupt Mask 1 */
+#define CPCAP_REG_INTM2		0x0014	/* Interrupt Mask 2 */
+#define CPCAP_REG_INTM3		0x0018	/* Interrupt Mask 3 */
+#define CPCAP_REG_INTM4		0x001c	/* Interrupt Mask 4 */
+#define CPCAP_REG_INTS1		0x0020	/* Interrupt Sense 1 */
+#define CPCAP_REG_INTS2		0x0024	/* Interrupt Sense 2 */
+#define CPCAP_REG_INTS3		0x0028	/* Interrupt Sense 3 */
+#define CPCAP_REG_INTS4		0x002c	/* Interrupt Sense 4 */
+#define CPCAP_REG_ASSIGN1	0x0030	/* Resource Assignment 1 */
+#define CPCAP_REG_ASSIGN2	0x0034	/* Resource Assignment 2 */
+#define CPCAP_REG_ASSIGN3	0x0038	/* Resource Assignment 3 */
+#define CPCAP_REG_ASSIGN4	0x003c	/* Resource Assignment 4 */
+#define CPCAP_REG_ASSIGN5	0x0040	/* Resource Assignment 5 */
+#define CPCAP_REG_ASSIGN6	0x0044	/* Resource Assignment 6 */
+#define CPCAP_REG_VERSC1	0x0048	/* Version Control 1 */
+#define CPCAP_REG_VERSC2	0x004c	/* Version Control 2 */
+
+#define CPCAP_REG_MI1		0x0200	/* Macro Interrupt 1 */
+#define CPCAP_REG_MIM1		0x0204	/* Macro Interrupt Mask 1 */
+#define CPCAP_REG_MI2		0x0208	/* Macro Interrupt 2 */
+#define CPCAP_REG_MIM2		0x020c	/* Macro Interrupt Mask 2 */
+#define CPCAP_REG_UCC1		0x0210	/* UC Control 1 */
+#define CPCAP_REG_UCC2		0x0214	/* UC Control 2 */
+
+#define CPCAP_REG_PC1		0x021c	/* Power Cut 1 */
+#define CPCAP_REG_PC2		0x0220	/* Power Cut 2 */
+#define CPCAP_REG_BPEOL		0x0224	/* BP and EOL */
+#define CPCAP_REG_PGC		0x0228	/* Power Gate and Control */
+#define CPCAP_REG_MT1		0x022c	/* Memory Transfer 1 */
+#define CPCAP_REG_MT2		0x0230	/* Memory Transfer 2 */
+#define CPCAP_REG_MT3		0x0234	/* Memory Transfer 3 */
+#define CPCAP_REG_PF		0x0238	/* Print Format */
+
+#define CPCAP_REG_SCC		0x0400	/* System Clock Control */
+#define CPCAP_REG_SW1		0x0404	/* Stop Watch 1 */
+#define CPCAP_REG_SW2		0x0408	/* Stop Watch 2 */
+#define CPCAP_REG_UCTM		0x040c	/* UC Turbo Mode */
+#define CPCAP_REG_TOD1		0x0410	/* Time of Day 1 */
+#define CPCAP_REG_TOD2		0x0414	/* Time of Day 2 */
+#define CPCAP_REG_TODA1		0x0418	/* Time of Day Alarm 1 */
+#define CPCAP_REG_TODA2		0x041c	/* Time of Day Alarm 2 */
+#define CPCAP_REG_DAY		0x0420	/* Day */
+#define CPCAP_REG_DAYA		0x0424	/* Day Alarm */
+#define CPCAP_REG_VAL1		0x0428	/* Validity 1 */
+#define CPCAP_REG_VAL2		0x042c	/* Validity 2 */
+
+#define CPCAP_REG_SDVSPLL	0x0600	/* Switcher DVS and PLL */
+#define CPCAP_REG_SI2CC1	0x0604	/* Switcher I2C Control 1 */
+#define CPCAP_REG_Si2CC2	0x0608	/* Switcher I2C Control 2 */
+#define CPCAP_REG_S1C1		0x060c	/* Switcher 1 Control 1 */
+#define CPCAP_REG_S1C2		0x0610	/* Switcher 1 Control 2 */
+#define CPCAP_REG_S2C1		0x0614	/* Switcher 2 Control 1 */
+#define CPCAP_REG_S2C2		0x0618	/* Switcher 2 Control 2 */
+#define CPCAP_REG_S3C		0x061c	/* Switcher 3 Control */
+#define CPCAP_REG_S4C1		0x0620	/* Switcher 4 Control 1 */
+#define CPCAP_REG_S4C2		0x0624	/* Switcher 4 Control 2 */
+#define CPCAP_REG_S5C		0x0628	/* Switcher 5 Control */
+#define CPCAP_REG_S6C		0x062c	/* Switcher 6 Control */
+#define CPCAP_REG_VCAMC		0x0630	/* VCAM Control */
+#define CPCAP_REG_VCSIC		0x0634	/* VCSI Control */
+#define CPCAP_REG_VDACC		0x0638	/* VDAC Control */
+#define CPCAP_REG_VDIGC		0x063c	/* VDIG Control */
+#define CPCAP_REG_VFUSEC	0x0640	/* VFUSE Control */
+#define CPCAP_REG_VHVIOC	0x0644	/* VHVIO Control */
+#define CPCAP_REG_VSDIOC	0x0648	/* VSDIO Control */
+#define CPCAP_REG_VPLLC		0x064c	/* VPLL Control */
+#define CPCAP_REG_VRF1C		0x0650	/* VRF1 Control */
+#define CPCAP_REG_VRF2C		0x0654	/* VRF2 Control */
+#define CPCAP_REG_VRFREFC	0x0658	/* VRFREF Control */
+#define CPCAP_REG_VWLAN1C	0x065c	/* VWLAN1 Control */
+#define CPCAP_REG_VWLAN2C	0x0660	/* VWLAN2 Control */
+#define CPCAP_REG_VSIMC		0x0664	/* VSIM Control */
+#define CPCAP_REG_VVIBC		0x0668	/* VVIB Control */
+#define CPCAP_REG_VUSBC		0x066c	/* VUSB Control */
+#define CPCAP_REG_VUSBINT1C	0x0670	/* VUSBINT1 Control */
+#define CPCAP_REG_VUSBINT2C	0x0674	/* VUSBINT2 Control */
+#define CPCAP_REG_URT		0x0678	/* Useroff Regulator Trigger */
+#define CPCAP_REG_URM1		0x067c	/* Useroff Regulator Mask 1 */
+#define CPCAP_REG_URM2		0x0680	/* Useroff Regulator Mask 2 */
+
+#define CPCAP_REG_VAUDIOC	0x0800	/* VAUDIO Control */
+#define CPCAP_REG_CC		0x0804	/* Codec Control */
+#define CPCAP_REG_CDI		0x0808	/* Codec Digital Interface */
+#define CPCAP_REG_SDAC		0x080c	/* Stereo DAC */
+#define CPCAP_REG_SDACDI	0x0810	/* Stereo DAC Digital Interface */
+#define CPCAP_REG_TXI		0x0814	/* TX Inputs */
+#define CPCAP_REG_TXMP		0x0818	/* TX MIC PGA's */
+#define CPCAP_REG_RXOA		0x081c	/* RX Output Amplifiers */
+#define CPCAP_REG_RXVC		0x0820	/* RX Volume Control */
+#define CPCAP_REG_RXCOA		0x0824	/* RX Codec to Output Amps */
+#define CPCAP_REG_RXSDOA	0x0828	/* RX Stereo DAC to Output Amps */
+#define CPCAP_REG_RXEPOA	0x082c	/* RX External PGA to Output Amps */
+#define CPCAP_REG_RXLL		0x0830	/* RX Low Latency */
+#define CPCAP_REG_A2LA		0x0834	/* A2 Loudspeaker Amplifier */
+#define CPCAP_REG_MIPIS1	0x0838	/* MIPI Slimbus 1 */
+#define CPCAP_REG_MIPIS2	0x083c	/* MIPI Slimbus 2 */
+#define CPCAP_REG_MIPIS3	0x0840	/* MIPI Slimbus 3. */
+#define CPCAP_REG_LVAB		0x0844	/* LMR Volume and A4 Balanced. */
+
+#define CPCAP_REG_CCC1		0x0a00	/* Coulomb Counter Control 1 */
+#define CPCAP_REG_CRM		0x0a04	/* Charger and Reverse Mode */
+#define CPCAP_REG_CCCC2		0x0a08	/* Coincell and Coulomb Ctr Ctrl 2 */
+#define CPCAP_REG_CCS1		0x0a0c	/* Coulomb Counter Sample 1 */
+#define CPCAP_REG_CCS2		0x0a10	/* Coulomb Counter Sample 2 */
+#define CPCAP_REG_CCA1		0x0a14	/* Coulomb Counter Accumulator 1 */
+#define CPCAP_REG_CCA2		0x0a18	/* Coulomb Counter Accumulator 2 */
+#define CPCAP_REG_CCM		0x0a1c	/* Coulomb Counter Mode */
+#define CPCAP_REG_CCO		0x0a20	/* Coulomb Counter Offset */
+#define CPCAP_REG_CCI		0x0a24	/* Coulomb Counter Integrator */
+
+#define CPCAP_REG_ADCC1		0x0c00	/* A/D Converter Configuration 1 */
+#define CPCAP_REG_ADCC2		0x0c04	/* A/D Converter Configuration 2 */
+#define CPCAP_REG_ADCD0		0x0c08	/* A/D Converter Data 0 */
+#define CPCAP_REG_ADCD1		0x0c0c	/* A/D Converter Data 1 */
+#define CPCAP_REG_ADCD2		0x0c10	/* A/D Converter Data 2 */
+#define CPCAP_REG_ADCD3		0x0c14	/* A/D Converter Data 3 */
+#define CPCAP_REG_ADCD4		0x0c18	/* A/D Converter Data 4 */
+#define CPCAP_REG_ADCD5		0x0c1c	/* A/D Converter Data 5 */
+#define CPCAP_REG_ADCD6		0x0c20	/* A/D Converter Data 6 */
+#define CPCAP_REG_ADCD7		0x0c24	/* A/D Converter Data 7 */
+#define CPCAP_REG_ADCAL1	0x0c28	/* A/D Converter Calibration 1 */
+#define CPCAP_REG_ADCAL2	0x0c2c	/* A/D Converter Calibration 2 */
+
+#define CPCAP_REG_USBC1		0x0e00	/* USB Control 1 */
+#define CPCAP_REG_USBC2		0x0e04	/* USB Control 2 */
+#define CPCAP_REG_USBC3		0x0e08	/* USB Control 3 */
+#define CPCAP_REG_UVIDL		0x0e0c	/* ULPI Vendor ID Low */
+#define CPCAP_REG_UVIDH		0x0e10	/* ULPI Vendor ID High */
+#define CPCAP_REG_UPIDL		0x0e14	/* ULPI Product ID Low */
+#define CPCAP_REG_UPIDH		0x0e18	/* ULPI Product ID High */
+#define CPCAP_REG_UFC1		0x0e1c	/* ULPI Function Control 1 */
+#define CPCAP_REG_UFC2		0x0e20	/* ULPI Function Control 2 */
+#define CPCAP_REG_UFC3		0x0e24	/* ULPI Function Control 3 */
+#define CPCAP_REG_UIC1		0x0e28	/* ULPI Interface Control 1 */
+#define CPCAP_REG_UIC2		0x0e2c	/* ULPI Interface Control 2 */
+#define CPCAP_REG_UIC3		0x0e30	/* ULPI Interface Control 3 */
+#define CPCAP_REG_USBOTG1	0x0e34	/* USB OTG Control 1 */
+#define CPCAP_REG_USBOTG2	0x0e38	/* USB OTG Control 2 */
+#define CPCAP_REG_USBOTG3	0x0e3c	/* USB OTG Control 3 */
+#define CPCAP_REG_UIER1		0x0e40	/* USB Interrupt Enable Rising 1 */
+#define CPCAP_REG_UIER2		0x0e44	/* USB Interrupt Enable Rising 2 */
+#define CPCAP_REG_UIER3		0x0e48	/* USB Interrupt Enable Rising 3 */
+#define CPCAP_REG_UIEF1		0x0e4c	/* USB Interrupt Enable Falling 1 */
+#define CPCAP_REG_UIEF2		0x0e50	/* USB Interrupt Enable Falling 1 */
+#define CPCAP_REG_UIEF3		0x0e54	/* USB Interrupt Enable Falling 1 */
+#define CPCAP_REG_UIS		0x0e58	/* USB Interrupt Status */
+#define CPCAP_REG_UIL		0x0e5c	/* USB Interrupt Latch */
+#define CPCAP_REG_USBD		0x0e60	/* USB Debug */
+#define CPCAP_REG_SCR1		0x0e64	/* Scratch 1 */
+#define CPCAP_REG_SCR2		0x0e68	/* Scratch 2 */
+#define CPCAP_REG_SCR3		0x0e6c	/* Scratch 3 */
+
+#define CPCAP_REG_VMC		0x0eac	/* Video Mux Control */
+#define CPCAP_REG_OWDC		0x0eb0	/* One Wire Device Control */
+#define CPCAP_REG_GPIO0		0x0eb4	/* GPIO 0 Control */
+
+#define CPCAP_REG_GPIO1		0x0ebc	/* GPIO 1 Control */
+
+#define CPCAP_REG_GPIO2		0x0ec4	/* GPIO 2 Control */
+
+#define CPCAP_REG_GPIO3		0x0ecc	/* GPIO 3 Control */
+
+#define CPCAP_REG_GPIO4		0x0ed4	/* GPIO 4 Control */
+
+#define CPCAP_REG_GPIO5		0x0edc	/* GPIO 5 Control */
+
+#define CPCAP_REG_GPIO6		0x0ee4	/* GPIO 6 Control */
+
+#define CPCAP_REG_MDLC		0x1000	/* Main Display Lighting Control */
+#define CPCAP_REG_KLC		0x1004	/* Keypad Lighting Control */
+#define CPCAP_REG_ADLC		0x1008	/* Aux Display Lighting Control */
+#define CPCAP_REG_REDC		0x100c	/* Red Triode Control */
+#define CPCAP_REG_GREENC	0x1010	/* Green Triode Control */
+#define CPCAP_REG_BLUEC		0x1014	/* Blue Triode Control */
+#define CPCAP_REG_CFC		0x1018	/* Camera Flash Control */
+#define CPCAP_REG_ABC		0x101c	/* Adaptive Boost Control */
+#define CPCAP_REG_BLEDC		0x1020	/* Bluetooth LED Control */
+#define CPCAP_REG_CLEDC		0x1024	/* Camera Privacy LED Control */
+
+#define CPCAP_REG_OW1C		0x1200	/* One Wire 1 Command */
+#define CPCAP_REG_OW1D		0x1204	/* One Wire 1 Data */
+#define CPCAP_REG_OW1I		0x1208	/* One Wire 1 Interrupt */
+#define CPCAP_REG_OW1IE		0x120c	/* One Wire 1 Interrupt Enable */
+
+#define CPCAP_REG_OW1		0x1214	/* One Wire 1 Control */
+
+#define CPCAP_REG_OW2C		0x1220	/* One Wire 2 Command */
+#define CPCAP_REG_OW2D		0x1224	/* One Wire 2 Data */
+#define CPCAP_REG_OW2I		0x1228	/* One Wire 2 Interrupt */
+#define CPCAP_REG_OW2IE		0x122c	/* One Wire 2 Interrupt Enable */
+
+#define CPCAP_REG_OW2		0x1234	/* One Wire 2 Control */
+
+#define CPCAP_REG_OW3C		0x1240	/* One Wire 3 Command */
+#define CPCAP_REG_OW3D		0x1244	/* One Wire 3 Data */
+#define CPCAP_REG_OW3I		0x1248	/* One Wire 3 Interrupt */
+#define CPCAP_REG_OW3IE		0x124c	/* One Wire 3 Interrupt Enable */
+
+#define CPCAP_REG_OW3		0x1254	/* One Wire 3 Control */
+#define CPCAP_REG_GCAIC		0x1258	/* GCAI Clock Control */
+#define CPCAP_REG_GCAIM		0x125c	/* GCAI GPIO Mode */
+#define CPCAP_REG_LGDIR		0x1260	/* LMR GCAI GPIO Direction */
+#define CPCAP_REG_LGPU		0x1264	/* LMR GCAI GPIO Pull-up */
+#define CPCAP_REG_LGPIN		0x1268	/* LMR GCAI GPIO Pin */
+#define CPCAP_REG_LGMASK	0x126c	/* LMR GCAI GPIO Mask */
+#define CPCAP_REG_LDEB		0x1270	/* LMR Debounce Settings */
+#define CPCAP_REG_LGDET		0x1274	/* LMR GCAI Detach Detect */
+#define CPCAP_REG_LMISC		0x1278	/* LMR Misc Bits */
+#define CPCAP_REG_LMACE		0x127c	/* LMR Mace IC Support */
+
+#define CPCAP_REG_TEST		0x7c00	/* Test */
+
+#define CPCAP_REG_ST_TEST1	0x7d08	/* ST Test1 */
+
+#define CPCAP_REG_ST_TEST2	0x7d18	/* ST Test2 */
+
+/*
+ * Helpers for child devices to check the revision and vendor.
+ *
+ * REVISIT: No documentation for the bits below, please update
+ * to use proper names for defines when available.
+ */
+
+static inline int cpcap_get_revision(struct device *dev,
+				     struct regmap *regmap,
+				     u16 *revision)
+{
+	unsigned int val;
+	int ret;
+
+	ret = regmap_read(regmap, CPCAP_REG_VERSC1, &val);
+	if (ret) {
+		dev_err(dev, "Could not read revision\n");
+
+		return ret;
+	}
+
+	*revision = ((val >> 3) & 0x7) | ((val << 3) & 0x38);
+
+	return 0;
+}
+
+static inline int cpcap_get_vendor(struct device *dev,
+				   struct regmap *regmap,
+				   u16 *vendor)
+{
+	unsigned int val;
+	int ret;
+
+	ret = regmap_read(regmap, CPCAP_REG_VERSC1, &val);
+	if (ret) {
+		dev_err(dev, "Could not read vendor\n");
+
+		return ret;
+	}
+
+	*vendor = (val >> 6) & 0x7;
+
+	return 0;
+}
-- 
cgit v1.2.3


From 46bc5c408e4e325efbfff26609c76099979180a7 Mon Sep 17 00:00:00 2001
From: Jungseung Lee <js07.lee@samsung.com>
Date: Thu, 22 Dec 2016 12:37:34 +0900
Subject: mmc: core: Export device lifetime information through sysfs

In the eMMC 5.0 version of the spec, several EXT_CSD fields about
device lifetime are added.

 - Two types of estimated indications reflected by averaged wear out of memory
 - An indication reflected by average reserved blocks

Export the information through sysfs.

Signed-off-by: Jungseung Lee <js07.lee@samsung.com>
Reviewed-by: Jaehoon Chung <jh80.chung@samsung.com>
Reviewed-by: Shawn Lin <shawn.lin@rock-chips.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/mmc.c   | 12 ++++++++++++
 include/linux/mmc/card.h |  3 +++
 include/linux/mmc/mmc.h  |  3 +++
 3 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index b61b52f9da3d..c0e250706353 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -617,6 +617,12 @@ static int mmc_decode_ext_csd(struct mmc_card *card, u8 *ext_csd)
 		card->ext_csd.ffu_capable =
 			(ext_csd[EXT_CSD_SUPPORTED_MODE] & 0x1) &&
 			!(ext_csd[EXT_CSD_FW_CONFIG] & 0x1);
+
+		card->ext_csd.pre_eol_info = ext_csd[EXT_CSD_PRE_EOL_INFO];
+		card->ext_csd.device_life_time_est_typ_a =
+			ext_csd[EXT_CSD_DEVICE_LIFE_TIME_EST_TYP_A];
+		card->ext_csd.device_life_time_est_typ_b =
+			ext_csd[EXT_CSD_DEVICE_LIFE_TIME_EST_TYP_B];
 	}
 
 	/* eMMC v5.1 or later */
@@ -764,6 +770,10 @@ MMC_DEV_ATTR(manfid, "0x%06x\n", card->cid.manfid);
 MMC_DEV_ATTR(name, "%s\n", card->cid.prod_name);
 MMC_DEV_ATTR(oemid, "0x%04x\n", card->cid.oemid);
 MMC_DEV_ATTR(prv, "0x%x\n", card->cid.prv);
+MMC_DEV_ATTR(pre_eol_info, "%02x\n", card->ext_csd.pre_eol_info);
+MMC_DEV_ATTR(life_time, "0x%02x 0x%02x\n",
+	card->ext_csd.device_life_time_est_typ_a,
+	card->ext_csd.device_life_time_est_typ_b);
 MMC_DEV_ATTR(serial, "0x%08x\n", card->cid.serial);
 MMC_DEV_ATTR(enhanced_area_offset, "%llu\n",
 		card->ext_csd.enhanced_area_offset);
@@ -817,6 +827,8 @@ static struct attribute *mmc_std_attrs[] = {
 	&dev_attr_name.attr,
 	&dev_attr_oemid.attr,
 	&dev_attr_prv.attr,
+	&dev_attr_pre_eol_info.attr,
+	&dev_attr_life_time.attr,
 	&dev_attr_serial.attr,
 	&dev_attr_enhanced_area_offset.attr,
 	&dev_attr_enhanced_area_size.attr,
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 95d69d498296..00449e51838a 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -121,6 +121,9 @@ struct mmc_ext_csd {
 	u8			raw_pwr_cl_ddr_200_360;	/* 253 */
 	u8			raw_bkops_status;	/* 246 */
 	u8			raw_sectors[4];		/* 212 - 4 bytes */
+	u8			pre_eol_info;		/* 267 */
+	u8			device_life_time_est_typ_a;	/* 268 */
+	u8			device_life_time_est_typ_b;	/* 269 */
 
 	unsigned int            feature_support;
 #define MMC_DISCARD_FEATURE	BIT(0)                  /* CMD38 feature */
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index 672730acc705..a0740829ff00 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -339,6 +339,9 @@ struct _mmc_csd {
 #define EXT_CSD_CACHE_SIZE		249	/* RO, 4 bytes */
 #define EXT_CSD_PWR_CL_DDR_200_360	253	/* RO */
 #define EXT_CSD_FIRMWARE_VERSION	254	/* RO, 8 bytes */
+#define EXT_CSD_PRE_EOL_INFO		267	/* RO */
+#define EXT_CSD_DEVICE_LIFE_TIME_EST_TYP_A	268	/* RO */
+#define EXT_CSD_DEVICE_LIFE_TIME_EST_TYP_B	269	/* RO */
 #define EXT_CSD_CMDQ_DEPTH		307	/* RO */
 #define EXT_CSD_CMDQ_SUPPORT		308	/* RO */
 #define EXT_CSD_SUPPORTED_MODE		493	/* RO */
-- 
cgit v1.2.3


From 13f00f9f2460cfa106de6c30512b648d85831468 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 30 Dec 2016 13:47:15 +0100
Subject: mmc: Removed the unused public mmc boot.h header

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/mmc/boot.h | 7 -------
 1 file changed, 7 deletions(-)
 delete mode 100644 include/linux/mmc/boot.h

(limited to 'include/linux')

diff --git a/include/linux/mmc/boot.h b/include/linux/mmc/boot.h
deleted file mode 100644
index 23acc3baa07d..000000000000
--- a/include/linux/mmc/boot.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef LINUX_MMC_BOOT_H
-#define LINUX_MMC_BOOT_H
-
-enum { MMC_PROGRESS_ENTER, MMC_PROGRESS_INIT,
-       MMC_PROGRESS_LOAD, MMC_PROGRESS_DONE };
-
-#endif /* LINUX_MMC_BOOT_H */
-- 
cgit v1.2.3


From 95cc4df716a210a19f0611215c49484d460250fd Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 30 Dec 2016 13:47:16 +0100
Subject: mmc: sh_mmcif: Remove unused use_cd_gpio/cd_gpio from platform data

Cc: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/mmc/host/sh_mmcif.c  | 6 ------
 include/linux/mmc/sh_mmcif.h | 2 --
 2 files changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/host/sh_mmcif.c b/drivers/mmc/host/sh_mmcif.c
index 900778421be6..5f970550d44e 100644
--- a/drivers/mmc/host/sh_mmcif.c
+++ b/drivers/mmc/host/sh_mmcif.c
@@ -1509,12 +1509,6 @@ static int sh_mmcif_probe(struct platform_device *pdev)
 		}
 	}
 
-	if (pd && pd->use_cd_gpio) {
-		ret = mmc_gpio_request_cd(mmc, pd->cd_gpio, 0);
-		if (ret < 0)
-			goto err_clk;
-	}
-
 	mutex_init(&host->thread_lock);
 
 	ret = mmc_add_host(mmc);
diff --git a/include/linux/mmc/sh_mmcif.h b/include/linux/mmc/sh_mmcif.h
index ccd8fb2cad52..5ce5a2c1a1f5 100644
--- a/include/linux/mmc/sh_mmcif.h
+++ b/include/linux/mmc/sh_mmcif.h
@@ -35,10 +35,8 @@ struct sh_mmcif_plat_data {
 	int (*get_cd)(struct platform_device *pdef);
 	unsigned int		slave_id_tx;	/* embedded slave_id_[tr]x */
 	unsigned int		slave_id_rx;
-	bool			use_cd_gpio : 1;
 	bool			ccs_unsupported : 1;
 	bool			clk_ctrl2_present : 1;
-	unsigned int		cd_gpio;
 	u8			sup_pclk;	/* 1 :SH7757, 0: SH7724/SH7372 */
 	unsigned long		caps;
 	u32			ocr;
-- 
cgit v1.2.3


From 5957eeba530747e9d77daf2f300a186758be51d9 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 30 Dec 2016 13:47:17 +0100
Subject: mmc: sh_mmcif: Remove unused ->get_cd() platform callback

Removing the callback also enables us to remove the sh_mmcif_get_cd()
altogether, as we convert to use mmc_gpio_get_cd() to the same kind of
work.

Cc: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/host/sh_mmcif.c  | 18 +-----------------
 include/linux/mmc/sh_mmcif.h |  1 -
 2 files changed, 1 insertion(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/host/sh_mmcif.c b/drivers/mmc/host/sh_mmcif.c
index 5f970550d44e..7ba92a425ee0 100644
--- a/drivers/mmc/host/sh_mmcif.c
+++ b/drivers/mmc/host/sh_mmcif.c
@@ -1079,26 +1079,10 @@ static void sh_mmcif_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
 	host->state = STATE_IDLE;
 }
 
-static int sh_mmcif_get_cd(struct mmc_host *mmc)
-{
-	struct sh_mmcif_host *host = mmc_priv(mmc);
-	struct device *dev = sh_mmcif_host_to_dev(host);
-	struct sh_mmcif_plat_data *p = dev->platform_data;
-	int ret = mmc_gpio_get_cd(mmc);
-
-	if (ret >= 0)
-		return ret;
-
-	if (!p || !p->get_cd)
-		return -ENOSYS;
-	else
-		return p->get_cd(host->pd);
-}
-
 static struct mmc_host_ops sh_mmcif_ops = {
 	.request	= sh_mmcif_request,
 	.set_ios	= sh_mmcif_set_ios,
-	.get_cd		= sh_mmcif_get_cd,
+	.get_cd		= mmc_gpio_get_cd,
 };
 
 static bool sh_mmcif_end_cmd(struct sh_mmcif_host *host)
diff --git a/include/linux/mmc/sh_mmcif.h b/include/linux/mmc/sh_mmcif.h
index 5ce5a2c1a1f5..7cafc95a2b4e 100644
--- a/include/linux/mmc/sh_mmcif.h
+++ b/include/linux/mmc/sh_mmcif.h
@@ -32,7 +32,6 @@
  */
 
 struct sh_mmcif_plat_data {
-	int (*get_cd)(struct platform_device *pdef);
 	unsigned int		slave_id_tx;	/* embedded slave_id_[tr]x */
 	unsigned int		slave_id_rx;
 	bool			ccs_unsupported : 1;
-- 
cgit v1.2.3


From 8020f71117042ed82287e4f51c48b57ce4c783df Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 30 Dec 2016 13:47:18 +0100
Subject: mmc: sh_mmcif: Remove unused ccs_unsupported from the platform data

There are currently no users of the ccs_unsupported member from the
platform data, so let's remove it.

Note, as some of the sh_mmcif variants may not support ccs, let's keep the
current code in the driver, which deals with this. For future support, we
should invent a DT binding instead, but let's leave that until it's needed.

Cc: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/mmc/host/sh_mmcif.c  | 2 +-
 include/linux/mmc/sh_mmcif.h | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/host/sh_mmcif.c b/drivers/mmc/host/sh_mmcif.c
index 7ba92a425ee0..9fa8ce3fbb1b 100644
--- a/drivers/mmc/host/sh_mmcif.c
+++ b/drivers/mmc/host/sh_mmcif.c
@@ -1427,7 +1427,7 @@ static int sh_mmcif_probe(struct platform_device *pdev)
 	host->mmc	= mmc;
 	host->addr	= reg;
 	host->timeout	= msecs_to_jiffies(10000);
-	host->ccs_enable = !pd || !pd->ccs_unsupported;
+	host->ccs_enable = true;
 	host->clk_ctrl2_enable = pd && pd->clk_ctrl2_present;
 
 	host->pd = pdev;
diff --git a/include/linux/mmc/sh_mmcif.h b/include/linux/mmc/sh_mmcif.h
index 7cafc95a2b4e..384b86b90473 100644
--- a/include/linux/mmc/sh_mmcif.h
+++ b/include/linux/mmc/sh_mmcif.h
@@ -34,7 +34,6 @@
 struct sh_mmcif_plat_data {
 	unsigned int		slave_id_tx;	/* embedded slave_id_[tr]x */
 	unsigned int		slave_id_rx;
-	bool			ccs_unsupported : 1;
 	bool			clk_ctrl2_present : 1;
 	u8			sup_pclk;	/* 1 :SH7757, 0: SH7724/SH7372 */
 	unsigned long		caps;
-- 
cgit v1.2.3


From dba4bb484e9e495478f2bcf474393d33f7e0ec27 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 30 Dec 2016 13:47:19 +0100
Subject: mmc: sh_mmcif: Remove unused clk_ctrl2_present from the platform data

There are currently no users of the clk_ctrl2_present member from the
platform data, so let's remove it.

Note, as some of the sh_mmcif variants may support clk_ctrl2, let's keep
the current code in the driver, which deals with this. For future support,
we should invent a DT binding instead, but let's leave that until it's
needed.

Cc: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/mmc/host/sh_mmcif.c  | 2 +-
 include/linux/mmc/sh_mmcif.h | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/host/sh_mmcif.c b/drivers/mmc/host/sh_mmcif.c
index 9fa8ce3fbb1b..4062d6bef3c8 100644
--- a/drivers/mmc/host/sh_mmcif.c
+++ b/drivers/mmc/host/sh_mmcif.c
@@ -1428,7 +1428,7 @@ static int sh_mmcif_probe(struct platform_device *pdev)
 	host->addr	= reg;
 	host->timeout	= msecs_to_jiffies(10000);
 	host->ccs_enable = true;
-	host->clk_ctrl2_enable = pd && pd->clk_ctrl2_present;
+	host->clk_ctrl2_enable = false;
 
 	host->pd = pdev;
 
diff --git a/include/linux/mmc/sh_mmcif.h b/include/linux/mmc/sh_mmcif.h
index 384b86b90473..a7baa29484c3 100644
--- a/include/linux/mmc/sh_mmcif.h
+++ b/include/linux/mmc/sh_mmcif.h
@@ -34,7 +34,6 @@
 struct sh_mmcif_plat_data {
 	unsigned int		slave_id_tx;	/* embedded slave_id_[tr]x */
 	unsigned int		slave_id_rx;
-	bool			clk_ctrl2_present : 1;
 	u8			sup_pclk;	/* 1 :SH7757, 0: SH7724/SH7372 */
 	unsigned long		caps;
 	u32			ocr;
-- 
cgit v1.2.3


From 0f21c58c63add705065a75495e7a1860a62470ed Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 30 Dec 2016 13:47:20 +0100
Subject: mmc: dw_mmc: Remove the public dw_mmc header file

There are currently no external users of the public dw_mmc header file,
except the dw_mmc driver itself. Therefore let's move the definitions from
the public dw_mmc header file into the existing private dw_mmc header file
and then remove the public one.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
---
 MAINTAINERS                        |   1 -
 drivers/mmc/host/dw_mmc-exynos.c   |   1 -
 drivers/mmc/host/dw_mmc-k3.c       |   1 -
 drivers/mmc/host/dw_mmc-pci.c      |   1 -
 drivers/mmc/host/dw_mmc-pltfm.c    |   1 -
 drivers/mmc/host/dw_mmc-rockchip.c |   1 -
 drivers/mmc/host/dw_mmc.c          |   1 -
 drivers/mmc/host/dw_mmc.h          | 257 ++++++++++++++++++++++++++++++++++
 include/linux/mmc/dw_mmc.h         | 274 -------------------------------------
 9 files changed, 257 insertions(+), 281 deletions(-)
 delete mode 100644 include/linux/mmc/dw_mmc.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 107c10e8f2d2..3cee6d55f91f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10887,7 +10887,6 @@ SYNOPSYS DESIGNWARE MMC/SD/SDIO DRIVER
 M:	Jaehoon Chung <jh80.chung@samsung.com>
 L:	linux-mmc@vger.kernel.org
 S:	Maintained
-F:	include/linux/mmc/dw_mmc.h
 F:	drivers/mmc/host/dw_mmc*
 
 SYSTEM TRACE MODULE CLASS
diff --git a/drivers/mmc/host/dw_mmc-exynos.c b/drivers/mmc/host/dw_mmc-exynos.c
index e1335289316c..25691cca1881 100644
--- a/drivers/mmc/host/dw_mmc-exynos.c
+++ b/drivers/mmc/host/dw_mmc-exynos.c
@@ -13,7 +13,6 @@
 #include <linux/platform_device.h>
 #include <linux/clk.h>
 #include <linux/mmc/host.h>
-#include <linux/mmc/dw_mmc.h>
 #include <linux/mmc/mmc.h>
 #include <linux/of.h>
 #include <linux/of_gpio.h>
diff --git a/drivers/mmc/host/dw_mmc-k3.c b/drivers/mmc/host/dw_mmc-k3.c
index 9821e6bd5d5e..e38fb0020bb1 100644
--- a/drivers/mmc/host/dw_mmc-k3.c
+++ b/drivers/mmc/host/dw_mmc-k3.c
@@ -11,7 +11,6 @@
 #include <linux/clk.h>
 #include <linux/mfd/syscon.h>
 #include <linux/mmc/host.h>
-#include <linux/mmc/dw_mmc.h>
 #include <linux/module.h>
 #include <linux/of_address.h>
 #include <linux/platform_device.h>
diff --git a/drivers/mmc/host/dw_mmc-pci.c b/drivers/mmc/host/dw_mmc-pci.c
index ab82796b01e2..ab8713297edb 100644
--- a/drivers/mmc/host/dw_mmc-pci.c
+++ b/drivers/mmc/host/dw_mmc-pci.c
@@ -18,7 +18,6 @@
 #include <linux/slab.h>
 #include <linux/mmc/host.h>
 #include <linux/mmc/mmc.h>
-#include <linux/mmc/dw_mmc.h>
 #include "dw_mmc.h"
 
 #define PCI_BAR_NO 2
diff --git a/drivers/mmc/host/dw_mmc-pltfm.c b/drivers/mmc/host/dw_mmc-pltfm.c
index 1236d49ba36e..58c13e21bd5a 100644
--- a/drivers/mmc/host/dw_mmc-pltfm.c
+++ b/drivers/mmc/host/dw_mmc-pltfm.c
@@ -20,7 +20,6 @@
 #include <linux/slab.h>
 #include <linux/mmc/host.h>
 #include <linux/mmc/mmc.h>
-#include <linux/mmc/dw_mmc.h>
 #include <linux/of.h>
 #include <linux/clk.h>
 
diff --git a/drivers/mmc/host/dw_mmc-rockchip.c b/drivers/mmc/host/dw_mmc-rockchip.c
index 9a46e4694227..372fb6e948c1 100644
--- a/drivers/mmc/host/dw_mmc-rockchip.c
+++ b/drivers/mmc/host/dw_mmc-rockchip.c
@@ -11,7 +11,6 @@
 #include <linux/platform_device.h>
 #include <linux/clk.h>
 #include <linux/mmc/host.h>
-#include <linux/mmc/dw_mmc.h>
 #include <linux/of_address.h>
 #include <linux/mmc/slot-gpio.h>
 #include <linux/pm_runtime.h>
diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c
index 73db08558e4d..2d8932a20161 100644
--- a/drivers/mmc/host/dw_mmc.c
+++ b/drivers/mmc/host/dw_mmc.c
@@ -32,7 +32,6 @@
 #include <linux/mmc/mmc.h>
 #include <linux/mmc/sd.h>
 #include <linux/mmc/sdio.h>
-#include <linux/mmc/dw_mmc.h>
 #include <linux/bitops.h>
 #include <linux/regulator/consumer.h>
 #include <linux/of.h>
diff --git a/drivers/mmc/host/dw_mmc.h b/drivers/mmc/host/dw_mmc.h
index c59465829387..960146106596 100644
--- a/drivers/mmc/host/dw_mmc.h
+++ b/drivers/mmc/host/dw_mmc.h
@@ -14,6 +14,263 @@
 #ifndef _DW_MMC_H_
 #define _DW_MMC_H_
 
+#include <linux/scatterlist.h>
+#include <linux/mmc/core.h>
+#include <linux/dmaengine.h>
+#include <linux/reset.h>
+
+#define MAX_MCI_SLOTS	2
+
+enum dw_mci_state {
+	STATE_IDLE = 0,
+	STATE_SENDING_CMD,
+	STATE_SENDING_DATA,
+	STATE_DATA_BUSY,
+	STATE_SENDING_STOP,
+	STATE_DATA_ERROR,
+	STATE_SENDING_CMD11,
+	STATE_WAITING_CMD11_DONE,
+};
+
+enum {
+	EVENT_CMD_COMPLETE = 0,
+	EVENT_XFER_COMPLETE,
+	EVENT_DATA_COMPLETE,
+	EVENT_DATA_ERROR,
+};
+
+enum dw_mci_cookie {
+	COOKIE_UNMAPPED,
+	COOKIE_PRE_MAPPED,	/* mapped by pre_req() of dwmmc */
+	COOKIE_MAPPED,		/* mapped by prepare_data() of dwmmc */
+};
+
+struct mmc_data;
+
+enum {
+	TRANS_MODE_PIO = 0,
+	TRANS_MODE_IDMAC,
+	TRANS_MODE_EDMAC
+};
+
+struct dw_mci_dma_slave {
+	struct dma_chan *ch;
+	enum dma_transfer_direction direction;
+};
+
+/**
+ * struct dw_mci - MMC controller state shared between all slots
+ * @lock: Spinlock protecting the queue and associated data.
+ * @irq_lock: Spinlock protecting the INTMASK setting.
+ * @regs: Pointer to MMIO registers.
+ * @fifo_reg: Pointer to MMIO registers for data FIFO
+ * @sg: Scatterlist entry currently being processed by PIO code, if any.
+ * @sg_miter: PIO mapping scatterlist iterator.
+ * @cur_slot: The slot which is currently using the controller.
+ * @mrq: The request currently being processed on @cur_slot,
+ *	or NULL if the controller is idle.
+ * @cmd: The command currently being sent to the card, or NULL.
+ * @data: The data currently being transferred, or NULL if no data
+ *	transfer is in progress.
+ * @stop_abort: The command currently prepared for stoping transfer.
+ * @prev_blksz: The former transfer blksz record.
+ * @timing: Record of current ios timing.
+ * @use_dma: Whether DMA channel is initialized or not.
+ * @using_dma: Whether DMA is in use for the current transfer.
+ * @dma_64bit_address: Whether DMA supports 64-bit address mode or not.
+ * @sg_dma: Bus address of DMA buffer.
+ * @sg_cpu: Virtual address of DMA buffer.
+ * @dma_ops: Pointer to platform-specific DMA callbacks.
+ * @cmd_status: Snapshot of SR taken upon completion of the current
+ * @ring_size: Buffer size for idma descriptors.
+ *	command. Only valid when EVENT_CMD_COMPLETE is pending.
+ * @dms: structure of slave-dma private data.
+ * @phy_regs: physical address of controller's register map
+ * @data_status: Snapshot of SR taken upon completion of the current
+ *	data transfer. Only valid when EVENT_DATA_COMPLETE or
+ *	EVENT_DATA_ERROR is pending.
+ * @stop_cmdr: Value to be loaded into CMDR when the stop command is
+ *	to be sent.
+ * @dir_status: Direction of current transfer.
+ * @tasklet: Tasklet running the request state machine.
+ * @pending_events: Bitmask of events flagged by the interrupt handler
+ *	to be processed by the tasklet.
+ * @completed_events: Bitmask of events which the state machine has
+ *	processed.
+ * @state: Tasklet state.
+ * @queue: List of slots waiting for access to the controller.
+ * @bus_hz: The rate of @mck in Hz. This forms the basis for MMC bus
+ *	rate and timeout calculations.
+ * @current_speed: Configured rate of the controller.
+ * @num_slots: Number of slots available.
+ * @fifoth_val: The value of FIFOTH register.
+ * @verid: Denote Version ID.
+ * @dev: Device associated with the MMC controller.
+ * @pdata: Platform data associated with the MMC controller.
+ * @drv_data: Driver specific data for identified variant of the controller
+ * @priv: Implementation defined private data.
+ * @biu_clk: Pointer to bus interface unit clock instance.
+ * @ciu_clk: Pointer to card interface unit clock instance.
+ * @slot: Slots sharing this MMC controller.
+ * @fifo_depth: depth of FIFO.
+ * @data_shift: log2 of FIFO item size.
+ * @part_buf_start: Start index in part_buf.
+ * @part_buf_count: Bytes of partial data in part_buf.
+ * @part_buf: Simple buffer for partial fifo reads/writes.
+ * @push_data: Pointer to FIFO push function.
+ * @pull_data: Pointer to FIFO pull function.
+ * @vqmmc_enabled: Status of vqmmc, should be true or false.
+ * @irq_flags: The flags to be passed to request_irq.
+ * @irq: The irq value to be passed to request_irq.
+ * @sdio_id0: Number of slot0 in the SDIO interrupt registers.
+ * @cmd11_timer: Timer for SD3.0 voltage switch over scheme.
+ * @dto_timer: Timer for broken data transfer over scheme.
+ *
+ * Locking
+ * =======
+ *
+ * @lock is a softirq-safe spinlock protecting @queue as well as
+ * @cur_slot, @mrq and @state. These must always be updated
+ * at the same time while holding @lock.
+ *
+ * @irq_lock is an irq-safe spinlock protecting the INTMASK register
+ * to allow the interrupt handler to modify it directly.  Held for only long
+ * enough to read-modify-write INTMASK and no other locks are grabbed when
+ * holding this one.
+ *
+ * The @mrq field of struct dw_mci_slot is also protected by @lock,
+ * and must always be written at the same time as the slot is added to
+ * @queue.
+ *
+ * @pending_events and @completed_events are accessed using atomic bit
+ * operations, so they don't need any locking.
+ *
+ * None of the fields touched by the interrupt handler need any
+ * locking. However, ordering is important: Before EVENT_DATA_ERROR or
+ * EVENT_DATA_COMPLETE is set in @pending_events, all data-related
+ * interrupts must be disabled and @data_status updated with a
+ * snapshot of SR. Similarly, before EVENT_CMD_COMPLETE is set, the
+ * CMDRDY interrupt must be disabled and @cmd_status updated with a
+ * snapshot of SR, and before EVENT_XFER_COMPLETE can be set, the
+ * bytes_xfered field of @data must be written. This is ensured by
+ * using barriers.
+ */
+struct dw_mci {
+	spinlock_t		lock;
+	spinlock_t		irq_lock;
+	void __iomem		*regs;
+	void __iomem		*fifo_reg;
+
+	struct scatterlist	*sg;
+	struct sg_mapping_iter	sg_miter;
+
+	struct dw_mci_slot	*cur_slot;
+	struct mmc_request	*mrq;
+	struct mmc_command	*cmd;
+	struct mmc_data		*data;
+	struct mmc_command	stop_abort;
+	unsigned int		prev_blksz;
+	unsigned char		timing;
+
+	/* DMA interface members*/
+	int			use_dma;
+	int			using_dma;
+	int			dma_64bit_address;
+
+	dma_addr_t		sg_dma;
+	void			*sg_cpu;
+	const struct dw_mci_dma_ops	*dma_ops;
+	/* For idmac */
+	unsigned int		ring_size;
+
+	/* For edmac */
+	struct dw_mci_dma_slave *dms;
+	/* Registers's physical base address */
+	resource_size_t		phy_regs;
+
+	u32			cmd_status;
+	u32			data_status;
+	u32			stop_cmdr;
+	u32			dir_status;
+	struct tasklet_struct	tasklet;
+	unsigned long		pending_events;
+	unsigned long		completed_events;
+	enum dw_mci_state	state;
+	struct list_head	queue;
+
+	u32			bus_hz;
+	u32			current_speed;
+	u32			num_slots;
+	u32			fifoth_val;
+	u16			verid;
+	struct device		*dev;
+	struct dw_mci_board	*pdata;
+	const struct dw_mci_drv_data	*drv_data;
+	void			*priv;
+	struct clk		*biu_clk;
+	struct clk		*ciu_clk;
+	struct dw_mci_slot	*slot[MAX_MCI_SLOTS];
+
+	/* FIFO push and pull */
+	int			fifo_depth;
+	int			data_shift;
+	u8			part_buf_start;
+	u8			part_buf_count;
+	union {
+		u16		part_buf16;
+		u32		part_buf32;
+		u64		part_buf;
+	};
+	void (*push_data)(struct dw_mci *host, void *buf, int cnt);
+	void (*pull_data)(struct dw_mci *host, void *buf, int cnt);
+
+	bool			vqmmc_enabled;
+	unsigned long		irq_flags; /* IRQ flags */
+	int			irq;
+
+	int			sdio_id0;
+
+	struct timer_list       cmd11_timer;
+	struct timer_list       dto_timer;
+};
+
+/* DMA ops for Internal/External DMAC interface */
+struct dw_mci_dma_ops {
+	/* DMA Ops */
+	int (*init)(struct dw_mci *host);
+	int (*start)(struct dw_mci *host, unsigned int sg_len);
+	void (*complete)(void *host);
+	void (*stop)(struct dw_mci *host);
+	void (*cleanup)(struct dw_mci *host);
+	void (*exit)(struct dw_mci *host);
+};
+
+struct dma_pdata;
+
+/* Board platform data */
+struct dw_mci_board {
+	u32 num_slots;
+
+	unsigned int bus_hz; /* Clock speed at the cclk_in pad */
+
+	u32 caps;	/* Capabilities */
+	u32 caps2;	/* More capabilities */
+	u32 pm_caps;	/* PM capabilities */
+	/*
+	 * Override fifo depth. If 0, autodetect it from the FIFOTH register,
+	 * but note that this may not be reliable after a bootloader has used
+	 * it.
+	 */
+	unsigned int fifo_depth;
+
+	/* delay in mS before detecting cards after interrupt */
+	u32 detect_delay_ms;
+
+	struct reset_control *rstc;
+	struct dw_mci_dma_ops *dma_ops;
+	struct dma_pdata *data;
+};
+
 #define DW_MMC_240A		0x240a
 #define DW_MMC_280A		0x280a
 
diff --git a/include/linux/mmc/dw_mmc.h b/include/linux/mmc/dw_mmc.h
deleted file mode 100644
index 15db6f83f53f..000000000000
--- a/include/linux/mmc/dw_mmc.h
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * Synopsys DesignWare Multimedia Card Interface driver
- *  (Based on NXP driver for lpc 31xx)
- *
- * Copyright (C) 2009 NXP Semiconductors
- * Copyright (C) 2009, 2010 Imagination Technologies Ltd.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#ifndef LINUX_MMC_DW_MMC_H
-#define LINUX_MMC_DW_MMC_H
-
-#include <linux/scatterlist.h>
-#include <linux/mmc/core.h>
-#include <linux/dmaengine.h>
-#include <linux/reset.h>
-
-#define MAX_MCI_SLOTS	2
-
-enum dw_mci_state {
-	STATE_IDLE = 0,
-	STATE_SENDING_CMD,
-	STATE_SENDING_DATA,
-	STATE_DATA_BUSY,
-	STATE_SENDING_STOP,
-	STATE_DATA_ERROR,
-	STATE_SENDING_CMD11,
-	STATE_WAITING_CMD11_DONE,
-};
-
-enum {
-	EVENT_CMD_COMPLETE = 0,
-	EVENT_XFER_COMPLETE,
-	EVENT_DATA_COMPLETE,
-	EVENT_DATA_ERROR,
-};
-
-enum dw_mci_cookie {
-	COOKIE_UNMAPPED,
-	COOKIE_PRE_MAPPED,	/* mapped by pre_req() of dwmmc */
-	COOKIE_MAPPED,		/* mapped by prepare_data() of dwmmc */
-};
-
-struct mmc_data;
-
-enum {
-	TRANS_MODE_PIO = 0,
-	TRANS_MODE_IDMAC,
-	TRANS_MODE_EDMAC
-};
-
-struct dw_mci_dma_slave {
-	struct dma_chan *ch;
-	enum dma_transfer_direction direction;
-};
-
-/**
- * struct dw_mci - MMC controller state shared between all slots
- * @lock: Spinlock protecting the queue and associated data.
- * @irq_lock: Spinlock protecting the INTMASK setting.
- * @regs: Pointer to MMIO registers.
- * @fifo_reg: Pointer to MMIO registers for data FIFO
- * @sg: Scatterlist entry currently being processed by PIO code, if any.
- * @sg_miter: PIO mapping scatterlist iterator.
- * @cur_slot: The slot which is currently using the controller.
- * @mrq: The request currently being processed on @cur_slot,
- *	or NULL if the controller is idle.
- * @cmd: The command currently being sent to the card, or NULL.
- * @data: The data currently being transferred, or NULL if no data
- *	transfer is in progress.
- * @stop_abort: The command currently prepared for stoping transfer.
- * @prev_blksz: The former transfer blksz record.
- * @timing: Record of current ios timing.
- * @use_dma: Whether DMA channel is initialized or not.
- * @using_dma: Whether DMA is in use for the current transfer.
- * @dma_64bit_address: Whether DMA supports 64-bit address mode or not.
- * @sg_dma: Bus address of DMA buffer.
- * @sg_cpu: Virtual address of DMA buffer.
- * @dma_ops: Pointer to platform-specific DMA callbacks.
- * @cmd_status: Snapshot of SR taken upon completion of the current
- * @ring_size: Buffer size for idma descriptors.
- *	command. Only valid when EVENT_CMD_COMPLETE is pending.
- * @dms: structure of slave-dma private data.
- * @phy_regs: physical address of controller's register map
- * @data_status: Snapshot of SR taken upon completion of the current
- *	data transfer. Only valid when EVENT_DATA_COMPLETE or
- *	EVENT_DATA_ERROR is pending.
- * @stop_cmdr: Value to be loaded into CMDR when the stop command is
- *	to be sent.
- * @dir_status: Direction of current transfer.
- * @tasklet: Tasklet running the request state machine.
- * @pending_events: Bitmask of events flagged by the interrupt handler
- *	to be processed by the tasklet.
- * @completed_events: Bitmask of events which the state machine has
- *	processed.
- * @state: Tasklet state.
- * @queue: List of slots waiting for access to the controller.
- * @bus_hz: The rate of @mck in Hz. This forms the basis for MMC bus
- *	rate and timeout calculations.
- * @current_speed: Configured rate of the controller.
- * @num_slots: Number of slots available.
- * @fifoth_val: The value of FIFOTH register.
- * @verid: Denote Version ID.
- * @dev: Device associated with the MMC controller.
- * @pdata: Platform data associated with the MMC controller.
- * @drv_data: Driver specific data for identified variant of the controller
- * @priv: Implementation defined private data.
- * @biu_clk: Pointer to bus interface unit clock instance.
- * @ciu_clk: Pointer to card interface unit clock instance.
- * @slot: Slots sharing this MMC controller.
- * @fifo_depth: depth of FIFO.
- * @data_shift: log2 of FIFO item size.
- * @part_buf_start: Start index in part_buf.
- * @part_buf_count: Bytes of partial data in part_buf.
- * @part_buf: Simple buffer for partial fifo reads/writes.
- * @push_data: Pointer to FIFO push function.
- * @pull_data: Pointer to FIFO pull function.
- * @vqmmc_enabled: Status of vqmmc, should be true or false.
- * @irq_flags: The flags to be passed to request_irq.
- * @irq: The irq value to be passed to request_irq.
- * @sdio_id0: Number of slot0 in the SDIO interrupt registers.
- * @cmd11_timer: Timer for SD3.0 voltage switch over scheme.
- * @dto_timer: Timer for broken data transfer over scheme.
- *
- * Locking
- * =======
- *
- * @lock is a softirq-safe spinlock protecting @queue as well as
- * @cur_slot, @mrq and @state. These must always be updated
- * at the same time while holding @lock.
- *
- * @irq_lock is an irq-safe spinlock protecting the INTMASK register
- * to allow the interrupt handler to modify it directly.  Held for only long
- * enough to read-modify-write INTMASK and no other locks are grabbed when
- * holding this one.
- *
- * The @mrq field of struct dw_mci_slot is also protected by @lock,
- * and must always be written at the same time as the slot is added to
- * @queue.
- *
- * @pending_events and @completed_events are accessed using atomic bit
- * operations, so they don't need any locking.
- *
- * None of the fields touched by the interrupt handler need any
- * locking. However, ordering is important: Before EVENT_DATA_ERROR or
- * EVENT_DATA_COMPLETE is set in @pending_events, all data-related
- * interrupts must be disabled and @data_status updated with a
- * snapshot of SR. Similarly, before EVENT_CMD_COMPLETE is set, the
- * CMDRDY interrupt must be disabled and @cmd_status updated with a
- * snapshot of SR, and before EVENT_XFER_COMPLETE can be set, the
- * bytes_xfered field of @data must be written. This is ensured by
- * using barriers.
- */
-struct dw_mci {
-	spinlock_t		lock;
-	spinlock_t		irq_lock;
-	void __iomem		*regs;
-	void __iomem		*fifo_reg;
-
-	struct scatterlist	*sg;
-	struct sg_mapping_iter	sg_miter;
-
-	struct dw_mci_slot	*cur_slot;
-	struct mmc_request	*mrq;
-	struct mmc_command	*cmd;
-	struct mmc_data		*data;
-	struct mmc_command	stop_abort;
-	unsigned int		prev_blksz;
-	unsigned char		timing;
-
-	/* DMA interface members*/
-	int			use_dma;
-	int			using_dma;
-	int			dma_64bit_address;
-
-	dma_addr_t		sg_dma;
-	void			*sg_cpu;
-	const struct dw_mci_dma_ops	*dma_ops;
-	/* For idmac */
-	unsigned int		ring_size;
-
-	/* For edmac */
-	struct dw_mci_dma_slave *dms;
-	/* Registers's physical base address */
-	resource_size_t		phy_regs;
-
-	u32			cmd_status;
-	u32			data_status;
-	u32			stop_cmdr;
-	u32			dir_status;
-	struct tasklet_struct	tasklet;
-	unsigned long		pending_events;
-	unsigned long		completed_events;
-	enum dw_mci_state	state;
-	struct list_head	queue;
-
-	u32			bus_hz;
-	u32			current_speed;
-	u32			num_slots;
-	u32			fifoth_val;
-	u16			verid;
-	struct device		*dev;
-	struct dw_mci_board	*pdata;
-	const struct dw_mci_drv_data	*drv_data;
-	void			*priv;
-	struct clk		*biu_clk;
-	struct clk		*ciu_clk;
-	struct dw_mci_slot	*slot[MAX_MCI_SLOTS];
-
-	/* FIFO push and pull */
-	int			fifo_depth;
-	int			data_shift;
-	u8			part_buf_start;
-	u8			part_buf_count;
-	union {
-		u16		part_buf16;
-		u32		part_buf32;
-		u64		part_buf;
-	};
-	void (*push_data)(struct dw_mci *host, void *buf, int cnt);
-	void (*pull_data)(struct dw_mci *host, void *buf, int cnt);
-
-	bool			vqmmc_enabled;
-	unsigned long		irq_flags; /* IRQ flags */
-	int			irq;
-
-	int			sdio_id0;
-
-	struct timer_list       cmd11_timer;
-	struct timer_list       dto_timer;
-};
-
-/* DMA ops for Internal/External DMAC interface */
-struct dw_mci_dma_ops {
-	/* DMA Ops */
-	int (*init)(struct dw_mci *host);
-	int (*start)(struct dw_mci *host, unsigned int sg_len);
-	void (*complete)(void *host);
-	void (*stop)(struct dw_mci *host);
-	void (*cleanup)(struct dw_mci *host);
-	void (*exit)(struct dw_mci *host);
-};
-
-struct dma_pdata;
-
-/* Board platform data */
-struct dw_mci_board {
-	u32 num_slots;
-
-	unsigned int bus_hz; /* Clock speed at the cclk_in pad */
-
-	u32 caps;	/* Capabilities */
-	u32 caps2;	/* More capabilities */
-	u32 pm_caps;	/* PM capabilities */
-	/*
-	 * Override fifo depth. If 0, autodetect it from the FIFOTH register,
-	 * but note that this may not be reliable after a bootloader has used
-	 * it.
-	 */
-	unsigned int fifo_depth;
-
-	/* delay in mS before detecting cards after interrupt */
-	u32 detect_delay_ms;
-
-	struct reset_control *rstc;
-	struct dw_mci_dma_ops *dma_ops;
-	struct dma_pdata *data;
-};
-
-#endif /* LINUX_MMC_DW_MMC_H */
-- 
cgit v1.2.3


From 8336bf68b57e1e736be8d4ef1f46a789fe7b9bde Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Wed, 11 Jan 2017 12:28:15 +0100
Subject: mmc: mxcmmc: Include interrupt.h in the platform data header

The mxcmmc platform data header depends on interrupt.h. Don't rely on the
public mmc header host.h to include it, bud instead make that dependency
explicit.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/linux/platform_data/mmc-mxcmmc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/mmc-mxcmmc.h b/include/linux/platform_data/mmc-mxcmmc.h
index 29115f405af9..b0fdaa9bd185 100644
--- a/include/linux/platform_data/mmc-mxcmmc.h
+++ b/include/linux/platform_data/mmc-mxcmmc.h
@@ -1,6 +1,7 @@
 #ifndef ASMARM_ARCH_MMC_H
 #define ASMARM_ARCH_MMC_H
 
+#include <linux/interrupt.h>
 #include <linux/mmc/host.h>
 
 struct device;
-- 
cgit v1.2.3


From 8da007348bf52a91e5137d27d7dcd528edbb80ce Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 13 Jan 2017 14:14:06 +0100
Subject: mmc: core: First step in cleaning up public mmc header files

This is the first step in cleaning up the public mmc header files. In this
change we makes sure each header file builds standalone, as that helps to
resolve dependencies.

While changing this, it also seems reasonable to stop including other
headers from inside a header itself which it don't depend upon.
Additionally, in some cases such dependencies are better resolved by
forward declaring the needed struct.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Shawn Lin <shawn.lin@rock-chips.com>
---
 include/linux/mmc/card.h      | 2 --
 include/linux/mmc/core.h      | 3 +--
 include/linux/mmc/host.h      | 9 ++-------
 include/linux/mmc/mmc.h       | 2 ++
 include/linux/mmc/slot-gpio.h | 3 +++
 5 files changed, 8 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 00449e51838a..ca64f5b67983 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -11,7 +11,6 @@
 #define LINUX_MMC_CARD_H
 
 #include <linux/device.h>
-#include <linux/mmc/core.h>
 #include <linux/mod_devicetable.h>
 
 struct mmc_cid {
@@ -206,7 +205,6 @@ struct sdio_cis {
 };
 
 struct mmc_host;
-struct mmc_ios;
 struct sdio_func;
 struct sdio_func_tuple;
 
diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h
index e33cc748dcfe..64e2ddf33f38 100644
--- a/include/linux/mmc/core.h
+++ b/include/linux/mmc/core.h
@@ -8,10 +8,9 @@
 #ifndef LINUX_MMC_CORE_H
 #define LINUX_MMC_CORE_H
 
-#include <linux/interrupt.h>
 #include <linux/completion.h>
+#include <linux/types.h>
 
-struct request;
 struct mmc_data;
 struct mmc_request;
 
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 8bc884121465..8d38c76b06d6 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -10,16 +10,12 @@
 #ifndef LINUX_MMC_HOST_H
 #define LINUX_MMC_HOST_H
 
-#include <linux/leds.h>
-#include <linux/mutex.h>
-#include <linux/timer.h>
 #include <linux/sched.h>
 #include <linux/device.h>
 #include <linux/fault-inject.h>
 
 #include <linux/mmc/core.h>
 #include <linux/mmc/card.h>
-#include <linux/mmc/mmc.h>
 #include <linux/mmc/pm.h>
 
 struct mmc_ios {
@@ -82,6 +78,8 @@ struct mmc_ios {
 	bool enhanced_strobe;			/* hs400es selection */
 };
 
+struct mmc_host;
+
 struct mmc_host_ops {
 	/*
 	 * It is optional for the host to implement pre_req and post_req in
@@ -162,9 +160,6 @@ struct mmc_host_ops {
 				  unsigned int direction, int blk_size);
 };
 
-struct mmc_card;
-struct device;
-
 struct mmc_async_req {
 	/* active mmc request */
 	struct mmc_request	*mrq;
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index a0740829ff00..261772e3effe 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -24,6 +24,8 @@
 #ifndef LINUX_MMC_MMC_H
 #define LINUX_MMC_MMC_H
 
+#include <linux/types.h>
+
 /* Standard MMC commands (4.1)           type  argument     response */
    /* class 1 */
 #define MMC_GO_IDLE_STATE         0   /* bc                          */
diff --git a/include/linux/mmc/slot-gpio.h b/include/linux/mmc/slot-gpio.h
index a7972cd3bc14..82f0d289f110 100644
--- a/include/linux/mmc/slot-gpio.h
+++ b/include/linux/mmc/slot-gpio.h
@@ -11,6 +11,9 @@
 #ifndef MMC_SLOT_GPIO_H
 #define MMC_SLOT_GPIO_H
 
+#include <linux/types.h>
+#include <linux/irqreturn.h>
+
 struct mmc_host;
 
 int mmc_gpio_get_ro(struct mmc_host *host);
-- 
cgit v1.2.3


From 55244c5659b5e73a969b285a2e763223d8aab979 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 13 Jan 2017 14:14:08 +0100
Subject: mmc: core: Move public functions from core.h to private headers

A significant amount of functions are available through the public mmc
core.h header file. Let's slim down this public mmc interface, as to
prevent users from abusing it, by moving some of the functions to private
mmc header files.

This change concentrates on moving the functions into private mmc headers,
following changes may continue with additional clean-ups, as an example
some functions can be turned into static.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Shawn Lin <shawn.lin@rock-chips.com>
---
 drivers/mmc/core/block.c    |  3 +++
 drivers/mmc/core/core.h     | 37 ++++++++++++++++++++++++++++++++-
 drivers/mmc/core/mmc_ops.h  |  9 ++++++++
 drivers/mmc/core/mmc_test.c |  2 ++
 drivers/mmc/core/queue.c    |  1 +
 drivers/mmc/core/sd_ops.h   |  4 ++++
 drivers/mmc/core/sdio_io.c  |  1 +
 drivers/mmc/core/sdio_irq.c |  1 +
 drivers/mmc/core/sdio_ops.h |  1 +
 include/linux/mmc/core.h    | 50 ---------------------------------------------
 10 files changed, 58 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index 6adc3ce28447..41b5dbcd5b5a 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -47,6 +47,9 @@
 
 #include "queue.h"
 #include "block.h"
+#include "core.h"
+#include "mmc_ops.h"
+#include "sd_ops.h"
 
 MODULE_ALIAS("mmc:block");
 #ifdef MODULE_PARAM_PREFIX
diff --git a/drivers/mmc/core/core.h b/drivers/mmc/core/core.h
index 29b91aea7878..8a95c82554be 100644
--- a/drivers/mmc/core/core.h
+++ b/drivers/mmc/core/core.h
@@ -16,6 +16,7 @@
 
 struct mmc_host;
 struct mmc_card;
+struct mmc_request;
 
 #define MMC_CMD_RETRIES        3
 
@@ -73,6 +74,7 @@ void mmc_start_host(struct mmc_host *host);
 void mmc_stop_host(struct mmc_host *host);
 
 int _mmc_detect_card_removed(struct mmc_host *host);
+int mmc_detect_card_removed(struct mmc_host *host);
 
 int mmc_attach_mmc(struct mmc_host *host);
 int mmc_attach_sd(struct mmc_host *host);
@@ -102,5 +104,38 @@ static inline void mmc_register_pm_notifier(struct mmc_host *host) { }
 static inline void mmc_unregister_pm_notifier(struct mmc_host *host) { }
 #endif
 
-#endif
+void mmc_wait_for_req_done(struct mmc_host *host, struct mmc_request *mrq);
+bool mmc_is_req_done(struct mmc_host *host, struct mmc_request *mrq);
+
+int mmc_erase(struct mmc_card *card, unsigned int from, unsigned int nr,
+		unsigned int arg);
+int mmc_can_erase(struct mmc_card *card);
+int mmc_can_trim(struct mmc_card *card);
+int mmc_can_discard(struct mmc_card *card);
+int mmc_can_sanitize(struct mmc_card *card);
+int mmc_can_secure_erase_trim(struct mmc_card *card);
+int mmc_erase_group_aligned(struct mmc_card *card, unsigned int from,
+			unsigned int nr);
+unsigned int mmc_calc_max_discard(struct mmc_card *card);
+
+int mmc_set_blocklen(struct mmc_card *card, unsigned int blocklen);
+int mmc_set_blockcount(struct mmc_card *card, unsigned int blockcount,
+			bool is_rel_write);
+
+int __mmc_claim_host(struct mmc_host *host, atomic_t *abort);
+void mmc_release_host(struct mmc_host *host);
+void mmc_get_card(struct mmc_card *card);
+void mmc_put_card(struct mmc_card *card);
+
+/**
+ *	mmc_claim_host - exclusively claim a host
+ *	@host: mmc host to claim
+ *
+ *	Claim a host for a set of operations.
+ */
+static inline void mmc_claim_host(struct mmc_host *host)
+{
+	__mmc_claim_host(host, NULL);
+}
 
+#endif
diff --git a/drivers/mmc/core/mmc_ops.h b/drivers/mmc/core/mmc_ops.h
index e76365aa9587..74beea8a9c7e 100644
--- a/drivers/mmc/core/mmc_ops.h
+++ b/drivers/mmc/core/mmc_ops.h
@@ -31,12 +31,21 @@ int mmc_spi_read_ocr(struct mmc_host *host, int highcap, u32 *ocrp);
 int mmc_spi_set_crc(struct mmc_host *host, int use_crc);
 int mmc_bus_test(struct mmc_card *card, u8 bus_width);
 int mmc_send_hpi_cmd(struct mmc_card *card, u32 *status);
+int mmc_interrupt_hpi(struct mmc_card *card);
 int mmc_can_ext_csd(struct mmc_card *card);
+int mmc_get_ext_csd(struct mmc_card *card, u8 **new_ext_csd);
 int mmc_switch_status(struct mmc_card *card);
 int __mmc_switch_status(struct mmc_card *card, bool crc_err_fatal);
 int __mmc_switch(struct mmc_card *card, u8 set, u8 index, u8 value,
 		unsigned int timeout_ms, unsigned char timing,
 		bool use_busy_signal, bool send_status,	bool retry_crc_err);
+int mmc_switch(struct mmc_card *card, u8 set, u8 index, u8 value,
+		unsigned int timeout_ms);
+int mmc_stop_bkops(struct mmc_card *card);
+int mmc_read_bkops_status(struct mmc_card *card);
+void mmc_start_bkops(struct mmc_card *card, bool from_exception);
+int mmc_can_reset(struct mmc_card *card);
+int mmc_flush_cache(struct mmc_card *card);
 
 #endif
 
diff --git a/drivers/mmc/core/mmc_test.c b/drivers/mmc/core/mmc_test.c
index d350bd67b4f8..8075ad133b7d 100644
--- a/drivers/mmc/core/mmc_test.c
+++ b/drivers/mmc/core/mmc_test.c
@@ -22,6 +22,8 @@
 #include <linux/seq_file.h>
 #include <linux/module.h>
 
+#include "core.h"
+
 #define RESULT_OK		0
 #define RESULT_FAIL		1
 #define RESULT_UNSUP_HOST	2
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index b1986622c60e..8f5bf5f82aa7 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -20,6 +20,7 @@
 
 #include "queue.h"
 #include "block.h"
+#include "core.h"
 
 #define MMC_QUEUE_BOUNCESZ	65536
 
diff --git a/drivers/mmc/core/sd_ops.h b/drivers/mmc/core/sd_ops.h
index ac7223cf33e3..784f8e6b6baa 100644
--- a/drivers/mmc/core/sd_ops.h
+++ b/drivers/mmc/core/sd_ops.h
@@ -16,6 +16,7 @@
 
 struct mmc_card;
 struct mmc_host;
+struct mmc_command;
 
 int mmc_app_set_bus_width(struct mmc_card *card, int width);
 int mmc_send_app_op_cond(struct mmc_host *host, u32 ocr, u32 *rocr);
@@ -25,6 +26,9 @@ int mmc_app_send_scr(struct mmc_card *card, u32 *scr);
 int mmc_sd_switch(struct mmc_card *card, int mode, int group,
 	u8 value, u8 *resp);
 int mmc_app_sd_status(struct mmc_card *card, void *ssr);
+int mmc_app_cmd(struct mmc_host *host, struct mmc_card *card);
+int mmc_wait_for_app_cmd(struct mmc_host *host, struct mmc_card *card,
+	struct mmc_command *cmd, int retries);
 
 #endif
 
diff --git a/drivers/mmc/core/sdio_io.c b/drivers/mmc/core/sdio_io.c
index 406e5f037e32..76fe6d599b77 100644
--- a/drivers/mmc/core/sdio_io.c
+++ b/drivers/mmc/core/sdio_io.c
@@ -16,6 +16,7 @@
 #include <linux/mmc/sdio_func.h>
 
 #include "sdio_ops.h"
+#include "core.h"
 
 /**
  *	sdio_claim_host - exclusively claim a bus for a certain SDIO function
diff --git a/drivers/mmc/core/sdio_irq.c b/drivers/mmc/core/sdio_irq.c
index f1faf9acc007..d0846350e2fe 100644
--- a/drivers/mmc/core/sdio_irq.c
+++ b/drivers/mmc/core/sdio_irq.c
@@ -27,6 +27,7 @@
 #include <linux/mmc/sdio_func.h>
 
 #include "sdio_ops.h"
+#include "core.h"
 
 static int process_sdio_pending_irqs(struct mmc_host *host)
 {
diff --git a/drivers/mmc/core/sdio_ops.h b/drivers/mmc/core/sdio_ops.h
index e1c36d64572c..bed8a8377fec 100644
--- a/drivers/mmc/core/sdio_ops.h
+++ b/drivers/mmc/core/sdio_ops.h
@@ -24,6 +24,7 @@ int mmc_io_rw_direct(struct mmc_card *card, int write, unsigned fn,
 int mmc_io_rw_extended(struct mmc_card *card, int write, unsigned fn,
 	unsigned addr, int incr_addr, u8 *buf, unsigned blocks, unsigned blksz);
 int sdio_reset(struct mmc_host *host);
+unsigned int mmc_align_data_size(struct mmc_card *card, unsigned int sz);
 
 static inline bool mmc_is_io_op(u32 opcode)
 {
diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h
index 64e2ddf33f38..e679a8678594 100644
--- a/include/linux/mmc/core.h
+++ b/include/linux/mmc/core.h
@@ -158,25 +158,13 @@ struct mmc_request {
 struct mmc_card;
 struct mmc_async_req;
 
-extern int mmc_stop_bkops(struct mmc_card *);
-extern int mmc_read_bkops_status(struct mmc_card *);
 extern struct mmc_async_req *mmc_start_req(struct mmc_host *,
 					   struct mmc_async_req *,
 					   enum mmc_blk_status *);
-extern int mmc_interrupt_hpi(struct mmc_card *);
 extern void mmc_wait_for_req(struct mmc_host *, struct mmc_request *);
-extern void mmc_wait_for_req_done(struct mmc_host *host,
-				  struct mmc_request *mrq);
-extern bool mmc_is_req_done(struct mmc_host *host, struct mmc_request *mrq);
 extern int mmc_wait_for_cmd(struct mmc_host *, struct mmc_command *, int);
-extern int mmc_app_cmd(struct mmc_host *, struct mmc_card *);
-extern int mmc_wait_for_app_cmd(struct mmc_host *, struct mmc_card *,
-	struct mmc_command *, int);
-extern void mmc_start_bkops(struct mmc_card *card, bool from_exception);
-extern int mmc_switch(struct mmc_card *, u8, u8, u8, unsigned int);
 extern int mmc_send_tuning(struct mmc_host *host, u32 opcode, int *cmd_error);
 extern int mmc_abort_tuning(struct mmc_host *host, u32 opcode);
-extern int mmc_get_ext_csd(struct mmc_card *card, u8 **new_ext_csd);
 
 #define MMC_ERASE_ARG		0x00000000
 #define MMC_SECURE_ERASE_ARG	0x80000000
@@ -188,46 +176,8 @@ extern int mmc_get_ext_csd(struct mmc_card *card, u8 **new_ext_csd);
 #define MMC_SECURE_ARGS		0x80000000
 #define MMC_TRIM_ARGS		0x00008001
 
-extern int mmc_erase(struct mmc_card *card, unsigned int from, unsigned int nr,
-		     unsigned int arg);
-extern int mmc_can_erase(struct mmc_card *card);
-extern int mmc_can_trim(struct mmc_card *card);
-extern int mmc_can_discard(struct mmc_card *card);
-extern int mmc_can_sanitize(struct mmc_card *card);
-extern int mmc_can_secure_erase_trim(struct mmc_card *card);
-extern int mmc_erase_group_aligned(struct mmc_card *card, unsigned int from,
-				   unsigned int nr);
-extern unsigned int mmc_calc_max_discard(struct mmc_card *card);
-
-extern int mmc_set_blocklen(struct mmc_card *card, unsigned int blocklen);
-extern int mmc_set_blockcount(struct mmc_card *card, unsigned int blockcount,
-			      bool is_rel_write);
 extern int mmc_hw_reset(struct mmc_host *host);
-extern int mmc_can_reset(struct mmc_card *card);
-
 extern void mmc_set_data_timeout(struct mmc_data *, const struct mmc_card *);
-extern unsigned int mmc_align_data_size(struct mmc_card *, unsigned int);
-
-extern int __mmc_claim_host(struct mmc_host *host, atomic_t *abort);
-extern void mmc_release_host(struct mmc_host *host);
-
-extern void mmc_get_card(struct mmc_card *card);
-extern void mmc_put_card(struct mmc_card *card);
-
-extern int mmc_flush_cache(struct mmc_card *);
-
-extern int mmc_detect_card_removed(struct mmc_host *host);
-
-/**
- *	mmc_claim_host - exclusively claim a host
- *	@host: mmc host to claim
- *
- *	Claim a host for a set of operations.
- */
-static inline void mmc_claim_host(struct mmc_host *host)
-{
-	__mmc_claim_host(host, NULL);
-}
 
 struct device_node;
 extern u32 mmc_vddrange_to_ocrmask(int vdd_min, int vdd_max);
-- 
cgit v1.2.3


From 9e1bbc72727506c173cc6ed1d704cb2d7911d9d5 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 13 Jan 2017 14:14:09 +0100
Subject: mmc: core: Move some host specific public functions to host.h

Ideally the public mmc header file, core.h, shouldn't contain interfaces
particularly intended to be used by host drivers. Instead those should
remain in the host.h header file. Therefore, let's move a couple functions
from core.h to host.h.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Shawn Lin <shawn.lin@rock-chips.com>
---
 include/linux/mmc/core.h | 6 ------
 include/linux/mmc/host.h | 6 ++++++
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h
index e679a8678594..faacc90f8711 100644
--- a/include/linux/mmc/core.h
+++ b/include/linux/mmc/core.h
@@ -163,8 +163,6 @@ extern struct mmc_async_req *mmc_start_req(struct mmc_host *,
 					   enum mmc_blk_status *);
 extern void mmc_wait_for_req(struct mmc_host *, struct mmc_request *);
 extern int mmc_wait_for_cmd(struct mmc_host *, struct mmc_command *, int);
-extern int mmc_send_tuning(struct mmc_host *host, u32 opcode, int *cmd_error);
-extern int mmc_abort_tuning(struct mmc_host *host, u32 opcode);
 
 #define MMC_ERASE_ARG		0x00000000
 #define MMC_SECURE_ERASE_ARG	0x80000000
@@ -179,8 +177,4 @@ extern int mmc_abort_tuning(struct mmc_host *host, u32 opcode);
 extern int mmc_hw_reset(struct mmc_host *host);
 extern void mmc_set_data_timeout(struct mmc_data *, const struct mmc_card *);
 
-struct device_node;
-extern u32 mmc_vddrange_to_ocrmask(int vdd_min, int vdd_max);
-extern int mmc_of_parse_voltage(struct device_node *np, u32 *mask);
-
 #endif /* LINUX_MMC_CORE_H */
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 8d38c76b06d6..7de05195bff6 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -392,11 +392,14 @@ struct mmc_host {
 	unsigned long		private[0] ____cacheline_aligned;
 };
 
+struct device_node;
+
 struct mmc_host *mmc_alloc_host(int extra, struct device *);
 int mmc_add_host(struct mmc_host *);
 void mmc_remove_host(struct mmc_host *);
 void mmc_free_host(struct mmc_host *);
 int mmc_of_parse(struct mmc_host *host);
+int mmc_of_parse_voltage(struct device_node *np, u32 *mask);
 
 static inline void *mmc_priv(struct mmc_host *host)
 {
@@ -452,6 +455,7 @@ static inline int mmc_regulator_set_vqmmc(struct mmc_host *mmc,
 }
 #endif
 
+u32 mmc_vddrange_to_ocrmask(int vdd_min, int vdd_max);
 int mmc_regulator_get_supply(struct mmc_host *mmc);
 
 static inline int mmc_card_is_removable(struct mmc_host *host)
@@ -538,6 +542,8 @@ static inline bool mmc_can_retune(struct mmc_host *host)
 	return host->can_retune == 1;
 }
 
+int mmc_send_tuning(struct mmc_host *host, u32 opcode, int *cmd_error);
+int mmc_abort_tuning(struct mmc_host *host, u32 opcode);
 void mmc_retune_pause(struct mmc_host *host);
 void mmc_retune_unpause(struct mmc_host *host);
 
-- 
cgit v1.2.3


From c0a3e080f9294117a7d510a4f01a0b7c6dbcadae Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 13 Jan 2017 14:14:10 +0100
Subject: mmc: core: Move erase/trim/discard defines from public core.h to
 mmc.h

As the public mmc.h header already contains similar defines for other mmc
commands and arguments, let's move those for erase/trim/discard into here
as well.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Shawn Lin <shawn.lin@rock-chips.com>
---
 include/linux/mmc/core.h | 10 ----------
 include/linux/mmc/mmc.h  | 13 ++++++++++++-
 2 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h
index faacc90f8711..6440e10c86cd 100644
--- a/include/linux/mmc/core.h
+++ b/include/linux/mmc/core.h
@@ -164,16 +164,6 @@ extern struct mmc_async_req *mmc_start_req(struct mmc_host *,
 extern void mmc_wait_for_req(struct mmc_host *, struct mmc_request *);
 extern int mmc_wait_for_cmd(struct mmc_host *, struct mmc_command *, int);
 
-#define MMC_ERASE_ARG		0x00000000
-#define MMC_SECURE_ERASE_ARG	0x80000000
-#define MMC_TRIM_ARG		0x00000001
-#define MMC_DISCARD_ARG		0x00000003
-#define MMC_SECURE_TRIM1_ARG	0x80000001
-#define MMC_SECURE_TRIM2_ARG	0x80008000
-
-#define MMC_SECURE_ARGS		0x80000000
-#define MMC_TRIM_ARGS		0x00008001
-
 extern int mmc_hw_reset(struct mmc_host *host);
 extern void mmc_set_data_timeout(struct mmc_data *, const struct mmc_card *);
 
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index 261772e3effe..8f7854324d2b 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -462,12 +462,23 @@ struct _mmc_csd {
 /*
  * MMC_SWITCH access modes
  */
-
 #define MMC_SWITCH_MODE_CMD_SET		0x00	/* Change the command set */
 #define MMC_SWITCH_MODE_SET_BITS	0x01	/* Set bits which are 1 in value */
 #define MMC_SWITCH_MODE_CLEAR_BITS	0x02	/* Clear bits which are 1 in value */
 #define MMC_SWITCH_MODE_WRITE_BYTE	0x03	/* Set target to value */
 
+/*
+ * Erase/trim/discard
+ */
+#define MMC_ERASE_ARG			0x00000000
+#define MMC_SECURE_ERASE_ARG		0x80000000
+#define MMC_TRIM_ARG			0x00000001
+#define MMC_DISCARD_ARG			0x00000003
+#define MMC_SECURE_TRIM1_ARG		0x80000001
+#define MMC_SECURE_TRIM2_ARG		0x80008000
+#define MMC_SECURE_ARGS			0x80000000
+#define MMC_TRIM_ARGS			0x00008001
+
 #define mmc_driver_type_mask(n)		(1 << (n))
 
 #endif /* LINUX_MMC_MMC_H */
-- 
cgit v1.2.3


From 009b0fcea5dfe77663f7ec3927c7c50a97a7675c Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 13 Jan 2017 14:14:11 +0100
Subject: mmc: core: Remove unused struct _mmc_csd from public mmc.h header

The struct _mmc_csd isn't being used and has been lurking around for a
while. Let's kill it.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Shawn Lin <shawn.lin@rock-chips.com>
---
 include/linux/mmc/mmc.h | 44 --------------------------------------------
 1 file changed, 44 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index 8f7854324d2b..7406d9badda0 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -184,50 +184,6 @@ static inline bool mmc_op_multi(u32 opcode)
 #define R2_SPI_OUT_OF_RANGE	(1 << 15)	/* or CSD overwrite */
 #define R2_SPI_CSD_OVERWRITE	R2_SPI_OUT_OF_RANGE
 
-/* These are unpacked versions of the actual responses */
-
-struct _mmc_csd {
-	u8  csd_structure;
-	u8  spec_vers;
-	u8  taac;
-	u8  nsac;
-	u8  tran_speed;
-	u16 ccc;
-	u8  read_bl_len;
-	u8  read_bl_partial;
-	u8  write_blk_misalign;
-	u8  read_blk_misalign;
-	u8  dsr_imp;
-	u16 c_size;
-	u8  vdd_r_curr_min;
-	u8  vdd_r_curr_max;
-	u8  vdd_w_curr_min;
-	u8  vdd_w_curr_max;
-	u8  c_size_mult;
-	union {
-		struct { /* MMC system specification version 3.1 */
-			u8  erase_grp_size;
-			u8  erase_grp_mult;
-		} v31;
-		struct { /* MMC system specification version 2.2 */
-			u8  sector_size;
-			u8  erase_grp_size;
-		} v22;
-	} erase;
-	u8  wp_grp_size;
-	u8  wp_grp_enable;
-	u8  default_ecc;
-	u8  r2w_factor;
-	u8  write_bl_len;
-	u8  write_bl_partial;
-	u8  file_format_grp;
-	u8  copy;
-	u8  perm_write_protect;
-	u8  tmp_write_protect;
-	u8  file_format;
-	u8  ecc;
-};
-
 /*
  * OCR bits are mostly in host.h
  */
-- 
cgit v1.2.3


From 4facdde11394d44b3869807841042d059f074a07 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 13 Jan 2017 14:14:14 +0100
Subject: mmc: core: Move public functions from card.h to private headers

A significant amount of functions and other definitions are available
through the public mmc card.h header file. Let's slim down this public mmc
interface, as to prevent users from abusing it, by moving some of the
functions/definitions to private mmc header files.

This change concentrates on moving the functions into private mmc headers,
following changes may continue with additional clean-ups.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Shawn Lin <shawn.lin@rock-chips.com>
---
 drivers/mmc/core/block.c    |   2 +
 drivers/mmc/core/bus.c      |   1 +
 drivers/mmc/core/bus.h      |  14 ++-
 drivers/mmc/core/card.h     | 223 ++++++++++++++++++++++++++++++++++++++++
 drivers/mmc/core/core.c     |   1 +
 drivers/mmc/core/debugfs.c  |   1 +
 drivers/mmc/core/mmc.c      |  13 +++
 drivers/mmc/core/mmc_test.c |   2 +
 drivers/mmc/core/queue.c    |   1 +
 drivers/mmc/core/quirks.c   |   2 +
 drivers/mmc/core/sd.c       |   1 +
 drivers/mmc/core/sdio.c     |   1 +
 drivers/mmc/core/sdio_bus.c |   1 +
 drivers/mmc/core/sdio_io.c  |   1 +
 drivers/mmc/core/sdio_irq.c |   1 +
 include/linux/mmc/card.h    | 240 --------------------------------------------
 16 files changed, 263 insertions(+), 242 deletions(-)
 create mode 100644 drivers/mmc/core/card.h

(limited to 'include/linux')

diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index 41b5dbcd5b5a..a8300beae918 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -48,6 +48,8 @@
 #include "queue.h"
 #include "block.h"
 #include "core.h"
+#include "card.h"
+#include "bus.h"
 #include "mmc_ops.h"
 #include "sd_ops.h"
 
diff --git a/drivers/mmc/core/bus.c b/drivers/mmc/core/bus.c
index c64266f5a399..3be2e6a75cd9 100644
--- a/drivers/mmc/core/bus.c
+++ b/drivers/mmc/core/bus.c
@@ -23,6 +23,7 @@
 #include <linux/mmc/host.h>
 
 #include "core.h"
+#include "card.h"
 #include "sdio_cis.h"
 #include "bus.h"
 
diff --git a/drivers/mmc/core/bus.h b/drivers/mmc/core/bus.h
index 93b516ab6e88..72b0ef03f10a 100644
--- a/drivers/mmc/core/bus.h
+++ b/drivers/mmc/core/bus.h
@@ -11,9 +11,10 @@
 #ifndef _MMC_CORE_BUS_H
 #define _MMC_CORE_BUS_H
 
+#include <linux/device.h>
+
 struct mmc_host;
 struct mmc_card;
-struct device_type;
 
 #define MMC_DEV_ATTR(name, fmt, args...)					\
 static ssize_t mmc_##name##_show (struct device *dev, struct device_attribute *attr, char *buf)	\
@@ -31,5 +32,14 @@ void mmc_remove_card(struct mmc_card *card);
 int mmc_register_bus(void);
 void mmc_unregister_bus(void);
 
-#endif
+struct mmc_driver {
+	struct device_driver drv;
+	int (*probe)(struct mmc_card *card);
+	void (*remove)(struct mmc_card *card);
+	void (*shutdown)(struct mmc_card *card);
+};
 
+int mmc_register_driver(struct mmc_driver *drv);
+void mmc_unregister_driver(struct mmc_driver *drv);
+
+#endif
diff --git a/drivers/mmc/core/card.h b/drivers/mmc/core/card.h
new file mode 100644
index 000000000000..95e8fc54eb21
--- /dev/null
+++ b/drivers/mmc/core/card.h
@@ -0,0 +1,223 @@
+/*
+ * Private header for the mmc subsystem
+ *
+ * Copyright (C) 2016 Linaro Ltd
+ *
+ * Author: Ulf Hansson <ulf.hansson@linaro.org>
+ *
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#ifndef _MMC_CORE_CARD_H
+#define _MMC_CORE_CARD_H
+
+#include <linux/mmc/card.h>
+
+#define mmc_card_name(c)	((c)->cid.prod_name)
+#define mmc_card_id(c)		(dev_name(&(c)->dev))
+#define mmc_dev_to_card(d)	container_of(d, struct mmc_card, dev)
+
+/* Card states */
+#define MMC_STATE_PRESENT	(1<<0)		/* present in sysfs */
+#define MMC_STATE_READONLY	(1<<1)		/* card is read-only */
+#define MMC_STATE_BLOCKADDR	(1<<2)		/* card uses block-addressing */
+#define MMC_CARD_SDXC		(1<<3)		/* card is SDXC */
+#define MMC_CARD_REMOVED	(1<<4)		/* card has been removed */
+#define MMC_STATE_DOING_BKOPS	(1<<5)		/* card is doing BKOPS */
+#define MMC_STATE_SUSPENDED	(1<<6)		/* card is suspended */
+
+#define mmc_card_present(c)	((c)->state & MMC_STATE_PRESENT)
+#define mmc_card_readonly(c)	((c)->state & MMC_STATE_READONLY)
+#define mmc_card_blockaddr(c)	((c)->state & MMC_STATE_BLOCKADDR)
+#define mmc_card_ext_capacity(c) ((c)->state & MMC_CARD_SDXC)
+#define mmc_card_removed(c)	((c) && ((c)->state & MMC_CARD_REMOVED))
+#define mmc_card_doing_bkops(c)	((c)->state & MMC_STATE_DOING_BKOPS)
+#define mmc_card_suspended(c)	((c)->state & MMC_STATE_SUSPENDED)
+
+#define mmc_card_set_present(c)	((c)->state |= MMC_STATE_PRESENT)
+#define mmc_card_set_readonly(c) ((c)->state |= MMC_STATE_READONLY)
+#define mmc_card_set_blockaddr(c) ((c)->state |= MMC_STATE_BLOCKADDR)
+#define mmc_card_set_ext_capacity(c) ((c)->state |= MMC_CARD_SDXC)
+#define mmc_card_set_removed(c) ((c)->state |= MMC_CARD_REMOVED)
+#define mmc_card_set_doing_bkops(c)	((c)->state |= MMC_STATE_DOING_BKOPS)
+#define mmc_card_clr_doing_bkops(c)	((c)->state &= ~MMC_STATE_DOING_BKOPS)
+#define mmc_card_set_suspended(c) ((c)->state |= MMC_STATE_SUSPENDED)
+#define mmc_card_clr_suspended(c) ((c)->state &= ~MMC_STATE_SUSPENDED)
+
+/*
+ * The world is not perfect and supplies us with broken mmc/sdio devices.
+ * For at least some of these bugs we need a work-around.
+ */
+struct mmc_fixup {
+	/* CID-specific fields. */
+	const char *name;
+
+	/* Valid revision range */
+	u64 rev_start, rev_end;
+
+	unsigned int manfid;
+	unsigned short oemid;
+
+	/* SDIO-specific fields. You can use SDIO_ANY_ID here of course */
+	u16 cis_vendor, cis_device;
+
+	/* for MMC cards */
+	unsigned int ext_csd_rev;
+
+	void (*vendor_fixup)(struct mmc_card *card, int data);
+	int data;
+};
+
+#define CID_MANFID_ANY (-1u)
+#define CID_OEMID_ANY ((unsigned short) -1)
+#define CID_NAME_ANY (NULL)
+
+#define EXT_CSD_REV_ANY (-1u)
+
+#define CID_MANFID_SANDISK      0x2
+#define CID_MANFID_TOSHIBA      0x11
+#define CID_MANFID_MICRON       0x13
+#define CID_MANFID_SAMSUNG      0x15
+#define CID_MANFID_KINGSTON     0x70
+#define CID_MANFID_HYNIX	0x90
+
+#define END_FIXUP { NULL }
+
+#define _FIXUP_EXT(_name, _manfid, _oemid, _rev_start, _rev_end,	\
+		   _cis_vendor, _cis_device,				\
+		   _fixup, _data, _ext_csd_rev)				\
+	{						\
+		.name = (_name),			\
+		.manfid = (_manfid),			\
+		.oemid = (_oemid),			\
+		.rev_start = (_rev_start),		\
+		.rev_end = (_rev_end),			\
+		.cis_vendor = (_cis_vendor),		\
+		.cis_device = (_cis_device),		\
+		.vendor_fixup = (_fixup),		\
+		.data = (_data),			\
+		.ext_csd_rev = (_ext_csd_rev),		\
+	}
+
+#define MMC_FIXUP_REV(_name, _manfid, _oemid, _rev_start, _rev_end,	\
+		      _fixup, _data, _ext_csd_rev)			\
+	_FIXUP_EXT(_name, _manfid,					\
+		   _oemid, _rev_start, _rev_end,			\
+		   SDIO_ANY_ID, SDIO_ANY_ID,				\
+		   _fixup, _data, _ext_csd_rev)				\
+
+#define MMC_FIXUP(_name, _manfid, _oemid, _fixup, _data) \
+	MMC_FIXUP_REV(_name, _manfid, _oemid, 0, -1ull, _fixup, _data,	\
+		      EXT_CSD_REV_ANY)
+
+#define MMC_FIXUP_EXT_CSD_REV(_name, _manfid, _oemid, _fixup, _data,	\
+			      _ext_csd_rev)				\
+	MMC_FIXUP_REV(_name, _manfid, _oemid, 0, -1ull, _fixup, _data,	\
+		      _ext_csd_rev)
+
+#define SDIO_FIXUP(_vendor, _device, _fixup, _data)			\
+	_FIXUP_EXT(CID_NAME_ANY, CID_MANFID_ANY,			\
+		    CID_OEMID_ANY, 0, -1ull,				\
+		   _vendor, _device,					\
+		   _fixup, _data, EXT_CSD_REV_ANY)			\
+
+#define cid_rev(hwrev, fwrev, year, month)	\
+	(((u64) hwrev) << 40 |			\
+	 ((u64) fwrev) << 32 |			\
+	 ((u64) year) << 16 |			\
+	 ((u64) month))
+
+#define cid_rev_card(card)			\
+	cid_rev(card->cid.hwrev,		\
+		    card->cid.fwrev,		\
+		    card->cid.year,		\
+		    card->cid.month)
+
+/*
+ * Unconditionally quirk add/remove.
+ */
+static inline void __maybe_unused add_quirk(struct mmc_card *card, int data)
+{
+	card->quirks |= data;
+}
+
+static inline void __maybe_unused remove_quirk(struct mmc_card *card, int data)
+{
+	card->quirks &= ~data;
+}
+
+/*
+ * Quirk add/remove for MMC products.
+ */
+static inline void __maybe_unused add_quirk_mmc(struct mmc_card *card, int data)
+{
+	if (mmc_card_mmc(card))
+		card->quirks |= data;
+}
+
+static inline void __maybe_unused remove_quirk_mmc(struct mmc_card *card,
+						   int data)
+{
+	if (mmc_card_mmc(card))
+		card->quirks &= ~data;
+}
+
+/*
+ * Quirk add/remove for SD products.
+ */
+static inline void __maybe_unused add_quirk_sd(struct mmc_card *card, int data)
+{
+	if (mmc_card_sd(card))
+		card->quirks |= data;
+}
+
+static inline void __maybe_unused remove_quirk_sd(struct mmc_card *card,
+						   int data)
+{
+	if (mmc_card_sd(card))
+		card->quirks &= ~data;
+}
+
+static inline int mmc_card_lenient_fn0(const struct mmc_card *c)
+{
+	return c->quirks & MMC_QUIRK_LENIENT_FN0;
+}
+
+static inline int mmc_blksz_for_byte_mode(const struct mmc_card *c)
+{
+	return c->quirks & MMC_QUIRK_BLKSZ_FOR_BYTE_MODE;
+}
+
+static inline int mmc_card_disable_cd(const struct mmc_card *c)
+{
+	return c->quirks & MMC_QUIRK_DISABLE_CD;
+}
+
+static inline int mmc_card_nonstd_func_interface(const struct mmc_card *c)
+{
+	return c->quirks & MMC_QUIRK_NONSTD_FUNC_IF;
+}
+
+static inline int mmc_card_broken_byte_mode_512(const struct mmc_card *c)
+{
+	return c->quirks & MMC_QUIRK_BROKEN_BYTE_MODE_512;
+}
+
+static inline int mmc_card_long_read_time(const struct mmc_card *c)
+{
+	return c->quirks & MMC_QUIRK_LONG_READ_TIME;
+}
+
+static inline int mmc_card_broken_irq_polling(const struct mmc_card *c)
+{
+	return c->quirks & MMC_QUIRK_BROKEN_IRQ_POLLING;
+}
+
+static inline int mmc_card_broken_hpi(const struct mmc_card *c)
+{
+	return c->quirks & MMC_QUIRK_BROKEN_HPI;
+}
+
+void mmc_fixup_device(struct mmc_card *card, const struct mmc_fixup *table);
+
+#endif
diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index 3b34a751eea1..8c458255e55a 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -40,6 +40,7 @@
 #include <trace/events/mmc.h>
 
 #include "core.h"
+#include "card.h"
 #include "bus.h"
 #include "host.h"
 #include "sdio_bus.h"
diff --git a/drivers/mmc/core/debugfs.c b/drivers/mmc/core/debugfs.c
index 30623b8b86a4..2843e6aef7ef 100644
--- a/drivers/mmc/core/debugfs.c
+++ b/drivers/mmc/core/debugfs.c
@@ -20,6 +20,7 @@
 #include <linux/mmc/host.h>
 
 #include "core.h"
+#include "card.h"
 #include "mmc_ops.h"
 
 #ifdef CONFIG_FAIL_MMC_REQUEST
diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index ec2e85c7ae12..b91abe01aacc 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -21,6 +21,7 @@
 #include <linux/mmc/mmc.h>
 
 #include "core.h"
+#include "card.h"
 #include "host.h"
 #include "bus.h"
 #include "mmc_ops.h"
@@ -307,6 +308,18 @@ static void mmc_manage_enhanced_area(struct mmc_card *card, u8 *ext_csd)
 	}
 }
 
+static void mmc_part_add(struct mmc_card *card, unsigned int size,
+			 unsigned int part_cfg, char *name, int idx, bool ro,
+			 int area_type)
+{
+	card->part[card->nr_parts].size = size;
+	card->part[card->nr_parts].part_cfg = part_cfg;
+	sprintf(card->part[card->nr_parts].name, name, idx);
+	card->part[card->nr_parts].force_ro = ro;
+	card->part[card->nr_parts].area_type = area_type;
+	card->nr_parts++;
+}
+
 static void mmc_manage_gp_partitions(struct mmc_card *card, u8 *ext_csd)
 {
 	int idx;
diff --git a/drivers/mmc/core/mmc_test.c b/drivers/mmc/core/mmc_test.c
index 8075ad133b7d..8dd211584040 100644
--- a/drivers/mmc/core/mmc_test.c
+++ b/drivers/mmc/core/mmc_test.c
@@ -23,6 +23,8 @@
 #include <linux/module.h>
 
 #include "core.h"
+#include "card.h"
+#include "bus.h"
 
 #define RESULT_OK		0
 #define RESULT_FAIL		1
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index 8f5bf5f82aa7..611f5c6d1950 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -21,6 +21,7 @@
 #include "queue.h"
 #include "block.h"
 #include "core.h"
+#include "card.h"
 
 #define MMC_QUEUE_BOUNCESZ	65536
 
diff --git a/drivers/mmc/core/quirks.c b/drivers/mmc/core/quirks.c
index ca9cade317c7..bf25a9c88863 100644
--- a/drivers/mmc/core/quirks.c
+++ b/drivers/mmc/core/quirks.c
@@ -15,6 +15,8 @@
 #include <linux/mmc/card.h>
 #include <linux/mmc/sdio_ids.h>
 
+#include "card.h"
+
 #ifndef SDIO_VENDOR_ID_TI
 #define SDIO_VENDOR_ID_TI		0x0097
 #endif
diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index a614f37faf27..d66b08d6f509 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -22,6 +22,7 @@
 #include <linux/mmc/sd.h>
 
 #include "core.h"
+#include "card.h"
 #include "bus.h"
 #include "mmc_ops.h"
 #include "sd.h"
diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c
index 6810b3a8e130..5419c0dfad31 100644
--- a/drivers/mmc/core/sdio.c
+++ b/drivers/mmc/core/sdio.c
@@ -20,6 +20,7 @@
 #include <linux/mmc/sdio_ids.h>
 
 #include "core.h"
+#include "card.h"
 #include "bus.h"
 #include "sd.h"
 #include "sdio_bus.h"
diff --git a/drivers/mmc/core/sdio_bus.c b/drivers/mmc/core/sdio_bus.c
index 86f5b3223aae..e992a7f8a16f 100644
--- a/drivers/mmc/core/sdio_bus.c
+++ b/drivers/mmc/core/sdio_bus.c
@@ -25,6 +25,7 @@
 #include <linux/of.h>
 
 #include "core.h"
+#include "card.h"
 #include "sdio_cis.h"
 #include "sdio_bus.h"
 
diff --git a/drivers/mmc/core/sdio_io.c b/drivers/mmc/core/sdio_io.c
index 76fe6d599b77..74195d772f5a 100644
--- a/drivers/mmc/core/sdio_io.c
+++ b/drivers/mmc/core/sdio_io.c
@@ -17,6 +17,7 @@
 
 #include "sdio_ops.h"
 #include "core.h"
+#include "card.h"
 
 /**
  *	sdio_claim_host - exclusively claim a bus for a certain SDIO function
diff --git a/drivers/mmc/core/sdio_irq.c b/drivers/mmc/core/sdio_irq.c
index d0846350e2fe..d29faf2addfe 100644
--- a/drivers/mmc/core/sdio_irq.c
+++ b/drivers/mmc/core/sdio_irq.c
@@ -28,6 +28,7 @@
 
 #include "sdio_ops.h"
 #include "core.h"
+#include "card.h"
 
 static int process_sdio_pending_irqs(struct mmc_host *host)
 {
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index ca64f5b67983..29d00c91c25c 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -248,13 +248,6 @@ struct mmc_card {
 #define MMC_TYPE_SDIO		2		/* SDIO card */
 #define MMC_TYPE_SD_COMBO	3		/* SD combo (IO+mem) card */
 	unsigned int		state;		/* (our) card state */
-#define MMC_STATE_PRESENT	(1<<0)		/* present in sysfs */
-#define MMC_STATE_READONLY	(1<<1)		/* card is read-only */
-#define MMC_STATE_BLOCKADDR	(1<<2)		/* card uses block-addressing */
-#define MMC_CARD_SDXC		(1<<3)		/* card is SDXC */
-#define MMC_CARD_REMOVED	(1<<4)		/* card has been removed */
-#define MMC_STATE_DOING_BKOPS	(1<<5)		/* card is doing BKOPS */
-#define MMC_STATE_SUSPENDED	(1<<6)		/* card is suspended */
 	unsigned int		quirks; 	/* card quirks */
 #define MMC_QUIRK_LENIENT_FN0	(1<<0)		/* allow SDIO FN0 writes outside of the VS CCCR range */
 #define MMC_QUIRK_BLKSZ_FOR_BYTE_MODE (1<<1)	/* use func->cur_blksize */
@@ -273,7 +266,6 @@ struct mmc_card {
 #define MMC_QUIRK_TRIM_BROKEN	(1<<12)		/* Skip trim */
 #define MMC_QUIRK_BROKEN_HPI	(1<<13)		/* Disable broken HPI support */
 
-
 	unsigned int		erase_size;	/* erase size in sectors */
  	unsigned int		erase_shift;	/* if erase unit is power 2 */
  	unsigned int		pref_erase;	/* in sectors */
@@ -309,245 +301,13 @@ struct mmc_card {
 	unsigned int    nr_parts;
 };
 
-/*
- * This function fill contents in mmc_part.
- */
-static inline void mmc_part_add(struct mmc_card *card, unsigned int size,
-			unsigned int part_cfg, char *name, int idx, bool ro,
-			int area_type)
-{
-	card->part[card->nr_parts].size = size;
-	card->part[card->nr_parts].part_cfg = part_cfg;
-	sprintf(card->part[card->nr_parts].name, name, idx);
-	card->part[card->nr_parts].force_ro = ro;
-	card->part[card->nr_parts].area_type = area_type;
-	card->nr_parts++;
-}
-
 static inline bool mmc_large_sector(struct mmc_card *card)
 {
 	return card->ext_csd.data_sector_size == 4096;
 }
 
-/*
- *  The world is not perfect and supplies us with broken mmc/sdio devices.
- *  For at least some of these bugs we need a work-around.
- */
-
-struct mmc_fixup {
-	/* CID-specific fields. */
-	const char *name;
-
-	/* Valid revision range */
-	u64 rev_start, rev_end;
-
-	unsigned int manfid;
-	unsigned short oemid;
-
-	/* SDIO-specfic fields. You can use SDIO_ANY_ID here of course */
-	u16 cis_vendor, cis_device;
-
-	/* for MMC cards */
-	unsigned int ext_csd_rev;
-
-	void (*vendor_fixup)(struct mmc_card *card, int data);
-	int data;
-};
-
-#define CID_MANFID_ANY (-1u)
-#define CID_OEMID_ANY ((unsigned short) -1)
-#define CID_NAME_ANY (NULL)
-
-#define EXT_CSD_REV_ANY (-1u)
-
-#define CID_MANFID_SANDISK      0x2
-#define CID_MANFID_TOSHIBA      0x11
-#define CID_MANFID_MICRON       0x13
-#define CID_MANFID_SAMSUNG      0x15
-#define CID_MANFID_KINGSTON     0x70
-#define CID_MANFID_HYNIX	0x90
-
-#define END_FIXUP { NULL }
-
-#define _FIXUP_EXT(_name, _manfid, _oemid, _rev_start, _rev_end,	\
-		   _cis_vendor, _cis_device,				\
-		   _fixup, _data, _ext_csd_rev)				\
-	{						   \
-		.name = (_name),			   \
-		.manfid = (_manfid),			   \
-		.oemid = (_oemid),			   \
-		.rev_start = (_rev_start),		   \
-		.rev_end = (_rev_end),			   \
-		.cis_vendor = (_cis_vendor),		   \
-		.cis_device = (_cis_device),		   \
-		.vendor_fixup = (_fixup),		   \
-		.data = (_data),			   \
-		.ext_csd_rev = (_ext_csd_rev),		   \
-	 }
-
-#define MMC_FIXUP_REV(_name, _manfid, _oemid, _rev_start, _rev_end,	\
-		      _fixup, _data, _ext_csd_rev)			\
-	_FIXUP_EXT(_name, _manfid,					\
-		   _oemid, _rev_start, _rev_end,			\
-		   SDIO_ANY_ID, SDIO_ANY_ID,				\
-		   _fixup, _data, _ext_csd_rev)				\
-
-#define MMC_FIXUP(_name, _manfid, _oemid, _fixup, _data) \
-	MMC_FIXUP_REV(_name, _manfid, _oemid, 0, -1ull, _fixup, _data,	\
-		      EXT_CSD_REV_ANY)
-
-#define MMC_FIXUP_EXT_CSD_REV(_name, _manfid, _oemid, _fixup, _data,	\
-			      _ext_csd_rev)				\
-	MMC_FIXUP_REV(_name, _manfid, _oemid, 0, -1ull, _fixup, _data,	\
-		      _ext_csd_rev)
-
-#define SDIO_FIXUP(_vendor, _device, _fixup, _data)			\
-	_FIXUP_EXT(CID_NAME_ANY, CID_MANFID_ANY,			\
-		    CID_OEMID_ANY, 0, -1ull,				\
-		   _vendor, _device,					\
-		   _fixup, _data, EXT_CSD_REV_ANY)			\
-
-#define cid_rev(hwrev, fwrev, year, month)	\
-	(((u64) hwrev) << 40 |                  \
-	 ((u64) fwrev) << 32 |                  \
-	 ((u64) year) << 16 |                   \
-	 ((u64) month))
-
-#define cid_rev_card(card)		  \
-	cid_rev(card->cid.hwrev,	  \
-		    card->cid.fwrev,      \
-		    card->cid.year,	  \
-		    card->cid.month)
-
-/*
- * Unconditionally quirk add/remove.
- */
-
-static inline void __maybe_unused add_quirk(struct mmc_card *card, int data)
-{
-	card->quirks |= data;
-}
-
-static inline void __maybe_unused remove_quirk(struct mmc_card *card, int data)
-{
-	card->quirks &= ~data;
-}
-
 #define mmc_card_mmc(c)		((c)->type == MMC_TYPE_MMC)
 #define mmc_card_sd(c)		((c)->type == MMC_TYPE_SD)
 #define mmc_card_sdio(c)	((c)->type == MMC_TYPE_SDIO)
 
-#define mmc_card_present(c)	((c)->state & MMC_STATE_PRESENT)
-#define mmc_card_readonly(c)	((c)->state & MMC_STATE_READONLY)
-#define mmc_card_blockaddr(c)	((c)->state & MMC_STATE_BLOCKADDR)
-#define mmc_card_ext_capacity(c) ((c)->state & MMC_CARD_SDXC)
-#define mmc_card_removed(c)	((c) && ((c)->state & MMC_CARD_REMOVED))
-#define mmc_card_doing_bkops(c)	((c)->state & MMC_STATE_DOING_BKOPS)
-#define mmc_card_suspended(c)	((c)->state & MMC_STATE_SUSPENDED)
-
-#define mmc_card_set_present(c)	((c)->state |= MMC_STATE_PRESENT)
-#define mmc_card_set_readonly(c) ((c)->state |= MMC_STATE_READONLY)
-#define mmc_card_set_blockaddr(c) ((c)->state |= MMC_STATE_BLOCKADDR)
-#define mmc_card_set_ext_capacity(c) ((c)->state |= MMC_CARD_SDXC)
-#define mmc_card_set_removed(c) ((c)->state |= MMC_CARD_REMOVED)
-#define mmc_card_set_doing_bkops(c)	((c)->state |= MMC_STATE_DOING_BKOPS)
-#define mmc_card_clr_doing_bkops(c)	((c)->state &= ~MMC_STATE_DOING_BKOPS)
-#define mmc_card_set_suspended(c) ((c)->state |= MMC_STATE_SUSPENDED)
-#define mmc_card_clr_suspended(c) ((c)->state &= ~MMC_STATE_SUSPENDED)
-
-/*
- * Quirk add/remove for MMC products.
- */
-
-static inline void __maybe_unused add_quirk_mmc(struct mmc_card *card, int data)
-{
-	if (mmc_card_mmc(card))
-		card->quirks |= data;
-}
-
-static inline void __maybe_unused remove_quirk_mmc(struct mmc_card *card,
-						   int data)
-{
-	if (mmc_card_mmc(card))
-		card->quirks &= ~data;
-}
-
-/*
- * Quirk add/remove for SD products.
- */
-
-static inline void __maybe_unused add_quirk_sd(struct mmc_card *card, int data)
-{
-	if (mmc_card_sd(card))
-		card->quirks |= data;
-}
-
-static inline void __maybe_unused remove_quirk_sd(struct mmc_card *card,
-						   int data)
-{
-	if (mmc_card_sd(card))
-		card->quirks &= ~data;
-}
-
-static inline int mmc_card_lenient_fn0(const struct mmc_card *c)
-{
-	return c->quirks & MMC_QUIRK_LENIENT_FN0;
-}
-
-static inline int mmc_blksz_for_byte_mode(const struct mmc_card *c)
-{
-	return c->quirks & MMC_QUIRK_BLKSZ_FOR_BYTE_MODE;
-}
-
-static inline int mmc_card_disable_cd(const struct mmc_card *c)
-{
-	return c->quirks & MMC_QUIRK_DISABLE_CD;
-}
-
-static inline int mmc_card_nonstd_func_interface(const struct mmc_card *c)
-{
-	return c->quirks & MMC_QUIRK_NONSTD_FUNC_IF;
-}
-
-static inline int mmc_card_broken_byte_mode_512(const struct mmc_card *c)
-{
-	return c->quirks & MMC_QUIRK_BROKEN_BYTE_MODE_512;
-}
-
-static inline int mmc_card_long_read_time(const struct mmc_card *c)
-{
-	return c->quirks & MMC_QUIRK_LONG_READ_TIME;
-}
-
-static inline int mmc_card_broken_irq_polling(const struct mmc_card *c)
-{
-	return c->quirks & MMC_QUIRK_BROKEN_IRQ_POLLING;
-}
-
-static inline int mmc_card_broken_hpi(const struct mmc_card *c)
-{
-	return c->quirks & MMC_QUIRK_BROKEN_HPI;
-}
-
-#define mmc_card_name(c)	((c)->cid.prod_name)
-#define mmc_card_id(c)		(dev_name(&(c)->dev))
-
-#define mmc_dev_to_card(d)	container_of(d, struct mmc_card, dev)
-
-/*
- * MMC device driver (e.g., Flash card, I/O card...)
- */
-struct mmc_driver {
-	struct device_driver drv;
-	int (*probe)(struct mmc_card *);
-	void (*remove)(struct mmc_card *);
-	void (*shutdown)(struct mmc_card *);
-};
-
-extern int mmc_register_driver(struct mmc_driver *);
-extern void mmc_unregister_driver(struct mmc_driver *);
-
-extern void mmc_fixup_device(struct mmc_card *card,
-			     const struct mmc_fixup *table);
-
 #endif /* LINUX_MMC_CARD_H */
-- 
cgit v1.2.3


From 5857b29b96dcf208e4903ec6f20d132e6a77cac2 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 13 Jan 2017 14:14:15 +0100
Subject: mmc: core: Move public functions from host.h to private headers

A significant amount of functions are available through the public mmc
host.h header file. Let's slim down this public mmc interface, as to
prevent users from abusing it, by moving some of the functions to private
mmc host.h header file.

This change concentrates on moving the functions into private mmc headers,
following changes may continue with additional clean-ups.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Shawn Lin <shawn.lin@rock-chips.com>
---
 drivers/mmc/core/block.c    |  1 +
 drivers/mmc/core/bus.c      |  1 +
 drivers/mmc/core/debugfs.c  |  1 +
 drivers/mmc/core/host.h     | 49 ++++++++++++++++++++++++++++++++++++++++++++-
 drivers/mmc/core/mmc_test.c |  1 +
 drivers/mmc/core/sd.c       |  1 +
 drivers/mmc/core/sdio.c     |  1 +
 include/linux/mmc/host.h    | 48 ++------------------------------------------
 8 files changed, 56 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index a8300beae918..7bd03381810d 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -49,6 +49,7 @@
 #include "block.h"
 #include "core.h"
 #include "card.h"
+#include "host.h"
 #include "bus.h"
 #include "mmc_ops.h"
 #include "sd_ops.h"
diff --git a/drivers/mmc/core/bus.c b/drivers/mmc/core/bus.c
index 3be2e6a75cd9..301246513a37 100644
--- a/drivers/mmc/core/bus.c
+++ b/drivers/mmc/core/bus.c
@@ -24,6 +24,7 @@
 
 #include "core.h"
 #include "card.h"
+#include "host.h"
 #include "sdio_cis.h"
 #include "bus.h"
 
diff --git a/drivers/mmc/core/debugfs.c b/drivers/mmc/core/debugfs.c
index 2843e6aef7ef..a1fba5732d66 100644
--- a/drivers/mmc/core/debugfs.c
+++ b/drivers/mmc/core/debugfs.c
@@ -21,6 +21,7 @@
 
 #include "core.h"
 #include "card.h"
+#include "host.h"
 #include "mmc_ops.h"
 
 #ifdef CONFIG_FAIL_MMC_REQUEST
diff --git a/drivers/mmc/core/host.h b/drivers/mmc/core/host.h
index 366ce79c3498..fb6a76a03833 100644
--- a/drivers/mmc/core/host.h
+++ b/drivers/mmc/core/host.h
@@ -11,7 +11,7 @@
 #ifndef _MMC_CORE_HOST_H
 #define _MMC_CORE_HOST_H
 
-struct mmc_host;
+#include <linux/mmc/host.h>
 
 int mmc_register_host_class(void);
 void mmc_unregister_host_class(void);
@@ -21,6 +21,53 @@ void mmc_retune_disable(struct mmc_host *host);
 void mmc_retune_hold(struct mmc_host *host);
 void mmc_retune_release(struct mmc_host *host);
 int mmc_retune(struct mmc_host *host);
+void mmc_retune_pause(struct mmc_host *host);
+void mmc_retune_unpause(struct mmc_host *host);
+
+static inline void mmc_retune_recheck(struct mmc_host *host)
+{
+	if (host->hold_retune <= 1)
+		host->retune_now = 1;
+}
+
+static inline int mmc_host_cmd23(struct mmc_host *host)
+{
+	return host->caps & MMC_CAP_CMD23;
+}
+
+static inline int mmc_boot_partition_access(struct mmc_host *host)
+{
+	return !(host->caps2 & MMC_CAP2_BOOTPART_NOACC);
+}
+
+static inline int mmc_host_uhs(struct mmc_host *host)
+{
+	return host->caps &
+		(MMC_CAP_UHS_SDR12 | MMC_CAP_UHS_SDR25 |
+		 MMC_CAP_UHS_SDR50 | MMC_CAP_UHS_SDR104 |
+		 MMC_CAP_UHS_DDR50);
+}
+
+static inline bool mmc_card_hs200(struct mmc_card *card)
+{
+	return card->host->ios.timing == MMC_TIMING_MMC_HS200;
+}
+
+static inline bool mmc_card_ddr52(struct mmc_card *card)
+{
+	return card->host->ios.timing == MMC_TIMING_MMC_DDR52;
+}
+
+static inline bool mmc_card_hs400(struct mmc_card *card)
+{
+	return card->host->ios.timing == MMC_TIMING_MMC_HS400;
+}
+
+static inline bool mmc_card_hs400es(struct mmc_card *card)
+{
+	return card->host->ios.enhanced_strobe;
+}
+
 
 #endif
 
diff --git a/drivers/mmc/core/mmc_test.c b/drivers/mmc/core/mmc_test.c
index 8dd211584040..83d193c09d98 100644
--- a/drivers/mmc/core/mmc_test.c
+++ b/drivers/mmc/core/mmc_test.c
@@ -24,6 +24,7 @@
 
 #include "core.h"
 #include "card.h"
+#include "host.h"
 #include "bus.h"
 
 #define RESULT_OK		0
diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index d66b08d6f509..8b4f13943712 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -23,6 +23,7 @@
 
 #include "core.h"
 #include "card.h"
+#include "host.h"
 #include "bus.h"
 #include "mmc_ops.h"
 #include "sd.h"
diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c
index 5419c0dfad31..261c88632c43 100644
--- a/drivers/mmc/core/sdio.c
+++ b/drivers/mmc/core/sdio.c
@@ -21,6 +21,7 @@
 
 #include "core.h"
 #include "card.h"
+#include "host.h"
 #include "bus.h"
 #include "sd.h"
 #include "sdio_bus.h"
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 7de05195bff6..97699d55b2ae 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -473,56 +473,20 @@ static inline int mmc_card_wake_sdio_irq(struct mmc_host *host)
 	return host->pm_flags & MMC_PM_WAKE_SDIO_IRQ;
 }
 
-static inline int mmc_host_cmd23(struct mmc_host *host)
-{
-	return host->caps & MMC_CAP_CMD23;
-}
-
-static inline int mmc_boot_partition_access(struct mmc_host *host)
-{
-	return !(host->caps2 & MMC_CAP2_BOOTPART_NOACC);
-}
-
-static inline int mmc_host_uhs(struct mmc_host *host)
-{
-	return host->caps &
-		(MMC_CAP_UHS_SDR12 | MMC_CAP_UHS_SDR25 |
-		 MMC_CAP_UHS_SDR50 | MMC_CAP_UHS_SDR104 |
-		 MMC_CAP_UHS_DDR50);
-}
-
+/* TODO: Move to private header */
 static inline int mmc_card_hs(struct mmc_card *card)
 {
 	return card->host->ios.timing == MMC_TIMING_SD_HS ||
 		card->host->ios.timing == MMC_TIMING_MMC_HS;
 }
 
+/* TODO: Move to private header */
 static inline int mmc_card_uhs(struct mmc_card *card)
 {
 	return card->host->ios.timing >= MMC_TIMING_UHS_SDR12 &&
 		card->host->ios.timing <= MMC_TIMING_UHS_DDR50;
 }
 
-static inline bool mmc_card_hs200(struct mmc_card *card)
-{
-	return card->host->ios.timing == MMC_TIMING_MMC_HS200;
-}
-
-static inline bool mmc_card_ddr52(struct mmc_card *card)
-{
-	return card->host->ios.timing == MMC_TIMING_MMC_DDR52;
-}
-
-static inline bool mmc_card_hs400(struct mmc_card *card)
-{
-	return card->host->ios.timing == MMC_TIMING_MMC_HS400;
-}
-
-static inline bool mmc_card_hs400es(struct mmc_card *card)
-{
-	return card->host->ios.enhanced_strobe;
-}
-
 void mmc_retune_timer_stop(struct mmc_host *host);
 
 static inline void mmc_retune_needed(struct mmc_host *host)
@@ -531,12 +495,6 @@ static inline void mmc_retune_needed(struct mmc_host *host)
 		host->need_retune = 1;
 }
 
-static inline void mmc_retune_recheck(struct mmc_host *host)
-{
-	if (host->hold_retune <= 1)
-		host->retune_now = 1;
-}
-
 static inline bool mmc_can_retune(struct mmc_host *host)
 {
 	return host->can_retune == 1;
@@ -544,7 +502,5 @@ static inline bool mmc_can_retune(struct mmc_host *host)
 
 int mmc_send_tuning(struct mmc_host *host, u32 opcode, int *cmd_error);
 int mmc_abort_tuning(struct mmc_host *host, u32 opcode);
-void mmc_retune_pause(struct mmc_host *host);
-void mmc_retune_unpause(struct mmc_host *host);
 
 #endif /* LINUX_MMC_HOST_H */
-- 
cgit v1.2.3


From 23888bfe9bd730d42e326cc5f6f1188beb948f97 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 13 Jan 2017 14:14:16 +0100
Subject: mmc: core: Don't use extern declarations of public mmc functions

Using extern when declaring functions in the public header, core.h, is
redundant. Let's just remove the use of it.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Shawn Lin <shawn.lin@rock-chips.com>
---
 include/linux/mmc/core.h | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h
index 6440e10c86cd..6dcb339fcd45 100644
--- a/include/linux/mmc/core.h
+++ b/include/linux/mmc/core.h
@@ -158,13 +158,14 @@ struct mmc_request {
 struct mmc_card;
 struct mmc_async_req;
 
-extern struct mmc_async_req *mmc_start_req(struct mmc_host *,
-					   struct mmc_async_req *,
-					   enum mmc_blk_status *);
-extern void mmc_wait_for_req(struct mmc_host *, struct mmc_request *);
-extern int mmc_wait_for_cmd(struct mmc_host *, struct mmc_command *, int);
-
-extern int mmc_hw_reset(struct mmc_host *host);
-extern void mmc_set_data_timeout(struct mmc_data *, const struct mmc_card *);
+struct mmc_async_req *mmc_start_req(struct mmc_host *host,
+				struct mmc_async_req *areq,
+				enum mmc_blk_status *ret_stat);
+void mmc_wait_for_req(struct mmc_host *host, struct mmc_request *mrq);
+int mmc_wait_for_cmd(struct mmc_host *host, struct mmc_command *cmd,
+		int retries);
+
+int mmc_hw_reset(struct mmc_host *host);
+void mmc_set_data_timeout(struct mmc_data *data, const struct mmc_card *card);
 
 #endif /* LINUX_MMC_CORE_H */
-- 
cgit v1.2.3


From 20dd03734cac41a0545dd24f5e81d8ff0c80874b Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Thu, 19 Jan 2017 21:07:17 +0100
Subject: mmc: host: tmio: SDIO_STATUS_QUIRK is rather SDIO_STATUS_SETBITS

QUIRK sounds like there is something wrong, but actually there are just
some bits which need to be 1. Rename it to be more clear.

Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/host/sh_mobile_sdhi.c | 6 ++----
 drivers/mmc/host/tmio_mmc_pio.c   | 2 +-
 include/linux/mfd/tmio.h          | 6 ++----
 3 files changed, 5 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c
index 59db14b4827c..8bebf35c7576 100644
--- a/drivers/mmc/host/sh_mobile_sdhi.c
+++ b/drivers/mmc/host/sh_mobile_sdhi.c
@@ -641,10 +641,8 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev)
 	 */
 	mmc_data->flags |= TMIO_MMC_HAVE_CMD12_CTRL;
 
-	/*
-	 * All SDHI need SDIO_INFO1 reserved bit
-	 */
-	mmc_data->flags |= TMIO_MMC_SDIO_STATUS_QUIRK;
+	/* All SDHI have SDIO status bits which must be 1 */
+	mmc_data->flags |= TMIO_MMC_SDIO_STATUS_SETBITS;
 
 	ret = tmio_mmc_host_probe(host, mmc_data);
 	if (ret < 0)
diff --git a/drivers/mmc/host/tmio_mmc_pio.c b/drivers/mmc/host/tmio_mmc_pio.c
index 7cbfec7a5d94..6e1acb628edc 100644
--- a/drivers/mmc/host/tmio_mmc_pio.c
+++ b/drivers/mmc/host/tmio_mmc_pio.c
@@ -723,7 +723,7 @@ static void __tmio_mmc_sdio_irq(struct tmio_mmc_host *host)
 	ireg = status & TMIO_SDIO_MASK_ALL & ~host->sdio_irq_mask;
 
 	sdio_status = status & ~TMIO_SDIO_MASK_ALL;
-	if (pdata->flags & TMIO_MMC_SDIO_STATUS_QUIRK)
+	if (pdata->flags & TMIO_MMC_SDIO_STATUS_SETBITS)
 		sdio_status |= 6;
 
 	sd_ctrl_write16(host, CTL_SDIO_STATUS, sdio_status);
diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h
index fba44abd05ba..a1520d88ebf3 100644
--- a/include/linux/mfd/tmio.h
+++ b/include/linux/mfd/tmio.h
@@ -94,10 +94,8 @@
  */
 #define TMIO_MMC_HAVE_CMD12_CTRL	(1 << 7)
 
-/*
- * Some controllers needs to set 1 on SDIO status reserved bits
- */
-#define TMIO_MMC_SDIO_STATUS_QUIRK	(1 << 8)
+/* Controller has some SDIO status bits which must be 1 */
+#define TMIO_MMC_SDIO_STATUS_SETBITS	(1 << 8)
 
 /*
  * Some controllers have a 32-bit wide data port register
-- 
cgit v1.2.3


From 20f921bb01140207983e9fc3af4bb60f1f9fc0d5 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Wed, 25 Jan 2017 12:04:17 +0100
Subject: mmc: core: Invent MMC_CAP_3_3V_DDR

According the JEDEC specification an eMMC card supporting 1.8V vccq in DDR
mode should also be capable of 3.3V. However, it's been reported that some
mmc hosts supports 3.3V, but not 1.8V.

Currently the mmc core implements an error handling when the host fails to
set 1.8V for vccq, by falling back to 3.3V. Unfortunate, this seems to be
insufficient for some mmc hosts. To enable these to use eMMC DDR mode let's
invent a new mmc cap, MMC_CAP_3_3V_DDR, which tells whether they support
the eMMC 3.3V DDR mode.

In case MMC_CAP_3_3V_DDR is set, but not MMC_CAP_1_8V_DDR, let's change to
remain on the 3.3V, as it's the default voltage level for vccq, set by the
earlier power up sequence.

As this change introduces MMC_CAP_3_3V_DDR, let's take the opportunity to
do some re-formatting of the related defines in the header file.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Shawn Lin <shawn.lin@rock-chips.com>
Tested-by: Jan Glauber <jglauber@cavium.com>
Tested-by: Stefan Wahren <stefan.wahren@i2se.com>
---
 drivers/mmc/core/mmc.c   | 11 +++++++----
 include/linux/mmc/host.h | 21 ++++++++++-----------
 2 files changed, 17 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 5844f29d3bed..f1a451e48538 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -213,7 +213,7 @@ static void mmc_select_card_type(struct mmc_card *card)
 		avail_type |= EXT_CSD_CARD_TYPE_HS_52;
 	}
 
-	if (caps & MMC_CAP_1_8V_DDR &&
+	if (caps & (MMC_CAP_1_8V_DDR | MMC_CAP_3_3V_DDR) &&
 	    card_type & EXT_CSD_CARD_TYPE_DDR_1_8V) {
 		hs_max_dtr = MMC_HIGH_DDR_MAX_DTR;
 		avail_type |= EXT_CSD_CARD_TYPE_DDR_1_8V;
@@ -1120,11 +1120,14 @@ static int mmc_select_hs_ddr(struct mmc_card *card)
 	 *
 	 * WARNING: eMMC rules are NOT the same as SD DDR
 	 */
-	err = -EINVAL;
-	if (card->mmc_avail_type & EXT_CSD_CARD_TYPE_DDR_1_2V)
+	if (card->mmc_avail_type & EXT_CSD_CARD_TYPE_DDR_1_2V) {
 		err = mmc_set_signal_voltage(host, MMC_SIGNAL_VOLTAGE_120);
+		if (!err)
+			return 0;
+	}
 
-	if (err && (card->mmc_avail_type & EXT_CSD_CARD_TYPE_DDR_1_8V))
+	if (card->mmc_avail_type & EXT_CSD_CARD_TYPE_DDR_1_8V &&
+	    host->caps & MMC_CAP_1_8V_DDR)
 		err = mmc_set_signal_voltage(host, MMC_SIGNAL_VOLTAGE_180);
 
 	/* make sure vccq is 3.3v after switching disaster */
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 97699d55b2ae..83f1c4a9f03b 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -259,17 +259,16 @@ struct mmc_host {
 #define MMC_CAP_NONREMOVABLE	(1 << 8)	/* Nonremovable e.g. eMMC */
 #define MMC_CAP_WAIT_WHILE_BUSY	(1 << 9)	/* Waits while card is busy */
 #define MMC_CAP_ERASE		(1 << 10)	/* Allow erase/trim commands */
-#define MMC_CAP_1_8V_DDR	(1 << 11)	/* can support */
-						/* DDR mode at 1.8V */
-#define MMC_CAP_1_2V_DDR	(1 << 12)	/* can support */
-						/* DDR mode at 1.2V */
-#define MMC_CAP_POWER_OFF_CARD	(1 << 13)	/* Can power off after boot */
-#define MMC_CAP_BUS_WIDTH_TEST	(1 << 14)	/* CMD14/CMD19 bus width ok */
-#define MMC_CAP_UHS_SDR12	(1 << 15)	/* Host supports UHS SDR12 mode */
-#define MMC_CAP_UHS_SDR25	(1 << 16)	/* Host supports UHS SDR25 mode */
-#define MMC_CAP_UHS_SDR50	(1 << 17)	/* Host supports UHS SDR50 mode */
-#define MMC_CAP_UHS_SDR104	(1 << 18)	/* Host supports UHS SDR104 mode */
-#define MMC_CAP_UHS_DDR50	(1 << 19)	/* Host supports UHS DDR50 mode */
+#define MMC_CAP_3_3V_DDR	(1 << 11)	/* Host supports eMMC DDR 3.3V */
+#define MMC_CAP_1_8V_DDR	(1 << 12)	/* Host supports eMMC DDR 1.8V */
+#define MMC_CAP_1_2V_DDR	(1 << 13)	/* Host supports eMMC DDR 1.2V */
+#define MMC_CAP_POWER_OFF_CARD	(1 << 14)	/* Can power off after boot */
+#define MMC_CAP_BUS_WIDTH_TEST	(1 << 15)	/* CMD14/CMD19 bus width ok */
+#define MMC_CAP_UHS_SDR12	(1 << 16)	/* Host supports UHS SDR12 mode */
+#define MMC_CAP_UHS_SDR25	(1 << 17)	/* Host supports UHS SDR25 mode */
+#define MMC_CAP_UHS_SDR50	(1 << 18)	/* Host supports UHS SDR50 mode */
+#define MMC_CAP_UHS_SDR104	(1 << 19)	/* Host supports UHS SDR104 mode */
+#define MMC_CAP_UHS_DDR50	(1 << 20)	/* Host supports UHS DDR50 mode */
 #define MMC_CAP_DRIVER_TYPE_A	(1 << 23)	/* Host supports Driver Type A */
 #define MMC_CAP_DRIVER_TYPE_C	(1 << 24)	/* Host supports Driver Type C */
 #define MMC_CAP_DRIVER_TYPE_D	(1 << 25)	/* Host supports Driver Type D */
-- 
cgit v1.2.3


From c3399ef55d8e8295293808eba32e3f7056526324 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Wed, 1 Feb 2017 13:47:53 +0100
Subject: mmc: core: rename mmc_start_req() to *areq()

With the coexisting __mmc_start_request(), mmc_start_request()
and __mmc_start_req() it is a bit confusing that mmc_start_req()
actually does not start a normal request, but an asynchronous
request.

Rename it to mmc_start_areq() to make it explicit what the
function is doing, also fix the kerneldoc for this function
while we're at it.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/block.c    |  8 ++++----
 drivers/mmc/core/core.c     | 14 +++++++-------
 drivers/mmc/core/mmc_test.c |  8 ++++----
 include/linux/mmc/core.h    |  2 +-
 4 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index 2b9ae44f72f9..8e1aa8d80e76 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -1609,8 +1609,8 @@ static void mmc_blk_rw_start_new(struct mmc_queue *mq, struct mmc_card *card,
 		blk_end_request_all(req, -EIO);
 	} else {
 		mmc_blk_rw_rq_prep(mq->mqrq_cur, card, 0, mq);
-		mmc_start_req(card->host,
-			      &mq->mqrq_cur->mmc_active, NULL);
+		mmc_start_areq(card->host,
+			       &mq->mqrq_cur->mmc_active, NULL);
 	}
 }
 
@@ -1648,7 +1648,7 @@ static void mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *new_req)
 		} else
 			new_areq = NULL;
 
-		old_areq = mmc_start_req(card->host, new_areq, &status);
+		old_areq = mmc_start_areq(card->host, new_areq, &status);
 		if (!old_areq) {
 			/*
 			 * We have just put the first request into the pipeline
@@ -1769,7 +1769,7 @@ static void mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *new_req)
 			 */
 			mmc_blk_rw_rq_prep(mq_rq, card,
 					disable_multi, mq);
-			mmc_start_req(card->host,
+			mmc_start_areq(card->host,
 					&mq_rq->mmc_active, NULL);
 			mq_rq->brq.retune_retry_done = retune_retry_done;
 		}
diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index 3b12981e7627..1985841ff1f1 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -631,10 +631,10 @@ static void mmc_post_req(struct mmc_host *host, struct mmc_request *mrq,
 }
 
 /**
- *	mmc_start_req - start a non-blocking request
+ *	mmc_start_areq - start an asynchronous request
  *	@host: MMC host to start command
- *	@areq: async request to start
- *	@error: out parameter returns 0 for success, otherwise non zero
+ *	@areq: asynchronous request to start
+ *	@ret_stat: out parameter for status
  *
  *	Start a new MMC custom command request for a host.
  *	If there is on ongoing async request wait for completion
@@ -646,9 +646,9 @@ static void mmc_post_req(struct mmc_host *host, struct mmc_request *mrq,
  *	return the completed request. If there is no ongoing request, NULL
  *	is returned without waiting. NULL is not an error condition.
  */
-struct mmc_async_req *mmc_start_req(struct mmc_host *host,
-				    struct mmc_async_req *areq,
-				    enum mmc_blk_status *ret_stat)
+struct mmc_async_req *mmc_start_areq(struct mmc_host *host,
+				     struct mmc_async_req *areq,
+				     enum mmc_blk_status *ret_stat)
 {
 	enum mmc_blk_status status = MMC_BLK_SUCCESS;
 	int start_err = 0;
@@ -699,7 +699,7 @@ struct mmc_async_req *mmc_start_req(struct mmc_host *host,
 		*ret_stat = status;
 	return data;
 }
-EXPORT_SYMBOL(mmc_start_req);
+EXPORT_SYMBOL(mmc_start_areq);
 
 /**
  *	mmc_wait_for_req - start a request and wait for completion
diff --git a/drivers/mmc/core/mmc_test.c b/drivers/mmc/core/mmc_test.c
index 83d193c09d98..f99ac3123fd2 100644
--- a/drivers/mmc/core/mmc_test.c
+++ b/drivers/mmc/core/mmc_test.c
@@ -853,7 +853,7 @@ static int mmc_test_nonblock_transfer(struct mmc_test_card *test,
 	for (i = 0; i < count; i++) {
 		mmc_test_prepare_mrq(test, cur_areq->mrq, sg, sg_len, dev_addr,
 				     blocks, blksz, write);
-		done_areq = mmc_start_req(test->card->host, cur_areq, &status);
+		done_areq = mmc_start_areq(test->card->host, cur_areq, &status);
 
 		if (status != MMC_BLK_SUCCESS || (!done_areq && i > 0)) {
 			ret = RESULT_FAIL;
@@ -872,7 +872,7 @@ static int mmc_test_nonblock_transfer(struct mmc_test_card *test,
 		dev_addr += blocks;
 	}
 
-	done_areq = mmc_start_req(test->card->host, NULL, &status);
+	done_areq = mmc_start_areq(test->card->host, NULL, &status);
 	if (status != MMC_BLK_SUCCESS)
 		ret = RESULT_FAIL;
 
@@ -2402,7 +2402,7 @@ static int mmc_test_ongoing_transfer(struct mmc_test_card *test,
 
 	/* Start ongoing data request */
 	if (use_areq) {
-		mmc_start_req(host, &test_areq.areq, &blkstat);
+		mmc_start_areq(host, &test_areq.areq, &blkstat);
 		if (blkstat != MMC_BLK_SUCCESS) {
 			ret = RESULT_FAIL;
 			goto out_free;
@@ -2440,7 +2440,7 @@ static int mmc_test_ongoing_transfer(struct mmc_test_card *test,
 
 	/* Wait for data request to complete */
 	if (use_areq) {
-		mmc_start_req(host, NULL, &blkstat);
+		mmc_start_areq(host, NULL, &blkstat);
 		if (blkstat != MMC_BLK_SUCCESS)
 			ret = RESULT_FAIL;
 	} else {
diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h
index 6dcb339fcd45..a0c63ea28796 100644
--- a/include/linux/mmc/core.h
+++ b/include/linux/mmc/core.h
@@ -158,7 +158,7 @@ struct mmc_request {
 struct mmc_card;
 struct mmc_async_req;
 
-struct mmc_async_req *mmc_start_req(struct mmc_host *host,
+struct mmc_async_req *mmc_start_areq(struct mmc_host *host,
 				struct mmc_async_req *areq,
 				enum mmc_blk_status *ret_stat);
 void mmc_wait_for_req(struct mmc_host *host, struct mmc_request *mrq);
-- 
cgit v1.2.3


From 30b888ba950d9f77326b50a4aa2d7d99557d5718 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <mawilcox@microsoft.com>
Date: Sat, 28 Jan 2017 09:55:20 -0500
Subject: radix-tree: Add radix_tree_iter_tag_clear()

The counterpart to radix_tree_iter_tag_set(), used by the IDR code

Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Reviewed-by: Rehas Sachdeva <aquannie@gmail.com>
---
 include/linux/radix-tree.h |  4 ++-
 lib/radix-tree.c           | 68 +++++++++++++++++++++++++++-------------------
 2 files changed, 43 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 32683c7c2e0d..8bf4ef448ce1 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -332,7 +332,9 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
 			unsigned long index, unsigned int tag);
 int radix_tree_tag_get(const struct radix_tree_root *,
 			unsigned long index, unsigned int tag);
-void radix_tree_iter_tag_set(struct radix_tree_root *root,
+void radix_tree_iter_tag_set(struct radix_tree_root *,
+		const struct radix_tree_iter *iter, unsigned int tag);
+void radix_tree_iter_tag_clear(struct radix_tree_root *,
 		const struct radix_tree_iter *iter, unsigned int tag);
 unsigned int
 radix_tree_gang_lookup_tag(const struct radix_tree_root *, void **results,
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 6f86fbac0e36..40f3091c5a6b 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -1266,6 +1266,22 @@ int radix_tree_split(struct radix_tree_root *root, unsigned long index,
 }
 #endif
 
+static void node_tag_set(struct radix_tree_root *root,
+				struct radix_tree_node *node,
+				unsigned int tag, unsigned int offset)
+{
+	while (node) {
+		if (tag_get(node, tag, offset))
+			return;
+		tag_set(node, tag, offset);
+		offset = node->offset;
+		node = node->parent;
+	}
+
+	if (!root_tag_get(root, tag))
+		root_tag_set(root, tag);
+}
+
 /**
  *	radix_tree_tag_set - set a tag on a radix tree node
  *	@root:		radix tree root
@@ -1307,6 +1323,18 @@ void *radix_tree_tag_set(struct radix_tree_root *root,
 }
 EXPORT_SYMBOL(radix_tree_tag_set);
 
+/**
+ * radix_tree_iter_tag_set - set a tag on the current iterator entry
+ * @root:	radix tree root
+ * @iter:	iterator state
+ * @tag:	tag to set
+ */
+void radix_tree_iter_tag_set(struct radix_tree_root *root,
+			const struct radix_tree_iter *iter, unsigned int tag)
+{
+	node_tag_set(root, iter->node, tag, iter_offset(iter));
+}
+
 static void node_tag_clear(struct radix_tree_root *root,
 				struct radix_tree_node *node,
 				unsigned int tag, unsigned int offset)
@@ -1327,34 +1355,6 @@ static void node_tag_clear(struct radix_tree_root *root,
 		root_tag_clear(root, tag);
 }
 
-static void node_tag_set(struct radix_tree_root *root,
-				struct radix_tree_node *node,
-				unsigned int tag, unsigned int offset)
-{
-	while (node) {
-		if (tag_get(node, tag, offset))
-			return;
-		tag_set(node, tag, offset);
-		offset = node->offset;
-		node = node->parent;
-	}
-
-	if (!root_tag_get(root, tag))
-		root_tag_set(root, tag);
-}
-
-/**
- * radix_tree_iter_tag_set - set a tag on the current iterator entry
- * @root:	radix tree root
- * @iter:	iterator state
- * @tag:	tag to set
- */
-void radix_tree_iter_tag_set(struct radix_tree_root *root,
-			const struct radix_tree_iter *iter, unsigned int tag)
-{
-	node_tag_set(root, iter->node, tag, iter_offset(iter));
-}
-
 /**
  *	radix_tree_tag_clear - clear a tag on a radix tree node
  *	@root:		radix tree root
@@ -1394,6 +1394,18 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
 }
 EXPORT_SYMBOL(radix_tree_tag_clear);
 
+/**
+  * radix_tree_iter_tag_clear - clear a tag on the current iterator entry
+  * @root: radix tree root
+  * @iter: iterator state
+  * @tag: tag to clear
+  */
+void radix_tree_iter_tag_clear(struct radix_tree_root *root,
+			const struct radix_tree_iter *iter, unsigned int tag)
+{
+	node_tag_clear(root, iter->node, tag, iter_offset(iter));
+}
+
 /**
  * radix_tree_tag_get - get a tag on a radix tree node
  * @root:		radix tree root
-- 
cgit v1.2.3


From 0ac398ef391b53122976325ab6953456ce8e8310 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <mawilcox@microsoft.com>
Date: Sat, 28 Jan 2017 09:56:22 -0500
Subject: radix-tree: Add radix_tree_iter_delete

Factor the deletion code out into __radix_tree_delete() and provide a
nice iterator-based wrapper around it.  If we free the node, advance
the iterator to avoid reading from freed memory.

Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
---
 include/linux/radix-tree.h |  2 +
 lib/radix-tree.c           | 91 ++++++++++++++++++++++++++++++----------------
 2 files changed, 62 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 8bf4ef448ce1..05f715cb8062 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -311,6 +311,8 @@ void __radix_tree_delete_node(struct radix_tree_root *root,
 			      struct radix_tree_node *node,
 			      radix_tree_update_node_t update_node,
 			      void *private);
+void radix_tree_iter_delete(struct radix_tree_root *,
+				struct radix_tree_iter *iter, void **slot);
 void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
 void *radix_tree_delete(struct radix_tree_root *, unsigned long);
 void radix_tree_clear_tags(struct radix_tree_root *root,
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 40f3091c5a6b..7bf7d4e60e43 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -581,10 +581,12 @@ out:
  *	radix_tree_shrink    -    shrink radix tree to minimum height
  *	@root		radix tree root
  */
-static inline void radix_tree_shrink(struct radix_tree_root *root,
+static inline bool radix_tree_shrink(struct radix_tree_root *root,
 				     radix_tree_update_node_t update_node,
 				     void *private)
 {
+	bool shrunk = false;
+
 	for (;;) {
 		struct radix_tree_node *node = root->rnode;
 		struct radix_tree_node *child;
@@ -645,20 +647,26 @@ static inline void radix_tree_shrink(struct radix_tree_root *root,
 
 		WARN_ON_ONCE(!list_empty(&node->private_list));
 		radix_tree_node_free(node);
+		shrunk = true;
 	}
+
+	return shrunk;
 }
 
-static void delete_node(struct radix_tree_root *root,
+static bool delete_node(struct radix_tree_root *root,
 			struct radix_tree_node *node,
 			radix_tree_update_node_t update_node, void *private)
 {
+	bool deleted = false;
+
 	do {
 		struct radix_tree_node *parent;
 
 		if (node->count) {
 			if (node == entry_to_node(root->rnode))
-				radix_tree_shrink(root, update_node, private);
-			return;
+				deleted |= radix_tree_shrink(root, update_node,
+								private);
+			return deleted;
 		}
 
 		parent = node->parent;
@@ -672,9 +680,12 @@ static void delete_node(struct radix_tree_root *root,
 
 		WARN_ON_ONCE(!list_empty(&node->private_list));
 		radix_tree_node_free(node);
+		deleted = true;
 
 		node = parent;
 	} while (node);
+
+	return deleted;
 }
 
 /**
@@ -1859,25 +1870,55 @@ void __radix_tree_delete_node(struct radix_tree_root *root,
 	delete_node(root, node, update_node, private);
 }
 
+static bool __radix_tree_delete(struct radix_tree_root *root,
+				struct radix_tree_node *node, void **slot)
+{
+	unsigned offset = get_slot_offset(node, slot);
+	int tag;
+
+	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+		node_tag_clear(root, node, tag, offset);
+
+	replace_slot(root, node, slot, NULL, true);
+	return node && delete_node(root, node, NULL, NULL);
+}
+
 /**
- *	radix_tree_delete_item    -    delete an item from a radix tree
- *	@root:		radix tree root
- *	@index:		index key
- *	@item:		expected item
+ * radix_tree_iter_delete - delete the entry at this iterator position
+ * @root: radix tree root
+ * @iter: iterator state
+ * @slot: pointer to slot
  *
- *	Remove @item at @index from the radix tree rooted at @root.
+ * Delete the entry at the position currently pointed to by the iterator.
+ * This may result in the current node being freed; if it is, the iterator
+ * is advanced so that it will not reference the freed memory.  This
+ * function may be called without any locking if there are no other threads
+ * which can access this tree.
+ */
+void radix_tree_iter_delete(struct radix_tree_root *root,
+				struct radix_tree_iter *iter, void **slot)
+{
+	if (__radix_tree_delete(root, iter->node, slot))
+		iter->index = iter->next_index;
+}
+
+/**
+ * radix_tree_delete_item - delete an item from a radix tree
+ * @root: radix tree root
+ * @index: index key
+ * @item: expected item
+ *
+ * Remove @item at @index from the radix tree rooted at @root.
  *
- *	Returns the address of the deleted item, or NULL if it was not present
- *	or the entry at the given @index was not @item.
+ * Return: the deleted entry, or %NULL if it was not present
+ * or the entry at the given @index was not @item.
  */
 void *radix_tree_delete_item(struct radix_tree_root *root,
 			     unsigned long index, void *item)
 {
 	struct radix_tree_node *node;
-	unsigned int offset;
 	void **slot;
 	void *entry;
-	int tag;
 
 	entry = __radix_tree_lookup(root, index, &node, &slot);
 	if (!entry)
@@ -1886,32 +1927,20 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
 	if (item && entry != item)
 		return NULL;
 
-	if (!node) {
-		root_tag_clear_all(root);
-		root->rnode = NULL;
-		return entry;
-	}
-
-	offset = get_slot_offset(node, slot);
-
-	/* Clear all tags associated with the item to be deleted.  */
-	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
-		node_tag_clear(root, node, tag, offset);
-
-	__radix_tree_replace(root, node, slot, NULL, NULL, NULL);
+	__radix_tree_delete(root, node, slot);
 
 	return entry;
 }
 EXPORT_SYMBOL(radix_tree_delete_item);
 
 /**
- *	radix_tree_delete    -    delete an item from a radix tree
- *	@root:		radix tree root
- *	@index:		index key
+ * radix_tree_delete - delete an entry from a radix tree
+ * @root: radix tree root
+ * @index: index key
  *
- *	Remove the item at @index from the radix tree rooted at @root.
+ * Remove the entry at @index from the radix tree rooted at @root.
  *
- *	Returns the address of the deleted item, or NULL if it was not present.
+ * Return: The deleted entry, or %NULL if it was not present.
  */
 void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
 {
-- 
cgit v1.2.3


From ca86cad7380e373fa17bc0ee8aff121380323e69 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Sat, 4 Feb 2017 13:10:38 -0500
Subject: audit: log module name on init_module

This adds a new auxiliary record MODULE_INIT to the SYSCALL event.

We get finit_module for free since it made most sense to hook this in to
load_module().

https://github.com/linux-audit/audit-kernel/issues/7
https://github.com/linux-audit/audit-kernel/wiki/RFE-Module-Load-Record-Format

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Acked-by: Jessica Yu <jeyu@redhat.com>
[PM: corrected links in the commit description]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/audit.h      | 12 ++++++++++++
 include/uapi/linux/audit.h |  1 +
 kernel/audit.h             |  3 +++
 kernel/auditsc.c           | 14 ++++++++++++++
 kernel/module.c            |  5 ++++-
 5 files changed, 34 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 2be99b276d29..aba3a2684300 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -360,6 +360,7 @@ extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
 				  const struct cred *old);
 extern void __audit_log_capset(const struct cred *new, const struct cred *old);
 extern void __audit_mmap_fd(int fd, int flags);
+extern void __audit_log_kern_module(char *name);
 
 static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
 {
@@ -450,6 +451,12 @@ static inline void audit_mmap_fd(int fd, int flags)
 		__audit_mmap_fd(fd, flags);
 }
 
+static inline void audit_log_kern_module(char *name)
+{
+	if (!audit_dummy_context())
+		__audit_log_kern_module(name);
+}
+
 extern int audit_n_rules;
 extern int audit_signals;
 #else /* CONFIG_AUDITSYSCALL */
@@ -561,6 +568,11 @@ static inline void audit_log_capset(const struct cred *new,
 { }
 static inline void audit_mmap_fd(int fd, int flags)
 { }
+
+static inline void audit_log_kern_module(char *name)
+{
+}
+
 static inline void audit_ptrace(struct task_struct *t)
 { }
 #define audit_n_rules 0
diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index 3f24110ae63c..3c02bb2ff779 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -111,6 +111,7 @@
 #define AUDIT_PROCTITLE		1327	/* Proctitle emit event */
 #define AUDIT_FEATURE_CHANGE	1328	/* audit log listing feature changes */
 #define AUDIT_REPLACE		1329	/* Replace auditd if this packet unanswerd */
+#define AUDIT_KERN_MODULE	1330	/* Kernel Module events */
 
 #define AUDIT_AVC		1400	/* SE Linux avc denial or grant */
 #define AUDIT_SELINUX_ERR	1401	/* Internal SE Linux Errors */
diff --git a/kernel/audit.h b/kernel/audit.h
index 431444c3708b..144b7ebd2deb 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -199,6 +199,9 @@ struct audit_context {
 		struct {
 			int			argc;
 		} execve;
+		struct {
+			char			*name;
+		} module;
 	};
 	int fds[2];
 	struct audit_proctitle proctitle;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index bb5f504592c6..bde3aac4deed 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1268,6 +1268,11 @@ static void show_special(struct audit_context *context, int *call_panic)
 	case AUDIT_EXECVE: {
 		audit_log_execve_info(context, &ab);
 		break; }
+	case AUDIT_KERN_MODULE:
+		audit_log_format(ab, "name=");
+		audit_log_untrustedstring(ab, context->module.name);
+		kfree(context->module.name);
+		break;
 	}
 	audit_log_end(ab);
 }
@@ -2368,6 +2373,15 @@ void __audit_mmap_fd(int fd, int flags)
 	context->type = AUDIT_MMAP;
 }
 
+void __audit_log_kern_module(char *name)
+{
+	struct audit_context *context = current->audit_context;
+
+	context->module.name = kmalloc(strlen(name) + 1, GFP_KERNEL);
+	strcpy(context->module.name, name);
+	context->type = AUDIT_KERN_MODULE;
+}
+
 static void audit_log_task(struct audit_buffer *ab)
 {
 	kuid_t auid, uid;
diff --git a/kernel/module.c b/kernel/module.c
index 529efae9f481..5432dbedf8cf 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -61,6 +61,7 @@
 #include <linux/pfn.h>
 #include <linux/bsearch.h>
 #include <linux/dynamic_debug.h>
+#include <linux/audit.h>
 #include <uapi/linux/module.h>
 #include "module-internal.h"
 
@@ -3593,6 +3594,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
 		goto free_copy;
 	}
 
+	audit_log_kern_module(mod->name);
+
 	/* Reserve our place in the list. */
 	err = add_unformed_module(mod);
 	if (err)
@@ -3681,7 +3684,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
 		       mod->name, after_dashes);
 	}
 
-	/* Link in to syfs. */
+	/* Link in to sysfs. */
 	err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
 	if (err < 0)
 		goto coming_cleanup;
-- 
cgit v1.2.3


From 251af29c320d86071664f02c76f0d063a19fefdf Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sat, 11 Feb 2017 10:37:38 -0500
Subject: nlm: Ensure callback code also checks that the files match

It is not sufficient to just check that the lock pids match when
granting a callback, we also need to ensure that we're granting
the callback on the right file.

Reported-by: Pankaj Singh <psingh.ait@gmail.com>
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/lockd/lockd.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index c15373894a42..b37dee3acaba 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -355,7 +355,8 @@ static inline int nlm_privileged_requester(const struct svc_rqst *rqstp)
 static inline int nlm_compare_locks(const struct file_lock *fl1,
 				    const struct file_lock *fl2)
 {
-	return	fl1->fl_pid   == fl2->fl_pid
+	return file_inode(fl1->fl_file) == file_inode(fl2->fl_file)
+	     && fl1->fl_pid   == fl2->fl_pid
 	     && fl1->fl_owner == fl2->fl_owner
 	     && fl1->fl_start == fl2->fl_start
 	     && fl1->fl_end   == fl2->fl_end
-- 
cgit v1.2.3


From 0a835c4f090af2c76fc2932c539c3b32fd21fbbb Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <mawilcox@microsoft.com>
Date: Tue, 20 Dec 2016 10:27:56 -0500
Subject: Reimplement IDR and IDA using the radix tree

The IDR is very similar to the radix tree.  It has some functionality that
the radix tree did not have (alloc next free, cyclic allocation, a
callback-based for_each, destroy tree), which is readily implementable on
top of the radix tree.  A few small changes were needed in order to use a
tag to represent nodes with free space below them.  More extensive
changes were needed to support storing NULL as a valid entry in an IDR.
Plain radix trees still interpret NULL as a not-present entry.

The IDA is reimplemented as a client of the newly enhanced radix tree.  As
in the current implementation, it uses a bitmap at the last level of the
tree.

Signed-off-by: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Tested-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/idr.h                     |  145 ++--
 include/linux/radix-tree.h              |   49 +-
 init/main.c                             |    3 +-
 lib/idr.c                               | 1178 ++++++-------------------------
 lib/radix-tree.c                        |  375 +++++++---
 tools/testing/radix-tree/.gitignore     |    1 +
 tools/testing/radix-tree/Makefile       |   10 +-
 tools/testing/radix-tree/idr-test.c     |  342 +++++++++
 tools/testing/radix-tree/linux/gfp.h    |    8 +-
 tools/testing/radix-tree/linux/idr.h    |    1 +
 tools/testing/radix-tree/linux/kernel.h |    1 +
 tools/testing/radix-tree/main.c         |    6 +
 tools/testing/radix-tree/test.h         |    2 +
 13 files changed, 995 insertions(+), 1126 deletions(-)
 create mode 100644 tools/testing/radix-tree/idr-test.c
 create mode 100644 tools/testing/radix-tree/linux/idr.h

(limited to 'include/linux')

diff --git a/include/linux/idr.h b/include/linux/idr.h
index 3c01b89aed67..f58c0a3addc3 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -12,47 +12,28 @@
 #ifndef __IDR_H__
 #define __IDR_H__
 
-#include <linux/types.h>
-#include <linux/bitops.h>
-#include <linux/init.h>
-#include <linux/rcupdate.h>
+#include <linux/radix-tree.h>
+#include <linux/gfp.h>
+
+struct idr {
+	struct radix_tree_root	idr_rt;
+	unsigned int		idr_next;
+};
 
 /*
- * Using 6 bits at each layer allows us to allocate 7 layers out of each page.
- * 8 bits only gave us 3 layers out of every pair of pages, which is less
- * efficient except for trees with a largest element between 192-255 inclusive.
+ * The IDR API does not expose the tagging functionality of the radix tree
+ * to users.  Use tag 0 to track whether a node has free space below it.
  */
-#define IDR_BITS 6
-#define IDR_SIZE (1 << IDR_BITS)
-#define IDR_MASK ((1 << IDR_BITS)-1)
-
-struct idr_layer {
-	int			prefix;	/* the ID prefix of this idr_layer */
-	int			layer;	/* distance from leaf */
-	struct idr_layer __rcu	*ary[1<<IDR_BITS];
-	int			count;	/* When zero, we can release it */
-	union {
-		/* A zero bit means "space here" */
-		DECLARE_BITMAP(bitmap, IDR_SIZE);
-		struct rcu_head		rcu_head;
-	};
-};
+#define IDR_FREE	0
 
-struct idr {
-	struct idr_layer __rcu	*hint;	/* the last layer allocated from */
-	struct idr_layer __rcu	*top;
-	int			layers;	/* only valid w/o concurrent changes */
-	int			cur;	/* current pos for cyclic allocation */
-	spinlock_t		lock;
-	int			id_free_cnt;
-	struct idr_layer	*id_free;
-};
+/* Set the IDR flag and the IDR_FREE tag */
+#define IDR_RT_MARKER		((__force gfp_t)(3 << __GFP_BITS_SHIFT))
 
-#define IDR_INIT(name)							\
+#define IDR_INIT							\
 {									\
-	.lock			= __SPIN_LOCK_UNLOCKED(name.lock),	\
+	.idr_rt = RADIX_TREE_INIT(IDR_RT_MARKER)			\
 }
-#define DEFINE_IDR(name)	struct idr name = IDR_INIT(name)
+#define DEFINE_IDR(name)	struct idr name = IDR_INIT
 
 /**
  * idr_get_cursor - Return the current position of the cyclic allocator
@@ -62,9 +43,9 @@ struct idr {
  * idr_alloc_cyclic() if it is free (otherwise the search will start from
  * this position).
  */
-static inline unsigned int idr_get_cursor(struct idr *idr)
+static inline unsigned int idr_get_cursor(const struct idr *idr)
 {
-	return READ_ONCE(idr->cur);
+	return READ_ONCE(idr->idr_next);
 }
 
 /**
@@ -77,7 +58,7 @@ static inline unsigned int idr_get_cursor(struct idr *idr)
  */
 static inline void idr_set_cursor(struct idr *idr, unsigned int val)
 {
-	WRITE_ONCE(idr->cur, val);
+	WRITE_ONCE(idr->idr_next, val);
 }
 
 /**
@@ -97,22 +78,31 @@ static inline void idr_set_cursor(struct idr *idr, unsigned int val)
  * period).
  */
 
-/*
- * This is what we export.
- */
-
-void *idr_find_slowpath(struct idr *idp, int id);
 void idr_preload(gfp_t gfp_mask);
-int idr_alloc(struct idr *idp, void *ptr, int start, int end, gfp_t gfp_mask);
-int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask);
-int idr_for_each(struct idr *idp,
+int idr_alloc(struct idr *, void *entry, int start, int end, gfp_t);
+int idr_alloc_cyclic(struct idr *, void *entry, int start, int end, gfp_t);
+int idr_for_each(const struct idr *,
 		 int (*fn)(int id, void *p, void *data), void *data);
-void *idr_get_next(struct idr *idp, int *nextid);
-void *idr_replace(struct idr *idp, void *ptr, int id);
-void idr_remove(struct idr *idp, int id);
-void idr_destroy(struct idr *idp);
-void idr_init(struct idr *idp);
-bool idr_is_empty(struct idr *idp);
+void *idr_get_next(struct idr *, int *nextid);
+void *idr_replace(struct idr *, void *, int id);
+void idr_destroy(struct idr *);
+
+static inline void idr_remove(struct idr *idr, int id)
+{
+	radix_tree_delete(&idr->idr_rt, id);
+}
+
+static inline void idr_init(struct idr *idr)
+{
+	INIT_RADIX_TREE(&idr->idr_rt, IDR_RT_MARKER);
+	idr->idr_next = 0;
+}
+
+static inline bool idr_is_empty(const struct idr *idr)
+{
+	return radix_tree_empty(&idr->idr_rt) &&
+		radix_tree_tagged(&idr->idr_rt, IDR_FREE);
+}
 
 /**
  * idr_preload_end - end preload section started with idr_preload()
@@ -137,19 +127,14 @@ static inline void idr_preload_end(void)
  * This function can be called under rcu_read_lock(), given that the leaf
  * pointers lifetimes are correctly managed.
  */
-static inline void *idr_find(struct idr *idr, int id)
+static inline void *idr_find(const struct idr *idr, int id)
 {
-	struct idr_layer *hint = rcu_dereference_raw(idr->hint);
-
-	if (hint && (id & ~IDR_MASK) == hint->prefix)
-		return rcu_dereference_raw(hint->ary[id & IDR_MASK]);
-
-	return idr_find_slowpath(idr, id);
+	return radix_tree_lookup(&idr->idr_rt, id);
 }
 
 /**
  * idr_for_each_entry - iterate over an idr's elements of a given type
- * @idp:     idr handle
+ * @idr:     idr handle
  * @entry:   the type * to use as cursor
  * @id:      id entry's key
  *
@@ -157,57 +142,60 @@ static inline void *idr_find(struct idr *idr, int id)
  * after normal terminatinon @entry is left with the value NULL.  This
  * is convenient for a "not found" value.
  */
-#define idr_for_each_entry(idp, entry, id)			\
-	for (id = 0; ((entry) = idr_get_next(idp, &(id))) != NULL; ++id)
+#define idr_for_each_entry(idr, entry, id)			\
+	for (id = 0; ((entry) = idr_get_next(idr, &(id))) != NULL; ++id)
 
 /**
- * idr_for_each_entry - continue iteration over an idr's elements of a given type
- * @idp:     idr handle
+ * idr_for_each_entry_continue - continue iteration over an idr's elements of a given type
+ * @idr:     idr handle
  * @entry:   the type * to use as cursor
  * @id:      id entry's key
  *
  * Continue to iterate over list of given type, continuing after
  * the current position.
  */
-#define idr_for_each_entry_continue(idp, entry, id)			\
-	for ((entry) = idr_get_next((idp), &(id));			\
+#define idr_for_each_entry_continue(idr, entry, id)			\
+	for ((entry) = idr_get_next((idr), &(id));			\
 	     entry;							\
-	     ++id, (entry) = idr_get_next((idp), &(id)))
+	     ++id, (entry) = idr_get_next((idr), &(id)))
 
 /*
  * IDA - IDR based id allocator, use when translation from id to
  * pointer isn't necessary.
- *
- * IDA_BITMAP_LONGS is calculated to be one less to accommodate
- * ida_bitmap->nr_busy so that the whole struct fits in 128 bytes.
  */
 #define IDA_CHUNK_SIZE		128	/* 128 bytes per chunk */
-#define IDA_BITMAP_LONGS	(IDA_CHUNK_SIZE / sizeof(long) - 1)
+#define IDA_BITMAP_LONGS	(IDA_CHUNK_SIZE / sizeof(long))
 #define IDA_BITMAP_BITS 	(IDA_BITMAP_LONGS * sizeof(long) * 8)
 
 struct ida_bitmap {
-	long			nr_busy;
 	unsigned long		bitmap[IDA_BITMAP_LONGS];
 };
 
 struct ida {
-	struct idr		idr;
+	struct radix_tree_root	ida_rt;
 	struct ida_bitmap	*free_bitmap;
 };
 
-#define IDA_INIT(name)		{ .idr = IDR_INIT((name).idr), .free_bitmap = NULL, }
-#define DEFINE_IDA(name)	struct ida name = IDA_INIT(name)
+#define IDA_INIT	{						\
+	.ida_rt = RADIX_TREE_INIT(IDR_RT_MARKER | GFP_NOWAIT),		\
+}
+#define DEFINE_IDA(name)	struct ida name = IDA_INIT
 
 int ida_pre_get(struct ida *ida, gfp_t gfp_mask);
 int ida_get_new_above(struct ida *ida, int starting_id, int *p_id);
 void ida_remove(struct ida *ida, int id);
 void ida_destroy(struct ida *ida);
-void ida_init(struct ida *ida);
 
 int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end,
 		   gfp_t gfp_mask);
 void ida_simple_remove(struct ida *ida, unsigned int id);
 
+static inline void ida_init(struct ida *ida)
+{
+	INIT_RADIX_TREE(&ida->ida_rt, IDR_RT_MARKER | GFP_NOWAIT);
+	ida->free_bitmap = NULL;
+}
+
 /**
  * ida_get_new - allocate new ID
  * @ida:	idr handle
@@ -220,11 +208,8 @@ static inline int ida_get_new(struct ida *ida, int *p_id)
 	return ida_get_new_above(ida, 0, p_id);
 }
 
-static inline bool ida_is_empty(struct ida *ida)
+static inline bool ida_is_empty(const struct ida *ida)
 {
-	return idr_is_empty(&ida->idr);
+	return radix_tree_empty(&ida->ida_rt);
 }
-
-void __init idr_init_cache(void);
-
 #endif /* __IDR_H__ */
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 05f715cb8062..2ba0c1f46c84 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -105,7 +105,10 @@ struct radix_tree_node {
 	unsigned long	tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
 };
 
-/* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */
+/* The top bits of gfp_mask are used to store the root tags and the IDR flag */
+#define ROOT_IS_IDR	((__force gfp_t)(1 << __GFP_BITS_SHIFT))
+#define ROOT_TAG_SHIFT	(__GFP_BITS_SHIFT + 1)
+
 struct radix_tree_root {
 	gfp_t			gfp_mask;
 	struct radix_tree_node	__rcu *rnode;
@@ -358,10 +361,14 @@ int radix_tree_split(struct radix_tree_root *, unsigned long index,
 			unsigned new_order);
 int radix_tree_join(struct radix_tree_root *, unsigned long index,
 			unsigned new_order, void *);
+void **idr_get_free(struct radix_tree_root *, struct radix_tree_iter *,
+			gfp_t, int end);
 
-#define RADIX_TREE_ITER_TAG_MASK	0x00FF	/* tag index in lower byte */
-#define RADIX_TREE_ITER_TAGGED		0x0100	/* lookup tagged slots */
-#define RADIX_TREE_ITER_CONTIG		0x0200	/* stop at first hole */
+enum {
+	RADIX_TREE_ITER_TAG_MASK = 0x0f,	/* tag index in lower nybble */
+	RADIX_TREE_ITER_TAGGED   = 0x10,	/* lookup tagged slots */
+	RADIX_TREE_ITER_CONTIG   = 0x20,	/* stop at first hole */
+};
 
 /**
  * radix_tree_iter_init - initialize radix tree iterator
@@ -402,6 +409,40 @@ radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start)
 void **radix_tree_next_chunk(const struct radix_tree_root *,
 			     struct radix_tree_iter *iter, unsigned flags);
 
+/**
+ * radix_tree_iter_lookup - look up an index in the radix tree
+ * @root: radix tree root
+ * @iter: iterator state
+ * @index: key to look up
+ *
+ * If @index is present in the radix tree, this function returns the slot
+ * containing it and updates @iter to describe the entry.  If @index is not
+ * present, it returns NULL.
+ */
+static inline void **radix_tree_iter_lookup(const struct radix_tree_root *root,
+			struct radix_tree_iter *iter, unsigned long index)
+{
+	radix_tree_iter_init(iter, index);
+	return radix_tree_next_chunk(root, iter, RADIX_TREE_ITER_CONTIG);
+}
+
+/**
+ * radix_tree_iter_find - find a present entry
+ * @root: radix tree root
+ * @iter: iterator state
+ * @index: start location
+ *
+ * This function returns the slot containing the entry with the lowest index
+ * which is at least @index.  If @index is larger than any present entry, this
+ * function returns NULL.  The @iter is updated to describe the entry found.
+ */
+static inline void **radix_tree_iter_find(const struct radix_tree_root *root,
+			struct radix_tree_iter *iter, unsigned long index)
+{
+	radix_tree_iter_init(iter, index);
+	return radix_tree_next_chunk(root, iter, 0);
+}
+
 /**
  * radix_tree_iter_retry - retry this chunk of the iteration
  * @iter:	iterator state
diff --git a/init/main.c b/init/main.c
index b0c9d6facef9..a65e3aad31bc 100644
--- a/init/main.c
+++ b/init/main.c
@@ -553,7 +553,7 @@ asmlinkage __visible void __init start_kernel(void)
 	if (WARN(!irqs_disabled(),
 		 "Interrupts were enabled *very* early, fixing it\n"))
 		local_irq_disable();
-	idr_init_cache();
+	radix_tree_init();
 
 	/*
 	 * Allow workqueue creation and work item queueing/cancelling
@@ -568,7 +568,6 @@ asmlinkage __visible void __init start_kernel(void)
 	trace_init();
 
 	context_tracking_init();
-	radix_tree_init();
 	/* init some links before init_ISA_irqs() */
 	early_irq_init();
 	init_IRQ();
diff --git a/lib/idr.c b/lib/idr.c
index 52d2979a05e8..b87056e2cc4c 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -1,1068 +1,369 @@
-/*
- * 2002-10-18  written by Jim Houston jim.houston@ccur.com
- *	Copyright (C) 2002 by Concurrent Computer Corporation
- *	Distributed under the GNU GPL license version 2.
- *
- * Modified by George Anzinger to reuse immediately and to use
- * find bit instructions.  Also removed _irq on spinlocks.
- *
- * Modified by Nadia Derbey to make it RCU safe.
- *
- * Small id to pointer translation service.
- *
- * It uses a radix tree like structure as a sparse array indexed
- * by the id to obtain the pointer.  The bitmap makes allocating
- * a new id quick.
- *
- * You call it to allocate an id (an int) an associate with that id a
- * pointer or what ever, we treat it as a (void *).  You can pass this
- * id to a user for him to pass back at a later time.  You then pass
- * that id to this code and it returns your pointer.
- */
-
-#ifndef TEST                        // to test in user space...
-#include <linux/slab.h>
-#include <linux/init.h>
+#include <linux/bitmap.h>
 #include <linux/export.h>
-#endif
-#include <linux/err.h>
-#include <linux/string.h>
 #include <linux/idr.h>
+#include <linux/slab.h>
 #include <linux/spinlock.h>
-#include <linux/percpu.h>
 
-#define MAX_IDR_SHIFT		(sizeof(int) * 8 - 1)
-#define MAX_IDR_BIT		(1U << MAX_IDR_SHIFT)
-
-/* Leave the possibility of an incomplete final layer */
-#define MAX_IDR_LEVEL ((MAX_IDR_SHIFT + IDR_BITS - 1) / IDR_BITS)
-
-/* Number of id_layer structs to leave in free list */
-#define MAX_IDR_FREE (MAX_IDR_LEVEL * 2)
-
-static struct kmem_cache *idr_layer_cache;
-static DEFINE_PER_CPU(struct idr_layer *, idr_preload_head);
-static DEFINE_PER_CPU(int, idr_preload_cnt);
 static DEFINE_SPINLOCK(simple_ida_lock);
 
-/* the maximum ID which can be allocated given idr->layers */
-static int idr_max(int layers)
-{
-	int bits = min_t(int, layers * IDR_BITS, MAX_IDR_SHIFT);
-
-	return (1 << bits) - 1;
-}
-
-/*
- * Prefix mask for an idr_layer at @layer.  For layer 0, the prefix mask is
- * all bits except for the lower IDR_BITS.  For layer 1, 2 * IDR_BITS, and
- * so on.
- */
-static int idr_layer_prefix_mask(int layer)
-{
-	return ~idr_max(layer + 1);
-}
-
-static struct idr_layer *get_from_free_list(struct idr *idp)
-{
-	struct idr_layer *p;
-	unsigned long flags;
-
-	spin_lock_irqsave(&idp->lock, flags);
-	if ((p = idp->id_free)) {
-		idp->id_free = p->ary[0];
-		idp->id_free_cnt--;
-		p->ary[0] = NULL;
-	}
-	spin_unlock_irqrestore(&idp->lock, flags);
-	return(p);
-}
-
-/**
- * idr_layer_alloc - allocate a new idr_layer
- * @gfp_mask: allocation mask
- * @layer_idr: optional idr to allocate from
- *
- * If @layer_idr is %NULL, directly allocate one using @gfp_mask or fetch
- * one from the per-cpu preload buffer.  If @layer_idr is not %NULL, fetch
- * an idr_layer from @idr->id_free.
- *
- * @layer_idr is to maintain backward compatibility with the old alloc
- * interface - idr_pre_get() and idr_get_new*() - and will be removed
- * together with per-pool preload buffer.
- */
-static struct idr_layer *idr_layer_alloc(gfp_t gfp_mask, struct idr *layer_idr)
-{
-	struct idr_layer *new;
-
-	/* this is the old path, bypass to get_from_free_list() */
-	if (layer_idr)
-		return get_from_free_list(layer_idr);
-
-	/*
-	 * Try to allocate directly from kmem_cache.  We want to try this
-	 * before preload buffer; otherwise, non-preloading idr_alloc()
-	 * users will end up taking advantage of preloading ones.  As the
-	 * following is allowed to fail for preloaded cases, suppress
-	 * warning this time.
-	 */
-	new = kmem_cache_zalloc(idr_layer_cache, gfp_mask | __GFP_NOWARN);
-	if (new)
-		return new;
-
-	/*
-	 * Try to fetch one from the per-cpu preload buffer if in process
-	 * context.  See idr_preload() for details.
-	 */
-	if (!in_interrupt()) {
-		preempt_disable();
-		new = __this_cpu_read(idr_preload_head);
-		if (new) {
-			__this_cpu_write(idr_preload_head, new->ary[0]);
-			__this_cpu_dec(idr_preload_cnt);
-			new->ary[0] = NULL;
-		}
-		preempt_enable();
-		if (new)
-			return new;
-	}
-
-	/*
-	 * Both failed.  Try kmem_cache again w/o adding __GFP_NOWARN so
-	 * that memory allocation failure warning is printed as intended.
-	 */
-	return kmem_cache_zalloc(idr_layer_cache, gfp_mask);
-}
-
-static void idr_layer_rcu_free(struct rcu_head *head)
-{
-	struct idr_layer *layer;
-
-	layer = container_of(head, struct idr_layer, rcu_head);
-	kmem_cache_free(idr_layer_cache, layer);
-}
-
-static inline void free_layer(struct idr *idr, struct idr_layer *p)
-{
-	if (idr->hint == p)
-		RCU_INIT_POINTER(idr->hint, NULL);
-	call_rcu(&p->rcu_head, idr_layer_rcu_free);
-}
-
-/* only called when idp->lock is held */
-static void __move_to_free_list(struct idr *idp, struct idr_layer *p)
-{
-	p->ary[0] = idp->id_free;
-	idp->id_free = p;
-	idp->id_free_cnt++;
-}
-
-static void move_to_free_list(struct idr *idp, struct idr_layer *p)
-{
-	unsigned long flags;
-
-	/*
-	 * Depends on the return element being zeroed.
-	 */
-	spin_lock_irqsave(&idp->lock, flags);
-	__move_to_free_list(idp, p);
-	spin_unlock_irqrestore(&idp->lock, flags);
-}
-
-static void idr_mark_full(struct idr_layer **pa, int id)
-{
-	struct idr_layer *p = pa[0];
-	int l = 0;
-
-	__set_bit(id & IDR_MASK, p->bitmap);
-	/*
-	 * If this layer is full mark the bit in the layer above to
-	 * show that this part of the radix tree is full.  This may
-	 * complete the layer above and require walking up the radix
-	 * tree.
-	 */
-	while (bitmap_full(p->bitmap, IDR_SIZE)) {
-		if (!(p = pa[++l]))
-			break;
-		id = id >> IDR_BITS;
-		__set_bit((id & IDR_MASK), p->bitmap);
-	}
-}
-
-static int __idr_pre_get(struct idr *idp, gfp_t gfp_mask)
-{
-	while (idp->id_free_cnt < MAX_IDR_FREE) {
-		struct idr_layer *new;
-		new = kmem_cache_zalloc(idr_layer_cache, gfp_mask);
-		if (new == NULL)
-			return (0);
-		move_to_free_list(idp, new);
-	}
-	return 1;
-}
-
-/**
- * sub_alloc - try to allocate an id without growing the tree depth
- * @idp: idr handle
- * @starting_id: id to start search at
- * @pa: idr_layer[MAX_IDR_LEVEL] used as backtrack buffer
- * @gfp_mask: allocation mask for idr_layer_alloc()
- * @layer_idr: optional idr passed to idr_layer_alloc()
- *
- * Allocate an id in range [@starting_id, INT_MAX] from @idp without
- * growing its depth.  Returns
- *
- *  the allocated id >= 0 if successful,
- *  -EAGAIN if the tree needs to grow for allocation to succeed,
- *  -ENOSPC if the id space is exhausted,
- *  -ENOMEM if more idr_layers need to be allocated.
- */
-static int sub_alloc(struct idr *idp, int *starting_id, struct idr_layer **pa,
-		     gfp_t gfp_mask, struct idr *layer_idr)
-{
-	int n, m, sh;
-	struct idr_layer *p, *new;
-	int l, id, oid;
-
-	id = *starting_id;
- restart:
-	p = idp->top;
-	l = idp->layers;
-	pa[l--] = NULL;
-	while (1) {
-		/*
-		 * We run around this while until we reach the leaf node...
-		 */
-		n = (id >> (IDR_BITS*l)) & IDR_MASK;
-		m = find_next_zero_bit(p->bitmap, IDR_SIZE, n);
-		if (m == IDR_SIZE) {
-			/* no space available go back to previous layer. */
-			l++;
-			oid = id;
-			id = (id | ((1 << (IDR_BITS * l)) - 1)) + 1;
-
-			/* if already at the top layer, we need to grow */
-			if (id > idr_max(idp->layers)) {
-				*starting_id = id;
-				return -EAGAIN;
-			}
-			p = pa[l];
-			BUG_ON(!p);
-
-			/* If we need to go up one layer, continue the
-			 * loop; otherwise, restart from the top.
-			 */
-			sh = IDR_BITS * (l + 1);
-			if (oid >> sh == id >> sh)
-				continue;
-			else
-				goto restart;
-		}
-		if (m != n) {
-			sh = IDR_BITS*l;
-			id = ((id >> sh) ^ n ^ m) << sh;
-		}
-		if ((id >= MAX_IDR_BIT) || (id < 0))
-			return -ENOSPC;
-		if (l == 0)
-			break;
-		/*
-		 * Create the layer below if it is missing.
-		 */
-		if (!p->ary[m]) {
-			new = idr_layer_alloc(gfp_mask, layer_idr);
-			if (!new)
-				return -ENOMEM;
-			new->layer = l-1;
-			new->prefix = id & idr_layer_prefix_mask(new->layer);
-			rcu_assign_pointer(p->ary[m], new);
-			p->count++;
-		}
-		pa[l--] = p;
-		p = p->ary[m];
-	}
-
-	pa[l] = p;
-	return id;
-}
-
-static int idr_get_empty_slot(struct idr *idp, int starting_id,
-			      struct idr_layer **pa, gfp_t gfp_mask,
-			      struct idr *layer_idr)
-{
-	struct idr_layer *p, *new;
-	int layers, v, id;
-	unsigned long flags;
-
-	id = starting_id;
-build_up:
-	p = idp->top;
-	layers = idp->layers;
-	if (unlikely(!p)) {
-		if (!(p = idr_layer_alloc(gfp_mask, layer_idr)))
-			return -ENOMEM;
-		p->layer = 0;
-		layers = 1;
-	}
-	/*
-	 * Add a new layer to the top of the tree if the requested
-	 * id is larger than the currently allocated space.
-	 */
-	while (id > idr_max(layers)) {
-		layers++;
-		if (!p->count) {
-			/* special case: if the tree is currently empty,
-			 * then we grow the tree by moving the top node
-			 * upwards.
-			 */
-			p->layer++;
-			WARN_ON_ONCE(p->prefix);
-			continue;
-		}
-		if (!(new = idr_layer_alloc(gfp_mask, layer_idr))) {
-			/*
-			 * The allocation failed.  If we built part of
-			 * the structure tear it down.
-			 */
-			spin_lock_irqsave(&idp->lock, flags);
-			for (new = p; p && p != idp->top; new = p) {
-				p = p->ary[0];
-				new->ary[0] = NULL;
-				new->count = 0;
-				bitmap_clear(new->bitmap, 0, IDR_SIZE);
-				__move_to_free_list(idp, new);
-			}
-			spin_unlock_irqrestore(&idp->lock, flags);
-			return -ENOMEM;
-		}
-		new->ary[0] = p;
-		new->count = 1;
-		new->layer = layers-1;
-		new->prefix = id & idr_layer_prefix_mask(new->layer);
-		if (bitmap_full(p->bitmap, IDR_SIZE))
-			__set_bit(0, new->bitmap);
-		p = new;
-	}
-	rcu_assign_pointer(idp->top, p);
-	idp->layers = layers;
-	v = sub_alloc(idp, &id, pa, gfp_mask, layer_idr);
-	if (v == -EAGAIN)
-		goto build_up;
-	return(v);
-}
-
-/*
- * @id and @pa are from a successful allocation from idr_get_empty_slot().
- * Install the user pointer @ptr and mark the slot full.
- */
-static void idr_fill_slot(struct idr *idr, void *ptr, int id,
-			  struct idr_layer **pa)
-{
-	/* update hint used for lookup, cleared from free_layer() */
-	rcu_assign_pointer(idr->hint, pa[0]);
-
-	rcu_assign_pointer(pa[0]->ary[id & IDR_MASK], (struct idr_layer *)ptr);
-	pa[0]->count++;
-	idr_mark_full(pa, id);
-}
-
-
-/**
- * idr_preload - preload for idr_alloc()
- * @gfp_mask: allocation mask to use for preloading
- *
- * Preload per-cpu layer buffer for idr_alloc().  Can only be used from
- * process context and each idr_preload() invocation should be matched with
- * idr_preload_end().  Note that preemption is disabled while preloaded.
- *
- * The first idr_alloc() in the preloaded section can be treated as if it
- * were invoked with @gfp_mask used for preloading.  This allows using more
- * permissive allocation masks for idrs protected by spinlocks.
- *
- * For example, if idr_alloc() below fails, the failure can be treated as
- * if idr_alloc() were called with GFP_KERNEL rather than GFP_NOWAIT.
- *
- *	idr_preload(GFP_KERNEL);
- *	spin_lock(lock);
- *
- *	id = idr_alloc(idr, ptr, start, end, GFP_NOWAIT);
- *
- *	spin_unlock(lock);
- *	idr_preload_end();
- *	if (id < 0)
- *		error;
- */
-void idr_preload(gfp_t gfp_mask)
-{
-	/*
-	 * Consuming preload buffer from non-process context breaks preload
-	 * allocation guarantee.  Disallow usage from those contexts.
-	 */
-	WARN_ON_ONCE(in_interrupt());
-	might_sleep_if(gfpflags_allow_blocking(gfp_mask));
-
-	preempt_disable();
-
-	/*
-	 * idr_alloc() is likely to succeed w/o full idr_layer buffer and
-	 * return value from idr_alloc() needs to be checked for failure
-	 * anyway.  Silently give up if allocation fails.  The caller can
-	 * treat failures from idr_alloc() as if idr_alloc() were called
-	 * with @gfp_mask which should be enough.
-	 */
-	while (__this_cpu_read(idr_preload_cnt) < MAX_IDR_FREE) {
-		struct idr_layer *new;
-
-		preempt_enable();
-		new = kmem_cache_zalloc(idr_layer_cache, gfp_mask);
-		preempt_disable();
-		if (!new)
-			break;
-
-		/* link the new one to per-cpu preload list */
-		new->ary[0] = __this_cpu_read(idr_preload_head);
-		__this_cpu_write(idr_preload_head, new);
-		__this_cpu_inc(idr_preload_cnt);
-	}
-}
-EXPORT_SYMBOL(idr_preload);
-
 /**
- * idr_alloc - allocate new idr entry
- * @idr: the (initialized) idr
+ * idr_alloc - allocate an id
+ * @idr: idr handle
  * @ptr: pointer to be associated with the new id
  * @start: the minimum id (inclusive)
- * @end: the maximum id (exclusive, <= 0 for max)
- * @gfp_mask: memory allocation flags
+ * @end: the maximum id (exclusive)
+ * @gfp: memory allocation flags
  *
- * Allocate an id in [start, end) and associate it with @ptr.  If no ID is
- * available in the specified range, returns -ENOSPC.  On memory allocation
- * failure, returns -ENOMEM.
+ * Allocates an unused ID in the range [start, end).  Returns -ENOSPC
+ * if there are no unused IDs in that range.
  *
  * Note that @end is treated as max when <= 0.  This is to always allow
  * using @start + N as @end as long as N is inside integer range.
  *
- * The user is responsible for exclusively synchronizing all operations
- * which may modify @idr.  However, read-only accesses such as idr_find()
- * or iteration can be performed under RCU read lock provided the user
- * destroys @ptr in RCU-safe way after removal from idr.
+ * Simultaneous modifications to the @idr are not allowed and should be
+ * prevented by the user, usually with a lock.  idr_alloc() may be called
+ * concurrently with read-only accesses to the @idr, such as idr_find() and
+ * idr_for_each_entry().
  */
-int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask)
+int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp)
 {
-	int max = end > 0 ? end - 1 : INT_MAX;	/* inclusive upper limit */
-	struct idr_layer *pa[MAX_IDR_LEVEL + 1];
-	int id;
-
-	might_sleep_if(gfpflags_allow_blocking(gfp_mask));
+	void **slot;
+	struct radix_tree_iter iter;
 
-	/* sanity checks */
 	if (WARN_ON_ONCE(start < 0))
 		return -EINVAL;
-	if (unlikely(max < start))
-		return -ENOSPC;
+	if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr)))
+		return -EINVAL;
 
-	/* allocate id */
-	id = idr_get_empty_slot(idr, start, pa, gfp_mask, NULL);
-	if (unlikely(id < 0))
-		return id;
-	if (unlikely(id > max))
-		return -ENOSPC;
+	radix_tree_iter_init(&iter, start);
+	slot = idr_get_free(&idr->idr_rt, &iter, gfp, end);
+	if (IS_ERR(slot))
+		return PTR_ERR(slot);
 
-	idr_fill_slot(idr, ptr, id, pa);
-	return id;
+	radix_tree_iter_replace(&idr->idr_rt, &iter, slot, ptr);
+	radix_tree_iter_tag_clear(&idr->idr_rt, &iter, IDR_FREE);
+	return iter.index;
 }
 EXPORT_SYMBOL_GPL(idr_alloc);
 
 /**
  * idr_alloc_cyclic - allocate new idr entry in a cyclical fashion
- * @idr: the (initialized) idr
+ * @idr: idr handle
  * @ptr: pointer to be associated with the new id
  * @start: the minimum id (inclusive)
- * @end: the maximum id (exclusive, <= 0 for max)
- * @gfp_mask: memory allocation flags
- *
- * Essentially the same as idr_alloc, but prefers to allocate progressively
- * higher ids if it can. If the "cur" counter wraps, then it will start again
- * at the "start" end of the range and allocate one that has already been used.
- */
-int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end,
-			gfp_t gfp_mask)
-{
-	int id;
-
-	id = idr_alloc(idr, ptr, max(start, idr->cur), end, gfp_mask);
-	if (id == -ENOSPC)
-		id = idr_alloc(idr, ptr, start, end, gfp_mask);
-
-	if (likely(id >= 0))
-		idr->cur = id + 1;
-	return id;
-}
-EXPORT_SYMBOL(idr_alloc_cyclic);
-
-static void idr_remove_warning(int id)
-{
-	WARN(1, "idr_remove called for id=%d which is not allocated.\n", id);
-}
-
-static void sub_remove(struct idr *idp, int shift, int id)
-{
-	struct idr_layer *p = idp->top;
-	struct idr_layer **pa[MAX_IDR_LEVEL + 1];
-	struct idr_layer ***paa = &pa[0];
-	struct idr_layer *to_free;
-	int n;
-
-	*paa = NULL;
-	*++paa = &idp->top;
-
-	while ((shift > 0) && p) {
-		n = (id >> shift) & IDR_MASK;
-		__clear_bit(n, p->bitmap);
-		*++paa = &p->ary[n];
-		p = p->ary[n];
-		shift -= IDR_BITS;
-	}
-	n = id & IDR_MASK;
-	if (likely(p != NULL && test_bit(n, p->bitmap))) {
-		__clear_bit(n, p->bitmap);
-		RCU_INIT_POINTER(p->ary[n], NULL);
-		to_free = NULL;
-		while(*paa && ! --((**paa)->count)){
-			if (to_free)
-				free_layer(idp, to_free);
-			to_free = **paa;
-			**paa-- = NULL;
-		}
-		if (!*paa)
-			idp->layers = 0;
-		if (to_free)
-			free_layer(idp, to_free);
-	} else
-		idr_remove_warning(id);
-}
-
-/**
- * idr_remove - remove the given id and free its slot
- * @idp: idr handle
- * @id: unique key
- */
-void idr_remove(struct idr *idp, int id)
-{
-	struct idr_layer *p;
-	struct idr_layer *to_free;
-
-	if (id < 0)
-		return;
-
-	if (id > idr_max(idp->layers)) {
-		idr_remove_warning(id);
-		return;
-	}
-
-	sub_remove(idp, (idp->layers - 1) * IDR_BITS, id);
-	if (idp->top && idp->top->count == 1 && (idp->layers > 1) &&
-	    idp->top->ary[0]) {
-		/*
-		 * Single child at leftmost slot: we can shrink the tree.
-		 * This level is not needed anymore since when layers are
-		 * inserted, they are inserted at the top of the existing
-		 * tree.
-		 */
-		to_free = idp->top;
-		p = idp->top->ary[0];
-		rcu_assign_pointer(idp->top, p);
-		--idp->layers;
-		to_free->count = 0;
-		bitmap_clear(to_free->bitmap, 0, IDR_SIZE);
-		free_layer(idp, to_free);
-	}
-}
-EXPORT_SYMBOL(idr_remove);
-
-static void __idr_remove_all(struct idr *idp)
-{
-	int n, id, max;
-	int bt_mask;
-	struct idr_layer *p;
-	struct idr_layer *pa[MAX_IDR_LEVEL + 1];
-	struct idr_layer **paa = &pa[0];
-
-	n = idp->layers * IDR_BITS;
-	*paa = idp->top;
-	RCU_INIT_POINTER(idp->top, NULL);
-	max = idr_max(idp->layers);
-
-	id = 0;
-	while (id >= 0 && id <= max) {
-		p = *paa;
-		while (n > IDR_BITS && p) {
-			n -= IDR_BITS;
-			p = p->ary[(id >> n) & IDR_MASK];
-			*++paa = p;
-		}
-
-		bt_mask = id;
-		id += 1 << n;
-		/* Get the highest bit that the above add changed from 0->1. */
-		while (n < fls(id ^ bt_mask)) {
-			if (*paa)
-				free_layer(idp, *paa);
-			n += IDR_BITS;
-			--paa;
-		}
-	}
-	idp->layers = 0;
-}
-
-/**
- * idr_destroy - release all cached layers within an idr tree
- * @idp: idr handle
- *
- * Free all id mappings and all idp_layers.  After this function, @idp is
- * completely unused and can be freed / recycled.  The caller is
- * responsible for ensuring that no one else accesses @idp during or after
- * idr_destroy().
+ * @end: the maximum id (exclusive)
+ * @gfp: memory allocation flags
  *
- * A typical clean-up sequence for objects stored in an idr tree will use
- * idr_for_each() to free all objects, if necessary, then idr_destroy() to
- * free up the id mappings and cached idr_layers.
+ * Allocates an ID larger than the last ID allocated if one is available.
+ * If not, it will attempt to allocate the smallest ID that is larger or
+ * equal to @start.
  */
-void idr_destroy(struct idr *idp)
-{
-	__idr_remove_all(idp);
-
-	while (idp->id_free_cnt) {
-		struct idr_layer *p = get_from_free_list(idp);
-		kmem_cache_free(idr_layer_cache, p);
-	}
-}
-EXPORT_SYMBOL(idr_destroy);
-
-void *idr_find_slowpath(struct idr *idp, int id)
+int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, gfp_t gfp)
 {
-	int n;
-	struct idr_layer *p;
+	int id, curr = idr->idr_next;
 
-	if (id < 0)
-		return NULL;
+	if (curr < start)
+		curr = start;
 
-	p = rcu_dereference_raw(idp->top);
-	if (!p)
-		return NULL;
-	n = (p->layer+1) * IDR_BITS;
+	id = idr_alloc(idr, ptr, curr, end, gfp);
+	if ((id == -ENOSPC) && (curr > start))
+		id = idr_alloc(idr, ptr, start, curr, gfp);
 
-	if (id > idr_max(p->layer + 1))
-		return NULL;
-	BUG_ON(n == 0);
+	if (id >= 0)
+		idr->idr_next = id + 1U;
 
-	while (n > 0 && p) {
-		n -= IDR_BITS;
-		BUG_ON(n != p->layer*IDR_BITS);
-		p = rcu_dereference_raw(p->ary[(id >> n) & IDR_MASK]);
-	}
-	return((void *)p);
+	return id;
 }
-EXPORT_SYMBOL(idr_find_slowpath);
+EXPORT_SYMBOL(idr_alloc_cyclic);
 
 /**
  * idr_for_each - iterate through all stored pointers
- * @idp: idr handle
+ * @idr: idr handle
  * @fn: function to be called for each pointer
- * @data: data passed back to callback function
+ * @data: data passed to callback function
  *
- * Iterate over the pointers registered with the given idr.  The
- * callback function will be called for each pointer currently
- * registered, passing the id, the pointer and the data pointer passed
- * to this function.  It is not safe to modify the idr tree while in
- * the callback, so functions such as idr_get_new and idr_remove are
- * not allowed.
+ * The callback function will be called for each entry in @idr, passing
+ * the id, the pointer and the data pointer passed to this function.
  *
- * We check the return of @fn each time. If it returns anything other
- * than %0, we break out and return that value.
+ * If @fn returns anything other than %0, the iteration stops and that
+ * value is returned from this function.
  *
- * The caller must serialize idr_for_each() vs idr_get_new() and idr_remove().
+ * idr_for_each() can be called concurrently with idr_alloc() and
+ * idr_remove() if protected by RCU.  Newly added entries may not be
+ * seen and deleted entries may be seen, but adding and removing entries
+ * will not cause other entries to be skipped, nor spurious ones to be seen.
  */
-int idr_for_each(struct idr *idp,
-		 int (*fn)(int id, void *p, void *data), void *data)
+int idr_for_each(const struct idr *idr,
+		int (*fn)(int id, void *p, void *data), void *data)
 {
-	int n, id, max, error = 0;
-	struct idr_layer *p;
-	struct idr_layer *pa[MAX_IDR_LEVEL + 1];
-	struct idr_layer **paa = &pa[0];
-
-	n = idp->layers * IDR_BITS;
-	*paa = rcu_dereference_raw(idp->top);
-	max = idr_max(idp->layers);
-
-	id = 0;
-	while (id >= 0 && id <= max) {
-		p = *paa;
-		while (n > 0 && p) {
-			n -= IDR_BITS;
-			p = rcu_dereference_raw(p->ary[(id >> n) & IDR_MASK]);
-			*++paa = p;
-		}
-
-		if (p) {
-			error = fn(id, (void *)p, data);
-			if (error)
-				break;
-		}
+	struct radix_tree_iter iter;
+	void **slot;
 
-		id += 1 << n;
-		while (n < fls(id)) {
-			n += IDR_BITS;
-			--paa;
-		}
+	radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, 0) {
+		int ret = fn(iter.index, rcu_dereference_raw(*slot), data);
+		if (ret)
+			return ret;
 	}
 
-	return error;
+	return 0;
 }
 EXPORT_SYMBOL(idr_for_each);
 
 /**
- * idr_get_next - lookup next object of id to given id.
- * @idp: idr handle
- * @nextidp:  pointer to lookup key
- *
- * Returns pointer to registered object with id, which is next number to
- * given id. After being looked up, *@nextidp will be updated for the next
- * iteration.
- *
- * This function can be called under rcu_read_lock(), given that the leaf
- * pointers lifetimes are correctly managed.
+ * idr_get_next - Find next populated entry
+ * @idr: idr handle
+ * @nextid: Pointer to lowest possible ID to return
+ *
+ * Returns the next populated entry in the tree with an ID greater than
+ * or equal to the value pointed to by @nextid.  On exit, @nextid is updated
+ * to the ID of the found value.  To use in a loop, the value pointed to by
+ * nextid must be incremented by the user.
  */
-void *idr_get_next(struct idr *idp, int *nextidp)
+void *idr_get_next(struct idr *idr, int *nextid)
 {
-	struct idr_layer *p, *pa[MAX_IDR_LEVEL + 1];
-	struct idr_layer **paa = &pa[0];
-	int id = *nextidp;
-	int n, max;
+	struct radix_tree_iter iter;
+	void **slot;
 
-	/* find first ent */
-	p = *paa = rcu_dereference_raw(idp->top);
-	if (!p)
+	slot = radix_tree_iter_find(&idr->idr_rt, &iter, *nextid);
+	if (!slot)
 		return NULL;
-	n = (p->layer + 1) * IDR_BITS;
-	max = idr_max(p->layer + 1);
-
-	while (id >= 0 && id <= max) {
-		p = *paa;
-		while (n > 0 && p) {
-			n -= IDR_BITS;
-			p = rcu_dereference_raw(p->ary[(id >> n) & IDR_MASK]);
-			*++paa = p;
-		}
 
-		if (p) {
-			*nextidp = id;
-			return p;
-		}
-
-		/*
-		 * Proceed to the next layer at the current level.  Unlike
-		 * idr_for_each(), @id isn't guaranteed to be aligned to
-		 * layer boundary at this point and adding 1 << n may
-		 * incorrectly skip IDs.  Make sure we jump to the
-		 * beginning of the next layer using round_up().
-		 */
-		id = round_up(id + 1, 1 << n);
-		while (n < fls(id)) {
-			n += IDR_BITS;
-			--paa;
-		}
-	}
-	return NULL;
+	*nextid = iter.index;
+	return rcu_dereference_raw(*slot);
 }
 EXPORT_SYMBOL(idr_get_next);
 
-
 /**
  * idr_replace - replace pointer for given id
- * @idp: idr handle
- * @ptr: pointer you want associated with the id
- * @id: lookup key
+ * @idr: idr handle
+ * @ptr: New pointer to associate with the ID
+ * @id: Lookup key
  *
- * Replace the pointer registered with an id and return the old value.
- * A %-ENOENT return indicates that @id was not found.
- * A %-EINVAL return indicates that @id was not within valid constraints.
+ * Replace the pointer registered with an ID and return the old value.
+ * This function can be called under the RCU read lock concurrently with
+ * idr_alloc() and idr_remove() (as long as the ID being removed is not
+ * the one being replaced!).
  *
- * The caller must serialize with writers.
+ * Returns: 0 on success.  %-ENOENT indicates that @id was not found.
+ * %-EINVAL indicates that @id or @ptr were not valid.
  */
-void *idr_replace(struct idr *idp, void *ptr, int id)
+void *idr_replace(struct idr *idr, void *ptr, int id)
 {
-	int n;
-	struct idr_layer *p, *old_p;
+	struct radix_tree_node *node;
+	void **slot = NULL;
+	void *entry;
 
-	if (id < 0)
+	if (WARN_ON_ONCE(id < 0))
+		return ERR_PTR(-EINVAL);
+	if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr)))
 		return ERR_PTR(-EINVAL);
 
-	p = idp->top;
-	if (!p)
-		return ERR_PTR(-ENOENT);
-
-	if (id > idr_max(p->layer + 1))
-		return ERR_PTR(-ENOENT);
-
-	n = p->layer * IDR_BITS;
-	while ((n > 0) && p) {
-		p = p->ary[(id >> n) & IDR_MASK];
-		n -= IDR_BITS;
-	}
-
-	n = id & IDR_MASK;
-	if (unlikely(p == NULL || !test_bit(n, p->bitmap)))
+	entry = __radix_tree_lookup(&idr->idr_rt, id, &node, &slot);
+	if (!slot || radix_tree_tag_get(&idr->idr_rt, id, IDR_FREE))
 		return ERR_PTR(-ENOENT);
 
-	old_p = p->ary[n];
-	rcu_assign_pointer(p->ary[n], ptr);
+	__radix_tree_replace(&idr->idr_rt, node, slot, ptr, NULL, NULL);
 
-	return old_p;
+	return entry;
 }
 EXPORT_SYMBOL(idr_replace);
 
-void __init idr_init_cache(void)
-{
-	idr_layer_cache = kmem_cache_create("idr_layer_cache",
-				sizeof(struct idr_layer), 0, SLAB_PANIC, NULL);
-}
-
-/**
- * idr_init - initialize idr handle
- * @idp:	idr handle
- *
- * This function is use to set up the handle (@idp) that you will pass
- * to the rest of the functions.
- */
-void idr_init(struct idr *idp)
-{
-	memset(idp, 0, sizeof(struct idr));
-	spin_lock_init(&idp->lock);
-}
-EXPORT_SYMBOL(idr_init);
-
-static int idr_has_entry(int id, void *p, void *data)
-{
-	return 1;
-}
-
-bool idr_is_empty(struct idr *idp)
-{
-	return !idr_for_each(idp, idr_has_entry, NULL);
-}
-EXPORT_SYMBOL(idr_is_empty);
-
 /**
  * DOC: IDA description
- * IDA - IDR based ID allocator
- *
- * This is id allocator without id -> pointer translation.  Memory
- * usage is much lower than full blown idr because each id only
- * occupies a bit.  ida uses a custom leaf node which contains
- * IDA_BITMAP_BITS slots.
  *
- * 2007-04-25  written by Tejun Heo <htejun@gmail.com>
+ * The IDA is an ID allocator which does not provide the ability to
+ * associate an ID with a pointer.  As such, it only needs to store one
+ * bit per ID, and so is more space efficient than an IDR.  To use an IDA,
+ * define it using DEFINE_IDA() (or embed a &struct ida in a data structure,
+ * then initialise it using ida_init()).  To allocate a new ID, call
+ * ida_simple_get().  To free an ID, call ida_simple_remove().
+ *
+ * If you have more complex locking requirements, use a loop around
+ * ida_pre_get() and ida_get_new() to allocate a new ID.  Then use
+ * ida_remove() to free an ID.  You must make sure that ida_get_new() and
+ * ida_remove() cannot be called at the same time as each other for the
+ * same IDA.
+ *
+ * You can also use ida_get_new_above() if you need an ID to be allocated
+ * above a particular number.  ida_destroy() can be used to dispose of an
+ * IDA without needing to free the individual IDs in it.  You can use
+ * ida_is_empty() to find out whether the IDA has any IDs currently allocated.
+ *
+ * IDs are currently limited to the range [0-INT_MAX].  If this is an awkward
+ * limitation, it should be quite straightforward to raise the maximum.
  */
 
-static void free_bitmap(struct ida *ida, struct ida_bitmap *bitmap)
-{
-	unsigned long flags;
-
-	if (!ida->free_bitmap) {
-		spin_lock_irqsave(&ida->idr.lock, flags);
-		if (!ida->free_bitmap) {
-			ida->free_bitmap = bitmap;
-			bitmap = NULL;
-		}
-		spin_unlock_irqrestore(&ida->idr.lock, flags);
-	}
-
-	kfree(bitmap);
-}
-
 /**
  * ida_pre_get - reserve resources for ida allocation
- * @ida:	ida handle
- * @gfp_mask:	memory allocation flag
- *
- * This function should be called prior to locking and calling the
- * following function.  It preallocates enough memory to satisfy the
- * worst possible allocation.
+ * @ida: ida handle
+ * @gfp: memory allocation flags
  *
- * If the system is REALLY out of memory this function returns %0,
- * otherwise %1.
+ * This function should be called before calling ida_get_new_above().  If it
+ * is unable to allocate memory, it will return %0.  On success, it returns %1.
  */
-int ida_pre_get(struct ida *ida, gfp_t gfp_mask)
+int ida_pre_get(struct ida *ida, gfp_t gfp)
 {
-	/* allocate idr_layers */
-	if (!__idr_pre_get(&ida->idr, gfp_mask))
-		return 0;
+	struct ida_bitmap *bitmap;
 
-	/* allocate free_bitmap */
-	if (!ida->free_bitmap) {
-		struct ida_bitmap *bitmap;
+	/*
+	 * This looks weird, but the IDA API has no preload_end() equivalent.
+	 * Instead, ida_get_new() can return -EAGAIN, prompting the caller
+	 * to return to the ida_pre_get() step.
+	 */
+	idr_preload(gfp);
+	idr_preload_end();
 
-		bitmap = kmalloc(sizeof(struct ida_bitmap), gfp_mask);
+	if (!ida->free_bitmap) {
+		bitmap = kmalloc(sizeof(struct ida_bitmap), gfp);
 		if (!bitmap)
 			return 0;
-
-		free_bitmap(ida, bitmap);
+		bitmap = xchg(&ida->free_bitmap, bitmap);
+		kfree(bitmap);
 	}
 
 	return 1;
 }
 EXPORT_SYMBOL(ida_pre_get);
 
+#define IDA_MAX (0x80000000U / IDA_BITMAP_BITS)
+
 /**
  * ida_get_new_above - allocate new ID above or equal to a start id
- * @ida:	ida handle
- * @starting_id: id to start search at
- * @p_id:	pointer to the allocated handle
+ * @ida: ida handle
+ * @start: id to start search at
+ * @id: pointer to the allocated handle
  *
- * Allocate new ID above or equal to @starting_id.  It should be called
- * with any required locks.
+ * Allocate new ID above or equal to @start.  It should be called
+ * with any required locks to ensure that concurrent calls to
+ * ida_get_new_above() / ida_get_new() / ida_remove() are not allowed.
+ * Consider using ida_simple_get() if you do not have complex locking
+ * requirements.
  *
  * If memory is required, it will return %-EAGAIN, you should unlock
  * and go back to the ida_pre_get() call.  If the ida is full, it will
- * return %-ENOSPC.
- *
- * Note that callers must ensure that concurrent access to @ida is not possible.
- * See ida_simple_get() for a varaint which takes care of locking.
+ * return %-ENOSPC.  On success, it will return 0.
  *
- * @p_id returns a value in the range @starting_id ... %0x7fffffff.
+ * @id returns a value in the range @start ... %0x7fffffff.
  */
-int ida_get_new_above(struct ida *ida, int starting_id, int *p_id)
+int ida_get_new_above(struct ida *ida, int start, int *id)
 {
-	struct idr_layer *pa[MAX_IDR_LEVEL + 1];
+	struct radix_tree_root *root = &ida->ida_rt;
+	void **slot;
+	struct radix_tree_iter iter;
 	struct ida_bitmap *bitmap;
-	unsigned long flags;
-	int idr_id = starting_id / IDA_BITMAP_BITS;
-	int offset = starting_id % IDA_BITMAP_BITS;
-	int t, id;
-
- restart:
-	/* get vacant slot */
-	t = idr_get_empty_slot(&ida->idr, idr_id, pa, 0, &ida->idr);
-	if (t < 0)
-		return t == -ENOMEM ? -EAGAIN : t;
-
-	if (t * IDA_BITMAP_BITS >= MAX_IDR_BIT)
-		return -ENOSPC;
-
-	if (t != idr_id)
-		offset = 0;
-	idr_id = t;
-
-	/* if bitmap isn't there, create a new one */
-	bitmap = (void *)pa[0]->ary[idr_id & IDR_MASK];
-	if (!bitmap) {
-		spin_lock_irqsave(&ida->idr.lock, flags);
-		bitmap = ida->free_bitmap;
-		ida->free_bitmap = NULL;
-		spin_unlock_irqrestore(&ida->idr.lock, flags);
-
-		if (!bitmap)
-			return -EAGAIN;
-
-		memset(bitmap, 0, sizeof(struct ida_bitmap));
-		rcu_assign_pointer(pa[0]->ary[idr_id & IDR_MASK],
-				(void *)bitmap);
-		pa[0]->count++;
-	}
-
-	/* lookup for empty slot */
-	t = find_next_zero_bit(bitmap->bitmap, IDA_BITMAP_BITS, offset);
-	if (t == IDA_BITMAP_BITS) {
-		/* no empty slot after offset, continue to the next chunk */
-		idr_id++;
-		offset = 0;
-		goto restart;
-	}
-
-	id = idr_id * IDA_BITMAP_BITS + t;
-	if (id >= MAX_IDR_BIT)
-		return -ENOSPC;
-
-	__set_bit(t, bitmap->bitmap);
-	if (++bitmap->nr_busy == IDA_BITMAP_BITS)
-		idr_mark_full(pa, idr_id);
+	unsigned long index;
+	unsigned bit;
+	int new;
+
+	index = start / IDA_BITMAP_BITS;
+	bit = start % IDA_BITMAP_BITS;
+
+	slot = radix_tree_iter_init(&iter, index);
+	for (;;) {
+		if (slot)
+			slot = radix_tree_next_slot(slot, &iter,
+						RADIX_TREE_ITER_TAGGED);
+		if (!slot) {
+			slot = idr_get_free(root, &iter, GFP_NOWAIT, IDA_MAX);
+			if (IS_ERR(slot)) {
+				if (slot == ERR_PTR(-ENOMEM))
+					return -EAGAIN;
+				return PTR_ERR(slot);
+			}
+		}
+		if (iter.index > index)
+			bit = 0;
+		new = iter.index * IDA_BITMAP_BITS;
+		bitmap = rcu_dereference_raw(*slot);
+		if (bitmap) {
+			bit = find_next_zero_bit(bitmap->bitmap,
+							IDA_BITMAP_BITS, bit);
+			new += bit;
+			if (new < 0)
+				return -ENOSPC;
+			if (bit == IDA_BITMAP_BITS)
+				continue;
 
-	*p_id = id;
+			__set_bit(bit, bitmap->bitmap);
+			if (bitmap_full(bitmap->bitmap, IDA_BITMAP_BITS))
+				radix_tree_iter_tag_clear(root, &iter,
+								IDR_FREE);
+		} else {
+			new += bit;
+			if (new < 0)
+				return -ENOSPC;
+			bitmap = ida->free_bitmap;
+			if (!bitmap)
+				return -EAGAIN;
+			ida->free_bitmap = NULL;
+			memset(bitmap, 0, sizeof(*bitmap));
+			__set_bit(bit, bitmap->bitmap);
+			radix_tree_iter_replace(root, &iter, slot, bitmap);
+		}
 
-	/* Each leaf node can handle nearly a thousand slots and the
-	 * whole idea of ida is to have small memory foot print.
-	 * Throw away extra resources one by one after each successful
-	 * allocation.
-	 */
-	if (ida->idr.id_free_cnt || ida->free_bitmap) {
-		struct idr_layer *p = get_from_free_list(&ida->idr);
-		if (p)
-			kmem_cache_free(idr_layer_cache, p);
+		*id = new;
+		return 0;
 	}
-
-	return 0;
 }
 EXPORT_SYMBOL(ida_get_new_above);
 
 /**
- * ida_remove - remove the given ID
- * @ida:	ida handle
- * @id:		ID to free
+ * ida_remove - Free the given ID
+ * @ida: ida handle
+ * @id: ID to free
+ *
+ * This function should not be called at the same time as ida_get_new_above().
  */
 void ida_remove(struct ida *ida, int id)
 {
-	struct idr_layer *p = ida->idr.top;
-	int shift = (ida->idr.layers - 1) * IDR_BITS;
-	int idr_id = id / IDA_BITMAP_BITS;
-	int offset = id % IDA_BITMAP_BITS;
-	int n;
+	unsigned long index = id / IDA_BITMAP_BITS;
+	unsigned offset = id % IDA_BITMAP_BITS;
 	struct ida_bitmap *bitmap;
+	struct radix_tree_iter iter;
+	void **slot;
 
-	if (idr_id > idr_max(ida->idr.layers))
+	slot = radix_tree_iter_lookup(&ida->ida_rt, &iter, index);
+	if (!slot)
 		goto err;
 
-	/* clear full bits while looking up the leaf idr_layer */
-	while ((shift > 0) && p) {
-		n = (idr_id >> shift) & IDR_MASK;
-		__clear_bit(n, p->bitmap);
-		p = p->ary[n];
-		shift -= IDR_BITS;
-	}
-
-	if (p == NULL)
-		goto err;
-
-	n = idr_id & IDR_MASK;
-	__clear_bit(n, p->bitmap);
-
-	bitmap = (void *)p->ary[n];
-	if (!bitmap || !test_bit(offset, bitmap->bitmap))
+	bitmap = rcu_dereference_raw(*slot);
+	if (!test_bit(offset, bitmap->bitmap))
 		goto err;
 
-	/* update bitmap and remove it if empty */
 	__clear_bit(offset, bitmap->bitmap);
-	if (--bitmap->nr_busy == 0) {
-		__set_bit(n, p->bitmap);	/* to please idr_remove() */
-		idr_remove(&ida->idr, idr_id);
-		free_bitmap(ida, bitmap);
+	radix_tree_iter_tag_set(&ida->ida_rt, &iter, IDR_FREE);
+	if (bitmap_empty(bitmap->bitmap, IDA_BITMAP_BITS)) {
+		kfree(bitmap);
+		radix_tree_iter_delete(&ida->ida_rt, &iter, slot);
 	}
-
 	return;
-
  err:
 	WARN(1, "ida_remove called for id=%d which is not allocated.\n", id);
 }
 EXPORT_SYMBOL(ida_remove);
 
 /**
- * ida_destroy - release all cached layers within an ida tree
- * @ida:		ida handle
+ * ida_destroy - Free the contents of an ida
+ * @ida: ida handle
+ *
+ * Calling this function releases all resources associated with an IDA.  When
+ * this call returns, the IDA is empty and can be reused or freed.  The caller
+ * should not allow ida_remove() or ida_get_new_above() to be called at the
+ * same time.
  */
 void ida_destroy(struct ida *ida)
 {
-	idr_destroy(&ida->idr);
+	struct radix_tree_iter iter;
+	void **slot;
+
+	radix_tree_for_each_slot(slot, &ida->ida_rt, &iter, 0) {
+		struct ida_bitmap *bitmap = rcu_dereference_raw(*slot);
+		kfree(bitmap);
+		radix_tree_iter_delete(&ida->ida_rt, &iter, slot);
+	}
+
 	kfree(ida->free_bitmap);
+	ida->free_bitmap = NULL;
 }
 EXPORT_SYMBOL(ida_destroy);
 
@@ -1141,18 +442,3 @@ void ida_simple_remove(struct ida *ida, unsigned int id)
 	spin_unlock_irqrestore(&simple_ida_lock, flags);
 }
 EXPORT_SYMBOL(ida_simple_remove);
-
-/**
- * ida_init - initialize ida handle
- * @ida:	ida handle
- *
- * This function is use to set up the handle (@ida) that you will pass
- * to the rest of the functions.
- */
-void ida_init(struct ida *ida)
-{
-	memset(ida, 0, sizeof(struct ida));
-	idr_init(&ida->idr);
-
-}
-EXPORT_SYMBOL(ida_init);
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 7bf7d4e60e43..eaea14b8f2ca 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -22,20 +22,21 @@
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
 #include <linux/cpu.h>
 #include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/idr.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/radix-tree.h>
+#include <linux/kmemleak.h>
 #include <linux/percpu.h>
+#include <linux/preempt.h>		/* in_interrupt() */
+#include <linux/radix-tree.h>
+#include <linux/rcupdate.h>
 #include <linux/slab.h>
-#include <linux/kmemleak.h>
-#include <linux/cpu.h>
 #include <linux/string.h>
-#include <linux/bitops.h>
-#include <linux/rcupdate.h>
-#include <linux/preempt.h>		/* in_interrupt() */
 
 
 /* Number of nodes in fully populated tree of given height */
@@ -59,6 +60,15 @@ static struct kmem_cache *radix_tree_node_cachep;
  */
 #define RADIX_TREE_PRELOAD_SIZE (RADIX_TREE_MAX_PATH * 2 - 1)
 
+/*
+ * The IDR does not have to be as high as the radix tree since it uses
+ * signed integers, not unsigned longs.
+ */
+#define IDR_INDEX_BITS		(8 /* CHAR_BIT */ * sizeof(int) - 1)
+#define IDR_MAX_PATH		(DIV_ROUND_UP(IDR_INDEX_BITS, \
+						RADIX_TREE_MAP_SHIFT))
+#define IDR_PRELOAD_SIZE	(IDR_MAX_PATH * 2 - 1)
+
 /*
  * Per-cpu pool of preloaded nodes
  */
@@ -149,27 +159,32 @@ static inline int tag_get(const struct radix_tree_node *node, unsigned int tag,
 
 static inline void root_tag_set(struct radix_tree_root *root, unsigned tag)
 {
-	root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT));
+	root->gfp_mask |= (__force gfp_t)(1 << (tag + ROOT_TAG_SHIFT));
 }
 
 static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag)
 {
-	root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
+	root->gfp_mask &= (__force gfp_t)~(1 << (tag + ROOT_TAG_SHIFT));
 }
 
 static inline void root_tag_clear_all(struct radix_tree_root *root)
 {
-	root->gfp_mask &= __GFP_BITS_MASK;
+	root->gfp_mask &= (1 << ROOT_TAG_SHIFT) - 1;
 }
 
 static inline int root_tag_get(const struct radix_tree_root *root, unsigned tag)
 {
-	return (__force int)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
+	return (__force int)root->gfp_mask & (1 << (tag + ROOT_TAG_SHIFT));
 }
 
 static inline unsigned root_tags_get(const struct radix_tree_root *root)
 {
-	return (__force unsigned)root->gfp_mask >> __GFP_BITS_SHIFT;
+	return (__force unsigned)root->gfp_mask >> ROOT_TAG_SHIFT;
+}
+
+static inline bool is_idr(const struct radix_tree_root *root)
+{
+	return !!(root->gfp_mask & ROOT_IS_IDR);
 }
 
 /*
@@ -187,6 +202,11 @@ static inline int any_tag_set(const struct radix_tree_node *node,
 	return 0;
 }
 
+static inline void all_tag_set(struct radix_tree_node *node, unsigned int tag)
+{
+	bitmap_fill(node->tags[tag], RADIX_TREE_MAP_SIZE);
+}
+
 /**
  * radix_tree_find_next_bit - find the next set bit in a memory region
  *
@@ -240,6 +260,13 @@ static inline unsigned long node_maxindex(const struct radix_tree_node *node)
 	return shift_maxindex(node->shift);
 }
 
+static unsigned long next_index(unsigned long index,
+				const struct radix_tree_node *node,
+				unsigned long offset)
+{
+	return (index & ~node_maxindex(node)) + (offset << node->shift);
+}
+
 #ifndef __KERNEL__
 static void dump_node(struct radix_tree_node *node, unsigned long index)
 {
@@ -278,11 +305,52 @@ static void radix_tree_dump(struct radix_tree_root *root)
 {
 	pr_debug("radix root: %p rnode %p tags %x\n",
 			root, root->rnode,
-			root->gfp_mask >> __GFP_BITS_SHIFT);
+			root->gfp_mask >> ROOT_TAG_SHIFT);
 	if (!radix_tree_is_internal_node(root->rnode))
 		return;
 	dump_node(entry_to_node(root->rnode), 0);
 }
+
+static void dump_ida_node(void *entry, unsigned long index)
+{
+	unsigned long i;
+
+	if (!entry)
+		return;
+
+	if (radix_tree_is_internal_node(entry)) {
+		struct radix_tree_node *node = entry_to_node(entry);
+
+		pr_debug("ida node: %p offset %d indices %lu-%lu parent %p free %lx shift %d count %d\n",
+			node, node->offset, index * IDA_BITMAP_BITS,
+			((index | node_maxindex(node)) + 1) *
+				IDA_BITMAP_BITS - 1,
+			node->parent, node->tags[0][0], node->shift,
+			node->count);
+		for (i = 0; i < RADIX_TREE_MAP_SIZE; i++)
+			dump_ida_node(node->slots[i],
+					index | (i << node->shift));
+	} else {
+		struct ida_bitmap *bitmap = entry;
+
+		pr_debug("ida btmp: %p offset %d indices %lu-%lu data", bitmap,
+				(int)(index & RADIX_TREE_MAP_MASK),
+				index * IDA_BITMAP_BITS,
+				(index + 1) * IDA_BITMAP_BITS - 1);
+		for (i = 0; i < IDA_BITMAP_LONGS; i++)
+			pr_cont(" %lx", bitmap->bitmap[i]);
+		pr_cont("\n");
+	}
+}
+
+static void ida_dump(struct ida *ida)
+{
+	struct radix_tree_root *root = &ida->ida_rt;
+	pr_debug("ida: %p %p free %d bitmap %p\n", ida, root->rnode,
+				root->gfp_mask >> ROOT_TAG_SHIFT,
+				ida->free_bitmap);
+	dump_ida_node(root->rnode, 0);
+}
 #endif
 
 /*
@@ -290,13 +358,11 @@ static void radix_tree_dump(struct radix_tree_root *root)
  * that the caller has pinned this thread of control to the current CPU.
  */
 static struct radix_tree_node *
-radix_tree_node_alloc(struct radix_tree_root *root,
-			struct radix_tree_node *parent,
+radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent,
 			unsigned int shift, unsigned int offset,
 			unsigned int count, unsigned int exceptional)
 {
 	struct radix_tree_node *ret = NULL;
-	gfp_t gfp_mask = root_gfp_mask(root);
 
 	/*
 	 * Preload code isn't irq safe and it doesn't make sense to use
@@ -533,7 +599,7 @@ static unsigned radix_tree_load_root(const struct radix_tree_root *root,
 /*
  *	Extend a radix tree so it can store key @index.
  */
-static int radix_tree_extend(struct radix_tree_root *root,
+static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp,
 				unsigned long index, unsigned int shift)
 {
 	struct radix_tree_node *slot;
@@ -546,19 +612,27 @@ static int radix_tree_extend(struct radix_tree_root *root,
 		maxshift += RADIX_TREE_MAP_SHIFT;
 
 	slot = root->rnode;
-	if (!slot)
+	if (!slot && (!is_idr(root) || root_tag_get(root, IDR_FREE)))
 		goto out;
 
 	do {
-		struct radix_tree_node *node = radix_tree_node_alloc(root,
-							NULL, shift, 0, 1, 0);
+		struct radix_tree_node *node = radix_tree_node_alloc(gfp, NULL,
+								shift, 0, 1, 0);
 		if (!node)
 			return -ENOMEM;
 
-		/* Propagate the aggregated tag info into the new root */
-		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
-			if (root_tag_get(root, tag))
-				tag_set(node, tag, 0);
+		if (is_idr(root)) {
+			all_tag_set(node, IDR_FREE);
+			if (!root_tag_get(root, IDR_FREE)) {
+				tag_clear(node, IDR_FREE, 0);
+				root_tag_set(root, IDR_FREE);
+			}
+		} else {
+			/* Propagate the aggregated tag info to the new child */
+			for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+				if (root_tag_get(root, tag))
+					tag_set(node, tag, 0);
+			}
 		}
 
 		BUG_ON(shift > BITS_PER_LONG);
@@ -619,6 +693,8 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root,
 		 * one (root->rnode) as far as dependent read barriers go.
 		 */
 		root->rnode = child;
+		if (is_idr(root) && !tag_get(node, IDR_FREE, 0))
+			root_tag_clear(root, IDR_FREE);
 
 		/*
 		 * We have a dilemma here. The node's slot[0] must not be
@@ -674,7 +750,12 @@ static bool delete_node(struct radix_tree_root *root,
 			parent->slots[node->offset] = NULL;
 			parent->count--;
 		} else {
-			root_tag_clear_all(root);
+			/*
+			 * Shouldn't the tags already have all been cleared
+			 * by the caller?
+			 */
+			if (!is_idr(root))
+				root_tag_clear_all(root);
 			root->rnode = NULL;
 		}
 
@@ -714,6 +795,7 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
 	unsigned long maxindex;
 	unsigned int shift, offset = 0;
 	unsigned long max = index | ((1UL << order) - 1);
+	gfp_t gfp = root_gfp_mask(root);
 
 	shift = radix_tree_load_root(root, &child, &maxindex);
 
@@ -721,7 +803,7 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
 	if (order > 0 && max == ((1UL << order) - 1))
 		max++;
 	if (max > maxindex) {
-		int error = radix_tree_extend(root, max, shift);
+		int error = radix_tree_extend(root, gfp, max, shift);
 		if (error < 0)
 			return error;
 		shift = error;
@@ -732,7 +814,7 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
 		shift -= RADIX_TREE_MAP_SHIFT;
 		if (child == NULL) {
 			/* Have to add a child node.  */
-			child = radix_tree_node_alloc(root, node, shift,
+			child = radix_tree_node_alloc(gfp, node, shift,
 							offset, 0, 0);
 			if (!child)
 				return -ENOMEM;
@@ -755,7 +837,6 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
 	return 0;
 }
 
-#ifdef CONFIG_RADIX_TREE_MULTIORDER
 /*
  * Free any nodes below this node.  The tree is presumed to not need
  * shrinking, and any user data in the tree is presumed to not need a
@@ -791,6 +872,7 @@ static void radix_tree_free_nodes(struct radix_tree_node *node)
 	}
 }
 
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
 static inline int insert_entries(struct radix_tree_node *node, void **slot,
 				void *item, unsigned order, bool replace)
 {
@@ -996,69 +1078,70 @@ void *radix_tree_lookup(const struct radix_tree_root *root, unsigned long index)
 }
 EXPORT_SYMBOL(radix_tree_lookup);
 
-static inline int slot_count(struct radix_tree_node *node,
-						void **slot)
+static inline void replace_sibling_entries(struct radix_tree_node *node,
+				void **slot, int count, int exceptional)
 {
-	int n = 1;
 #ifdef CONFIG_RADIX_TREE_MULTIORDER
 	void *ptr = node_to_entry(slot);
-	unsigned offset = get_slot_offset(node, slot);
-	int i;
+	unsigned offset = get_slot_offset(node, slot) + 1;
 
-	for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) {
-		if (node->slots[offset + i] != ptr)
+	while (offset < RADIX_TREE_MAP_SIZE) {
+		if (node->slots[offset] != ptr)
 			break;
-		n++;
+		if (count < 0) {
+			node->slots[offset] = NULL;
+			node->count--;
+		}
+		node->exceptional += exceptional;
+		offset++;
 	}
 #endif
-	return n;
 }
 
-static void replace_slot(struct radix_tree_root *root,
-			 struct radix_tree_node *node,
-			 void **slot, void *item,
-			 bool warn_typeswitch)
+static void replace_slot(void **slot, void *item, struct radix_tree_node *node,
+				int count, int exceptional)
 {
-	void *old = rcu_dereference_raw(*slot);
-	int count, exceptional;
-
-	WARN_ON_ONCE(radix_tree_is_internal_node(item));
-
-	count = !!item - !!old;
-	exceptional = !!radix_tree_exceptional_entry(item) -
-		      !!radix_tree_exceptional_entry(old);
-
-	WARN_ON_ONCE(warn_typeswitch && (count || exceptional));
+	if (WARN_ON_ONCE(radix_tree_is_internal_node(item)))
+		return;
 
-	if (node) {
+	if (node && (count || exceptional)) {
 		node->count += count;
-		if (exceptional) {
-			exceptional *= slot_count(node, slot);
-			node->exceptional += exceptional;
-		}
+		node->exceptional += exceptional;
+		replace_sibling_entries(node, slot, count, exceptional);
 	}
 
 	rcu_assign_pointer(*slot, item);
 }
 
-static inline void delete_sibling_entries(struct radix_tree_node *node,
-						void **slot)
+static bool node_tag_get(const struct radix_tree_root *root,
+				const struct radix_tree_node *node,
+				unsigned int tag, unsigned int offset)
 {
-#ifdef CONFIG_RADIX_TREE_MULTIORDER
-	bool exceptional = radix_tree_exceptional_entry(*slot);
-	void *ptr = node_to_entry(slot);
-	unsigned offset = get_slot_offset(node, slot);
-	int i;
+	if (node)
+		return tag_get(node, tag, offset);
+	return root_tag_get(root, tag);
+}
 
-	for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) {
-		if (node->slots[offset + i] != ptr)
-			break;
-		node->slots[offset + i] = NULL;
-		node->count--;
-		if (exceptional)
-			node->exceptional--;
+/*
+ * IDR users want to be able to store NULL in the tree, so if the slot isn't
+ * free, don't adjust the count, even if it's transitioning between NULL and
+ * non-NULL.  For the IDA, we mark slots as being IDR_FREE while they still
+ * have empty bits, but it only stores NULL in slots when they're being
+ * deleted.
+ */
+static int calculate_count(struct radix_tree_root *root,
+				struct radix_tree_node *node, void **slot,
+				void *item, void *old)
+{
+	if (is_idr(root)) {
+		unsigned offset = get_slot_offset(node, slot);
+		bool free = node_tag_get(root, node, IDR_FREE, offset);
+		if (!free)
+			return 0;
+		if (!old)
+			return 1;
 	}
-#endif
+	return !!item - !!old;
 }
 
 /**
@@ -1078,15 +1161,19 @@ void __radix_tree_replace(struct radix_tree_root *root,
 			  void **slot, void *item,
 			  radix_tree_update_node_t update_node, void *private)
 {
-	if (!item)
-		delete_sibling_entries(node, slot);
+	void *old = rcu_dereference_raw(*slot);
+	int exceptional = !!radix_tree_exceptional_entry(item) -
+				!!radix_tree_exceptional_entry(old);
+	int count = calculate_count(root, node, slot, item, old);
+
 	/*
 	 * This function supports replacing exceptional entries and
 	 * deleting entries, but that needs accounting against the
 	 * node unless the slot is root->rnode.
 	 */
-	replace_slot(root, node, slot, item,
-		     !node && slot != (void **)&root->rnode);
+	WARN_ON_ONCE(!node && (slot != (void **)&root->rnode) &&
+			(count || exceptional));
+	replace_slot(slot, item, node, count, exceptional);
 
 	if (!node)
 		return;
@@ -1116,7 +1203,7 @@ void __radix_tree_replace(struct radix_tree_root *root,
 void radix_tree_replace_slot(struct radix_tree_root *root,
 			     void **slot, void *item)
 {
-	replace_slot(root, NULL, slot, item, true);
+	__radix_tree_replace(root, NULL, slot, item, NULL, NULL);
 }
 
 /**
@@ -1191,6 +1278,7 @@ int radix_tree_split(struct radix_tree_root *root, unsigned long index,
 	void **slot;
 	unsigned int offset, end;
 	unsigned n, tag, tags = 0;
+	gfp_t gfp = root_gfp_mask(root);
 
 	if (!__radix_tree_lookup(root, index, &parent, &slot))
 		return -ENOENT;
@@ -1228,7 +1316,7 @@ int radix_tree_split(struct radix_tree_root *root, unsigned long index,
 
 	for (;;) {
 		if (node->shift > order) {
-			child = radix_tree_node_alloc(root, node,
+			child = radix_tree_node_alloc(gfp, node,
 					node->shift - RADIX_TREE_MAP_SHIFT,
 					offset, 0, 0);
 			if (!child)
@@ -1444,8 +1532,6 @@ int radix_tree_tag_get(const struct radix_tree_root *root,
 	radix_tree_load_root(root, &node, &maxindex);
 	if (index > maxindex)
 		return 0;
-	if (node == NULL)
-		return 0;
 
 	while (radix_tree_is_internal_node(node)) {
 		unsigned offset;
@@ -1453,8 +1539,6 @@ int radix_tree_tag_get(const struct radix_tree_root *root,
 		parent = entry_to_node(node);
 		offset = radix_tree_descend(parent, &node, index);
 
-		if (!node)
-			return 0;
 		if (!tag_get(parent, tag, offset))
 			return 0;
 		if (node == RADIX_TREE_RETRY)
@@ -1481,6 +1565,11 @@ static void set_iter_tags(struct radix_tree_iter *iter,
 	unsigned tag_long = offset / BITS_PER_LONG;
 	unsigned tag_bit  = offset % BITS_PER_LONG;
 
+	if (!node) {
+		iter->tags = 1;
+		return;
+	}
+
 	iter->tags = node->tags[tag][tag_long] >> tag_bit;
 
 	/* This never happens if RADIX_TREE_TAG_LONGS == 1 */
@@ -1873,13 +1962,18 @@ void __radix_tree_delete_node(struct radix_tree_root *root,
 static bool __radix_tree_delete(struct radix_tree_root *root,
 				struct radix_tree_node *node, void **slot)
 {
+	void *old = rcu_dereference_raw(*slot);
+	int exceptional = radix_tree_exceptional_entry(old) ? -1 : 0;
 	unsigned offset = get_slot_offset(node, slot);
 	int tag;
 
-	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
-		node_tag_clear(root, node, tag, offset);
+	if (is_idr(root))
+		node_tag_set(root, node, IDR_FREE, offset);
+	else
+		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+			node_tag_clear(root, node, tag, offset);
 
-	replace_slot(root, node, slot, NULL, true);
+	replace_slot(slot, NULL, node, -1, exceptional);
 	return node && delete_node(root, node, NULL, NULL);
 }
 
@@ -1916,12 +2010,13 @@ void radix_tree_iter_delete(struct radix_tree_root *root,
 void *radix_tree_delete_item(struct radix_tree_root *root,
 			     unsigned long index, void *item)
 {
-	struct radix_tree_node *node;
+	struct radix_tree_node *node = NULL;
 	void **slot;
 	void *entry;
 
 	entry = __radix_tree_lookup(root, index, &node, &slot);
-	if (!entry)
+	if (!entry && (!is_idr(root) || node_tag_get(root, node, IDR_FREE,
+						get_slot_offset(node, slot))))
 		return NULL;
 
 	if (item && entry != item)
@@ -1957,8 +2052,7 @@ void radix_tree_clear_tags(struct radix_tree_root *root,
 		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
 			node_tag_clear(root, node, tag, offset);
 	} else {
-		/* Clear root node tags */
-		root->gfp_mask &= __GFP_BITS_MASK;
+		root_tag_clear_all(root);
 	}
 }
 
@@ -1973,6 +2067,111 @@ int radix_tree_tagged(const struct radix_tree_root *root, unsigned int tag)
 }
 EXPORT_SYMBOL(radix_tree_tagged);
 
+/**
+ * idr_preload - preload for idr_alloc()
+ * @gfp_mask: allocation mask to use for preloading
+ *
+ * Preallocate memory to use for the next call to idr_alloc().  This function
+ * returns with preemption disabled.  It will be enabled by idr_preload_end().
+ */
+void idr_preload(gfp_t gfp_mask)
+{
+	__radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE);
+}
+EXPORT_SYMBOL(idr_preload);
+
+void **idr_get_free(struct radix_tree_root *root,
+			struct radix_tree_iter *iter, gfp_t gfp, int end)
+{
+	struct radix_tree_node *node = NULL, *child;
+	void **slot = (void **)&root->rnode;
+	unsigned long maxindex, start = iter->next_index;
+	unsigned long max = end > 0 ? end - 1 : INT_MAX;
+	unsigned int shift, offset = 0;
+
+ grow:
+	shift = radix_tree_load_root(root, &child, &maxindex);
+	if (!radix_tree_tagged(root, IDR_FREE))
+		start = max(start, maxindex + 1);
+	if (start > max)
+		return ERR_PTR(-ENOSPC);
+
+	if (start > maxindex) {
+		int error = radix_tree_extend(root, gfp, start, shift);
+		if (error < 0)
+			return ERR_PTR(error);
+		shift = error;
+		child = rcu_dereference_raw(root->rnode);
+	}
+
+	while (shift) {
+		shift -= RADIX_TREE_MAP_SHIFT;
+		if (child == NULL) {
+			/* Have to add a child node.  */
+			child = radix_tree_node_alloc(gfp, node, shift, offset,
+							0, 0);
+			if (!child)
+				return ERR_PTR(-ENOMEM);
+			all_tag_set(child, IDR_FREE);
+			rcu_assign_pointer(*slot, node_to_entry(child));
+			if (node)
+				node->count++;
+		} else if (!radix_tree_is_internal_node(child))
+			break;
+
+		node = entry_to_node(child);
+		offset = radix_tree_descend(node, &child, start);
+		if (!tag_get(node, IDR_FREE, offset)) {
+			offset = radix_tree_find_next_bit(node, IDR_FREE,
+							offset + 1);
+			start = next_index(start, node, offset);
+			if (start > max)
+				return ERR_PTR(-ENOSPC);
+			while (offset == RADIX_TREE_MAP_SIZE) {
+				offset = node->offset + 1;
+				node = node->parent;
+				if (!node)
+					goto grow;
+				shift = node->shift;
+			}
+			child = rcu_dereference_raw(node->slots[offset]);
+		}
+		slot = &node->slots[offset];
+	}
+
+	iter->index = start;
+	if (node)
+		iter->next_index = 1 + min(max, (start | node_maxindex(node)));
+	else
+		iter->next_index = 1;
+	iter->node = node;
+	__set_iter_shift(iter, shift);
+	set_iter_tags(iter, node, offset, IDR_FREE);
+
+	return slot;
+}
+
+/**
+ * idr_destroy - release all internal memory from an IDR
+ * @idr: idr handle
+ *
+ * After this function is called, the IDR is empty, and may be reused or
+ * the data structure containing it may be freed.
+ *
+ * A typical clean-up sequence for objects stored in an idr tree will use
+ * idr_for_each() to free all objects, if necessary, then idr_destroy() to
+ * free the memory used to keep track of those objects.
+ */
+void idr_destroy(struct idr *idr)
+{
+	struct radix_tree_node *node = rcu_dereference_raw(idr->idr_rt.rnode);
+	if (radix_tree_is_internal_node(node))
+		radix_tree_free_nodes(node);
+	idr->idr_rt.rnode = NULL;
+	root_tag_set(&idr->idr_rt, IDR_FREE);
+}
+EXPORT_SYMBOL(idr_destroy);
+
 static void
 radix_tree_node_ctor(void *arg)
 {
diff --git a/tools/testing/radix-tree/.gitignore b/tools/testing/radix-tree/.gitignore
index 11d888ca6a92..3b5534b643f0 100644
--- a/tools/testing/radix-tree/.gitignore
+++ b/tools/testing/radix-tree/.gitignore
@@ -1,2 +1,3 @@
 main
 radix-tree.c
+idr.c
diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile
index 5274e88cd293..3597a3a9f269 100644
--- a/tools/testing/radix-tree/Makefile
+++ b/tools/testing/radix-tree/Makefile
@@ -2,8 +2,8 @@
 CFLAGS += -I. -I../../include -g -O2 -Wall -D_LGPL_SOURCE
 LDFLAGS += -lpthread -lurcu
 TARGETS = main
-OFILES = main.o radix-tree.o linux.o test.o tag_check.o find_bit.o \
-	 regression1.o regression2.o regression3.o multiorder.o \
+OFILES = main.o radix-tree.o idr.o linux.o test.o tag_check.o find_bit.o \
+	 regression1.o regression2.o regression3.o multiorder.o idr-test.o \
 	 iteration_check.o benchmark.o
 
 ifdef BENCHMARK
@@ -23,7 +23,11 @@ vpath %.c ../../lib
 $(OFILES): *.h */*.h \
 	../../include/linux/*.h \
 	../../include/asm/*.h \
-	../../../include/linux/radix-tree.h
+	../../../include/linux/radix-tree.h \
+	../../../include/linux/idr.h
 
 radix-tree.c: ../../../lib/radix-tree.c
 	sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@
+
+idr.c: ../../../lib/idr.c
+	sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@
diff --git a/tools/testing/radix-tree/idr-test.c b/tools/testing/radix-tree/idr-test.c
new file mode 100644
index 000000000000..4dffad9284e0
--- /dev/null
+++ b/tools/testing/radix-tree/idr-test.c
@@ -0,0 +1,342 @@
+/*
+ * idr-test.c: Test the IDR API
+ * Copyright (c) 2016 Matthew Wilcox <willy@infradead.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+
+#include "test.h"
+
+#define DUMMY_PTR	((void *)0x12)
+
+int item_idr_free(int id, void *p, void *data)
+{
+	struct item *item = p;
+	assert(item->index == id);
+	free(p);
+
+	return 0;
+}
+
+void item_idr_remove(struct idr *idr, int id)
+{
+	struct item *item = idr_find(idr, id);
+	assert(item->index == id);
+	idr_remove(idr, id);
+	free(item);
+}
+
+void idr_alloc_test(void)
+{
+	unsigned long i;
+	DEFINE_IDR(idr);
+
+	assert(idr_alloc_cyclic(&idr, DUMMY_PTR, 0, 0x4000, GFP_KERNEL) == 0);
+	assert(idr_alloc_cyclic(&idr, DUMMY_PTR, 0x3ffd, 0x4000, GFP_KERNEL) == 0x3ffd);
+	idr_remove(&idr, 0x3ffd);
+	idr_remove(&idr, 0);
+
+	for (i = 0x3ffe; i < 0x4003; i++) {
+		int id;
+		struct item *item;
+
+		if (i < 0x4000)
+			item = item_create(i, 0);
+		else
+			item = item_create(i - 0x3fff, 0);
+
+		id = idr_alloc_cyclic(&idr, item, 1, 0x4000, GFP_KERNEL);
+		assert(id == item->index);
+	}
+
+	idr_for_each(&idr, item_idr_free, &idr);
+	idr_destroy(&idr);
+}
+
+void idr_replace_test(void)
+{
+	DEFINE_IDR(idr);
+
+	idr_alloc(&idr, (void *)-1, 10, 11, GFP_KERNEL);
+	idr_replace(&idr, &idr, 10);
+
+	idr_destroy(&idr);
+}
+
+/*
+ * Unlike the radix tree, you can put a NULL pointer -- with care -- into
+ * the IDR.  Some interfaces, like idr_find() do not distinguish between
+ * "present, value is NULL" and "not present", but that's exactly what some
+ * users want.
+ */
+void idr_null_test(void)
+{
+	int i;
+	DEFINE_IDR(idr);
+
+	assert(idr_is_empty(&idr));
+
+	assert(idr_alloc(&idr, NULL, 0, 0, GFP_KERNEL) == 0);
+	assert(!idr_is_empty(&idr));
+	idr_remove(&idr, 0);
+	assert(idr_is_empty(&idr));
+
+	assert(idr_alloc(&idr, NULL, 0, 0, GFP_KERNEL) == 0);
+	assert(!idr_is_empty(&idr));
+	idr_destroy(&idr);
+	assert(idr_is_empty(&idr));
+
+	for (i = 0; i < 10; i++) {
+		assert(idr_alloc(&idr, NULL, 0, 0, GFP_KERNEL) == i);
+	}
+
+	assert(idr_replace(&idr, DUMMY_PTR, 3) == NULL);
+	assert(idr_replace(&idr, DUMMY_PTR, 4) == NULL);
+	assert(idr_replace(&idr, NULL, 4) == DUMMY_PTR);
+	assert(idr_replace(&idr, DUMMY_PTR, 11) == ERR_PTR(-ENOENT));
+	idr_remove(&idr, 5);
+	assert(idr_alloc(&idr, NULL, 0, 0, GFP_KERNEL) == 5);
+	idr_remove(&idr, 5);
+
+	for (i = 0; i < 9; i++) {
+		idr_remove(&idr, i);
+		assert(!idr_is_empty(&idr));
+	}
+	idr_remove(&idr, 8);
+	assert(!idr_is_empty(&idr));
+	idr_remove(&idr, 9);
+	assert(idr_is_empty(&idr));
+
+	assert(idr_alloc(&idr, NULL, 0, 0, GFP_KERNEL) == 0);
+	assert(idr_replace(&idr, DUMMY_PTR, 3) == ERR_PTR(-ENOENT));
+	assert(idr_replace(&idr, DUMMY_PTR, 0) == NULL);
+	assert(idr_replace(&idr, NULL, 0) == DUMMY_PTR);
+
+	idr_destroy(&idr);
+	assert(idr_is_empty(&idr));
+
+	for (i = 1; i < 10; i++) {
+		assert(idr_alloc(&idr, NULL, 1, 0, GFP_KERNEL) == i);
+	}
+
+	idr_destroy(&idr);
+	assert(idr_is_empty(&idr));
+}
+
+void idr_nowait_test(void)
+{
+	unsigned int i;
+	DEFINE_IDR(idr);
+
+	idr_preload(GFP_KERNEL);
+
+	for (i = 0; i < 3; i++) {
+		struct item *item = item_create(i, 0);
+		assert(idr_alloc(&idr, item, i, i + 1, GFP_NOWAIT) == i);
+	}
+
+	idr_preload_end();
+
+	idr_for_each(&idr, item_idr_free, &idr);
+	idr_destroy(&idr);
+}
+
+void idr_checks(void)
+{
+	unsigned long i;
+	DEFINE_IDR(idr);
+
+	for (i = 0; i < 10000; i++) {
+		struct item *item = item_create(i, 0);
+		assert(idr_alloc(&idr, item, 0, 20000, GFP_KERNEL) == i);
+	}
+
+	assert(idr_alloc(&idr, DUMMY_PTR, 5, 30, GFP_KERNEL) < 0);
+
+	for (i = 0; i < 5000; i++)
+		item_idr_remove(&idr, i);
+
+	idr_remove(&idr, 3);
+
+	idr_for_each(&idr, item_idr_free, &idr);
+	idr_destroy(&idr);
+
+	assert(idr_is_empty(&idr));
+
+	idr_remove(&idr, 3);
+	idr_remove(&idr, 0);
+
+	for (i = INT_MAX - 3UL; i < INT_MAX + 1UL; i++) {
+		struct item *item = item_create(i, 0);
+		assert(idr_alloc(&idr, item, i, i + 10, GFP_KERNEL) == i);
+	}
+	assert(idr_alloc(&idr, DUMMY_PTR, i - 2, i, GFP_KERNEL) == -ENOSPC);
+
+	idr_for_each(&idr, item_idr_free, &idr);
+	idr_destroy(&idr);
+	idr_destroy(&idr);
+
+	assert(idr_is_empty(&idr));
+
+	for (i = 1; i < 10000; i++) {
+		struct item *item = item_create(i, 0);
+		assert(idr_alloc(&idr, item, 1, 20000, GFP_KERNEL) == i);
+	}
+
+	idr_for_each(&idr, item_idr_free, &idr);
+	idr_destroy(&idr);
+
+	idr_replace_test();
+	idr_alloc_test();
+	idr_null_test();
+	idr_nowait_test();
+}
+
+/*
+ * Check that we get the correct error when we run out of memory doing
+ * allocations.  To ensure we run out of memory, just "forget" to preload.
+ * The first test is for not having a bitmap available, and the second test
+ * is for not being able to allocate a level of the radix tree.
+ */
+void ida_check_nomem(void)
+{
+	DEFINE_IDA(ida);
+	int id, err;
+
+	err = ida_get_new(&ida, &id);
+	assert(err == -EAGAIN);
+	err = ida_get_new_above(&ida, 1UL << 30, &id);
+	assert(err == -EAGAIN);
+}
+
+/*
+ * Check what happens when we fill a leaf and then delete it.  This may
+ * discover mishandling of IDR_FREE.
+ */
+void ida_check_leaf(void)
+{
+	DEFINE_IDA(ida);
+	int id;
+	unsigned long i;
+
+	for (i = 0; i < IDA_BITMAP_BITS; i++) {
+		assert(ida_pre_get(&ida, GFP_KERNEL));
+		assert(!ida_get_new(&ida, &id));
+		assert(id == i);
+	}
+
+	ida_destroy(&ida);
+	assert(ida_is_empty(&ida));
+
+	assert(ida_pre_get(&ida, GFP_KERNEL));
+	assert(!ida_get_new(&ida, &id));
+	assert(id == 0);
+	ida_destroy(&ida);
+	assert(ida_is_empty(&ida));
+}
+
+/*
+ * Check allocations up to and slightly above the maximum allowed (2^31-1) ID.
+ * Allocating up to 2^31-1 should succeed, and then allocating the next one
+ * should fail.
+ */
+void ida_check_max(void)
+{
+	DEFINE_IDA(ida);
+	int id, err;
+	unsigned long i, j;
+
+	for (j = 1; j < 65537; j *= 2) {
+		unsigned long base = (1UL << 31) - j;
+		for (i = 0; i < j; i++) {
+			assert(ida_pre_get(&ida, GFP_KERNEL));
+			assert(!ida_get_new_above(&ida, base, &id));
+			assert(id == base + i);
+		}
+		assert(ida_pre_get(&ida, GFP_KERNEL));
+		err = ida_get_new_above(&ida, base, &id);
+		assert(err == -ENOSPC);
+		ida_destroy(&ida);
+		assert(ida_is_empty(&ida));
+		rcu_barrier();
+	}
+}
+
+void ida_checks(void)
+{
+	DEFINE_IDA(ida);
+	int id;
+	unsigned long i;
+
+	radix_tree_cpu_dead(1);
+	ida_check_nomem();
+
+	for (i = 0; i < 10000; i++) {
+		assert(ida_pre_get(&ida, GFP_KERNEL));
+		assert(!ida_get_new(&ida, &id));
+		assert(id == i);
+	}
+
+	ida_remove(&ida, 20);
+	ida_remove(&ida, 21);
+	for (i = 0; i < 3; i++) {
+		assert(ida_pre_get(&ida, GFP_KERNEL));
+		assert(!ida_get_new(&ida, &id));
+		if (i == 2)
+			assert(id == 10000);
+	}
+
+	for (i = 0; i < 5000; i++)
+		ida_remove(&ida, i);
+
+	assert(ida_pre_get(&ida, GFP_KERNEL));
+	assert(!ida_get_new_above(&ida, 5000, &id));
+	assert(id == 10001);
+
+	ida_destroy(&ida);
+
+	assert(ida_is_empty(&ida));
+
+	assert(ida_pre_get(&ida, GFP_KERNEL));
+	assert(!ida_get_new_above(&ida, 1, &id));
+	assert(id == 1);
+
+	ida_remove(&ida, id);
+	assert(ida_is_empty(&ida));
+	ida_destroy(&ida);
+	assert(ida_is_empty(&ida));
+
+	assert(ida_pre_get(&ida, GFP_KERNEL));
+	assert(!ida_get_new_above(&ida, 1, &id));
+	ida_destroy(&ida);
+	assert(ida_is_empty(&ida));
+
+	assert(ida_pre_get(&ida, GFP_KERNEL));
+	assert(!ida_get_new_above(&ida, 1, &id));
+	assert(id == 1);
+	assert(ida_pre_get(&ida, GFP_KERNEL));
+	assert(!ida_get_new_above(&ida, 1025, &id));
+	assert(id == 1025);
+	assert(ida_pre_get(&ida, GFP_KERNEL));
+	assert(!ida_get_new_above(&ida, 10000, &id));
+	assert(id == 10000);
+	ida_remove(&ida, 1025);
+	ida_destroy(&ida);
+	assert(ida_is_empty(&ida));
+
+	ida_check_leaf();
+	ida_check_max();
+
+	radix_tree_cpu_dead(1);
+}
diff --git a/tools/testing/radix-tree/linux/gfp.h b/tools/testing/radix-tree/linux/gfp.h
index 701209816a6b..39a0dcb9475a 100644
--- a/tools/testing/radix-tree/linux/gfp.h
+++ b/tools/testing/radix-tree/linux/gfp.h
@@ -15,10 +15,12 @@
 #define __GFP_DIRECT_RECLAIM	0x400000u
 #define __GFP_KSWAPD_RECLAIM	0x2000000u
 
-#define __GFP_RECLAIM		(__GFP_DIRECT_RECLAIM|__GFP_KSWAPD_RECLAIM)
+#define __GFP_RECLAIM	(__GFP_DIRECT_RECLAIM|__GFP_KSWAPD_RECLAIM)
+
+#define GFP_ATOMIC	(__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
+#define GFP_KERNEL	(__GFP_RECLAIM | __GFP_IO | __GFP_FS)
+#define GFP_NOWAIT	(__GFP_KSWAPD_RECLAIM)
 
-#define GFP_ATOMIC		(__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
-#define GFP_KERNEL		(__GFP_RECLAIM | __GFP_IO | __GFP_FS)
 
 static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
 {
diff --git a/tools/testing/radix-tree/linux/idr.h b/tools/testing/radix-tree/linux/idr.h
new file mode 100644
index 000000000000..4e342f2e37cf
--- /dev/null
+++ b/tools/testing/radix-tree/linux/idr.h
@@ -0,0 +1 @@
+#include "../../../../include/linux/idr.h"
diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h
index dd1d9aefb14f..63fce553781a 100644
--- a/tools/testing/radix-tree/linux/kernel.h
+++ b/tools/testing/radix-tree/linux/kernel.h
@@ -20,6 +20,7 @@
 
 #define printk printf
 #define pr_debug printk
+#define pr_cont printk
 
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
 
diff --git a/tools/testing/radix-tree/main.c b/tools/testing/radix-tree/main.c
index f7e9801a6754..ddd90a11db3f 100644
--- a/tools/testing/radix-tree/main.c
+++ b/tools/testing/radix-tree/main.c
@@ -3,6 +3,7 @@
 #include <unistd.h>
 #include <time.h>
 #include <assert.h>
+#include <limits.h>
 
 #include <linux/slab.h>
 #include <linux/radix-tree.h>
@@ -314,6 +315,11 @@ static void single_thread_tests(bool long_run)
 	rcu_barrier();
 	printf("after dynamic_height_check: %d allocated, preempt %d\n",
 		nr_allocated, preempt_count);
+	idr_checks();
+	ida_checks();
+	rcu_barrier();
+	printf("after idr_checks: %d allocated, preempt %d\n",
+		nr_allocated, preempt_count);
 	big_gang_check(long_run);
 	rcu_barrier();
 	printf("after big_gang_check: %d allocated, preempt %d\n",
diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h
index 056a23b56467..b30e11d9d271 100644
--- a/tools/testing/radix-tree/test.h
+++ b/tools/testing/radix-tree/test.h
@@ -34,6 +34,8 @@ void tag_check(void);
 void multiorder_checks(void);
 void iteration_test(unsigned order, unsigned duration);
 void benchmark(void);
+void idr_checks(void);
+void ida_checks(void);
 
 struct item *
 item_tag_set(struct radix_tree_root *root, unsigned long index, int tag);
-- 
cgit v1.2.3


From 7ad3d4d85c7af9632055a6ac0aa15b6b6a321c6b Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <mawilcox@microsoft.com>
Date: Fri, 16 Dec 2016 11:55:56 -0500
Subject: ida: Move ida_bitmap to a percpu variable

When we preload the IDA, we allocate an IDA bitmap.  Instead of storing
that preallocated bitmap in the IDA, we store it in a percpu variable.
Generally there are more IDAs in the system than CPUs, so this cuts down
on the number of preallocated bitmaps that are unused, and about half
of the IDA users did not call ida_destroy() so they were leaking IDA
bitmaps.

Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
---
 include/linux/idr.h                     |  5 ++--
 lib/idr.c                               | 39 ++--------------------------
 lib/radix-tree.c                        | 45 ++++++++++++++++++++++++++++++---
 tools/testing/radix-tree/linux/kernel.h |  2 --
 tools/testing/radix-tree/linux/percpu.h |  5 +++-
 5 files changed, 51 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/idr.h b/include/linux/idr.h
index f58c0a3addc3..2027c7aba50d 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -14,6 +14,7 @@
 
 #include <linux/radix-tree.h>
 #include <linux/gfp.h>
+#include <linux/percpu.h>
 
 struct idr {
 	struct radix_tree_root	idr_rt;
@@ -171,9 +172,10 @@ struct ida_bitmap {
 	unsigned long		bitmap[IDA_BITMAP_LONGS];
 };
 
+DECLARE_PER_CPU(struct ida_bitmap *, ida_bitmap);
+
 struct ida {
 	struct radix_tree_root	ida_rt;
-	struct ida_bitmap	*free_bitmap;
 };
 
 #define IDA_INIT	{						\
@@ -193,7 +195,6 @@ void ida_simple_remove(struct ida *ida, unsigned int id);
 static inline void ida_init(struct ida *ida)
 {
 	INIT_RADIX_TREE(&ida->ida_rt, IDR_RT_MARKER | GFP_NOWAIT);
-	ida->free_bitmap = NULL;
 }
 
 /**
diff --git a/lib/idr.c b/lib/idr.c
index b87056e2cc4c..2abd7769c430 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -4,6 +4,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 
+DEFINE_PER_CPU(struct ida_bitmap *, ida_bitmap);
 static DEFINE_SPINLOCK(simple_ida_lock);
 
 /**
@@ -193,38 +194,6 @@ EXPORT_SYMBOL(idr_replace);
  * limitation, it should be quite straightforward to raise the maximum.
  */
 
-/**
- * ida_pre_get - reserve resources for ida allocation
- * @ida: ida handle
- * @gfp: memory allocation flags
- *
- * This function should be called before calling ida_get_new_above().  If it
- * is unable to allocate memory, it will return %0.  On success, it returns %1.
- */
-int ida_pre_get(struct ida *ida, gfp_t gfp)
-{
-	struct ida_bitmap *bitmap;
-
-	/*
-	 * This looks weird, but the IDA API has no preload_end() equivalent.
-	 * Instead, ida_get_new() can return -EAGAIN, prompting the caller
-	 * to return to the ida_pre_get() step.
-	 */
-	idr_preload(gfp);
-	idr_preload_end();
-
-	if (!ida->free_bitmap) {
-		bitmap = kmalloc(sizeof(struct ida_bitmap), gfp);
-		if (!bitmap)
-			return 0;
-		bitmap = xchg(&ida->free_bitmap, bitmap);
-		kfree(bitmap);
-	}
-
-	return 1;
-}
-EXPORT_SYMBOL(ida_pre_get);
-
 #define IDA_MAX (0x80000000U / IDA_BITMAP_BITS)
 
 /**
@@ -292,10 +261,9 @@ int ida_get_new_above(struct ida *ida, int start, int *id)
 			new += bit;
 			if (new < 0)
 				return -ENOSPC;
-			bitmap = ida->free_bitmap;
+			bitmap = this_cpu_xchg(ida_bitmap, NULL);
 			if (!bitmap)
 				return -EAGAIN;
-			ida->free_bitmap = NULL;
 			memset(bitmap, 0, sizeof(*bitmap));
 			__set_bit(bit, bitmap->bitmap);
 			radix_tree_iter_replace(root, &iter, slot, bitmap);
@@ -361,9 +329,6 @@ void ida_destroy(struct ida *ida)
 		kfree(bitmap);
 		radix_tree_iter_delete(&ida->ida_rt, &iter, slot);
 	}
-
-	kfree(ida->free_bitmap);
-	ida->free_bitmap = NULL;
 }
 EXPORT_SYMBOL(ida_destroy);
 
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index eaea14b8f2ca..7b9f8515033e 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -69,6 +69,14 @@ static struct kmem_cache *radix_tree_node_cachep;
 						RADIX_TREE_MAP_SHIFT))
 #define IDR_PRELOAD_SIZE	(IDR_MAX_PATH * 2 - 1)
 
+/*
+ * The IDA is even shorter since it uses a bitmap at the last level.
+ */
+#define IDA_INDEX_BITS		(8 * sizeof(int) - 1 - ilog2(IDA_BITMAP_BITS))
+#define IDA_MAX_PATH		(DIV_ROUND_UP(IDA_INDEX_BITS, \
+						RADIX_TREE_MAP_SHIFT))
+#define IDA_PRELOAD_SIZE	(IDA_MAX_PATH * 2 - 1)
+
 /*
  * Per-cpu pool of preloaded nodes
  */
@@ -346,9 +354,8 @@ static void dump_ida_node(void *entry, unsigned long index)
 static void ida_dump(struct ida *ida)
 {
 	struct radix_tree_root *root = &ida->ida_rt;
-	pr_debug("ida: %p %p free %d bitmap %p\n", ida, root->rnode,
-				root->gfp_mask >> ROOT_TAG_SHIFT,
-				ida->free_bitmap);
+	pr_debug("ida: %p node %p free %d\n", ida, root->rnode,
+				root->gfp_mask >> ROOT_TAG_SHIFT);
 	dump_ida_node(root->rnode, 0);
 }
 #endif
@@ -2080,6 +2087,36 @@ void idr_preload(gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(idr_preload);
 
+/**
+ * ida_pre_get - reserve resources for ida allocation
+ * @ida: ida handle
+ * @gfp: memory allocation flags
+ *
+ * This function should be called before calling ida_get_new_above().  If it
+ * is unable to allocate memory, it will return %0.  On success, it returns %1.
+ */
+int ida_pre_get(struct ida *ida, gfp_t gfp)
+{
+	__radix_tree_preload(gfp, IDA_PRELOAD_SIZE);
+	/*
+	 * The IDA API has no preload_end() equivalent.  Instead,
+	 * ida_get_new() can return -EAGAIN, prompting the caller
+	 * to return to the ida_pre_get() step.
+	 */
+	preempt_enable();
+
+	if (!this_cpu_read(ida_bitmap)) {
+		struct ida_bitmap *bitmap = kmalloc(sizeof(*bitmap), gfp);
+		if (!bitmap)
+			return 0;
+		bitmap = this_cpu_cmpxchg(ida_bitmap, NULL, bitmap);
+		kfree(bitmap);
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL(ida_pre_get);
+
 void **idr_get_free(struct radix_tree_root *root,
 			struct radix_tree_iter *iter, gfp_t gfp, int end)
 {
@@ -2219,6 +2256,8 @@ static int radix_tree_cpu_dead(unsigned int cpu)
 		kmem_cache_free(radix_tree_node_cachep, node);
 		rtp->nr--;
 	}
+	kfree(per_cpu(ida_bitmap, cpu));
+	per_cpu(ida_bitmap, cpu) = NULL;
 	return 0;
 }
 
diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h
index 63fce553781a..677b8c0f60f9 100644
--- a/tools/testing/radix-tree/linux/kernel.h
+++ b/tools/testing/radix-tree/linux/kernel.h
@@ -24,6 +24,4 @@
 
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
 
-#define xchg(ptr, x)	uatomic_xchg(ptr, x)
-
 #endif /* _KERNEL_H */
diff --git a/tools/testing/radix-tree/linux/percpu.h b/tools/testing/radix-tree/linux/percpu.h
index 5837f1d56f17..3ea01a1a88c2 100644
--- a/tools/testing/radix-tree/linux/percpu.h
+++ b/tools/testing/radix-tree/linux/percpu.h
@@ -1,7 +1,10 @@
-
+#define DECLARE_PER_CPU(type, val) extern type val
 #define DEFINE_PER_CPU(type, val) type val
 
 #define __get_cpu_var(var)	var
 #define this_cpu_ptr(var)	var
+#define this_cpu_read(var)	var
+#define this_cpu_xchg(var, val)		uatomic_xchg(&var, val)
+#define this_cpu_cmpxchg(var, old, new)	uatomic_cmpxchg(&var, old, new)
 #define per_cpu_ptr(ptr, cpu)   ({ (void)(cpu); (ptr); })
 #define per_cpu(var, cpu)	(*per_cpu_ptr(&(var), cpu))
-- 
cgit v1.2.3


From d3e709e63e97e5f3f129b639991cfe266da60bae Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <mawilcox@microsoft.com>
Date: Thu, 22 Dec 2016 13:30:22 -0500
Subject: idr: Return the deleted entry from idr_remove

It is a relatively common idiom (8 instances) to first look up an IDR
entry, and then remove it from the tree if it is found, possibly doing
further operations upon the entry afterwards.  If we change idr_remove()
to return the removed object, all of these users can save themselves a
walk of the IDR tree.

Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
---
 drivers/atm/nicstar.c                       |  5 ++---
 drivers/block/drbd/drbd_main.c              |  6 ++----
 drivers/firewire/core-cdev.c                |  3 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c     | 10 +++-------
 drivers/net/wireless/marvell/mwifiex/txrx.c |  4 +---
 drivers/target/target_core_user.c           |  4 +---
 include/linux/idr.h                         |  4 ++--
 net/mac80211/status.c                       |  4 +---
 9 files changed, 15 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/atm/nicstar.c b/drivers/atm/nicstar.c
index cb28579e8a94..d879f3bca107 100644
--- a/drivers/atm/nicstar.c
+++ b/drivers/atm/nicstar.c
@@ -1980,13 +1980,12 @@ static void dequeue_rx(ns_dev * card, ns_rsqe * rsqe)
 	card->lbfqc = ns_stat_lfbqc_get(stat);
 
 	id = le32_to_cpu(rsqe->buffer_handle);
-	skb = idr_find(&card->idr, id);
+	skb = idr_remove(&card->idr, id);
 	if (!skb) {
 		RXPRINTK(KERN_ERR
-			 "nicstar%d: idr_find() failed!\n", card->index);
+			 "nicstar%d: skb not found!\n", card->index);
 		return;
 	}
-	idr_remove(&card->idr, id);
 	dma_sync_single_for_cpu(&card->pcidev->dev,
 				NS_PRV_DMA(skb),
 				(NS_PRV_BUFTYPE(skb) == BUF_SM
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 83482721bc01..6bb3b80e7e51 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2915,11 +2915,9 @@ out_idr_remove_vol:
 	idr_remove(&connection->peer_devices, vnr);
 out_idr_remove_from_resource:
 	for_each_connection(connection, resource) {
-		peer_device = idr_find(&connection->peer_devices, vnr);
-		if (peer_device) {
-			idr_remove(&connection->peer_devices, vnr);
+		peer_device = idr_remove(&connection->peer_devices, vnr);
+		if (peer_device)
 			kref_put(&connection->kref, drbd_destroy_connection);
-		}
 	}
 	for_each_peer_device_safe(peer_device, tmp_peer_device, device) {
 		list_del(&peer_device->peer_devices);
diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c
index aee149bdf4c0..a301fcf46e88 100644
--- a/drivers/firewire/core-cdev.c
+++ b/drivers/firewire/core-cdev.c
@@ -1307,8 +1307,7 @@ static void iso_resource_work(struct work_struct *work)
 	 */
 	if (r->todo == ISO_RES_REALLOC && !success &&
 	    !client->in_shutdown &&
-	    idr_find(&client->resource_idr, r->resource.handle)) {
-		idr_remove(&client->resource_idr, r->resource.handle);
+	    idr_remove(&client->resource_idr, r->resource.handle)) {
 		client_put(client);
 		free = true;
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
index c02db01f6583..0218cea6be4d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
@@ -70,10 +70,10 @@ static void amdgpu_bo_list_destroy(struct amdgpu_fpriv *fpriv, int id)
 	struct amdgpu_bo_list *list;
 
 	mutex_lock(&fpriv->bo_list_lock);
-	list = idr_find(&fpriv->bo_list_handles, id);
+	list = idr_remove(&fpriv->bo_list_handles, id);
 	if (list) {
+		/* Another user may have a reference to this list still */
 		mutex_lock(&list->lock);
-		idr_remove(&fpriv->bo_list_handles, id);
 		mutex_unlock(&list->lock);
 		amdgpu_bo_list_free(list);
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index 400c66ba4c6b..cf0500671353 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -135,15 +135,11 @@ static int amdgpu_ctx_free(struct amdgpu_fpriv *fpriv, uint32_t id)
 	struct amdgpu_ctx *ctx;
 
 	mutex_lock(&mgr->lock);
-	ctx = idr_find(&mgr->ctx_handles, id);
-	if (ctx) {
-		idr_remove(&mgr->ctx_handles, id);
+	ctx = idr_remove(&mgr->ctx_handles, id);
+	if (ctx)
 		kref_put(&ctx->refcount, amdgpu_ctx_do_release);
-		mutex_unlock(&mgr->lock);
-		return 0;
-	}
 	mutex_unlock(&mgr->lock);
-	return -EINVAL;
+	return ctx ? 0 : -EINVAL;
 }
 
 static int amdgpu_ctx_query(struct amdgpu_device *adev,
diff --git a/drivers/net/wireless/marvell/mwifiex/txrx.c b/drivers/net/wireless/marvell/mwifiex/txrx.c
index abdd0cf710bf..fac28bd8fbee 100644
--- a/drivers/net/wireless/marvell/mwifiex/txrx.c
+++ b/drivers/net/wireless/marvell/mwifiex/txrx.c
@@ -346,9 +346,7 @@ void mwifiex_parse_tx_status_event(struct mwifiex_private *priv,
 		return;
 
 	spin_lock_irqsave(&priv->ack_status_lock, flags);
-	ack_skb = idr_find(&priv->ack_status_frames, tx_status->tx_token_id);
-	if (ack_skb)
-		idr_remove(&priv->ack_status_frames, tx_status->tx_token_id);
+	ack_skb = idr_remove(&priv->ack_status_frames, tx_status->tx_token_id);
 	spin_unlock_irqrestore(&priv->ack_status_lock, flags);
 
 	if (ack_skb) {
diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 8041710b6972..18f0ec2e1f9c 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -642,9 +642,7 @@ static unsigned int tcmu_handle_completions(struct tcmu_dev *udev)
 		WARN_ON(tcmu_hdr_get_op(entry->hdr.len_op) != TCMU_OP_CMD);
 
 		spin_lock(&udev->commands_lock);
-		cmd = idr_find(&udev->commands, entry->hdr.cmd_id);
-		if (cmd)
-			idr_remove(&udev->commands, cmd->cmd_id);
+		cmd = idr_remove(&udev->commands, entry->hdr.cmd_id);
 		spin_unlock(&udev->commands_lock);
 
 		if (!cmd) {
diff --git a/include/linux/idr.h b/include/linux/idr.h
index 2027c7aba50d..bf70b3ef0a07 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -88,9 +88,9 @@ void *idr_get_next(struct idr *, int *nextid);
 void *idr_replace(struct idr *, void *, int id);
 void idr_destroy(struct idr *);
 
-static inline void idr_remove(struct idr *idr, int id)
+static inline void *idr_remove(struct idr *idr, int id)
 {
-	radix_tree_delete(&idr->idr_rt, id);
+	return radix_tree_delete_item(&idr->idr_rt, id, NULL);
 }
 
 static inline void idr_init(struct idr *idr)
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index ddf71c648cab..43dd3316d8a4 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -462,9 +462,7 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local,
 	unsigned long flags;
 
 	spin_lock_irqsave(&local->ack_status_lock, flags);
-	skb = idr_find(&local->ack_status_frames, info->ack_frame_id);
-	if (skb)
-		idr_remove(&local->ack_status_frames, info->ack_frame_id);
+	skb = idr_remove(&local->ack_status_frames, info->ack_frame_id);
 	spin_unlock_irqrestore(&local->ack_status_lock, flags);
 
 	if (!skb)
-- 
cgit v1.2.3


From d58275bc96ae933b1b3888af80920dd6b020c505 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <mawilcox@microsoft.com>
Date: Mon, 16 Jan 2017 17:10:21 -0500
Subject: radix-tree: Store a pointer to the root in each node

Instead of having this mysterious private_data in each radix_tree_node,
store a pointer to the root, which can be useful for debugging.  This also
relieves the mm code from the duty of updating it.

Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
---
 include/linux/radix-tree.h |  2 +-
 lib/radix-tree.c           | 14 ++++++++------
 mm/workingset.c            |  6 ++----
 3 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 2ba0c1f46c84..d250059bbfa4 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -96,7 +96,7 @@ struct radix_tree_node {
 	unsigned char	count;		/* Total entry count */
 	unsigned char	exceptional;	/* Exceptional entry count */
 	struct radix_tree_node *parent;		/* Used when ascending tree */
-	void *private_data;			/* For tree user */
+	struct radix_tree_root *root;		/* The tree we belong to */
 	union {
 		struct list_head private_list;	/* For tree user */
 		struct rcu_head	rcu_head;	/* Used when freeing node */
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 66c71312c381..dcb9a2329e65 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -374,6 +374,7 @@ static void ida_dump(struct ida *ida)
  */
 static struct radix_tree_node *
 radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent,
+			struct radix_tree_root *root,
 			unsigned int shift, unsigned int offset,
 			unsigned int count, unsigned int exceptional)
 {
@@ -419,11 +420,12 @@ radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent,
 out:
 	BUG_ON(radix_tree_is_internal_node(ret));
 	if (ret) {
-		ret->parent = parent;
 		ret->shift = shift;
 		ret->offset = offset;
 		ret->count = count;
 		ret->exceptional = exceptional;
+		ret->parent = parent;
+		ret->root = root;
 	}
 	return ret;
 }
@@ -631,7 +633,7 @@ static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp,
 
 	do {
 		struct radix_tree_node *node = radix_tree_node_alloc(gfp, NULL,
-								shift, 0, 1, 0);
+							root, shift, 0, 1, 0);
 		if (!node)
 			return -ENOMEM;
 
@@ -828,7 +830,7 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
 		shift -= RADIX_TREE_MAP_SHIFT;
 		if (child == NULL) {
 			/* Have to add a child node.  */
-			child = radix_tree_node_alloc(gfp, node, shift,
+			child = radix_tree_node_alloc(gfp, node, root, shift,
 							offset, 0, 0);
 			if (!child)
 				return -ENOMEM;
@@ -1330,7 +1332,7 @@ int radix_tree_split(struct radix_tree_root *root, unsigned long index,
 
 	for (;;) {
 		if (node->shift > order) {
-			child = radix_tree_node_alloc(gfp, node,
+			child = radix_tree_node_alloc(gfp, node, root,
 					node->shift - RADIX_TREE_MAP_SHIFT,
 					offset, 0, 0);
 			if (!child)
@@ -2152,8 +2154,8 @@ void **idr_get_free(struct radix_tree_root *root,
 		shift -= RADIX_TREE_MAP_SHIFT;
 		if (child == NULL) {
 			/* Have to add a child node.  */
-			child = radix_tree_node_alloc(gfp, node, shift, offset,
-							0, 0);
+			child = radix_tree_node_alloc(gfp, node, root, shift,
+							offset, 0, 0);
 			if (!child)
 				return ERR_PTR(-ENOMEM);
 			all_tag_set(child, IDR_FREE);
diff --git a/mm/workingset.c b/mm/workingset.c
index abb58ffa3c64..80c913c89f11 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -354,10 +354,8 @@ void workingset_update_node(struct radix_tree_node *node, void *private)
 	 * as node->private_list is protected by &mapping->tree_lock.
 	 */
 	if (node->count && node->count == node->exceptional) {
-		if (list_empty(&node->private_list)) {
-			node->private_data = mapping;
+		if (list_empty(&node->private_list))
 			list_lru_add(&shadow_nodes, &node->private_list);
-		}
 	} else {
 		if (!list_empty(&node->private_list))
 			list_lru_del(&shadow_nodes, &node->private_list);
@@ -435,7 +433,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
 	 */
 
 	node = container_of(item, struct radix_tree_node, private_list);
-	mapping = node->private_data;
+	mapping = container_of(node->root, struct address_space, page_tree);
 
 	/* Coming from the list, invert the lock order */
 	if (!spin_trylock(&mapping->tree_lock)) {
-- 
cgit v1.2.3


From 12320d0ff1c9d5582f5c35e4bb8b9c70c475fd71 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <mawilcox@microsoft.com>
Date: Mon, 13 Feb 2017 15:22:48 -0500
Subject: radix-tree: Add rcu_dereference and rcu_assign_pointer calls

Some of these have been missing for many years.  Others were recently
introduced by me.  Fortunately, we have tools that help us find such
things.

Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
---
 include/linux/radix-tree.h |  2 +-
 lib/radix-tree.c           | 26 +++++++++++++++-----------
 2 files changed, 16 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index d250059bbfa4..0b502de7d23a 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -561,7 +561,7 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
 	return NULL;
 
  found:
-	if (unlikely(radix_tree_is_internal_node(*slot)))
+	if (unlikely(radix_tree_is_internal_node(rcu_dereference_raw(*slot))))
 		return __radix_tree_next_slot(slot, iter, flags);
 	return slot;
 }
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index c1c079ffadcd..723bebe40eef 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -627,7 +627,7 @@ static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp,
 	while (index > shift_maxindex(maxshift))
 		maxshift += RADIX_TREE_MAP_SHIFT;
 
-	slot = root->rnode;
+	slot = rcu_dereference_raw(root->rnode);
 	if (!slot && (!is_idr(root) || root_tag_get(root, IDR_FREE)))
 		goto out;
 
@@ -678,7 +678,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root,
 	bool shrunk = false;
 
 	for (;;) {
-		struct radix_tree_node *node = root->rnode;
+		struct radix_tree_node *node = rcu_dereference_raw(root->rnode);
 		struct radix_tree_node *child;
 
 		if (!radix_tree_is_internal_node(node))
@@ -692,7 +692,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root,
 		 */
 		if (node->count != 1)
 			break;
-		child = node->slots[0];
+		child = rcu_dereference_raw(node->slots[0]);
 		if (!child)
 			break;
 		if (!radix_tree_is_internal_node(child) && node->shift)
@@ -755,7 +755,8 @@ static bool delete_node(struct radix_tree_root *root,
 		struct radix_tree_node *parent;
 
 		if (node->count) {
-			if (node == entry_to_node(root->rnode))
+			if (node_to_entry(node) ==
+					rcu_dereference_raw(root->rnode))
 				deleted |= radix_tree_shrink(root, update_node,
 								private);
 			return deleted;
@@ -823,7 +824,7 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
 		if (error < 0)
 			return error;
 		shift = error;
-		child = root->rnode;
+		child = rcu_dereference_raw(root->rnode);
 	}
 
 	while (shift > order) {
@@ -868,7 +869,7 @@ static void radix_tree_free_nodes(struct radix_tree_node *node)
 	struct radix_tree_node *child = entry_to_node(node);
 
 	for (;;) {
-		void *entry = child->slots[offset];
+		void *entry = rcu_dereference_raw(child->slots[offset]);
 		if (radix_tree_is_internal_node(entry) &&
 					!is_sibling_entry(child, entry)) {
 			child = entry_to_node(entry);
@@ -925,7 +926,7 @@ static inline int insert_entries(struct radix_tree_node *node, void **slot,
 	}
 
 	for (i = 0; i < n; i++) {
-		struct radix_tree_node *old = slot[i];
+		struct radix_tree_node *old = rcu_dereference_raw(slot[i]);
 		if (i) {
 			rcu_assign_pointer(slot[i], child);
 			for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
@@ -1102,7 +1103,7 @@ static inline void replace_sibling_entries(struct radix_tree_node *node,
 	unsigned offset = get_slot_offset(node, slot) + 1;
 
 	while (offset < RADIX_TREE_MAP_SIZE) {
-		if (node->slots[offset] != ptr)
+		if (rcu_dereference_raw(node->slots[offset]) != ptr)
 			break;
 		if (count < 0) {
 			node->slots[offset] = NULL;
@@ -1308,7 +1309,8 @@ int radix_tree_split(struct radix_tree_root *root, unsigned long index,
 			tags |= 1 << tag;
 
 	for (end = offset + 1; end < RADIX_TREE_MAP_SIZE; end++) {
-		if (!is_sibling_entry(parent, parent->slots[end]))
+		if (!is_sibling_entry(parent,
+				rcu_dereference_raw(parent->slots[end])))
 			break;
 		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
 			if (tags & (1 << tag))
@@ -1339,7 +1341,8 @@ int radix_tree_split(struct radix_tree_root *root, unsigned long index,
 				goto nomem;
 			if (node != parent) {
 				node->count++;
-				node->slots[offset] = node_to_entry(child);
+				rcu_assign_pointer(node->slots[offset],
+							node_to_entry(child));
 				for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
 					if (tags & (1 << tag))
 						tag_set(node, tag, offset);
@@ -1755,7 +1758,8 @@ void **radix_tree_next_chunk(const struct radix_tree_root *root,
 						offset + 1);
 			else
 				while (++offset	< RADIX_TREE_MAP_SIZE) {
-					void *slot = node->slots[offset];
+					void *slot = rcu_dereference_raw(
+							node->slots[offset]);
 					if (is_sibling_entry(node, slot))
 						continue;
 					if (slot)
-- 
cgit v1.2.3


From d7b627277b57370223d682cede979a279284b12a Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <mawilcox@microsoft.com>
Date: Mon, 13 Feb 2017 15:58:24 -0500
Subject: radix-tree: Fix __rcu annotations

Many places were missing __rcu annotations.  A few places needed a few
lines of explanation about why it was safe to not use RCU accessors.
Add a custom CFLAGS setting to the Makefile to ensure that new patches
don't miss RCU annotations.

Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
---
 include/linux/radix-tree.h | 110 ++++++++++++++++++++-------------------
 lib/Makefile               |   2 +
 lib/radix-tree.c           | 125 ++++++++++++++++++++++++---------------------
 3 files changed, 122 insertions(+), 115 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 0b502de7d23a..3e5735064b71 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -221,10 +221,8 @@ static inline unsigned int iter_shift(const struct radix_tree_iter *iter)
  */
 
 /**
- * radix_tree_deref_slot	- dereference a slot
- * @pslot:	pointer to slot, returned by radix_tree_lookup_slot
- * Returns:	item that was stored in that slot with any direct pointer flag
- *		removed.
+ * radix_tree_deref_slot - dereference a slot
+ * @slot: slot pointer, returned by radix_tree_lookup_slot
  *
  * For use with radix_tree_lookup_slot().  Caller must hold tree at least read
  * locked across slot lookup and dereference. Not required if write lock is
@@ -232,26 +230,27 @@ static inline unsigned int iter_shift(const struct radix_tree_iter *iter)
  *
  * radix_tree_deref_retry must be used to confirm validity of the pointer if
  * only the read lock is held.
+ *
+ * Return: entry stored in that slot.
  */
-static inline void *radix_tree_deref_slot(void **pslot)
+static inline void *radix_tree_deref_slot(void __rcu **slot)
 {
-	return rcu_dereference(*pslot);
+	return rcu_dereference(*slot);
 }
 
 /**
- * radix_tree_deref_slot_protected	- dereference a slot without RCU lock but with tree lock held
- * @pslot:	pointer to slot, returned by radix_tree_lookup_slot
- * Returns:	item that was stored in that slot with any direct pointer flag
- *		removed.
- *
- * Similar to radix_tree_deref_slot but only used during migration when a pages
- * mapping is being moved. The caller does not hold the RCU read lock but it
- * must hold the tree lock to prevent parallel updates.
+ * radix_tree_deref_slot_protected - dereference a slot with tree lock held
+ * @slot: slot pointer, returned by radix_tree_lookup_slot
+ *
+ * Similar to radix_tree_deref_slot.  The caller does not hold the RCU read
+ * lock but it must hold the tree lock to prevent parallel updates.
+ *
+ * Return: entry stored in that slot.
  */
-static inline void *radix_tree_deref_slot_protected(void **pslot,
+static inline void *radix_tree_deref_slot_protected(void __rcu **slot,
 							spinlock_t *treelock)
 {
-	return rcu_dereference_protected(*pslot, lockdep_is_held(treelock));
+	return rcu_dereference_protected(*slot, lockdep_is_held(treelock));
 }
 
 /**
@@ -287,9 +286,9 @@ static inline int radix_tree_exception(void *arg)
 	return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK);
 }
 
-int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
+int __radix_tree_create(struct radix_tree_root *, unsigned long index,
 			unsigned order, struct radix_tree_node **nodep,
-			void ***slotp);
+			void __rcu ***slotp);
 int __radix_tree_insert(struct radix_tree_root *, unsigned long index,
 			unsigned order, void *);
 static inline int radix_tree_insert(struct radix_tree_root *root,
@@ -298,42 +297,41 @@ static inline int radix_tree_insert(struct radix_tree_root *root,
 	return __radix_tree_insert(root, index, 0, entry);
 }
 void *__radix_tree_lookup(const struct radix_tree_root *, unsigned long index,
-			  struct radix_tree_node **nodep, void ***slotp);
+			  struct radix_tree_node **nodep, void __rcu ***slotp);
 void *radix_tree_lookup(const struct radix_tree_root *, unsigned long);
-void **radix_tree_lookup_slot(const struct radix_tree_root *, unsigned long);
+void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *,
+					unsigned long index);
 typedef void (*radix_tree_update_node_t)(struct radix_tree_node *, void *);
-void __radix_tree_replace(struct radix_tree_root *root,
-			  struct radix_tree_node *node,
-			  void **slot, void *item,
+void __radix_tree_replace(struct radix_tree_root *, struct radix_tree_node *,
+			  void __rcu **slot, void *entry,
 			  radix_tree_update_node_t update_node, void *private);
 void radix_tree_iter_replace(struct radix_tree_root *,
-		const struct radix_tree_iter *, void **slot, void *item);
-void radix_tree_replace_slot(struct radix_tree_root *root,
-			     void **slot, void *item);
-void __radix_tree_delete_node(struct radix_tree_root *root,
-			      struct radix_tree_node *node,
+		const struct radix_tree_iter *, void __rcu **slot, void *entry);
+void radix_tree_replace_slot(struct radix_tree_root *,
+			     void __rcu **slot, void *entry);
+void __radix_tree_delete_node(struct radix_tree_root *,
+			      struct radix_tree_node *,
 			      radix_tree_update_node_t update_node,
 			      void *private);
 void radix_tree_iter_delete(struct radix_tree_root *,
-				struct radix_tree_iter *iter, void **slot);
+			struct radix_tree_iter *iter, void __rcu **slot);
 void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
 void *radix_tree_delete(struct radix_tree_root *, unsigned long);
-void radix_tree_clear_tags(struct radix_tree_root *root,
-			   struct radix_tree_node *node,
-			   void **slot);
+void radix_tree_clear_tags(struct radix_tree_root *, struct radix_tree_node *,
+			   void __rcu **slot);
 unsigned int radix_tree_gang_lookup(const struct radix_tree_root *,
 			void **results, unsigned long first_index,
 			unsigned int max_items);
 unsigned int radix_tree_gang_lookup_slot(const struct radix_tree_root *,
-			void ***results, unsigned long *indices,
+			void __rcu ***results, unsigned long *indices,
 			unsigned long first_index, unsigned int max_items);
 int radix_tree_preload(gfp_t gfp_mask);
 int radix_tree_maybe_preload(gfp_t gfp_mask);
 int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order);
 void radix_tree_init(void);
-void *radix_tree_tag_set(struct radix_tree_root *root,
+void *radix_tree_tag_set(struct radix_tree_root *,
 			unsigned long index, unsigned int tag);
-void *radix_tree_tag_clear(struct radix_tree_root *root,
+void *radix_tree_tag_clear(struct radix_tree_root *,
 			unsigned long index, unsigned int tag);
 int radix_tree_tag_get(const struct radix_tree_root *,
 			unsigned long index, unsigned int tag);
@@ -341,15 +339,13 @@ void radix_tree_iter_tag_set(struct radix_tree_root *,
 		const struct radix_tree_iter *iter, unsigned int tag);
 void radix_tree_iter_tag_clear(struct radix_tree_root *,
 		const struct radix_tree_iter *iter, unsigned int tag);
-unsigned int
-radix_tree_gang_lookup_tag(const struct radix_tree_root *, void **results,
-		unsigned long first_index, unsigned int max_items,
-		unsigned int tag);
-unsigned int
-radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *, void ***results,
-		unsigned long first_index, unsigned int max_items,
-		unsigned int tag);
-int radix_tree_tagged(const struct radix_tree_root *root, unsigned int tag);
+unsigned int radix_tree_gang_lookup_tag(const struct radix_tree_root *,
+		void **results, unsigned long first_index,
+		unsigned int max_items, unsigned int tag);
+unsigned int radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *,
+		void __rcu ***results, unsigned long first_index,
+		unsigned int max_items, unsigned int tag);
+int radix_tree_tagged(const struct radix_tree_root *, unsigned int tag);
 
 static inline void radix_tree_preload_end(void)
 {
@@ -361,7 +357,7 @@ int radix_tree_split(struct radix_tree_root *, unsigned long index,
 			unsigned new_order);
 int radix_tree_join(struct radix_tree_root *, unsigned long index,
 			unsigned new_order, void *);
-void **idr_get_free(struct radix_tree_root *, struct radix_tree_iter *,
+void __rcu **idr_get_free(struct radix_tree_root *, struct radix_tree_iter *,
 			gfp_t, int end);
 
 enum {
@@ -377,7 +373,7 @@ enum {
  * @start:	iteration starting index
  * Returns:	NULL
  */
-static __always_inline void **
+static __always_inline void __rcu **
 radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start)
 {
 	/*
@@ -406,7 +402,7 @@ radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start)
  * Also it fills @iter with data about chunk: position in the tree (index),
  * its end (next_index), and constructs a bit mask for tagged iterating (tags).
  */
-void **radix_tree_next_chunk(const struct radix_tree_root *,
+void __rcu **radix_tree_next_chunk(const struct radix_tree_root *,
 			     struct radix_tree_iter *iter, unsigned flags);
 
 /**
@@ -419,7 +415,8 @@ void **radix_tree_next_chunk(const struct radix_tree_root *,
  * containing it and updates @iter to describe the entry.  If @index is not
  * present, it returns NULL.
  */
-static inline void **radix_tree_iter_lookup(const struct radix_tree_root *root,
+static inline void __rcu **
+radix_tree_iter_lookup(const struct radix_tree_root *root,
 			struct radix_tree_iter *iter, unsigned long index)
 {
 	radix_tree_iter_init(iter, index);
@@ -436,7 +433,8 @@ static inline void **radix_tree_iter_lookup(const struct radix_tree_root *root,
  * which is at least @index.  If @index is larger than any present entry, this
  * function returns NULL.  The @iter is updated to describe the entry found.
  */
-static inline void **radix_tree_iter_find(const struct radix_tree_root *root,
+static inline void __rcu **
+radix_tree_iter_find(const struct radix_tree_root *root,
 			struct radix_tree_iter *iter, unsigned long index)
 {
 	radix_tree_iter_init(iter, index);
@@ -453,7 +451,7 @@ static inline void **radix_tree_iter_find(const struct radix_tree_root *root,
  * and continue the iteration.
  */
 static inline __must_check
-void **radix_tree_iter_retry(struct radix_tree_iter *iter)
+void __rcu **radix_tree_iter_retry(struct radix_tree_iter *iter)
 {
 	iter->next_index = iter->index;
 	iter->tags = 0;
@@ -476,7 +474,7 @@ __radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots)
  * have been invalidated by an insertion or deletion.  Call this function
  * before releasing the lock to continue the iteration from the next index.
  */
-void **__must_check radix_tree_iter_resume(void **slot,
+void __rcu **__must_check radix_tree_iter_resume(void __rcu **slot,
 					struct radix_tree_iter *iter);
 
 /**
@@ -492,11 +490,11 @@ radix_tree_chunk_size(struct radix_tree_iter *iter)
 }
 
 #ifdef CONFIG_RADIX_TREE_MULTIORDER
-void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter,
-				unsigned flags);
+void __rcu **__radix_tree_next_slot(void __rcu **slot,
+				struct radix_tree_iter *iter, unsigned flags);
 #else
 /* Can't happen without sibling entries, but the compiler can't tell that */
-static inline void ** __radix_tree_next_slot(void **slot,
+static inline void __rcu **__radix_tree_next_slot(void __rcu **slot,
 				struct radix_tree_iter *iter, unsigned flags)
 {
 	return slot;
@@ -522,8 +520,8 @@ static inline void ** __radix_tree_next_slot(void **slot,
  * b) we are doing non-tagged iteration, and iter->index and iter->next_index
  *    have been set up so that radix_tree_chunk_size() returns 1 or 0.
  */
-static __always_inline void **
-radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
+static __always_inline void __rcu **radix_tree_next_slot(void __rcu **slot,
+				struct radix_tree_iter *iter, unsigned flags)
 {
 	if (flags & RADIX_TREE_ITER_TAGGED) {
 		iter->tags >>= 1;
diff --git a/lib/Makefile b/lib/Makefile
index bc4073a8cd08..2fc096985b21 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -24,6 +24,8 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
 	 earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o win_minmax.o
 
+CFLAGS_radix-tree.o += -DCONFIG_SPARSE_RCU_POINTER
+
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
 lib-$(CONFIG_HAS_DMA) += dma-noop.o
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 723bebe40eef..9c0fa4df736b 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -104,7 +104,7 @@ static inline void *node_to_entry(void *ptr)
 static inline
 bool is_sibling_entry(const struct radix_tree_node *parent, void *node)
 {
-	void **ptr = node;
+	void __rcu **ptr = node;
 	return (parent->slots <= ptr) &&
 			(ptr < parent->slots + RADIX_TREE_MAP_SIZE);
 }
@@ -116,8 +116,8 @@ bool is_sibling_entry(const struct radix_tree_node *parent, void *node)
 }
 #endif
 
-static inline
-unsigned long get_slot_offset(const struct radix_tree_node *parent, void **slot)
+static inline unsigned long
+get_slot_offset(const struct radix_tree_node *parent, void __rcu **slot)
 {
 	return slot - parent->slots;
 }
@@ -126,12 +126,13 @@ static unsigned int radix_tree_descend(const struct radix_tree_node *parent,
 			struct radix_tree_node **nodep, unsigned long index)
 {
 	unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK;
-	void **entry = rcu_dereference_raw(parent->slots[offset]);
+	void __rcu **entry = rcu_dereference_raw(parent->slots[offset]);
 
 #ifdef CONFIG_RADIX_TREE_MULTIORDER
 	if (radix_tree_is_internal_node(entry)) {
 		if (is_sibling_entry(parent, entry)) {
-			void **sibentry = (void **) entry_to_node(entry);
+			void __rcu **sibentry;
+			sibentry = (void __rcu **) entry_to_node(entry);
 			offset = get_slot_offset(parent, sibentry);
 			entry = rcu_dereference_raw(*sibentry);
 		}
@@ -618,7 +619,7 @@ static unsigned radix_tree_load_root(const struct radix_tree_root *root,
 static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp,
 				unsigned long index, unsigned int shift)
 {
-	struct radix_tree_node *slot;
+	void *entry;
 	unsigned int maxshift;
 	int tag;
 
@@ -627,8 +628,8 @@ static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp,
 	while (index > shift_maxindex(maxshift))
 		maxshift += RADIX_TREE_MAP_SHIFT;
 
-	slot = rcu_dereference_raw(root->rnode);
-	if (!slot && (!is_idr(root) || root_tag_get(root, IDR_FREE)))
+	entry = rcu_dereference_raw(root->rnode);
+	if (!entry && (!is_idr(root) || root_tag_get(root, IDR_FREE)))
 		goto out;
 
 	do {
@@ -652,15 +653,19 @@ static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp,
 		}
 
 		BUG_ON(shift > BITS_PER_LONG);
-		if (radix_tree_is_internal_node(slot)) {
-			entry_to_node(slot)->parent = node;
-		} else if (radix_tree_exceptional_entry(slot)) {
+		if (radix_tree_is_internal_node(entry)) {
+			entry_to_node(entry)->parent = node;
+		} else if (radix_tree_exceptional_entry(entry)) {
 			/* Moving an exceptional root->rnode to a node */
 			node->exceptional = 1;
 		}
-		node->slots[0] = slot;
-		slot = node_to_entry(node);
-		rcu_assign_pointer(root->rnode, slot);
+		/*
+		 * entry was already in the radix tree, so we do not need
+		 * rcu_assign_pointer here
+		 */
+		node->slots[0] = (void __rcu *)entry;
+		entry = node_to_entry(node);
+		rcu_assign_pointer(root->rnode, entry);
 		shift += RADIX_TREE_MAP_SHIFT;
 	} while (shift <= maxshift);
 out:
@@ -708,7 +713,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root,
 		 * (node->slots[0]), it will be safe to dereference the new
 		 * one (root->rnode) as far as dependent read barriers go.
 		 */
-		root->rnode = child;
+		root->rnode = (void __rcu *)child;
 		if (is_idr(root) && !tag_get(node, IDR_FREE, 0))
 			root_tag_clear(root, IDR_FREE);
 
@@ -732,7 +737,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root,
 		 */
 		node->count = 0;
 		if (!radix_tree_is_internal_node(child)) {
-			node->slots[0] = RADIX_TREE_RETRY;
+			node->slots[0] = (void __rcu *)RADIX_TREE_RETRY;
 			if (update_node)
 				update_node(node, private);
 		}
@@ -805,10 +810,10 @@ static bool delete_node(struct radix_tree_root *root,
  */
 int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
 			unsigned order, struct radix_tree_node **nodep,
-			void ***slotp)
+			void __rcu ***slotp)
 {
 	struct radix_tree_node *node = NULL, *child;
-	void **slot = (void **)&root->rnode;
+	void __rcu **slot = (void __rcu **)&root->rnode;
 	unsigned long maxindex;
 	unsigned int shift, offset = 0;
 	unsigned long max = index | ((1UL << order) - 1);
@@ -890,8 +895,8 @@ static void radix_tree_free_nodes(struct radix_tree_node *node)
 }
 
 #ifdef CONFIG_RADIX_TREE_MULTIORDER
-static inline int insert_entries(struct radix_tree_node *node, void **slot,
-				void *item, unsigned order, bool replace)
+static inline int insert_entries(struct radix_tree_node *node,
+		void __rcu **slot, void *item, unsigned order, bool replace)
 {
 	struct radix_tree_node *child;
 	unsigned i, n, tag, offset, tags = 0;
@@ -953,8 +958,8 @@ static inline int insert_entries(struct radix_tree_node *node, void **slot,
 	return n;
 }
 #else
-static inline int insert_entries(struct radix_tree_node *node, void **slot,
-				void *item, unsigned order, bool replace)
+static inline int insert_entries(struct radix_tree_node *node,
+		void __rcu **slot, void *item, unsigned order, bool replace)
 {
 	if (*slot)
 		return -EEXIST;
@@ -981,7 +986,7 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
 			unsigned order, void *item)
 {
 	struct radix_tree_node *node;
-	void **slot;
+	void __rcu **slot;
 	int error;
 
 	BUG_ON(radix_tree_is_internal_node(item));
@@ -1023,15 +1028,15 @@ EXPORT_SYMBOL(__radix_tree_insert);
  */
 void *__radix_tree_lookup(const struct radix_tree_root *root,
 			  unsigned long index, struct radix_tree_node **nodep,
-			  void ***slotp)
+			  void __rcu ***slotp)
 {
 	struct radix_tree_node *node, *parent;
 	unsigned long maxindex;
-	void **slot;
+	void __rcu **slot;
 
  restart:
 	parent = NULL;
-	slot = (void **)&root->rnode;
+	slot = (void __rcu **)&root->rnode;
 	radix_tree_load_root(root, &node, &maxindex);
 	if (index > maxindex)
 		return NULL;
@@ -1066,10 +1071,10 @@ void *__radix_tree_lookup(const struct radix_tree_root *root,
  *	exclusive from other writers. Any dereference of the slot must be done
  *	using radix_tree_deref_slot.
  */
-void **radix_tree_lookup_slot(const struct radix_tree_root *root,
+void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *root,
 				unsigned long index)
 {
-	void **slot;
+	void __rcu **slot;
 
 	if (!__radix_tree_lookup(root, index, NULL, &slot))
 		return NULL;
@@ -1096,7 +1101,7 @@ void *radix_tree_lookup(const struct radix_tree_root *root, unsigned long index)
 EXPORT_SYMBOL(radix_tree_lookup);
 
 static inline void replace_sibling_entries(struct radix_tree_node *node,
-				void **slot, int count, int exceptional)
+				void __rcu **slot, int count, int exceptional)
 {
 #ifdef CONFIG_RADIX_TREE_MULTIORDER
 	void *ptr = node_to_entry(slot);
@@ -1115,8 +1120,8 @@ static inline void replace_sibling_entries(struct radix_tree_node *node,
 #endif
 }
 
-static void replace_slot(void **slot, void *item, struct radix_tree_node *node,
-				int count, int exceptional)
+static void replace_slot(void __rcu **slot, void *item,
+		struct radix_tree_node *node, int count, int exceptional)
 {
 	if (WARN_ON_ONCE(radix_tree_is_internal_node(item)))
 		return;
@@ -1147,7 +1152,7 @@ static bool node_tag_get(const struct radix_tree_root *root,
  * deleted.
  */
 static int calculate_count(struct radix_tree_root *root,
-				struct radix_tree_node *node, void **slot,
+				struct radix_tree_node *node, void __rcu **slot,
 				void *item, void *old)
 {
 	if (is_idr(root)) {
@@ -1175,7 +1180,7 @@ static int calculate_count(struct radix_tree_root *root,
  */
 void __radix_tree_replace(struct radix_tree_root *root,
 			  struct radix_tree_node *node,
-			  void **slot, void *item,
+			  void __rcu **slot, void *item,
 			  radix_tree_update_node_t update_node, void *private)
 {
 	void *old = rcu_dereference_raw(*slot);
@@ -1188,7 +1193,7 @@ void __radix_tree_replace(struct radix_tree_root *root,
 	 * deleting entries, but that needs accounting against the
 	 * node unless the slot is root->rnode.
 	 */
-	WARN_ON_ONCE(!node && (slot != (void **)&root->rnode) &&
+	WARN_ON_ONCE(!node && (slot != (void __rcu **)&root->rnode) &&
 			(count || exceptional));
 	replace_slot(slot, item, node, count, exceptional);
 
@@ -1218,7 +1223,7 @@ void __radix_tree_replace(struct radix_tree_root *root,
  * radix_tree_iter_replace().
  */
 void radix_tree_replace_slot(struct radix_tree_root *root,
-			     void **slot, void *item)
+			     void __rcu **slot, void *item)
 {
 	__radix_tree_replace(root, NULL, slot, item, NULL, NULL);
 }
@@ -1233,7 +1238,8 @@ void radix_tree_replace_slot(struct radix_tree_root *root,
  * Caller must hold tree write locked across split and replacement.
  */
 void radix_tree_iter_replace(struct radix_tree_root *root,
-		const struct radix_tree_iter *iter, void **slot, void *item)
+				const struct radix_tree_iter *iter,
+				void __rcu **slot, void *item)
 {
 	__radix_tree_replace(root, iter->node, slot, item, NULL, NULL);
 }
@@ -1257,7 +1263,7 @@ int radix_tree_join(struct radix_tree_root *root, unsigned long index,
 			unsigned order, void *item)
 {
 	struct radix_tree_node *node;
-	void **slot;
+	void __rcu **slot;
 	int error;
 
 	BUG_ON(radix_tree_is_internal_node(item));
@@ -1292,7 +1298,7 @@ int radix_tree_split(struct radix_tree_root *root, unsigned long index,
 				unsigned order)
 {
 	struct radix_tree_node *parent, *node, *child;
-	void **slot;
+	void __rcu **slot;
 	unsigned int offset, end;
 	unsigned n, tag, tags = 0;
 	gfp_t gfp = root_gfp_mask(root);
@@ -1603,8 +1609,8 @@ static void set_iter_tags(struct radix_tree_iter *iter,
 }
 
 #ifdef CONFIG_RADIX_TREE_MULTIORDER
-static void **skip_siblings(struct radix_tree_node **nodep,
-			void **slot, struct radix_tree_iter *iter)
+static void __rcu **skip_siblings(struct radix_tree_node **nodep,
+			void __rcu **slot, struct radix_tree_iter *iter)
 {
 	void *sib = node_to_entry(slot - 1);
 
@@ -1621,8 +1627,8 @@ static void **skip_siblings(struct radix_tree_node **nodep,
 	return NULL;
 }
 
-void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter,
-					unsigned flags)
+void __rcu **__radix_tree_next_slot(void __rcu **slot,
+				struct radix_tree_iter *iter, unsigned flags)
 {
 	unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
 	struct radix_tree_node *node = rcu_dereference_raw(*slot);
@@ -1675,14 +1681,15 @@ void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter,
 }
 EXPORT_SYMBOL(__radix_tree_next_slot);
 #else
-static void **skip_siblings(struct radix_tree_node **nodep,
-			void **slot, struct radix_tree_iter *iter)
+static void __rcu **skip_siblings(struct radix_tree_node **nodep,
+			void __rcu **slot, struct radix_tree_iter *iter)
 {
 	return slot;
 }
 #endif
 
-void **radix_tree_iter_resume(void **slot, struct radix_tree_iter *iter)
+void __rcu **radix_tree_iter_resume(void __rcu **slot,
+					struct radix_tree_iter *iter)
 {
 	struct radix_tree_node *node;
 
@@ -1703,7 +1710,7 @@ EXPORT_SYMBOL(radix_tree_iter_resume);
  * @flags:	RADIX_TREE_ITER_* flags and tag index
  * Returns:	pointer to chunk first slot, or NULL if iteration is over
  */
-void **radix_tree_next_chunk(const struct radix_tree_root *root,
+void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root,
 			     struct radix_tree_iter *iter, unsigned flags)
 {
 	unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
@@ -1740,7 +1747,7 @@ void **radix_tree_next_chunk(const struct radix_tree_root *root,
 		iter->tags = 1;
 		iter->node = NULL;
 		__set_iter_shift(iter, 0);
-		return (void **)&root->rnode;
+		return (void __rcu **)&root->rnode;
 	}
 
 	do {
@@ -1819,7 +1826,7 @@ radix_tree_gang_lookup(const struct radix_tree_root *root, void **results,
 			unsigned long first_index, unsigned int max_items)
 {
 	struct radix_tree_iter iter;
-	void **slot;
+	void __rcu **slot;
 	unsigned int ret = 0;
 
 	if (unlikely(!max_items))
@@ -1861,11 +1868,11 @@ EXPORT_SYMBOL(radix_tree_gang_lookup);
  */
 unsigned int
 radix_tree_gang_lookup_slot(const struct radix_tree_root *root,
-			void ***results, unsigned long *indices,
+			void __rcu ***results, unsigned long *indices,
 			unsigned long first_index, unsigned int max_items)
 {
 	struct radix_tree_iter iter;
-	void **slot;
+	void __rcu **slot;
 	unsigned int ret = 0;
 
 	if (unlikely(!max_items))
@@ -1902,7 +1909,7 @@ radix_tree_gang_lookup_tag(const struct radix_tree_root *root, void **results,
 		unsigned int tag)
 {
 	struct radix_tree_iter iter;
-	void **slot;
+	void __rcu **slot;
 	unsigned int ret = 0;
 
 	if (unlikely(!max_items))
@@ -1939,11 +1946,11 @@ EXPORT_SYMBOL(radix_tree_gang_lookup_tag);
  */
 unsigned int
 radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *root,
-		void ***results, unsigned long first_index,
+		void __rcu ***results, unsigned long first_index,
 		unsigned int max_items, unsigned int tag)
 {
 	struct radix_tree_iter iter;
-	void **slot;
+	void __rcu **slot;
 	unsigned int ret = 0;
 
 	if (unlikely(!max_items))
@@ -1979,7 +1986,7 @@ void __radix_tree_delete_node(struct radix_tree_root *root,
 }
 
 static bool __radix_tree_delete(struct radix_tree_root *root,
-				struct radix_tree_node *node, void **slot)
+				struct radix_tree_node *node, void __rcu **slot)
 {
 	void *old = rcu_dereference_raw(*slot);
 	int exceptional = radix_tree_exceptional_entry(old) ? -1 : 0;
@@ -2009,7 +2016,7 @@ static bool __radix_tree_delete(struct radix_tree_root *root,
  * which can access this tree.
  */
 void radix_tree_iter_delete(struct radix_tree_root *root,
-				struct radix_tree_iter *iter, void **slot)
+				struct radix_tree_iter *iter, void __rcu **slot)
 {
 	if (__radix_tree_delete(root, iter->node, slot))
 		iter->index = iter->next_index;
@@ -2030,7 +2037,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
 			     unsigned long index, void *item)
 {
 	struct radix_tree_node *node = NULL;
-	void **slot;
+	void __rcu **slot;
 	void *entry;
 
 	entry = __radix_tree_lookup(root, index, &node, &slot);
@@ -2064,7 +2071,7 @@ EXPORT_SYMBOL(radix_tree_delete);
 
 void radix_tree_clear_tags(struct radix_tree_root *root,
 			   struct radix_tree_node *node,
-			   void **slot)
+			   void __rcu **slot)
 {
 	if (node) {
 		unsigned int tag, offset = get_slot_offset(node, slot);
@@ -2129,11 +2136,11 @@ int ida_pre_get(struct ida *ida, gfp_t gfp)
 }
 EXPORT_SYMBOL(ida_pre_get);
 
-void **idr_get_free(struct radix_tree_root *root,
+void __rcu **idr_get_free(struct radix_tree_root *root,
 			struct radix_tree_iter *iter, gfp_t gfp, int end)
 {
 	struct radix_tree_node *node = NULL, *child;
-	void **slot = (void **)&root->rnode;
+	void __rcu **slot = (void __rcu **)&root->rnode;
 	unsigned long maxindex, start = iter->next_index;
 	unsigned long max = end > 0 ? end - 1 : INT_MAX;
 	unsigned int shift, offset = 0;
-- 
cgit v1.2.3


From 40137906c5f55c252194ef5834130383e639536f Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 11 Feb 2017 19:26:47 +0800
Subject: rhashtable: Add nested tables

This patch adds code that handles GFP_ATOMIC kmalloc failure on
insertion.  As we cannot use vmalloc, we solve it by making our
hash table nested.  That is, we allocate single pages at each level
and reach our desired table size by nesting them.

When a nested table is created, only a single page is allocated
at the top-level.  Lower levels are allocated on demand during
insertion.  Therefore for each insertion to succeed, only two
(non-consecutive) pages are needed.

After a nested table is created, a rehash will be scheduled in
order to switch to a vmalloced table as soon as possible.  Also,
the rehash code will never rehash into a nested table.  If we
detect a nested table during a rehash, the rehash will be aborted
and a new rehash will be scheduled.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable.h |  78 +++++++++----
 lib/rhashtable.c           | 270 ++++++++++++++++++++++++++++++++++++---------
 2 files changed, 276 insertions(+), 72 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 5c132d3188be..f2e12a845910 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -61,6 +61,7 @@ struct rhlist_head {
 /**
  * struct bucket_table - Table of hash buckets
  * @size: Number of hash buckets
+ * @nest: Number of bits of first-level nested table.
  * @rehash: Current bucket being rehashed
  * @hash_rnd: Random seed to fold into hash
  * @locks_mask: Mask to apply before accessing locks[]
@@ -68,10 +69,12 @@ struct rhlist_head {
  * @walkers: List of active walkers
  * @rcu: RCU structure for freeing the table
  * @future_tbl: Table under construction during rehashing
+ * @ntbl: Nested table used when out of memory.
  * @buckets: size * hash buckets
  */
 struct bucket_table {
 	unsigned int		size;
+	unsigned int		nest;
 	unsigned int		rehash;
 	u32			hash_rnd;
 	unsigned int		locks_mask;
@@ -81,7 +84,7 @@ struct bucket_table {
 
 	struct bucket_table __rcu *future_tbl;
 
-	struct rhash_head __rcu	*buckets[] ____cacheline_aligned_in_smp;
+	struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp;
 };
 
 /**
@@ -374,6 +377,12 @@ void rhashtable_free_and_destroy(struct rhashtable *ht,
 				 void *arg);
 void rhashtable_destroy(struct rhashtable *ht);
 
+struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl,
+					    unsigned int hash);
+struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht,
+						   struct bucket_table *tbl,
+						   unsigned int hash);
+
 #define rht_dereference(p, ht) \
 	rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht))
 
@@ -389,6 +398,27 @@ void rhashtable_destroy(struct rhashtable *ht);
 #define rht_entry(tpos, pos, member) \
 	({ tpos = container_of(pos, typeof(*tpos), member); 1; })
 
+static inline struct rhash_head __rcu *const *rht_bucket(
+	const struct bucket_table *tbl, unsigned int hash)
+{
+	return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) :
+				     &tbl->buckets[hash];
+}
+
+static inline struct rhash_head __rcu **rht_bucket_var(
+	struct bucket_table *tbl, unsigned int hash)
+{
+	return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) :
+				     &tbl->buckets[hash];
+}
+
+static inline struct rhash_head __rcu **rht_bucket_insert(
+	struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash)
+{
+	return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) :
+				     &tbl->buckets[hash];
+}
+
 /**
  * rht_for_each_continue - continue iterating over hash chain
  * @pos:	the &struct rhash_head to use as a loop cursor.
@@ -408,7 +438,7 @@ void rhashtable_destroy(struct rhashtable *ht);
  * @hash:	the hash value / bucket index
  */
 #define rht_for_each(pos, tbl, hash) \
-	rht_for_each_continue(pos, (tbl)->buckets[hash], tbl, hash)
+	rht_for_each_continue(pos, *rht_bucket(tbl, hash), tbl, hash)
 
 /**
  * rht_for_each_entry_continue - continue iterating over hash chain
@@ -433,7 +463,7 @@ void rhashtable_destroy(struct rhashtable *ht);
  * @member:	name of the &struct rhash_head within the hashable struct.
  */
 #define rht_for_each_entry(tpos, pos, tbl, hash, member)		\
-	rht_for_each_entry_continue(tpos, pos, (tbl)->buckets[hash],	\
+	rht_for_each_entry_continue(tpos, pos, *rht_bucket(tbl, hash),	\
 				    tbl, hash, member)
 
 /**
@@ -448,13 +478,13 @@ void rhashtable_destroy(struct rhashtable *ht);
  * This hash chain list-traversal primitive allows for the looped code to
  * remove the loop cursor from the list.
  */
-#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member)	    \
-	for (pos = rht_dereference_bucket((tbl)->buckets[hash], tbl, hash), \
-	     next = !rht_is_a_nulls(pos) ?				    \
-		       rht_dereference_bucket(pos->next, tbl, hash) : NULL; \
-	     (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);	    \
-	     pos = next,						    \
-	     next = !rht_is_a_nulls(pos) ?				    \
+#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member)	      \
+	for (pos = rht_dereference_bucket(*rht_bucket(tbl, hash), tbl, hash), \
+	     next = !rht_is_a_nulls(pos) ?				      \
+		       rht_dereference_bucket(pos->next, tbl, hash) : NULL;   \
+	     (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);	      \
+	     pos = next,						      \
+	     next = !rht_is_a_nulls(pos) ?				      \
 		       rht_dereference_bucket(pos->next, tbl, hash) : NULL)
 
 /**
@@ -485,7 +515,7 @@ void rhashtable_destroy(struct rhashtable *ht);
  * traversal is guarded by rcu_read_lock().
  */
 #define rht_for_each_rcu(pos, tbl, hash)				\
-	rht_for_each_rcu_continue(pos, (tbl)->buckets[hash], tbl, hash)
+	rht_for_each_rcu_continue(pos, *rht_bucket(tbl, hash), tbl, hash)
 
 /**
  * rht_for_each_entry_rcu_continue - continue iterating over rcu hash chain
@@ -518,8 +548,8 @@ void rhashtable_destroy(struct rhashtable *ht);
  * the _rcu mutation primitives such as rhashtable_insert() as long as the
  * traversal is guarded by rcu_read_lock().
  */
-#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member)		\
-	rht_for_each_entry_rcu_continue(tpos, pos, (tbl)->buckets[hash],\
+#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member)		   \
+	rht_for_each_entry_rcu_continue(tpos, pos, *rht_bucket(tbl, hash), \
 					tbl, hash, member)
 
 /**
@@ -565,7 +595,7 @@ static inline struct rhash_head *__rhashtable_lookup(
 		.ht = ht,
 		.key = key,
 	};
-	const struct bucket_table *tbl;
+	struct bucket_table *tbl;
 	struct rhash_head *he;
 	unsigned int hash;
 
@@ -697,8 +727,12 @@ slow_path:
 	}
 
 	elasticity = ht->elasticity;
-	pprev = &tbl->buckets[hash];
-	rht_for_each(head, tbl, hash) {
+	pprev = rht_bucket_insert(ht, tbl, hash);
+	data = ERR_PTR(-ENOMEM);
+	if (!pprev)
+		goto out;
+
+	rht_for_each_continue(head, *pprev, tbl, hash) {
 		struct rhlist_head *plist;
 		struct rhlist_head *list;
 
@@ -736,7 +770,7 @@ slow_path:
 	if (unlikely(rht_grow_above_100(ht, tbl)))
 		goto slow_path;
 
-	head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
+	head = rht_dereference_bucket(*pprev, tbl, hash);
 
 	RCU_INIT_POINTER(obj->next, head);
 	if (rhlist) {
@@ -746,7 +780,7 @@ slow_path:
 		RCU_INIT_POINTER(list->next, NULL);
 	}
 
-	rcu_assign_pointer(tbl->buckets[hash], obj);
+	rcu_assign_pointer(*pprev, obj);
 
 	atomic_inc(&ht->nelems);
 	if (rht_grow_above_75(ht, tbl))
@@ -955,8 +989,8 @@ static inline int __rhashtable_remove_fast_one(
 
 	spin_lock_bh(lock);
 
-	pprev = &tbl->buckets[hash];
-	rht_for_each(he, tbl, hash) {
+	pprev = rht_bucket_var(tbl, hash);
+	rht_for_each_continue(he, *pprev, tbl, hash) {
 		struct rhlist_head *list;
 
 		list = container_of(he, struct rhlist_head, rhead);
@@ -1107,8 +1141,8 @@ static inline int __rhashtable_replace_fast(
 
 	spin_lock_bh(lock);
 
-	pprev = &tbl->buckets[hash];
-	rht_for_each(he, tbl, hash) {
+	pprev = rht_bucket_var(tbl, hash);
+	rht_for_each_continue(he, *pprev, tbl, hash) {
 		if (he != obj_old) {
 			pprev = &he->next;
 			continue;
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 32d0ad058380..172454e6b979 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -32,6 +32,11 @@
 #define HASH_MIN_SIZE		4U
 #define BUCKET_LOCKS_PER_CPU	32UL
 
+union nested_table {
+	union nested_table __rcu *table;
+	struct rhash_head __rcu *bucket;
+};
+
 static u32 head_hashfn(struct rhashtable *ht,
 		       const struct bucket_table *tbl,
 		       const struct rhash_head *he)
@@ -76,6 +81,9 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl,
 	/* Never allocate more than 0.5 locks per bucket */
 	size = min_t(unsigned int, size, tbl->size >> 1);
 
+	if (tbl->nest)
+		size = min(size, 1U << tbl->nest);
+
 	if (sizeof(spinlock_t) != 0) {
 		tbl->locks = NULL;
 #ifdef CONFIG_NUMA
@@ -99,8 +107,45 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl,
 	return 0;
 }
 
+static void nested_table_free(union nested_table *ntbl, unsigned int size)
+{
+	const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+	const unsigned int len = 1 << shift;
+	unsigned int i;
+
+	ntbl = rcu_dereference_raw(ntbl->table);
+	if (!ntbl)
+		return;
+
+	if (size > len) {
+		size >>= shift;
+		for (i = 0; i < len; i++)
+			nested_table_free(ntbl + i, size);
+	}
+
+	kfree(ntbl);
+}
+
+static void nested_bucket_table_free(const struct bucket_table *tbl)
+{
+	unsigned int size = tbl->size >> tbl->nest;
+	unsigned int len = 1 << tbl->nest;
+	union nested_table *ntbl;
+	unsigned int i;
+
+	ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);
+
+	for (i = 0; i < len; i++)
+		nested_table_free(ntbl + i, size);
+
+	kfree(ntbl);
+}
+
 static void bucket_table_free(const struct bucket_table *tbl)
 {
+	if (tbl->nest)
+		nested_bucket_table_free(tbl);
+
 	if (tbl)
 		kvfree(tbl->locks);
 
@@ -112,6 +157,59 @@ static void bucket_table_free_rcu(struct rcu_head *head)
 	bucket_table_free(container_of(head, struct bucket_table, rcu));
 }
 
+static union nested_table *nested_table_alloc(struct rhashtable *ht,
+					      union nested_table __rcu **prev,
+					      unsigned int shifted,
+					      unsigned int nhash)
+{
+	union nested_table *ntbl;
+	int i;
+
+	ntbl = rcu_dereference(*prev);
+	if (ntbl)
+		return ntbl;
+
+	ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC);
+
+	if (ntbl && shifted) {
+		for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0].bucket); i++)
+			INIT_RHT_NULLS_HEAD(ntbl[i].bucket, ht,
+					    (i << shifted) | nhash);
+	}
+
+	rcu_assign_pointer(*prev, ntbl);
+
+	return ntbl;
+}
+
+static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht,
+						      size_t nbuckets,
+						      gfp_t gfp)
+{
+	const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+	struct bucket_table *tbl;
+	size_t size;
+
+	if (nbuckets < (1 << (shift + 1)))
+		return NULL;
+
+	size = sizeof(*tbl) + sizeof(tbl->buckets[0]);
+
+	tbl = kzalloc(size, gfp);
+	if (!tbl)
+		return NULL;
+
+	if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets,
+				0, 0)) {
+		kfree(tbl);
+		return NULL;
+	}
+
+	tbl->nest = (ilog2(nbuckets) - 1) % shift + 1;
+
+	return tbl;
+}
+
 static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
 					       size_t nbuckets,
 					       gfp_t gfp)
@@ -126,10 +224,17 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
 		tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY);
 	if (tbl == NULL && gfp == GFP_KERNEL)
 		tbl = vzalloc(size);
+
+	size = nbuckets;
+
+	if (tbl == NULL && gfp != GFP_KERNEL) {
+		tbl = nested_bucket_table_alloc(ht, nbuckets, gfp);
+		nbuckets = 0;
+	}
 	if (tbl == NULL)
 		return NULL;
 
-	tbl->size = nbuckets;
+	tbl->size = size;
 
 	if (alloc_bucket_locks(ht, tbl, gfp) < 0) {
 		bucket_table_free(tbl);
@@ -164,12 +269,17 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
 	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
 	struct bucket_table *new_tbl = rhashtable_last_table(ht,
 		rht_dereference_rcu(old_tbl->future_tbl, ht));
-	struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash];
-	int err = -ENOENT;
+	struct rhash_head __rcu **pprev = rht_bucket_var(old_tbl, old_hash);
+	int err = -EAGAIN;
 	struct rhash_head *head, *next, *entry;
 	spinlock_t *new_bucket_lock;
 	unsigned int new_hash;
 
+	if (new_tbl->nest)
+		goto out;
+
+	err = -ENOENT;
+
 	rht_for_each(entry, old_tbl, old_hash) {
 		err = 0;
 		next = rht_dereference_bucket(entry->next, old_tbl, old_hash);
@@ -202,19 +312,26 @@ out:
 	return err;
 }
 
-static void rhashtable_rehash_chain(struct rhashtable *ht,
+static int rhashtable_rehash_chain(struct rhashtable *ht,
 				    unsigned int old_hash)
 {
 	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
 	spinlock_t *old_bucket_lock;
+	int err;
 
 	old_bucket_lock = rht_bucket_lock(old_tbl, old_hash);
 
 	spin_lock_bh(old_bucket_lock);
-	while (!rhashtable_rehash_one(ht, old_hash))
+	while (!(err = rhashtable_rehash_one(ht, old_hash)))
 		;
-	old_tbl->rehash++;
+
+	if (err == -ENOENT) {
+		old_tbl->rehash++;
+		err = 0;
+	}
 	spin_unlock_bh(old_bucket_lock);
+
+	return err;
 }
 
 static int rhashtable_rehash_attach(struct rhashtable *ht,
@@ -246,13 +363,17 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
 	struct bucket_table *new_tbl;
 	struct rhashtable_walker *walker;
 	unsigned int old_hash;
+	int err;
 
 	new_tbl = rht_dereference(old_tbl->future_tbl, ht);
 	if (!new_tbl)
 		return 0;
 
-	for (old_hash = 0; old_hash < old_tbl->size; old_hash++)
-		rhashtable_rehash_chain(ht, old_hash);
+	for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
+		err = rhashtable_rehash_chain(ht, old_hash);
+		if (err)
+			return err;
+	}
 
 	/* Publish the new table pointer. */
 	rcu_assign_pointer(ht->tbl, new_tbl);
@@ -271,31 +392,16 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
 	return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
 }
 
-/**
- * rhashtable_expand - Expand hash table while allowing concurrent lookups
- * @ht:		the hash table to expand
- *
- * A secondary bucket array is allocated and the hash entries are migrated.
- *
- * This function may only be called in a context where it is safe to call
- * synchronize_rcu(), e.g. not within a rcu_read_lock() section.
- *
- * The caller must ensure that no concurrent resizing occurs by holding
- * ht->mutex.
- *
- * It is valid to have concurrent insertions and deletions protected by per
- * bucket locks or concurrent RCU protected lookups and traversals.
- */
-static int rhashtable_expand(struct rhashtable *ht)
+static int rhashtable_rehash_alloc(struct rhashtable *ht,
+				   struct bucket_table *old_tbl,
+				   unsigned int size)
 {
-	struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
+	struct bucket_table *new_tbl;
 	int err;
 
 	ASSERT_RHT_MUTEX(ht);
 
-	old_tbl = rhashtable_last_table(ht, old_tbl);
-
-	new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, GFP_KERNEL);
+	new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
 	if (new_tbl == NULL)
 		return -ENOMEM;
 
@@ -324,12 +430,9 @@ static int rhashtable_expand(struct rhashtable *ht)
  */
 static int rhashtable_shrink(struct rhashtable *ht)
 {
-	struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
+	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
 	unsigned int nelems = atomic_read(&ht->nelems);
 	unsigned int size = 0;
-	int err;
-
-	ASSERT_RHT_MUTEX(ht);
 
 	if (nelems)
 		size = roundup_pow_of_two(nelems * 3 / 2);
@@ -342,15 +445,7 @@ static int rhashtable_shrink(struct rhashtable *ht)
 	if (rht_dereference(old_tbl->future_tbl, ht))
 		return -EEXIST;
 
-	new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
-	if (new_tbl == NULL)
-		return -ENOMEM;
-
-	err = rhashtable_rehash_attach(ht, old_tbl, new_tbl);
-	if (err)
-		bucket_table_free(new_tbl);
-
-	return err;
+	return rhashtable_rehash_alloc(ht, old_tbl, size);
 }
 
 static void rht_deferred_worker(struct work_struct *work)
@@ -366,11 +461,14 @@ static void rht_deferred_worker(struct work_struct *work)
 	tbl = rhashtable_last_table(ht, tbl);
 
 	if (rht_grow_above_75(ht, tbl))
-		rhashtable_expand(ht);
+		err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2);
 	else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))
-		rhashtable_shrink(ht);
+		err = rhashtable_shrink(ht);
+	else if (tbl->nest)
+		err = rhashtable_rehash_alloc(ht, tbl, tbl->size);
 
-	err = rhashtable_rehash_table(ht);
+	if (!err)
+		err = rhashtable_rehash_table(ht);
 
 	mutex_unlock(&ht->mutex);
 
@@ -439,8 +537,8 @@ static void *rhashtable_lookup_one(struct rhashtable *ht,
 	int elasticity;
 
 	elasticity = ht->elasticity;
-	pprev = &tbl->buckets[hash];
-	rht_for_each(head, tbl, hash) {
+	pprev = rht_bucket_var(tbl, hash);
+	rht_for_each_continue(head, *pprev, tbl, hash) {
 		struct rhlist_head *list;
 		struct rhlist_head *plist;
 
@@ -477,6 +575,7 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
 						  struct rhash_head *obj,
 						  void *data)
 {
+	struct rhash_head __rcu **pprev;
 	struct bucket_table *new_tbl;
 	struct rhash_head *head;
 
@@ -499,7 +598,11 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
 	if (unlikely(rht_grow_above_100(ht, tbl)))
 		return ERR_PTR(-EAGAIN);
 
-	head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
+	pprev = rht_bucket_insert(ht, tbl, hash);
+	if (!pprev)
+		return ERR_PTR(-ENOMEM);
+
+	head = rht_dereference_bucket(*pprev, tbl, hash);
 
 	RCU_INIT_POINTER(obj->next, head);
 	if (ht->rhlist) {
@@ -509,7 +612,7 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
 		RCU_INIT_POINTER(list->next, NULL);
 	}
 
-	rcu_assign_pointer(tbl->buckets[hash], obj);
+	rcu_assign_pointer(*pprev, obj);
 
 	atomic_inc(&ht->nelems);
 	if (rht_grow_above_75(ht, tbl))
@@ -975,7 +1078,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht,
 				 void (*free_fn)(void *ptr, void *arg),
 				 void *arg)
 {
-	const struct bucket_table *tbl;
+	struct bucket_table *tbl;
 	unsigned int i;
 
 	cancel_work_sync(&ht->run_work);
@@ -986,7 +1089,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht,
 		for (i = 0; i < tbl->size; i++) {
 			struct rhash_head *pos, *next;
 
-			for (pos = rht_dereference(tbl->buckets[i], ht),
+			for (pos = rht_dereference(*rht_bucket(tbl, i), ht),
 			     next = !rht_is_a_nulls(pos) ?
 					rht_dereference(pos->next, ht) : NULL;
 			     !rht_is_a_nulls(pos);
@@ -1007,3 +1110,70 @@ void rhashtable_destroy(struct rhashtable *ht)
 	return rhashtable_free_and_destroy(ht, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(rhashtable_destroy);
+
+struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl,
+					    unsigned int hash)
+{
+	const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+	static struct rhash_head __rcu *rhnull =
+		(struct rhash_head __rcu *)NULLS_MARKER(0);
+	unsigned int index = hash & ((1 << tbl->nest) - 1);
+	unsigned int size = tbl->size >> tbl->nest;
+	unsigned int subhash = hash;
+	union nested_table *ntbl;
+
+	ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);
+	ntbl = rht_dereference_bucket(ntbl[index].table, tbl, hash);
+	subhash >>= tbl->nest;
+
+	while (ntbl && size > (1 << shift)) {
+		index = subhash & ((1 << shift) - 1);
+		ntbl = rht_dereference_bucket(ntbl[index].table, tbl, hash);
+		size >>= shift;
+		subhash >>= shift;
+	}
+
+	if (!ntbl)
+		return &rhnull;
+
+	return &ntbl[subhash].bucket;
+
+}
+EXPORT_SYMBOL_GPL(rht_bucket_nested);
+
+struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht,
+						   struct bucket_table *tbl,
+						   unsigned int hash)
+{
+	const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+	unsigned int index = hash & ((1 << tbl->nest) - 1);
+	unsigned int size = tbl->size >> tbl->nest;
+	union nested_table *ntbl;
+	unsigned int shifted;
+	unsigned int nhash;
+
+	ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);
+	hash >>= tbl->nest;
+	nhash = index;
+	shifted = tbl->nest;
+	ntbl = nested_table_alloc(ht, &ntbl[index].table,
+				  size <= (1 << shift) ? shifted : 0, nhash);
+
+	while (ntbl && size > (1 << shift)) {
+		index = hash & ((1 << shift) - 1);
+		size >>= shift;
+		hash >>= shift;
+		nhash |= index << shifted;
+		shifted += shift;
+		ntbl = nested_table_alloc(ht, &ntbl[index].table,
+					  size <= (1 << shift) ? shifted : 0,
+					  nhash);
+	}
+
+	if (!ntbl)
+		return NULL;
+
+	return &ntbl[hash].bucket;
+
+}
+EXPORT_SYMBOL_GPL(rht_bucket_nested_insert);
-- 
cgit v1.2.3


From fb585b44383c4cff85f92e67377ee1c5f07d6dc1 Mon Sep 17 00:00:00 2001
From: Tobias Klauser <tklauser@distanz.ch>
Date: Fri, 10 Feb 2017 16:43:50 +0100
Subject: net: make net_device members garp_port and mrp_port conditional

garp_port is only used in net/802/garp.c which is only compiled with
CONFIG_GARP enabled. Same goes for mrp_port which is only used in
net/802/mrp.c with CONFIG_MRP enabled.

Only include the two members in struct net_device if their respective
CONFIG_* is enabled. This saves a few bytes in struct net_device in case
CONFIG_GARP or CONFIG_MRP are not enabled.

Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 54c82b5df0ba..98f65ed8f8b0 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1865,8 +1865,12 @@ struct net_device {
 		struct pcpu_vstats __percpu		*vstats;
 	};
 
+#if IS_ENABLED(CONFIG_GARP)
 	struct garp_port __rcu	*garp_port;
+#endif
+#if IS_ENABLED(CONFIG_MRP)
 	struct mrp_port __rcu	*mrp_port;
+#endif
 
 	struct device		dev;
 	const struct attribute_group *sysfs_groups[4];
-- 
cgit v1.2.3


From efff8e7879b8b7f8a077f495262f0bb9cfaa0b06 Mon Sep 17 00:00:00 2001
From: Uri Yanai <uri.yanai@sandisk.com>
Date: Tue, 7 Feb 2017 18:00:01 +0200
Subject: mmc: Adding AUTO_BKOPS_EN bit set for Auto BKOPS support

Adding dedicated flag for AUTO_BKOPS in card->ext_csd structure.
Read AUTO_BKOPS bit value from the device EXT_CSD and set to the
card->ext_csd structure.
In mmc_decode_ext_csd() add a print message in case the AUTO_BKOPS
is enabled

Signed-off-by: Uri Yanai <uri.yanai@sandisk.com>
Signed-off-by: Alex Lemberg <alex.lemberg@sandisk.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/mmc.c   | 6 ++++++
 include/linux/mmc/card.h | 1 +
 include/linux/mmc/mmc.h  | 1 +
 3 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 1b67e3085353..e01e70c24ca2 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -546,6 +546,12 @@ static int mmc_decode_ext_csd(struct mmc_card *card, u8 *ext_csd)
 			if (card->ext_csd.man_bkops_en)
 				pr_debug("%s: MAN_BKOPS_EN bit is set\n",
 					mmc_hostname(card->host));
+			card->ext_csd.auto_bkops_en =
+					(ext_csd[EXT_CSD_BKOPS_EN] &
+						EXT_CSD_AUTO_BKOPS_MASK);
+			if (card->ext_csd.auto_bkops_en)
+				pr_debug("%s: AUTO_BKOPS_EN bit is set\n",
+					mmc_hostname(card->host));
 		}
 
 		/* check whether the eMMC card supports HPI */
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 29d00c91c25c..77e61e0a216a 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -83,6 +83,7 @@ struct mmc_ext_csd {
 	unsigned int		hpi_cmd;		/* cmd used as HPI */
 	bool			bkops;		/* background support bit */
 	bool			man_bkops_en;	/* manual bkops enable bit */
+	bool			auto_bkops_en;	/* auto bkops enable bit */
 	unsigned int            data_sector_size;       /* 512 bytes or 4KB */
 	unsigned int            data_tag_unit_size;     /* DATA TAG UNIT size */
 	unsigned int		boot_ro_lock;		/* ro lock support */
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index 7406d9badda0..3ffc27aaeeaf 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -407,6 +407,7 @@ static inline bool mmc_op_multi(u32 opcode)
  * BKOPS modes
  */
 #define EXT_CSD_MANUAL_BKOPS_MASK	0x01
+#define EXT_CSD_AUTO_BKOPS_MASK		0x02
 
 /*
  * Command Queue
-- 
cgit v1.2.3


From c43f1112c068f3b4b20a0a9d461c341d9caeb376 Mon Sep 17 00:00:00 2001
From: Maor Gottlieb <maorg@mellanox.com>
Date: Wed, 18 Jan 2017 14:10:33 +0200
Subject: IB/mlx5: Add additional checks before processing MADs

Check the has_smi bit in vport context and class version of MADs
before allowing MADs processing to take place.
MAD_IFC SMI commands can be executed only if smi bit is set.

Fixes: e126ba97dba9 ('mlx5: Add driver for Mellanox Connect-IB adapters')
Signed-off-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Parvi Kaustubhi <parvik@mellanox.com>
Reviewed-by: Eli Cohen <eli@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/mlx5/mad.c  | 12 ++++++++++++
 drivers/infiniband/hw/mlx5/main.c | 33 +++++++++++++++++++++++++++++++++
 include/linux/mlx5/driver.h       |  1 +
 3 files changed, 46 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c
index 39e58489dcc2..af962e7fdc3a 100644
--- a/drivers/infiniband/hw/mlx5/mad.c
+++ b/drivers/infiniband/hw/mlx5/mad.c
@@ -42,12 +42,24 @@ enum {
 	MLX5_IB_VENDOR_CLASS2 = 0xa
 };
 
+static bool can_do_mad_ifc(struct mlx5_ib_dev *dev, u8 port_num,
+			   struct ib_mad *in_mad)
+{
+	if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED &&
+	    in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+		return true;
+	return dev->mdev->port_caps[port_num - 1].has_smi;
+}
+
 int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey,
 		 u8 port, const struct ib_wc *in_wc, const struct ib_grh *in_grh,
 		 const void *in_mad, void *response_mad)
 {
 	u8 op_modifier = 0;
 
+	if (!can_do_mad_ifc(dev, port, (struct ib_mad *)in_mad))
+		return -EPERM;
+
 	/* Key check traps can't be generated unless we have in_wc to
 	 * tell us where to send the trap.
 	 */
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 0187f1d7234a..1dea4073d83f 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -2533,6 +2533,35 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
 		ibdev->ib_active = false;
 }
 
+static int set_has_smi_cap(struct mlx5_ib_dev *dev)
+{
+	struct mlx5_hca_vport_context vport_ctx;
+	int err;
+	int port;
+
+	for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) {
+		dev->mdev->port_caps[port - 1].has_smi = false;
+		if (MLX5_CAP_GEN(dev->mdev, port_type) ==
+		    MLX5_CAP_PORT_TYPE_IB) {
+			if (MLX5_CAP_GEN(dev->mdev, ib_virt)) {
+				err = mlx5_query_hca_vport_context(dev->mdev, 0,
+								   port, 0,
+								   &vport_ctx);
+				if (err) {
+					mlx5_ib_err(dev, "query_hca_vport_context for port=%d failed %d\n",
+						    port, err);
+					return err;
+				}
+				dev->mdev->port_caps[port - 1].has_smi =
+					vport_ctx.has_smi;
+			} else {
+				dev->mdev->port_caps[port - 1].has_smi = true;
+			}
+		}
+	}
+	return 0;
+}
+
 static void get_ext_port_caps(struct mlx5_ib_dev *dev)
 {
 	int port;
@@ -2557,6 +2586,10 @@ static int get_port_caps(struct mlx5_ib_dev *dev)
 	if (!dprops)
 		goto out;
 
+	err = set_has_smi_cap(dev);
+	if (err)
+		goto out;
+
 	err = mlx5_ib_query_device(&dev->ib_dev, dprops, &uhw);
 	if (err) {
 		mlx5_ib_warn(dev, "query_device failed %d\n", err);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 3a309f6a4a15..b8d69aeb1784 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -289,6 +289,7 @@ struct mlx5_port_caps {
 	int	gid_table_len;
 	int	pkey_table_len;
 	u8	ext_port_cap;
+	bool	has_smi;
 };
 
 struct mlx5_cmd_mailbox {
-- 
cgit v1.2.3


From 853fe1bf7554155376bb3b231112cdff9ff79177 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 13 Feb 2017 16:25:26 -0800
Subject: cdrom: Make device operations read-only

Since function tables are a common target for attackers, it's best to keep
them in read-only memory. As such, this makes the CDROM device ops tables
const. This drops additionally n_minors, since it isn't used meaningfully,
and sets the only user of cdrom_dummy_generic_packet explicitly so the
variables can all be const.

Inspired by similar changes in grsecurity/PaX.

Signed-off-by: Kees Cook <keescook@chromium.org>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 Documentation/cdrom/cdrom-standard.tex |  9 +-----
 drivers/block/paride/pcd.c             |  2 +-
 drivers/cdrom/cdrom.c                  | 58 ++++++++++++++++------------------
 drivers/cdrom/gdrom.c                  |  4 +--
 drivers/ide/ide-cd.c                   |  2 +-
 drivers/scsi/sr.c                      |  2 +-
 include/linux/cdrom.h                  |  5 +--
 7 files changed, 37 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/cdrom/cdrom-standard.tex b/Documentation/cdrom/cdrom-standard.tex
index c06233fe52ac..8f85b0e41046 100644
--- a/Documentation/cdrom/cdrom-standard.tex
+++ b/Documentation/cdrom/cdrom-standard.tex
@@ -249,7 +249,6 @@ struct& cdrom_device_ops\ \{ \hidewidth\cr
         unsigned\ long);\cr
 \noalign{\medskip}
   &const\ int& capability;& capability flags \cr
-  &int& n_minors;& number of active minor devices \cr
 \};\cr
 }
 $$
@@ -258,13 +257,7 @@ it should add a function pointer to this $struct$. When a particular
 function is not implemented, however, this $struct$ should contain a
 NULL instead. The $capability$ flags specify the capabilities of the
 \cdrom\ hardware and/or low-level \cdrom\ driver when a \cdrom\ drive
-is registered with the \UCD. The value $n_minors$ should be a positive
-value indicating the number of minor devices that are supported by
-the low-level device driver, normally~1. Although these two variables
-are `informative' rather than `operational,' they are included in
-$cdrom_device_ops$ because they describe the capability of the {\em
-driver\/} rather than the {\em drive}. Nomenclature has always been
-difficult in computer programming.
+is registered with the \UCD.
 
 Note that most functions have fewer parameters than their
 $blkdev_fops$ counterparts. This is because very little of the
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 5fd2d0e25567..10aed84244f5 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -273,7 +273,7 @@ static const struct block_device_operations pcd_bdops = {
 	.check_events	= pcd_block_check_events,
 };
 
-static struct cdrom_device_ops pcd_dops = {
+static const struct cdrom_device_ops pcd_dops = {
 	.open		= pcd_open,
 	.release	= pcd_release,
 	.drive_status	= pcd_drive_status,
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 59cca72647a6..bbbd3caa927c 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -342,8 +342,8 @@ static void cdrom_sysctl_register(void);
 
 static LIST_HEAD(cdrom_list);
 
-static int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi,
-				      struct packet_command *cgc)
+int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi,
+			       struct packet_command *cgc)
 {
 	if (cgc->sense) {
 		cgc->sense->sense_key = 0x05;
@@ -354,6 +354,7 @@ static int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi,
 	cgc->stat = -EIO;
 	return -EIO;
 }
+EXPORT_SYMBOL(cdrom_dummy_generic_packet);
 
 static int cdrom_flush_cache(struct cdrom_device_info *cdi)
 {
@@ -371,7 +372,7 @@ static int cdrom_flush_cache(struct cdrom_device_info *cdi)
 static int cdrom_get_disc_info(struct cdrom_device_info *cdi,
 			       disc_information *di)
 {
-	struct cdrom_device_ops *cdo = cdi->ops;
+	const struct cdrom_device_ops *cdo = cdi->ops;
 	struct packet_command cgc;
 	int ret, buflen;
 
@@ -586,7 +587,7 @@ static int cdrom_mrw_set_lba_space(struct cdrom_device_info *cdi, int space)
 int register_cdrom(struct cdrom_device_info *cdi)
 {
 	static char banner_printed;
-	struct cdrom_device_ops *cdo = cdi->ops;
+	const struct cdrom_device_ops *cdo = cdi->ops;
 	int *change_capability = (int *)&cdo->capability; /* hack */
 
 	cd_dbg(CD_OPEN, "entering register_cdrom\n");
@@ -610,7 +611,6 @@ int register_cdrom(struct cdrom_device_info *cdi)
 	ENSURE(reset, CDC_RESET);
 	ENSURE(generic_packet, CDC_GENERIC_PACKET);
 	cdi->mc_flags = 0;
-	cdo->n_minors = 0;
 	cdi->options = CDO_USE_FFLAGS;
 
 	if (autoclose == 1 && CDROM_CAN(CDC_CLOSE_TRAY))
@@ -630,8 +630,7 @@ int register_cdrom(struct cdrom_device_info *cdi)
 	else
 		cdi->cdda_method = CDDA_OLD;
 
-	if (!cdo->generic_packet)
-		cdo->generic_packet = cdrom_dummy_generic_packet;
+	WARN_ON(!cdo->generic_packet);
 
 	cd_dbg(CD_REG_UNREG, "drive \"/dev/%s\" registered\n", cdi->name);
 	mutex_lock(&cdrom_mutex);
@@ -652,7 +651,6 @@ void unregister_cdrom(struct cdrom_device_info *cdi)
 	if (cdi->exit)
 		cdi->exit(cdi);
 
-	cdi->ops->n_minors--;
 	cd_dbg(CD_REG_UNREG, "drive \"/dev/%s\" unregistered\n", cdi->name);
 }
 
@@ -1036,7 +1034,7 @@ static
 int open_for_data(struct cdrom_device_info *cdi)
 {
 	int ret;
-	struct cdrom_device_ops *cdo = cdi->ops;
+	const struct cdrom_device_ops *cdo = cdi->ops;
 	tracktype tracks;
 	cd_dbg(CD_OPEN, "entering open_for_data\n");
 	/* Check if the driver can report drive status.  If it can, we
@@ -1198,8 +1196,8 @@ err:
 /* This code is similar to that in open_for_data. The routine is called
    whenever an audio play operation is requested.
 */
-static int check_for_audio_disc(struct cdrom_device_info * cdi,
-				struct cdrom_device_ops * cdo)
+static int check_for_audio_disc(struct cdrom_device_info *cdi,
+				const struct cdrom_device_ops *cdo)
 {
         int ret;
 	tracktype tracks;
@@ -1254,7 +1252,7 @@ static int check_for_audio_disc(struct cdrom_device_info * cdi,
 
 void cdrom_release(struct cdrom_device_info *cdi, fmode_t mode)
 {
-	struct cdrom_device_ops *cdo = cdi->ops;
+	const struct cdrom_device_ops *cdo = cdi->ops;
 	int opened_for_data;
 
 	cd_dbg(CD_CLOSE, "entering cdrom_release\n");
@@ -1294,7 +1292,7 @@ static int cdrom_read_mech_status(struct cdrom_device_info *cdi,
 				  struct cdrom_changer_info *buf)
 {
 	struct packet_command cgc;
-	struct cdrom_device_ops *cdo = cdi->ops;
+	const struct cdrom_device_ops *cdo = cdi->ops;
 	int length;
 
 	/*
@@ -1643,7 +1641,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai)
 	int ret;
 	u_char buf[20];
 	struct packet_command cgc;
-	struct cdrom_device_ops *cdo = cdi->ops;
+	const struct cdrom_device_ops *cdo = cdi->ops;
 	rpc_state_t rpc_state;
 
 	memset(buf, 0, sizeof(buf));
@@ -1791,7 +1789,7 @@ static int dvd_read_physical(struct cdrom_device_info *cdi, dvd_struct *s,
 {
 	unsigned char buf[21], *base;
 	struct dvd_layer *layer;
-	struct cdrom_device_ops *cdo = cdi->ops;
+	const struct cdrom_device_ops *cdo = cdi->ops;
 	int ret, layer_num = s->physical.layer_num;
 
 	if (layer_num >= DVD_LAYERS)
@@ -1842,7 +1840,7 @@ static int dvd_read_copyright(struct cdrom_device_info *cdi, dvd_struct *s,
 {
 	int ret;
 	u_char buf[8];
-	struct cdrom_device_ops *cdo = cdi->ops;
+	const struct cdrom_device_ops *cdo = cdi->ops;
 
 	init_cdrom_command(cgc, buf, sizeof(buf), CGC_DATA_READ);
 	cgc->cmd[0] = GPCMD_READ_DVD_STRUCTURE;
@@ -1866,7 +1864,7 @@ static int dvd_read_disckey(struct cdrom_device_info *cdi, dvd_struct *s,
 {
 	int ret, size;
 	u_char *buf;
-	struct cdrom_device_ops *cdo = cdi->ops;
+	const struct cdrom_device_ops *cdo = cdi->ops;
 
 	size = sizeof(s->disckey.value) + 4;
 
@@ -1894,7 +1892,7 @@ static int dvd_read_bca(struct cdrom_device_info *cdi, dvd_struct *s,
 {
 	int ret, size = 4 + 188;
 	u_char *buf;
-	struct cdrom_device_ops *cdo = cdi->ops;
+	const struct cdrom_device_ops *cdo = cdi->ops;
 
 	buf = kmalloc(size, GFP_KERNEL);
 	if (!buf)
@@ -1928,7 +1926,7 @@ static int dvd_read_manufact(struct cdrom_device_info *cdi, dvd_struct *s,
 {
 	int ret = 0, size;
 	u_char *buf;
-	struct cdrom_device_ops *cdo = cdi->ops;
+	const struct cdrom_device_ops *cdo = cdi->ops;
 
 	size = sizeof(s->manufact.value) + 4;
 
@@ -1995,7 +1993,7 @@ int cdrom_mode_sense(struct cdrom_device_info *cdi,
 		     struct packet_command *cgc,
 		     int page_code, int page_control)
 {
-	struct cdrom_device_ops *cdo = cdi->ops;
+	const struct cdrom_device_ops *cdo = cdi->ops;
 
 	memset(cgc->cmd, 0, sizeof(cgc->cmd));
 
@@ -2010,7 +2008,7 @@ int cdrom_mode_sense(struct cdrom_device_info *cdi,
 int cdrom_mode_select(struct cdrom_device_info *cdi,
 		      struct packet_command *cgc)
 {
-	struct cdrom_device_ops *cdo = cdi->ops;
+	const struct cdrom_device_ops *cdo = cdi->ops;
 
 	memset(cgc->cmd, 0, sizeof(cgc->cmd));
 	memset(cgc->buffer, 0, 2);
@@ -2025,7 +2023,7 @@ int cdrom_mode_select(struct cdrom_device_info *cdi,
 static int cdrom_read_subchannel(struct cdrom_device_info *cdi,
 				 struct cdrom_subchnl *subchnl, int mcn)
 {
-	struct cdrom_device_ops *cdo = cdi->ops;
+	const struct cdrom_device_ops *cdo = cdi->ops;
 	struct packet_command cgc;
 	char buffer[32];
 	int ret;
@@ -2073,7 +2071,7 @@ static int cdrom_read_cd(struct cdrom_device_info *cdi,
 			 struct packet_command *cgc, int lba,
 			 int blocksize, int nblocks)
 {
-	struct cdrom_device_ops *cdo = cdi->ops;
+	const struct cdrom_device_ops *cdo = cdi->ops;
 
 	memset(&cgc->cmd, 0, sizeof(cgc->cmd));
 	cgc->cmd[0] = GPCMD_READ_10;
@@ -2093,7 +2091,7 @@ static int cdrom_read_block(struct cdrom_device_info *cdi,
 			    struct packet_command *cgc,
 			    int lba, int nblocks, int format, int blksize)
 {
-	struct cdrom_device_ops *cdo = cdi->ops;
+	const struct cdrom_device_ops *cdo = cdi->ops;
 
 	memset(&cgc->cmd, 0, sizeof(cgc->cmd));
 	cgc->cmd[0] = GPCMD_READ_CD;
@@ -2764,7 +2762,7 @@ static int cdrom_ioctl_audioctl(struct cdrom_device_info *cdi,
  */
 static int cdrom_switch_blocksize(struct cdrom_device_info *cdi, int size)
 {
-	struct cdrom_device_ops *cdo = cdi->ops;
+	const struct cdrom_device_ops *cdo = cdi->ops;
 	struct packet_command cgc;
 	struct modesel_head mh;
 
@@ -2790,7 +2788,7 @@ static int cdrom_switch_blocksize(struct cdrom_device_info *cdi, int size)
 static int cdrom_get_track_info(struct cdrom_device_info *cdi,
 				__u16 track, __u8 type, track_information *ti)
 {
-	struct cdrom_device_ops *cdo = cdi->ops;
+	const struct cdrom_device_ops *cdo = cdi->ops;
 	struct packet_command cgc;
 	int ret, buflen;
 
@@ -3049,7 +3047,7 @@ static noinline int mmc_ioctl_cdrom_play_msf(struct cdrom_device_info *cdi,
 					     void __user *arg,
 					     struct packet_command *cgc)
 {
-	struct cdrom_device_ops *cdo = cdi->ops;
+	const struct cdrom_device_ops *cdo = cdi->ops;
 	struct cdrom_msf msf;
 	cd_dbg(CD_DO_IOCTL, "entering CDROMPLAYMSF\n");
 	if (copy_from_user(&msf, (struct cdrom_msf __user *)arg, sizeof(msf)))
@@ -3069,7 +3067,7 @@ static noinline int mmc_ioctl_cdrom_play_blk(struct cdrom_device_info *cdi,
 					     void __user *arg,
 					     struct packet_command *cgc)
 {
-	struct cdrom_device_ops *cdo = cdi->ops;
+	const struct cdrom_device_ops *cdo = cdi->ops;
 	struct cdrom_blk blk;
 	cd_dbg(CD_DO_IOCTL, "entering CDROMPLAYBLK\n");
 	if (copy_from_user(&blk, (struct cdrom_blk __user *)arg, sizeof(blk)))
@@ -3164,7 +3162,7 @@ static noinline int mmc_ioctl_cdrom_start_stop(struct cdrom_device_info *cdi,
 					       struct packet_command *cgc,
 					       int cmd)
 {
-	struct cdrom_device_ops *cdo = cdi->ops;
+	const struct cdrom_device_ops *cdo = cdi->ops;
 	cd_dbg(CD_DO_IOCTL, "entering CDROMSTART/CDROMSTOP\n");
 	cgc->cmd[0] = GPCMD_START_STOP_UNIT;
 	cgc->cmd[1] = 1;
@@ -3177,7 +3175,7 @@ static noinline int mmc_ioctl_cdrom_pause_resume(struct cdrom_device_info *cdi,
 						 struct packet_command *cgc,
 						 int cmd)
 {
-	struct cdrom_device_ops *cdo = cdi->ops;
+	const struct cdrom_device_ops *cdo = cdi->ops;
 	cd_dbg(CD_DO_IOCTL, "entering CDROMPAUSE/CDROMRESUME\n");
 	cgc->cmd[0] = GPCMD_PAUSE_RESUME;
 	cgc->cmd[8] = (cmd == CDROMRESUME) ? 1 : 0;
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index 46ecd95d7161..1afab6558d0c 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -481,7 +481,7 @@ static int gdrom_audio_ioctl(struct cdrom_device_info *cdi, unsigned int cmd,
 	return -EINVAL;
 }
 
-static struct cdrom_device_ops gdrom_ops = {
+static const struct cdrom_device_ops gdrom_ops = {
 	.open			= gdrom_open,
 	.release		= gdrom_release,
 	.drive_status		= gdrom_drivestatus,
@@ -489,9 +489,9 @@ static struct cdrom_device_ops gdrom_ops = {
 	.get_last_session	= gdrom_get_last_session,
 	.reset			= gdrom_hardreset,
 	.audio_ioctl		= gdrom_audio_ioctl,
+	.generic_packet		= cdrom_dummy_generic_packet,
 	.capability		= CDC_MULTI_SESSION | CDC_MEDIA_CHANGED |
 				  CDC_RESET | CDC_DRIVE_STATUS | CDC_CD_R,
-	.n_minors		= 1,
 };
 
 static int gdrom_bdops_open(struct block_device *bdev, fmode_t mode)
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 9cbd217bc0c9..ab9232e1e16f 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -1166,7 +1166,7 @@ void ide_cdrom_update_speed(ide_drive_t *drive, u8 *buf)
 	 CDC_CD_RW | CDC_DVD | CDC_DVD_R | CDC_DVD_RAM | CDC_GENERIC_PACKET | \
 	 CDC_MO_DRIVE | CDC_MRW | CDC_MRW_W | CDC_RAM)
 
-static struct cdrom_device_ops ide_cdrom_dops = {
+static const struct cdrom_device_ops ide_cdrom_dops = {
 	.open			= ide_cdrom_open_real,
 	.release		= ide_cdrom_release_real,
 	.drive_status		= ide_cdrom_drive_status,
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 94352e4df831..013bfe049a48 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -117,7 +117,7 @@ static unsigned int sr_check_events(struct cdrom_device_info *cdi,
 				    unsigned int clearing, int slot);
 static int sr_packet(struct cdrom_device_info *, struct packet_command *);
 
-static struct cdrom_device_ops sr_dops = {
+static const struct cdrom_device_ops sr_dops = {
 	.open			= sr_open,
 	.release	 	= sr_release,
 	.drive_status	 	= sr_drive_status,
diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h
index 8609d577bb66..6e8f209a6dff 100644
--- a/include/linux/cdrom.h
+++ b/include/linux/cdrom.h
@@ -36,7 +36,7 @@ struct packet_command
 
 /* Uniform cdrom data structures for cdrom.c */
 struct cdrom_device_info {
-	struct cdrom_device_ops  *ops;  /* link to device_ops */
+	const struct cdrom_device_ops *ops; /* link to device_ops */
 	struct list_head list;		/* linked list of all device_info */
 	struct gendisk *disk;		/* matching block layer disk */
 	void *handle;		        /* driver-dependent data */
@@ -87,7 +87,6 @@ struct cdrom_device_ops {
 
 /* driver specifications */
 	const int capability;   /* capability flags */
-	int n_minors;           /* number of active minor devices */
 	/* handle uniform packets for scsi type devices (scsi,atapi) */
 	int (*generic_packet) (struct cdrom_device_info *,
 			       struct packet_command *);
@@ -123,6 +122,8 @@ extern int cdrom_mode_sense(struct cdrom_device_info *cdi,
 			    int page_code, int page_control);
 extern void init_cdrom_command(struct packet_command *cgc,
 			       void *buffer, int len, int type);
+extern int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi,
+				      struct packet_command *cgc);
 
 /* The SCSI spec says there could be 256 slots. */
 #define CDROM_MAX_SLOTS	256
-- 
cgit v1.2.3


From 23a6964e3adb0796e1633562a574839b92360cb6 Mon Sep 17 00:00:00 2001
From: Majd Dibbiny <majd@mellanox.com>
Date: Wed, 18 Jan 2017 15:25:10 +0200
Subject: IB/mlx5: Add port counter support for Receive WQs

Counters weren't updated due to Receive WQs' traffic since the
counter-id was not associated with the RQ.

Added support for associating the q-counter-id with the Receive WQ.
The attachment is done only when changing WQ's state from RESET to
READY in modify-WQ command.

FW support is required for the above, without this support
Receive WQ counters will not count.

Signed-off-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/mlx5/qp.c | 12 +++++++++++-
 include/linux/mlx5/mlx5_ifc.h   |  2 +-
 2 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 5c7d655655bb..f395ee9d2fea 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -2429,7 +2429,7 @@ static int modify_raw_packet_qp_rq(struct mlx5_ib_dev *dev,
 	if (raw_qp_param->set_mask & MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID) {
 		if (MLX5_CAP_GEN(dev->mdev, modify_rq_counter_set_id)) {
 			MLX5_SET64(modify_rq_in, in, modify_bitmask,
-				   MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_MODIFY_RQ_COUNTER_SET_ID);
+				   MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_RQ_COUNTER_SET_ID);
 			MLX5_SET(rqc, rqc, counter_set_id, raw_qp_param->rq_q_ctr_id);
 		} else
 			pr_info_once("%s: RAW PACKET QP counters are not supported on current FW\n",
@@ -4910,6 +4910,16 @@ int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
 	MLX5_SET(modify_rq_in, in, rq_state, curr_wq_state);
 	MLX5_SET(rqc, rqc, state, wq_state);
 
+	if (curr_wq_state == IB_WQS_RESET && wq_state == IB_WQS_RDY) {
+		if (MLX5_CAP_GEN(dev->mdev, modify_rq_counter_set_id)) {
+			MLX5_SET64(modify_rq_in, in, modify_bitmask,
+				   MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_RQ_COUNTER_SET_ID);
+			MLX5_SET(rqc, rqc, counter_set_id, dev->port->q_cnts.set_id);
+		} else
+			pr_info_once("%s: Receive WQ counters are not supported on current FW\n",
+				     dev->ib_dev.name);
+	}
+
 	err = mlx5_core_modify_rq(dev->mdev, rwq->core_qp.qpn, in, inlen);
 	kvfree(in);
 	if (!err)
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 37327f6ba9cb..2d197d8a7025 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -4937,7 +4937,7 @@ struct mlx5_ifc_modify_rq_out_bits {
 
 enum {
 	MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_VSD = 1ULL << 1,
-	MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_MODIFY_RQ_COUNTER_SET_ID = 1ULL << 3,
+	MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_RQ_COUNTER_SET_ID = 1ULL << 3,
 };
 
 struct mlx5_ifc_modify_rq_in_bits {
-- 
cgit v1.2.3


From 49780d42dfc9ec0f4090c32ca59688449da1a1cd Mon Sep 17 00:00:00 2001
From: Artemy Kovalyov <artemyko@mellanox.com>
Date: Wed, 18 Jan 2017 16:58:10 +0200
Subject: IB/mlx5: Expose MR cache for mlx5_ib

Allow other parts of mlx5_ib to use MR cache mechanism.
* Add new functions mlx5_mr_cache_alloc and mlx5_mr_cache_free
* Traditional MTT MKey buckets are limited by MAX_UMR_CACHE_ENTRY
  Additinal buckets may be added above.

Signed-off-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  9 +++-
 drivers/infiniband/hw/mlx5/mr.c      | 99 ++++++++++++++++++++++++++----------
 include/linux/mlx5/driver.h          |  3 +-
 3 files changed, 82 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 13bef19649f2..efc44de3c7d7 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -541,6 +541,10 @@ struct mlx5_cache_ent {
 	struct dentry	       *dir;
 	char                    name[4];
 	u32                     order;
+	u32			xlt;
+	u32			access_mode;
+	u32			page;
+
 	u32			size;
 	u32                     cur;
 	u32                     miss;
@@ -555,6 +559,7 @@ struct mlx5_cache_ent {
 	struct work_struct	work;
 	struct delayed_work	dwork;
 	int			pending;
+	struct completion	compl;
 };
 
 struct mlx5_mr_cache {
@@ -837,7 +842,9 @@ void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
 int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq);
 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev);
 int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev);
-int mlx5_mr_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift);
+
+struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry);
+void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
 			    struct ib_mr_status *mr_status);
 struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd,
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 8cf2a67f9fb0..8f5b94d483e4 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -49,6 +49,7 @@ enum {
 
 static int clean_mr(struct mlx5_ib_mr *mr);
 static int use_umr(struct mlx5_ib_dev *dev, int order);
+static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
 
 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 {
@@ -149,6 +150,9 @@ static void reg_mr_callback(int status, void *context)
 	if (err)
 		pr_err("Error inserting to mkey tree. 0x%x\n", -err);
 	write_unlock_irqrestore(&table->lock, flags);
+
+	if (!completion_done(&ent->compl))
+		complete(&ent->compl);
 }
 
 static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
@@ -157,7 +161,6 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
 	struct mlx5_cache_ent *ent = &cache->ent[c];
 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 	struct mlx5_ib_mr *mr;
-	int npages = 1 << ent->order;
 	void *mkc;
 	u32 *in;
 	int err = 0;
@@ -185,11 +188,11 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
 
 		MLX5_SET(mkc, mkc, free, 1);
 		MLX5_SET(mkc, mkc, umr_en, 1);
-		MLX5_SET(mkc, mkc, access_mode, MLX5_MKC_ACCESS_MODE_MTT);
+		MLX5_SET(mkc, mkc, access_mode, ent->access_mode);
 
 		MLX5_SET(mkc, mkc, qpn, 0xffffff);
-		MLX5_SET(mkc, mkc, translations_octword_size, (npages + 1) / 2);
-		MLX5_SET(mkc, mkc, log_page_size, 12);
+		MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt);
+		MLX5_SET(mkc, mkc, log_page_size, ent->page);
 
 		spin_lock_irq(&ent->lock);
 		ent->pending++;
@@ -447,6 +450,42 @@ static void cache_work_func(struct work_struct *work)
 	__cache_work_func(ent);
 }
 
+struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry)
+{
+	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_cache_ent *ent;
+	struct mlx5_ib_mr *mr;
+	int err;
+
+	if (entry < 0 || entry >= MAX_MR_CACHE_ENTRIES) {
+		mlx5_ib_err(dev, "cache entry %d is out of range\n", entry);
+		return NULL;
+	}
+
+	ent = &cache->ent[entry];
+	while (1) {
+		spin_lock_irq(&ent->lock);
+		if (list_empty(&ent->head)) {
+			spin_unlock_irq(&ent->lock);
+
+			err = add_keys(dev, entry, 1);
+			if (err)
+				return ERR_PTR(err);
+
+			wait_for_completion(&ent->compl);
+		} else {
+			mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
+					      list);
+			list_del(&mr->list);
+			ent->cur--;
+			spin_unlock_irq(&ent->lock);
+			if (ent->cur < ent->limit)
+				queue_work(cache->wq, &ent->work);
+			return mr;
+		}
+	}
+}
+
 static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
 {
 	struct mlx5_mr_cache *cache = &dev->cache;
@@ -456,12 +495,12 @@ static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
 	int i;
 
 	c = order2idx(dev, order);
-	if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) {
+	if (c < 0 || c > MAX_UMR_CACHE_ENTRY) {
 		mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c);
 		return NULL;
 	}
 
-	for (i = c; i < MAX_MR_CACHE_ENTRIES; i++) {
+	for (i = c; i < MAX_UMR_CACHE_ENTRY; i++) {
 		ent = &cache->ent[i];
 
 		mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i);
@@ -488,7 +527,7 @@ static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
 	return mr;
 }
 
-static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
+void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 {
 	struct mlx5_mr_cache *cache = &dev->cache;
 	struct mlx5_cache_ent *ent;
@@ -500,6 +539,10 @@ static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 		mlx5_ib_warn(dev, "order %d, cache index %d\n", mr->order, c);
 		return;
 	}
+
+	if (unreg_umr(dev, mr))
+		return;
+
 	ent = &cache->ent[c];
 	spin_lock_irq(&ent->lock);
 	list_add_tail(&mr->list, &ent->head);
@@ -602,7 +645,6 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 {
 	struct mlx5_mr_cache *cache = &dev->cache;
 	struct mlx5_cache_ent *ent;
-	int limit;
 	int err;
 	int i;
 
@@ -615,26 +657,33 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 
 	setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev);
 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
-		INIT_LIST_HEAD(&cache->ent[i].head);
-		spin_lock_init(&cache->ent[i].lock);
-
 		ent = &cache->ent[i];
 		INIT_LIST_HEAD(&ent->head);
 		spin_lock_init(&ent->lock);
 		ent->order = i + 2;
 		ent->dev = dev;
+		ent->limit = 0;
 
-		if ((dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) &&
-		    mlx5_core_is_pf(dev->mdev) &&
-		    use_umr(dev, ent->order))
-			limit = dev->mdev->profile->mr_cache[i].limit;
-		else
-			limit = 0;
-
+		init_completion(&ent->compl);
 		INIT_WORK(&ent->work, cache_work_func);
 		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
-		ent->limit = limit;
 		queue_work(cache->wq, &ent->work);
+
+		if (i > MAX_UMR_CACHE_ENTRY)
+			continue;
+
+		if (!use_umr(dev, ent->order))
+			continue;
+
+		ent->page = PAGE_SHIFT;
+		ent->xlt = (1 << ent->order) * sizeof(struct mlx5_mtt) /
+			   MLX5_IB_UMR_OCTOWORD;
+		ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
+		if ((dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) &&
+		    mlx5_core_is_pf(dev->mdev))
+			ent->limit = dev->mdev->profile->mr_cache[i].limit;
+		else
+			ent->limit = 0;
 	}
 
 	err = mlx5_mr_cache_debugfs_init(dev);
@@ -758,7 +807,7 @@ static int get_octo_len(u64 addr, u64 len, int page_size)
 static int use_umr(struct mlx5_ib_dev *dev, int order)
 {
 	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
-		return order < MAX_MR_CACHE_ENTRIES + 2;
+		return order <= MAX_UMR_CACHE_ENTRY + 2;
 	return order <= MLX5_MAX_UMR_SHIFT;
 }
 
@@ -871,7 +920,7 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
 				 MLX5_IB_UPD_XLT_ENABLE);
 
 	if (err) {
-		free_cached_mr(dev, mr);
+		mlx5_mr_cache_free(dev, mr);
 		return ERR_PTR(err);
 	}
 
@@ -1091,6 +1140,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
 		goto err_2;
 	}
 	mr->mmkey.type = MLX5_MKEY_MR;
+	mr->desc_size = sizeof(struct mlx5_mtt);
 	mr->umem = umem;
 	mr->dev = dev;
 	mr->live = 1;
@@ -1398,12 +1448,7 @@ static int clean_mr(struct mlx5_ib_mr *mr)
 			return err;
 		}
 	} else {
-		err = unreg_umr(dev, mr);
-		if (err) {
-			mlx5_ib_warn(dev, "failed unregister\n");
-			return err;
-		}
-		free_cached_mr(dev, mr);
+		mlx5_mr_cache_free(dev, mr);
 	}
 
 	if (!umred)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index b8d69aeb1784..2534b8a0fd7b 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1052,7 +1052,8 @@ enum {
 };
 
 enum {
-	MAX_MR_CACHE_ENTRIES    = 21,
+	MAX_UMR_CACHE_ENTRY = 20,
+	MAX_MR_CACHE_ENTRIES
 };
 
 enum {
-- 
cgit v1.2.3


From 81713d3788d2e6bc005f15ee1c59d0eb06050a6b Mon Sep 17 00:00:00 2001
From: Artemy Kovalyov <artemyko@mellanox.com>
Date: Wed, 18 Jan 2017 16:58:11 +0200
Subject: IB/mlx5: Add implicit MR support

Add implicit MR, covering entire user address space.
The MR is implemented as an indirect KSM MR consisting of
1GB direct MRs.
Pages and direct MRs are added/removed to MR by ODP.

Signed-off-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/mlx5/main.c    |   2 +
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  20 +-
 drivers/infiniband/hw/mlx5/mr.c      |  33 ++-
 drivers/infiniband/hw/mlx5/odp.c     | 505 ++++++++++++++++++++++++++++++++---
 include/linux/mlx5/driver.h          |   2 +
 5 files changed, 513 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index fe37da2be26f..eb8719ca500e 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -3583,6 +3583,8 @@ static int __init mlx5_ib_init(void)
 {
 	int err;
 
+	mlx5_ib_odp_init();
+
 	err = mlx5_register_interface(&mlx5_ib_interface);
 
 	return err;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index efc44de3c7d7..3cd064b5f0bf 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -202,6 +202,7 @@ struct mlx5_ib_flow_db {
 #define MLX5_IB_UPD_XLT_ADDR	      BIT(3)
 #define MLX5_IB_UPD_XLT_PD	      BIT(4)
 #define MLX5_IB_UPD_XLT_ACCESS	      BIT(5)
+#define MLX5_IB_UPD_XLT_INDIRECT      BIT(6)
 
 /* Private QP creation flags to be passed in ib_qp_init_attr.create_flags.
  *
@@ -503,6 +504,10 @@ struct mlx5_ib_mr {
 	int			live;
 	void			*descs_alloc;
 	int			access_flags; /* Needed for rereg MR */
+
+	struct mlx5_ib_mr      *parent;
+	atomic_t		num_leaf_free;
+	wait_queue_head_t       q_leaf_free;
 };
 
 struct mlx5_ib_mw {
@@ -637,6 +642,7 @@ struct mlx5_ib_dev {
 	 * being used by a page fault handler.
 	 */
 	struct srcu_struct      mr_srcu;
+	u32			null_mkey;
 #endif
 	struct mlx5_ib_flow_db	flow_db;
 	/* protect resources needed as part of reset flow */
@@ -789,6 +795,9 @@ struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
 int mlx5_ib_dealloc_mw(struct ib_mw *mw);
 int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 		       int page_shift, int flags);
+struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
+					     int access_flags);
+void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr);
 int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 			  u64 length, u64 virt_addr, int access_flags,
 			  struct ib_pd *pd, struct ib_udata *udata);
@@ -868,6 +877,9 @@ int __init mlx5_ib_odp_init(void);
 void mlx5_ib_odp_cleanup(void);
 void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
 			      unsigned long end);
+void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent);
+void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
+			   size_t nentries, struct mlx5_ib_mr *mr, int flags);
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 {
@@ -875,9 +887,13 @@ static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 }
 
 static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; }
-static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev)	{}
+static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev)	    {}
 static inline int mlx5_ib_odp_init(void) { return 0; }
-static inline void mlx5_ib_odp_cleanup(void)				{}
+static inline void mlx5_ib_odp_cleanup(void)				    {}
+static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {}
+static inline void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
+					 size_t nentries, struct mlx5_ib_mr *mr,
+					 int flags) {}
 
 #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 8f5b94d483e4..3c1f483d003f 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -469,7 +469,7 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry)
 			spin_unlock_irq(&ent->lock);
 
 			err = add_keys(dev, entry, 1);
-			if (err)
+			if (err && err != -EAGAIN)
 				return ERR_PTR(err);
 
 			wait_for_completion(&ent->compl);
@@ -669,8 +669,10 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
 		queue_work(cache->wq, &ent->work);
 
-		if (i > MAX_UMR_CACHE_ENTRY)
+		if (i > MAX_UMR_CACHE_ENTRY) {
+			mlx5_odp_init_mr_cache_entry(ent);
 			continue;
+		}
 
 		if (!use_umr(dev, ent->order))
 			continue;
@@ -935,6 +937,10 @@ static inline int populate_xlt(struct mlx5_ib_mr *mr, int idx, int npages,
 {
 	struct mlx5_ib_dev *dev = mr->dev;
 	struct ib_umem *umem = mr->umem;
+	if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
+		mlx5_odp_populate_klm(xlt, idx, npages, mr, flags);
+		return npages;
+	}
 
 	npages = min_t(size_t, npages, ib_umem_num_pages(umem) - idx);
 
@@ -968,7 +974,9 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 	struct mlx5_umr_wr wr;
 	struct ib_sge sg;
 	int err = 0;
-	int desc_size = sizeof(struct mlx5_mtt);
+	int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT)
+			       ? sizeof(struct mlx5_klm)
+			       : sizeof(struct mlx5_mtt);
 	const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size;
 	const int page_mask = page_align - 1;
 	size_t pages_mapped = 0;
@@ -1186,6 +1194,18 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 
 	mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
 		    start, virt_addr, length, access_flags);
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	if (!start && length == U64_MAX) {
+		if (!(access_flags & IB_ACCESS_ON_DEMAND) ||
+		    !(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
+			return ERR_PTR(-EINVAL);
+
+		mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
+		return &mr->ibmr;
+	}
+#endif
+
 	err = mr_umem_get(pd, start, length, access_flags, &umem, &npages,
 			   &page_shift, &ncont, &order);
 
@@ -1471,8 +1491,11 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
 		/* Wait for all running page-fault handlers to finish. */
 		synchronize_srcu(&dev->mr_srcu);
 		/* Destroy all page mappings */
-		mlx5_ib_invalidate_range(umem, ib_umem_start(umem),
-					 ib_umem_end(umem));
+		if (umem->odp_data->page_list)
+			mlx5_ib_invalidate_range(umem, ib_umem_start(umem),
+						 ib_umem_end(umem));
+		else
+			mlx5_ib_free_implicit_mr(mr);
 		/*
 		 * We kill the umem before the MR for ODP,
 		 * so that there will not be any invalidations in
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index e5bc267aca73..d7b12f0750e2 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -34,6 +34,7 @@
 #include <rdma/ib_umem_odp.h>
 
 #include "mlx5_ib.h"
+#include "cmd.h"
 
 #define MAX_PREFETCH_LEN (4*1024*1024U)
 
@@ -41,6 +42,140 @@
  * a pagefault. */
 #define MMU_NOTIFIER_TIMEOUT 1000
 
+#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT)
+#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT)
+#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS)
+#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT)
+#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1))
+
+#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT
+
+static u64 mlx5_imr_ksm_entries;
+
+static int check_parent(struct ib_umem_odp *odp,
+			       struct mlx5_ib_mr *parent)
+{
+	struct mlx5_ib_mr *mr = odp->private;
+
+	return mr && mr->parent == parent;
+}
+
+static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp)
+{
+	struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent;
+	struct ib_ucontext *ctx = odp->umem->context;
+	struct rb_node *rb;
+
+	down_read(&ctx->umem_rwsem);
+	while (1) {
+		rb = rb_next(&odp->interval_tree.rb);
+		if (!rb)
+			goto not_found;
+		odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
+		if (check_parent(odp, parent))
+			goto end;
+	}
+not_found:
+	odp = NULL;
+end:
+	up_read(&ctx->umem_rwsem);
+	return odp;
+}
+
+static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx,
+				      u64 start, u64 length,
+				      struct mlx5_ib_mr *parent)
+{
+	struct ib_umem_odp *odp;
+	struct rb_node *rb;
+
+	down_read(&ctx->umem_rwsem);
+	odp = rbt_ib_umem_lookup(&ctx->umem_tree, start, length);
+	if (!odp)
+		goto end;
+
+	while (1) {
+		if (check_parent(odp, parent))
+			goto end;
+		rb = rb_next(&odp->interval_tree.rb);
+		if (!rb)
+			goto not_found;
+		odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
+		if (ib_umem_start(odp->umem) > start + length)
+			goto not_found;
+	}
+not_found:
+	odp = NULL;
+end:
+	up_read(&ctx->umem_rwsem);
+	return odp;
+}
+
+void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
+			   size_t nentries, struct mlx5_ib_mr *mr, int flags)
+{
+	struct ib_pd *pd = mr->ibmr.pd;
+	struct ib_ucontext *ctx = pd->uobject->context;
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct ib_umem_odp *odp;
+	unsigned long va;
+	int i;
+
+	if (flags & MLX5_IB_UPD_XLT_ZAP) {
+		for (i = 0; i < nentries; i++, pklm++) {
+			pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
+			pklm->key = cpu_to_be32(dev->null_mkey);
+			pklm->va = 0;
+		}
+		return;
+	}
+
+	odp = odp_lookup(ctx, offset * MLX5_IMR_MTT_SIZE,
+			     nentries * MLX5_IMR_MTT_SIZE, mr);
+
+	for (i = 0; i < nentries; i++, pklm++) {
+		pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
+		va = (offset + i) * MLX5_IMR_MTT_SIZE;
+		if (odp && odp->umem->address == va) {
+			struct mlx5_ib_mr *mtt = odp->private;
+
+			pklm->key = cpu_to_be32(mtt->ibmr.lkey);
+			odp = odp_next(odp);
+		} else {
+			pklm->key = cpu_to_be32(dev->null_mkey);
+		}
+		mlx5_ib_dbg(dev, "[%d] va %lx key %x\n",
+			    i, va, be32_to_cpu(pklm->key));
+	}
+}
+
+static void mr_leaf_free_action(struct work_struct *work)
+{
+	struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
+	int idx = ib_umem_start(odp->umem) >> MLX5_IMR_MTT_SHIFT;
+	struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;
+
+	mr->parent = NULL;
+	synchronize_srcu(&mr->dev->mr_srcu);
+
+	if (!READ_ONCE(odp->dying)) {
+		mr->parent = imr;
+		if (atomic_dec_and_test(&imr->num_leaf_free))
+			wake_up(&imr->q_leaf_free);
+		return;
+	}
+
+	ib_umem_release(odp->umem);
+	if (imr->live)
+		mlx5_ib_update_xlt(imr, idx, 1, 0,
+				   MLX5_IB_UPD_XLT_INDIRECT |
+				   MLX5_IB_UPD_XLT_ATOMIC);
+	mlx5_mr_cache_free(mr->dev, mr);
+
+	if (atomic_dec_and_test(&imr->num_leaf_free))
+		wake_up(&imr->q_leaf_free);
+}
+
 void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
 			      unsigned long end)
 {
@@ -111,6 +246,13 @@ void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
 	 */
 
 	ib_umem_odp_unmap_dma_pages(umem, start, end);
+
+	if (unlikely(!umem->npages && mr->parent &&
+		     !umem->odp_data->dying)) {
+		WRITE_ONCE(umem->odp_data->dying, 1);
+		atomic_inc(&mr->parent->num_leaf_free);
+		schedule_work(&umem->odp_data->work);
+	}
 }
 
 void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
@@ -147,6 +289,11 @@ void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
 		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
 
+	if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
+	    MLX5_CAP_GEN(dev->mdev, null_mkey) &&
+	    MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
+		caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
+
 	return;
 }
 
@@ -184,6 +331,197 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
 			    wq_num);
 }
 
+static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
+					    struct ib_umem *umem,
+					    bool ksm, int access_flags)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct mlx5_ib_mr *mr;
+	int err;
+
+	mr = mlx5_mr_cache_alloc(dev, ksm ? MLX5_IMR_KSM_CACHE_ENTRY :
+					    MLX5_IMR_MTT_CACHE_ENTRY);
+
+	if (IS_ERR(mr))
+		return mr;
+
+	mr->ibmr.pd = pd;
+
+	mr->dev = dev;
+	mr->access_flags = access_flags;
+	mr->mmkey.iova = 0;
+	mr->umem = umem;
+
+	if (ksm) {
+		err = mlx5_ib_update_xlt(mr, 0,
+					 mlx5_imr_ksm_entries,
+					 MLX5_KSM_PAGE_SHIFT,
+					 MLX5_IB_UPD_XLT_INDIRECT |
+					 MLX5_IB_UPD_XLT_ZAP |
+					 MLX5_IB_UPD_XLT_ENABLE);
+
+	} else {
+		err = mlx5_ib_update_xlt(mr, 0,
+					 MLX5_IMR_MTT_ENTRIES,
+					 PAGE_SHIFT,
+					 MLX5_IB_UPD_XLT_ZAP |
+					 MLX5_IB_UPD_XLT_ENABLE |
+					 MLX5_IB_UPD_XLT_ATOMIC);
+	}
+
+	if (err)
+		goto fail;
+
+	mr->ibmr.lkey = mr->mmkey.key;
+	mr->ibmr.rkey = mr->mmkey.key;
+
+	mr->live = 1;
+
+	mlx5_ib_dbg(dev, "key %x dev %p mr %p\n",
+		    mr->mmkey.key, dev->mdev, mr);
+
+	return mr;
+
+fail:
+	mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
+	mlx5_mr_cache_free(dev, mr);
+
+	return ERR_PTR(err);
+}
+
+static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr,
+						u64 io_virt, size_t bcnt)
+{
+	struct ib_ucontext *ctx = mr->ibmr.pd->uobject->context;
+	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device);
+	struct ib_umem_odp *odp, *result = NULL;
+	u64 addr = io_virt & MLX5_IMR_MTT_MASK;
+	int nentries = 0, start_idx = 0, ret;
+	struct mlx5_ib_mr *mtt;
+	struct ib_umem *umem;
+
+	mutex_lock(&mr->umem->odp_data->umem_mutex);
+	odp = odp_lookup(ctx, addr, 1, mr);
+
+	mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n",
+		    io_virt, bcnt, addr, odp);
+
+next_mr:
+	if (likely(odp)) {
+		if (nentries)
+			nentries++;
+	} else {
+		umem = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE);
+		if (IS_ERR(umem)) {
+			mutex_unlock(&mr->umem->odp_data->umem_mutex);
+			return ERR_CAST(umem);
+		}
+
+		mtt = implicit_mr_alloc(mr->ibmr.pd, umem, 0, mr->access_flags);
+		if (IS_ERR(mtt)) {
+			mutex_unlock(&mr->umem->odp_data->umem_mutex);
+			ib_umem_release(umem);
+			return ERR_CAST(mtt);
+		}
+
+		odp = umem->odp_data;
+		odp->private = mtt;
+		mtt->umem = umem;
+		mtt->mmkey.iova = addr;
+		mtt->parent = mr;
+		INIT_WORK(&odp->work, mr_leaf_free_action);
+
+		if (!nentries)
+			start_idx = addr >> MLX5_IMR_MTT_SHIFT;
+		nentries++;
+	}
+
+	odp->dying = 0;
+
+	/* Return first odp if region not covered by single one */
+	if (likely(!result))
+		result = odp;
+
+	addr += MLX5_IMR_MTT_SIZE;
+	if (unlikely(addr < io_virt + bcnt)) {
+		odp = odp_next(odp);
+		if (odp && odp->umem->address != addr)
+			odp = NULL;
+		goto next_mr;
+	}
+
+	if (unlikely(nentries)) {
+		ret = mlx5_ib_update_xlt(mr, start_idx, nentries, 0,
+					 MLX5_IB_UPD_XLT_INDIRECT |
+					 MLX5_IB_UPD_XLT_ATOMIC);
+		if (ret) {
+			mlx5_ib_err(dev, "Failed to update PAS\n");
+			result = ERR_PTR(ret);
+		}
+	}
+
+	mutex_unlock(&mr->umem->odp_data->umem_mutex);
+	return result;
+}
+
+struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
+					     int access_flags)
+{
+	struct ib_ucontext *ctx = pd->ibpd.uobject->context;
+	struct mlx5_ib_mr *imr;
+	struct ib_umem *umem;
+
+	umem = ib_umem_get(ctx, 0, 0, IB_ACCESS_ON_DEMAND, 0);
+	if (IS_ERR(umem))
+		return ERR_CAST(umem);
+
+	imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags);
+	if (IS_ERR(imr)) {
+		ib_umem_release(umem);
+		return ERR_CAST(imr);
+	}
+
+	imr->umem = umem;
+	init_waitqueue_head(&imr->q_leaf_free);
+	atomic_set(&imr->num_leaf_free, 0);
+
+	return imr;
+}
+
+static int mr_leaf_free(struct ib_umem *umem, u64 start,
+			u64 end, void *cookie)
+{
+	struct mlx5_ib_mr *mr = umem->odp_data->private, *imr = cookie;
+
+	if (mr->parent != imr)
+		return 0;
+
+	ib_umem_odp_unmap_dma_pages(umem,
+				    ib_umem_start(umem),
+				    ib_umem_end(umem));
+
+	if (umem->odp_data->dying)
+		return 0;
+
+	WRITE_ONCE(umem->odp_data->dying, 1);
+	atomic_inc(&imr->num_leaf_free);
+	schedule_work(&umem->odp_data->work);
+
+	return 0;
+}
+
+void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
+{
+	struct ib_ucontext *ctx = imr->ibmr.pd->uobject->context;
+
+	down_read(&ctx->umem_rwsem);
+	rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX,
+				      mr_leaf_free, imr);
+	up_read(&ctx->umem_rwsem);
+
+	wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
+}
+
 /*
  * Handle a single data segment in a page-fault WQE or RDMA region.
  *
@@ -195,47 +533,43 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
  * -EFAULT when there's an error mapping the requested pages. The caller will
  *  abort the page fault handling.
  */
-static int pagefault_single_data_segment(struct mlx5_ib_dev *mib_dev,
+static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
 					 u32 key, u64 io_virt, size_t bcnt,
 					 u32 *bytes_committed,
 					 u32 *bytes_mapped)
 {
 	int srcu_key;
-	unsigned int current_seq;
+	unsigned int current_seq = 0;
 	u64 start_idx;
 	int npages = 0, ret = 0;
 	struct mlx5_ib_mr *mr;
 	u64 access_mask = ODP_READ_ALLOWED_BIT;
+	struct ib_umem_odp *odp;
+	int implicit = 0;
+	size_t size;
 
-	srcu_key = srcu_read_lock(&mib_dev->mr_srcu);
-	mr = mlx5_ib_odp_find_mr_lkey(mib_dev, key);
+	srcu_key = srcu_read_lock(&dev->mr_srcu);
+	mr = mlx5_ib_odp_find_mr_lkey(dev, key);
 	/*
 	 * If we didn't find the MR, it means the MR was closed while we were
 	 * handling the ODP event. In this case we return -EFAULT so that the
 	 * QP will be closed.
 	 */
 	if (!mr || !mr->ibmr.pd) {
-		pr_err("Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n",
-		       key);
+		mlx5_ib_dbg(dev, "Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n",
+			    key);
 		ret = -EFAULT;
 		goto srcu_unlock;
 	}
 	if (!mr->umem->odp_data) {
-		pr_debug("skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
-			 key);
+		mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
+			    key);
 		if (bytes_mapped)
 			*bytes_mapped +=
 				(bcnt - *bytes_committed);
 		goto srcu_unlock;
 	}
 
-	current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq);
-	/*
-	 * Ensure the sequence number is valid for some time before we call
-	 * gup.
-	 */
-	smp_rmb();
-
 	/*
 	 * Avoid branches - this code will perform correctly
 	 * in all iterations (in iteration 2 and above,
@@ -244,63 +578,109 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *mib_dev,
 	io_virt += *bytes_committed;
 	bcnt -= *bytes_committed;
 
+	if (!mr->umem->odp_data->page_list) {
+		odp = implicit_mr_get_data(mr, io_virt, bcnt);
+
+		if (IS_ERR(odp)) {
+			ret = PTR_ERR(odp);
+			goto srcu_unlock;
+		}
+		mr = odp->private;
+		implicit = 1;
+
+	} else {
+		odp = mr->umem->odp_data;
+	}
+
+next_mr:
+	current_seq = READ_ONCE(odp->notifiers_seq);
+	/*
+	 * Ensure the sequence number is valid for some time before we call
+	 * gup.
+	 */
+	smp_rmb();
+
+	size = min_t(size_t, bcnt, ib_umem_end(odp->umem) - io_virt);
 	start_idx = (io_virt - (mr->mmkey.iova & PAGE_MASK)) >> PAGE_SHIFT;
 
 	if (mr->umem->writable)
 		access_mask |= ODP_WRITE_ALLOWED_BIT;
-	npages = ib_umem_odp_map_dma_pages(mr->umem, io_virt, bcnt,
-					   access_mask, current_seq);
-	if (npages < 0) {
-		ret = npages;
+
+	ret = ib_umem_odp_map_dma_pages(mr->umem, io_virt, size,
+					access_mask, current_seq);
+
+	if (ret < 0)
 		goto srcu_unlock;
-	}
 
-	if (npages > 0) {
-		mutex_lock(&mr->umem->odp_data->umem_mutex);
+	if (ret > 0) {
+		int np = ret;
+
+		mutex_lock(&odp->umem_mutex);
 		if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) {
 			/*
 			 * No need to check whether the MTTs really belong to
 			 * this MR, since ib_umem_odp_map_dma_pages already
 			 * checks this.
 			 */
-			ret = mlx5_ib_update_xlt(mr, start_idx, npages,
+			ret = mlx5_ib_update_xlt(mr, start_idx, np,
 						 PAGE_SHIFT,
 						 MLX5_IB_UPD_XLT_ATOMIC);
 		} else {
 			ret = -EAGAIN;
 		}
-		mutex_unlock(&mr->umem->odp_data->umem_mutex);
+		mutex_unlock(&odp->umem_mutex);
 		if (ret < 0) {
 			if (ret != -EAGAIN)
-				pr_err("Failed to update mkey page tables\n");
+				mlx5_ib_err(dev, "Failed to update mkey page tables\n");
 			goto srcu_unlock;
 		}
 
 		if (bytes_mapped) {
-			u32 new_mappings = npages * PAGE_SIZE -
+			u32 new_mappings = np * PAGE_SIZE -
 				(io_virt - round_down(io_virt, PAGE_SIZE));
-			*bytes_mapped += min_t(u32, new_mappings, bcnt);
+			*bytes_mapped += min_t(u32, new_mappings, size);
 		}
+
+		npages += np;
+	}
+
+	bcnt -= size;
+	if (unlikely(bcnt)) {
+		struct ib_umem_odp *next;
+
+		io_virt += size;
+		next = odp_next(odp);
+		if (unlikely(!next || next->umem->address != io_virt)) {
+			mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n",
+				    io_virt, next);
+			ret = -EAGAIN;
+			goto srcu_unlock_no_wait;
+		}
+		odp = next;
+		mr = odp->private;
+		goto next_mr;
 	}
 
 srcu_unlock:
 	if (ret == -EAGAIN) {
-		if (!mr->umem->odp_data->dying) {
-			struct ib_umem_odp *odp_data = mr->umem->odp_data;
+		if (implicit || !odp->dying) {
 			unsigned long timeout =
 				msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
 
 			if (!wait_for_completion_timeout(
-					&odp_data->notifier_completion,
+					&odp->notifier_completion,
 					timeout)) {
-				pr_warn("timeout waiting for mmu notifier completion\n");
+				mlx5_ib_warn(dev, "timeout waiting for mmu notifier. seq %d against %d\n",
+					     current_seq, odp->notifiers_seq);
 			}
 		} else {
 			/* The MR is being killed, kill the QP as well. */
 			ret = -EFAULT;
 		}
 	}
-	srcu_read_unlock(&mib_dev->mr_srcu, srcu_key);
+
+srcu_unlock_no_wait:
+	srcu_read_unlock(&dev->mr_srcu, srcu_key);
 	*bytes_committed = 0;
 	return ret ? ret : npages;
 }
@@ -618,8 +998,8 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
 		goto resolve_page_fault;
 	} else if (ret < 0 || total_wqe_bytes > bytes_mapped) {
 		if (ret != -ENOENT)
-			mlx5_ib_err(dev, "Error getting user pages for page fault. Error: %d\n",
-				    ret);
+			mlx5_ib_err(dev, "PAGE FAULT error: %d. QP 0x%x. type: 0x%x\n",
+				    ret, pfault->wqe.wq_num, pfault->type);
 		goto resolve_page_fault;
 	}
 
@@ -627,7 +1007,7 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
 resolve_page_fault:
 	mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
 	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
-		    pfault->token, resume_with_error,
+		    pfault->wqe.wq_num, resume_with_error,
 		    pfault->type);
 	free_page((unsigned long)buffer);
 }
@@ -700,10 +1080,9 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
 		ret = pagefault_single_data_segment(dev, rkey, address,
 						    prefetch_len,
 						    &bytes_committed, NULL);
-		if (ret < 0) {
+		if (ret < 0 && ret != -EAGAIN) {
 			mlx5_ib_warn(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
-				     ret, pfault->token, address,
-				     prefetch_len);
+				     ret, pfault->token, address, prefetch_len);
 		}
 	}
 }
@@ -728,19 +1107,61 @@ void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context,
 	}
 }
 
-int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev)
+void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
+{
+	if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
+		return;
+
+	switch (ent->order - 2) {
+	case MLX5_IMR_MTT_CACHE_ENTRY:
+		ent->page = PAGE_SHIFT;
+		ent->xlt = MLX5_IMR_MTT_ENTRIES *
+			   sizeof(struct mlx5_mtt) /
+			   MLX5_IB_UMR_OCTOWORD;
+		ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
+		ent->limit = 0;
+		break;
+
+	case MLX5_IMR_KSM_CACHE_ENTRY:
+		ent->page = MLX5_KSM_PAGE_SHIFT;
+		ent->xlt = mlx5_imr_ksm_entries *
+			   sizeof(struct mlx5_klm) /
+			   MLX5_IB_UMR_OCTOWORD;
+		ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
+		ent->limit = 0;
+		break;
+	}
+}
+
+int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
 {
 	int ret;
 
-	ret = init_srcu_struct(&ibdev->mr_srcu);
+	ret = init_srcu_struct(&dev->mr_srcu);
 	if (ret)
 		return ret;
 
+	if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
+		ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
+		if (ret) {
+			mlx5_ib_err(dev, "Error getting null_mkey %d\n", ret);
+			return ret;
+		}
+	}
+
 	return 0;
 }
 
-void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev)
+void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *dev)
+{
+	cleanup_srcu_struct(&dev->mr_srcu);
+}
+
+int mlx5_ib_odp_init(void)
 {
-	cleanup_srcu_struct(&ibdev->mr_srcu);
+	mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -
+				       MLX5_IMR_MTT_BITS);
+
+	return 0;
 }
 
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 2534b8a0fd7b..886ff2b00500 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1053,6 +1053,8 @@ enum {
 
 enum {
 	MAX_UMR_CACHE_ENTRY = 20,
+	MLX5_IMR_MTT_CACHE_ENTRY,
+	MLX5_IMR_KSM_CACHE_ENTRY,
 	MAX_MR_CACHE_ENTRIES
 };
 
-- 
cgit v1.2.3


From 51c6ce2ae35980c755af33461c3138570ded615e Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Sat, 11 Feb 2017 23:02:18 -0700
Subject: vmbus: callback is in softirq not workqueue

The callback is done via tasklet not workqueue.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/hyperv.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index e208e6437f5b..c9b6d533958f 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -32,7 +32,6 @@
 #include <linux/scatterlist.h>
 #include <linux/list.h>
 #include <linux/timer.h>
-#include <linux/workqueue.h>
 #include <linux/completion.h>
 #include <linux/device.h>
 #include <linux/mod_devicetable.h>
@@ -743,9 +742,7 @@ struct vmbus_channel {
 
 	struct vmbus_close_msg close_msg;
 
-	/* Channel callback are invoked in this workqueue context */
-	/* HANDLE dataWorkQueue; */
-
+	/* Channel callback's invoked in softirq context */
 	void (*onchannel_callback)(void *context);
 	void *channel_callback_context;
 
-- 
cgit v1.2.3


From 631e63a9f346cb657761ae22138f294718696501 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Sat, 11 Feb 2017 23:02:20 -0700
Subject: vmbus: change to per channel tasklet

Make the event handling tasklet per channel rather than per-cpu.
This allows for better fairness when getting lots of data on the same
cpu.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/channel.c      |  2 +-
 drivers/hv/channel_mgmt.c | 16 +++++-----
 drivers/hv/connection.c   | 78 ++---------------------------------------------
 drivers/hv/hv.c           |  2 --
 drivers/hv/hyperv_vmbus.h |  1 -
 drivers/hv/vmbus_drv.c    | 58 ++++++++++++++++++++++++++++++-----
 include/linux/hyperv.h    |  3 +-
 7 files changed, 64 insertions(+), 96 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index 789c75f6df26..18cc1c78260d 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -530,7 +530,7 @@ static int vmbus_close_internal(struct vmbus_channel *channel)
 	int ret;
 
 	/*
-	 * process_chn_event(), running in the tasklet, can race
+	 * vmbus_on_event(), running in the tasklet, can race
 	 * with vmbus_close_internal() in the case of SMP guest, e.g., when
 	 * the former is accessing channel->inbound.ring_buffer, the latter
 	 * could be freeing the ring_buffer pages.
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 579ad2560a39..2f6270d76b79 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -339,6 +339,9 @@ static struct vmbus_channel *alloc_channel(void)
 	INIT_LIST_HEAD(&channel->sc_list);
 	INIT_LIST_HEAD(&channel->percpu_list);
 
+	tasklet_init(&channel->callback_event,
+		     vmbus_on_event, (unsigned long)channel);
+
 	return channel;
 }
 
@@ -347,6 +350,7 @@ static struct vmbus_channel *alloc_channel(void)
  */
 static void free_channel(struct vmbus_channel *channel)
 {
+	tasklet_kill(&channel->callback_event);
 	kfree(channel);
 }
 
@@ -380,21 +384,15 @@ static void vmbus_release_relid(u32 relid)
 
 void hv_event_tasklet_disable(struct vmbus_channel *channel)
 {
-	struct hv_per_cpu_context *hv_cpu;
-
-	hv_cpu = per_cpu_ptr(hv_context.cpu_context, channel->target_cpu);
-	tasklet_disable(&hv_cpu->event_dpc);
+	tasklet_disable(&channel->callback_event);
 }
 
 void hv_event_tasklet_enable(struct vmbus_channel *channel)
 {
-	struct hv_per_cpu_context *hv_cpu;
-
-	hv_cpu = per_cpu_ptr(hv_context.cpu_context, channel->target_cpu);
-	tasklet_enable(&hv_cpu->event_dpc);
+	tasklet_enable(&channel->callback_event);
 
 	/* In case there is any pending event */
-	tasklet_schedule(&hv_cpu->event_dpc);
+	tasklet_schedule(&channel->callback_event);
 }
 
 void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid)
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 158f12823baf..27e72dc07e12 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -259,29 +259,6 @@ void vmbus_disconnect(void)
 	vmbus_connection.monitor_pages[1] = NULL;
 }
 
-/*
- * Map the given relid to the corresponding channel based on the
- * per-cpu list of channels that have been affinitized to this CPU.
- * This will be used in the channel callback path as we can do this
- * mapping in a lock-free fashion.
- */
-static struct vmbus_channel *pcpu_relid2channel(u32 relid)
-{
-	struct hv_per_cpu_context *hv_cpu
-		= this_cpu_ptr(hv_context.cpu_context);
-	struct vmbus_channel *found_channel = NULL;
-	struct vmbus_channel *channel;
-
-	list_for_each_entry(channel, &hv_cpu->chan_list, percpu_list) {
-		if (channel->offermsg.child_relid == relid) {
-			found_channel = channel;
-			break;
-		}
-	}
-
-	return found_channel;
-}
-
 /*
  * relid2channel - Get the channel object given its
  * child relative id (ie channel id)
@@ -318,24 +295,15 @@ struct vmbus_channel *relid2channel(u32 relid)
 }
 
 /*
- * process_chn_event - Process a channel event notification
+ * vmbus_on_event - Process a channel event notification
  */
-static void process_chn_event(u32 relid)
+void vmbus_on_event(unsigned long data)
 {
-	struct vmbus_channel *channel;
+	struct vmbus_channel *channel = (void *) data;
 	void *arg;
 	bool read_state;
 	u32 bytes_to_read;
 
-	/*
-	 * Find the channel based on this relid and invokes the
-	 * channel callback to process the event
-	 */
-	channel = pcpu_relid2channel(relid);
-
-	if (!channel)
-		return;
-
 	/*
 	 * A channel once created is persistent even when there
 	 * is no driver handling the device. An unloading driver
@@ -344,7 +312,6 @@ static void process_chn_event(u32 relid)
 	 * Thus, checking and invoking the driver specific callback takes
 	 * care of orderly unloading of the driver.
 	 */
-
 	if (channel->onchannel_callback != NULL) {
 		arg = channel->channel_callback_context;
 		read_state = channel->batched_reading;
@@ -372,45 +339,6 @@ static void process_chn_event(u32 relid)
 	}
 }
 
-/*
- * vmbus_on_event - Handler for events
- */
-void vmbus_on_event(unsigned long data)
-{
-	struct hv_per_cpu_context *hv_cpu = (void *)data;
-	unsigned long *recv_int_page;
-	u32 maxbits, relid;
-
-	if (vmbus_proto_version < VERSION_WIN8) {
-		maxbits = MAX_NUM_CHANNELS_SUPPORTED;
-		recv_int_page = vmbus_connection.recv_int_page;
-	} else {
-		/*
-		 * When the host is win8 and beyond, the event page
-		 * can be directly checked to get the id of the channel
-		 * that has the interrupt pending.
-		 */
-		void *page_addr = hv_cpu->synic_event_page;
-		union hv_synic_event_flags *event
-			= (union hv_synic_event_flags *)page_addr +
-						 VMBUS_MESSAGE_SINT;
-
-		maxbits = HV_EVENT_FLAGS_COUNT;
-		recv_int_page = event->flags;
-	}
-
-	if (unlikely(!recv_int_page))
-		return;
-
-	for_each_set_bit(relid, recv_int_page, maxbits) {
-		if (sync_test_and_clear_bit(relid, recv_int_page)) {
-			/* Special case - vmbus channel protocol msg */
-			if (relid != 0)
-				process_chn_event(relid);
-		}
-	}
-}
-
 /*
  * vmbus_post_msg - Send a msg on the vmbus's message connection
  */
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index 9fc2355a60ea..665a64f1611e 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -156,8 +156,6 @@ int hv_synic_alloc(void)
 			= per_cpu_ptr(hv_context.cpu_context, cpu);
 
 		memset(hv_cpu, 0, sizeof(*hv_cpu));
-		tasklet_init(&hv_cpu->event_dpc,
-			     vmbus_on_event, (unsigned long) hv_cpu);
 		tasklet_init(&hv_cpu->msg_dpc,
 			     vmbus_on_msg_dpc, (unsigned long) hv_cpu);
 
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index c8ce9ab2e16a..558a798c407c 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -206,7 +206,6 @@ struct hv_per_cpu_context {
 	 * we will manage the tasklet that handles events messages on a per CPU
 	 * basis.
 	 */
-	struct tasklet_struct event_dpc;
 	struct tasklet_struct msg_dpc;
 
 	/*
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index cf8540c1df4a..eaf1a10b0245 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -885,6 +885,56 @@ msg_handled:
 	vmbus_signal_eom(msg, message_type);
 }
 
+
+/*
+ * Schedule all channels with events pending
+ */
+static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu)
+{
+	unsigned long *recv_int_page;
+	u32 maxbits, relid;
+
+	if (vmbus_proto_version < VERSION_WIN8) {
+		maxbits = MAX_NUM_CHANNELS_SUPPORTED;
+		recv_int_page = vmbus_connection.recv_int_page;
+	} else {
+		/*
+		 * When the host is win8 and beyond, the event page
+		 * can be directly checked to get the id of the channel
+		 * that has the interrupt pending.
+		 */
+		void *page_addr = hv_cpu->synic_event_page;
+		union hv_synic_event_flags *event
+			= (union hv_synic_event_flags *)page_addr +
+						 VMBUS_MESSAGE_SINT;
+
+		maxbits = HV_EVENT_FLAGS_COUNT;
+		recv_int_page = event->flags;
+	}
+
+	if (unlikely(!recv_int_page))
+		return;
+
+	for_each_set_bit(relid, recv_int_page, maxbits) {
+		struct vmbus_channel *channel;
+
+		if (!sync_test_and_clear_bit(relid, recv_int_page))
+			continue;
+
+		/* Special case - vmbus channel protocol msg */
+		if (relid == 0)
+			continue;
+
+		/* Find channel based on relid */
+		list_for_each_entry(channel, &hv_cpu->chan_list, percpu_list) {
+			if (channel->offermsg.child_relid == relid) {
+				tasklet_schedule(&channel->callback_event);
+				break;
+			}
+		}
+	}
+}
+
 static void vmbus_isr(void)
 {
 	struct hv_per_cpu_context *hv_cpu
@@ -922,8 +972,7 @@ static void vmbus_isr(void)
 	}
 
 	if (handled)
-		tasklet_schedule(&hv_cpu->event_dpc);
-
+		vmbus_chan_sched(hv_cpu);
 
 	page_addr = hv_cpu->synic_message_page;
 	msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT;
@@ -1536,12 +1585,7 @@ static void __exit vmbus_exit(void)
 						 &hyperv_panic_block);
 	}
 	bus_unregister(&hv_bus);
-	for_each_online_cpu(cpu) {
-		struct hv_per_cpu_context *hv_cpu
-			= per_cpu_ptr(hv_context.cpu_context, cpu);
 
-		tasklet_kill(&hv_cpu->event_dpc);
-	}
 	cpuhp_remove_state(hyperv_cpuhp_online);
 	hv_synic_free();
 	acpi_bus_unregister_driver(&vmbus_acpi_driver);
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index c9b6d533958f..69afc9337c0d 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -35,7 +35,7 @@
 #include <linux/completion.h>
 #include <linux/device.h>
 #include <linux/mod_devicetable.h>
-
+#include <linux/interrupt.h>
 
 #define MAX_PAGE_BUFFER_COUNT				32
 #define MAX_MULTIPAGE_BUFFER_COUNT			32 /* 128K */
@@ -743,6 +743,7 @@ struct vmbus_channel {
 	struct vmbus_close_msg close_msg;
 
 	/* Channel callback's invoked in softirq context */
+	struct tasklet_struct callback_event;
 	void (*onchannel_callback)(void *context);
 	void *channel_callback_context;
 
-- 
cgit v1.2.3


From b71e328297a3a578c482fb4814e737a0ec185839 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Sat, 11 Feb 2017 23:02:21 -0700
Subject: vmbus: add direct isr callback mode

Change the simple boolean batched_reading into a tri-value.
For future NAPI support in netvsc driver, the callback needs to
occur directly in interrupt handler.

Batched mode is also changed to disable host interrupts immediately
in interrupt routine (to avoid unnecessary host signals), and the
tasklet is rescheduled if more data is detected.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/channel_mgmt.c    |  7 -------
 drivers/hv/connection.c      | 27 ++++++++++++---------------
 drivers/hv/hv_util.c         |  3 +--
 drivers/hv/vmbus_drv.c       | 26 ++++++++++++++++++++++++--
 drivers/uio/uio_hv_generic.c |  2 +-
 include/linux/hyperv.h       | 31 +++++++++++++++++--------------
 6 files changed, 55 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 2f6270d76b79..b2bb5aafaa2f 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -819,13 +819,6 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
 		return;
 	}
 
-	/*
-	 * By default we setup state to enable batched
-	 * reading. A specific service can choose to
-	 * disable this prior to opening the channel.
-	 */
-	newchannel->batched_reading = true;
-
 	/*
 	 * Setup state for signalling the host.
 	 */
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 27e72dc07e12..a8366fec1458 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -300,9 +300,7 @@ struct vmbus_channel *relid2channel(u32 relid)
 void vmbus_on_event(unsigned long data)
 {
 	struct vmbus_channel *channel = (void *) data;
-	void *arg;
-	bool read_state;
-	u32 bytes_to_read;
+	void (*callback_fn)(void *);
 
 	/*
 	 * A channel once created is persistent even when there
@@ -312,9 +310,13 @@ void vmbus_on_event(unsigned long data)
 	 * Thus, checking and invoking the driver specific callback takes
 	 * care of orderly unloading of the driver.
 	 */
-	if (channel->onchannel_callback != NULL) {
-		arg = channel->channel_callback_context;
-		read_state = channel->batched_reading;
+	callback_fn = READ_ONCE(channel->onchannel_callback);
+	if (unlikely(callback_fn == NULL))
+		return;
+
+	(*callback_fn)(channel->channel_callback_context);
+
+	if (channel->callback_mode == HV_CALL_BATCHED) {
 		/*
 		 * This callback reads the messages sent by the host.
 		 * We can optimize host to guest signaling by ensuring:
@@ -326,16 +328,11 @@ void vmbus_on_event(unsigned long data)
 		 *    state is set we check to see if additional packets are
 		 *    available to read. In this case we repeat the process.
 		 */
+		if (hv_end_read(&channel->inbound) != 0) {
+			hv_begin_read(&channel->inbound);
 
-		do {
-			if (read_state)
-				hv_begin_read(&channel->inbound);
-			channel->onchannel_callback(arg);
-			if (read_state)
-				bytes_to_read = hv_end_read(&channel->inbound);
-			else
-				bytes_to_read = 0;
-		} while (read_state && (bytes_to_read != 0));
+			tasklet_schedule(&channel->callback_event);
+		}
 	}
 }
 
diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c
index 098cd3dc7db2..3042eaa13062 100644
--- a/drivers/hv/hv_util.c
+++ b/drivers/hv/hv_util.c
@@ -435,8 +435,7 @@ static int util_probe(struct hv_device *dev,
 	 * Turn off batched reading for all util drivers before we open the
 	 * channel.
 	 */
-
-	set_channel_read_state(dev->channel, false);
+	set_channel_read_mode(dev->channel, HV_CALL_DIRECT);
 
 	hv_set_drvdata(dev, srv);
 
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index eaf1a10b0245..f7f6b9144b07 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -886,6 +886,18 @@ msg_handled:
 }
 
 
+/*
+ * Direct callback for channels using other deferred processing
+ */
+static void vmbus_channel_isr(struct vmbus_channel *channel)
+{
+	void (*callback_fn)(void *);
+
+	callback_fn = READ_ONCE(channel->onchannel_callback);
+	if (likely(callback_fn != NULL))
+		(*callback_fn)(channel->channel_callback_context);
+}
+
 /*
  * Schedule all channels with events pending
  */
@@ -927,9 +939,19 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu)
 
 		/* Find channel based on relid */
 		list_for_each_entry(channel, &hv_cpu->chan_list, percpu_list) {
-			if (channel->offermsg.child_relid == relid) {
-				tasklet_schedule(&channel->callback_event);
+			if (channel->offermsg.child_relid != relid)
+				continue;
+
+			switch (channel->callback_mode) {
+			case HV_CALL_ISR:
+				vmbus_channel_isr(channel);
 				break;
+
+			case HV_CALL_BATCHED:
+				hv_begin_read(&channel->inbound);
+				/* fallthrough */
+			case HV_CALL_DIRECT:
+				tasklet_schedule(&channel->callback_event);
 			}
 		}
 	}
diff --git a/drivers/uio/uio_hv_generic.c b/drivers/uio/uio_hv_generic.c
index 50958f167305..48d5327d38d4 100644
--- a/drivers/uio/uio_hv_generic.c
+++ b/drivers/uio/uio_hv_generic.c
@@ -125,7 +125,7 @@ hv_uio_probe(struct hv_device *dev,
 		goto fail;
 
 	dev->channel->inbound.ring_buffer->interrupt_mask = 1;
-	dev->channel->batched_reading = false;
+	set_channel_read_mode(dev->channel, HV_CALL_DIRECT);
 
 	/* Fill general uio info */
 	pdata->info.name = "uio_hv_generic";
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 69afc9337c0d..e5aac5c051f7 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -748,19 +748,21 @@ struct vmbus_channel {
 	void *channel_callback_context;
 
 	/*
-	 * A channel can be marked for efficient (batched)
-	 * reading:
-	 * If batched_reading is set to "true", we read until the
-	 * channel is empty and hold off interrupts from the host
-	 * during the entire read process.
-	 * If batched_reading is set to "false", the client is not
-	 * going to perform batched reading.
-	 *
-	 * By default we will enable batched reading; specific
-	 * drivers that don't want this behavior can turn it off.
+	 * A channel can be marked for one of three modes of reading:
+	 *   BATCHED - callback called from taslket and should read
+	 *            channel until empty. Interrupts from the host
+	 *            are masked while read is in process (default).
+	 *   DIRECT - callback called from tasklet (softirq).
+	 *   ISR - callback called in interrupt context and must
+	 *         invoke its own deferred processing.
+	 *         Host interrupts are disabled and must be re-enabled
+	 *         when ring is empty.
 	 */
-
-	bool batched_reading;
+	enum hv_callback_mode {
+		HV_CALL_BATCHED,
+		HV_CALL_DIRECT,
+		HV_CALL_ISR
+	} callback_mode;
 
 	bool is_dedicated_interrupt;
 	struct hv_input_signal_event_buffer sig_buf;
@@ -910,9 +912,10 @@ static inline void set_channel_affinity_state(struct vmbus_channel *c,
 	c->affinity_policy = policy;
 }
 
-static inline void set_channel_read_state(struct vmbus_channel *c, bool state)
+static inline void set_channel_read_mode(struct vmbus_channel *c,
+					enum hv_callback_mode mode)
 {
-	c->batched_reading = state;
+	c->callback_mode = mode;
 }
 
 static inline void set_per_channel_state(struct vmbus_channel *c, void *s)
-- 
cgit v1.2.3


From 5529eaf6e79a61e0ca7ade257f31d2ababc7f6c9 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Sat, 11 Feb 2017 23:02:22 -0700
Subject: vmbus: remove conditional locking of vmbus_write

All current usage of vmbus write uses the acquire_lock flag, therefore
having it be optional is unnecessary. This also fixes a sparse warning
since sparse doesn't like when a function has conditional locking.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/channel.c      | 13 ++++---------
 drivers/hv/channel_mgmt.c |  1 -
 drivers/hv/hyperv_vmbus.h |  3 +--
 drivers/hv/ring_buffer.c  | 11 ++++-------
 include/linux/hyperv.h    | 15 ---------------
 5 files changed, 9 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index 18cc1c78260d..81a80c82f1bd 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -651,7 +651,6 @@ int vmbus_sendpacket_ctl(struct vmbus_channel *channel, void *buffer,
 	u32 packetlen_aligned = ALIGN(packetlen, sizeof(u64));
 	struct kvec bufferlist[3];
 	u64 aligned_data = 0;
-	bool lock = channel->acquire_ring_lock;
 	int num_vecs = ((bufferlen != 0) ? 3 : 1);
 
 
@@ -670,7 +669,7 @@ int vmbus_sendpacket_ctl(struct vmbus_channel *channel, void *buffer,
 	bufferlist[2].iov_base = &aligned_data;
 	bufferlist[2].iov_len = (packetlen_aligned - packetlen);
 
-	return hv_ringbuffer_write(channel, bufferlist, num_vecs, lock);
+	return hv_ringbuffer_write(channel, bufferlist, num_vecs);
 }
 EXPORT_SYMBOL(vmbus_sendpacket_ctl);
 
@@ -716,12 +715,10 @@ int vmbus_sendpacket_pagebuffer_ctl(struct vmbus_channel *channel,
 	u32 packetlen_aligned;
 	struct kvec bufferlist[3];
 	u64 aligned_data = 0;
-	bool lock = channel->acquire_ring_lock;
 
 	if (pagecount > MAX_PAGE_BUFFER_COUNT)
 		return -EINVAL;
 
-
 	/*
 	 * Adjust the size down since vmbus_channel_packet_page_buffer is the
 	 * largest size we support
@@ -753,7 +750,7 @@ int vmbus_sendpacket_pagebuffer_ctl(struct vmbus_channel *channel,
 	bufferlist[2].iov_base = &aligned_data;
 	bufferlist[2].iov_len = (packetlen_aligned - packetlen);
 
-	return hv_ringbuffer_write(channel, bufferlist, 3, lock);
+	return hv_ringbuffer_write(channel, bufferlist, 3);
 }
 EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer_ctl);
 
@@ -789,7 +786,6 @@ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel,
 	u32 packetlen_aligned;
 	struct kvec bufferlist[3];
 	u64 aligned_data = 0;
-	bool lock = channel->acquire_ring_lock;
 
 	packetlen = desc_size + bufferlen;
 	packetlen_aligned = ALIGN(packetlen, sizeof(u64));
@@ -809,7 +805,7 @@ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel,
 	bufferlist[2].iov_base = &aligned_data;
 	bufferlist[2].iov_len = (packetlen_aligned - packetlen);
 
-	return hv_ringbuffer_write(channel, bufferlist, 3, lock);
+	return hv_ringbuffer_write(channel, bufferlist, 3);
 }
 EXPORT_SYMBOL_GPL(vmbus_sendpacket_mpb_desc);
 
@@ -827,7 +823,6 @@ int vmbus_sendpacket_multipagebuffer(struct vmbus_channel *channel,
 	u32 packetlen_aligned;
 	struct kvec bufferlist[3];
 	u64 aligned_data = 0;
-	bool lock = channel->acquire_ring_lock;
 	u32 pfncount = NUM_PAGES_SPANNED(multi_pagebuffer->offset,
 					 multi_pagebuffer->len);
 
@@ -866,7 +861,7 @@ int vmbus_sendpacket_multipagebuffer(struct vmbus_channel *channel,
 	bufferlist[2].iov_base = &aligned_data;
 	bufferlist[2].iov_len = (packetlen_aligned - packetlen);
 
-	return hv_ringbuffer_write(channel, bufferlist, 3, lock);
+	return hv_ringbuffer_write(channel, bufferlist, 3);
 }
 EXPORT_SYMBOL_GPL(vmbus_sendpacket_multipagebuffer);
 
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index b2bb5aafaa2f..f33465d78a02 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -332,7 +332,6 @@ static struct vmbus_channel *alloc_channel(void)
 	if (!channel)
 		return NULL;
 
-	channel->acquire_ring_lock = true;
 	spin_lock_init(&channel->inbound_lock);
 	spin_lock_init(&channel->lock);
 
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 558a798c407c..6a9b54677218 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -283,8 +283,7 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
 void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info);
 
 int hv_ringbuffer_write(struct vmbus_channel *channel,
-			struct kvec *kv_list,
-			u32 kv_count, bool lock);
+			struct kvec *kv_list, u32 kv_count);
 
 int hv_ringbuffer_read(struct vmbus_channel *channel,
 		       void *buffer, u32 buflen, u32 *buffer_actual_len,
diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
index 1b70e034ef92..1a1e70a45146 100644
--- a/drivers/hv/ring_buffer.c
+++ b/drivers/hv/ring_buffer.c
@@ -284,7 +284,7 @@ void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info)
 
 /* Write to the ring buffer. */
 int hv_ringbuffer_write(struct vmbus_channel *channel,
-			struct kvec *kv_list, u32 kv_count, bool lock)
+			struct kvec *kv_list, u32 kv_count)
 {
 	int i = 0;
 	u32 bytes_avail_towrite;
@@ -304,8 +304,7 @@ int hv_ringbuffer_write(struct vmbus_channel *channel,
 
 	totalbytes_towrite += sizeof(u64);
 
-	if (lock)
-		spin_lock_irqsave(&outring_info->ring_lock, flags);
+	spin_lock_irqsave(&outring_info->ring_lock, flags);
 
 	bytes_avail_towrite = hv_get_bytes_to_write(outring_info);
 
@@ -315,8 +314,7 @@ int hv_ringbuffer_write(struct vmbus_channel *channel,
 	 * is empty since the read index == write index.
 	 */
 	if (bytes_avail_towrite <= totalbytes_towrite) {
-		if (lock)
-			spin_unlock_irqrestore(&outring_info->ring_lock, flags);
+		spin_unlock_irqrestore(&outring_info->ring_lock, flags);
 		return -EAGAIN;
 	}
 
@@ -347,8 +345,7 @@ int hv_ringbuffer_write(struct vmbus_channel *channel,
 	hv_set_next_write_location(outring_info, next_write_location);
 
 
-	if (lock)
-		spin_unlock_irqrestore(&outring_info->ring_lock, flags);
+	spin_unlock_irqrestore(&outring_info->ring_lock, flags);
 
 	hv_signal_on_write(old_write, channel);
 
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index e5aac5c051f7..466374dbc98f 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -845,16 +845,6 @@ struct vmbus_channel {
 	 * link up channels based on their CPU affinity.
 	 */
 	struct list_head percpu_list;
-	/*
-	 * On the channel send side, many of the VMBUS
-	 * device drivers explicity serialize access to the
-	 * outgoing ring buffer. Give more control to the
-	 * VMBUS device drivers in terms how to serialize
-	 * accesss to the outgoing ring buffer.
-	 * The default behavior will be to aquire the
-	 * ring lock to preserve the current behavior.
-	 */
-	bool acquire_ring_lock;
 	/*
 	 * For performance critical channels (storage, networking
 	 * etc,), Hyper-V has a mechanism to enhance the throughput
@@ -895,11 +885,6 @@ struct vmbus_channel {
 
 };
 
-static inline void set_channel_lock_state(struct vmbus_channel *c, bool state)
-{
-	c->acquire_ring_lock = state;
-}
-
 static inline bool is_hvsock_channel(const struct vmbus_channel *c)
 {
 	return !!(c->offermsg.offer.chn_flags &
-- 
cgit v1.2.3


From 6e47dd3e2938f41d75045bbcb64aa9df3a463b2a Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Sat, 11 Feb 2017 23:02:23 -0700
Subject: vmbus: expose hv_begin/end_read

In order to implement NAPI in netvsc, the driver needs access to
control host interrupt mask.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/hyperv_vmbus.h |  4 ----
 drivers/hv/ring_buffer.c  | 20 --------------------
 include/linux/hyperv.h    | 30 ++++++++++++++++++++++++++++++
 3 files changed, 30 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 6a9b54677218..e15a130de3c9 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -292,10 +292,6 @@ int hv_ringbuffer_read(struct vmbus_channel *channel,
 void hv_ringbuffer_get_debuginfo(struct hv_ring_buffer_info *ring_info,
 			    struct hv_ring_buffer_debug_info *debug_info);
 
-void hv_begin_read(struct hv_ring_buffer_info *rbi);
-
-u32 hv_end_read(struct hv_ring_buffer_info *rbi);
-
 /*
  * Maximum channels is determined by the size of the interrupt page
  * which is PAGE_SIZE. 1/2 of PAGE_SIZE is for send endpoint interrupt
diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
index 1a1e70a45146..75c9eefd55b5 100644
--- a/drivers/hv/ring_buffer.c
+++ b/drivers/hv/ring_buffer.c
@@ -32,26 +32,6 @@
 
 #include "hyperv_vmbus.h"
 
-void hv_begin_read(struct hv_ring_buffer_info *rbi)
-{
-	rbi->ring_buffer->interrupt_mask = 1;
-	virt_mb();
-}
-
-u32 hv_end_read(struct hv_ring_buffer_info *rbi)
-{
-
-	rbi->ring_buffer->interrupt_mask = 0;
-	virt_mb();
-
-	/*
-	 * Now check to see if the ring buffer is still empty.
-	 * If it is not, we raced and we need to process new
-	 * incoming messages.
-	 */
-	return hv_get_bytes_to_read(rbi);
-}
-
 /*
  * When we write to the ring buffer, check if the host needs to
  * be signaled. Here is the details of this protocol:
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 466374dbc98f..08eb71a22c14 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1512,6 +1512,36 @@ init_cached_read_index(struct vmbus_channel *channel)
 	rbi->cached_read_index = rbi->ring_buffer->read_index;
 }
 
+/*
+ * Mask off host interrupt callback notifications
+ */
+static inline void hv_begin_read(struct hv_ring_buffer_info *rbi)
+{
+	rbi->ring_buffer->interrupt_mask = 1;
+
+	/* make sure mask update is not reordered */
+	virt_mb();
+}
+
+/*
+ * Re-enable host callback and return number of outstanding bytes
+ */
+static inline u32 hv_end_read(struct hv_ring_buffer_info *rbi)
+{
+
+	rbi->ring_buffer->interrupt_mask = 0;
+
+	/* make sure mask update is not reordered */
+	virt_mb();
+
+	/*
+	 * Now check to see if the ring buffer is still empty.
+	 * If it is not, we raced and we need to process new
+	 * incoming messages.
+	 */
+	return hv_get_bytes_to_read(rbi);
+}
+
 /*
  * An API to support in-place processing of incoming VMBUS packets.
  */
-- 
cgit v1.2.3


From e4165a0fad0963bf8b4a59f54d3360ccb6a6d1ea Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Sat, 11 Feb 2017 23:02:24 -0700
Subject: vmbus: constify parameters where possible

Functions that just query state of ring buffer can have parameters
marked const.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/hyperv_vmbus.h |  6 +++---
 drivers/hv/ring_buffer.c  | 22 ++++++++++------------
 include/linux/hyperv.h    | 12 ++++++------
 3 files changed, 19 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index e15a130de3c9..884f83bba1ab 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -283,14 +283,14 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
 void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info);
 
 int hv_ringbuffer_write(struct vmbus_channel *channel,
-			struct kvec *kv_list, u32 kv_count);
+			const struct kvec *kv_list, u32 kv_count);
 
 int hv_ringbuffer_read(struct vmbus_channel *channel,
 		       void *buffer, u32 buflen, u32 *buffer_actual_len,
 		       u64 *requestid, bool raw);
 
-void hv_ringbuffer_get_debuginfo(struct hv_ring_buffer_info *ring_info,
-			    struct hv_ring_buffer_debug_info *debug_info);
+void hv_ringbuffer_get_debuginfo(const struct hv_ring_buffer_info *ring_info,
+				 struct hv_ring_buffer_debug_info *debug_info);
 
 /*
  * Maximum channels is determined by the size of the interrupt page
diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
index 75c9eefd55b5..490e9ea098e6 100644
--- a/drivers/hv/ring_buffer.c
+++ b/drivers/hv/ring_buffer.c
@@ -96,11 +96,9 @@ hv_set_next_write_location(struct hv_ring_buffer_info *ring_info,
 
 /* Get the next read location for the specified ring buffer. */
 static inline u32
-hv_get_next_read_location(struct hv_ring_buffer_info *ring_info)
+hv_get_next_read_location(const struct hv_ring_buffer_info *ring_info)
 {
-	u32 next = ring_info->ring_buffer->read_index;
-
-	return next;
+	return ring_info->ring_buffer->read_index;
 }
 
 /*
@@ -108,8 +106,8 @@ hv_get_next_read_location(struct hv_ring_buffer_info *ring_info)
  * This allows the caller to skip.
  */
 static inline u32
-hv_get_next_readlocation_withoffset(struct hv_ring_buffer_info *ring_info,
-				 u32 offset)
+hv_get_next_readlocation_withoffset(const struct hv_ring_buffer_info *ring_info,
+				    u32 offset)
 {
 	u32 next = ring_info->ring_buffer->read_index;
 
@@ -130,7 +128,7 @@ hv_set_next_read_location(struct hv_ring_buffer_info *ring_info,
 
 /* Get the size of the ring buffer. */
 static inline u32
-hv_get_ring_buffersize(struct hv_ring_buffer_info *ring_info)
+hv_get_ring_buffersize(const struct hv_ring_buffer_info *ring_info)
 {
 	return ring_info->ring_datasize;
 }
@@ -147,7 +145,7 @@ hv_get_ring_bufferindices(struct hv_ring_buffer_info *ring_info)
  * Assume there is enough room. Handles wrap-around in src case only!!
  */
 static u32 hv_copyfrom_ringbuffer(
-	struct hv_ring_buffer_info	*ring_info,
+	const struct hv_ring_buffer_info *ring_info,
 	void				*dest,
 	u32				destlen,
 	u32				start_read_offset)
@@ -171,7 +169,7 @@ static u32 hv_copyfrom_ringbuffer(
 static u32 hv_copyto_ringbuffer(
 	struct hv_ring_buffer_info	*ring_info,
 	u32				start_write_offset,
-	void				*src,
+	const void			*src,
 	u32				srclen)
 {
 	void *ring_buffer = hv_get_ring_buffer(ring_info);
@@ -186,8 +184,8 @@ static u32 hv_copyto_ringbuffer(
 }
 
 /* Get various debug metrics for the specified ring buffer. */
-void hv_ringbuffer_get_debuginfo(struct hv_ring_buffer_info *ring_info,
-			    struct hv_ring_buffer_debug_info *debug_info)
+void hv_ringbuffer_get_debuginfo(const struct hv_ring_buffer_info *ring_info,
+				 struct hv_ring_buffer_debug_info *debug_info)
 {
 	u32 bytes_avail_towrite;
 	u32 bytes_avail_toread;
@@ -264,7 +262,7 @@ void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info)
 
 /* Write to the ring buffer. */
 int hv_ringbuffer_write(struct vmbus_channel *channel,
-			struct kvec *kv_list, u32 kv_count)
+			const struct kvec *kv_list, u32 kv_count)
 {
 	int i = 0;
 	u32 bytes_avail_towrite;
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 08eb71a22c14..62bbf3c1aa4a 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -138,8 +138,8 @@ struct hv_ring_buffer_info {
  * for the specified ring buffer
  */
 static inline void
-hv_get_ringbuffer_availbytes(struct hv_ring_buffer_info *rbi,
-			  u32 *read, u32 *write)
+hv_get_ringbuffer_availbytes(const struct hv_ring_buffer_info *rbi,
+			     u32 *read, u32 *write)
 {
 	u32 read_loc, write_loc, dsize;
 
@@ -153,7 +153,7 @@ hv_get_ringbuffer_availbytes(struct hv_ring_buffer_info *rbi,
 	*read = dsize - *write;
 }
 
-static inline u32 hv_get_bytes_to_read(struct hv_ring_buffer_info *rbi)
+static inline u32 hv_get_bytes_to_read(const struct hv_ring_buffer_info *rbi)
 {
 	u32 read_loc, write_loc, dsize, read;
 
@@ -167,7 +167,7 @@ static inline u32 hv_get_bytes_to_read(struct hv_ring_buffer_info *rbi)
 	return read;
 }
 
-static inline u32 hv_get_bytes_to_write(struct hv_ring_buffer_info *rbi)
+static inline u32 hv_get_bytes_to_write(const struct hv_ring_buffer_info *rbi)
 {
 	u32 read_loc, write_loc, dsize, write;
 
@@ -1448,9 +1448,9 @@ void vmbus_set_event(struct vmbus_channel *channel);
 
 /* Get the start of the ring buffer. */
 static inline void *
-hv_get_ring_buffer(struct hv_ring_buffer_info *ring_info)
+hv_get_ring_buffer(const struct hv_ring_buffer_info *ring_info)
 {
-	return (void *)ring_info->ring_buffer->buffer;
+	return ring_info->ring_buffer->buffer;
 }
 
 /*
-- 
cgit v1.2.3


From e225c20eb0fd0b6657e640408f11ee392dc82b5b Mon Sep 17 00:00:00 2001
From: Scott Bauer <scott.bauer@intel.com>
Date: Tue, 14 Feb 2017 17:29:36 -0700
Subject: Move stack parameters for sed_ioctl to prevent oversized stack with
 CONFIG_KASAN

When CONFIG_KASAN is enabled, compilation fails:

block/sed-opal.c: In function 'sed_ioctl':
block/sed-opal.c:2447:1: error: the frame size of 2256 bytes is larger than 2048 bytes [-Werror=frame-larger-than=]

Moved all the ioctl structures off the stack and dynamically allocate
using _IOC_SIZE()

Fixes: 455a7b238cd6 ("block: Add Sed-opal library")

Reported-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Scott Bauer <scott.bauer@intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/sed-opal.c         | 133 ++++++++++++++++-------------------------------
 drivers/nvme/host/core.c |   3 +-
 include/linux/sed-opal.h |   4 +-
 3 files changed, 50 insertions(+), 90 deletions(-)

(limited to 'include/linux')

diff --git a/block/sed-opal.c b/block/sed-opal.c
index bf1406e5159b..e95b8a57053d 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -2344,9 +2344,10 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
 }
 EXPORT_SYMBOL(opal_unlock_from_suspend);
 
-int sed_ioctl(struct opal_dev *dev, unsigned int cmd, unsigned long ptr)
+int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
 {
-	void __user *arg = (void __user *)ptr;
+	void *p;
+	int ret = -ENOTTY;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
@@ -2355,94 +2356,52 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, unsigned long ptr)
 		return -ENOTSUPP;
 	}
 
-	switch (cmd) {
-	case IOC_OPAL_SAVE: {
-		struct opal_lock_unlock lk_unlk;
-
-		if (copy_from_user(&lk_unlk, arg, sizeof(lk_unlk)))
-			return -EFAULT;
-		return opal_save(dev, &lk_unlk);
-	}
-	case IOC_OPAL_LOCK_UNLOCK: {
-		struct opal_lock_unlock lk_unlk;
-
-		if (copy_from_user(&lk_unlk, arg, sizeof(lk_unlk)))
-			return -EFAULT;
-		return opal_lock_unlock(dev, &lk_unlk);
-	}
-	case IOC_OPAL_TAKE_OWNERSHIP: {
-		struct opal_key opal_key;
-
-		if (copy_from_user(&opal_key, arg, sizeof(opal_key)))
-			return -EFAULT;
-		return opal_take_ownership(dev, &opal_key);
-	}
-	case IOC_OPAL_ACTIVATE_LSP: {
-		struct opal_lr_act opal_lr_act;
-
-		if (copy_from_user(&opal_lr_act, arg, sizeof(opal_lr_act)))
-			return -EFAULT;
-		return opal_activate_lsp(dev, &opal_lr_act);
-	}
-	case IOC_OPAL_SET_PW: {
-		struct opal_new_pw opal_pw;
-
-		if (copy_from_user(&opal_pw, arg, sizeof(opal_pw)))
-			return -EFAULT;
-		return opal_set_new_pw(dev, &opal_pw);
-	}
-	case IOC_OPAL_ACTIVATE_USR: {
-		struct opal_session_info session;
-
-		if (copy_from_user(&session, arg, sizeof(session)))
-			return -EFAULT;
-		return opal_activate_user(dev, &session);
-	}
-	case IOC_OPAL_REVERT_TPR: {
-		struct opal_key opal_key;
-
-		if (copy_from_user(&opal_key, arg, sizeof(opal_key)))
-			return -EFAULT;
-		return opal_reverttper(dev, &opal_key);
-	}
-	case IOC_OPAL_LR_SETUP: {
-		struct opal_user_lr_setup lrs;
+	p = memdup_user(arg,  _IOC_SIZE(cmd));
+	if (IS_ERR(p))
+		return PTR_ERR(p);
 
-		if (copy_from_user(&lrs, arg, sizeof(lrs)))
-			return -EFAULT;
-		return opal_setup_locking_range(dev, &lrs);
-	}
-	case IOC_OPAL_ADD_USR_TO_LR: {
-		struct opal_lock_unlock lk_unlk;
-
-		if (copy_from_user(&lk_unlk, arg, sizeof(lk_unlk)))
-			return -EFAULT;
-		return opal_add_user_to_lr(dev, &lk_unlk);
-	}
-	case IOC_OPAL_ENABLE_DISABLE_MBR: {
-		struct opal_mbr_data mbr;
-
-		if (copy_from_user(&mbr, arg, sizeof(mbr)))
-			return -EFAULT;
-		return opal_enable_disable_shadow_mbr(dev, &mbr);
-	}
-	case IOC_OPAL_ERASE_LR: {
-		struct opal_session_info session;
-
-		if (copy_from_user(&session, arg, sizeof(session)))
-			return -EFAULT;
-		return opal_erase_locking_range(dev, &session);
-	}
-	case IOC_OPAL_SECURE_ERASE_LR: {
-		struct opal_session_info session;
-
-		if (copy_from_user(&session, arg, sizeof(session)))
-			return -EFAULT;
-		return opal_secure_erase_locking_range(dev, &session);
-	}
+	switch (cmd) {
+	case IOC_OPAL_SAVE:
+		ret = opal_save(dev, p);
+		break;
+	case IOC_OPAL_LOCK_UNLOCK:
+		ret = opal_lock_unlock(dev, p);
+		break;
+	case IOC_OPAL_TAKE_OWNERSHIP:
+		ret = opal_take_ownership(dev, p);
+		break;
+	case IOC_OPAL_ACTIVATE_LSP:
+		ret = opal_activate_lsp(dev, p);
+		break;
+	case IOC_OPAL_SET_PW:
+		ret = opal_set_new_pw(dev, p);
+		break;
+	case IOC_OPAL_ACTIVATE_USR:
+		ret = opal_activate_user(dev, p);
+		break;
+	case IOC_OPAL_REVERT_TPR:
+		ret = opal_reverttper(dev, p);
+		break;
+	case IOC_OPAL_LR_SETUP:
+		ret = opal_setup_locking_range(dev, p);
+		break;
+	case IOC_OPAL_ADD_USR_TO_LR:
+		ret = opal_add_user_to_lr(dev, p);
+		break;
+	case IOC_OPAL_ENABLE_DISABLE_MBR:
+		ret = opal_enable_disable_shadow_mbr(dev, p);
+		break;
+	case IOC_OPAL_ERASE_LR:
+		ret = opal_erase_locking_range(dev, p);
+		break;
+	case IOC_OPAL_SECURE_ERASE_LR:
+		ret = opal_secure_erase_locking_range(dev, p);
+		break;
 	default:
 		pr_warn("No such Opal Ioctl %u\n", cmd);
 	}
-	return -ENOTTY;
+
+	kfree(p);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(sed_ioctl);
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 26ae4afd3737..b92a79281611 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -789,7 +789,8 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
 			return nvme_nvm_ioctl(ns, cmd, arg);
 #endif
 		if (is_sed_ioctl(cmd))
-			return sed_ioctl(&ns->ctrl->opal_dev, cmd, arg);
+			return sed_ioctl(&ns->ctrl->opal_dev, cmd,
+					 (void __user *) arg);
 		return -ENOTTY;
 	}
 }
diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h
index af1a85eae193..205d520ea688 100644
--- a/include/linux/sed-opal.h
+++ b/include/linux/sed-opal.h
@@ -132,7 +132,7 @@ struct opal_dev {
 #ifdef CONFIG_BLK_SED_OPAL
 bool opal_unlock_from_suspend(struct opal_dev *dev);
 void init_opal_dev(struct opal_dev *opal_dev, sec_send_recv *send_recv);
-int sed_ioctl(struct opal_dev *dev, unsigned int cmd, unsigned long ptr);
+int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *ioctl_ptr);
 
 static inline bool is_sed_ioctl(unsigned int cmd)
 {
@@ -160,7 +160,7 @@ static inline bool is_sed_ioctl(unsigned int cmd)
 }
 
 static inline int sed_ioctl(struct opal_dev *dev, unsigned int cmd,
-			    unsigned long ptr)
+			    void __user *ioctl_ptr)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From 5f114163f2f5eb2edbb49c4d3e0b405c7a8a7e2a Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Wed, 15 Feb 2017 09:39:39 +0100
Subject: net: Add a skb_gro_flush_final helper.

Add a skb_gro_flush_final helper to prepare for  consuming
skbs in call_gro_receive. We will extend this helper to not
touch the skb if the skb is consumed by a gro callback with
a followup patch. We need this to handle the upcomming IPsec
ESP callbacks as they reinject the skb to the napi_gro_receive
asynchronous. The handler is used in all gro_receive functions
that can call the ESP gro handlers.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/linux/netdevice.h | 5 +++++
 net/ethernet/eth.c        | 2 +-
 net/ipv4/af_inet.c        | 2 +-
 net/ipv6/ip6_offload.c    | 2 +-
 4 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 58afbd1cc659..f9da3acfee9a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2661,6 +2661,11 @@ static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb,
 	remcsum_unadjust((__sum16 *)ptr, grc->delta);
 }
 
+static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp, int flush)
+{
+	NAPI_GRO_CB(skb)->flush |= flush;
+}
+
 static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
 				  unsigned short type,
 				  const void *daddr, const void *saddr,
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index efdaaab735fc..c666ff0dd88b 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -474,7 +474,7 @@ struct sk_buff **eth_gro_receive(struct sk_buff **head,
 out_unlock:
 	rcu_read_unlock();
 out:
-	NAPI_GRO_CB(skb)->flush |= flush;
+	skb_gro_flush_final(skb, pp, flush);
 
 	return pp;
 }
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 685ba53df2d1..602d40f43687 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1423,7 +1423,7 @@ out_unlock:
 	rcu_read_unlock();
 
 out:
-	NAPI_GRO_CB(skb)->flush |= flush;
+	skb_gro_flush_final(skb, pp, flush);
 
 	return pp;
 }
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index fc7b4017ba24..0838e6d01d2e 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -253,7 +253,7 @@ out_unlock:
 	rcu_read_unlock();
 
 out:
-	NAPI_GRO_CB(skb)->flush |= flush;
+	skb_gro_flush_final(skb, pp, flush);
 
 	return pp;
 }
-- 
cgit v1.2.3


From 25393d3fc055b76587fcc91627aee8c345400c3a Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Wed, 15 Feb 2017 09:39:44 +0100
Subject: net: Prepare gro for packet consuming gro callbacks

The upcomming IPsec ESP gro callbacks will consume the skb,
so prepare for that.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/linux/netdevice.h | 9 +++++++++
 net/core/dev.c            | 7 +++++++
 net/xfrm/Kconfig          | 4 ++++
 3 files changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f9da3acfee9a..9e5d1cd9d975 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -352,6 +352,7 @@ enum gro_result {
 	GRO_HELD,
 	GRO_NORMAL,
 	GRO_DROP,
+	GRO_CONSUMED,
 };
 typedef enum gro_result gro_result_t;
 
@@ -2661,10 +2662,18 @@ static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb,
 	remcsum_unadjust((__sum16 *)ptr, grc->delta);
 }
 
+#ifdef CONFIG_XFRM_OFFLOAD
+static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp, int flush)
+{
+	if (PTR_ERR(pp) != -EINPROGRESS)
+		NAPI_GRO_CB(skb)->flush |= flush;
+}
+#else
 static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp, int flush)
 {
 	NAPI_GRO_CB(skb)->flush |= flush;
 }
+#endif
 
 static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
 				  unsigned short type,
diff --git a/net/core/dev.c b/net/core/dev.c
index 3e1a60102e64..64efbb9e4436 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4505,6 +4505,11 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 	if (&ptype->list == head)
 		goto normal;
 
+	if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) {
+		ret = GRO_CONSUMED;
+		goto ok;
+	}
+
 	same_flow = NAPI_GRO_CB(skb)->same_flow;
 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
 
@@ -4609,6 +4614,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 
 	case GRO_HELD:
 	case GRO_MERGED:
+	case GRO_CONSUMED:
 		break;
 	}
 
@@ -4680,6 +4686,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi,
 		break;
 
 	case GRO_MERGED:
+	case GRO_CONSUMED:
 		break;
 	}
 
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index bda1a13628a8..a484451c72ec 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -5,6 +5,10 @@ config XFRM
        bool
        depends on NET
 
+config XFRM_OFFLOAD
+       bool
+       depends on XFRM
+
 config XFRM_ALGO
 	tristate
 	select XFRM
-- 
cgit v1.2.3


From 884f38607897cb4a963ea8a65296f0973a2828d0 Mon Sep 17 00:00:00 2001
From: Shawn Lin <shawn.lin@rock-chips.com>
Date: Wed, 15 Feb 2017 16:35:29 +0800
Subject: mmc: core: move some sdio IDs out of quirks file

Consolidate all the sdio devices' IDs into sdio_ids.

Signed-off-by: Shawn Lin <shawn.lin@rock-chips.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/quirks.h    | 20 --------------------
 include/linux/mmc/sdio_ids.h |  7 +++++++
 2 files changed, 7 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/quirks.h b/drivers/mmc/core/quirks.h
index 1e56ec4d4085..074940497686 100644
--- a/drivers/mmc/core/quirks.h
+++ b/drivers/mmc/core/quirks.h
@@ -13,26 +13,6 @@
 
 #include "card.h"
 
-#ifndef SDIO_VENDOR_ID_TI
-#define SDIO_VENDOR_ID_TI		0x0097
-#endif
-
-#ifndef SDIO_DEVICE_ID_TI_WL1271
-#define SDIO_DEVICE_ID_TI_WL1271	0x4076
-#endif
-
-#ifndef SDIO_VENDOR_ID_STE
-#define SDIO_VENDOR_ID_STE		0x0020
-#endif
-
-#ifndef SDIO_DEVICE_ID_STE_CW1200
-#define SDIO_DEVICE_ID_STE_CW1200	0x2280
-#endif
-
-#ifndef SDIO_DEVICE_ID_MARVELL_8797_F0
-#define SDIO_DEVICE_ID_MARVELL_8797_F0	0x9128
-#endif
-
 static const struct mmc_fixup mmc_fixup_methods[] = {
 	SDIO_FIXUP(SDIO_VENDOR_ID_TI, SDIO_DEVICE_ID_TI_WL1271,
 		   add_quirk, MMC_QUIRK_NONSTD_FUNC_IF),
diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h
index d43ef96bf075..46794a7a531c 100644
--- a/include/linux/mmc/sdio_ids.h
+++ b/include/linux/mmc/sdio_ids.h
@@ -51,6 +51,7 @@
 #define SDIO_DEVICE_ID_MARVELL_LIBERTAS		0x9103
 #define SDIO_DEVICE_ID_MARVELL_8688WLAN		0x9104
 #define SDIO_DEVICE_ID_MARVELL_8688BT		0x9105
+#define SDIO_DEVICE_ID_MARVELL_8797_F0		0x9128
 
 #define SDIO_VENDOR_ID_SIANO			0x039a
 #define SDIO_DEVICE_ID_SIANO_NOVA_B0		0x0201
@@ -60,4 +61,10 @@
 #define SDIO_DEVICE_ID_SIANO_NOVA_A0		0x1100
 #define SDIO_DEVICE_ID_SIANO_STELLAR 		0x5347
 
+#define SDIO_VENDOR_ID_TI			0x0097
+#define SDIO_DEVICE_ID_TI_WL1271		0x4076
+
+#define SDIO_VENDOR_ID_STE			0x0020
+#define SDIO_DEVICE_ID_STE_CW1200		0x2280
+
 #endif /* LINUX_MMC_SDIO_IDS_H */
-- 
cgit v1.2.3


From 8a58a34ba479d42b3ef28f8ffd9e245a81a7786f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 10 Feb 2017 16:41:15 +0100
Subject: timers: Make flags output in the timer_start tracepoint useful

The timer flags in the timer_start trace event contain lots of useful
information, but the meaning is not clear in the trace output. Making tools
rely on the bit positions is bad as they might change over time.

Decode the flags in the print out. Tools can retrieve the bits and their
meaning from the trace format file.

Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1702101639290.4036@nanos

Requested-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/timer.h        |  2 ++
 include/trace/events/timer.h | 14 ++++++++++++--
 2 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 51d601f192d4..a17f915f9456 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -66,6 +66,8 @@ struct timer_list {
 #define TIMER_ARRAYSHIFT	22
 #define TIMER_ARRAYMASK		0xFFC00000
 
+#define TIMER_TRACE_FLAGMASK	(TIMER_MIGRATING | TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE)
+
 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \
 		.entry = { .next = TIMER_ENTRY_STATIC },	\
 		.function = (_function),			\
diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h
index 1448637616d6..f6d8fea6df77 100644
--- a/include/trace/events/timer.h
+++ b/include/trace/events/timer.h
@@ -36,6 +36,13 @@ DEFINE_EVENT(timer_class, timer_init,
 	TP_ARGS(timer)
 );
 
+#define decode_timer_flags(flags)			\
+	__print_flags(flags, "|",			\
+		{  TIMER_MIGRATING,	"M" },		\
+		{  TIMER_DEFERRABLE,	"D" },		\
+		{  TIMER_PINNED,	"P" },		\
+		{  TIMER_IRQSAFE,	"I" })
+
 /**
  * timer_start - called when the timer is started
  * @timer:	pointer to struct timer_list
@@ -65,9 +72,12 @@ TRACE_EVENT(timer_start,
 		__entry->flags		= flags;
 	),
 
-	TP_printk("timer=%p function=%pf expires=%lu [timeout=%ld] flags=0x%08x",
+	TP_printk("timer=%p function=%pf expires=%lu [timeout=%ld] cpu=%u idx=%u flags=%s",
 		  __entry->timer, __entry->function, __entry->expires,
-		  (long)__entry->expires - __entry->now, __entry->flags)
+		  (long)__entry->expires - __entry->now,
+		  __entry->flags & TIMER_CPUMASK,
+		  __entry->flags >> TIMER_ARRAYSHIFT,
+		  decode_timer_flags(__entry->flags & TIMER_TRACE_FLAGMASK))
 );
 
 /**
-- 
cgit v1.2.3


From 3821fd35b58dba449bd894014fbf4e1c43c9e951 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@akamai.com>
Date: Fri, 3 Feb 2017 15:42:24 -0500
Subject: jump_label: Reduce the size of struct static_key

The static_key->next field goes mostly unused. The field is used for
associating module uses with a static key. Most uses of struct static_key
define a static key in the core kernel and make use of it entirely within
the core kernel, or define the static key in a module and make use of it
only from within that module. In fact, of the ~3,000 static keys defined,
I found only about 5 or so that did not fit this pattern.

Thus, we can remove the static_key->next field entirely and overload
the static_key->entries field. That is, when all the static_key uses
are contained within the same module, static_key->entries continues
to point to those uses. However, if the static_key uses are not contained
within the module where the static_key is defined, then we allocate a
struct static_key_mod, store a pointer to the uses within that
struct static_key_mod, and have the static key point at the static_key_mod.
This does incur some extra memory usage when a static_key is used in a
module that does not define it, but since there are only a handful of such
cases there is a net savings.

In order to identify if the static_key->entries pointer contains a
struct static_key_mod or a struct jump_entry pointer, bit 1 of
static_key->entries is set to 1 if it points to a struct static_key_mod and
is 0 if it points to a struct jump_entry. We were already using bit 0 in a
similar way to store the initial value of the static_key. This does mean
that allocations of struct static_key_mod and that the struct jump_entry
tables need to be at least 4-byte aligned in memory. As far as I can tell
all arches meet this criteria.

For my .config, the patch increased the text by 778 bytes, but reduced
the data + bss size by 14912, for a net savings of 14,134 bytes.

   text	   data	    bss	    dec	    hex	filename
8092427	5016512	 790528	13899467	 d416cb	vmlinux.pre
8093205	5001600	 790528	13885333	 d3df95	vmlinux.post

Link: http://lkml.kernel.org/r/1486154544-4321-1-git-send-email-jbaron@akamai.com

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Joe Perches <joe@perches.com>
Signed-off-by: Jason Baron <jbaron@akamai.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 Documentation/static-keys.txt |   4 +-
 include/linux/jump_label.h    |  23 ++++---
 kernel/jump_label.c           | 153 +++++++++++++++++++++++++++++++++++-------
 3 files changed, 145 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/static-keys.txt b/Documentation/static-keys.txt
index ea8d7b4e53f0..32a25fad0c1b 100644
--- a/Documentation/static-keys.txt
+++ b/Documentation/static-keys.txt
@@ -155,7 +155,9 @@ or:
 
 There are a few functions and macros that architectures must implement in order
 to take advantage of this optimization. If there is no architecture support, we
-simply fall back to a traditional, load, test, and jump sequence.
+simply fall back to a traditional, load, test, and jump sequence. Also, the
+struct jump_entry table must be at least 4-byte aligned because the
+static_key->entry field makes use of the two least significant bits.
 
 * select HAVE_ARCH_JUMP_LABEL, see: arch/x86/Kconfig
 
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index a0547c571800..680c98b2f41c 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -89,11 +89,17 @@ extern bool static_key_initialized;
 
 struct static_key {
 	atomic_t enabled;
-/* Set lsb bit to 1 if branch is default true, 0 ot */
-	struct jump_entry *entries;
-#ifdef CONFIG_MODULES
-	struct static_key_mod *next;
-#endif
+/*
+ * bit 0 => 1 if key is initially true
+ *	    0 if initially false
+ * bit 1 => 1 if points to struct static_key_mod
+ *	    0 if points to struct jump_entry
+ */
+	union {
+		unsigned long type;
+		struct jump_entry *entries;
+		struct static_key_mod *next;
+	};
 };
 
 #else
@@ -118,9 +124,10 @@ struct module;
 
 #ifdef HAVE_JUMP_LABEL
 
-#define JUMP_TYPE_FALSE	0UL
-#define JUMP_TYPE_TRUE	1UL
-#define JUMP_TYPE_MASK	1UL
+#define JUMP_TYPE_FALSE		0UL
+#define JUMP_TYPE_TRUE		1UL
+#define JUMP_TYPE_LINKED	2UL
+#define JUMP_TYPE_MASK		3UL
 
 static __always_inline bool static_key_false(struct static_key *key)
 {
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 93ad6c1fb9b6..953411f5ba7f 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -229,12 +229,28 @@ void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry
 
 static inline struct jump_entry *static_key_entries(struct static_key *key)
 {
-	return (struct jump_entry *)((unsigned long)key->entries & ~JUMP_TYPE_MASK);
+	WARN_ON_ONCE(key->type & JUMP_TYPE_LINKED);
+	return (struct jump_entry *)(key->type & ~JUMP_TYPE_MASK);
 }
 
 static inline bool static_key_type(struct static_key *key)
 {
-	return (unsigned long)key->entries & JUMP_TYPE_MASK;
+	return key->type & JUMP_TYPE_TRUE;
+}
+
+static inline bool static_key_linked(struct static_key *key)
+{
+	return key->type & JUMP_TYPE_LINKED;
+}
+
+static inline void static_key_clear_linked(struct static_key *key)
+{
+	key->type &= ~JUMP_TYPE_LINKED;
+}
+
+static inline void static_key_set_linked(struct static_key *key)
+{
+	key->type |= JUMP_TYPE_LINKED;
 }
 
 static inline struct static_key *jump_entry_key(struct jump_entry *entry)
@@ -247,6 +263,26 @@ static bool jump_entry_branch(struct jump_entry *entry)
 	return (unsigned long)entry->key & 1UL;
 }
 
+/***
+ * A 'struct static_key' uses a union such that it either points directly
+ * to a table of 'struct jump_entry' or to a linked list of modules which in
+ * turn point to 'struct jump_entry' tables.
+ *
+ * The two lower bits of the pointer are used to keep track of which pointer
+ * type is in use and to store the initial branch direction, we use an access
+ * function which preserves these bits.
+ */
+static void static_key_set_entries(struct static_key *key,
+				   struct jump_entry *entries)
+{
+	unsigned long type;
+
+	WARN_ON_ONCE((unsigned long)entries & JUMP_TYPE_MASK);
+	type = key->type & JUMP_TYPE_MASK;
+	key->entries = entries;
+	key->type |= type;
+}
+
 static enum jump_label_type jump_label_type(struct jump_entry *entry)
 {
 	struct static_key *key = jump_entry_key(entry);
@@ -306,13 +342,7 @@ void __init jump_label_init(void)
 			continue;
 
 		key = iterk;
-		/*
-		 * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
-		 */
-		*((unsigned long *)&key->entries) += (unsigned long)iter;
-#ifdef CONFIG_MODULES
-		key->next = NULL;
-#endif
+		static_key_set_entries(key, iter);
 	}
 	static_key_initialized = true;
 	jump_label_unlock();
@@ -336,6 +366,29 @@ struct static_key_mod {
 	struct module *mod;
 };
 
+static inline struct static_key_mod *static_key_mod(struct static_key *key)
+{
+	WARN_ON_ONCE(!(key->type & JUMP_TYPE_LINKED));
+	return (struct static_key_mod *)(key->type & ~JUMP_TYPE_MASK);
+}
+
+/***
+ * key->type and key->next are the same via union.
+ * This sets key->next and preserves the type bits.
+ *
+ * See additional comments above static_key_set_entries().
+ */
+static void static_key_set_mod(struct static_key *key,
+			       struct static_key_mod *mod)
+{
+	unsigned long type;
+
+	WARN_ON_ONCE((unsigned long)mod & JUMP_TYPE_MASK);
+	type = key->type & JUMP_TYPE_MASK;
+	key->next = mod;
+	key->type |= type;
+}
+
 static int __jump_label_mod_text_reserved(void *start, void *end)
 {
 	struct module *mod;
@@ -358,11 +411,23 @@ static void __jump_label_mod_update(struct static_key *key)
 {
 	struct static_key_mod *mod;
 
-	for (mod = key->next; mod; mod = mod->next) {
-		struct module *m = mod->mod;
+	for (mod = static_key_mod(key); mod; mod = mod->next) {
+		struct jump_entry *stop;
+		struct module *m;
+
+		/*
+		 * NULL if the static_key is defined in a module
+		 * that does not use it
+		 */
+		if (!mod->entries)
+			continue;
 
-		__jump_label_update(key, mod->entries,
-				    m->jump_entries + m->num_jump_entries);
+		m = mod->mod;
+		if (!m)
+			stop = __stop___jump_table;
+		else
+			stop = m->jump_entries + m->num_jump_entries;
+		__jump_label_update(key, mod->entries, stop);
 	}
 }
 
@@ -397,7 +462,7 @@ static int jump_label_add_module(struct module *mod)
 	struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
 	struct jump_entry *iter;
 	struct static_key *key = NULL;
-	struct static_key_mod *jlm;
+	struct static_key_mod *jlm, *jlm2;
 
 	/* if the module doesn't have jump label entries, just return */
 	if (iter_start == iter_stop)
@@ -414,20 +479,32 @@ static int jump_label_add_module(struct module *mod)
 
 		key = iterk;
 		if (within_module(iter->key, mod)) {
-			/*
-			 * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
-			 */
-			*((unsigned long *)&key->entries) += (unsigned long)iter;
-			key->next = NULL;
+			static_key_set_entries(key, iter);
 			continue;
 		}
 		jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL);
 		if (!jlm)
 			return -ENOMEM;
+		if (!static_key_linked(key)) {
+			jlm2 = kzalloc(sizeof(struct static_key_mod),
+				       GFP_KERNEL);
+			if (!jlm2) {
+				kfree(jlm);
+				return -ENOMEM;
+			}
+			preempt_disable();
+			jlm2->mod = __module_address((unsigned long)key);
+			preempt_enable();
+			jlm2->entries = static_key_entries(key);
+			jlm2->next = NULL;
+			static_key_set_mod(key, jlm2);
+			static_key_set_linked(key);
+		}
 		jlm->mod = mod;
 		jlm->entries = iter;
-		jlm->next = key->next;
-		key->next = jlm;
+		jlm->next = static_key_mod(key);
+		static_key_set_mod(key, jlm);
+		static_key_set_linked(key);
 
 		/* Only update if we've changed from our initial state */
 		if (jump_label_type(iter) != jump_label_init_type(iter))
@@ -454,16 +531,34 @@ static void jump_label_del_module(struct module *mod)
 		if (within_module(iter->key, mod))
 			continue;
 
+		/* No memory during module load */
+		if (WARN_ON(!static_key_linked(key)))
+			continue;
+
 		prev = &key->next;
-		jlm = key->next;
+		jlm = static_key_mod(key);
 
 		while (jlm && jlm->mod != mod) {
 			prev = &jlm->next;
 			jlm = jlm->next;
 		}
 
-		if (jlm) {
+		/* No memory during module load */
+		if (WARN_ON(!jlm))
+			continue;
+
+		if (prev == &key->next)
+			static_key_set_mod(key, jlm->next);
+		else
 			*prev = jlm->next;
+
+		kfree(jlm);
+
+		jlm = static_key_mod(key);
+		/* if only one etry is left, fold it back into the static_key */
+		if (jlm->next == NULL) {
+			static_key_set_entries(key, jlm->entries);
+			static_key_clear_linked(key);
 			kfree(jlm);
 		}
 	}
@@ -492,8 +587,10 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
 	case MODULE_STATE_COMING:
 		jump_label_lock();
 		ret = jump_label_add_module(mod);
-		if (ret)
+		if (ret) {
+			WARN(1, "Failed to allocatote memory: jump_label may not work properly.\n");
 			jump_label_del_module(mod);
+		}
 		jump_label_unlock();
 		break;
 	case MODULE_STATE_GOING:
@@ -554,11 +651,14 @@ int jump_label_text_reserved(void *start, void *end)
 static void jump_label_update(struct static_key *key)
 {
 	struct jump_entry *stop = __stop___jump_table;
-	struct jump_entry *entry = static_key_entries(key);
+	struct jump_entry *entry;
 #ifdef CONFIG_MODULES
 	struct module *mod;
 
-	__jump_label_mod_update(key);
+	if (static_key_linked(key)) {
+		__jump_label_mod_update(key);
+		return;
+	}
 
 	preempt_disable();
 	mod = __module_address((unsigned long)key);
@@ -566,6 +666,7 @@ static void jump_label_update(struct static_key *key)
 		stop = mod->jump_entries + mod->num_jump_entries;
 	preempt_enable();
 #endif
+	entry = static_key_entries(key);
 	/* if there are no users, entry can be NULL */
 	if (entry)
 		__jump_label_update(key, entry, stop);
-- 
cgit v1.2.3


From b85ad494098bf881c3713218fbd74193e5d5c488 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Fri, 3 Feb 2017 12:39:03 -0600
Subject: of: introduce of_graph_get_remote_node

The OF graph API leaves too much of the graph walking to clients when
in many cases the driver doesn't care about accessing the port or
endpoint nodes. The drivers typically just want the device connected via
a particular graph connection. of_graph_get_remote_node provides this
functionality.

Signed-off-by: Rob Herring <robh@kernel.org>
Acked-by: Philipp Zabel <p.zabel@pengutronix.de>
---
 drivers/of/base.c        | 37 +++++++++++++++++++++++++++++++++++++
 include/linux/of_graph.h |  8 ++++++++
 2 files changed, 45 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/of/base.c b/drivers/of/base.c
index ce8206f9f97a..8f2a1dbfe75c 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -2472,3 +2472,40 @@ struct device_node *of_graph_get_remote_port(const struct device_node *node)
 	return of_get_next_parent(np);
 }
 EXPORT_SYMBOL(of_graph_get_remote_port);
+
+/**
+ * of_graph_get_remote_node() - get remote parent device_node for given port/endpoint
+ * @node: pointer to parent device_node containing graph port/endpoint
+ * @port: identifier (value of reg property) of the parent port node
+ * @endpoint: identifier (value of reg property) of the endpoint node
+ *
+ * Return: Remote device node associated with remote endpoint node linked
+ *	   to @node. Use of_node_put() on it when done.
+ */
+struct device_node *of_graph_get_remote_node(const struct device_node *node,
+					     u32 port, u32 endpoint)
+{
+	struct device_node *endpoint_node, *remote;
+
+	endpoint_node = of_graph_get_endpoint_by_regs(node, port, endpoint);
+	if (!endpoint_node) {
+		pr_debug("no valid endpoint (%d, %d) for node %s\n",
+			 port, endpoint, node->full_name);
+		return NULL;
+	}
+
+	remote = of_graph_get_remote_port_parent(endpoint_node);
+	of_node_put(endpoint_node);
+	if (!remote) {
+		pr_debug("no valid remote node\n");
+		return NULL;
+	}
+
+	if (!of_device_is_available(remote)) {
+		pr_debug("not available for remote node\n");
+		return NULL;
+	}
+
+	return remote;
+}
+EXPORT_SYMBOL(of_graph_get_remote_node);
diff --git a/include/linux/of_graph.h b/include/linux/of_graph.h
index bb3a5a2cd570..abdb02eaef06 100644
--- a/include/linux/of_graph.h
+++ b/include/linux/of_graph.h
@@ -51,6 +51,8 @@ struct device_node *of_graph_get_endpoint_by_regs(
 struct device_node *of_graph_get_remote_port_parent(
 					const struct device_node *node);
 struct device_node *of_graph_get_remote_port(const struct device_node *node);
+struct device_node *of_graph_get_remote_node(const struct device_node *node,
+					     u32 port, u32 endpoint);
 #else
 
 static inline int of_graph_parse_endpoint(const struct device_node *node,
@@ -89,6 +91,12 @@ static inline struct device_node *of_graph_get_remote_port(
 {
 	return NULL;
 }
+static inline struct device_node *of_graph_get_remote_node(
+					const struct device_node *node,
+					u32 port, u32 endpoint)
+{
+	return NULL;
+}
 
 #endif /* CONFIG_OF */
 
-- 
cgit v1.2.3


From c78c70fa30e23dc6cdb394f6c13880919499fba5 Mon Sep 17 00:00:00 2001
From: Sudarsana Reddy Kalluru <sudarsana.Kalluru@cavium.com>
Date: Wed, 15 Feb 2017 10:24:10 +0200
Subject: qed: Add infrastructure for PTP support

The patch adds the required qed interfaces for configuring/reading
the PTP clock on the adapter.

Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/Makefile       |   2 +-
 drivers/net/ethernet/qlogic/qed/qed.h          |   2 +
 drivers/net/ethernet/qlogic/qed/qed_l2.c       |   5 +
 drivers/net/ethernet/qlogic/qed/qed_l2.h       |   1 +
 drivers/net/ethernet/qlogic/qed/qed_main.c     |  15 ++
 drivers/net/ethernet/qlogic/qed/qed_ptp.c      | 323 +++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_ptp.h      |  47 ++++
 drivers/net/ethernet/qlogic/qed/qed_reg_addr.h |  31 +++
 include/linux/qed/qed_eth_if.h                 |  22 ++
 9 files changed, 447 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_ptp.c
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_ptp.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/Makefile b/drivers/net/ethernet/qlogic/qed/Makefile
index 729e43768e99..1a7300f72cab 100644
--- a/drivers/net/ethernet/qlogic/qed/Makefile
+++ b/drivers/net/ethernet/qlogic/qed/Makefile
@@ -2,7 +2,7 @@ obj-$(CONFIG_QED) := qed.o
 
 qed-y := qed_cxt.o qed_dev.o qed_hw.o qed_init_fw_funcs.o qed_init_ops.o \
 	 qed_int.o qed_main.o qed_mcp.o qed_sp_commands.o qed_spq.o qed_l2.o \
-	 qed_selftest.o qed_dcbx.o qed_debug.o
+	 qed_selftest.o qed_dcbx.o qed_debug.o qed_ptp.o
 qed-$(CONFIG_QED_SRIOV) += qed_sriov.o qed_vf.o
 qed-$(CONFIG_QED_LL2) += qed_ll2.o
 qed-$(CONFIG_QED_RDMA) += qed_roce.o
diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index 1f61cf3209e8..6557f94b92ed 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -456,6 +456,8 @@ struct qed_hwfn {
 	u8 dcbx_no_edpm;
 	u8 db_bar_no_edpm;
 
+	/* p_ptp_ptt is valid for leading HWFN only */
+	struct qed_ptt *p_ptp_ptt;
 	struct qed_simd_fp_handler	simd_proto_handler[64];
 
 #ifdef CONFIG_QED_SRIOV
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 7520eb34ad00..df932be5a4e5 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -214,6 +214,7 @@ int qed_sp_eth_vport_start(struct qed_hwfn *p_hwfn,
 	p_ramrod->vport_id	= abs_vport_id;
 
 	p_ramrod->mtu			= cpu_to_le16(p_params->mtu);
+	p_ramrod->handle_ptp_pkts	= p_params->handle_ptp_pkts;
 	p_ramrod->inner_vlan_removal_en	= p_params->remove_inner_vlan;
 	p_ramrod->drop_ttl0_en		= p_params->drop_ttl0;
 	p_ramrod->untagged		= p_params->only_untagged;
@@ -1886,6 +1887,7 @@ static int qed_start_vport(struct qed_dev *cdev,
 		start.drop_ttl0 = params->drop_ttl0;
 		start.opaque_fid = p_hwfn->hw_info.opaque_fid;
 		start.concrete_fid = p_hwfn->hw_info.concrete_fid;
+		start.handle_ptp_pkts = params->handle_ptp_pkts;
 		start.vport_id = params->vport_id;
 		start.max_buffers_per_cqe = 16;
 		start.mtu = params->mtu;
@@ -2328,6 +2330,8 @@ extern const struct qed_iov_hv_ops qed_iov_ops_pass;
 extern const struct qed_eth_dcbnl_ops qed_dcbnl_ops_pass;
 #endif
 
+extern const struct qed_eth_ptp_ops qed_ptp_ops_pass;
+
 static const struct qed_eth_ops qed_eth_ops_pass = {
 	.common = &qed_common_ops_pass,
 #ifdef CONFIG_QED_SRIOV
@@ -2336,6 +2340,7 @@ static const struct qed_eth_ops qed_eth_ops_pass = {
 #ifdef CONFIG_DCB
 	.dcb = &qed_dcbnl_ops_pass,
 #endif
+	.ptp = &qed_ptp_ops_pass,
 	.fill_dev_info = &qed_fill_eth_dev_info,
 	.register_ops = &qed_register_eth_ops,
 	.check_mac = &qed_check_mac,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.h b/drivers/net/ethernet/qlogic/qed/qed_l2.h
index 93cb932ef663..e763abd334f6 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.h
@@ -156,6 +156,7 @@ struct qed_sp_vport_start_params {
 	enum qed_tpa_mode tpa_mode;
 	bool remove_inner_vlan;
 	bool tx_switching;
+	bool handle_ptp_pkts;
 	bool only_untagged;
 	bool drop_ttl0;
 	u8 max_buffers_per_cqe;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 93eee83ccdc3..592e104687a7 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -902,6 +902,7 @@ static int qed_slowpath_start(struct qed_dev *cdev,
 	struct qed_mcp_drv_version drv_version;
 	const u8 *data = NULL;
 	struct qed_hwfn *hwfn;
+	struct qed_ptt *p_ptt;
 	int rc = -EINVAL;
 
 	if (qed_iov_wq_start(cdev))
@@ -916,6 +917,14 @@ static int qed_slowpath_start(struct qed_dev *cdev,
 				  QED_FW_FILE_NAME);
 			goto err;
 		}
+
+		p_ptt = qed_ptt_acquire(QED_LEADING_HWFN(cdev));
+		if (p_ptt) {
+			QED_LEADING_HWFN(cdev)->p_ptp_ptt = p_ptt;
+		} else {
+			DP_NOTICE(cdev, "Failed to acquire PTT for PTP\n");
+			goto err;
+		}
 	}
 
 	cdev->rx_coalesce_usecs = QED_DEFAULT_RX_USECS;
@@ -1003,6 +1012,10 @@ err:
 	if (IS_PF(cdev))
 		release_firmware(cdev->firmware);
 
+	if (IS_PF(cdev) && QED_LEADING_HWFN(cdev)->p_ptp_ptt)
+		qed_ptt_release(QED_LEADING_HWFN(cdev),
+				QED_LEADING_HWFN(cdev)->p_ptp_ptt);
+
 	qed_iov_wq_stop(cdev, false);
 
 	return rc;
@@ -1016,6 +1029,8 @@ static int qed_slowpath_stop(struct qed_dev *cdev)
 	qed_ll2_dealloc_if(cdev);
 
 	if (IS_PF(cdev)) {
+		qed_ptt_release(QED_LEADING_HWFN(cdev),
+				QED_LEADING_HWFN(cdev)->p_ptp_ptt);
 		qed_free_stream_mem(cdev);
 		if (IS_QED_ETH_IF(cdev))
 			qed_sriov_disable(cdev, true);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_ptp.c b/drivers/net/ethernet/qlogic/qed/qed_ptp.c
new file mode 100644
index 000000000000..d27aa85da23c
--- /dev/null
+++ b/drivers/net/ethernet/qlogic/qed/qed_ptp.c
@@ -0,0 +1,323 @@
+/* QLogic qed NIC Driver
+ * Copyright (c) 2015-2017  QLogic Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/types.h>
+#include "qed.h"
+#include "qed_dev_api.h"
+#include "qed_hw.h"
+#include "qed_l2.h"
+#include "qed_ptp.h"
+#include "qed_reg_addr.h"
+
+/* 16 nano second time quantas to wait before making a Drift adjustment */
+#define QED_DRIFT_CNTR_TIME_QUANTA_SHIFT	0
+/* Nano seconds to add/subtract when making a Drift adjustment */
+#define QED_DRIFT_CNTR_ADJUSTMENT_SHIFT		28
+/* Add/subtract the Adjustment_Value when making a Drift adjustment */
+#define QED_DRIFT_CNTR_DIRECTION_SHIFT		31
+#define QED_TIMESTAMP_MASK			BIT(16)
+
+/* Read Rx timestamp */
+static int qed_ptp_hw_read_rx_ts(struct qed_dev *cdev, u64 *timestamp)
+{
+	struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev);
+	struct qed_ptt *p_ptt = p_hwfn->p_ptp_ptt;
+	u32 val;
+
+	*timestamp = 0;
+	val = qed_rd(p_hwfn, p_ptt, NIG_REG_LLH_PTP_HOST_BUF_SEQID);
+	if (!(val & QED_TIMESTAMP_MASK)) {
+		DP_INFO(p_hwfn, "Invalid Rx timestamp, buf_seqid = %d\n", val);
+		return -EINVAL;
+	}
+
+	val = qed_rd(p_hwfn, p_ptt, NIG_REG_LLH_PTP_HOST_BUF_TS_LSB);
+	*timestamp = qed_rd(p_hwfn, p_ptt, NIG_REG_LLH_PTP_HOST_BUF_TS_MSB);
+	*timestamp <<= 32;
+	*timestamp |= val;
+
+	/* Reset timestamp register to allow new timestamp */
+	qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_PTP_HOST_BUF_SEQID,
+	       QED_TIMESTAMP_MASK);
+
+	return 0;
+}
+
+/* Read Tx timestamp */
+static int qed_ptp_hw_read_tx_ts(struct qed_dev *cdev, u64 *timestamp)
+{
+	struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev);
+	struct qed_ptt *p_ptt = p_hwfn->p_ptp_ptt;
+	u32 val;
+
+	*timestamp = 0;
+	val = qed_rd(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_BUF_SEQID);
+	if (!(val & QED_TIMESTAMP_MASK)) {
+		DP_INFO(p_hwfn, "Invalid Tx timestamp, buf_seqid = %d\n", val);
+		return -EINVAL;
+	}
+
+	val = qed_rd(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_BUF_TS_LSB);
+	*timestamp = qed_rd(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_BUF_TS_MSB);
+	*timestamp <<= 32;
+	*timestamp |= val;
+
+	/* Reset timestamp register to allow new timestamp */
+	qed_wr(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_BUF_SEQID, QED_TIMESTAMP_MASK);
+
+	return 0;
+}
+
+/* Read Phy Hardware Clock */
+static int qed_ptp_hw_read_cc(struct qed_dev *cdev, u64 *phc_cycles)
+{
+	struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev);
+	struct qed_ptt *p_ptt = p_hwfn->p_ptp_ptt;
+	u32 temp = 0;
+
+	temp = qed_rd(p_hwfn, p_ptt, NIG_REG_TSGEN_SYNC_TIME_LSB);
+	*phc_cycles = qed_rd(p_hwfn, p_ptt, NIG_REG_TSGEN_SYNC_TIME_MSB);
+	*phc_cycles <<= 32;
+	*phc_cycles |= temp;
+
+	return 0;
+}
+
+/* Filter PTP protocol packets that need to be timestamped */
+static int qed_ptp_hw_cfg_rx_filters(struct qed_dev *cdev,
+				     enum qed_ptp_filter_type type)
+{
+	struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev);
+	struct qed_ptt *p_ptt = p_hwfn->p_ptp_ptt;
+	u32 rule_mask, parm_mask;
+
+	switch (type) {
+	case QED_PTP_FILTER_L2_IPV4_IPV6:
+		parm_mask = 0x6AA;
+		rule_mask = 0x3EEE;
+		break;
+	case QED_PTP_FILTER_L2:
+		parm_mask = 0x6BF;
+		rule_mask = 0x3EFF;
+		break;
+	case QED_PTP_FILTER_IPV4_IPV6:
+		parm_mask = 0x7EA;
+		rule_mask = 0x3FFE;
+		break;
+	case QED_PTP_FILTER_IPV4:
+		parm_mask = 0x7EE;
+		rule_mask = 0x3FFE;
+		break;
+	default:
+		DP_INFO(p_hwfn, "Invalid PTP filter type %d\n", type);
+		return -EINVAL;
+	}
+
+	qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_PTP_PARAM_MASK, parm_mask);
+	qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_PTP_RULE_MASK, rule_mask);
+
+	qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_PTP_TO_HOST, 0x1);
+
+	/* Reset possibly old timestamps */
+	qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_PTP_HOST_BUF_SEQID,
+	       QED_TIMESTAMP_MASK);
+
+	return 0;
+}
+
+/* Adjust the HW clock by a rate given in parts-per-billion (ppb) units.
+ * FW/HW accepts the adjustment value in terms of 3 parameters:
+ *   Drift period - adjustment happens once in certain number of nano seconds.
+ *   Drift value - time is adjusted by a certain value, for example by 5 ns.
+ *   Drift direction - add or subtract the adjustment value.
+ * The routine translates ppb into the adjustment triplet in an optimal manner.
+ */
+static int qed_ptp_hw_adjfreq(struct qed_dev *cdev, s32 ppb)
+{
+	s64 best_val = 0, val, best_period = 0, period, approx_dev, dif, dif2;
+	struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev);
+	struct qed_ptt *p_ptt = p_hwfn->p_ptp_ptt;
+	u32 drift_ctr_cfg = 0, drift_state;
+	int drift_dir = 1;
+
+	if (ppb < 0) {
+		ppb = -ppb;
+		drift_dir = 0;
+	}
+
+	if (ppb > 1) {
+		s64 best_dif = ppb, best_approx_dev = 1;
+
+		/* Adjustment value is up to +/-7ns, find an optimal value in
+		 * this range.
+		 */
+		for (val = 7; val > 0; val--) {
+			period = div_s64(val * 1000000000, ppb);
+			period -= 8;
+			period >>= 4;
+			if (period < 1)
+				period = 1;
+			if (period > 0xFFFFFFE)
+				period = 0xFFFFFFE;
+
+			/* Check both rounding ends for approximate error */
+			approx_dev = period * 16 + 8;
+			dif = ppb * approx_dev - val * 1000000000;
+			dif2 = dif + 16 * ppb;
+
+			if (dif < 0)
+				dif = -dif;
+			if (dif2 < 0)
+				dif2 = -dif2;
+
+			/* Determine which end gives better approximation */
+			if (dif * (approx_dev + 16) > dif2 * approx_dev) {
+				period++;
+				approx_dev += 16;
+				dif = dif2;
+			}
+
+			/* Track best approximation found so far */
+			if (best_dif * approx_dev > dif * best_approx_dev) {
+				best_dif = dif;
+				best_val = val;
+				best_period = period;
+				best_approx_dev = approx_dev;
+			}
+		}
+	} else if (ppb == 1) {
+		/* This is a special case as its the only value which wouldn't
+		 * fit in a s64 variable. In order to prevent castings simple
+		 * handle it seperately.
+		 */
+		best_val = 4;
+		best_period = 0xee6b27f;
+	} else {
+		best_val = 0;
+		best_period = 0xFFFFFFF;
+	}
+
+	drift_ctr_cfg = (best_period << QED_DRIFT_CNTR_TIME_QUANTA_SHIFT) |
+			(((int)best_val) << QED_DRIFT_CNTR_ADJUSTMENT_SHIFT) |
+			(((int)drift_dir) << QED_DRIFT_CNTR_DIRECTION_SHIFT);
+
+	qed_wr(p_hwfn, p_ptt, NIG_REG_TSGEN_RST_DRIFT_CNTR, 0x1);
+
+	drift_state = qed_rd(p_hwfn, p_ptt, NIG_REG_TSGEN_RST_DRIFT_CNTR);
+	if (drift_state & 1) {
+		qed_wr(p_hwfn, p_ptt, NIG_REG_TSGEN_DRIFT_CNTR_CONF,
+		       drift_ctr_cfg);
+	} else {
+		DP_INFO(p_hwfn, "Drift counter is not reset\n");
+		return -EINVAL;
+	}
+
+	qed_wr(p_hwfn, p_ptt, NIG_REG_TSGEN_RST_DRIFT_CNTR, 0x0);
+
+	return 0;
+}
+
+static int qed_ptp_hw_enable(struct qed_dev *cdev)
+{
+	struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev);
+	struct qed_ptt *p_ptt = p_hwfn->p_ptp_ptt;
+
+	/* Reset PTP event detection rules - will be configured in the IOCTL */
+	qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_PTP_PARAM_MASK, 0x7FF);
+	qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_PTP_RULE_MASK, 0x3FFF);
+	qed_wr(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_PARAM_MASK, 0x7FF);
+	qed_wr(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_RULE_MASK, 0x3FFF);
+
+	qed_wr(p_hwfn, p_ptt, NIG_REG_TX_PTP_EN, 7);
+	qed_wr(p_hwfn, p_ptt, NIG_REG_RX_PTP_EN, 7);
+
+	qed_wr(p_hwfn, p_ptt, NIG_REG_TS_OUTPUT_ENABLE_PDA, 0x1);
+
+	/* Pause free running counter */
+	qed_wr(p_hwfn, p_ptt, NIG_REG_TIMESYNC_GEN_REG_BB, 2);
+
+	qed_wr(p_hwfn, p_ptt, NIG_REG_TSGEN_FREE_CNT_VALUE_LSB, 0);
+	qed_wr(p_hwfn, p_ptt, NIG_REG_TSGEN_FREE_CNT_VALUE_MSB, 0);
+	/* Resume free running counter */
+	qed_wr(p_hwfn, p_ptt, NIG_REG_TIMESYNC_GEN_REG_BB, 4);
+
+	/* Disable drift register */
+	qed_wr(p_hwfn, p_ptt, NIG_REG_TSGEN_DRIFT_CNTR_CONF, 0x0);
+	qed_wr(p_hwfn, p_ptt, NIG_REG_TSGEN_RST_DRIFT_CNTR, 0x0);
+
+	/* Reset possibly old timestamps */
+	qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_PTP_HOST_BUF_SEQID,
+	       QED_TIMESTAMP_MASK);
+	qed_wr(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_BUF_SEQID, QED_TIMESTAMP_MASK);
+
+	return 0;
+}
+
+static int qed_ptp_hw_hwtstamp_tx_on(struct qed_dev *cdev)
+{
+	struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev);
+	struct qed_ptt *p_ptt = p_hwfn->p_ptp_ptt;
+
+	qed_wr(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_PARAM_MASK, 0x6AA);
+	qed_wr(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_RULE_MASK, 0x3EEE);
+
+	return 0;
+}
+
+static int qed_ptp_hw_disable(struct qed_dev *cdev)
+{
+	struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev);
+	struct qed_ptt *p_ptt = p_hwfn->p_ptp_ptt;
+
+	/* Reset PTP event detection rules */
+	qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_PTP_PARAM_MASK, 0x7FF);
+	qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_PTP_RULE_MASK, 0x3FFF);
+
+	qed_wr(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_PARAM_MASK, 0x7FF);
+	qed_wr(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_RULE_MASK, 0x3FFF);
+
+	/* Disable the PTP feature */
+	qed_wr(p_hwfn, p_ptt, NIG_REG_RX_PTP_EN, 0x0);
+	qed_wr(p_hwfn, p_ptt, NIG_REG_TX_PTP_EN, 0x0);
+
+	return 0;
+}
+
+const struct qed_eth_ptp_ops qed_ptp_ops_pass = {
+	.hwtstamp_tx_on = qed_ptp_hw_hwtstamp_tx_on,
+	.cfg_rx_filters = qed_ptp_hw_cfg_rx_filters,
+	.read_rx_ts = qed_ptp_hw_read_rx_ts,
+	.read_tx_ts = qed_ptp_hw_read_tx_ts,
+	.read_cc = qed_ptp_hw_read_cc,
+	.adjfreq = qed_ptp_hw_adjfreq,
+	.disable = qed_ptp_hw_disable,
+	.enable = qed_ptp_hw_enable,
+};
diff --git a/drivers/net/ethernet/qlogic/qed/qed_ptp.h b/drivers/net/ethernet/qlogic/qed/qed_ptp.h
new file mode 100644
index 000000000000..63c666d0b739
--- /dev/null
+++ b/drivers/net/ethernet/qlogic/qed/qed_ptp.h
@@ -0,0 +1,47 @@
+/* QLogic qed NIC Driver
+ * Copyright (c) 2015-2017  QLogic Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _QED_PTP_H
+#define _QED_PTP_H
+#include <linux/types.h>
+
+int qed_ptp_hwtstamp_tx_on(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt);
+int qed_ptp_cfg_rx_filters(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
+			   enum qed_ptp_filter_type type);
+int qed_ptp_read_rx_ts(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, u64 *ts);
+int qed_ptp_read_tx_ts(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, u64 *ts);
+int qed_ptp_read_cc(struct qed_hwfn *p_hwfn,
+		    struct qed_ptt *p_ptt, u64 *cycles);
+int qed_ptp_adjfreq(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, s32 ppb);
+int qed_ptp_disable(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt);
+int qed_ptp_enable(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt);
+
+#endif
diff --git a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
index b6722c6ff761..3b7edf6ff234 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
@@ -1481,4 +1481,35 @@
 #define DORQ_REG_PF_ICID_BIT_SHIFT_NORM	0x100448UL
 #define DORQ_REG_PF_MIN_ADDR_REG1 0x100400UL
 #define DORQ_REG_PF_DPI_BIT_SHIFT 0x100450UL
+#define NIG_REG_RX_PTP_EN 0x501900UL
+#define NIG_REG_TX_PTP_EN 0x501904UL
+#define NIG_REG_LLH_PTP_TO_HOST	0x501908UL
+#define NIG_REG_LLH_PTP_TO_MCP 0x50190cUL
+#define NIG_REG_PTP_SW_TXTSEN 0x501910UL
+#define NIG_REG_LLH_PTP_ETHERTYPE_1 0x501914UL
+#define NIG_REG_LLH_PTP_MAC_DA_2_LSB 0x501918UL
+#define NIG_REG_LLH_PTP_MAC_DA_2_MSB 0x50191cUL
+#define NIG_REG_LLH_PTP_PARAM_MASK 0x501920UL
+#define NIG_REG_LLH_PTP_RULE_MASK 0x501924UL
+#define NIG_REG_TX_LLH_PTP_PARAM_MASK 0x501928UL
+#define NIG_REG_TX_LLH_PTP_RULE_MASK 0x50192cUL
+#define NIG_REG_LLH_PTP_HOST_BUF_SEQID 0x501930UL
+#define NIG_REG_LLH_PTP_HOST_BUF_TS_LSB 0x501934UL
+#define NIG_REG_LLH_PTP_HOST_BUF_TS_MSB	0x501938UL
+#define NIG_REG_LLH_PTP_MCP_BUF_SEQID 0x50193cUL
+#define NIG_REG_LLH_PTP_MCP_BUF_TS_LSB 0x501940UL
+#define NIG_REG_LLH_PTP_MCP_BUF_TS_MSB 0x501944UL
+#define NIG_REG_TX_LLH_PTP_BUF_SEQID 0x501948UL
+#define NIG_REG_TX_LLH_PTP_BUF_TS_LSB 0x50194cUL
+#define NIG_REG_TX_LLH_PTP_BUF_TS_MSB 0x501950UL
+#define NIG_REG_RX_PTP_TS_MSB_ERR 0x501954UL
+#define NIG_REG_TX_PTP_TS_MSB_ERR 0x501958UL
+#define NIG_REG_TSGEN_SYNC_TIME_LSB 0x5088c0UL
+#define NIG_REG_TSGEN_SYNC_TIME_MSB 0x5088c4UL
+#define NIG_REG_TSGEN_RST_DRIFT_CNTR 0x5088d8UL
+#define NIG_REG_TSGEN_DRIFT_CNTR_CONF 0x5088dcUL
+#define NIG_REG_TS_OUTPUT_ENABLE_PDA 0x508870UL
+#define NIG_REG_TIMESYNC_GEN_REG_BB 0x500d00UL
+#define NIG_REG_TSGEN_FREE_CNT_VALUE_LSB 0x5088a8UL
+#define NIG_REG_TSGEN_FREE_CNT_VALUE_MSB 0x5088acUL
 #endif
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index 3613d63cd5d0..4cd1f0ccfa36 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -96,6 +96,7 @@ struct qed_update_vport_params {
 
 struct qed_start_vport_params {
 	bool remove_inner_vlan;
+	bool handle_ptp_pkts;
 	bool gro_enable;
 	bool drop_ttl0;
 	u8 vport_id;
@@ -159,6 +160,15 @@ struct qed_eth_cb_ops {
 	void (*force_mac) (void *dev, u8 *mac, bool forced);
 };
 
+#define QED_MAX_PHC_DRIFT_PPB   291666666
+
+enum qed_ptp_filter_type {
+	QED_PTP_FILTER_L2,
+	QED_PTP_FILTER_IPV4,
+	QED_PTP_FILTER_IPV4_IPV6,
+	QED_PTP_FILTER_L2_IPV4_IPV6
+};
+
 #ifdef CONFIG_DCB
 /* Prototype declaration of qed_eth_dcbnl_ops should match with the declaration
  * of dcbnl_rtnl_ops structure.
@@ -218,6 +228,17 @@ struct qed_eth_dcbnl_ops {
 };
 #endif
 
+struct qed_eth_ptp_ops {
+	int (*hwtstamp_tx_on)(struct qed_dev *);
+	int (*cfg_rx_filters)(struct qed_dev *, enum qed_ptp_filter_type);
+	int (*read_rx_ts)(struct qed_dev *, u64 *);
+	int (*read_tx_ts)(struct qed_dev *, u64 *);
+	int (*read_cc)(struct qed_dev *, u64 *);
+	int (*disable)(struct qed_dev *);
+	int (*adjfreq)(struct qed_dev *, s32);
+	int (*enable)(struct qed_dev *);
+};
+
 struct qed_eth_ops {
 	const struct qed_common_ops *common;
 #ifdef CONFIG_QED_SRIOV
@@ -226,6 +247,7 @@ struct qed_eth_ops {
 #ifdef CONFIG_DCB
 	const struct qed_eth_dcbnl_ops *dcb;
 #endif
+	const struct qed_eth_ptp_ops *ptp;
 
 	int (*fill_dev_info)(struct qed_dev *cdev,
 			     struct qed_dev_eth_info *info);
-- 
cgit v1.2.3


From c18a1e09008de7d8bd82b046d38a88f4285d53f9 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Tue, 14 Feb 2017 23:28:59 +0800
Subject: block: introduce bio_clone_bioset_partial()

md still need bio clone(not the fast version) for behind write,
and it is more efficient to use bio_clone_bioset_partial().

The idea is simple and just copy the bvecs range specified from
parameters.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jens Axboe <axboe@fb.com>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 block/bio.c         | 61 +++++++++++++++++++++++++++++++++++++++++------------
 include/linux/bio.h | 11 ++++++++--
 2 files changed, 57 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index 2b375020fc49..9e2b33ddca0a 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -625,21 +625,20 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
 }
 EXPORT_SYMBOL(bio_clone_fast);
 
-/**
- * 	bio_clone_bioset - clone a bio
- * 	@bio_src: bio to clone
- *	@gfp_mask: allocation priority
- *	@bs: bio_set to allocate from
- *
- *	Clone bio. Caller will own the returned bio, but not the actual data it
- *	points to. Reference count of returned bio will be one.
- */
-struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
-			     struct bio_set *bs)
+static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
+				      struct bio_set *bs, int offset,
+				      int size)
 {
 	struct bvec_iter iter;
 	struct bio_vec bv;
 	struct bio *bio;
+	struct bvec_iter iter_src = bio_src->bi_iter;
+
+	/* for supporting partial clone */
+	if (offset || size != bio_src->bi_iter.bi_size) {
+		bio_advance_iter(bio_src, &iter_src, offset);
+		iter_src.bi_size = size;
+	}
 
 	/*
 	 * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
@@ -663,7 +662,8 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
 	 *    __bio_clone_fast() anyways.
 	 */
 
-	bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
+	bio = bio_alloc_bioset(gfp_mask, __bio_segments(bio_src,
+			       &iter_src), bs);
 	if (!bio)
 		return NULL;
 	bio->bi_bdev		= bio_src->bi_bdev;
@@ -680,7 +680,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
 		bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
 		break;
 	default:
-		bio_for_each_segment(bv, bio_src, iter)
+		__bio_for_each_segment(bv, bio_src, iter, iter_src)
 			bio->bi_io_vec[bio->bi_vcnt++] = bv;
 		break;
 	}
@@ -699,8 +699,43 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
 
 	return bio;
 }
+
+/**
+ * 	bio_clone_bioset - clone a bio
+ * 	@bio_src: bio to clone
+ *	@gfp_mask: allocation priority
+ *	@bs: bio_set to allocate from
+ *
+ *	Clone bio. Caller will own the returned bio, but not the actual data it
+ *	points to. Reference count of returned bio will be one.
+ */
+struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
+			     struct bio_set *bs)
+{
+	return __bio_clone_bioset(bio_src, gfp_mask, bs, 0,
+				  bio_src->bi_iter.bi_size);
+}
 EXPORT_SYMBOL(bio_clone_bioset);
 
+/**
+ * 	bio_clone_bioset_partial - clone a partial bio
+ * 	@bio_src: bio to clone
+ *	@gfp_mask: allocation priority
+ *	@bs: bio_set to allocate from
+ *	@offset: cloned starting from the offset
+ *	@size: size for the cloned bio
+ *
+ *	Clone bio. Caller will own the returned bio, but not the actual data it
+ *	points to. Reference count of returned bio will be one.
+ */
+struct bio *bio_clone_bioset_partial(struct bio *bio_src, gfp_t gfp_mask,
+				     struct bio_set *bs, int offset,
+				     int size)
+{
+	return __bio_clone_bioset(bio_src, gfp_mask, bs, offset, size);
+}
+EXPORT_SYMBOL(bio_clone_bioset_partial);
+
 /**
  *	bio_add_pc_page	-	attempt to add page to bio
  *	@q: the target queue
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 7cf8a6c70a3f..8e521194f6fc 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -183,7 +183,7 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter,
 
 #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len)
 
-static inline unsigned bio_segments(struct bio *bio)
+static inline unsigned __bio_segments(struct bio *bio, struct bvec_iter *bvec)
 {
 	unsigned segs = 0;
 	struct bio_vec bv;
@@ -205,12 +205,17 @@ static inline unsigned bio_segments(struct bio *bio)
 		break;
 	}
 
-	bio_for_each_segment(bv, bio, iter)
+	__bio_for_each_segment(bv, bio, iter, *bvec)
 		segs++;
 
 	return segs;
 }
 
+static inline unsigned bio_segments(struct bio *bio)
+{
+	return __bio_segments(bio, &bio->bi_iter);
+}
+
 /*
  * get a reference to a bio, so it won't disappear. the intended use is
  * something like:
@@ -384,6 +389,8 @@ extern void bio_put(struct bio *);
 extern void __bio_clone_fast(struct bio *, struct bio *);
 extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *);
 extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs);
+extern struct bio *bio_clone_bioset_partial(struct bio *, gfp_t,
+					    struct bio_set *, int, int);
 
 extern struct bio_set *fs_bio_set;
 
-- 
cgit v1.2.3


From bf3f14d6342cfb37eab8f0cddd0e4d4063fd9fc9 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 15 Feb 2017 22:29:51 -0500
Subject: rhashtable: Revert nested table changes.

This reverts commits:

6a25478077d987edc5e2f880590a2bc5fcab4441
9dbbfb0ab6680c6a85609041011484e6658e7d3c
40137906c5f55c252194ef5834130383e639536f

It's too risky to put in this late in the release
cycle.  We'll put these changes into the next merge
window instead.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/gfs2/glock.c            |  28 ++---
 include/linux/rhashtable.h |  78 ++++---------
 lib/rhashtable.c           | 270 +++++++++------------------------------------
 net/tipc/net.c             |   4 -
 net/tipc/socket.c          |  30 ++---
 5 files changed, 94 insertions(+), 316 deletions(-)

(limited to 'include/linux')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 70e94170af85..94f50cac91c6 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1420,32 +1420,26 @@ static struct shrinker glock_shrinker = {
  * @sdp: the filesystem
  * @bucket: the bucket
  *
- * Note that the function can be called multiple times on the same
- * object.  So the user must ensure that the function can cope with
- * that.
  */
 
 static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
 {
 	struct gfs2_glock *gl;
-	struct rhashtable_iter iter;
-
-	rhashtable_walk_enter(&gl_hash_table, &iter);
-
-	do {
-		gl = ERR_PTR(rhashtable_walk_start(&iter));
-		if (gl)
-			continue;
+	struct rhash_head *pos;
+	const struct bucket_table *tbl;
+	int i;
 
-		while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl))
+	rcu_read_lock();
+	tbl = rht_dereference_rcu(gl_hash_table.tbl, &gl_hash_table);
+	for (i = 0; i < tbl->size; i++) {
+		rht_for_each_entry_rcu(gl, pos, tbl, i, gl_node) {
 			if ((gl->gl_name.ln_sbd == sdp) &&
 			    lockref_get_not_dead(&gl->gl_lockref))
 				examiner(gl);
-
-		rhashtable_walk_stop(&iter);
-	} while (cond_resched(), gl == ERR_PTR(-EAGAIN));
-
-	rhashtable_walk_exit(&iter);
+		}
+	}
+	rcu_read_unlock();
+	cond_resched();
 }
 
 /**
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index f2e12a845910..5c132d3188be 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -61,7 +61,6 @@ struct rhlist_head {
 /**
  * struct bucket_table - Table of hash buckets
  * @size: Number of hash buckets
- * @nest: Number of bits of first-level nested table.
  * @rehash: Current bucket being rehashed
  * @hash_rnd: Random seed to fold into hash
  * @locks_mask: Mask to apply before accessing locks[]
@@ -69,12 +68,10 @@ struct rhlist_head {
  * @walkers: List of active walkers
  * @rcu: RCU structure for freeing the table
  * @future_tbl: Table under construction during rehashing
- * @ntbl: Nested table used when out of memory.
  * @buckets: size * hash buckets
  */
 struct bucket_table {
 	unsigned int		size;
-	unsigned int		nest;
 	unsigned int		rehash;
 	u32			hash_rnd;
 	unsigned int		locks_mask;
@@ -84,7 +81,7 @@ struct bucket_table {
 
 	struct bucket_table __rcu *future_tbl;
 
-	struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp;
+	struct rhash_head __rcu	*buckets[] ____cacheline_aligned_in_smp;
 };
 
 /**
@@ -377,12 +374,6 @@ void rhashtable_free_and_destroy(struct rhashtable *ht,
 				 void *arg);
 void rhashtable_destroy(struct rhashtable *ht);
 
-struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl,
-					    unsigned int hash);
-struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht,
-						   struct bucket_table *tbl,
-						   unsigned int hash);
-
 #define rht_dereference(p, ht) \
 	rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht))
 
@@ -398,27 +389,6 @@ struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht,
 #define rht_entry(tpos, pos, member) \
 	({ tpos = container_of(pos, typeof(*tpos), member); 1; })
 
-static inline struct rhash_head __rcu *const *rht_bucket(
-	const struct bucket_table *tbl, unsigned int hash)
-{
-	return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) :
-				     &tbl->buckets[hash];
-}
-
-static inline struct rhash_head __rcu **rht_bucket_var(
-	struct bucket_table *tbl, unsigned int hash)
-{
-	return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) :
-				     &tbl->buckets[hash];
-}
-
-static inline struct rhash_head __rcu **rht_bucket_insert(
-	struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash)
-{
-	return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) :
-				     &tbl->buckets[hash];
-}
-
 /**
  * rht_for_each_continue - continue iterating over hash chain
  * @pos:	the &struct rhash_head to use as a loop cursor.
@@ -438,7 +408,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
  * @hash:	the hash value / bucket index
  */
 #define rht_for_each(pos, tbl, hash) \
-	rht_for_each_continue(pos, *rht_bucket(tbl, hash), tbl, hash)
+	rht_for_each_continue(pos, (tbl)->buckets[hash], tbl, hash)
 
 /**
  * rht_for_each_entry_continue - continue iterating over hash chain
@@ -463,7 +433,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
  * @member:	name of the &struct rhash_head within the hashable struct.
  */
 #define rht_for_each_entry(tpos, pos, tbl, hash, member)		\
-	rht_for_each_entry_continue(tpos, pos, *rht_bucket(tbl, hash),	\
+	rht_for_each_entry_continue(tpos, pos, (tbl)->buckets[hash],	\
 				    tbl, hash, member)
 
 /**
@@ -478,13 +448,13 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
  * This hash chain list-traversal primitive allows for the looped code to
  * remove the loop cursor from the list.
  */
-#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member)	      \
-	for (pos = rht_dereference_bucket(*rht_bucket(tbl, hash), tbl, hash), \
-	     next = !rht_is_a_nulls(pos) ?				      \
-		       rht_dereference_bucket(pos->next, tbl, hash) : NULL;   \
-	     (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);	      \
-	     pos = next,						      \
-	     next = !rht_is_a_nulls(pos) ?				      \
+#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member)	    \
+	for (pos = rht_dereference_bucket((tbl)->buckets[hash], tbl, hash), \
+	     next = !rht_is_a_nulls(pos) ?				    \
+		       rht_dereference_bucket(pos->next, tbl, hash) : NULL; \
+	     (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);	    \
+	     pos = next,						    \
+	     next = !rht_is_a_nulls(pos) ?				    \
 		       rht_dereference_bucket(pos->next, tbl, hash) : NULL)
 
 /**
@@ -515,7 +485,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
  * traversal is guarded by rcu_read_lock().
  */
 #define rht_for_each_rcu(pos, tbl, hash)				\
-	rht_for_each_rcu_continue(pos, *rht_bucket(tbl, hash), tbl, hash)
+	rht_for_each_rcu_continue(pos, (tbl)->buckets[hash], tbl, hash)
 
 /**
  * rht_for_each_entry_rcu_continue - continue iterating over rcu hash chain
@@ -548,8 +518,8 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
  * the _rcu mutation primitives such as rhashtable_insert() as long as the
  * traversal is guarded by rcu_read_lock().
  */
-#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member)		   \
-	rht_for_each_entry_rcu_continue(tpos, pos, *rht_bucket(tbl, hash), \
+#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member)		\
+	rht_for_each_entry_rcu_continue(tpos, pos, (tbl)->buckets[hash],\
 					tbl, hash, member)
 
 /**
@@ -595,7 +565,7 @@ static inline struct rhash_head *__rhashtable_lookup(
 		.ht = ht,
 		.key = key,
 	};
-	struct bucket_table *tbl;
+	const struct bucket_table *tbl;
 	struct rhash_head *he;
 	unsigned int hash;
 
@@ -727,12 +697,8 @@ slow_path:
 	}
 
 	elasticity = ht->elasticity;
-	pprev = rht_bucket_insert(ht, tbl, hash);
-	data = ERR_PTR(-ENOMEM);
-	if (!pprev)
-		goto out;
-
-	rht_for_each_continue(head, *pprev, tbl, hash) {
+	pprev = &tbl->buckets[hash];
+	rht_for_each(head, tbl, hash) {
 		struct rhlist_head *plist;
 		struct rhlist_head *list;
 
@@ -770,7 +736,7 @@ slow_path:
 	if (unlikely(rht_grow_above_100(ht, tbl)))
 		goto slow_path;
 
-	head = rht_dereference_bucket(*pprev, tbl, hash);
+	head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
 
 	RCU_INIT_POINTER(obj->next, head);
 	if (rhlist) {
@@ -780,7 +746,7 @@ slow_path:
 		RCU_INIT_POINTER(list->next, NULL);
 	}
 
-	rcu_assign_pointer(*pprev, obj);
+	rcu_assign_pointer(tbl->buckets[hash], obj);
 
 	atomic_inc(&ht->nelems);
 	if (rht_grow_above_75(ht, tbl))
@@ -989,8 +955,8 @@ static inline int __rhashtable_remove_fast_one(
 
 	spin_lock_bh(lock);
 
-	pprev = rht_bucket_var(tbl, hash);
-	rht_for_each_continue(he, *pprev, tbl, hash) {
+	pprev = &tbl->buckets[hash];
+	rht_for_each(he, tbl, hash) {
 		struct rhlist_head *list;
 
 		list = container_of(he, struct rhlist_head, rhead);
@@ -1141,8 +1107,8 @@ static inline int __rhashtable_replace_fast(
 
 	spin_lock_bh(lock);
 
-	pprev = rht_bucket_var(tbl, hash);
-	rht_for_each_continue(he, *pprev, tbl, hash) {
+	pprev = &tbl->buckets[hash];
+	rht_for_each(he, tbl, hash) {
 		if (he != obj_old) {
 			pprev = &he->next;
 			continue;
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 172454e6b979..32d0ad058380 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -32,11 +32,6 @@
 #define HASH_MIN_SIZE		4U
 #define BUCKET_LOCKS_PER_CPU	32UL
 
-union nested_table {
-	union nested_table __rcu *table;
-	struct rhash_head __rcu *bucket;
-};
-
 static u32 head_hashfn(struct rhashtable *ht,
 		       const struct bucket_table *tbl,
 		       const struct rhash_head *he)
@@ -81,9 +76,6 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl,
 	/* Never allocate more than 0.5 locks per bucket */
 	size = min_t(unsigned int, size, tbl->size >> 1);
 
-	if (tbl->nest)
-		size = min(size, 1U << tbl->nest);
-
 	if (sizeof(spinlock_t) != 0) {
 		tbl->locks = NULL;
 #ifdef CONFIG_NUMA
@@ -107,45 +99,8 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl,
 	return 0;
 }
 
-static void nested_table_free(union nested_table *ntbl, unsigned int size)
-{
-	const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
-	const unsigned int len = 1 << shift;
-	unsigned int i;
-
-	ntbl = rcu_dereference_raw(ntbl->table);
-	if (!ntbl)
-		return;
-
-	if (size > len) {
-		size >>= shift;
-		for (i = 0; i < len; i++)
-			nested_table_free(ntbl + i, size);
-	}
-
-	kfree(ntbl);
-}
-
-static void nested_bucket_table_free(const struct bucket_table *tbl)
-{
-	unsigned int size = tbl->size >> tbl->nest;
-	unsigned int len = 1 << tbl->nest;
-	union nested_table *ntbl;
-	unsigned int i;
-
-	ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);
-
-	for (i = 0; i < len; i++)
-		nested_table_free(ntbl + i, size);
-
-	kfree(ntbl);
-}
-
 static void bucket_table_free(const struct bucket_table *tbl)
 {
-	if (tbl->nest)
-		nested_bucket_table_free(tbl);
-
 	if (tbl)
 		kvfree(tbl->locks);
 
@@ -157,59 +112,6 @@ static void bucket_table_free_rcu(struct rcu_head *head)
 	bucket_table_free(container_of(head, struct bucket_table, rcu));
 }
 
-static union nested_table *nested_table_alloc(struct rhashtable *ht,
-					      union nested_table __rcu **prev,
-					      unsigned int shifted,
-					      unsigned int nhash)
-{
-	union nested_table *ntbl;
-	int i;
-
-	ntbl = rcu_dereference(*prev);
-	if (ntbl)
-		return ntbl;
-
-	ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC);
-
-	if (ntbl && shifted) {
-		for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0].bucket); i++)
-			INIT_RHT_NULLS_HEAD(ntbl[i].bucket, ht,
-					    (i << shifted) | nhash);
-	}
-
-	rcu_assign_pointer(*prev, ntbl);
-
-	return ntbl;
-}
-
-static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht,
-						      size_t nbuckets,
-						      gfp_t gfp)
-{
-	const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
-	struct bucket_table *tbl;
-	size_t size;
-
-	if (nbuckets < (1 << (shift + 1)))
-		return NULL;
-
-	size = sizeof(*tbl) + sizeof(tbl->buckets[0]);
-
-	tbl = kzalloc(size, gfp);
-	if (!tbl)
-		return NULL;
-
-	if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets,
-				0, 0)) {
-		kfree(tbl);
-		return NULL;
-	}
-
-	tbl->nest = (ilog2(nbuckets) - 1) % shift + 1;
-
-	return tbl;
-}
-
 static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
 					       size_t nbuckets,
 					       gfp_t gfp)
@@ -224,17 +126,10 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
 		tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY);
 	if (tbl == NULL && gfp == GFP_KERNEL)
 		tbl = vzalloc(size);
-
-	size = nbuckets;
-
-	if (tbl == NULL && gfp != GFP_KERNEL) {
-		tbl = nested_bucket_table_alloc(ht, nbuckets, gfp);
-		nbuckets = 0;
-	}
 	if (tbl == NULL)
 		return NULL;
 
-	tbl->size = size;
+	tbl->size = nbuckets;
 
 	if (alloc_bucket_locks(ht, tbl, gfp) < 0) {
 		bucket_table_free(tbl);
@@ -269,17 +164,12 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
 	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
 	struct bucket_table *new_tbl = rhashtable_last_table(ht,
 		rht_dereference_rcu(old_tbl->future_tbl, ht));
-	struct rhash_head __rcu **pprev = rht_bucket_var(old_tbl, old_hash);
-	int err = -EAGAIN;
+	struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash];
+	int err = -ENOENT;
 	struct rhash_head *head, *next, *entry;
 	spinlock_t *new_bucket_lock;
 	unsigned int new_hash;
 
-	if (new_tbl->nest)
-		goto out;
-
-	err = -ENOENT;
-
 	rht_for_each(entry, old_tbl, old_hash) {
 		err = 0;
 		next = rht_dereference_bucket(entry->next, old_tbl, old_hash);
@@ -312,26 +202,19 @@ out:
 	return err;
 }
 
-static int rhashtable_rehash_chain(struct rhashtable *ht,
+static void rhashtable_rehash_chain(struct rhashtable *ht,
 				    unsigned int old_hash)
 {
 	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
 	spinlock_t *old_bucket_lock;
-	int err;
 
 	old_bucket_lock = rht_bucket_lock(old_tbl, old_hash);
 
 	spin_lock_bh(old_bucket_lock);
-	while (!(err = rhashtable_rehash_one(ht, old_hash)))
+	while (!rhashtable_rehash_one(ht, old_hash))
 		;
-
-	if (err == -ENOENT) {
-		old_tbl->rehash++;
-		err = 0;
-	}
+	old_tbl->rehash++;
 	spin_unlock_bh(old_bucket_lock);
-
-	return err;
 }
 
 static int rhashtable_rehash_attach(struct rhashtable *ht,
@@ -363,17 +246,13 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
 	struct bucket_table *new_tbl;
 	struct rhashtable_walker *walker;
 	unsigned int old_hash;
-	int err;
 
 	new_tbl = rht_dereference(old_tbl->future_tbl, ht);
 	if (!new_tbl)
 		return 0;
 
-	for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
-		err = rhashtable_rehash_chain(ht, old_hash);
-		if (err)
-			return err;
-	}
+	for (old_hash = 0; old_hash < old_tbl->size; old_hash++)
+		rhashtable_rehash_chain(ht, old_hash);
 
 	/* Publish the new table pointer. */
 	rcu_assign_pointer(ht->tbl, new_tbl);
@@ -392,16 +271,31 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
 	return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
 }
 
-static int rhashtable_rehash_alloc(struct rhashtable *ht,
-				   struct bucket_table *old_tbl,
-				   unsigned int size)
+/**
+ * rhashtable_expand - Expand hash table while allowing concurrent lookups
+ * @ht:		the hash table to expand
+ *
+ * A secondary bucket array is allocated and the hash entries are migrated.
+ *
+ * This function may only be called in a context where it is safe to call
+ * synchronize_rcu(), e.g. not within a rcu_read_lock() section.
+ *
+ * The caller must ensure that no concurrent resizing occurs by holding
+ * ht->mutex.
+ *
+ * It is valid to have concurrent insertions and deletions protected by per
+ * bucket locks or concurrent RCU protected lookups and traversals.
+ */
+static int rhashtable_expand(struct rhashtable *ht)
 {
-	struct bucket_table *new_tbl;
+	struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
 	int err;
 
 	ASSERT_RHT_MUTEX(ht);
 
-	new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
+	old_tbl = rhashtable_last_table(ht, old_tbl);
+
+	new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, GFP_KERNEL);
 	if (new_tbl == NULL)
 		return -ENOMEM;
 
@@ -430,9 +324,12 @@ static int rhashtable_rehash_alloc(struct rhashtable *ht,
  */
 static int rhashtable_shrink(struct rhashtable *ht)
 {
-	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
+	struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
 	unsigned int nelems = atomic_read(&ht->nelems);
 	unsigned int size = 0;
+	int err;
+
+	ASSERT_RHT_MUTEX(ht);
 
 	if (nelems)
 		size = roundup_pow_of_two(nelems * 3 / 2);
@@ -445,7 +342,15 @@ static int rhashtable_shrink(struct rhashtable *ht)
 	if (rht_dereference(old_tbl->future_tbl, ht))
 		return -EEXIST;
 
-	return rhashtable_rehash_alloc(ht, old_tbl, size);
+	new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
+	if (new_tbl == NULL)
+		return -ENOMEM;
+
+	err = rhashtable_rehash_attach(ht, old_tbl, new_tbl);
+	if (err)
+		bucket_table_free(new_tbl);
+
+	return err;
 }
 
 static void rht_deferred_worker(struct work_struct *work)
@@ -461,14 +366,11 @@ static void rht_deferred_worker(struct work_struct *work)
 	tbl = rhashtable_last_table(ht, tbl);
 
 	if (rht_grow_above_75(ht, tbl))
-		err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2);
+		rhashtable_expand(ht);
 	else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))
-		err = rhashtable_shrink(ht);
-	else if (tbl->nest)
-		err = rhashtable_rehash_alloc(ht, tbl, tbl->size);
+		rhashtable_shrink(ht);
 
-	if (!err)
-		err = rhashtable_rehash_table(ht);
+	err = rhashtable_rehash_table(ht);
 
 	mutex_unlock(&ht->mutex);
 
@@ -537,8 +439,8 @@ static void *rhashtable_lookup_one(struct rhashtable *ht,
 	int elasticity;
 
 	elasticity = ht->elasticity;
-	pprev = rht_bucket_var(tbl, hash);
-	rht_for_each_continue(head, *pprev, tbl, hash) {
+	pprev = &tbl->buckets[hash];
+	rht_for_each(head, tbl, hash) {
 		struct rhlist_head *list;
 		struct rhlist_head *plist;
 
@@ -575,7 +477,6 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
 						  struct rhash_head *obj,
 						  void *data)
 {
-	struct rhash_head __rcu **pprev;
 	struct bucket_table *new_tbl;
 	struct rhash_head *head;
 
@@ -598,11 +499,7 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
 	if (unlikely(rht_grow_above_100(ht, tbl)))
 		return ERR_PTR(-EAGAIN);
 
-	pprev = rht_bucket_insert(ht, tbl, hash);
-	if (!pprev)
-		return ERR_PTR(-ENOMEM);
-
-	head = rht_dereference_bucket(*pprev, tbl, hash);
+	head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
 
 	RCU_INIT_POINTER(obj->next, head);
 	if (ht->rhlist) {
@@ -612,7 +509,7 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
 		RCU_INIT_POINTER(list->next, NULL);
 	}
 
-	rcu_assign_pointer(*pprev, obj);
+	rcu_assign_pointer(tbl->buckets[hash], obj);
 
 	atomic_inc(&ht->nelems);
 	if (rht_grow_above_75(ht, tbl))
@@ -1078,7 +975,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht,
 				 void (*free_fn)(void *ptr, void *arg),
 				 void *arg)
 {
-	struct bucket_table *tbl;
+	const struct bucket_table *tbl;
 	unsigned int i;
 
 	cancel_work_sync(&ht->run_work);
@@ -1089,7 +986,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht,
 		for (i = 0; i < tbl->size; i++) {
 			struct rhash_head *pos, *next;
 
-			for (pos = rht_dereference(*rht_bucket(tbl, i), ht),
+			for (pos = rht_dereference(tbl->buckets[i], ht),
 			     next = !rht_is_a_nulls(pos) ?
 					rht_dereference(pos->next, ht) : NULL;
 			     !rht_is_a_nulls(pos);
@@ -1110,70 +1007,3 @@ void rhashtable_destroy(struct rhashtable *ht)
 	return rhashtable_free_and_destroy(ht, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(rhashtable_destroy);
-
-struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl,
-					    unsigned int hash)
-{
-	const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
-	static struct rhash_head __rcu *rhnull =
-		(struct rhash_head __rcu *)NULLS_MARKER(0);
-	unsigned int index = hash & ((1 << tbl->nest) - 1);
-	unsigned int size = tbl->size >> tbl->nest;
-	unsigned int subhash = hash;
-	union nested_table *ntbl;
-
-	ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);
-	ntbl = rht_dereference_bucket(ntbl[index].table, tbl, hash);
-	subhash >>= tbl->nest;
-
-	while (ntbl && size > (1 << shift)) {
-		index = subhash & ((1 << shift) - 1);
-		ntbl = rht_dereference_bucket(ntbl[index].table, tbl, hash);
-		size >>= shift;
-		subhash >>= shift;
-	}
-
-	if (!ntbl)
-		return &rhnull;
-
-	return &ntbl[subhash].bucket;
-
-}
-EXPORT_SYMBOL_GPL(rht_bucket_nested);
-
-struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht,
-						   struct bucket_table *tbl,
-						   unsigned int hash)
-{
-	const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
-	unsigned int index = hash & ((1 << tbl->nest) - 1);
-	unsigned int size = tbl->size >> tbl->nest;
-	union nested_table *ntbl;
-	unsigned int shifted;
-	unsigned int nhash;
-
-	ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);
-	hash >>= tbl->nest;
-	nhash = index;
-	shifted = tbl->nest;
-	ntbl = nested_table_alloc(ht, &ntbl[index].table,
-				  size <= (1 << shift) ? shifted : 0, nhash);
-
-	while (ntbl && size > (1 << shift)) {
-		index = hash & ((1 << shift) - 1);
-		size >>= shift;
-		hash >>= shift;
-		nhash |= index << shifted;
-		shifted += shift;
-		ntbl = nested_table_alloc(ht, &ntbl[index].table,
-					  size <= (1 << shift) ? shifted : 0,
-					  nhash);
-	}
-
-	if (!ntbl)
-		return NULL;
-
-	return &ntbl[hash].bucket;
-
-}
-EXPORT_SYMBOL_GPL(rht_bucket_nested_insert);
diff --git a/net/tipc/net.c b/net/tipc/net.c
index ab8a2d5d1e32..28bf4feeb81c 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -110,10 +110,6 @@ int tipc_net_start(struct net *net, u32 addr)
 	char addr_string[16];
 
 	tn->own_addr = addr;
-
-	/* Ensure that the new address is visible before we reinit. */
-	smp_mb();
-
 	tipc_named_reinit(net);
 	tipc_sk_reinit(net);
 
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 370a5912bcb5..800caaa699a1 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -384,6 +384,8 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
 	INIT_LIST_HEAD(&tsk->publications);
 	msg = &tsk->phdr;
 	tn = net_generic(sock_net(sk), tipc_net_id);
+	tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG,
+		      NAMED_H_SIZE, 0);
 
 	/* Finish initializing socket data structures */
 	sock->ops = ops;
@@ -393,13 +395,6 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
 		pr_warn("Socket create failed; port number exhausted\n");
 		return -EINVAL;
 	}
-
-	/* Ensure tsk is visible before we read own_addr. */
-	smp_mb();
-
-	tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG,
-		      NAMED_H_SIZE, 0);
-
 	msg_set_origport(msg, tsk->portid);
 	setup_timer(&sk->sk_timer, tipc_sk_timeout, (unsigned long)tsk);
 	sk->sk_shutdown = 0;
@@ -2274,27 +2269,24 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope,
 void tipc_sk_reinit(struct net *net)
 {
 	struct tipc_net *tn = net_generic(net, tipc_net_id);
-	struct rhashtable_iter iter;
+	const struct bucket_table *tbl;
+	struct rhash_head *pos;
 	struct tipc_sock *tsk;
 	struct tipc_msg *msg;
+	int i;
 
-	rhashtable_walk_enter(&tn->sk_rht, &iter);
-
-	do {
-		tsk = ERR_PTR(rhashtable_walk_start(&iter));
-		if (tsk)
-			continue;
-
-		while ((tsk = rhashtable_walk_next(&iter)) && !IS_ERR(tsk)) {
+	rcu_read_lock();
+	tbl = rht_dereference_rcu((&tn->sk_rht)->tbl, &tn->sk_rht);
+	for (i = 0; i < tbl->size; i++) {
+		rht_for_each_entry_rcu(tsk, pos, tbl, i, node) {
 			spin_lock_bh(&tsk->sk.sk_lock.slock);
 			msg = &tsk->phdr;
 			msg_set_prevnode(msg, tn->own_addr);
 			msg_set_orignode(msg, tn->own_addr);
 			spin_unlock_bh(&tsk->sk.sk_lock.slock);
 		}
-
-		rhashtable_walk_stop(&iter);
-	} while (tsk == ERR_PTR(-EAGAIN));
+	}
+	rcu_read_unlock();
 }
 
 static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid)
-- 
cgit v1.2.3


From b802c8410ca915e7772e738e68420115f1a3c811 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 15 Feb 2017 18:42:09 -0800
Subject: async_tx: deprecate broken support for channel switching

Back in 2011, Russell pointed out that the "async_tx channel switch"
capability was violating expectations of the dma mapping api [1]. At the
time the existing uses were reviewed as still usable, but that longer
term we needed a rework of the raid offload implementation. While some
of the framework for a fixed implementation was introduced in 2012 [2],
the wider rewrite never materialized.

There continues to be interest in raid offload with new dma/raid engine
drivers being submitted. Those drivers must not build on top of the
broken channel switching capability.

Prevent async_tx from using an offload engine if the channel switching
capability is enabled. This still allows the engine to be used for other
purposes, but the broken way async_tx uses these engines for raid will
be disabled. For configurations where this causes a performance
regression the only solution is to start the work of eliminating the
async_tx api and moving channel management into the raid code directly
where it can manage marshalling an operation stream between multiple dma
channels.

[1]: http://lists.infradead.org/pipermail/linux-arm-kernel/2011-January/036753.html
[2]: https://lkml.org/lkml/2012/12/6/71

Cc: Anatolij Gustschin <agust@denx.de>
Cc: Anup Patel <anup.patel@broadcom.com>
Cc: Rameshwar Prasad Sahu <rsahu@apm.com>
Cc: Saeed Bishara <saeed.bishara@gmail.com>
Cc: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Reported-by: Russell King <linux@armlinux.org.uk>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 include/linux/async_tx.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h
index 388574ea38ed..28e3cf1465ab 100644
--- a/include/linux/async_tx.h
+++ b/include/linux/async_tx.h
@@ -87,7 +87,7 @@ struct async_submit_ctl {
 	void *scribble;
 };
 
-#ifdef CONFIG_DMA_ENGINE
+#if defined(CONFIG_DMA_ENGINE) && !defined(CONFIG_ASYNC_TX_CHANNEL_SWITCH)
 #define async_tx_issue_pending_all dma_issue_pending_all
 
 /**
-- 
cgit v1.2.3


From 2f44e29cef006a4b0a4ecf7d4c5aac7d0fbb505c Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 14 Feb 2017 22:53:12 +0100
Subject: genirq/msi: Add stubs for get_cached_msi_msg/pci_write_msi_msg

A bug fix to the MSIx handling in vfio added references to functions
that may not be defined if MSI is disabled in the kernel, resulting in
this link error:

drivers/built-in.o: In function `vfio_msi_set_vector_signal':
:(.text+0x450808): undefined reference to `get_cached_msi_msg'
:(.text+0x45080c): undefined reference to `write_msi_msg'

As suggested by Alex Williamson, add stub implementations for
get_cached_msi_msg() and pci_write_msi_msg().

In case this bugfix gets backported, please note that the #ifdef
has changed over time, originally both functions were implemented
in drivers/pci/msi.c and controlled by CONFIG_PCI_MSI, while nowadays
get_cached_msi_msg() is part of the generic MSI support and can be
used without PCI.

Fixes: b8f02af096b1 ("vfio/pci: Restore MSIx message prior to enabling")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Bart Van Assche <bart.vanassche@sandisk.com>
Link: http://lkml.kernel.org/r/1413190208.4202.34.camel@ul30vt.home
Link: http://lkml.kernel.org/r/20170214215343.3307861-1-arnd@arndb.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/msi.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 0db320b7bb15..a83b84ff70e5 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -17,7 +17,13 @@ struct msi_desc;
 struct pci_dev;
 struct platform_msi_priv_data;
 void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
+#ifdef CONFIG_GENERIC_MSI_IRQ
 void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg);
+#else
+static inline void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
+{
+}
+#endif
 
 typedef void (*irq_write_msi_msg_t)(struct msi_desc *desc,
 				    struct msi_msg *msg);
@@ -116,11 +122,15 @@ struct msi_desc {
 
 struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc);
 void *msi_desc_to_pci_sysdata(struct msi_desc *desc);
+void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg);
 #else /* CONFIG_PCI_MSI */
 static inline void *msi_desc_to_pci_sysdata(struct msi_desc *desc)
 {
 	return NULL;
 }
+static inline void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg)
+{
+}
 #endif /* CONFIG_PCI_MSI */
 
 struct msi_desc *alloc_msi_entry(struct device *dev, int nvec,
@@ -128,7 +138,6 @@ struct msi_desc *alloc_msi_entry(struct device *dev, int nvec,
 void free_msi_entry(struct msi_desc *entry);
 void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
 void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
-void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg);
 
 u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag);
 u32 __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag);
-- 
cgit v1.2.3


From bbd6411513aa8ef3ea02abab61318daf87c1af1e Mon Sep 17 00:00:00 2001
From: "Cao, Lei" <Lei.Cao@stratus.com>
Date: Fri, 3 Feb 2017 20:04:35 +0000
Subject: KVM: Support vCPU-based gfn->hva cache
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Provide versions of struct gfn_to_hva_cache functions that
take vcpu as a parameter instead of struct kvm.  The existing functions
are not needed anymore, so delete them.  This allows dirty pages to
be logged in the vcpu dirty ring, instead of the global dirty ring,
for ring-based dirty memory tracking.

Signed-off-by: Lei Cao <lei.cao@stratus.com>
Message-Id: <CY1PR08MB19929BD2AC47A291FD680E83F04F0@CY1PR08MB1992.namprd08.prod.outlook.com>
Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/lapic.c     | 22 ++++++++++------------
 arch/x86/kvm/x86.c       | 41 ++++++++++++++++++++---------------------
 include/linux/kvm_host.h | 16 ++++++++--------
 virt/kvm/kvm_main.c      | 34 +++++++++++++++++-----------------
 4 files changed, 55 insertions(+), 58 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9fa5b8164961..bad6a25067bc 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -529,16 +529,14 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
 
 static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
 {
-
-	return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val,
-				      sizeof(val));
+	return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, &val,
+					   sizeof(val));
 }
 
 static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val)
 {
-
-	return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val,
-				      sizeof(*val));
+	return kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, val,
+					  sizeof(*val));
 }
 
 static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
@@ -2287,8 +2285,8 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
 	if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
 		return;
 
-	if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
-				  sizeof(u32)))
+	if (kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data,
+				       sizeof(u32)))
 		return;
 
 	apic_set_tpr(vcpu->arch.apic, data & 0xff);
@@ -2340,14 +2338,14 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
 		max_isr = 0;
 	data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
 
-	kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
-				sizeof(u32));
+	kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data,
+				    sizeof(u32));
 }
 
 int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
 {
 	if (vapic_addr) {
-		if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
+		if (kvm_vcpu_gfn_to_hva_cache_init(vcpu,
 					&vcpu->arch.apic->vapic_cache,
 					vapic_addr, sizeof(u32)))
 			return -EINVAL;
@@ -2441,7 +2439,7 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
 	vcpu->arch.pv_eoi.msr_val = data;
 	if (!pv_eoi_enabled(vcpu))
 		return 0;
-	return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data,
+	return kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.pv_eoi.data,
 					 addr, sizeof(u8));
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 63a89a51dcc9..0aa8db229e0a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1811,7 +1811,7 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
 	struct kvm_vcpu_arch *vcpu = &v->arch;
 	struct pvclock_vcpu_time_info guest_hv_clock;
 
-	if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
+	if (unlikely(kvm_vcpu_read_guest_cached(v, &vcpu->pv_time,
 		&guest_hv_clock, sizeof(guest_hv_clock))))
 		return;
 
@@ -1832,9 +1832,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
 	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
 
 	vcpu->hv_clock.version = guest_hv_clock.version + 1;
-	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-				&vcpu->hv_clock,
-				sizeof(vcpu->hv_clock.version));
+	kvm_vcpu_write_guest_cached(v, &vcpu->pv_time,
+				    &vcpu->hv_clock,
+				    sizeof(vcpu->hv_clock.version));
 
 	smp_wmb();
 
@@ -1848,16 +1848,16 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
 
 	trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
 
-	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-				&vcpu->hv_clock,
-				sizeof(vcpu->hv_clock));
+	kvm_vcpu_write_guest_cached(v, &vcpu->pv_time,
+				    &vcpu->hv_clock,
+				    sizeof(vcpu->hv_clock));
 
 	smp_wmb();
 
 	vcpu->hv_clock.version++;
-	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-				&vcpu->hv_clock,
-				sizeof(vcpu->hv_clock.version));
+	kvm_vcpu_write_guest_cached(v, &vcpu->pv_time,
+				    &vcpu->hv_clock,
+				    sizeof(vcpu->hv_clock.version));
 }
 
 static int kvm_guest_time_update(struct kvm_vcpu *v)
@@ -2090,7 +2090,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
 		return 0;
 	}
 
-	if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
+	if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.apf.data, gpa,
 					sizeof(u32)))
 		return 1;
 
@@ -2109,7 +2109,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
 	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
 		return;
 
-	if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+	if (unlikely(kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.st.stime,
 		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
 		return;
 
@@ -2120,7 +2120,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.st.steal.version += 1;
 
-	kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+	kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime,
 		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
 
 	smp_wmb();
@@ -2129,14 +2129,14 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
 		vcpu->arch.st.last_steal;
 	vcpu->arch.st.last_steal = current->sched_info.run_delay;
 
-	kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+	kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime,
 		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
 
 	smp_wmb();
 
 	vcpu->arch.st.steal.version += 1;
 
-	kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+	kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime,
 		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
 }
 
@@ -2241,7 +2241,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		if (!(data & 1))
 			break;
 
-		if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
+		if (kvm_vcpu_gfn_to_hva_cache_init(vcpu,
 		     &vcpu->arch.pv_time, data & ~1ULL,
 		     sizeof(struct pvclock_vcpu_time_info)))
 			vcpu->arch.pv_time_enabled = false;
@@ -2262,7 +2262,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		if (data & KVM_STEAL_RESERVED_MASK)
 			return 1;
 
-		if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
+		if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.st.stime,
 						data & KVM_STEAL_VALID_BITS,
 						sizeof(struct kvm_steal_time)))
 			return 1;
@@ -2875,7 +2875,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.st.steal.preempted = 1;
 
-	kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
+	kvm_vcpu_write_guest_offset_cached(vcpu, &vcpu->arch.st.stime,
 			&vcpu->arch.st.steal.preempted,
 			offsetof(struct kvm_steal_time, preempted),
 			sizeof(vcpu->arch.st.steal.preempted));
@@ -8533,9 +8533,8 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
 
 static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
 {
-
-	return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
-				      sizeof(val));
+	return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apf.data, &val,
+					   sizeof(val));
 }
 
 void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index cda457bcedc1..2db458ee94b0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -641,18 +641,18 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
 			  unsigned long len);
 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
-int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
-			   void *data, unsigned long len);
+int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
+			       void *data, unsigned long len);
 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
 			 int offset, int len);
 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
 		    unsigned long len);
-int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
-			   void *data, unsigned long len);
-int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
-			   void *data, int offset, unsigned long len);
-int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
-			      gpa_t gpa, unsigned long len);
+int kvm_vcpu_write_guest_cached(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc,
+				void *data, unsigned long len);
+int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc,
+				       void *data, int offset, unsigned long len);
+int kvm_vcpu_gfn_to_hva_cache_init(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc,
+				   gpa_t gpa, unsigned long len);
 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a83c186cefc1..263a80513ad9 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1981,18 +1981,18 @@ static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
 	return 0;
 }
 
-int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+int kvm_vcpu_gfn_to_hva_cache_init(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
 			      gpa_t gpa, unsigned long len)
 {
-	struct kvm_memslots *slots = kvm_memslots(kvm);
+	struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
 	return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
 }
-EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
+EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva_cache_init);
 
-int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
-			   void *data, int offset, unsigned long len)
+int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
+				       void *data, int offset, unsigned long len)
 {
-	struct kvm_memslots *slots = kvm_memslots(kvm);
+	struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
 	int r;
 	gpa_t gpa = ghc->gpa + offset;
 
@@ -2002,7 +2002,7 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 		__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
 
 	if (unlikely(!ghc->memslot))
-		return kvm_write_guest(kvm, gpa, data, len);
+		return kvm_vcpu_write_guest(vcpu, gpa, data, len);
 
 	if (kvm_is_error_hva(ghc->hva))
 		return -EFAULT;
@@ -2014,19 +2014,19 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
+EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_offset_cached);
 
-int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
-			   void *data, unsigned long len)
+int kvm_vcpu_write_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
+			       void *data, unsigned long len)
 {
-	return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
+	return kvm_vcpu_write_guest_offset_cached(vcpu, ghc, data, 0, len);
 }
-EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
+EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_cached);
 
-int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
-			   void *data, unsigned long len)
+int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
+			       void *data, unsigned long len)
 {
-	struct kvm_memslots *slots = kvm_memslots(kvm);
+	struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
 	int r;
 
 	BUG_ON(len > ghc->len);
@@ -2035,7 +2035,7 @@ int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 		__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
 
 	if (unlikely(!ghc->memslot))
-		return kvm_read_guest(kvm, ghc->gpa, data, len);
+		return kvm_vcpu_read_guest(vcpu, ghc->gpa, data, len);
 
 	if (kvm_is_error_hva(ghc->hva))
 		return -EFAULT;
@@ -2046,7 +2046,7 @@ int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
+EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_cached);
 
 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
 {
-- 
cgit v1.2.3


From 55f0cd3fb9c29c20fb94c47e28a9ec8cf704f8c2 Mon Sep 17 00:00:00 2001
From: H Hartley Sweeten <hsweeten@visionengravers.com>
Date: Thu, 16 Feb 2017 13:07:37 -0700
Subject: spi: spi-ep93xx: simplify GPIO chip selects

This driver requires a GPIO line to be used for the chip select of
each SPI device.

Remove the ep93xx_spi_chip_ops definition from the platform data
and use the spi core GPIO handling for the chip selects.

Fix all the ep93xx platforms that use this driver and remove the
old Documentation.

Signed-off-by: H Hartley Sweeten <hsweeten@visionengravers.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 Documentation/spi/ep93xx_spi             | 105 -----------------------
 arch/arm/mach-ep93xx/edb93xx.c           |  31 ++-----
 arch/arm/mach-ep93xx/simone.c            |  63 +++-----------
 arch/arm/mach-ep93xx/vision_ep9307.c     |  88 +++----------------
 drivers/spi/spi-ep93xx.c                 | 139 ++++++++++---------------------
 include/linux/platform_data/spi-ep93xx.h |  17 +---
 6 files changed, 74 insertions(+), 369 deletions(-)
 delete mode 100644 Documentation/spi/ep93xx_spi

(limited to 'include/linux')

diff --git a/Documentation/spi/ep93xx_spi b/Documentation/spi/ep93xx_spi
deleted file mode 100644
index 832ddce6e5fb..000000000000
--- a/Documentation/spi/ep93xx_spi
+++ /dev/null
@@ -1,105 +0,0 @@
-Cirrus EP93xx SPI controller driver HOWTO
-=========================================
-
-ep93xx_spi driver brings SPI master support for EP93xx SPI controller.  Chip
-selects are implemented with GPIO lines.
-
-NOTE: If possible, don't use SFRMOUT (SFRM1) signal as a chip select. It will
-not work correctly (it cannot be controlled by software). Use GPIO lines
-instead.
-
-Sample configuration
-====================
-
-Typically driver configuration is done in platform board files (the files under
-arch/arm/mach-ep93xx/*.c). In this example we configure MMC over SPI through
-this driver on TS-7260 board. You can adapt the code to suit your needs.
-
-This example uses EGPIO9 as SD/MMC card chip select (this is wired in DIO1
-header on the board).
-
-You need to select CONFIG_MMC_SPI to use mmc_spi driver.
-
-arch/arm/mach-ep93xx/ts72xx.c:
-
-...
-#include <linux/gpio.h>
-#include <linux/spi/spi.h>
-
-#include <linux/platform_data/spi-ep93xx.h>
-
-/* this is our GPIO line used for chip select */
-#define MMC_CHIP_SELECT_GPIO EP93XX_GPIO_LINE_EGPIO9
-
-static int ts72xx_mmc_spi_setup(struct spi_device *spi)
-{
-	int err;
-
-	err = gpio_request(MMC_CHIP_SELECT_GPIO, spi->modalias);
-	if (err)
-		return err;
-
-	gpio_direction_output(MMC_CHIP_SELECT_GPIO, 1);
-
-	return 0;
-}
-
-static void ts72xx_mmc_spi_cleanup(struct spi_device *spi)
-{
-	gpio_set_value(MMC_CHIP_SELECT_GPIO, 1);
-	gpio_direction_input(MMC_CHIP_SELECT_GPIO);
-	gpio_free(MMC_CHIP_SELECT_GPIO);
-}
-
-static void ts72xx_mmc_spi_cs_control(struct spi_device *spi, int value)
-{
-	gpio_set_value(MMC_CHIP_SELECT_GPIO, value);
-}
-
-static struct ep93xx_spi_chip_ops ts72xx_mmc_spi_ops = {
-	.setup		= ts72xx_mmc_spi_setup,
-	.cleanup	= ts72xx_mmc_spi_cleanup,
-	.cs_control	= ts72xx_mmc_spi_cs_control,
-};
-
-static struct spi_board_info ts72xx_spi_devices[] __initdata = {
-	{
-		.modalias		= "mmc_spi",
-		.controller_data	= &ts72xx_mmc_spi_ops,
-		/*
-		 * We use 10 MHz even though the maximum is 7.4 MHz. The driver
-		 * will limit it automatically to max. frequency.
-		 */
-		.max_speed_hz		= 10 * 1000 * 1000,
-		.bus_num		= 0,
-		.chip_select		= 0,
-		.mode			= SPI_MODE_0,
-	},
-};
-
-static struct ep93xx_spi_info ts72xx_spi_info = {
-	.num_chipselect	= ARRAY_SIZE(ts72xx_spi_devices),
-};
-
-static void __init ts72xx_init_machine(void)
-{
-	...
-	ep93xx_register_spi(&ts72xx_spi_info, ts72xx_spi_devices,
-			    ARRAY_SIZE(ts72xx_spi_devices));
-}
-
-The driver can use DMA for the transfers also. In this case ts72xx_spi_info
-becomes:
-
-static struct ep93xx_spi_info ts72xx_spi_info = {
-	.num_chipselect	= ARRAY_SIZE(ts72xx_spi_devices),
-	.use_dma	= true;
-};
-
-Note that CONFIG_EP93XX_DMA should be enabled as well.
-
-Thanks to
-=========
-Martin Guy, H. Hartley Sweeten and others who helped me during development of
-the driver. Simplemachines.it donated me a Sim.One board which I used testing
-the driver on EP9307.
diff --git a/arch/arm/mach-ep93xx/edb93xx.c b/arch/arm/mach-ep93xx/edb93xx.c
index ad92d9f7e4df..0ac176386789 100644
--- a/arch/arm/mach-ep93xx/edb93xx.c
+++ b/arch/arm/mach-ep93xx/edb93xx.c
@@ -27,7 +27,6 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/platform_device.h>
-#include <linux/gpio.h>
 #include <linux/i2c.h>
 #include <linux/i2c-gpio.h>
 #include <linux/spi/spi.h>
@@ -106,33 +105,10 @@ static struct cs4271_platform_data edb93xx_cs4271_data = {
 	.gpio_nreset	= -EINVAL,	/* filled in later */
 };
 
-static int edb93xx_cs4271_hw_setup(struct spi_device *spi)
-{
-	return gpio_request_one(EP93XX_GPIO_LINE_EGPIO6,
-				GPIOF_OUT_INIT_HIGH, spi->modalias);
-}
-
-static void edb93xx_cs4271_hw_cleanup(struct spi_device *spi)
-{
-	gpio_free(EP93XX_GPIO_LINE_EGPIO6);
-}
-
-static void edb93xx_cs4271_hw_cs_control(struct spi_device *spi, int value)
-{
-	gpio_set_value(EP93XX_GPIO_LINE_EGPIO6, value);
-}
-
-static struct ep93xx_spi_chip_ops edb93xx_cs4271_hw = {
-	.setup		= edb93xx_cs4271_hw_setup,
-	.cleanup	= edb93xx_cs4271_hw_cleanup,
-	.cs_control	= edb93xx_cs4271_hw_cs_control,
-};
-
 static struct spi_board_info edb93xx_spi_board_info[] __initdata = {
 	{
 		.modalias		= "cs4271",
 		.platform_data		= &edb93xx_cs4271_data,
-		.controller_data	= &edb93xx_cs4271_hw,
 		.max_speed_hz		= 6000000,
 		.bus_num		= 0,
 		.chip_select		= 0,
@@ -140,8 +116,13 @@ static struct spi_board_info edb93xx_spi_board_info[] __initdata = {
 	},
 };
 
+static int edb93xx_spi_chipselects[] __initdata = {
+	EP93XX_GPIO_LINE_EGPIO6,
+};
+
 static struct ep93xx_spi_info edb93xx_spi_info __initdata = {
-	.num_chipselect	= ARRAY_SIZE(edb93xx_spi_board_info),
+	.chipselect	= edb93xx_spi_chipselects,
+	.num_chipselect	= ARRAY_SIZE(edb93xx_spi_chipselects),
 };
 
 static void __init edb93xx_register_spi(void)
diff --git a/arch/arm/mach-ep93xx/simone.c b/arch/arm/mach-ep93xx/simone.c
index 7bb540c421ee..c7a40f245892 100644
--- a/arch/arm/mach-ep93xx/simone.c
+++ b/arch/arm/mach-ep93xx/simone.c
@@ -48,56 +48,6 @@ static struct ep93xxfb_mach_info __initdata simone_fb_info = {
  */
 #define MMC_CARD_DETECT_GPIO EP93XX_GPIO_LINE_EGPIO0
 
-/*
- * Up to v1.3, the Sim.One used SFRMOUT as SD card chip select, but this goes
- * low between multi-message command blocks. From v1.4, it uses a GPIO instead.
- * v1.3 parts will still work, since the signal on SFRMOUT is automatic.
- */
-#define MMC_CHIP_SELECT_GPIO EP93XX_GPIO_LINE_EGPIO1
-
-/*
- * MMC SPI chip select GPIO handling. If you are using SFRMOUT (SFRM1) signal,
- * you can leave these empty and pass NULL as .controller_data.
- */
-
-static int simone_mmc_spi_setup(struct spi_device *spi)
-{
-	unsigned int gpio = MMC_CHIP_SELECT_GPIO;
-	int err;
-
-	err = gpio_request(gpio, spi->modalias);
-	if (err)
-		return err;
-
-	err = gpio_direction_output(gpio, 1);
-	if (err) {
-		gpio_free(gpio);
-		return err;
-	}
-
-	return 0;
-}
-
-static void simone_mmc_spi_cleanup(struct spi_device *spi)
-{
-	unsigned int gpio = MMC_CHIP_SELECT_GPIO;
-
-	gpio_set_value(gpio, 1);
-	gpio_direction_input(gpio);
-	gpio_free(gpio);
-}
-
-static void simone_mmc_spi_cs_control(struct spi_device *spi, int value)
-{
-	gpio_set_value(MMC_CHIP_SELECT_GPIO, value);
-}
-
-static struct ep93xx_spi_chip_ops simone_mmc_spi_ops = {
-	.setup		= simone_mmc_spi_setup,
-	.cleanup	= simone_mmc_spi_cleanup,
-	.cs_control	= simone_mmc_spi_cs_control,
-};
-
 /*
  * MMC card detection GPIO setup.
  */
@@ -152,7 +102,6 @@ static struct mmc_spi_platform_data simone_mmc_spi_data = {
 static struct spi_board_info simone_spi_devices[] __initdata = {
 	{
 		.modalias		= "mmc_spi",
-		.controller_data	= &simone_mmc_spi_ops,
 		.platform_data		= &simone_mmc_spi_data,
 		/*
 		 * We use 10 MHz even though the maximum is 3.7 MHz. The driver
@@ -165,8 +114,18 @@ static struct spi_board_info simone_spi_devices[] __initdata = {
 	},
 };
 
+/*
+ * Up to v1.3, the Sim.One used SFRMOUT as SD card chip select, but this goes
+ * low between multi-message command blocks. From v1.4, it uses a GPIO instead.
+ * v1.3 parts will still work, since the signal on SFRMOUT is automatic.
+ */
+static int simone_spi_chipselects[] __initdata = {
+	EP93XX_GPIO_LINE_EGPIO1,
+};
+
 static struct ep93xx_spi_info simone_spi_info __initdata = {
-	.num_chipselect	= ARRAY_SIZE(simone_spi_devices),
+	.chipselect	= simone_spi_chipselects,
+	.num_chipselect	= ARRAY_SIZE(simone_spi_chipselects),
 	.use_dma = 1,
 };
 
diff --git a/arch/arm/mach-ep93xx/vision_ep9307.c b/arch/arm/mach-ep93xx/vision_ep9307.c
index 5cced5988498..1daf9441058c 100644
--- a/arch/arm/mach-ep93xx/vision_ep9307.c
+++ b/arch/arm/mach-ep93xx/vision_ep9307.c
@@ -175,33 +175,9 @@ static struct cs4271_platform_data vision_cs4271_data = {
 	.gpio_nreset	= EP93XX_GPIO_LINE_H(2),
 };
 
-static int vision_cs4271_hw_setup(struct spi_device *spi)
-{
-	return gpio_request_one(EP93XX_GPIO_LINE_EGPIO6,
-				GPIOF_OUT_INIT_HIGH, spi->modalias);
-}
-
-static void vision_cs4271_hw_cleanup(struct spi_device *spi)
-{
-	gpio_free(EP93XX_GPIO_LINE_EGPIO6);
-}
-
-static void vision_cs4271_hw_cs_control(struct spi_device *spi, int value)
-{
-	gpio_set_value(EP93XX_GPIO_LINE_EGPIO6, value);
-}
-
-static struct ep93xx_spi_chip_ops vision_cs4271_hw = {
-	.setup		= vision_cs4271_hw_setup,
-	.cleanup	= vision_cs4271_hw_cleanup,
-	.cs_control	= vision_cs4271_hw_cs_control,
-};
-
 /*************************************************************************
  * SPI Flash
  *************************************************************************/
-#define VISION_SPI_FLASH_CS	EP93XX_GPIO_LINE_EGPIO7
-
 static struct mtd_partition vision_spi_flash_partitions[] = {
 	{
 		.name	= "SPI bootstrap",
@@ -224,68 +200,20 @@ static struct flash_platform_data vision_spi_flash_data = {
 	.nr_parts	= ARRAY_SIZE(vision_spi_flash_partitions),
 };
 
-static int vision_spi_flash_hw_setup(struct spi_device *spi)
-{
-	return gpio_request_one(VISION_SPI_FLASH_CS, GPIOF_INIT_HIGH,
-				spi->modalias);
-}
-
-static void vision_spi_flash_hw_cleanup(struct spi_device *spi)
-{
-	gpio_free(VISION_SPI_FLASH_CS);
-}
-
-static void vision_spi_flash_hw_cs_control(struct spi_device *spi, int value)
-{
-	gpio_set_value(VISION_SPI_FLASH_CS, value);
-}
-
-static struct ep93xx_spi_chip_ops vision_spi_flash_hw = {
-	.setup		= vision_spi_flash_hw_setup,
-	.cleanup	= vision_spi_flash_hw_cleanup,
-	.cs_control	= vision_spi_flash_hw_cs_control,
-};
-
 /*************************************************************************
  * SPI SD/MMC host
  *************************************************************************/
-#define VISION_SPI_MMC_CS	EP93XX_GPIO_LINE_G(2)
-#define VISION_SPI_MMC_WP	EP93XX_GPIO_LINE_F(0)
-#define VISION_SPI_MMC_CD	EP93XX_GPIO_LINE_EGPIO15
-
 static struct mmc_spi_platform_data vision_spi_mmc_data = {
 	.detect_delay	= 100,
 	.powerup_msecs	= 100,
 	.ocr_mask	= MMC_VDD_32_33 | MMC_VDD_33_34,
 	.flags		= MMC_SPI_USE_CD_GPIO | MMC_SPI_USE_RO_GPIO,
-	.cd_gpio	= VISION_SPI_MMC_CD,
+	.cd_gpio	= EP93XX_GPIO_LINE_EGPIO15,
 	.cd_debounce	= 1,
-	.ro_gpio	= VISION_SPI_MMC_WP,
+	.ro_gpio	= EP93XX_GPIO_LINE_F(0),
 	.caps2		= MMC_CAP2_RO_ACTIVE_HIGH,
 };
 
-static int vision_spi_mmc_hw_setup(struct spi_device *spi)
-{
-	return gpio_request_one(VISION_SPI_MMC_CS, GPIOF_INIT_HIGH,
-				spi->modalias);
-}
-
-static void vision_spi_mmc_hw_cleanup(struct spi_device *spi)
-{
-	gpio_free(VISION_SPI_MMC_CS);
-}
-
-static void vision_spi_mmc_hw_cs_control(struct spi_device *spi, int value)
-{
-	gpio_set_value(VISION_SPI_MMC_CS, value);
-}
-
-static struct ep93xx_spi_chip_ops vision_spi_mmc_hw = {
-	.setup		= vision_spi_mmc_hw_setup,
-	.cleanup	= vision_spi_mmc_hw_cleanup,
-	.cs_control	= vision_spi_mmc_hw_cs_control,
-};
-
 /*************************************************************************
  * SPI Bus
  *************************************************************************/
@@ -293,7 +221,6 @@ static struct spi_board_info vision_spi_board_info[] __initdata = {
 	{
 		.modalias		= "cs4271",
 		.platform_data		= &vision_cs4271_data,
-		.controller_data	= &vision_cs4271_hw,
 		.max_speed_hz		= 6000000,
 		.bus_num		= 0,
 		.chip_select		= 0,
@@ -301,7 +228,6 @@ static struct spi_board_info vision_spi_board_info[] __initdata = {
 	}, {
 		.modalias		= "sst25l",
 		.platform_data		= &vision_spi_flash_data,
-		.controller_data	= &vision_spi_flash_hw,
 		.max_speed_hz		= 20000000,
 		.bus_num		= 0,
 		.chip_select		= 1,
@@ -309,7 +235,6 @@ static struct spi_board_info vision_spi_board_info[] __initdata = {
 	}, {
 		.modalias		= "mmc_spi",
 		.platform_data		= &vision_spi_mmc_data,
-		.controller_data	= &vision_spi_mmc_hw,
 		.max_speed_hz		= 20000000,
 		.bus_num		= 0,
 		.chip_select		= 2,
@@ -317,8 +242,15 @@ static struct spi_board_info vision_spi_board_info[] __initdata = {
 	},
 };
 
+static int vision_spi_chipselects[] __initdata = {
+	EP93XX_GPIO_LINE_EGPIO6,
+	EP93XX_GPIO_LINE_EGPIO7,
+	EP93XX_GPIO_LINE_G(2),
+};
+
 static struct ep93xx_spi_info vision_spi_master __initdata = {
-	.num_chipselect	= ARRAY_SIZE(vision_spi_board_info),
+	.chipselect	= vision_spi_chipselects,
+	.num_chipselect	= ARRAY_SIZE(vision_spi_chipselects),
 	.use_dma	= 1,
 };
 
diff --git a/drivers/spi/spi-ep93xx.c b/drivers/spi/spi-ep93xx.c
index 17a6387e20b5..b5d766064b7b 100644
--- a/drivers/spi/spi-ep93xx.c
+++ b/drivers/spi/spi-ep93xx.c
@@ -28,6 +28,7 @@
 #include <linux/platform_device.h>
 #include <linux/sched.h>
 #include <linux/scatterlist.h>
+#include <linux/gpio.h>
 #include <linux/spi/spi.h>
 
 #include <linux/platform_data/dma-ep93xx.h>
@@ -107,16 +108,6 @@ struct ep93xx_spi {
 	void				*zeropage;
 };
 
-/**
- * struct ep93xx_spi_chip - SPI device hardware settings
- * @spi: back pointer to the SPI device
- * @ops: private chip operations
- */
-struct ep93xx_spi_chip {
-	const struct spi_device		*spi;
-	struct ep93xx_spi_chip_ops	*ops;
-};
-
 /* converts bits per word to CR0.DSS value */
 #define bits_per_word_to_dss(bpw)	((bpw) - 1)
 
@@ -229,104 +220,36 @@ static int ep93xx_spi_calc_divisors(const struct ep93xx_spi *espi,
 	return -EINVAL;
 }
 
-static void ep93xx_spi_cs_control(struct spi_device *spi, bool control)
-{
-	struct ep93xx_spi_chip *chip = spi_get_ctldata(spi);
-	int value = (spi->mode & SPI_CS_HIGH) ? control : !control;
-
-	if (chip->ops && chip->ops->cs_control)
-		chip->ops->cs_control(spi, value);
-}
-
-/**
- * ep93xx_spi_setup() - setup an SPI device
- * @spi: SPI device to setup
- *
- * This function sets up SPI device mode, speed etc. Can be called multiple
- * times for a single device. Returns %0 in case of success, negative error in
- * case of failure. When this function returns success, the device is
- * deselected.
- */
-static int ep93xx_spi_setup(struct spi_device *spi)
+static void ep93xx_spi_cs_control(struct spi_device *spi, bool enable)
 {
-	struct ep93xx_spi *espi = spi_master_get_devdata(spi->master);
-	struct ep93xx_spi_chip *chip;
+	if (spi->mode & SPI_CS_HIGH)
+		enable = !enable;
 
-	chip = spi_get_ctldata(spi);
-	if (!chip) {
-		dev_dbg(&espi->pdev->dev, "initial setup for %s\n",
-			spi->modalias);
-
-		chip = kzalloc(sizeof(*chip), GFP_KERNEL);
-		if (!chip)
-			return -ENOMEM;
-
-		chip->spi = spi;
-		chip->ops = spi->controller_data;
-
-		if (chip->ops && chip->ops->setup) {
-			int ret = chip->ops->setup(spi);
-
-			if (ret) {
-				kfree(chip);
-				return ret;
-			}
-		}
-
-		spi_set_ctldata(spi, chip);
-	}
-
-	ep93xx_spi_cs_control(spi, false);
-	return 0;
+	if (gpio_is_valid(spi->cs_gpio))
+		gpio_set_value(spi->cs_gpio, !enable);
 }
 
-/**
- * ep93xx_spi_cleanup() - cleans up master controller specific state
- * @spi: SPI device to cleanup
- *
- * This function releases master controller specific state for given @spi
- * device.
- */
-static void ep93xx_spi_cleanup(struct spi_device *spi)
-{
-	struct ep93xx_spi_chip *chip;
-
-	chip = spi_get_ctldata(spi);
-	if (chip) {
-		if (chip->ops && chip->ops->cleanup)
-			chip->ops->cleanup(spi);
-		spi_set_ctldata(spi, NULL);
-		kfree(chip);
-	}
-}
-
-/**
- * ep93xx_spi_chip_setup() - configures hardware according to given @chip
- * @espi: ep93xx SPI controller struct
- * @chip: chip specific settings
- * @speed_hz: transfer speed
- * @bits_per_word: transfer bits_per_word
- */
 static int ep93xx_spi_chip_setup(const struct ep93xx_spi *espi,
-				 const struct ep93xx_spi_chip *chip,
-				 u32 speed_hz, u8 bits_per_word)
+				 struct spi_device *spi,
+				 struct spi_transfer *xfer)
 {
-	u8 dss = bits_per_word_to_dss(bits_per_word);
+	u8 dss = bits_per_word_to_dss(xfer->bits_per_word);
 	u8 div_cpsr = 0;
 	u8 div_scr = 0;
 	u16 cr0;
 	int err;
 
-	err = ep93xx_spi_calc_divisors(espi, speed_hz, &div_cpsr, &div_scr);
+	err = ep93xx_spi_calc_divisors(espi, xfer->speed_hz,
+				       &div_cpsr, &div_scr);
 	if (err)
 		return err;
 
 	cr0 = div_scr << SSPCR0_SCR_SHIFT;
-	cr0 |= (chip->spi->mode & (SPI_CPHA|SPI_CPOL)) << SSPCR0_MODE_SHIFT;
+	cr0 |= (spi->mode & (SPI_CPHA | SPI_CPOL)) << SSPCR0_MODE_SHIFT;
 	cr0 |= dss;
 
 	dev_dbg(&espi->pdev->dev, "setup: mode %d, cpsr %d, scr %d, dss %d\n",
-		chip->spi->mode, div_cpsr, div_scr, dss);
+		spi->mode, div_cpsr, div_scr, dss);
 	dev_dbg(&espi->pdev->dev, "setup: cr0 %#x\n", cr0);
 
 	ep93xx_spi_write_u8(espi, SSPCPSR, div_cpsr);
@@ -603,12 +526,11 @@ static void ep93xx_spi_process_transfer(struct ep93xx_spi *espi,
 					struct spi_message *msg,
 					struct spi_transfer *t)
 {
-	struct ep93xx_spi_chip *chip = spi_get_ctldata(msg->spi);
 	int err;
 
 	msg->state = t;
 
-	err = ep93xx_spi_chip_setup(espi, chip, t->speed_hz, t->bits_per_word);
+	err = ep93xx_spi_chip_setup(espi, msg->spi, t);
 	if (err) {
 		dev_err(&espi->pdev->dev,
 			"failed to setup chip for transfer\n");
@@ -863,8 +785,13 @@ static int ep93xx_spi_probe(struct platform_device *pdev)
 	struct resource *res;
 	int irq;
 	int error;
+	int i;
 
 	info = dev_get_platdata(&pdev->dev);
+	if (!info) {
+		dev_err(&pdev->dev, "missing platform data\n");
+		return -EINVAL;
+	}
 
 	irq = platform_get_irq(pdev, 0);
 	if (irq < 0) {
@@ -882,14 +809,36 @@ static int ep93xx_spi_probe(struct platform_device *pdev)
 	if (!master)
 		return -ENOMEM;
 
-	master->setup = ep93xx_spi_setup;
 	master->transfer_one_message = ep93xx_spi_transfer_one_message;
-	master->cleanup = ep93xx_spi_cleanup;
 	master->bus_num = pdev->id;
-	master->num_chipselect = info->num_chipselect;
 	master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH;
 	master->bits_per_word_mask = SPI_BPW_RANGE_MASK(4, 16);
 
+	master->num_chipselect = info->num_chipselect;
+	master->cs_gpios = devm_kzalloc(&master->dev,
+					sizeof(int) * master->num_chipselect,
+					GFP_KERNEL);
+	if (!master->cs_gpios) {
+		error = -ENOMEM;
+		goto fail_release_master;
+	}
+
+	for (i = 0; i < master->num_chipselect; i++) {
+		master->cs_gpios[i] = info->chipselect[i];
+
+		if (!gpio_is_valid(master->cs_gpios[i]))
+			continue;
+
+		error = devm_gpio_request_one(&pdev->dev, master->cs_gpios[i],
+					      GPIOF_OUT_INIT_HIGH,
+					      "ep93xx-spi");
+		if (error) {
+			dev_err(&pdev->dev, "could not request cs gpio %d\n",
+				master->cs_gpios[i]);
+			goto fail_release_master;
+		}
+	}
+
 	platform_set_drvdata(pdev, master);
 
 	espi = spi_master_get_devdata(master);
diff --git a/include/linux/platform_data/spi-ep93xx.h b/include/linux/platform_data/spi-ep93xx.h
index 9bb63ac13f04..171a271c2cbd 100644
--- a/include/linux/platform_data/spi-ep93xx.h
+++ b/include/linux/platform_data/spi-ep93xx.h
@@ -5,25 +5,14 @@ struct spi_device;
 
 /**
  * struct ep93xx_spi_info - EP93xx specific SPI descriptor
- * @num_chipselect: number of chip selects on this board, must be
- *                  at least one
+ * @chipselect: array of gpio numbers to use as chip selects
+ * @num_chipselect: ARRAY_SIZE(chipselect)
  * @use_dma: use DMA for the transfers
  */
 struct ep93xx_spi_info {
+	int	*chipselect;
 	int	num_chipselect;
 	bool	use_dma;
 };
 
-/**
- * struct ep93xx_spi_chip_ops - operation callbacks for SPI slave device
- * @setup: setup the chip select mechanism
- * @cleanup: cleanup the chip select mechanism
- * @cs_control: control the device chip select
- */
-struct ep93xx_spi_chip_ops {
-	int	(*setup)(struct spi_device *spi);
-	void	(*cleanup)(struct spi_device *spi);
-	void	(*cs_control)(struct spi_device *spi, int value);
-};
-
 #endif /* __ASM_MACH_EP93XX_SPI_H */
-- 
cgit v1.2.3


From bd7e5b0899a429445cc6e3037c13f8b5ae3be903 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 3 Feb 2017 21:18:52 -0800
Subject: KVM: x86: remove code for lazy FPU handling

The FPU is always active now when running KVM.

Reviewed-by: David Matlack <dmatlack@google.com>
Reviewed-by: Bandan Das <bsd@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |   3 --
 arch/x86/kvm/cpuid.c            |   2 -
 arch/x86/kvm/svm.c              |  43 ++-------------
 arch/x86/kvm/vmx.c              | 112 ++++++----------------------------------
 arch/x86/kvm/x86.c              |   7 +--
 include/linux/kvm_host.h        |   1 -
 6 files changed, 19 insertions(+), 149 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e4f13e714bcf..74ef58c8ff53 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -55,7 +55,6 @@
 #define KVM_REQ_TRIPLE_FAULT      10
 #define KVM_REQ_MMU_SYNC          11
 #define KVM_REQ_CLOCK_UPDATE      12
-#define KVM_REQ_DEACTIVATE_FPU    13
 #define KVM_REQ_EVENT             14
 #define KVM_REQ_APF_HALT          15
 #define KVM_REQ_STEAL_UPDATE      16
@@ -936,8 +935,6 @@ struct kvm_x86_ops {
 	unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
 	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
 	u32 (*get_pkru)(struct kvm_vcpu *vcpu);
-	void (*fpu_activate)(struct kvm_vcpu *vcpu);
-	void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
 
 	void (*tlb_flush)(struct kvm_vcpu *vcpu);
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index c0e2036217ad..1d155cc56629 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -123,8 +123,6 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
 	if (best && (best->eax & (F(XSAVES) | F(XSAVEC))))
 		best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
 
-	kvm_x86_ops->fpu_activate(vcpu);
-
 	/*
 	 * The existing code assumes virtual address is 48-bit in the canonical
 	 * address checks; exit if it is ever changed.
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 4e5905a1ce70..d1efe2c62b3f 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1157,7 +1157,6 @@ static void init_vmcb(struct vcpu_svm *svm)
 	struct vmcb_control_area *control = &svm->vmcb->control;
 	struct vmcb_save_area *save = &svm->vmcb->save;
 
-	svm->vcpu.fpu_active = 1;
 	svm->vcpu.arch.hflags = 0;
 
 	set_cr_intercept(svm, INTERCEPT_CR0_READ);
@@ -1899,15 +1898,12 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
 	ulong gcr0 = svm->vcpu.arch.cr0;
 	u64 *hcr0 = &svm->vmcb->save.cr0;
 
-	if (!svm->vcpu.fpu_active)
-		*hcr0 |= SVM_CR0_SELECTIVE_MASK;
-	else
-		*hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
-			| (gcr0 & SVM_CR0_SELECTIVE_MASK);
+	*hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
+		| (gcr0 & SVM_CR0_SELECTIVE_MASK);
 
 	mark_dirty(svm->vmcb, VMCB_CR);
 
-	if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
+	if (gcr0 == *hcr0) {
 		clr_cr_intercept(svm, INTERCEPT_CR0_READ);
 		clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
 	} else {
@@ -1938,8 +1934,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	if (!npt_enabled)
 		cr0 |= X86_CR0_PG | X86_CR0_WP;
 
-	if (!vcpu->fpu_active)
-		cr0 |= X86_CR0_TS;
 	/*
 	 * re-enable caching here because the QEMU bios
 	 * does not do it - this results in some delay at
@@ -2158,22 +2152,6 @@ static int ac_interception(struct vcpu_svm *svm)
 	return 1;
 }
 
-static void svm_fpu_activate(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	clr_exception_intercept(svm, NM_VECTOR);
-
-	svm->vcpu.fpu_active = 1;
-	update_cr0_intercept(svm);
-}
-
-static int nm_interception(struct vcpu_svm *svm)
-{
-	svm_fpu_activate(&svm->vcpu);
-	return 1;
-}
-
 static bool is_erratum_383(void)
 {
 	int err, i;
@@ -2571,9 +2549,6 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
 		if (!npt_enabled && svm->apf_reason == 0)
 			return NESTED_EXIT_HOST;
 		break;
-	case SVM_EXIT_EXCP_BASE + NM_VECTOR:
-		nm_interception(svm);
-		break;
 	default:
 		break;
 	}
@@ -4018,7 +3993,6 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_EXCP_BASE + BP_VECTOR]	= bp_interception,
 	[SVM_EXIT_EXCP_BASE + UD_VECTOR]	= ud_interception,
 	[SVM_EXIT_EXCP_BASE + PF_VECTOR]	= pf_interception,
-	[SVM_EXIT_EXCP_BASE + NM_VECTOR]	= nm_interception,
 	[SVM_EXIT_EXCP_BASE + MC_VECTOR]	= mc_interception,
 	[SVM_EXIT_EXCP_BASE + AC_VECTOR]	= ac_interception,
 	[SVM_EXIT_INTR]				= intr_interception,
@@ -5072,14 +5046,6 @@ static bool svm_has_wbinvd_exit(void)
 	return true;
 }
 
-static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	set_exception_intercept(svm, NM_VECTOR);
-	update_cr0_intercept(svm);
-}
-
 #define PRE_EX(exit)  { .exit_code = (exit), \
 			.stage = X86_ICPT_PRE_EXCEPT, }
 #define POST_EX(exit) { .exit_code = (exit), \
@@ -5340,9 +5306,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 
 	.get_pkru = svm_get_pkru,
 
-	.fpu_activate = svm_fpu_activate,
-	.fpu_deactivate = svm_fpu_deactivate,
-
 	.tlb_flush = svm_flush_tlb,
 
 	.run = svm_vcpu_run,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0e0b5d09597e..9856b73a21ad 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1856,7 +1856,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 	u32 eb;
 
 	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
-	     (1u << NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR);
+	     (1u << DB_VECTOR) | (1u << AC_VECTOR);
 	if ((vcpu->guest_debug &
 	     (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
 	    (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
@@ -1865,8 +1865,6 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 		eb = ~0;
 	if (enable_ept)
 		eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
-	if (vcpu->fpu_active)
-		eb &= ~(1u << NM_VECTOR);
 
 	/* When we are running a nested L2 guest and L1 specified for it a
 	 * certain exception bitmap, we must trap the same exceptions and pass
@@ -2340,25 +2338,6 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 	}
 }
 
-static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
-{
-	ulong cr0;
-
-	if (vcpu->fpu_active)
-		return;
-	vcpu->fpu_active = 1;
-	cr0 = vmcs_readl(GUEST_CR0);
-	cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
-	cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
-	vmcs_writel(GUEST_CR0, cr0);
-	update_exception_bitmap(vcpu);
-	vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
-	if (is_guest_mode(vcpu))
-		vcpu->arch.cr0_guest_owned_bits &=
-			~get_vmcs12(vcpu)->cr0_guest_host_mask;
-	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
-}
-
 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
 
 /*
@@ -2377,33 +2356,6 @@ static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
 		(fields->cr4_read_shadow & fields->cr4_guest_host_mask);
 }
 
-static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
-{
-	/* Note that there is no vcpu->fpu_active = 0 here. The caller must
-	 * set this *before* calling this function.
-	 */
-	vmx_decache_cr0_guest_bits(vcpu);
-	vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
-	update_exception_bitmap(vcpu);
-	vcpu->arch.cr0_guest_owned_bits = 0;
-	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
-	if (is_guest_mode(vcpu)) {
-		/*
-		 * L1's specified read shadow might not contain the TS bit,
-		 * so now that we turned on shadowing of this bit, we need to
-		 * set this bit of the shadow. Like in nested_vmx_run we need
-		 * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet
-		 * up-to-date here because we just decached cr0.TS (and we'll
-		 * only update vmcs12->guest_cr0 on nested exit).
-		 */
-		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-		vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) |
-			(vcpu->arch.cr0 & X86_CR0_TS);
-		vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
-	} else
-		vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
-}
-
 static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
 {
 	unsigned long rflags, save_rflags;
@@ -4232,9 +4184,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	if (enable_ept)
 		ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
 
-	if (!vcpu->fpu_active)
-		hw_cr0 |= X86_CR0_TS | X86_CR0_MP;
-
 	vmcs_writel(CR0_READ_SHADOW, cr0);
 	vmcs_writel(GUEST_CR0, hw_cr0);
 	vcpu->arch.cr0 = cr0;
@@ -5321,7 +5270,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	/* 22.2.1, 20.8.1 */
 	vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl);
 
-	vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
+	vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
+	vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
+
 	set_cr4_guest_host_mask(vmx);
 
 	if (vmx_xsaves_supported())
@@ -5425,7 +5376,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 	vmx_set_cr0(vcpu, cr0); /* enter rmode */
 	vmx_set_cr4(vcpu, 0);
 	vmx_set_efer(vcpu, 0);
-	vmx_fpu_activate(vcpu);
+
 	update_exception_bitmap(vcpu);
 
 	vpid_sync_context(vmx->vpid);
@@ -5698,11 +5649,6 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 	if (is_nmi(intr_info))
 		return 1;  /* already handled by vmx_vcpu_run() */
 
-	if (is_no_device(intr_info)) {
-		vmx_fpu_activate(vcpu);
-		return 1;
-	}
-
 	if (is_invalid_opcode(intr_info)) {
 		if (is_guest_mode(vcpu)) {
 			kvm_queue_exception(vcpu, UD_VECTOR);
@@ -5892,22 +5838,6 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
 		return kvm_set_cr4(vcpu, val);
 }
 
-/* called to set cr0 as appropriate for clts instruction exit. */
-static void handle_clts(struct kvm_vcpu *vcpu)
-{
-	if (is_guest_mode(vcpu)) {
-		/*
-		 * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS
-		 * but we did (!fpu_active). We need to keep GUEST_CR0.TS on,
-		 * just pretend it's off (also in arch.cr0 for fpu_activate).
-		 */
-		vmcs_writel(CR0_READ_SHADOW,
-			vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
-		vcpu->arch.cr0 &= ~X86_CR0_TS;
-	} else
-		vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
-}
-
 static int handle_cr(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification, val;
@@ -5953,9 +5883,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 		}
 		break;
 	case 2: /* clts */
-		handle_clts(vcpu);
+		WARN_ONCE(1, "Guest should always own CR0.TS");
+		vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
 		trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
-		vmx_fpu_activate(vcpu);
 		return kvm_skip_emulated_instruction(vcpu);
 	case 1: /*mov from cr*/
 		switch (cr) {
@@ -10349,8 +10279,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	}
 
 	/*
-	 * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified
-	 * TS bit (for lazy fpu) and bits which we consider mandatory enabled.
+	 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
+	 * bits which we consider mandatory enabled.
 	 * The CR0_READ_SHADOW is what L2 should have expected to read given
 	 * the specifications by L1; It's not enough to take
 	 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we
@@ -10963,24 +10893,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 	vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
 	/*
 	 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
-	 * actually changed, because it depends on the current state of
-	 * fpu_active (which may have changed).
-	 * Note that vmx_set_cr0 refers to efer set above.
+	 * actually changed, because vmx_set_cr0 refers to efer set above.
+	 *
+	 * CR0_GUEST_HOST_MASK is already set in the original vmcs01
+	 * (KVM doesn't change it);
 	 */
+	vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
 	vmx_set_cr0(vcpu, vmcs12->host_cr0);
-	/*
-	 * If we did fpu_activate()/fpu_deactivate() during L2's run, we need
-	 * to apply the same changes to L1's vmcs. We just set cr0 correctly,
-	 * but we also need to update cr0_guest_host_mask and exception_bitmap.
-	 */
-	update_exception_bitmap(vcpu);
-	vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0);
-	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
 
-	/*
-	 * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01
-	 * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask();
-	 */
+	/* Same as above - no reason to call set_cr4_guest_host_mask().  */
 	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
 	kvm_set_cr4(vcpu, vmcs12->host_cr4);
 
@@ -11609,9 +11530,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 
 	.get_pkru = vmx_get_pkru,
 
-	.fpu_activate = vmx_fpu_activate,
-	.fpu_deactivate = vmx_fpu_deactivate,
-
 	.tlb_flush = vmx_flush_tlb,
 
 	.run = vmx_vcpu_run,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8d3047c8cce7..c48404017e4f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6751,10 +6751,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			r = 0;
 			goto out;
 		}
-		if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
-			vcpu->fpu_active = 0;
-			kvm_x86_ops->fpu_deactivate(vcpu);
-		}
 		if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
 			/* Page is swapped out. Do synthetic halt */
 			vcpu->arch.apf.halted = true;
@@ -6856,8 +6852,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	preempt_disable();
 
 	kvm_x86_ops->prepare_guest_switch(vcpu);
-	if (vcpu->fpu_active)
-		kvm_load_guest_fpu(vcpu);
+	kvm_load_guest_fpu(vcpu);
 
 	/*
 	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2db458ee94b0..8d69d5150748 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -221,7 +221,6 @@ struct kvm_vcpu {
 	struct mutex mutex;
 	struct kvm_run *run;
 
-	int fpu_active;
 	int guest_fpu_loaded, guest_xcr0_loaded;
 	struct swait_queue_head wq;
 	struct pid *pid;
-- 
cgit v1.2.3


From da20420f83ea0fbcf3d03afda08d971ea1d8a356 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 11 Feb 2017 19:26:47 +0800
Subject: rhashtable: Add nested tables

This patch adds code that handles GFP_ATOMIC kmalloc failure on
insertion.  As we cannot use vmalloc, we solve it by making our
hash table nested.  That is, we allocate single pages at each level
and reach our desired table size by nesting them.

When a nested table is created, only a single page is allocated
at the top-level.  Lower levels are allocated on demand during
insertion.  Therefore for each insertion to succeed, only two
(non-consecutive) pages are needed.

After a nested table is created, a rehash will be scheduled in
order to switch to a vmalloced table as soon as possible.  Also,
the rehash code will never rehash into a nested table.  If we
detect a nested table during a rehash, the rehash will be aborted
and a new rehash will be scheduled.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable.h |  78 +++++++++----
 lib/rhashtable.c           | 270 ++++++++++++++++++++++++++++++++++++---------
 2 files changed, 276 insertions(+), 72 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 5c132d3188be..f2e12a845910 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -61,6 +61,7 @@ struct rhlist_head {
 /**
  * struct bucket_table - Table of hash buckets
  * @size: Number of hash buckets
+ * @nest: Number of bits of first-level nested table.
  * @rehash: Current bucket being rehashed
  * @hash_rnd: Random seed to fold into hash
  * @locks_mask: Mask to apply before accessing locks[]
@@ -68,10 +69,12 @@ struct rhlist_head {
  * @walkers: List of active walkers
  * @rcu: RCU structure for freeing the table
  * @future_tbl: Table under construction during rehashing
+ * @ntbl: Nested table used when out of memory.
  * @buckets: size * hash buckets
  */
 struct bucket_table {
 	unsigned int		size;
+	unsigned int		nest;
 	unsigned int		rehash;
 	u32			hash_rnd;
 	unsigned int		locks_mask;
@@ -81,7 +84,7 @@ struct bucket_table {
 
 	struct bucket_table __rcu *future_tbl;
 
-	struct rhash_head __rcu	*buckets[] ____cacheline_aligned_in_smp;
+	struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp;
 };
 
 /**
@@ -374,6 +377,12 @@ void rhashtable_free_and_destroy(struct rhashtable *ht,
 				 void *arg);
 void rhashtable_destroy(struct rhashtable *ht);
 
+struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl,
+					    unsigned int hash);
+struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht,
+						   struct bucket_table *tbl,
+						   unsigned int hash);
+
 #define rht_dereference(p, ht) \
 	rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht))
 
@@ -389,6 +398,27 @@ void rhashtable_destroy(struct rhashtable *ht);
 #define rht_entry(tpos, pos, member) \
 	({ tpos = container_of(pos, typeof(*tpos), member); 1; })
 
+static inline struct rhash_head __rcu *const *rht_bucket(
+	const struct bucket_table *tbl, unsigned int hash)
+{
+	return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) :
+				     &tbl->buckets[hash];
+}
+
+static inline struct rhash_head __rcu **rht_bucket_var(
+	struct bucket_table *tbl, unsigned int hash)
+{
+	return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) :
+				     &tbl->buckets[hash];
+}
+
+static inline struct rhash_head __rcu **rht_bucket_insert(
+	struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash)
+{
+	return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) :
+				     &tbl->buckets[hash];
+}
+
 /**
  * rht_for_each_continue - continue iterating over hash chain
  * @pos:	the &struct rhash_head to use as a loop cursor.
@@ -408,7 +438,7 @@ void rhashtable_destroy(struct rhashtable *ht);
  * @hash:	the hash value / bucket index
  */
 #define rht_for_each(pos, tbl, hash) \
-	rht_for_each_continue(pos, (tbl)->buckets[hash], tbl, hash)
+	rht_for_each_continue(pos, *rht_bucket(tbl, hash), tbl, hash)
 
 /**
  * rht_for_each_entry_continue - continue iterating over hash chain
@@ -433,7 +463,7 @@ void rhashtable_destroy(struct rhashtable *ht);
  * @member:	name of the &struct rhash_head within the hashable struct.
  */
 #define rht_for_each_entry(tpos, pos, tbl, hash, member)		\
-	rht_for_each_entry_continue(tpos, pos, (tbl)->buckets[hash],	\
+	rht_for_each_entry_continue(tpos, pos, *rht_bucket(tbl, hash),	\
 				    tbl, hash, member)
 
 /**
@@ -448,13 +478,13 @@ void rhashtable_destroy(struct rhashtable *ht);
  * This hash chain list-traversal primitive allows for the looped code to
  * remove the loop cursor from the list.
  */
-#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member)	    \
-	for (pos = rht_dereference_bucket((tbl)->buckets[hash], tbl, hash), \
-	     next = !rht_is_a_nulls(pos) ?				    \
-		       rht_dereference_bucket(pos->next, tbl, hash) : NULL; \
-	     (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);	    \
-	     pos = next,						    \
-	     next = !rht_is_a_nulls(pos) ?				    \
+#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member)	      \
+	for (pos = rht_dereference_bucket(*rht_bucket(tbl, hash), tbl, hash), \
+	     next = !rht_is_a_nulls(pos) ?				      \
+		       rht_dereference_bucket(pos->next, tbl, hash) : NULL;   \
+	     (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);	      \
+	     pos = next,						      \
+	     next = !rht_is_a_nulls(pos) ?				      \
 		       rht_dereference_bucket(pos->next, tbl, hash) : NULL)
 
 /**
@@ -485,7 +515,7 @@ void rhashtable_destroy(struct rhashtable *ht);
  * traversal is guarded by rcu_read_lock().
  */
 #define rht_for_each_rcu(pos, tbl, hash)				\
-	rht_for_each_rcu_continue(pos, (tbl)->buckets[hash], tbl, hash)
+	rht_for_each_rcu_continue(pos, *rht_bucket(tbl, hash), tbl, hash)
 
 /**
  * rht_for_each_entry_rcu_continue - continue iterating over rcu hash chain
@@ -518,8 +548,8 @@ void rhashtable_destroy(struct rhashtable *ht);
  * the _rcu mutation primitives such as rhashtable_insert() as long as the
  * traversal is guarded by rcu_read_lock().
  */
-#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member)		\
-	rht_for_each_entry_rcu_continue(tpos, pos, (tbl)->buckets[hash],\
+#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member)		   \
+	rht_for_each_entry_rcu_continue(tpos, pos, *rht_bucket(tbl, hash), \
 					tbl, hash, member)
 
 /**
@@ -565,7 +595,7 @@ static inline struct rhash_head *__rhashtable_lookup(
 		.ht = ht,
 		.key = key,
 	};
-	const struct bucket_table *tbl;
+	struct bucket_table *tbl;
 	struct rhash_head *he;
 	unsigned int hash;
 
@@ -697,8 +727,12 @@ slow_path:
 	}
 
 	elasticity = ht->elasticity;
-	pprev = &tbl->buckets[hash];
-	rht_for_each(head, tbl, hash) {
+	pprev = rht_bucket_insert(ht, tbl, hash);
+	data = ERR_PTR(-ENOMEM);
+	if (!pprev)
+		goto out;
+
+	rht_for_each_continue(head, *pprev, tbl, hash) {
 		struct rhlist_head *plist;
 		struct rhlist_head *list;
 
@@ -736,7 +770,7 @@ slow_path:
 	if (unlikely(rht_grow_above_100(ht, tbl)))
 		goto slow_path;
 
-	head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
+	head = rht_dereference_bucket(*pprev, tbl, hash);
 
 	RCU_INIT_POINTER(obj->next, head);
 	if (rhlist) {
@@ -746,7 +780,7 @@ slow_path:
 		RCU_INIT_POINTER(list->next, NULL);
 	}
 
-	rcu_assign_pointer(tbl->buckets[hash], obj);
+	rcu_assign_pointer(*pprev, obj);
 
 	atomic_inc(&ht->nelems);
 	if (rht_grow_above_75(ht, tbl))
@@ -955,8 +989,8 @@ static inline int __rhashtable_remove_fast_one(
 
 	spin_lock_bh(lock);
 
-	pprev = &tbl->buckets[hash];
-	rht_for_each(he, tbl, hash) {
+	pprev = rht_bucket_var(tbl, hash);
+	rht_for_each_continue(he, *pprev, tbl, hash) {
 		struct rhlist_head *list;
 
 		list = container_of(he, struct rhlist_head, rhead);
@@ -1107,8 +1141,8 @@ static inline int __rhashtable_replace_fast(
 
 	spin_lock_bh(lock);
 
-	pprev = &tbl->buckets[hash];
-	rht_for_each(he, tbl, hash) {
+	pprev = rht_bucket_var(tbl, hash);
+	rht_for_each_continue(he, *pprev, tbl, hash) {
 		if (he != obj_old) {
 			pprev = &he->next;
 			continue;
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 32d0ad058380..172454e6b979 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -32,6 +32,11 @@
 #define HASH_MIN_SIZE		4U
 #define BUCKET_LOCKS_PER_CPU	32UL
 
+union nested_table {
+	union nested_table __rcu *table;
+	struct rhash_head __rcu *bucket;
+};
+
 static u32 head_hashfn(struct rhashtable *ht,
 		       const struct bucket_table *tbl,
 		       const struct rhash_head *he)
@@ -76,6 +81,9 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl,
 	/* Never allocate more than 0.5 locks per bucket */
 	size = min_t(unsigned int, size, tbl->size >> 1);
 
+	if (tbl->nest)
+		size = min(size, 1U << tbl->nest);
+
 	if (sizeof(spinlock_t) != 0) {
 		tbl->locks = NULL;
 #ifdef CONFIG_NUMA
@@ -99,8 +107,45 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl,
 	return 0;
 }
 
+static void nested_table_free(union nested_table *ntbl, unsigned int size)
+{
+	const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+	const unsigned int len = 1 << shift;
+	unsigned int i;
+
+	ntbl = rcu_dereference_raw(ntbl->table);
+	if (!ntbl)
+		return;
+
+	if (size > len) {
+		size >>= shift;
+		for (i = 0; i < len; i++)
+			nested_table_free(ntbl + i, size);
+	}
+
+	kfree(ntbl);
+}
+
+static void nested_bucket_table_free(const struct bucket_table *tbl)
+{
+	unsigned int size = tbl->size >> tbl->nest;
+	unsigned int len = 1 << tbl->nest;
+	union nested_table *ntbl;
+	unsigned int i;
+
+	ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);
+
+	for (i = 0; i < len; i++)
+		nested_table_free(ntbl + i, size);
+
+	kfree(ntbl);
+}
+
 static void bucket_table_free(const struct bucket_table *tbl)
 {
+	if (tbl->nest)
+		nested_bucket_table_free(tbl);
+
 	if (tbl)
 		kvfree(tbl->locks);
 
@@ -112,6 +157,59 @@ static void bucket_table_free_rcu(struct rcu_head *head)
 	bucket_table_free(container_of(head, struct bucket_table, rcu));
 }
 
+static union nested_table *nested_table_alloc(struct rhashtable *ht,
+					      union nested_table __rcu **prev,
+					      unsigned int shifted,
+					      unsigned int nhash)
+{
+	union nested_table *ntbl;
+	int i;
+
+	ntbl = rcu_dereference(*prev);
+	if (ntbl)
+		return ntbl;
+
+	ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC);
+
+	if (ntbl && shifted) {
+		for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0].bucket); i++)
+			INIT_RHT_NULLS_HEAD(ntbl[i].bucket, ht,
+					    (i << shifted) | nhash);
+	}
+
+	rcu_assign_pointer(*prev, ntbl);
+
+	return ntbl;
+}
+
+static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht,
+						      size_t nbuckets,
+						      gfp_t gfp)
+{
+	const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+	struct bucket_table *tbl;
+	size_t size;
+
+	if (nbuckets < (1 << (shift + 1)))
+		return NULL;
+
+	size = sizeof(*tbl) + sizeof(tbl->buckets[0]);
+
+	tbl = kzalloc(size, gfp);
+	if (!tbl)
+		return NULL;
+
+	if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets,
+				0, 0)) {
+		kfree(tbl);
+		return NULL;
+	}
+
+	tbl->nest = (ilog2(nbuckets) - 1) % shift + 1;
+
+	return tbl;
+}
+
 static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
 					       size_t nbuckets,
 					       gfp_t gfp)
@@ -126,10 +224,17 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
 		tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY);
 	if (tbl == NULL && gfp == GFP_KERNEL)
 		tbl = vzalloc(size);
+
+	size = nbuckets;
+
+	if (tbl == NULL && gfp != GFP_KERNEL) {
+		tbl = nested_bucket_table_alloc(ht, nbuckets, gfp);
+		nbuckets = 0;
+	}
 	if (tbl == NULL)
 		return NULL;
 
-	tbl->size = nbuckets;
+	tbl->size = size;
 
 	if (alloc_bucket_locks(ht, tbl, gfp) < 0) {
 		bucket_table_free(tbl);
@@ -164,12 +269,17 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
 	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
 	struct bucket_table *new_tbl = rhashtable_last_table(ht,
 		rht_dereference_rcu(old_tbl->future_tbl, ht));
-	struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash];
-	int err = -ENOENT;
+	struct rhash_head __rcu **pprev = rht_bucket_var(old_tbl, old_hash);
+	int err = -EAGAIN;
 	struct rhash_head *head, *next, *entry;
 	spinlock_t *new_bucket_lock;
 	unsigned int new_hash;
 
+	if (new_tbl->nest)
+		goto out;
+
+	err = -ENOENT;
+
 	rht_for_each(entry, old_tbl, old_hash) {
 		err = 0;
 		next = rht_dereference_bucket(entry->next, old_tbl, old_hash);
@@ -202,19 +312,26 @@ out:
 	return err;
 }
 
-static void rhashtable_rehash_chain(struct rhashtable *ht,
+static int rhashtable_rehash_chain(struct rhashtable *ht,
 				    unsigned int old_hash)
 {
 	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
 	spinlock_t *old_bucket_lock;
+	int err;
 
 	old_bucket_lock = rht_bucket_lock(old_tbl, old_hash);
 
 	spin_lock_bh(old_bucket_lock);
-	while (!rhashtable_rehash_one(ht, old_hash))
+	while (!(err = rhashtable_rehash_one(ht, old_hash)))
 		;
-	old_tbl->rehash++;
+
+	if (err == -ENOENT) {
+		old_tbl->rehash++;
+		err = 0;
+	}
 	spin_unlock_bh(old_bucket_lock);
+
+	return err;
 }
 
 static int rhashtable_rehash_attach(struct rhashtable *ht,
@@ -246,13 +363,17 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
 	struct bucket_table *new_tbl;
 	struct rhashtable_walker *walker;
 	unsigned int old_hash;
+	int err;
 
 	new_tbl = rht_dereference(old_tbl->future_tbl, ht);
 	if (!new_tbl)
 		return 0;
 
-	for (old_hash = 0; old_hash < old_tbl->size; old_hash++)
-		rhashtable_rehash_chain(ht, old_hash);
+	for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
+		err = rhashtable_rehash_chain(ht, old_hash);
+		if (err)
+			return err;
+	}
 
 	/* Publish the new table pointer. */
 	rcu_assign_pointer(ht->tbl, new_tbl);
@@ -271,31 +392,16 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
 	return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
 }
 
-/**
- * rhashtable_expand - Expand hash table while allowing concurrent lookups
- * @ht:		the hash table to expand
- *
- * A secondary bucket array is allocated and the hash entries are migrated.
- *
- * This function may only be called in a context where it is safe to call
- * synchronize_rcu(), e.g. not within a rcu_read_lock() section.
- *
- * The caller must ensure that no concurrent resizing occurs by holding
- * ht->mutex.
- *
- * It is valid to have concurrent insertions and deletions protected by per
- * bucket locks or concurrent RCU protected lookups and traversals.
- */
-static int rhashtable_expand(struct rhashtable *ht)
+static int rhashtable_rehash_alloc(struct rhashtable *ht,
+				   struct bucket_table *old_tbl,
+				   unsigned int size)
 {
-	struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
+	struct bucket_table *new_tbl;
 	int err;
 
 	ASSERT_RHT_MUTEX(ht);
 
-	old_tbl = rhashtable_last_table(ht, old_tbl);
-
-	new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, GFP_KERNEL);
+	new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
 	if (new_tbl == NULL)
 		return -ENOMEM;
 
@@ -324,12 +430,9 @@ static int rhashtable_expand(struct rhashtable *ht)
  */
 static int rhashtable_shrink(struct rhashtable *ht)
 {
-	struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
+	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
 	unsigned int nelems = atomic_read(&ht->nelems);
 	unsigned int size = 0;
-	int err;
-
-	ASSERT_RHT_MUTEX(ht);
 
 	if (nelems)
 		size = roundup_pow_of_two(nelems * 3 / 2);
@@ -342,15 +445,7 @@ static int rhashtable_shrink(struct rhashtable *ht)
 	if (rht_dereference(old_tbl->future_tbl, ht))
 		return -EEXIST;
 
-	new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
-	if (new_tbl == NULL)
-		return -ENOMEM;
-
-	err = rhashtable_rehash_attach(ht, old_tbl, new_tbl);
-	if (err)
-		bucket_table_free(new_tbl);
-
-	return err;
+	return rhashtable_rehash_alloc(ht, old_tbl, size);
 }
 
 static void rht_deferred_worker(struct work_struct *work)
@@ -366,11 +461,14 @@ static void rht_deferred_worker(struct work_struct *work)
 	tbl = rhashtable_last_table(ht, tbl);
 
 	if (rht_grow_above_75(ht, tbl))
-		rhashtable_expand(ht);
+		err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2);
 	else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))
-		rhashtable_shrink(ht);
+		err = rhashtable_shrink(ht);
+	else if (tbl->nest)
+		err = rhashtable_rehash_alloc(ht, tbl, tbl->size);
 
-	err = rhashtable_rehash_table(ht);
+	if (!err)
+		err = rhashtable_rehash_table(ht);
 
 	mutex_unlock(&ht->mutex);
 
@@ -439,8 +537,8 @@ static void *rhashtable_lookup_one(struct rhashtable *ht,
 	int elasticity;
 
 	elasticity = ht->elasticity;
-	pprev = &tbl->buckets[hash];
-	rht_for_each(head, tbl, hash) {
+	pprev = rht_bucket_var(tbl, hash);
+	rht_for_each_continue(head, *pprev, tbl, hash) {
 		struct rhlist_head *list;
 		struct rhlist_head *plist;
 
@@ -477,6 +575,7 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
 						  struct rhash_head *obj,
 						  void *data)
 {
+	struct rhash_head __rcu **pprev;
 	struct bucket_table *new_tbl;
 	struct rhash_head *head;
 
@@ -499,7 +598,11 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
 	if (unlikely(rht_grow_above_100(ht, tbl)))
 		return ERR_PTR(-EAGAIN);
 
-	head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
+	pprev = rht_bucket_insert(ht, tbl, hash);
+	if (!pprev)
+		return ERR_PTR(-ENOMEM);
+
+	head = rht_dereference_bucket(*pprev, tbl, hash);
 
 	RCU_INIT_POINTER(obj->next, head);
 	if (ht->rhlist) {
@@ -509,7 +612,7 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
 		RCU_INIT_POINTER(list->next, NULL);
 	}
 
-	rcu_assign_pointer(tbl->buckets[hash], obj);
+	rcu_assign_pointer(*pprev, obj);
 
 	atomic_inc(&ht->nelems);
 	if (rht_grow_above_75(ht, tbl))
@@ -975,7 +1078,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht,
 				 void (*free_fn)(void *ptr, void *arg),
 				 void *arg)
 {
-	const struct bucket_table *tbl;
+	struct bucket_table *tbl;
 	unsigned int i;
 
 	cancel_work_sync(&ht->run_work);
@@ -986,7 +1089,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht,
 		for (i = 0; i < tbl->size; i++) {
 			struct rhash_head *pos, *next;
 
-			for (pos = rht_dereference(tbl->buckets[i], ht),
+			for (pos = rht_dereference(*rht_bucket(tbl, i), ht),
 			     next = !rht_is_a_nulls(pos) ?
 					rht_dereference(pos->next, ht) : NULL;
 			     !rht_is_a_nulls(pos);
@@ -1007,3 +1110,70 @@ void rhashtable_destroy(struct rhashtable *ht)
 	return rhashtable_free_and_destroy(ht, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(rhashtable_destroy);
+
+struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl,
+					    unsigned int hash)
+{
+	const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+	static struct rhash_head __rcu *rhnull =
+		(struct rhash_head __rcu *)NULLS_MARKER(0);
+	unsigned int index = hash & ((1 << tbl->nest) - 1);
+	unsigned int size = tbl->size >> tbl->nest;
+	unsigned int subhash = hash;
+	union nested_table *ntbl;
+
+	ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);
+	ntbl = rht_dereference_bucket(ntbl[index].table, tbl, hash);
+	subhash >>= tbl->nest;
+
+	while (ntbl && size > (1 << shift)) {
+		index = subhash & ((1 << shift) - 1);
+		ntbl = rht_dereference_bucket(ntbl[index].table, tbl, hash);
+		size >>= shift;
+		subhash >>= shift;
+	}
+
+	if (!ntbl)
+		return &rhnull;
+
+	return &ntbl[subhash].bucket;
+
+}
+EXPORT_SYMBOL_GPL(rht_bucket_nested);
+
+struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht,
+						   struct bucket_table *tbl,
+						   unsigned int hash)
+{
+	const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+	unsigned int index = hash & ((1 << tbl->nest) - 1);
+	unsigned int size = tbl->size >> tbl->nest;
+	union nested_table *ntbl;
+	unsigned int shifted;
+	unsigned int nhash;
+
+	ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);
+	hash >>= tbl->nest;
+	nhash = index;
+	shifted = tbl->nest;
+	ntbl = nested_table_alloc(ht, &ntbl[index].table,
+				  size <= (1 << shift) ? shifted : 0, nhash);
+
+	while (ntbl && size > (1 << shift)) {
+		index = hash & ((1 << shift) - 1);
+		size >>= shift;
+		hash >>= shift;
+		nhash |= index << shifted;
+		shifted += shift;
+		ntbl = nested_table_alloc(ht, &ntbl[index].table,
+					  size <= (1 << shift) ? shifted : 0,
+					  nhash);
+	}
+
+	if (!ntbl)
+		return NULL;
+
+	return &ntbl[hash].bucket;
+
+}
+EXPORT_SYMBOL_GPL(rht_bucket_nested_insert);
-- 
cgit v1.2.3


From 9383191da4e40360a5d880fbe6bb03911c61621b Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 16 Feb 2017 22:24:49 +0100
Subject: bpf: remove stubs for cBPF from arch code

Remove the dummy bpf_jit_compile() stubs for eBPF JITs and make
that a single __weak function in the core that can be overridden
similarly to the eBPF one. Also remove stale pr_err() mentions
of bpf_jit_compile.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/arm64/net/bpf_jit_comp.c     |  5 -----
 arch/powerpc/net/bpf_jit_comp64.c |  2 --
 arch/s390/net/bpf_jit_comp.c      |  8 --------
 arch/x86/net/bpf_jit_comp.c       |  8 ++------
 include/linux/filter.h            |  6 +-----
 kernel/bpf/core.c                 | 12 +++++++++++-
 6 files changed, 14 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index b2fc97a2c56c..c444408d5a8c 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -813,11 +813,6 @@ static inline void bpf_flush_icache(void *start, void *end)
 	flush_icache_range((unsigned long)start, (unsigned long)end);
 }
 
-void bpf_jit_compile(struct bpf_prog *prog)
-{
-	/* Nothing to do here. We support Internal BPF. */
-}
-
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 {
 	struct bpf_prog *tmp, *orig_prog = prog;
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 73a5cf18fd84..f9ebd02260da 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -961,8 +961,6 @@ common_load:
 	return 0;
 }
 
-void bpf_jit_compile(struct bpf_prog *fp) { }
-
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 {
 	u32 proglen;
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 167b31b186c1..6454efd22e63 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -1262,14 +1262,6 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp)
 	return 0;
 }
 
-/*
- * Classic BPF function stub. BPF programs will be converted into
- * eBPF and then bpf_int_jit_compile() will be called.
- */
-void bpf_jit_compile(struct bpf_prog *fp)
-{
-}
-
 /*
  * Compile eBPF program "fp"
  */
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index bb660e53cbd6..26123d0ae13a 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1067,13 +1067,13 @@ common_load:
 
 		ilen = prog - temp;
 		if (ilen > BPF_MAX_INSN_SIZE) {
-			pr_err("bpf_jit_compile fatal insn size error\n");
+			pr_err("bpf_jit: fatal insn size error\n");
 			return -EFAULT;
 		}
 
 		if (image) {
 			if (unlikely(proglen + ilen > oldproglen)) {
-				pr_err("bpf_jit_compile fatal error\n");
+				pr_err("bpf_jit: fatal error\n");
 				return -EFAULT;
 			}
 			memcpy(image + proglen, temp, ilen);
@@ -1085,10 +1085,6 @@ common_load:
 	return proglen;
 }
 
-void bpf_jit_compile(struct bpf_prog *prog)
-{
-}
-
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 {
 	struct bpf_binary_header *header = NULL;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index e4eb2546339a..c7a70e0cc3a0 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -607,6 +607,7 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
 u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
+void bpf_jit_compile(struct bpf_prog *prog);
 bool bpf_helper_changes_pkt_data(void *func);
 
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
@@ -625,7 +626,6 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 		     bpf_jit_fill_hole_t bpf_fill_ill_insns);
 void bpf_jit_binary_free(struct bpf_binary_header *hdr);
 
-void bpf_jit_compile(struct bpf_prog *fp);
 void bpf_jit_free(struct bpf_prog *fp);
 
 struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp);
@@ -669,10 +669,6 @@ static inline bool bpf_jit_blinding_enabled(void)
 	return true;
 }
 #else
-static inline void bpf_jit_compile(struct bpf_prog *fp)
-{
-}
-
 static inline void bpf_jit_free(struct bpf_prog *fp)
 {
 	bpf_prog_unlock_free(fp);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index fddd76b1b627..2831ba1e71c1 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1154,12 +1154,22 @@ const struct bpf_func_proto bpf_tail_call_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
-/* For classic BPF JITs that don't implement bpf_int_jit_compile(). */
+/* Stub for JITs that only support cBPF. eBPF programs are interpreted.
+ * It is encouraged to implement bpf_int_jit_compile() instead, so that
+ * eBPF and implicitly also cBPF can get JITed!
+ */
 struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog)
 {
 	return prog;
 }
 
+/* Stub for JITs that support eBPF. All cBPF code gets transformed into
+ * eBPF by the kernel and is later compiled by bpf_int_jit_compile().
+ */
+void __weak bpf_jit_compile(struct bpf_prog *prog)
+{
+}
+
 bool __weak bpf_helper_changes_pkt_data(void *func)
 {
 	return false;
-- 
cgit v1.2.3


From 74451e66d516c55e309e8d89a4a1e7596e46aacd Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 16 Feb 2017 22:24:50 +0100
Subject: bpf: make jited programs visible in traces

Long standing issue with JITed programs is that stack traces from
function tracing check whether a given address is kernel code
through {__,}kernel_text_address(), which checks for code in core
kernel, modules and dynamically allocated ftrace trampolines. But
what is still missing is BPF JITed programs (interpreted programs
are not an issue as __bpf_prog_run() will be attributed to them),
thus when a stack trace is triggered, the code walking the stack
won't see any of the JITed ones. The same for address correlation
done from user space via reading /proc/kallsyms. This is read by
tools like perf, but the latter is also useful for permanent live
tracing with eBPF itself in combination with stack maps when other
eBPF types are part of the callchain. See offwaketime example on
dumping stack from a map.

This work tries to tackle that issue by making the addresses and
symbols known to the kernel. The lookup from *kernel_text_address()
is implemented through a latched RB tree that can be read under
RCU in fast-path that is also shared for symbol/size/offset lookup
for a specific given address in kallsyms. The slow-path iteration
through all symbols in the seq file done via RCU list, which holds
a tiny fraction of all exported ksyms, usually below 0.1 percent.
Function symbols are exported as bpf_prog_<tag>, in order to aide
debugging and attribution. This facility is currently enabled for
root-only when bpf_jit_kallsyms is set to 1, and disabled if hardening
is active in any mode. The rationale behind this is that still a lot
of systems ship with world read permissions on kallsyms thus addresses
should not get suddenly exposed for them. If that situation gets
much better in future, we always have the option to change the
default on this. Likewise, unprivileged programs are not allowed
to add entries there either, but that is less of a concern as most
such programs types relevant in this context are for root-only anyway.
If enabled, call graphs and stack traces will then show a correct
attribution; one example is illustrated below, where the trace is
now visible in tooling such as perf script --kallsyms=/proc/kallsyms
and friends.

Before:

  7fff8166889d bpf_clone_redirect+0x80007f0020ed (/lib/modules/4.9.0-rc8+/build/vmlinux)
         f5d80 __sendmsg_nocancel+0xffff006451f1a007 (/usr/lib64/libc-2.18.so)

After:

  7fff816688b7 bpf_clone_redirect+0x80007f002107 (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fffa0575728 bpf_prog_33c45a467c9e061a+0x8000600020fb (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fffa07ef1fc cls_bpf_classify+0x8000600020dc (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fff81678b68 tc_classify+0x80007f002078 (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fff8164d40b __netif_receive_skb_core+0x80007f0025fb (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fff8164d718 __netif_receive_skb+0x80007f002018 (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fff8164e565 process_backlog+0x80007f002095 (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fff8164dc71 net_rx_action+0x80007f002231 (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fff81767461 __softirqentry_text_start+0x80007f0020d1 (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fff817658ac do_softirq_own_stack+0x80007f00201c (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fff810a2c20 do_softirq+0x80007f002050 (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fff810a2cb5 __local_bh_enable_ip+0x80007f002085 (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fff8168d452 ip_finish_output2+0x80007f002152 (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fff8168ea3d ip_finish_output+0x80007f00217d (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fff8168f2af ip_output+0x80007f00203f (/lib/modules/4.9.0-rc8+/build/vmlinux)
  [...]
  7fff81005854 do_syscall_64+0x80007f002054 (/lib/modules/4.9.0-rc8+/build/vmlinux)
  7fff817649eb return_from_SYSCALL_64+0x80007f002000 (/lib/modules/4.9.0-rc8+/build/vmlinux)
         f5d80 __sendmsg_nocancel+0xffff01c484812007 (/usr/lib64/libc-2.18.so)

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/sysctl/net.txt      |  12 ++
 arch/arm64/net/bpf_jit_comp.c     |  15 ---
 arch/powerpc/net/bpf_jit_comp64.c |   1 +
 arch/s390/net/bpf_jit_comp.c      |  18 ---
 arch/x86/net/bpf_jit_comp.c       |  15 ---
 include/linux/bpf.h               |   4 +
 include/linux/filter.h            | 112 ++++++++++++++++++-
 kernel/bpf/core.c                 | 223 ++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c              |   2 +
 kernel/extable.c                  |   9 +-
 kernel/kallsyms.c                 |  61 +++++++++--
 net/Kconfig                       |   3 +-
 net/core/sysctl_net_core.c        |   7 ++
 13 files changed, 419 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index b80fbd4e5575..2ebabc93014a 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -54,6 +54,18 @@ Values :
 	1 - enable JIT hardening for unprivileged users only
 	2 - enable JIT hardening for all users
 
+bpf_jit_kallsyms
+----------------
+
+When Berkeley Packet Filter Just in Time compiler is enabled, then compiled
+images are unknown addresses to the kernel, meaning they neither show up in
+traces nor in /proc/kallsyms. This enables export of these addresses, which
+can be used for debugging/tracing. If bpf_jit_harden is enabled, this feature
+is disabled.
+Values :
+	0 - disable JIT kallsyms export (default value)
+	1 - enable JIT kallsyms export for privileged users only
+
 dev_weight
 --------------
 
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index c444408d5a8c..05d12104d270 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -910,18 +910,3 @@ out:
 					   tmp : orig_prog);
 	return prog;
 }
-
-void bpf_jit_free(struct bpf_prog *prog)
-{
-	unsigned long addr = (unsigned long)prog->bpf_func & PAGE_MASK;
-	struct bpf_binary_header *header = (void *)addr;
-
-	if (!prog->jited)
-		goto free_filter;
-
-	set_memory_rw(addr, header->pages);
-	bpf_jit_binary_free(header);
-
-free_filter:
-	bpf_prog_unlock_free(prog);
-}
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index f9ebd02260da..c34166ef76fc 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -1064,6 +1064,7 @@ out:
 	return fp;
 }
 
+/* Overriding bpf_jit_free() as we don't set images read-only. */
 void bpf_jit_free(struct bpf_prog *fp)
 {
 	unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK;
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 6454efd22e63..f1d0e62ec1dd 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -1339,21 +1339,3 @@ out:
 					   tmp : orig_fp);
 	return fp;
 }
-
-/*
- * Free eBPF program
- */
-void bpf_jit_free(struct bpf_prog *fp)
-{
-	unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK;
-	struct bpf_binary_header *header = (void *)addr;
-
-	if (!fp->jited)
-		goto free_filter;
-
-	set_memory_rw(addr, header->pages);
-	bpf_jit_binary_free(header);
-
-free_filter:
-	bpf_prog_unlock_free(fp);
-}
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 26123d0ae13a..18a62e208826 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1180,18 +1180,3 @@ out:
 					   tmp : orig_prog);
 	return prog;
 }
-
-void bpf_jit_free(struct bpf_prog *fp)
-{
-	unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK;
-	struct bpf_binary_header *header = (void *)addr;
-
-	if (!fp->jited)
-		goto free_filter;
-
-	set_memory_rw(addr, header->pages);
-	bpf_jit_binary_free(header);
-
-free_filter:
-	bpf_prog_unlock_free(fp);
-}
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 57d60dc5b600..909fc033173a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -8,10 +8,12 @@
 #define _LINUX_BPF_H 1
 
 #include <uapi/linux/bpf.h>
+
 #include <linux/workqueue.h>
 #include <linux/file.h>
 #include <linux/percpu.h>
 #include <linux/err.h>
+#include <linux/rbtree_latch.h>
 
 struct perf_event;
 struct bpf_map;
@@ -177,6 +179,8 @@ struct bpf_prog_aux {
 	atomic_t refcnt;
 	u32 used_map_cnt;
 	u32 max_ctx_offset;
+	struct latch_tree_node ksym_tnode;
+	struct list_head ksym_lnode;
 	const struct bpf_verifier_ops *ops;
 	struct bpf_map **used_maps;
 	struct bpf_prog *prog;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index c7a70e0cc3a0..0c1cc9143cb2 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -54,6 +54,12 @@ struct bpf_prog_aux;
 #define BPF_REG_AX		MAX_BPF_REG
 #define MAX_BPF_JIT_REG		(MAX_BPF_REG + 1)
 
+/* As per nm, we expose JITed images as text (code) section for
+ * kallsyms. That way, tools like perf can find it to match
+ * addresses.
+ */
+#define BPF_SYM_ELF_TYPE	't'
+
 /* BPF program can access up to 512 bytes of stack space. */
 #define MAX_BPF_STACK	512
 
@@ -555,6 +561,11 @@ static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)
 {
 	set_memory_rw((unsigned long)fp, fp->pages);
 }
+
+static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr)
+{
+	set_memory_rw((unsigned long)hdr, hdr->pages);
+}
 #else
 static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
 {
@@ -563,8 +574,21 @@ static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
 static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)
 {
 }
+
+static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr)
+{
+}
 #endif /* CONFIG_DEBUG_SET_MODULE_RONX */
 
+static inline struct bpf_binary_header *
+bpf_jit_binary_hdr(const struct bpf_prog *fp)
+{
+	unsigned long real_start = (unsigned long)fp->bpf_func;
+	unsigned long addr = real_start & PAGE_MASK;
+
+	return (void *)addr;
+}
+
 int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap);
 static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
 {
@@ -617,6 +641,7 @@ void bpf_warn_invalid_xdp_action(u32 act);
 #ifdef CONFIG_BPF_JIT
 extern int bpf_jit_enable;
 extern int bpf_jit_harden;
+extern int bpf_jit_kallsyms;
 
 typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);
 
@@ -651,6 +676,11 @@ static inline bool bpf_jit_is_ebpf(void)
 # endif
 }
 
+static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp)
+{
+	return fp->jited && bpf_jit_is_ebpf();
+}
+
 static inline bool bpf_jit_blinding_enabled(void)
 {
 	/* These are the prerequisites, should someone ever have the
@@ -668,11 +698,91 @@ static inline bool bpf_jit_blinding_enabled(void)
 
 	return true;
 }
-#else
+
+static inline bool bpf_jit_kallsyms_enabled(void)
+{
+	/* There are a couple of corner cases where kallsyms should
+	 * not be enabled f.e. on hardening.
+	 */
+	if (bpf_jit_harden)
+		return false;
+	if (!bpf_jit_kallsyms)
+		return false;
+	if (bpf_jit_kallsyms == 1)
+		return true;
+
+	return false;
+}
+
+const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
+				 unsigned long *off, char *sym);
+bool is_bpf_text_address(unsigned long addr);
+int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+		    char *sym);
+
+static inline const char *
+bpf_address_lookup(unsigned long addr, unsigned long *size,
+		   unsigned long *off, char **modname, char *sym)
+{
+	const char *ret = __bpf_address_lookup(addr, size, off, sym);
+
+	if (ret && modname)
+		*modname = NULL;
+	return ret;
+}
+
+void bpf_prog_kallsyms_add(struct bpf_prog *fp);
+void bpf_prog_kallsyms_del(struct bpf_prog *fp);
+
+#else /* CONFIG_BPF_JIT */
+
+static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp)
+{
+	return false;
+}
+
 static inline void bpf_jit_free(struct bpf_prog *fp)
 {
 	bpf_prog_unlock_free(fp);
 }
+
+static inline bool bpf_jit_kallsyms_enabled(void)
+{
+	return false;
+}
+
+static inline const char *
+__bpf_address_lookup(unsigned long addr, unsigned long *size,
+		     unsigned long *off, char *sym)
+{
+	return NULL;
+}
+
+static inline bool is_bpf_text_address(unsigned long addr)
+{
+	return false;
+}
+
+static inline int bpf_get_kallsym(unsigned int symnum, unsigned long *value,
+				  char *type, char *sym)
+{
+	return -ERANGE;
+}
+
+static inline const char *
+bpf_address_lookup(unsigned long addr, unsigned long *size,
+		   unsigned long *off, char **modname, char *sym)
+{
+	return NULL;
+}
+
+static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp)
+{
+}
+
+static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp)
+{
+}
 #endif /* CONFIG_BPF_JIT */
 
 #define BPF_ANC		BIT(15)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 2831ba1e71c1..f45827e205d3 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -28,6 +28,9 @@
 #include <linux/moduleloader.h>
 #include <linux/bpf.h>
 #include <linux/frame.h>
+#include <linux/rbtree_latch.h>
+#include <linux/kallsyms.h>
+#include <linux/rcupdate.h>
 
 #include <asm/unaligned.h>
 
@@ -95,6 +98,8 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
 	fp->aux = aux;
 	fp->aux->prog = fp;
 
+	INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode);
+
 	return fp;
 }
 EXPORT_SYMBOL_GPL(bpf_prog_alloc);
@@ -290,6 +295,206 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 }
 
 #ifdef CONFIG_BPF_JIT
+static __always_inline void
+bpf_get_prog_addr_region(const struct bpf_prog *prog,
+			 unsigned long *symbol_start,
+			 unsigned long *symbol_end)
+{
+	const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog);
+	unsigned long addr = (unsigned long)hdr;
+
+	WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));
+
+	*symbol_start = addr;
+	*symbol_end   = addr + hdr->pages * PAGE_SIZE;
+}
+
+static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
+{
+	BUILD_BUG_ON(sizeof("bpf_prog_") +
+		     sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN);
+
+	sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
+	sym  = bin2hex(sym, prog->tag, sizeof(prog->tag));
+	*sym = 0;
+}
+
+static __always_inline unsigned long
+bpf_get_prog_addr_start(struct latch_tree_node *n)
+{
+	unsigned long symbol_start, symbol_end;
+	const struct bpf_prog_aux *aux;
+
+	aux = container_of(n, struct bpf_prog_aux, ksym_tnode);
+	bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end);
+
+	return symbol_start;
+}
+
+static __always_inline bool bpf_tree_less(struct latch_tree_node *a,
+					  struct latch_tree_node *b)
+{
+	return bpf_get_prog_addr_start(a) < bpf_get_prog_addr_start(b);
+}
+
+static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n)
+{
+	unsigned long val = (unsigned long)key;
+	unsigned long symbol_start, symbol_end;
+	const struct bpf_prog_aux *aux;
+
+	aux = container_of(n, struct bpf_prog_aux, ksym_tnode);
+	bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end);
+
+	if (val < symbol_start)
+		return -1;
+	if (val >= symbol_end)
+		return  1;
+
+	return 0;
+}
+
+static const struct latch_tree_ops bpf_tree_ops = {
+	.less	= bpf_tree_less,
+	.comp	= bpf_tree_comp,
+};
+
+static DEFINE_SPINLOCK(bpf_lock);
+static LIST_HEAD(bpf_kallsyms);
+static struct latch_tree_root bpf_tree __cacheline_aligned;
+
+int bpf_jit_kallsyms __read_mostly;
+
+static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux)
+{
+	WARN_ON_ONCE(!list_empty(&aux->ksym_lnode));
+	list_add_tail_rcu(&aux->ksym_lnode, &bpf_kallsyms);
+	latch_tree_insert(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops);
+}
+
+static void bpf_prog_ksym_node_del(struct bpf_prog_aux *aux)
+{
+	if (list_empty(&aux->ksym_lnode))
+		return;
+
+	latch_tree_erase(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops);
+	list_del_rcu(&aux->ksym_lnode);
+}
+
+static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
+{
+	return fp->jited && !bpf_prog_was_classic(fp);
+}
+
+static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
+{
+	return list_empty(&fp->aux->ksym_lnode) ||
+	       fp->aux->ksym_lnode.prev == LIST_POISON2;
+}
+
+void bpf_prog_kallsyms_add(struct bpf_prog *fp)
+{
+	unsigned long flags;
+
+	if (!bpf_prog_kallsyms_candidate(fp) ||
+	    !capable(CAP_SYS_ADMIN))
+		return;
+
+	spin_lock_irqsave(&bpf_lock, flags);
+	bpf_prog_ksym_node_add(fp->aux);
+	spin_unlock_irqrestore(&bpf_lock, flags);
+}
+
+void bpf_prog_kallsyms_del(struct bpf_prog *fp)
+{
+	unsigned long flags;
+
+	if (!bpf_prog_kallsyms_candidate(fp))
+		return;
+
+	spin_lock_irqsave(&bpf_lock, flags);
+	bpf_prog_ksym_node_del(fp->aux);
+	spin_unlock_irqrestore(&bpf_lock, flags);
+}
+
+static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr)
+{
+	struct latch_tree_node *n;
+
+	if (!bpf_jit_kallsyms_enabled())
+		return NULL;
+
+	n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops);
+	return n ?
+	       container_of(n, struct bpf_prog_aux, ksym_tnode)->prog :
+	       NULL;
+}
+
+const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
+				 unsigned long *off, char *sym)
+{
+	unsigned long symbol_start, symbol_end;
+	struct bpf_prog *prog;
+	char *ret = NULL;
+
+	rcu_read_lock();
+	prog = bpf_prog_kallsyms_find(addr);
+	if (prog) {
+		bpf_get_prog_addr_region(prog, &symbol_start, &symbol_end);
+		bpf_get_prog_name(prog, sym);
+
+		ret = sym;
+		if (size)
+			*size = symbol_end - symbol_start;
+		if (off)
+			*off  = addr - symbol_start;
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
+bool is_bpf_text_address(unsigned long addr)
+{
+	bool ret;
+
+	rcu_read_lock();
+	ret = bpf_prog_kallsyms_find(addr) != NULL;
+	rcu_read_unlock();
+
+	return ret;
+}
+
+int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+		    char *sym)
+{
+	unsigned long symbol_start, symbol_end;
+	struct bpf_prog_aux *aux;
+	unsigned int it = 0;
+	int ret = -ERANGE;
+
+	if (!bpf_jit_kallsyms_enabled())
+		return ret;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(aux, &bpf_kallsyms, ksym_lnode) {
+		if (it++ != symnum)
+			continue;
+
+		bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end);
+		bpf_get_prog_name(aux->prog, sym);
+
+		*value = symbol_start;
+		*type  = BPF_SYM_ELF_TYPE;
+
+		ret = 0;
+		break;
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
 struct bpf_binary_header *
 bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 		     unsigned int alignment,
@@ -326,6 +531,24 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
 	module_memfree(hdr);
 }
 
+/* This symbol is only overridden by archs that have different
+ * requirements than the usual eBPF JITs, f.e. when they only
+ * implement cBPF JIT, do not set images read-only, etc.
+ */
+void __weak bpf_jit_free(struct bpf_prog *fp)
+{
+	if (fp->jited) {
+		struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
+
+		bpf_jit_binary_unlock_ro(hdr);
+		bpf_jit_binary_free(hdr);
+
+		WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
+	}
+
+	bpf_prog_unlock_free(fp);
+}
+
 int bpf_jit_harden __read_mostly;
 
 static int bpf_jit_blind_insn(const struct bpf_insn *from,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f74ca17af64a..461eb1e66a0f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -707,6 +707,7 @@ void bpf_prog_put(struct bpf_prog *prog)
 {
 	if (atomic_dec_and_test(&prog->aux->refcnt)) {
 		trace_bpf_prog_put_rcu(prog);
+		bpf_prog_kallsyms_del(prog);
 		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
 	}
 }
@@ -903,6 +904,7 @@ static int bpf_prog_load(union bpf_attr *attr)
 		/* failed to allocate fd */
 		goto free_used_maps;
 
+	bpf_prog_kallsyms_add(prog);
 	trace_bpf_prog_load(prog, err);
 	return err;
 
diff --git a/kernel/extable.c b/kernel/extable.c
index e3beec4a2339..bd82117ad424 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -20,6 +20,7 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/init.h>
+#include <linux/filter.h>
 
 #include <asm/sections.h>
 #include <linux/uaccess.h>
@@ -104,6 +105,8 @@ int __kernel_text_address(unsigned long addr)
 		return 1;
 	if (is_ftrace_trampoline(addr))
 		return 1;
+	if (is_bpf_text_address(addr))
+		return 1;
 	/*
 	 * There might be init symbols in saved stacktraces.
 	 * Give those symbols a chance to be printed in
@@ -123,7 +126,11 @@ int kernel_text_address(unsigned long addr)
 		return 1;
 	if (is_module_text_address(addr))
 		return 1;
-	return is_ftrace_trampoline(addr);
+	if (is_ftrace_trampoline(addr))
+		return 1;
+	if (is_bpf_text_address(addr))
+		return 1;
+	return 0;
 }
 
 /*
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index fafd1a3ef0da..6a3b249a2ae1 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -23,6 +23,7 @@
 #include <linux/mm.h>
 #include <linux/ctype.h>
 #include <linux/slab.h>
+#include <linux/filter.h>
 #include <linux/compiler.h>
 
 #include <asm/sections.h>
@@ -300,10 +301,11 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
 				unsigned long *offset)
 {
 	char namebuf[KSYM_NAME_LEN];
+
 	if (is_ksym_addr(addr))
 		return !!get_symbol_pos(addr, symbolsize, offset);
-
-	return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf);
+	return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) ||
+	       !!__bpf_address_lookup(addr, symbolsize, offset, namebuf);
 }
 
 /*
@@ -318,6 +320,8 @@ const char *kallsyms_lookup(unsigned long addr,
 			    unsigned long *offset,
 			    char **modname, char *namebuf)
 {
+	const char *ret;
+
 	namebuf[KSYM_NAME_LEN - 1] = 0;
 	namebuf[0] = 0;
 
@@ -333,9 +337,13 @@ const char *kallsyms_lookup(unsigned long addr,
 		return namebuf;
 	}
 
-	/* See if it's in a module. */
-	return module_address_lookup(addr, symbolsize, offset, modname,
-				     namebuf);
+	/* See if it's in a module or a BPF JITed image. */
+	ret = module_address_lookup(addr, symbolsize, offset,
+				    modname, namebuf);
+	if (!ret)
+		ret = bpf_address_lookup(addr, symbolsize,
+					 offset, modname, namebuf);
+	return ret;
 }
 
 int lookup_symbol_name(unsigned long addr, char *symname)
@@ -471,6 +479,7 @@ EXPORT_SYMBOL(__print_symbol);
 /* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
 struct kallsym_iter {
 	loff_t pos;
+	loff_t pos_mod_end;
 	unsigned long value;
 	unsigned int nameoff; /* If iterating in core kernel symbols. */
 	char type;
@@ -481,13 +490,27 @@ struct kallsym_iter {
 
 static int get_ksymbol_mod(struct kallsym_iter *iter)
 {
-	if (module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value,
-				&iter->type, iter->name, iter->module_name,
-				&iter->exported) < 0)
+	int ret = module_get_kallsym(iter->pos - kallsyms_num_syms,
+				     &iter->value, &iter->type,
+				     iter->name, iter->module_name,
+				     &iter->exported);
+	if (ret < 0) {
+		iter->pos_mod_end = iter->pos;
 		return 0;
+	}
+
 	return 1;
 }
 
+static int get_ksymbol_bpf(struct kallsym_iter *iter)
+{
+	iter->module_name[0] = '\0';
+	iter->exported = 0;
+	return bpf_get_kallsym(iter->pos - iter->pos_mod_end,
+			       &iter->value, &iter->type,
+			       iter->name) < 0 ? 0 : 1;
+}
+
 /* Returns space to next name. */
 static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
 {
@@ -508,16 +531,30 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
 	iter->name[0] = '\0';
 	iter->nameoff = get_symbol_offset(new_pos);
 	iter->pos = new_pos;
+	if (new_pos == 0)
+		iter->pos_mod_end = 0;
+}
+
+static int update_iter_mod(struct kallsym_iter *iter, loff_t pos)
+{
+	iter->pos = pos;
+
+	if (iter->pos_mod_end > 0 &&
+	    iter->pos_mod_end < iter->pos)
+		return get_ksymbol_bpf(iter);
+
+	if (!get_ksymbol_mod(iter))
+		return get_ksymbol_bpf(iter);
+
+	return 1;
 }
 
 /* Returns false if pos at or past end of file. */
 static int update_iter(struct kallsym_iter *iter, loff_t pos)
 {
 	/* Module symbols can be accessed randomly. */
-	if (pos >= kallsyms_num_syms) {
-		iter->pos = pos;
-		return get_ksymbol_mod(iter);
-	}
+	if (pos >= kallsyms_num_syms)
+		return update_iter_mod(iter, pos);
 
 	/* If we're not on the desired position, reset to new position. */
 	if (pos != iter->pos)
diff --git a/net/Kconfig b/net/Kconfig
index f19c0c3b9589..102f781a0131 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -297,7 +297,8 @@ config BPF_JIT
 
 	  Note, admin should enable this feature changing:
 	  /proc/sys/net/core/bpf_jit_enable
-	  /proc/sys/net/core/bpf_jit_harden (optional)
+	  /proc/sys/net/core/bpf_jit_harden   (optional)
+	  /proc/sys/net/core/bpf_jit_kallsyms (optional)
 
 config NET_FLOW_LIMIT
 	bool
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index eaa72eb0399c..4ead336e14ea 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -334,6 +334,13 @@ static struct ctl_table net_core_table[] = {
 		.mode		= 0600,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "bpf_jit_kallsyms",
+		.data		= &bpf_jit_kallsyms,
+		.maxlen		= sizeof(int),
+		.mode		= 0600,
+		.proc_handler	= proc_dointvec,
+	},
 # endif
 #endif
 	{
-- 
cgit v1.2.3


From 4f1244c8298606b8fae64b4d78b820ae6b896e3c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 17 Feb 2017 13:59:39 +0100
Subject: block/sed-opal: allocate struct opal_dev dynamically

Insted of bloating the containing structure with it all the time this
allocates struct opal_dev dynamically.  Additionally this allows moving
the definition of struct opal_dev into sed-opal.c.  For this a new
private data field is added to it that is passed to the send/receive
callback.  After that a lot of internals can be made private as well.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Scott Bauer <scott.bauer@intel.com>
Reviewed-by: Scott Bauer <scott.bauer@intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/opal_proto.h       |  23 ++++++++++
 block/sed-opal.c         | 101 +++++++++++++++++++++++++++++++++++++----
 drivers/nvme/host/core.c |   9 ++--
 drivers/nvme/host/nvme.h |  14 ++----
 drivers/nvme/host/pci.c  |   8 +++-
 include/linux/sed-opal.h | 116 ++---------------------------------------------
 6 files changed, 131 insertions(+), 140 deletions(-)

(limited to 'include/linux')

diff --git a/block/opal_proto.h b/block/opal_proto.h
index af9abc56c157..f40c9acf8895 100644
--- a/block/opal_proto.h
+++ b/block/opal_proto.h
@@ -19,6 +19,29 @@
 #ifndef _OPAL_PROTO_H
 #define _OPAL_PROTO_H
 
+/*
+ * These constant values come from:
+ * SPC-4 section
+ * 6.30 SECURITY PROTOCOL IN command / table 265.
+ */
+enum {
+	TCG_SECP_00 = 0,
+	TCG_SECP_01,
+};
+
+/*
+ * Token defs derived from:
+ * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00
+ * 3.2.2 Data Stream Encoding
+ */
+enum opal_response_token {
+	OPAL_DTA_TOKENID_BYTESTRING = 0xe0,
+	OPAL_DTA_TOKENID_SINT = 0xe1,
+	OPAL_DTA_TOKENID_UINT = 0xe2,
+	OPAL_DTA_TOKENID_TOKEN = 0xe3, /* actual token is returned */
+	OPAL_DTA_TOKENID_INVALID = 0X0
+};
+
 #define DTAERROR_NO_METHOD_STATUS 0x89
 #define GENERIC_HOST_SESSION_NUM 0x41
 
diff --git a/block/sed-opal.c b/block/sed-opal.c
index bcdd5b6d02e8..d1c52ba4d62d 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -31,6 +31,77 @@
 
 #include "opal_proto.h"
 
+#define IO_BUFFER_LENGTH 2048
+#define MAX_TOKS 64
+
+typedef int (*opal_step)(struct opal_dev *dev);
+
+enum opal_atom_width {
+	OPAL_WIDTH_TINY,
+	OPAL_WIDTH_SHORT,
+	OPAL_WIDTH_MEDIUM,
+	OPAL_WIDTH_LONG,
+	OPAL_WIDTH_TOKEN
+};
+
+/*
+ * On the parsed response, we don't store again the toks that are already
+ * stored in the response buffer. Instead, for each token, we just store a
+ * pointer to the position in the buffer where the token starts, and the size
+ * of the token in bytes.
+ */
+struct opal_resp_tok {
+	const u8 *pos;
+	size_t len;
+	enum opal_response_token type;
+	enum opal_atom_width width;
+	union {
+		u64 u;
+		s64 s;
+	} stored;
+};
+
+/*
+ * From the response header it's not possible to know how many tokens there are
+ * on the payload. So we hardcode that the maximum will be MAX_TOKS, and later
+ * if we start dealing with messages that have more than that, we can increase
+ * this number. This is done to avoid having to make two passes through the
+ * response, the first one counting how many tokens we have and the second one
+ * actually storing the positions.
+ */
+struct parsed_resp {
+	int num;
+	struct opal_resp_tok toks[MAX_TOKS];
+};
+
+struct opal_dev {
+	bool supported;
+
+	void *data;
+	sec_send_recv *send_recv;
+
+	const opal_step *funcs;
+	void **func_data;
+	int state;
+	struct mutex dev_lock;
+	u16 comid;
+	u32 hsn;
+	u32 tsn;
+	u64 align;
+	u64 lowest_lba;
+
+	size_t pos;
+	u8 cmd[IO_BUFFER_LENGTH];
+	u8 resp[IO_BUFFER_LENGTH];
+
+	struct parsed_resp parsed;
+	size_t prev_d_len;
+	void *prev_data;
+
+	struct list_head unlk_lst;
+};
+
+
 static const u8 opaluid[][OPAL_UID_LENGTH] = {
 	/* users */
 	[OPAL_SMUID_UID] =
@@ -243,14 +314,14 @@ static u16 get_comid_v200(const void *data)
 
 static int opal_send_cmd(struct opal_dev *dev)
 {
-	return dev->send_recv(dev, dev->comid, TCG_SECP_01,
+	return dev->send_recv(dev->data, dev->comid, TCG_SECP_01,
 			      dev->cmd, IO_BUFFER_LENGTH,
 			      true);
 }
 
 static int opal_recv_cmd(struct opal_dev *dev)
 {
-	return dev->send_recv(dev, dev->comid, TCG_SECP_01,
+	return dev->send_recv(dev->data, dev->comid, TCG_SECP_01,
 			      dev->resp, IO_BUFFER_LENGTH,
 			      false);
 }
@@ -1943,16 +2014,24 @@ static int check_opal_support(struct opal_dev *dev)
 	return ret;
 }
 
-void init_opal_dev(struct opal_dev *opal_dev, sec_send_recv *send_recv)
+struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv)
 {
-	if (opal_dev->initialized)
-		return;
-	INIT_LIST_HEAD(&opal_dev->unlk_lst);
-	mutex_init(&opal_dev->dev_lock);
-	opal_dev->send_recv = send_recv;
-	if (check_opal_support(opal_dev) < 0)
+	struct opal_dev *dev;
+
+	dev = kmalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		return NULL;
+
+	INIT_LIST_HEAD(&dev->unlk_lst);
+	mutex_init(&dev->dev_lock);
+	dev->data = data;
+	dev->send_recv = send_recv;
+	if (check_opal_support(dev) != 0) {
 		pr_debug("Opal is not supported on this device\n");
-	opal_dev->initialized = true;
+		kfree(dev);
+		return NULL;
+	}
+	return dev;
 }
 EXPORT_SYMBOL(init_opal_dev);
 
@@ -2351,6 +2430,8 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
+	if (!dev)
+		return -ENOTSUPP;
 	if (!dev->supported) {
 		pr_err("Not supported\n");
 		return -ENOTSUPP;
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index b92a79281611..f6b56a12457a 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -789,7 +789,7 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
 			return nvme_nvm_ioctl(ns, cmd, arg);
 #endif
 		if (is_sed_ioctl(cmd))
-			return sed_ioctl(&ns->ctrl->opal_dev, cmd,
+			return sed_ioctl(ns->ctrl->opal_dev, cmd,
 					 (void __user *) arg);
 		return -ENOTTY;
 	}
@@ -1059,18 +1059,17 @@ static const struct pr_ops nvme_pr_ops = {
 };
 
 #ifdef CONFIG_BLK_SED_OPAL
-int nvme_sec_submit(struct opal_dev *dev, u16 spsp, u8 secp,
-		    void *buffer, size_t len, bool send)
+int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
+		bool send)
 {
+	struct nvme_ctrl *ctrl = data;
 	struct nvme_command cmd;
-	struct nvme_ctrl *ctrl = NULL;
 
 	memset(&cmd, 0, sizeof(cmd));
 	if (send)
 		cmd.common.opcode = nvme_admin_security_send;
 	else
 		cmd.common.opcode = nvme_admin_security_recv;
-	ctrl = container_of(dev, struct nvme_ctrl, opal_dev);
 	cmd.common.nsid = 0;
 	cmd.common.cdw10[0] = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
 	cmd.common.cdw10[1] = cpu_to_le32(len);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 1aa353a848e3..fd94c94ccbcb 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -126,7 +126,7 @@ struct nvme_ctrl {
 	struct list_head node;
 	struct ida ns_ida;
 
-	struct opal_dev opal_dev;
+	struct opal_dev *opal_dev;
 
 	char name[12];
 	char serial[20];
@@ -278,16 +278,8 @@ int nvme_init_identify(struct nvme_ctrl *ctrl);
 void nvme_queue_scan(struct nvme_ctrl *ctrl);
 void nvme_remove_namespaces(struct nvme_ctrl *ctrl);
 
-#ifdef CONFIG_BLK_SED_OPAL
-int nvme_sec_submit(struct opal_dev *dev, u16 spsp, u8 secp,
-		    void *buffer, size_t len, bool send);
-#else
-static inline int nvme_sec_submit(struct opal_dev *dev, u16 spsp, u8 secp,
-				  void *buffer, size_t len, bool send)
-{
-	return 0;
-}
-#endif /* CONFIG_BLK_DEV_SED_OPAL */
+int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
+		bool send);
 
 #define NVME_NR_AERS	1
 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index f08e86e73dda..50b070528c50 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1742,6 +1742,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
 	if (dev->ctrl.admin_q)
 		blk_put_queue(dev->ctrl.admin_q);
 	kfree(dev->queues);
+	kfree(dev->ctrl.opal_dev);
 	kfree(dev);
 }
 
@@ -1791,10 +1792,13 @@ static void nvme_reset_work(struct work_struct *work)
 	if (result)
 		goto out;
 
-	init_opal_dev(&dev->ctrl.opal_dev, &nvme_sec_submit);
+	if (!dev->ctrl.opal_dev) {
+		dev->ctrl.opal_dev =
+			init_opal_dev(&dev->ctrl, &nvme_sec_submit);
+	}
 
 	if (was_suspend)
-		opal_unlock_from_suspend(&dev->ctrl.opal_dev);
+		opal_unlock_from_suspend(dev->ctrl.opal_dev);
 
 	result = nvme_setup_io_queues(dev);
 	if (result)
diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h
index 205d520ea688..deee23d012e7 100644
--- a/include/linux/sed-opal.h
+++ b/include/linux/sed-opal.h
@@ -21,117 +21,14 @@
 #include <uapi/linux/sed-opal.h>
 #include <linux/kernel.h>
 
-/*
- * These constant values come from:
- * SPC-4 section
- * 6.30 SECURITY PROTOCOL IN command / table 265.
- */
-enum {
-	TCG_SECP_00 = 0,
-	TCG_SECP_01,
-};
 struct opal_dev;
 
-#define IO_BUFFER_LENGTH 2048
-#define MAX_TOKS 64
-
-typedef int (*opal_step)(struct opal_dev *dev);
-typedef int (sec_send_recv)(struct opal_dev *ctx, u16 spsp, u8 secp,
-			    void *buffer, size_t len, bool send);
-
-
-enum opal_atom_width {
-	OPAL_WIDTH_TINY,
-	OPAL_WIDTH_SHORT,
-	OPAL_WIDTH_MEDIUM,
-	OPAL_WIDTH_LONG,
-	OPAL_WIDTH_TOKEN
-};
-
-/*
- * Token defs derived from:
- * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00
- * 3.2.2 Data Stream Encoding
- */
-enum opal_response_token {
-	OPAL_DTA_TOKENID_BYTESTRING = 0xe0,
-	OPAL_DTA_TOKENID_SINT = 0xe1,
-	OPAL_DTA_TOKENID_UINT = 0xe2,
-	OPAL_DTA_TOKENID_TOKEN = 0xe3, /* actual token is returned */
-	OPAL_DTA_TOKENID_INVALID = 0X0
-};
-
-/*
- * On the parsed response, we don't store again the toks that are already
- * stored in the response buffer. Instead, for each token, we just store a
- * pointer to the position in the buffer where the token starts, and the size
- * of the token in bytes.
- */
-struct opal_resp_tok {
-	const u8 *pos;
-	size_t len;
-	enum opal_response_token type;
-	enum opal_atom_width width;
-	union {
-		u64 u;
-		s64 s;
-	} stored;
-};
-
-/*
- * From the response header it's not possible to know how many tokens there are
- * on the payload. So we hardcode that the maximum will be MAX_TOKS, and later
- * if we start dealing with messages that have more than that, we can increase
- * this number. This is done to avoid having to make two passes through the
- * response, the first one counting how many tokens we have and the second one
- * actually storing the positions.
- */
-struct parsed_resp {
-	int num;
-	struct opal_resp_tok toks[MAX_TOKS];
-};
-
-/**
- * struct opal_dev - The structure representing a OPAL enabled SED.
- * @supported: Whether or not OPAL is supported on this controller.
- * @send_recv: The combined sec_send/sec_recv function pointer.
- * @opal_step: A series of opal methods that are necessary to complete a command.
- * @func_data: An array of parameters for the opal methods above.
- * @state: Describes the current opal_step we're working on.
- * @dev_lock: Locks the entire opal_dev structure.
- * @parsed: Parsed response from controller.
- * @prev_data: Data returned from a method to the controller.
- * @unlk_lst: A list of Locking ranges to unlock on this device during a resume.
- */
-struct opal_dev {
-	bool initialized;
-	bool supported;
-	sec_send_recv *send_recv;
-
-	const opal_step *funcs;
-	void **func_data;
-	int state;
-	struct mutex dev_lock;
-	u16 comid;
-	u32 hsn;
-	u32 tsn;
-	u64 align;
-	u64 lowest_lba;
-
-	size_t pos;
-	u8 cmd[IO_BUFFER_LENGTH];
-	u8 resp[IO_BUFFER_LENGTH];
-
-	struct parsed_resp parsed;
-	size_t prev_d_len;
-	void *prev_data;
-
-	struct list_head unlk_lst;
-};
+typedef int (sec_send_recv)(void *data, u16 spsp, u8 secp, void *buffer,
+		size_t len, bool send);
 
 #ifdef CONFIG_BLK_SED_OPAL
 bool opal_unlock_from_suspend(struct opal_dev *dev);
-void init_opal_dev(struct opal_dev *opal_dev, sec_send_recv *send_recv);
+struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv);
 int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *ioctl_ptr);
 
 static inline bool is_sed_ioctl(unsigned int cmd)
@@ -168,11 +65,6 @@ static inline bool opal_unlock_from_suspend(struct opal_dev *dev)
 {
 	return false;
 }
-static inline void init_opal_dev(struct opal_dev *opal_dev,
-				 sec_send_recv *send_recv)
-{
-	opal_dev->supported = false;
-	opal_dev->initialized = true;
-}
+#define init_opal_dev(data, send_recv)		NULL
 #endif /* CONFIG_BLK_SED_OPAL */
 #endif /* LINUX_OPAL_H */
-- 
cgit v1.2.3


From 8a9ae523282f324989850fcf41312b42a2fb9296 Mon Sep 17 00:00:00 2001
From: Scott Bauer <scott.bauer@intel.com>
Date: Fri, 17 Feb 2017 13:59:40 +0100
Subject: nvme: Check for Security send/recv support before issuing commands.

We need to verify that the controller supports the security
commands before actually trying to issue them.

Signed-off-by: Scott Bauer <scott.bauer@intel.com>
[hch: moved the check so that we don't call into the OPAL code if not
      supported]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/core.c | 1 +
 drivers/nvme/host/nvme.h | 1 +
 drivers/nvme/host/pci.c  | 2 +-
 include/linux/nvme.h     | 1 +
 4 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f6b56a12457a..b4e743d1c0b4 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1264,6 +1264,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 		return -EIO;
 	}
 
+	ctrl->oacs = le16_to_cpu(id->oacs);
 	ctrl->vid = le16_to_cpu(id->vid);
 	ctrl->oncs = le16_to_cpup(&id->oncs);
 	atomic_set(&ctrl->abort_limit, id->acl + 1);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index fd94c94ccbcb..b0977229e219 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -140,6 +140,7 @@ struct nvme_ctrl {
 	u32 max_hw_sectors;
 	u16 oncs;
 	u16 vid;
+	u16 oacs;
 	atomic_t abort_limit;
 	u8 event_limit;
 	u8 vwc;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 50b070528c50..85896d46aebc 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1792,7 +1792,7 @@ static void nvme_reset_work(struct work_struct *work)
 	if (result)
 		goto out;
 
-	if (!dev->ctrl.opal_dev) {
+	if ((dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) && !dev->ctrl.opal_dev) {
 		dev->ctrl.opal_dev =
 			init_opal_dev(&dev->ctrl, &nvme_sec_submit);
 	}
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 3d1c6f1b15c9..00eac863a9c7 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -244,6 +244,7 @@ enum {
 	NVME_CTRL_ONCS_DSM			= 1 << 2,
 	NVME_CTRL_ONCS_WRITE_ZEROES		= 1 << 3,
 	NVME_CTRL_VWC_PRESENT			= 1 << 0,
+	NVME_CTRL_OACS_SEC_SUPP                 = 1 << 0,
 };
 
 struct nvme_lbaf {
-- 
cgit v1.2.3


From 1e128c81290a419ab9ec8b09fe989f1c6c15a0f4 Mon Sep 17 00:00:00 2001
From: Arun Easi <arun.easi@qlogic.com>
Date: Wed, 15 Feb 2017 06:28:22 -0800
Subject: qed: Add support for hardware offloaded FCoE.

This adds the backbone required for the various HW initalizations
which are necessary for the FCoE driver (qedf) for QLogic FastLinQ
4xxxx line of adapters - FW notification, resource initializations, etc.

Signed-off-by: Arun Easi <arun.easi@cavium.com>
Signed-off-by: Yuval Mintz <yuval.mintz@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/Kconfig               |    3 +
 drivers/net/ethernet/qlogic/qed/Makefile          |    1 +
 drivers/net/ethernet/qlogic/qed/qed.h             |   11 +
 drivers/net/ethernet/qlogic/qed/qed_cxt.c         |   98 +-
 drivers/net/ethernet/qlogic/qed/qed_cxt.h         |    3 +
 drivers/net/ethernet/qlogic/qed/qed_dcbx.c        |   13 +-
 drivers/net/ethernet/qlogic/qed/qed_dcbx.h        |    5 +-
 drivers/net/ethernet/qlogic/qed/qed_dev.c         |  205 ++++-
 drivers/net/ethernet/qlogic/qed/qed_dev_api.h     |   42 +
 drivers/net/ethernet/qlogic/qed/qed_fcoe.c        | 1014 +++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_fcoe.h        |   87 ++
 drivers/net/ethernet/qlogic/qed/qed_hsi.h         |  781 +++++++++++++++-
 drivers/net/ethernet/qlogic/qed/qed_hw.c          |    3 +
 drivers/net/ethernet/qlogic/qed/qed_ll2.c         |   25 +
 drivers/net/ethernet/qlogic/qed/qed_ll2.h         |    2 +-
 drivers/net/ethernet/qlogic/qed/qed_main.c        |    7 +
 drivers/net/ethernet/qlogic/qed/qed_mcp.c         |    3 +
 drivers/net/ethernet/qlogic/qed/qed_mcp.h         |    1 +
 drivers/net/ethernet/qlogic/qed/qed_reg_addr.h    |    8 +
 drivers/net/ethernet/qlogic/qed/qed_sp.h          |    4 +
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c |    3 +
 include/linux/qed/common_hsi.h                    |   10 +-
 include/linux/qed/fcoe_common.h                   |  715 +++++++++++++++
 include/linux/qed/qed_fcoe_if.h                   |  145 +++
 include/linux/qed/qed_if.h                        |   41 +-
 25 files changed, 3211 insertions(+), 19 deletions(-)
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_fcoe.c
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_fcoe.h
 create mode 100644 include/linux/qed/fcoe_common.h
 create mode 100644 include/linux/qed/qed_fcoe_if.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/Kconfig b/drivers/net/ethernet/qlogic/Kconfig
index aaa1e8517348..c2e24afbaeb2 100644
--- a/drivers/net/ethernet/qlogic/Kconfig
+++ b/drivers/net/ethernet/qlogic/Kconfig
@@ -114,4 +114,7 @@ config QED_RDMA
 config QED_ISCSI
 	bool
 
+config QED_FCOE
+	bool
+
 endif # NET_VENDOR_QLOGIC
diff --git a/drivers/net/ethernet/qlogic/qed/Makefile b/drivers/net/ethernet/qlogic/qed/Makefile
index 1a7300f72cab..974929dcc74e 100644
--- a/drivers/net/ethernet/qlogic/qed/Makefile
+++ b/drivers/net/ethernet/qlogic/qed/Makefile
@@ -7,3 +7,4 @@ qed-$(CONFIG_QED_SRIOV) += qed_sriov.o qed_vf.o
 qed-$(CONFIG_QED_LL2) += qed_ll2.o
 qed-$(CONFIG_QED_RDMA) += qed_roce.o
 qed-$(CONFIG_QED_ISCSI) += qed_iscsi.o qed_ooo.o
+qed-$(CONFIG_QED_FCOE) += qed_fcoe.o
diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index 6557f94b92ed..61a9cd5be497 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -60,6 +60,7 @@ extern const struct qed_common_ops qed_common_ops_pass;
 #define QED_WFQ_UNIT	100
 
 #define ISCSI_BDQ_ID(_port_id) (_port_id)
+#define FCOE_BDQ_ID(_port_id) ((_port_id) + 2)
 #define QED_WID_SIZE            (1024)
 #define QED_PF_DEMS_SIZE        (4)
 
@@ -167,6 +168,7 @@ struct qed_tunn_update_params {
  */
 enum qed_pci_personality {
 	QED_PCI_ETH,
+	QED_PCI_FCOE,
 	QED_PCI_ISCSI,
 	QED_PCI_ETH_ROCE,
 	QED_PCI_DEFAULT /* default in shmem */
@@ -204,6 +206,7 @@ enum QED_FEATURE {
 	QED_VF,
 	QED_RDMA_CNQ,
 	QED_VF_L2_QUE,
+	QED_FCOE_CQ,
 	QED_MAX_FEATURES,
 };
 
@@ -221,6 +224,7 @@ enum QED_PORT_MODE {
 
 enum qed_dev_cap {
 	QED_DEV_CAP_ETH,
+	QED_DEV_CAP_FCOE,
 	QED_DEV_CAP_ISCSI,
 	QED_DEV_CAP_ROCE,
 };
@@ -255,6 +259,10 @@ struct qed_hw_info {
 	u32				part_num[4];
 
 	unsigned char			hw_mac_addr[ETH_ALEN];
+	u64				node_wwn;
+	u64				port_wwn;
+
+	u16				num_fcoe_conns;
 
 	struct qed_igu_info		*p_igu_info;
 
@@ -410,6 +418,7 @@ struct qed_hwfn {
 	struct qed_ooo_info		*p_ooo_info;
 	struct qed_rdma_info		*p_rdma_info;
 	struct qed_iscsi_info		*p_iscsi_info;
+	struct qed_fcoe_info		*p_fcoe_info;
 	struct qed_pf_params		pf_params;
 
 	bool b_rdma_enabled_in_prs;
@@ -620,11 +629,13 @@ struct qed_dev {
 
 	u8				protocol;
 #define IS_QED_ETH_IF(cdev)     ((cdev)->protocol == QED_PROTOCOL_ETH)
+#define IS_QED_FCOE_IF(cdev)    ((cdev)->protocol == QED_PROTOCOL_FCOE)
 
 	/* Callbacks to protocol driver */
 	union {
 		struct qed_common_cb_ops	*common;
 		struct qed_eth_cb_ops		*eth;
+		struct qed_fcoe_cb_ops		*fcoe;
 		struct qed_iscsi_cb_ops		*iscsi;
 	} protocol_ops;
 	void				*ops_cookie;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.c b/drivers/net/ethernet/qlogic/qed/qed_cxt.c
index dcb8fc185df7..d42d03df751a 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_cxt.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.c
@@ -90,12 +90,14 @@ union conn_context {
 	struct core_conn_context core_ctx;
 	struct eth_conn_context eth_ctx;
 	struct iscsi_conn_context iscsi_ctx;
+	struct fcoe_conn_context fcoe_ctx;
 	struct roce_conn_context roce_ctx;
 };
 
-/* TYPE-0 task context - iSCSI */
+/* TYPE-0 task context - iSCSI, FCOE */
 union type0_task_context {
 	struct iscsi_task_context iscsi_ctx;
+	struct fcoe_task_context fcoe_ctx;
 };
 
 /* TYPE-1 task context - ROCE */
@@ -240,15 +242,22 @@ struct qed_cxt_mngr {
 static bool src_proto(enum protocol_type type)
 {
 	return type == PROTOCOLID_ISCSI ||
+	       type == PROTOCOLID_FCOE ||
 	       type == PROTOCOLID_ROCE;
 }
 
 static bool tm_cid_proto(enum protocol_type type)
 {
 	return type == PROTOCOLID_ISCSI ||
+	       type == PROTOCOLID_FCOE ||
 	       type == PROTOCOLID_ROCE;
 }
 
+static bool tm_tid_proto(enum protocol_type type)
+{
+	return type == PROTOCOLID_FCOE;
+}
+
 /* counts the iids for the CDU/CDUC ILT client configuration */
 struct qed_cdu_iids {
 	u32 pf_cids;
@@ -307,6 +316,22 @@ static void qed_cxt_tm_iids(struct qed_cxt_mngr *p_mngr,
 			iids->pf_cids += p_cfg->cid_count;
 			iids->per_vf_cids += p_cfg->cids_per_vf;
 		}
+
+		if (tm_tid_proto(i)) {
+			struct qed_tid_seg *segs = p_cfg->tid_seg;
+
+			/* for each segment there is at most one
+			 * protocol for which count is not 0.
+			 */
+			for (j = 0; j < NUM_TASK_PF_SEGMENTS; j++)
+				iids->pf_tids[j] += segs[j].count;
+
+			/* The last array elelment is for the VFs. As for PF
+			 * segments there can be only one protocol for
+			 * which this value is not 0.
+			 */
+			iids->per_vf_tids += segs[NUM_TASK_PF_SEGMENTS].count;
+		}
 	}
 
 	iids->pf_cids = roundup(iids->pf_cids, TM_ALIGN);
@@ -1694,9 +1719,42 @@ static void qed_tm_init_pf(struct qed_hwfn *p_hwfn)
 	/* @@@TBD how to enable the scan for the VFs */
 }
 
+static void qed_prs_init_common(struct qed_hwfn *p_hwfn)
+{
+	if ((p_hwfn->hw_info.personality == QED_PCI_FCOE) &&
+	    p_hwfn->pf_params.fcoe_pf_params.is_target)
+		STORE_RT_REG(p_hwfn,
+			     PRS_REG_SEARCH_RESP_INITIATOR_TYPE_RT_OFFSET, 0);
+}
+
+static void qed_prs_init_pf(struct qed_hwfn *p_hwfn)
+{
+	struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr;
+	struct qed_conn_type_cfg *p_fcoe;
+	struct qed_tid_seg *p_tid;
+
+	p_fcoe = &p_mngr->conn_cfg[PROTOCOLID_FCOE];
+
+	/* If FCoE is active set the MAX OX_ID (tid) in the Parser */
+	if (!p_fcoe->cid_count)
+		return;
+
+	p_tid = &p_fcoe->tid_seg[QED_CXT_FCOE_TID_SEG];
+	if (p_hwfn->pf_params.fcoe_pf_params.is_target) {
+		STORE_RT_REG_AGG(p_hwfn,
+				 PRS_REG_TASK_ID_MAX_TARGET_PF_RT_OFFSET,
+				 p_tid->count);
+	} else {
+		STORE_RT_REG_AGG(p_hwfn,
+				 PRS_REG_TASK_ID_MAX_INITIATOR_PF_RT_OFFSET,
+				 p_tid->count);
+	}
+}
+
 void qed_cxt_hw_init_common(struct qed_hwfn *p_hwfn)
 {
 	qed_cdu_init_common(p_hwfn);
+	qed_prs_init_common(p_hwfn);
 }
 
 void qed_cxt_hw_init_pf(struct qed_hwfn *p_hwfn)
@@ -1708,6 +1766,7 @@ void qed_cxt_hw_init_pf(struct qed_hwfn *p_hwfn)
 	qed_ilt_init_pf(p_hwfn);
 	qed_src_init_pf(p_hwfn);
 	qed_tm_init_pf(p_hwfn);
+	qed_prs_init_pf(p_hwfn);
 }
 
 int qed_cxt_acquire_cid(struct qed_hwfn *p_hwfn,
@@ -1885,6 +1944,27 @@ int qed_cxt_set_pf_params(struct qed_hwfn *p_hwfn)
 					    p_params->num_cons, 1);
 		break;
 	}
+	case QED_PCI_FCOE:
+	{
+		struct qed_fcoe_pf_params *p_params;
+
+		p_params = &p_hwfn->pf_params.fcoe_pf_params;
+
+		if (p_params->num_cons && p_params->num_tasks) {
+			qed_cxt_set_proto_cid_count(p_hwfn,
+						    PROTOCOLID_FCOE,
+						    p_params->num_cons,
+						    0);
+
+			qed_cxt_set_proto_tid_count(p_hwfn, PROTOCOLID_FCOE,
+						    QED_CXT_FCOE_TID_SEG, 0,
+						    p_params->num_tasks, true);
+		} else {
+			DP_INFO(p_hwfn->cdev,
+				"Fcoe personality used without setting params!\n");
+		}
+		break;
+	}
 	case QED_PCI_ISCSI:
 	{
 		struct qed_iscsi_pf_params *p_params;
@@ -1927,6 +2007,10 @@ int qed_cxt_get_tid_mem_info(struct qed_hwfn *p_hwfn,
 
 	/* Verify the personality */
 	switch (p_hwfn->hw_info.personality) {
+	case QED_PCI_FCOE:
+		proto = PROTOCOLID_FCOE;
+		seg = QED_CXT_FCOE_TID_SEG;
+		break;
 	case QED_PCI_ISCSI:
 		proto = PROTOCOLID_ISCSI;
 		seg = QED_CXT_ISCSI_TID_SEG;
@@ -2215,15 +2299,19 @@ int qed_cxt_get_task_ctx(struct qed_hwfn *p_hwfn,
 {
 	struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr;
 	struct qed_ilt_client_cfg *p_cli;
-	struct qed_ilt_cli_blk *p_seg;
 	struct qed_tid_seg *p_seg_info;
-	u32 proto, seg;
-	u32 total_lines;
-	u32 tid_size, ilt_idx;
+	struct qed_ilt_cli_blk *p_seg;
 	u32 num_tids_per_block;
+	u32 tid_size, ilt_idx;
+	u32 total_lines;
+	u32 proto, seg;
 
 	/* Verify the personality */
 	switch (p_hwfn->hw_info.personality) {
+	case QED_PCI_FCOE:
+		proto = PROTOCOLID_FCOE;
+		seg = QED_CXT_FCOE_TID_SEG;
+		break;
 	case QED_PCI_ISCSI:
 		proto = PROTOCOLID_ISCSI;
 		seg = QED_CXT_ISCSI_TID_SEG;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.h b/drivers/net/ethernet/qlogic/qed/qed_cxt.h
index 98f4973cac9d..8b010324268a 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_cxt.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.h
@@ -91,6 +91,7 @@ int qed_cxt_get_tid_mem_info(struct qed_hwfn *p_hwfn,
 
 #define QED_CXT_ISCSI_TID_SEG	PROTOCOLID_ISCSI
 #define QED_CXT_ROCE_TID_SEG	PROTOCOLID_ROCE
+#define QED_CXT_FCOE_TID_SEG	PROTOCOLID_FCOE
 enum qed_cxt_elem_type {
 	QED_ELEM_CXT,
 	QED_ELEM_SRQ,
@@ -204,4 +205,6 @@ int qed_cxt_free_proto_ilt(struct qed_hwfn *p_hwfn, enum protocol_type proto);
 
 #define QED_CTX_WORKING_MEM 0
 #define QED_CTX_FL_MEM 1
+int qed_cxt_get_task_ctx(struct qed_hwfn *p_hwfn,
+			 u32 tid, u8 ctx_type, void **task_ctx);
 #endif
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
index dc0d2c9ad6b5..5bd36a4a8fcd 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
@@ -432,7 +432,6 @@ qed_dcbx_copy_mib(struct qed_hwfn *p_hwfn,
 	return rc;
 }
 
-#ifdef CONFIG_DCB
 static void
 qed_dcbx_get_priority_info(struct qed_hwfn *p_hwfn,
 			   struct qed_dcbx_app_prio *p_prio,
@@ -749,7 +748,6 @@ qed_dcbx_get_params(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
 
 	return 0;
 }
-#endif
 
 static int
 qed_dcbx_read_local_lldp_mib(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
@@ -864,6 +862,15 @@ static int qed_dcbx_read_mib(struct qed_hwfn *p_hwfn,
 	return rc;
 }
 
+void qed_dcbx_aen(struct qed_hwfn *hwfn, u32 mib_type)
+{
+	struct qed_common_cb_ops *op = hwfn->cdev->protocol_ops.common;
+	void *cookie = hwfn->cdev->ops_cookie;
+
+	if (cookie && op->dcbx_aen)
+		op->dcbx_aen(cookie, &hwfn->p_dcbx_info->get, mib_type);
+}
+
 /* Read updated MIB.
  * Reconfigure QM and invoke PF update ramrod command if operational MIB
  * change is detected.
@@ -890,6 +897,8 @@ qed_dcbx_mib_update_event(struct qed_hwfn *p_hwfn,
 			qed_sp_pf_update(p_hwfn);
 		}
 	}
+	qed_dcbx_get_params(p_hwfn, p_ptt, &p_hwfn->p_dcbx_info->get, type);
+	qed_dcbx_aen(p_hwfn, type);
 
 	return rc;
 }
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.h b/drivers/net/ethernet/qlogic/qed/qed_dcbx.h
index d70300fda020..0fabe97f998d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dcbx.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.h
@@ -57,7 +57,6 @@ struct qed_dcbx_app_data {
 	u8 tc;			/* Traffic Class */
 };
 
-#ifdef CONFIG_DCB
 #define QED_DCBX_VERSION_DISABLED       0
 #define QED_DCBX_VERSION_IEEE           1
 #define QED_DCBX_VERSION_CEE            2
@@ -73,7 +72,6 @@ struct qed_dcbx_set {
 	struct qed_dcbx_admin_params config;
 	u32 ver_num;
 };
-#endif
 
 struct qed_dcbx_results {
 	bool dcbx_enabled;
@@ -97,9 +95,8 @@ struct qed_dcbx_info {
 	struct qed_dcbx_results results;
 	struct dcbx_mib operational;
 	struct dcbx_mib remote;
-#ifdef CONFIG_DCB
 	struct qed_dcbx_set set;
-#endif
+	struct qed_dcbx_get get;
 	u8 dcbx_cap;
 };
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 33e720143b8d..5ee7f040c50a 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -49,6 +49,7 @@
 #include "qed_cxt.h"
 #include "qed_dcbx.h"
 #include "qed_dev_api.h"
+#include "qed_fcoe.h"
 #include "qed_hsi.h"
 #include "qed_hw.h"
 #include "qed_init_ops.h"
@@ -172,6 +173,9 @@ void qed_resc_free(struct qed_dev *cdev)
 #ifdef CONFIG_QED_LL2
 		qed_ll2_free(p_hwfn, p_hwfn->p_ll2_info);
 #endif
+		if (p_hwfn->hw_info.personality == QED_PCI_FCOE)
+			qed_fcoe_free(p_hwfn, p_hwfn->p_fcoe_info);
+
 		if (p_hwfn->hw_info.personality == QED_PCI_ISCSI) {
 			qed_iscsi_free(p_hwfn, p_hwfn->p_iscsi_info);
 			qed_ooo_free(p_hwfn, p_hwfn->p_ooo_info);
@@ -433,6 +437,7 @@ int qed_qm_reconf(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 int qed_resc_alloc(struct qed_dev *cdev)
 {
 	struct qed_iscsi_info *p_iscsi_info;
+	struct qed_fcoe_info *p_fcoe_info;
 	struct qed_ooo_info *p_ooo_info;
 #ifdef CONFIG_QED_LL2
 	struct qed_ll2_info *p_ll2_info;
@@ -539,6 +544,14 @@ int qed_resc_alloc(struct qed_dev *cdev)
 			p_hwfn->p_ll2_info = p_ll2_info;
 		}
 #endif
+
+		if (p_hwfn->hw_info.personality == QED_PCI_FCOE) {
+			p_fcoe_info = qed_fcoe_alloc(p_hwfn);
+			if (!p_fcoe_info)
+				goto alloc_no_mem;
+			p_hwfn->p_fcoe_info = p_fcoe_info;
+		}
+
 		if (p_hwfn->hw_info.personality == QED_PCI_ISCSI) {
 			p_iscsi_info = qed_iscsi_alloc(p_hwfn);
 			if (!p_iscsi_info)
@@ -602,6 +615,9 @@ void qed_resc_setup(struct qed_dev *cdev)
 		if (p_hwfn->using_ll2)
 			qed_ll2_setup(p_hwfn, p_hwfn->p_ll2_info);
 #endif
+		if (p_hwfn->hw_info.personality == QED_PCI_FCOE)
+			qed_fcoe_setup(p_hwfn, p_hwfn->p_fcoe_info);
+
 		if (p_hwfn->hw_info.personality == QED_PCI_ISCSI) {
 			qed_iscsi_setup(p_hwfn, p_hwfn->p_iscsi_info);
 			qed_ooo_setup(p_hwfn, p_hwfn->p_ooo_info);
@@ -994,7 +1010,8 @@ static int qed_hw_init_pf(struct qed_hwfn *p_hwfn,
 	/* Protocl Configuration  */
 	STORE_RT_REG(p_hwfn, PRS_REG_SEARCH_TCP_RT_OFFSET,
 		     (p_hwfn->hw_info.personality == QED_PCI_ISCSI) ? 1 : 0);
-	STORE_RT_REG(p_hwfn, PRS_REG_SEARCH_FCOE_RT_OFFSET, 0);
+	STORE_RT_REG(p_hwfn, PRS_REG_SEARCH_FCOE_RT_OFFSET,
+		     (p_hwfn->hw_info.personality == QED_PCI_FCOE) ? 1 : 0);
 	STORE_RT_REG(p_hwfn, PRS_REG_SEARCH_ROCE_RT_OFFSET, 0);
 
 	/* Cleanup chip from previous driver if such remains exist */
@@ -1026,8 +1043,16 @@ static int qed_hw_init_pf(struct qed_hwfn *p_hwfn,
 		/* send function start command */
 		rc = qed_sp_pf_start(p_hwfn, p_tunn, p_hwfn->cdev->mf_mode,
 				     allow_npar_tx_switch);
-		if (rc)
+		if (rc) {
 			DP_NOTICE(p_hwfn, "Function start ramrod failed\n");
+			return rc;
+		}
+		if (p_hwfn->hw_info.personality == QED_PCI_FCOE) {
+			qed_wr(p_hwfn, p_ptt, PRS_REG_SEARCH_TAG1, BIT(2));
+			qed_wr(p_hwfn, p_ptt,
+			       PRS_REG_PKT_LEN_STAT_TAGS_NOT_COUNTED_FIRST,
+			       0x100);
+		}
 	}
 	return rc;
 }
@@ -1787,8 +1812,8 @@ static int qed_hw_get_resc(struct qed_hwfn *p_hwfn)
 
 static int qed_hw_get_nvm_info(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 {
-	u32 nvm_cfg1_offset, mf_mode, addr, generic_cont0, core_cfg;
 	u32 port_cfg_addr, link_temp, nvm_cfg_addr, device_capabilities;
+	u32 nvm_cfg1_offset, mf_mode, addr, generic_cont0, core_cfg;
 	struct qed_mcp_link_params *link;
 
 	/* Read global nvm_cfg address */
@@ -1934,6 +1959,9 @@ static int qed_hw_get_nvm_info(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 	if (device_capabilities & NVM_CFG1_GLOB_DEVICE_CAPABILITIES_ETHERNET)
 		__set_bit(QED_DEV_CAP_ETH,
 			  &p_hwfn->hw_info.device_capabilities);
+	if (device_capabilities & NVM_CFG1_GLOB_DEVICE_CAPABILITIES_FCOE)
+		__set_bit(QED_DEV_CAP_FCOE,
+			  &p_hwfn->hw_info.device_capabilities);
 	if (device_capabilities & NVM_CFG1_GLOB_DEVICE_CAPABILITIES_ISCSI)
 		__set_bit(QED_DEV_CAP_ISCSI,
 			  &p_hwfn->hw_info.device_capabilities);
@@ -2671,6 +2699,177 @@ void qed_llh_remove_mac_filter(struct qed_hwfn *p_hwfn,
 		DP_NOTICE(p_hwfn, "Tried to remove a non-configured filter\n");
 }
 
+int
+qed_llh_add_protocol_filter(struct qed_hwfn *p_hwfn,
+			    struct qed_ptt *p_ptt,
+			    u16 source_port_or_eth_type,
+			    u16 dest_port, enum qed_llh_port_filter_type_t type)
+{
+	u32 high = 0, low = 0, en;
+	int i;
+
+	if (!(IS_MF_SI(p_hwfn) || IS_MF_DEFAULT(p_hwfn)))
+		return 0;
+
+	switch (type) {
+	case QED_LLH_FILTER_ETHERTYPE:
+		high = source_port_or_eth_type;
+		break;
+	case QED_LLH_FILTER_TCP_SRC_PORT:
+	case QED_LLH_FILTER_UDP_SRC_PORT:
+		low = source_port_or_eth_type << 16;
+		break;
+	case QED_LLH_FILTER_TCP_DEST_PORT:
+	case QED_LLH_FILTER_UDP_DEST_PORT:
+		low = dest_port;
+		break;
+	case QED_LLH_FILTER_TCP_SRC_AND_DEST_PORT:
+	case QED_LLH_FILTER_UDP_SRC_AND_DEST_PORT:
+		low = (source_port_or_eth_type << 16) | dest_port;
+		break;
+	default:
+		DP_NOTICE(p_hwfn,
+			  "Non valid LLH protocol filter type %d\n", type);
+		return -EINVAL;
+	}
+	/* Find a free entry and utilize it */
+	for (i = 0; i < NIG_REG_LLH_FUNC_FILTER_EN_SIZE; i++) {
+		en = qed_rd(p_hwfn, p_ptt,
+			    NIG_REG_LLH_FUNC_FILTER_EN + i * sizeof(u32));
+		if (en)
+			continue;
+		qed_wr(p_hwfn, p_ptt,
+		       NIG_REG_LLH_FUNC_FILTER_VALUE +
+		       2 * i * sizeof(u32), low);
+		qed_wr(p_hwfn, p_ptt,
+		       NIG_REG_LLH_FUNC_FILTER_VALUE +
+		       (2 * i + 1) * sizeof(u32), high);
+		qed_wr(p_hwfn, p_ptt,
+		       NIG_REG_LLH_FUNC_FILTER_MODE + i * sizeof(u32), 1);
+		qed_wr(p_hwfn, p_ptt,
+		       NIG_REG_LLH_FUNC_FILTER_PROTOCOL_TYPE +
+		       i * sizeof(u32), 1 << type);
+		qed_wr(p_hwfn, p_ptt,
+		       NIG_REG_LLH_FUNC_FILTER_EN + i * sizeof(u32), 1);
+		break;
+	}
+	if (i >= NIG_REG_LLH_FUNC_FILTER_EN_SIZE) {
+		DP_NOTICE(p_hwfn,
+			  "Failed to find an empty LLH filter to utilize\n");
+		return -EINVAL;
+	}
+	switch (type) {
+	case QED_LLH_FILTER_ETHERTYPE:
+		DP_VERBOSE(p_hwfn, NETIF_MSG_HW,
+			   "ETH type %x is added at %d\n",
+			   source_port_or_eth_type, i);
+		break;
+	case QED_LLH_FILTER_TCP_SRC_PORT:
+		DP_VERBOSE(p_hwfn, NETIF_MSG_HW,
+			   "TCP src port %x is added at %d\n",
+			   source_port_or_eth_type, i);
+		break;
+	case QED_LLH_FILTER_UDP_SRC_PORT:
+		DP_VERBOSE(p_hwfn, NETIF_MSG_HW,
+			   "UDP src port %x is added at %d\n",
+			   source_port_or_eth_type, i);
+		break;
+	case QED_LLH_FILTER_TCP_DEST_PORT:
+		DP_VERBOSE(p_hwfn, NETIF_MSG_HW,
+			   "TCP dst port %x is added at %d\n", dest_port, i);
+		break;
+	case QED_LLH_FILTER_UDP_DEST_PORT:
+		DP_VERBOSE(p_hwfn, NETIF_MSG_HW,
+			   "UDP dst port %x is added at %d\n", dest_port, i);
+		break;
+	case QED_LLH_FILTER_TCP_SRC_AND_DEST_PORT:
+		DP_VERBOSE(p_hwfn, NETIF_MSG_HW,
+			   "TCP src/dst ports %x/%x are added at %d\n",
+			   source_port_or_eth_type, dest_port, i);
+		break;
+	case QED_LLH_FILTER_UDP_SRC_AND_DEST_PORT:
+		DP_VERBOSE(p_hwfn, NETIF_MSG_HW,
+			   "UDP src/dst ports %x/%x are added at %d\n",
+			   source_port_or_eth_type, dest_port, i);
+		break;
+	}
+	return 0;
+}
+
+void
+qed_llh_remove_protocol_filter(struct qed_hwfn *p_hwfn,
+			       struct qed_ptt *p_ptt,
+			       u16 source_port_or_eth_type,
+			       u16 dest_port,
+			       enum qed_llh_port_filter_type_t type)
+{
+	u32 high = 0, low = 0;
+	int i;
+
+	if (!(IS_MF_SI(p_hwfn) || IS_MF_DEFAULT(p_hwfn)))
+		return;
+
+	switch (type) {
+	case QED_LLH_FILTER_ETHERTYPE:
+		high = source_port_or_eth_type;
+		break;
+	case QED_LLH_FILTER_TCP_SRC_PORT:
+	case QED_LLH_FILTER_UDP_SRC_PORT:
+		low = source_port_or_eth_type << 16;
+		break;
+	case QED_LLH_FILTER_TCP_DEST_PORT:
+	case QED_LLH_FILTER_UDP_DEST_PORT:
+		low = dest_port;
+		break;
+	case QED_LLH_FILTER_TCP_SRC_AND_DEST_PORT:
+	case QED_LLH_FILTER_UDP_SRC_AND_DEST_PORT:
+		low = (source_port_or_eth_type << 16) | dest_port;
+		break;
+	default:
+		DP_NOTICE(p_hwfn,
+			  "Non valid LLH protocol filter type %d\n", type);
+		return;
+	}
+
+	for (i = 0; i < NIG_REG_LLH_FUNC_FILTER_EN_SIZE; i++) {
+		if (!qed_rd(p_hwfn, p_ptt,
+			    NIG_REG_LLH_FUNC_FILTER_EN + i * sizeof(u32)))
+			continue;
+		if (!qed_rd(p_hwfn, p_ptt,
+			    NIG_REG_LLH_FUNC_FILTER_MODE + i * sizeof(u32)))
+			continue;
+		if (!(qed_rd(p_hwfn, p_ptt,
+			     NIG_REG_LLH_FUNC_FILTER_PROTOCOL_TYPE +
+			     i * sizeof(u32)) & BIT(type)))
+			continue;
+		if (qed_rd(p_hwfn, p_ptt,
+			   NIG_REG_LLH_FUNC_FILTER_VALUE +
+			   2 * i * sizeof(u32)) != low)
+			continue;
+		if (qed_rd(p_hwfn, p_ptt,
+			   NIG_REG_LLH_FUNC_FILTER_VALUE +
+			   (2 * i + 1) * sizeof(u32)) != high)
+			continue;
+
+		qed_wr(p_hwfn, p_ptt,
+		       NIG_REG_LLH_FUNC_FILTER_EN + i * sizeof(u32), 0);
+		qed_wr(p_hwfn, p_ptt,
+		       NIG_REG_LLH_FUNC_FILTER_MODE + i * sizeof(u32), 0);
+		qed_wr(p_hwfn, p_ptt,
+		       NIG_REG_LLH_FUNC_FILTER_PROTOCOL_TYPE +
+		       i * sizeof(u32), 0);
+		qed_wr(p_hwfn, p_ptt,
+		       NIG_REG_LLH_FUNC_FILTER_VALUE + 2 * i * sizeof(u32), 0);
+		qed_wr(p_hwfn, p_ptt,
+		       NIG_REG_LLH_FUNC_FILTER_VALUE +
+		       (2 * i + 1) * sizeof(u32), 0);
+		break;
+	}
+
+	if (i >= NIG_REG_LLH_FUNC_FILTER_EN_SIZE)
+		DP_NOTICE(p_hwfn, "Tried to remove a non-configured filter\n");
+}
+
 static int qed_set_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
 			    u32 hw_addr, void *p_eth_qzone,
 			    size_t eth_qzone_size, u8 timeset)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
index 5d37ba24da40..6812003411cd 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
@@ -353,6 +353,48 @@ int qed_llh_add_mac_filter(struct qed_hwfn *p_hwfn,
 void qed_llh_remove_mac_filter(struct qed_hwfn *p_hwfn,
 			       struct qed_ptt *p_ptt, u8 *p_filter);
 
+enum qed_llh_port_filter_type_t {
+	QED_LLH_FILTER_ETHERTYPE,
+	QED_LLH_FILTER_TCP_SRC_PORT,
+	QED_LLH_FILTER_TCP_DEST_PORT,
+	QED_LLH_FILTER_TCP_SRC_AND_DEST_PORT,
+	QED_LLH_FILTER_UDP_SRC_PORT,
+	QED_LLH_FILTER_UDP_DEST_PORT,
+	QED_LLH_FILTER_UDP_SRC_AND_DEST_PORT
+};
+
+/**
+ * @brief qed_llh_add_protocol_filter - configures a protocol filter in llh
+ *
+ * @param p_hwfn
+ * @param p_ptt
+ * @param source_port_or_eth_type - source port or ethertype to add
+ * @param dest_port - destination port to add
+ * @param type - type of filters and comparing
+ */
+int
+qed_llh_add_protocol_filter(struct qed_hwfn *p_hwfn,
+			    struct qed_ptt *p_ptt,
+			    u16 source_port_or_eth_type,
+			    u16 dest_port,
+			    enum qed_llh_port_filter_type_t type);
+
+/**
+ * @brief qed_llh_remove_protocol_filter - remove a protocol filter in llh
+ *
+ * @param p_hwfn
+ * @param p_ptt
+ * @param source_port_or_eth_type - source port or ethertype to add
+ * @param dest_port - destination port to add
+ * @param type - type of filters and comparing
+ */
+void
+qed_llh_remove_protocol_filter(struct qed_hwfn *p_hwfn,
+			       struct qed_ptt *p_ptt,
+			       u16 source_port_or_eth_type,
+			       u16 dest_port,
+			       enum qed_llh_port_filter_type_t type);
+
 /**
  * *@brief Cleanup of previous driver remains prior to load
  *
diff --git a/drivers/net/ethernet/qlogic/qed/qed_fcoe.c b/drivers/net/ethernet/qlogic/qed/qed_fcoe.c
new file mode 100644
index 000000000000..cbc81412174f
--- /dev/null
+++ b/drivers/net/ethernet/qlogic/qed/qed_fcoe.c
@@ -0,0 +1,1014 @@
+/* QLogic qed NIC Driver
+ * Copyright (c) 2015-2017  QLogic Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/types.h>
+#include <asm/byteorder.h>
+#include <asm/param.h>
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/log2.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/version.h>
+#include <linux/workqueue.h>
+#include <linux/errno.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#define __PREVENT_DUMP_MEM_ARR__
+#define __PREVENT_PXP_GLOBAL_WIN__
+#include "qed.h"
+#include "qed_cxt.h"
+#include "qed_dev_api.h"
+#include "qed_fcoe.h"
+#include "qed_hsi.h"
+#include "qed_hw.h"
+#include "qed_int.h"
+#include "qed_ll2.h"
+#include "qed_mcp.h"
+#include "qed_reg_addr.h"
+#include "qed_sp.h"
+#include "qed_sriov.h"
+#include <linux/qed/qed_fcoe_if.h>
+
+struct qed_fcoe_conn {
+	struct list_head list_entry;
+	bool free_on_delete;
+
+	u16 conn_id;
+	u32 icid;
+	u32 fw_cid;
+	u8 layer_code;
+
+	dma_addr_t sq_pbl_addr;
+	dma_addr_t sq_curr_page_addr;
+	dma_addr_t sq_next_page_addr;
+	dma_addr_t xferq_pbl_addr;
+	void *xferq_pbl_addr_virt_addr;
+	dma_addr_t xferq_addr[4];
+	void *xferq_addr_virt_addr[4];
+	dma_addr_t confq_pbl_addr;
+	void *confq_pbl_addr_virt_addr;
+	dma_addr_t confq_addr[2];
+	void *confq_addr_virt_addr[2];
+
+	dma_addr_t terminate_params;
+
+	u16 dst_mac_addr_lo;
+	u16 dst_mac_addr_mid;
+	u16 dst_mac_addr_hi;
+	u16 src_mac_addr_lo;
+	u16 src_mac_addr_mid;
+	u16 src_mac_addr_hi;
+
+	u16 tx_max_fc_pay_len;
+	u16 e_d_tov_timer_val;
+	u16 rec_tov_timer_val;
+	u16 rx_max_fc_pay_len;
+	u16 vlan_tag;
+	u16 physical_q0;
+
+	struct fc_addr_nw s_id;
+	u8 max_conc_seqs_c3;
+	struct fc_addr_nw d_id;
+	u8 flags;
+	u8 def_q_idx;
+};
+
+static int
+qed_sp_fcoe_func_start(struct qed_hwfn *p_hwfn,
+		       enum spq_mode comp_mode,
+		       struct qed_spq_comp_cb *p_comp_addr)
+{
+	struct qed_fcoe_pf_params *fcoe_pf_params = NULL;
+	struct fcoe_init_ramrod_params *p_ramrod = NULL;
+	struct fcoe_init_func_ramrod_data *p_data;
+	struct fcoe_conn_context *p_cxt = NULL;
+	struct qed_spq_entry *p_ent = NULL;
+	struct qed_sp_init_data init_data;
+	struct qed_cxt_info cxt_info;
+	u32 dummy_cid;
+	int rc = 0;
+	u16 tmp;
+	u8 i;
+
+	/* Get SPQ entry */
+	memset(&init_data, 0, sizeof(init_data));
+	init_data.cid = qed_spq_get_cid(p_hwfn);
+	init_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
+	init_data.comp_mode = comp_mode;
+	init_data.p_comp_data = p_comp_addr;
+
+	rc = qed_sp_init_request(p_hwfn, &p_ent,
+				 FCOE_RAMROD_CMD_ID_INIT_FUNC,
+				 PROTOCOLID_FCOE, &init_data);
+	if (rc)
+		return rc;
+
+	p_ramrod = &p_ent->ramrod.fcoe_init;
+	p_data = &p_ramrod->init_ramrod_data;
+	fcoe_pf_params = &p_hwfn->pf_params.fcoe_pf_params;
+
+	p_data->mtu = cpu_to_le16(fcoe_pf_params->mtu);
+	tmp = cpu_to_le16(fcoe_pf_params->sq_num_pbl_pages);
+	p_data->sq_num_pages_in_pbl = tmp;
+
+	rc = qed_cxt_acquire_cid(p_hwfn, PROTOCOLID_FCOE, &dummy_cid);
+	if (rc)
+		return rc;
+
+	cxt_info.iid = dummy_cid;
+	rc = qed_cxt_get_cid_info(p_hwfn, &cxt_info);
+	if (rc) {
+		DP_NOTICE(p_hwfn, "Cannot find context info for dummy cid=%d\n",
+			  dummy_cid);
+		return rc;
+	}
+	p_cxt = cxt_info.p_cxt;
+	SET_FIELD(p_cxt->tstorm_ag_context.flags3,
+		  TSTORM_FCOE_CONN_AG_CTX_DUMMY_TIMER_CF_EN, 1);
+
+	fcoe_pf_params->dummy_icid = (u16)dummy_cid;
+
+	tmp = cpu_to_le16(fcoe_pf_params->num_tasks);
+	p_data->func_params.num_tasks = tmp;
+	p_data->func_params.log_page_size = fcoe_pf_params->log_page_size;
+	p_data->func_params.debug_mode = fcoe_pf_params->debug_mode;
+
+	DMA_REGPAIR_LE(p_data->q_params.glbl_q_params_addr,
+		       fcoe_pf_params->glbl_q_params_addr);
+
+	tmp = cpu_to_le16(fcoe_pf_params->cq_num_entries);
+	p_data->q_params.cq_num_entries = tmp;
+
+	tmp = cpu_to_le16(fcoe_pf_params->cmdq_num_entries);
+	p_data->q_params.cmdq_num_entries = tmp;
+
+	tmp = fcoe_pf_params->num_cqs;
+	p_data->q_params.num_queues = (u8)tmp;
+
+	tmp = (u16)p_hwfn->hw_info.resc_start[QED_CMDQS_CQS];
+	p_data->q_params.queue_relative_offset = (u8)tmp;
+
+	for (i = 0; i < fcoe_pf_params->num_cqs; i++) {
+		tmp = cpu_to_le16(p_hwfn->sbs_info[i]->igu_sb_id);
+		p_data->q_params.cq_cmdq_sb_num_arr[i] = tmp;
+	}
+
+	p_data->q_params.cq_sb_pi = fcoe_pf_params->gl_rq_pi;
+	p_data->q_params.cmdq_sb_pi = fcoe_pf_params->gl_cmd_pi;
+
+	p_data->q_params.bdq_resource_id = FCOE_BDQ_ID(p_hwfn->port_id);
+
+	DMA_REGPAIR_LE(p_data->q_params.bdq_pbl_base_address[BDQ_ID_RQ],
+		       fcoe_pf_params->bdq_pbl_base_addr[BDQ_ID_RQ]);
+	p_data->q_params.bdq_pbl_num_entries[BDQ_ID_RQ] =
+	    fcoe_pf_params->bdq_pbl_num_entries[BDQ_ID_RQ];
+	tmp = fcoe_pf_params->bdq_xoff_threshold[BDQ_ID_RQ];
+	p_data->q_params.bdq_xoff_threshold[BDQ_ID_RQ] = cpu_to_le16(tmp);
+	tmp = fcoe_pf_params->bdq_xon_threshold[BDQ_ID_RQ];
+	p_data->q_params.bdq_xon_threshold[BDQ_ID_RQ] = cpu_to_le16(tmp);
+
+	DMA_REGPAIR_LE(p_data->q_params.bdq_pbl_base_address[BDQ_ID_IMM_DATA],
+		       fcoe_pf_params->bdq_pbl_base_addr[BDQ_ID_IMM_DATA]);
+	p_data->q_params.bdq_pbl_num_entries[BDQ_ID_IMM_DATA] =
+	    fcoe_pf_params->bdq_pbl_num_entries[BDQ_ID_IMM_DATA];
+	tmp = fcoe_pf_params->bdq_xoff_threshold[BDQ_ID_IMM_DATA];
+	p_data->q_params.bdq_xoff_threshold[BDQ_ID_IMM_DATA] = cpu_to_le16(tmp);
+	tmp = fcoe_pf_params->bdq_xon_threshold[BDQ_ID_IMM_DATA];
+	p_data->q_params.bdq_xon_threshold[BDQ_ID_IMM_DATA] = cpu_to_le16(tmp);
+	tmp = fcoe_pf_params->rq_buffer_size;
+	p_data->q_params.rq_buffer_size = cpu_to_le16(tmp);
+
+	if (fcoe_pf_params->is_target) {
+		SET_FIELD(p_data->q_params.q_validity,
+			  SCSI_INIT_FUNC_QUEUES_RQ_VALID, 1);
+		if (p_data->q_params.bdq_pbl_num_entries[BDQ_ID_IMM_DATA])
+			SET_FIELD(p_data->q_params.q_validity,
+				  SCSI_INIT_FUNC_QUEUES_IMM_DATA_VALID, 1);
+		SET_FIELD(p_data->q_params.q_validity,
+			  SCSI_INIT_FUNC_QUEUES_CMD_VALID, 1);
+	} else {
+		SET_FIELD(p_data->q_params.q_validity,
+			  SCSI_INIT_FUNC_QUEUES_RQ_VALID, 1);
+	}
+
+	rc = qed_spq_post(p_hwfn, p_ent, NULL);
+
+	return rc;
+}
+
+static int
+qed_sp_fcoe_conn_offload(struct qed_hwfn *p_hwfn,
+			 struct qed_fcoe_conn *p_conn,
+			 enum spq_mode comp_mode,
+			 struct qed_spq_comp_cb *p_comp_addr)
+{
+	struct fcoe_conn_offload_ramrod_params *p_ramrod = NULL;
+	struct fcoe_conn_offload_ramrod_data *p_data;
+	struct qed_spq_entry *p_ent = NULL;
+	struct qed_sp_init_data init_data;
+	u16 pq_id = 0, tmp;
+	int rc;
+
+	/* Get SPQ entry */
+	memset(&init_data, 0, sizeof(init_data));
+	init_data.cid = p_conn->icid;
+	init_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
+	init_data.comp_mode = comp_mode;
+	init_data.p_comp_data = p_comp_addr;
+
+	rc = qed_sp_init_request(p_hwfn, &p_ent,
+				 FCOE_RAMROD_CMD_ID_OFFLOAD_CONN,
+				 PROTOCOLID_FCOE, &init_data);
+	if (rc)
+		return rc;
+
+	p_ramrod = &p_ent->ramrod.fcoe_conn_ofld;
+	p_data = &p_ramrod->offload_ramrod_data;
+
+	/* Transmission PQ is the first of the PF */
+	pq_id = qed_get_qm_pq(p_hwfn, PROTOCOLID_FCOE, NULL);
+	p_conn->physical_q0 = cpu_to_le16(pq_id);
+	p_data->physical_q0 = cpu_to_le16(pq_id);
+
+	p_data->conn_id = cpu_to_le16(p_conn->conn_id);
+	DMA_REGPAIR_LE(p_data->sq_pbl_addr, p_conn->sq_pbl_addr);
+	DMA_REGPAIR_LE(p_data->sq_curr_page_addr, p_conn->sq_curr_page_addr);
+	DMA_REGPAIR_LE(p_data->sq_next_page_addr, p_conn->sq_next_page_addr);
+	DMA_REGPAIR_LE(p_data->xferq_pbl_addr, p_conn->xferq_pbl_addr);
+	DMA_REGPAIR_LE(p_data->xferq_curr_page_addr, p_conn->xferq_addr[0]);
+	DMA_REGPAIR_LE(p_data->xferq_next_page_addr, p_conn->xferq_addr[1]);
+
+	DMA_REGPAIR_LE(p_data->respq_pbl_addr, p_conn->confq_pbl_addr);
+	DMA_REGPAIR_LE(p_data->respq_curr_page_addr, p_conn->confq_addr[0]);
+	DMA_REGPAIR_LE(p_data->respq_next_page_addr, p_conn->confq_addr[1]);
+
+	p_data->dst_mac_addr_lo = cpu_to_le16(p_conn->dst_mac_addr_lo);
+	p_data->dst_mac_addr_mid = cpu_to_le16(p_conn->dst_mac_addr_mid);
+	p_data->dst_mac_addr_hi = cpu_to_le16(p_conn->dst_mac_addr_hi);
+	p_data->src_mac_addr_lo = cpu_to_le16(p_conn->src_mac_addr_lo);
+	p_data->src_mac_addr_mid = cpu_to_le16(p_conn->src_mac_addr_mid);
+	p_data->src_mac_addr_hi = cpu_to_le16(p_conn->src_mac_addr_hi);
+
+	tmp = cpu_to_le16(p_conn->tx_max_fc_pay_len);
+	p_data->tx_max_fc_pay_len = tmp;
+	tmp = cpu_to_le16(p_conn->e_d_tov_timer_val);
+	p_data->e_d_tov_timer_val = tmp;
+	tmp = cpu_to_le16(p_conn->rec_tov_timer_val);
+	p_data->rec_rr_tov_timer_val = tmp;
+	tmp = cpu_to_le16(p_conn->rx_max_fc_pay_len);
+	p_data->rx_max_fc_pay_len = tmp;
+
+	p_data->vlan_tag = cpu_to_le16(p_conn->vlan_tag);
+	p_data->s_id.addr_hi = p_conn->s_id.addr_hi;
+	p_data->s_id.addr_mid = p_conn->s_id.addr_mid;
+	p_data->s_id.addr_lo = p_conn->s_id.addr_lo;
+	p_data->max_conc_seqs_c3 = p_conn->max_conc_seqs_c3;
+	p_data->d_id.addr_hi = p_conn->d_id.addr_hi;
+	p_data->d_id.addr_mid = p_conn->d_id.addr_mid;
+	p_data->d_id.addr_lo = p_conn->d_id.addr_lo;
+	p_data->flags = p_conn->flags;
+	p_data->def_q_idx = p_conn->def_q_idx;
+
+	return qed_spq_post(p_hwfn, p_ent, NULL);
+}
+
+static int
+qed_sp_fcoe_conn_destroy(struct qed_hwfn *p_hwfn,
+			 struct qed_fcoe_conn *p_conn,
+			 enum spq_mode comp_mode,
+			 struct qed_spq_comp_cb *p_comp_addr)
+{
+	struct fcoe_conn_terminate_ramrod_params *p_ramrod = NULL;
+	struct qed_spq_entry *p_ent = NULL;
+	struct qed_sp_init_data init_data;
+	int rc = 0;
+
+	/* Get SPQ entry */
+	memset(&init_data, 0, sizeof(init_data));
+	init_data.cid = p_conn->icid;
+	init_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
+	init_data.comp_mode = comp_mode;
+	init_data.p_comp_data = p_comp_addr;
+
+	rc = qed_sp_init_request(p_hwfn, &p_ent,
+				 FCOE_RAMROD_CMD_ID_TERMINATE_CONN,
+				 PROTOCOLID_FCOE, &init_data);
+	if (rc)
+		return rc;
+
+	p_ramrod = &p_ent->ramrod.fcoe_conn_terminate;
+	DMA_REGPAIR_LE(p_ramrod->terminate_ramrod_data.terminate_params_addr,
+		       p_conn->terminate_params);
+
+	return qed_spq_post(p_hwfn, p_ent, NULL);
+}
+
+static int
+qed_sp_fcoe_func_stop(struct qed_hwfn *p_hwfn,
+		      enum spq_mode comp_mode,
+		      struct qed_spq_comp_cb *p_comp_addr)
+{
+	struct qed_ptt *p_ptt = p_hwfn->p_main_ptt;
+	struct qed_spq_entry *p_ent = NULL;
+	struct qed_sp_init_data init_data;
+	u32 active_segs = 0;
+	int rc = 0;
+
+	/* Get SPQ entry */
+	memset(&init_data, 0, sizeof(init_data));
+	init_data.cid = p_hwfn->pf_params.fcoe_pf_params.dummy_icid;
+	init_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
+	init_data.comp_mode = comp_mode;
+	init_data.p_comp_data = p_comp_addr;
+
+	rc = qed_sp_init_request(p_hwfn, &p_ent,
+				 FCOE_RAMROD_CMD_ID_DESTROY_FUNC,
+				 PROTOCOLID_FCOE, &init_data);
+	if (rc)
+		return rc;
+
+	active_segs = qed_rd(p_hwfn, p_ptt, TM_REG_PF_ENABLE_TASK);
+	active_segs &= ~BIT(QED_CXT_FCOE_TID_SEG);
+	qed_wr(p_hwfn, p_ptt, TM_REG_PF_ENABLE_TASK, active_segs);
+
+	return qed_spq_post(p_hwfn, p_ent, NULL);
+}
+
+static int
+qed_fcoe_allocate_connection(struct qed_hwfn *p_hwfn,
+			     struct qed_fcoe_conn **p_out_conn)
+{
+	struct qed_fcoe_conn *p_conn = NULL;
+	void *p_addr;
+	u32 i;
+
+	spin_lock_bh(&p_hwfn->p_fcoe_info->lock);
+	if (!list_empty(&p_hwfn->p_fcoe_info->free_list))
+		p_conn =
+		    list_first_entry(&p_hwfn->p_fcoe_info->free_list,
+				     struct qed_fcoe_conn, list_entry);
+	if (p_conn) {
+		list_del(&p_conn->list_entry);
+		spin_unlock_bh(&p_hwfn->p_fcoe_info->lock);
+		*p_out_conn = p_conn;
+		return 0;
+	}
+	spin_unlock_bh(&p_hwfn->p_fcoe_info->lock);
+
+	p_conn = kzalloc(sizeof(*p_conn), GFP_KERNEL);
+	if (!p_conn)
+		return -ENOMEM;
+
+	p_addr = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev,
+				    QED_CHAIN_PAGE_SIZE,
+				    &p_conn->xferq_pbl_addr, GFP_KERNEL);
+	if (!p_addr)
+		goto nomem_pbl_xferq;
+	p_conn->xferq_pbl_addr_virt_addr = p_addr;
+
+	for (i = 0; i < ARRAY_SIZE(p_conn->xferq_addr); i++) {
+		p_addr = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev,
+					    QED_CHAIN_PAGE_SIZE,
+					    &p_conn->xferq_addr[i], GFP_KERNEL);
+		if (!p_addr)
+			goto nomem_xferq;
+		p_conn->xferq_addr_virt_addr[i] = p_addr;
+
+		p_addr = p_conn->xferq_pbl_addr_virt_addr;
+		((dma_addr_t *)p_addr)[i] = p_conn->xferq_addr[i];
+	}
+
+	p_addr = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev,
+				    QED_CHAIN_PAGE_SIZE,
+				    &p_conn->confq_pbl_addr, GFP_KERNEL);
+	if (!p_addr)
+		goto nomem_xferq;
+	p_conn->confq_pbl_addr_virt_addr = p_addr;
+
+	for (i = 0; i < ARRAY_SIZE(p_conn->confq_addr); i++) {
+		p_addr = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev,
+					    QED_CHAIN_PAGE_SIZE,
+					    &p_conn->confq_addr[i], GFP_KERNEL);
+		if (!p_addr)
+			goto nomem_confq;
+		p_conn->confq_addr_virt_addr[i] = p_addr;
+
+		p_addr = p_conn->confq_pbl_addr_virt_addr;
+		((dma_addr_t *)p_addr)[i] = p_conn->confq_addr[i];
+	}
+
+	p_conn->free_on_delete = true;
+	*p_out_conn = p_conn;
+	return 0;
+
+nomem_confq:
+	dma_free_coherent(&p_hwfn->cdev->pdev->dev,
+			  QED_CHAIN_PAGE_SIZE,
+			  p_conn->confq_pbl_addr_virt_addr,
+			  p_conn->confq_pbl_addr);
+	for (i = 0; i < ARRAY_SIZE(p_conn->confq_addr); i++)
+		if (p_conn->confq_addr_virt_addr[i])
+			dma_free_coherent(&p_hwfn->cdev->pdev->dev,
+					  QED_CHAIN_PAGE_SIZE,
+					  p_conn->confq_addr_virt_addr[i],
+					  p_conn->confq_addr[i]);
+nomem_xferq:
+	dma_free_coherent(&p_hwfn->cdev->pdev->dev,
+			  QED_CHAIN_PAGE_SIZE,
+			  p_conn->xferq_pbl_addr_virt_addr,
+			  p_conn->xferq_pbl_addr);
+	for (i = 0; i < ARRAY_SIZE(p_conn->xferq_addr); i++)
+		if (p_conn->xferq_addr_virt_addr[i])
+			dma_free_coherent(&p_hwfn->cdev->pdev->dev,
+					  QED_CHAIN_PAGE_SIZE,
+					  p_conn->xferq_addr_virt_addr[i],
+					  p_conn->xferq_addr[i]);
+nomem_pbl_xferq:
+	kfree(p_conn);
+	return -ENOMEM;
+}
+
+static void qed_fcoe_free_connection(struct qed_hwfn *p_hwfn,
+				     struct qed_fcoe_conn *p_conn)
+{
+	u32 i;
+
+	if (!p_conn)
+		return;
+
+	if (p_conn->confq_pbl_addr_virt_addr)
+		dma_free_coherent(&p_hwfn->cdev->pdev->dev,
+				  QED_CHAIN_PAGE_SIZE,
+				  p_conn->confq_pbl_addr_virt_addr,
+				  p_conn->confq_pbl_addr);
+
+	for (i = 0; i < ARRAY_SIZE(p_conn->confq_addr); i++) {
+		if (!p_conn->confq_addr_virt_addr[i])
+			continue;
+		dma_free_coherent(&p_hwfn->cdev->pdev->dev,
+				  QED_CHAIN_PAGE_SIZE,
+				  p_conn->confq_addr_virt_addr[i],
+				  p_conn->confq_addr[i]);
+	}
+
+	if (p_conn->xferq_pbl_addr_virt_addr)
+		dma_free_coherent(&p_hwfn->cdev->pdev->dev,
+				  QED_CHAIN_PAGE_SIZE,
+				  p_conn->xferq_pbl_addr_virt_addr,
+				  p_conn->xferq_pbl_addr);
+
+	for (i = 0; i < ARRAY_SIZE(p_conn->xferq_addr); i++) {
+		if (!p_conn->xferq_addr_virt_addr[i])
+			continue;
+		dma_free_coherent(&p_hwfn->cdev->pdev->dev,
+				  QED_CHAIN_PAGE_SIZE,
+				  p_conn->xferq_addr_virt_addr[i],
+				  p_conn->xferq_addr[i]);
+	}
+	kfree(p_conn);
+}
+
+static void __iomem *qed_fcoe_get_db_addr(struct qed_hwfn *p_hwfn, u32 cid)
+{
+	return (u8 __iomem *)p_hwfn->doorbells +
+	       qed_db_addr(cid, DQ_DEMS_LEGACY);
+}
+
+static void __iomem *qed_fcoe_get_primary_bdq_prod(struct qed_hwfn *p_hwfn,
+						   u8 bdq_id)
+{
+	u8 bdq_function_id = FCOE_BDQ_ID(p_hwfn->port_id);
+
+	return (u8 __iomem *)p_hwfn->regview + GTT_BAR0_MAP_REG_MSDM_RAM +
+	       MSTORM_SCSI_BDQ_EXT_PROD_OFFSET(bdq_function_id, bdq_id);
+}
+
+static void __iomem *qed_fcoe_get_secondary_bdq_prod(struct qed_hwfn *p_hwfn,
+						     u8 bdq_id)
+{
+	u8 bdq_function_id = FCOE_BDQ_ID(p_hwfn->port_id);
+
+	return (u8 __iomem *)p_hwfn->regview + GTT_BAR0_MAP_REG_TSDM_RAM +
+	       TSTORM_SCSI_BDQ_EXT_PROD_OFFSET(bdq_function_id, bdq_id);
+}
+
+struct qed_fcoe_info *qed_fcoe_alloc(struct qed_hwfn *p_hwfn)
+{
+	struct qed_fcoe_info *p_fcoe_info;
+
+	/* Allocate LL2's set struct */
+	p_fcoe_info = kzalloc(sizeof(*p_fcoe_info), GFP_KERNEL);
+	if (!p_fcoe_info) {
+		DP_NOTICE(p_hwfn, "Failed to allocate qed_fcoe_info'\n");
+		return NULL;
+	}
+	INIT_LIST_HEAD(&p_fcoe_info->free_list);
+	return p_fcoe_info;
+}
+
+void qed_fcoe_setup(struct qed_hwfn *p_hwfn, struct qed_fcoe_info *p_fcoe_info)
+{
+	struct fcoe_task_context *p_task_ctx = NULL;
+	int rc;
+	u32 i;
+
+	spin_lock_init(&p_fcoe_info->lock);
+	for (i = 0; i < p_hwfn->pf_params.fcoe_pf_params.num_tasks; i++) {
+		rc = qed_cxt_get_task_ctx(p_hwfn, i,
+					  QED_CTX_WORKING_MEM,
+					  (void **)&p_task_ctx);
+		if (rc)
+			continue;
+
+		memset(p_task_ctx, 0, sizeof(struct fcoe_task_context));
+		SET_FIELD(p_task_ctx->timer_context.logical_client_0,
+			  TIMERS_CONTEXT_VALIDLC0, 1);
+		SET_FIELD(p_task_ctx->timer_context.logical_client_1,
+			  TIMERS_CONTEXT_VALIDLC1, 1);
+		SET_FIELD(p_task_ctx->tstorm_ag_context.flags0,
+			  TSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE, 1);
+	}
+}
+
+void qed_fcoe_free(struct qed_hwfn *p_hwfn, struct qed_fcoe_info *p_fcoe_info)
+{
+	struct qed_fcoe_conn *p_conn = NULL;
+
+	if (!p_fcoe_info)
+		return;
+
+	while (!list_empty(&p_fcoe_info->free_list)) {
+		p_conn = list_first_entry(&p_fcoe_info->free_list,
+					  struct qed_fcoe_conn, list_entry);
+		if (!p_conn)
+			break;
+		list_del(&p_conn->list_entry);
+		qed_fcoe_free_connection(p_hwfn, p_conn);
+	}
+
+	kfree(p_fcoe_info);
+}
+
+static int
+qed_fcoe_acquire_connection(struct qed_hwfn *p_hwfn,
+			    struct qed_fcoe_conn *p_in_conn,
+			    struct qed_fcoe_conn **p_out_conn)
+{
+	struct qed_fcoe_conn *p_conn = NULL;
+	int rc = 0;
+	u32 icid;
+
+	spin_lock_bh(&p_hwfn->p_fcoe_info->lock);
+	rc = qed_cxt_acquire_cid(p_hwfn, PROTOCOLID_FCOE, &icid);
+	spin_unlock_bh(&p_hwfn->p_fcoe_info->lock);
+	if (rc)
+		return rc;
+
+	/* Use input connection [if provided] or allocate a new one */
+	if (p_in_conn) {
+		p_conn = p_in_conn;
+	} else {
+		rc = qed_fcoe_allocate_connection(p_hwfn, &p_conn);
+		if (rc) {
+			spin_lock_bh(&p_hwfn->p_fcoe_info->lock);
+			qed_cxt_release_cid(p_hwfn, icid);
+			spin_unlock_bh(&p_hwfn->p_fcoe_info->lock);
+			return rc;
+		}
+	}
+
+	p_conn->icid = icid;
+	p_conn->fw_cid = (p_hwfn->hw_info.opaque_fid << 16) | icid;
+	*p_out_conn = p_conn;
+
+	return rc;
+}
+
+static void qed_fcoe_release_connection(struct qed_hwfn *p_hwfn,
+					struct qed_fcoe_conn *p_conn)
+{
+	spin_lock_bh(&p_hwfn->p_fcoe_info->lock);
+	list_add_tail(&p_conn->list_entry, &p_hwfn->p_fcoe_info->free_list);
+	qed_cxt_release_cid(p_hwfn, p_conn->icid);
+	spin_unlock_bh(&p_hwfn->p_fcoe_info->lock);
+}
+
+static void _qed_fcoe_get_tstats(struct qed_hwfn *p_hwfn,
+				 struct qed_ptt *p_ptt,
+				 struct qed_fcoe_stats *p_stats)
+{
+	struct fcoe_rx_stat tstats;
+	u32 tstats_addr;
+
+	memset(&tstats, 0, sizeof(tstats));
+	tstats_addr = BAR0_MAP_REG_TSDM_RAM +
+	    TSTORM_FCOE_RX_STATS_OFFSET(p_hwfn->rel_pf_id);
+	qed_memcpy_from(p_hwfn, p_ptt, &tstats, tstats_addr, sizeof(tstats));
+
+	p_stats->fcoe_rx_byte_cnt = HILO_64_REGPAIR(tstats.fcoe_rx_byte_cnt);
+	p_stats->fcoe_rx_data_pkt_cnt =
+	    HILO_64_REGPAIR(tstats.fcoe_rx_data_pkt_cnt);
+	p_stats->fcoe_rx_xfer_pkt_cnt =
+	    HILO_64_REGPAIR(tstats.fcoe_rx_xfer_pkt_cnt);
+	p_stats->fcoe_rx_other_pkt_cnt =
+	    HILO_64_REGPAIR(tstats.fcoe_rx_other_pkt_cnt);
+
+	p_stats->fcoe_silent_drop_pkt_cmdq_full_cnt =
+	    le32_to_cpu(tstats.fcoe_silent_drop_pkt_cmdq_full_cnt);
+	p_stats->fcoe_silent_drop_pkt_rq_full_cnt =
+	    le32_to_cpu(tstats.fcoe_silent_drop_pkt_rq_full_cnt);
+	p_stats->fcoe_silent_drop_pkt_crc_error_cnt =
+	    le32_to_cpu(tstats.fcoe_silent_drop_pkt_crc_error_cnt);
+	p_stats->fcoe_silent_drop_pkt_task_invalid_cnt =
+	    le32_to_cpu(tstats.fcoe_silent_drop_pkt_task_invalid_cnt);
+	p_stats->fcoe_silent_drop_total_pkt_cnt =
+	    le32_to_cpu(tstats.fcoe_silent_drop_total_pkt_cnt);
+}
+
+static void _qed_fcoe_get_pstats(struct qed_hwfn *p_hwfn,
+				 struct qed_ptt *p_ptt,
+				 struct qed_fcoe_stats *p_stats)
+{
+	struct fcoe_tx_stat pstats;
+	u32 pstats_addr;
+
+	memset(&pstats, 0, sizeof(pstats));
+	pstats_addr = BAR0_MAP_REG_PSDM_RAM +
+	    PSTORM_FCOE_TX_STATS_OFFSET(p_hwfn->rel_pf_id);
+	qed_memcpy_from(p_hwfn, p_ptt, &pstats, pstats_addr, sizeof(pstats));
+
+	p_stats->fcoe_tx_byte_cnt = HILO_64_REGPAIR(pstats.fcoe_tx_byte_cnt);
+	p_stats->fcoe_tx_data_pkt_cnt =
+	    HILO_64_REGPAIR(pstats.fcoe_tx_data_pkt_cnt);
+	p_stats->fcoe_tx_xfer_pkt_cnt =
+	    HILO_64_REGPAIR(pstats.fcoe_tx_xfer_pkt_cnt);
+	p_stats->fcoe_tx_other_pkt_cnt =
+	    HILO_64_REGPAIR(pstats.fcoe_tx_other_pkt_cnt);
+}
+
+static int qed_fcoe_get_stats(struct qed_hwfn *p_hwfn,
+			      struct qed_fcoe_stats *p_stats)
+{
+	struct qed_ptt *p_ptt;
+
+	memset(p_stats, 0, sizeof(*p_stats));
+
+	p_ptt = qed_ptt_acquire(p_hwfn);
+
+	if (!p_ptt) {
+		DP_ERR(p_hwfn, "Failed to acquire ptt\n");
+		return -EINVAL;
+	}
+
+	_qed_fcoe_get_tstats(p_hwfn, p_ptt, p_stats);
+	_qed_fcoe_get_pstats(p_hwfn, p_ptt, p_stats);
+
+	qed_ptt_release(p_hwfn, p_ptt);
+
+	return 0;
+}
+
+struct qed_hash_fcoe_con {
+	struct hlist_node node;
+	struct qed_fcoe_conn *con;
+};
+
+static int qed_fill_fcoe_dev_info(struct qed_dev *cdev,
+				  struct qed_dev_fcoe_info *info)
+{
+	struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev);
+	int rc;
+
+	memset(info, 0, sizeof(*info));
+	rc = qed_fill_dev_info(cdev, &info->common);
+
+	info->primary_dbq_rq_addr =
+	    qed_fcoe_get_primary_bdq_prod(hwfn, BDQ_ID_RQ);
+	info->secondary_bdq_rq_addr =
+	    qed_fcoe_get_secondary_bdq_prod(hwfn, BDQ_ID_RQ);
+
+	return rc;
+}
+
+static void qed_register_fcoe_ops(struct qed_dev *cdev,
+				  struct qed_fcoe_cb_ops *ops, void *cookie)
+{
+	cdev->protocol_ops.fcoe = ops;
+	cdev->ops_cookie = cookie;
+}
+
+static struct qed_hash_fcoe_con *qed_fcoe_get_hash(struct qed_dev *cdev,
+						   u32 handle)
+{
+	struct qed_hash_fcoe_con *hash_con = NULL;
+
+	if (!(cdev->flags & QED_FLAG_STORAGE_STARTED))
+		return NULL;
+
+	hash_for_each_possible(cdev->connections, hash_con, node, handle) {
+		if (hash_con->con->icid == handle)
+			break;
+	}
+
+	if (!hash_con || (hash_con->con->icid != handle))
+		return NULL;
+
+	return hash_con;
+}
+
+static int qed_fcoe_stop(struct qed_dev *cdev)
+{
+	int rc;
+
+	if (!(cdev->flags & QED_FLAG_STORAGE_STARTED)) {
+		DP_NOTICE(cdev, "fcoe already stopped\n");
+		return 0;
+	}
+
+	if (!hash_empty(cdev->connections)) {
+		DP_NOTICE(cdev,
+			  "Can't stop fcoe - not all connections were returned\n");
+		return -EINVAL;
+	}
+
+	/* Stop the fcoe */
+	rc = qed_sp_fcoe_func_stop(QED_LEADING_HWFN(cdev),
+				   QED_SPQ_MODE_EBLOCK, NULL);
+	cdev->flags &= ~QED_FLAG_STORAGE_STARTED;
+
+	return rc;
+}
+
+static int qed_fcoe_start(struct qed_dev *cdev, struct qed_fcoe_tid *tasks)
+{
+	int rc;
+
+	if (cdev->flags & QED_FLAG_STORAGE_STARTED) {
+		DP_NOTICE(cdev, "fcoe already started;\n");
+		return 0;
+	}
+
+	rc = qed_sp_fcoe_func_start(QED_LEADING_HWFN(cdev),
+				    QED_SPQ_MODE_EBLOCK, NULL);
+	if (rc) {
+		DP_NOTICE(cdev, "Failed to start fcoe\n");
+		return rc;
+	}
+
+	cdev->flags |= QED_FLAG_STORAGE_STARTED;
+	hash_init(cdev->connections);
+
+	if (tasks) {
+		struct qed_tid_mem *tid_info = kzalloc(sizeof(*tid_info),
+						       GFP_ATOMIC);
+
+		if (!tid_info) {
+			DP_NOTICE(cdev,
+				  "Failed to allocate tasks information\n");
+			qed_fcoe_stop(cdev);
+			return -ENOMEM;
+		}
+
+		rc = qed_cxt_get_tid_mem_info(QED_LEADING_HWFN(cdev), tid_info);
+		if (rc) {
+			DP_NOTICE(cdev, "Failed to gather task information\n");
+			qed_fcoe_stop(cdev);
+			kfree(tid_info);
+			return rc;
+		}
+
+		/* Fill task information */
+		tasks->size = tid_info->tid_size;
+		tasks->num_tids_per_block = tid_info->num_tids_per_block;
+		memcpy(tasks->blocks, tid_info->blocks,
+		       MAX_TID_BLOCKS_FCOE * sizeof(u8 *));
+
+		kfree(tid_info);
+	}
+
+	return 0;
+}
+
+static int qed_fcoe_acquire_conn(struct qed_dev *cdev,
+				 u32 *handle,
+				 u32 *fw_cid, void __iomem **p_doorbell)
+{
+	struct qed_hash_fcoe_con *hash_con;
+	int rc;
+
+	/* Allocate a hashed connection */
+	hash_con = kzalloc(sizeof(*hash_con), GFP_KERNEL);
+	if (!hash_con) {
+		DP_NOTICE(cdev, "Failed to allocate hashed connection\n");
+		return -ENOMEM;
+	}
+
+	/* Acquire the connection */
+	rc = qed_fcoe_acquire_connection(QED_LEADING_HWFN(cdev), NULL,
+					 &hash_con->con);
+	if (rc) {
+		DP_NOTICE(cdev, "Failed to acquire Connection\n");
+		kfree(hash_con);
+		return rc;
+	}
+
+	/* Added the connection to hash table */
+	*handle = hash_con->con->icid;
+	*fw_cid = hash_con->con->fw_cid;
+	hash_add(cdev->connections, &hash_con->node, *handle);
+
+	if (p_doorbell)
+		*p_doorbell = qed_fcoe_get_db_addr(QED_LEADING_HWFN(cdev),
+						   *handle);
+
+	return 0;
+}
+
+static int qed_fcoe_release_conn(struct qed_dev *cdev, u32 handle)
+{
+	struct qed_hash_fcoe_con *hash_con;
+
+	hash_con = qed_fcoe_get_hash(cdev, handle);
+	if (!hash_con) {
+		DP_NOTICE(cdev, "Failed to find connection for handle %d\n",
+			  handle);
+		return -EINVAL;
+	}
+
+	hlist_del(&hash_con->node);
+	qed_fcoe_release_connection(QED_LEADING_HWFN(cdev), hash_con->con);
+	kfree(hash_con);
+
+	return 0;
+}
+
+static int qed_fcoe_offload_conn(struct qed_dev *cdev,
+				 u32 handle,
+				 struct qed_fcoe_params_offload *conn_info)
+{
+	struct qed_hash_fcoe_con *hash_con;
+	struct qed_fcoe_conn *con;
+
+	hash_con = qed_fcoe_get_hash(cdev, handle);
+	if (!hash_con) {
+		DP_NOTICE(cdev, "Failed to find connection for handle %d\n",
+			  handle);
+		return -EINVAL;
+	}
+
+	/* Update the connection with information from the params */
+	con = hash_con->con;
+
+	con->sq_pbl_addr = conn_info->sq_pbl_addr;
+	con->sq_curr_page_addr = conn_info->sq_curr_page_addr;
+	con->sq_next_page_addr = conn_info->sq_next_page_addr;
+	con->tx_max_fc_pay_len = conn_info->tx_max_fc_pay_len;
+	con->e_d_tov_timer_val = conn_info->e_d_tov_timer_val;
+	con->rec_tov_timer_val = conn_info->rec_tov_timer_val;
+	con->rx_max_fc_pay_len = conn_info->rx_max_fc_pay_len;
+	con->vlan_tag = conn_info->vlan_tag;
+	con->max_conc_seqs_c3 = conn_info->max_conc_seqs_c3;
+	con->flags = conn_info->flags;
+	con->def_q_idx = conn_info->def_q_idx;
+
+	con->src_mac_addr_hi = (conn_info->src_mac[5] << 8) |
+	    conn_info->src_mac[4];
+	con->src_mac_addr_mid = (conn_info->src_mac[3] << 8) |
+	    conn_info->src_mac[2];
+	con->src_mac_addr_lo = (conn_info->src_mac[1] << 8) |
+	    conn_info->src_mac[0];
+	con->dst_mac_addr_hi = (conn_info->dst_mac[5] << 8) |
+	    conn_info->dst_mac[4];
+	con->dst_mac_addr_mid = (conn_info->dst_mac[3] << 8) |
+	    conn_info->dst_mac[2];
+	con->dst_mac_addr_lo = (conn_info->dst_mac[1] << 8) |
+	    conn_info->dst_mac[0];
+
+	con->s_id.addr_hi = conn_info->s_id.addr_hi;
+	con->s_id.addr_mid = conn_info->s_id.addr_mid;
+	con->s_id.addr_lo = conn_info->s_id.addr_lo;
+	con->d_id.addr_hi = conn_info->d_id.addr_hi;
+	con->d_id.addr_mid = conn_info->d_id.addr_mid;
+	con->d_id.addr_lo = conn_info->d_id.addr_lo;
+
+	return qed_sp_fcoe_conn_offload(QED_LEADING_HWFN(cdev), con,
+					QED_SPQ_MODE_EBLOCK, NULL);
+}
+
+static int qed_fcoe_destroy_conn(struct qed_dev *cdev,
+				 u32 handle, dma_addr_t terminate_params)
+{
+	struct qed_hash_fcoe_con *hash_con;
+	struct qed_fcoe_conn *con;
+
+	hash_con = qed_fcoe_get_hash(cdev, handle);
+	if (!hash_con) {
+		DP_NOTICE(cdev, "Failed to find connection for handle %d\n",
+			  handle);
+		return -EINVAL;
+	}
+
+	/* Update the connection with information from the params */
+	con = hash_con->con;
+	con->terminate_params = terminate_params;
+
+	return qed_sp_fcoe_conn_destroy(QED_LEADING_HWFN(cdev), con,
+					QED_SPQ_MODE_EBLOCK, NULL);
+}
+
+static int qed_fcoe_stats(struct qed_dev *cdev, struct qed_fcoe_stats *stats)
+{
+	return qed_fcoe_get_stats(QED_LEADING_HWFN(cdev), stats);
+}
+
+void qed_get_protocol_stats_fcoe(struct qed_dev *cdev,
+				 struct qed_mcp_fcoe_stats *stats)
+{
+	struct qed_fcoe_stats proto_stats;
+
+	/* Retrieve FW statistics */
+	memset(&proto_stats, 0, sizeof(proto_stats));
+	if (qed_fcoe_stats(cdev, &proto_stats)) {
+		DP_VERBOSE(cdev, QED_MSG_STORAGE,
+			   "Failed to collect FCoE statistics\n");
+		return;
+	}
+
+	/* Translate FW statistics into struct */
+	stats->rx_pkts = proto_stats.fcoe_rx_data_pkt_cnt +
+			 proto_stats.fcoe_rx_xfer_pkt_cnt +
+			 proto_stats.fcoe_rx_other_pkt_cnt;
+	stats->tx_pkts = proto_stats.fcoe_tx_data_pkt_cnt +
+			 proto_stats.fcoe_tx_xfer_pkt_cnt +
+			 proto_stats.fcoe_tx_other_pkt_cnt;
+	stats->fcs_err = proto_stats.fcoe_silent_drop_pkt_crc_error_cnt;
+
+	/* Request protocol driver to fill-in the rest */
+	if (cdev->protocol_ops.fcoe && cdev->ops_cookie) {
+		struct qed_fcoe_cb_ops *ops = cdev->protocol_ops.fcoe;
+		void *cookie = cdev->ops_cookie;
+
+		if (ops->get_login_failures)
+			stats->login_failure = ops->get_login_failures(cookie);
+	}
+}
+
+static const struct qed_fcoe_ops qed_fcoe_ops_pass = {
+	.common = &qed_common_ops_pass,
+	.ll2 = &qed_ll2_ops_pass,
+	.fill_dev_info = &qed_fill_fcoe_dev_info,
+	.start = &qed_fcoe_start,
+	.stop = &qed_fcoe_stop,
+	.register_ops = &qed_register_fcoe_ops,
+	.acquire_conn = &qed_fcoe_acquire_conn,
+	.release_conn = &qed_fcoe_release_conn,
+	.offload_conn = &qed_fcoe_offload_conn,
+	.destroy_conn = &qed_fcoe_destroy_conn,
+	.get_stats = &qed_fcoe_stats,
+};
+
+const struct qed_fcoe_ops *qed_get_fcoe_ops(void)
+{
+	return &qed_fcoe_ops_pass;
+}
+EXPORT_SYMBOL(qed_get_fcoe_ops);
+
+void qed_put_fcoe_ops(void)
+{
+}
+EXPORT_SYMBOL(qed_put_fcoe_ops);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_fcoe.h b/drivers/net/ethernet/qlogic/qed/qed_fcoe.h
new file mode 100644
index 000000000000..472af34a171d
--- /dev/null
+++ b/drivers/net/ethernet/qlogic/qed/qed_fcoe.h
@@ -0,0 +1,87 @@
+/* QLogic qed NIC Driver
+ * Copyright (c) 2015-2017  QLogic Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _QED_FCOE_H
+#define _QED_FCOE_H
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/qed/qed_fcoe_if.h>
+#include <linux/qed/qed_chain.h>
+#include "qed.h"
+#include "qed_hsi.h"
+#include "qed_mcp.h"
+#include "qed_sp.h"
+
+struct qed_fcoe_info {
+	spinlock_t lock; /* Connection resources. */
+	struct list_head free_list;
+};
+
+#if IS_ENABLED(CONFIG_QED_FCOE)
+struct qed_fcoe_info *qed_fcoe_alloc(struct qed_hwfn *p_hwfn);
+
+void qed_fcoe_setup(struct qed_hwfn *p_hwfn, struct qed_fcoe_info *p_fcoe_info);
+
+void qed_fcoe_free(struct qed_hwfn *p_hwfn, struct qed_fcoe_info *p_fcoe_info);
+void qed_get_protocol_stats_fcoe(struct qed_dev *cdev,
+				 struct qed_mcp_fcoe_stats *stats);
+#else /* CONFIG_QED_FCOE */
+static inline struct qed_fcoe_info *
+qed_fcoe_alloc(struct qed_hwfn *p_hwfn)
+{
+	return NULL;
+}
+
+static inline void qed_fcoe_setup(struct qed_hwfn *p_hwfn,
+				  struct qed_fcoe_info *p_fcoe_info)
+{
+}
+
+static inline void qed_fcoe_free(struct qed_hwfn *p_hwfn,
+				 struct qed_fcoe_info *p_fcoe_info)
+{
+}
+
+static inline void qed_get_protocol_stats_fcoe(struct qed_dev *cdev,
+					       struct qed_mcp_fcoe_stats *stats)
+{
+}
+#endif /* CONFIG_QED_FCOE */
+
+#ifdef CONFIG_QED_LL2
+extern const struct qed_common_ops qed_common_ops_pass;
+extern const struct qed_ll2_ops qed_ll2_ops_pass;
+#endif
+
+#endif /* _QED_FCOE_H */
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index 5d31189288e8..37c2bfb663bb 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -43,10 +43,12 @@
 #include <linux/qed/common_hsi.h>
 #include <linux/qed/storage_common.h>
 #include <linux/qed/tcp_common.h>
+#include <linux/qed/fcoe_common.h>
 #include <linux/qed/eth_common.h>
 #include <linux/qed/iscsi_common.h>
 #include <linux/qed/rdma_common.h>
 #include <linux/qed/roce_common.h>
+#include <linux/qed/qed_fcoe_if.h>
 
 struct qed_hwfn;
 struct qed_ptt;
@@ -937,7 +939,7 @@ struct mstorm_vf_zone {
 enum personality_type {
 	BAD_PERSONALITY_TYP,
 	PERSONALITY_ISCSI,
-	PERSONALITY_RESERVED2,
+	PERSONALITY_FCOE,
 	PERSONALITY_RDMA_AND_ETH,
 	PERSONALITY_RESERVED3,
 	PERSONALITY_CORE,
@@ -3473,6 +3475,10 @@ void qed_set_geneve_enable(struct qed_hwfn *p_hwfn,
 #define TSTORM_RDMA_QUEUE_STAT_OFFSET(rdma_stat_counter_id) \
 	(IRO[46].base +	((rdma_stat_counter_id) * IRO[46].m1))
 #define TSTORM_RDMA_QUEUE_STAT_SIZE				(IRO[46].size)
+#define TSTORM_FCOE_RX_STATS_OFFSET(pf_id) \
+	(IRO[43].base +	((pf_id) * IRO[43].m1))
+#define PSTORM_FCOE_TX_STATS_OFFSET(pf_id) \
+	(IRO[44].base + ((pf_id) * IRO[44].m1))
 
 static const struct iro iro_arr[47] = {
 	{0x0, 0x0, 0x0, 0x0, 0x8},
@@ -7407,6 +7413,769 @@ struct ystorm_roce_resp_conn_ag_ctx {
 	__le32 reg3;
 };
 
+struct ystorm_fcoe_conn_st_ctx {
+	u8 func_mode;
+	u8 cos;
+	u8 conf_version;
+	u8 eth_hdr_size;
+	__le16 stat_ram_addr;
+	__le16 mtu;
+	__le16 max_fc_payload_len;
+	__le16 tx_max_fc_pay_len;
+	u8 fcp_cmd_size;
+	u8 fcp_rsp_size;
+	__le16 mss;
+	struct regpair reserved;
+	u8 protection_info_flags;
+#define YSTORM_FCOE_CONN_ST_CTX_SUPPORT_PROTECTION_MASK  0x1
+#define YSTORM_FCOE_CONN_ST_CTX_SUPPORT_PROTECTION_SHIFT 0
+#define YSTORM_FCOE_CONN_ST_CTX_VALID_MASK               0x1
+#define YSTORM_FCOE_CONN_ST_CTX_VALID_SHIFT              1
+#define YSTORM_FCOE_CONN_ST_CTX_RESERVED1_MASK           0x3F
+#define YSTORM_FCOE_CONN_ST_CTX_RESERVED1_SHIFT          2
+	u8 dst_protection_per_mss;
+	u8 src_protection_per_mss;
+	u8 ptu_log_page_size;
+	u8 flags;
+#define YSTORM_FCOE_CONN_ST_CTX_INNER_VLAN_FLAG_MASK     0x1
+#define YSTORM_FCOE_CONN_ST_CTX_INNER_VLAN_FLAG_SHIFT    0
+#define YSTORM_FCOE_CONN_ST_CTX_OUTER_VLAN_FLAG_MASK     0x1
+#define YSTORM_FCOE_CONN_ST_CTX_OUTER_VLAN_FLAG_SHIFT    1
+#define YSTORM_FCOE_CONN_ST_CTX_RSRV_MASK                0x3F
+#define YSTORM_FCOE_CONN_ST_CTX_RSRV_SHIFT               2
+	u8 fcp_xfer_size;
+	u8 reserved3[2];
+};
+
+struct fcoe_vlan_fields {
+	__le16 fields;
+#define FCOE_VLAN_FIELDS_VID_MASK  0xFFF
+#define FCOE_VLAN_FIELDS_VID_SHIFT 0
+#define FCOE_VLAN_FIELDS_CLI_MASK  0x1
+#define FCOE_VLAN_FIELDS_CLI_SHIFT 12
+#define FCOE_VLAN_FIELDS_PRI_MASK  0x7
+#define FCOE_VLAN_FIELDS_PRI_SHIFT 13
+};
+
+union fcoe_vlan_field_union {
+	struct fcoe_vlan_fields fields;
+	__le16 val;
+};
+
+union fcoe_vlan_vif_field_union {
+	union fcoe_vlan_field_union vlan;
+	__le16 vif;
+};
+
+struct pstorm_fcoe_eth_context_section {
+	u8 remote_addr_3;
+	u8 remote_addr_2;
+	u8 remote_addr_1;
+	u8 remote_addr_0;
+	u8 local_addr_1;
+	u8 local_addr_0;
+	u8 remote_addr_5;
+	u8 remote_addr_4;
+	u8 local_addr_5;
+	u8 local_addr_4;
+	u8 local_addr_3;
+	u8 local_addr_2;
+	union fcoe_vlan_vif_field_union vif_outer_vlan;
+	__le16 vif_outer_eth_type;
+	union fcoe_vlan_vif_field_union inner_vlan;
+	__le16 inner_eth_type;
+};
+
+struct pstorm_fcoe_conn_st_ctx {
+	u8 func_mode;
+	u8 cos;
+	u8 conf_version;
+	u8 rsrv;
+	__le16 stat_ram_addr;
+	__le16 mss;
+	struct regpair abts_cleanup_addr;
+	struct pstorm_fcoe_eth_context_section eth;
+	u8 sid_2;
+	u8 sid_1;
+	u8 sid_0;
+	u8 flags;
+#define PSTORM_FCOE_CONN_ST_CTX_VNTAG_VLAN_MASK          0x1
+#define PSTORM_FCOE_CONN_ST_CTX_VNTAG_VLAN_SHIFT         0
+#define PSTORM_FCOE_CONN_ST_CTX_SUPPORT_REC_RR_TOV_MASK  0x1
+#define PSTORM_FCOE_CONN_ST_CTX_SUPPORT_REC_RR_TOV_SHIFT 1
+#define PSTORM_FCOE_CONN_ST_CTX_INNER_VLAN_FLAG_MASK     0x1
+#define PSTORM_FCOE_CONN_ST_CTX_INNER_VLAN_FLAG_SHIFT    2
+#define PSTORM_FCOE_CONN_ST_CTX_OUTER_VLAN_FLAG_MASK     0x1
+#define PSTORM_FCOE_CONN_ST_CTX_OUTER_VLAN_FLAG_SHIFT    3
+#define PSTORM_FCOE_CONN_ST_CTX_RESERVED_MASK            0xF
+#define PSTORM_FCOE_CONN_ST_CTX_RESERVED_SHIFT           4
+	u8 did_2;
+	u8 did_1;
+	u8 did_0;
+	u8 src_mac_index;
+	__le16 rec_rr_tov_val;
+	u8 q_relative_offset;
+	u8 reserved1;
+};
+
+struct xstorm_fcoe_conn_st_ctx {
+	u8 func_mode;
+	u8 src_mac_index;
+	u8 conf_version;
+	u8 cached_wqes_avail;
+	__le16 stat_ram_addr;
+	u8 flags;
+#define XSTORM_FCOE_CONN_ST_CTX_SQ_DEFERRED_MASK             0x1
+#define XSTORM_FCOE_CONN_ST_CTX_SQ_DEFERRED_SHIFT            0
+#define XSTORM_FCOE_CONN_ST_CTX_INNER_VLAN_FLAG_MASK         0x1
+#define XSTORM_FCOE_CONN_ST_CTX_INNER_VLAN_FLAG_SHIFT        1
+#define XSTORM_FCOE_CONN_ST_CTX_INNER_VLAN_FLAG_ORIG_MASK    0x1
+#define XSTORM_FCOE_CONN_ST_CTX_INNER_VLAN_FLAG_ORIG_SHIFT   2
+#define XSTORM_FCOE_CONN_ST_CTX_LAST_QUEUE_HANDLED_MASK      0x3
+#define XSTORM_FCOE_CONN_ST_CTX_LAST_QUEUE_HANDLED_SHIFT     3
+#define XSTORM_FCOE_CONN_ST_CTX_RSRV_MASK                    0x7
+#define XSTORM_FCOE_CONN_ST_CTX_RSRV_SHIFT                   5
+	u8 cached_wqes_offset;
+	u8 reserved2;
+	u8 eth_hdr_size;
+	u8 seq_id;
+	u8 max_conc_seqs;
+	__le16 num_pages_in_pbl;
+	__le16 reserved;
+	struct regpair sq_pbl_addr;
+	struct regpair sq_curr_page_addr;
+	struct regpair sq_next_page_addr;
+	struct regpair xferq_pbl_addr;
+	struct regpair xferq_curr_page_addr;
+	struct regpair xferq_next_page_addr;
+	struct regpair respq_pbl_addr;
+	struct regpair respq_curr_page_addr;
+	struct regpair respq_next_page_addr;
+	__le16 mtu;
+	__le16 tx_max_fc_pay_len;
+	__le16 max_fc_payload_len;
+	__le16 min_frame_size;
+	__le16 sq_pbl_next_index;
+	__le16 respq_pbl_next_index;
+	u8 fcp_cmd_byte_credit;
+	u8 fcp_rsp_byte_credit;
+	__le16 protection_info;
+#define XSTORM_FCOE_CONN_ST_CTX_PROTECTION_PERF_MASK         0x1
+#define XSTORM_FCOE_CONN_ST_CTX_PROTECTION_PERF_SHIFT        0
+#define XSTORM_FCOE_CONN_ST_CTX_SUPPORT_PROTECTION_MASK      0x1
+#define XSTORM_FCOE_CONN_ST_CTX_SUPPORT_PROTECTION_SHIFT     1
+#define XSTORM_FCOE_CONN_ST_CTX_VALID_MASK                   0x1
+#define XSTORM_FCOE_CONN_ST_CTX_VALID_SHIFT                  2
+#define XSTORM_FCOE_CONN_ST_CTX_FRAME_PROT_ALIGNED_MASK      0x1
+#define XSTORM_FCOE_CONN_ST_CTX_FRAME_PROT_ALIGNED_SHIFT     3
+#define XSTORM_FCOE_CONN_ST_CTX_RESERVED3_MASK               0xF
+#define XSTORM_FCOE_CONN_ST_CTX_RESERVED3_SHIFT              4
+#define XSTORM_FCOE_CONN_ST_CTX_DST_PROTECTION_PER_MSS_MASK  0xFF
+#define XSTORM_FCOE_CONN_ST_CTX_DST_PROTECTION_PER_MSS_SHIFT 8
+	__le16 xferq_pbl_next_index;
+	__le16 page_size;
+	u8 mid_seq;
+	u8 fcp_xfer_byte_credit;
+	u8 reserved1[2];
+	struct fcoe_wqe cached_wqes[16];
+};
+
+struct xstorm_fcoe_conn_ag_ctx {
+	u8 reserved0;
+	u8 fcoe_state;
+	u8 flags0;
+#define XSTORM_FCOE_CONN_AG_CTX_EXIST_IN_QM0_MASK       0x1
+#define XSTORM_FCOE_CONN_AG_CTX_EXIST_IN_QM0_SHIFT      0
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED1_MASK          0x1
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED1_SHIFT         1
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED2_MASK          0x1
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED2_SHIFT         2
+#define XSTORM_FCOE_CONN_AG_CTX_EXIST_IN_QM3_MASK       0x1
+#define XSTORM_FCOE_CONN_AG_CTX_EXIST_IN_QM3_SHIFT      3
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED3_MASK          0x1
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED3_SHIFT         4
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED4_MASK          0x1
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED4_SHIFT         5
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED5_MASK          0x1
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED5_SHIFT         6
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED6_MASK          0x1
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED6_SHIFT         7
+	u8 flags1;
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED7_MASK          0x1
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED7_SHIFT         0
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED8_MASK          0x1
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED8_SHIFT         1
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED9_MASK          0x1
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED9_SHIFT         2
+#define XSTORM_FCOE_CONN_AG_CTX_BIT11_MASK              0x1
+#define XSTORM_FCOE_CONN_AG_CTX_BIT11_SHIFT             3
+#define XSTORM_FCOE_CONN_AG_CTX_BIT12_MASK              0x1
+#define XSTORM_FCOE_CONN_AG_CTX_BIT12_SHIFT             4
+#define XSTORM_FCOE_CONN_AG_CTX_BIT13_MASK              0x1
+#define XSTORM_FCOE_CONN_AG_CTX_BIT13_SHIFT             5
+#define XSTORM_FCOE_CONN_AG_CTX_BIT14_MASK              0x1
+#define XSTORM_FCOE_CONN_AG_CTX_BIT14_SHIFT             6
+#define XSTORM_FCOE_CONN_AG_CTX_BIT15_MASK              0x1
+#define XSTORM_FCOE_CONN_AG_CTX_BIT15_SHIFT             7
+	u8 flags2;
+#define XSTORM_FCOE_CONN_AG_CTX_CF0_MASK                0x3
+#define XSTORM_FCOE_CONN_AG_CTX_CF0_SHIFT               0
+#define XSTORM_FCOE_CONN_AG_CTX_CF1_MASK                0x3
+#define XSTORM_FCOE_CONN_AG_CTX_CF1_SHIFT               2
+#define XSTORM_FCOE_CONN_AG_CTX_CF2_MASK                0x3
+#define XSTORM_FCOE_CONN_AG_CTX_CF2_SHIFT               4
+#define XSTORM_FCOE_CONN_AG_CTX_CF3_MASK                0x3
+#define XSTORM_FCOE_CONN_AG_CTX_CF3_SHIFT               6
+	u8 flags3;
+#define XSTORM_FCOE_CONN_AG_CTX_CF4_MASK                0x3
+#define XSTORM_FCOE_CONN_AG_CTX_CF4_SHIFT               0
+#define XSTORM_FCOE_CONN_AG_CTX_CF5_MASK                0x3
+#define XSTORM_FCOE_CONN_AG_CTX_CF5_SHIFT               2
+#define XSTORM_FCOE_CONN_AG_CTX_CF6_MASK                0x3
+#define XSTORM_FCOE_CONN_AG_CTX_CF6_SHIFT               4
+#define XSTORM_FCOE_CONN_AG_CTX_CF7_MASK                0x3
+#define XSTORM_FCOE_CONN_AG_CTX_CF7_SHIFT               6
+	u8 flags4;
+#define XSTORM_FCOE_CONN_AG_CTX_CF8_MASK                0x3
+#define XSTORM_FCOE_CONN_AG_CTX_CF8_SHIFT               0
+#define XSTORM_FCOE_CONN_AG_CTX_CF9_MASK                0x3
+#define XSTORM_FCOE_CONN_AG_CTX_CF9_SHIFT               2
+#define XSTORM_FCOE_CONN_AG_CTX_CF10_MASK               0x3
+#define XSTORM_FCOE_CONN_AG_CTX_CF10_SHIFT              4
+#define XSTORM_FCOE_CONN_AG_CTX_CF11_MASK               0x3
+#define XSTORM_FCOE_CONN_AG_CTX_CF11_SHIFT              6
+	u8 flags5;
+#define XSTORM_FCOE_CONN_AG_CTX_CF12_MASK               0x3
+#define XSTORM_FCOE_CONN_AG_CTX_CF12_SHIFT              0
+#define XSTORM_FCOE_CONN_AG_CTX_CF13_MASK               0x3
+#define XSTORM_FCOE_CONN_AG_CTX_CF13_SHIFT              2
+#define XSTORM_FCOE_CONN_AG_CTX_CF14_MASK               0x3
+#define XSTORM_FCOE_CONN_AG_CTX_CF14_SHIFT              4
+#define XSTORM_FCOE_CONN_AG_CTX_CF15_MASK               0x3
+#define XSTORM_FCOE_CONN_AG_CTX_CF15_SHIFT              6
+	u8 flags6;
+#define XSTORM_FCOE_CONN_AG_CTX_CF16_MASK               0x3
+#define XSTORM_FCOE_CONN_AG_CTX_CF16_SHIFT              0
+#define XSTORM_FCOE_CONN_AG_CTX_CF17_MASK               0x3
+#define XSTORM_FCOE_CONN_AG_CTX_CF17_SHIFT              2
+#define XSTORM_FCOE_CONN_AG_CTX_CF18_MASK               0x3
+#define XSTORM_FCOE_CONN_AG_CTX_CF18_SHIFT              4
+#define XSTORM_FCOE_CONN_AG_CTX_DQ_CF_MASK              0x3
+#define XSTORM_FCOE_CONN_AG_CTX_DQ_CF_SHIFT             6
+	u8 flags7;
+#define XSTORM_FCOE_CONN_AG_CTX_FLUSH_Q0_MASK           0x3
+#define XSTORM_FCOE_CONN_AG_CTX_FLUSH_Q0_SHIFT          0
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED10_MASK         0x3
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED10_SHIFT        2
+#define XSTORM_FCOE_CONN_AG_CTX_SLOW_PATH_MASK          0x3
+#define XSTORM_FCOE_CONN_AG_CTX_SLOW_PATH_SHIFT         4
+#define XSTORM_FCOE_CONN_AG_CTX_CF0EN_MASK              0x1
+#define XSTORM_FCOE_CONN_AG_CTX_CF0EN_SHIFT             6
+#define XSTORM_FCOE_CONN_AG_CTX_CF1EN_MASK              0x1
+#define XSTORM_FCOE_CONN_AG_CTX_CF1EN_SHIFT             7
+	u8 flags8;
+#define XSTORM_FCOE_CONN_AG_CTX_CF2EN_MASK              0x1
+#define XSTORM_FCOE_CONN_AG_CTX_CF2EN_SHIFT             0
+#define XSTORM_FCOE_CONN_AG_CTX_CF3EN_MASK              0x1
+#define XSTORM_FCOE_CONN_AG_CTX_CF3EN_SHIFT             1
+#define XSTORM_FCOE_CONN_AG_CTX_CF4EN_MASK              0x1
+#define XSTORM_FCOE_CONN_AG_CTX_CF4EN_SHIFT             2
+#define XSTORM_FCOE_CONN_AG_CTX_CF5EN_MASK              0x1
+#define XSTORM_FCOE_CONN_AG_CTX_CF5EN_SHIFT             3
+#define XSTORM_FCOE_CONN_AG_CTX_CF6EN_MASK              0x1
+#define XSTORM_FCOE_CONN_AG_CTX_CF6EN_SHIFT             4
+#define XSTORM_FCOE_CONN_AG_CTX_CF7EN_MASK              0x1
+#define XSTORM_FCOE_CONN_AG_CTX_CF7EN_SHIFT             5
+#define XSTORM_FCOE_CONN_AG_CTX_CF8EN_MASK              0x1
+#define XSTORM_FCOE_CONN_AG_CTX_CF8EN_SHIFT             6
+#define XSTORM_FCOE_CONN_AG_CTX_CF9EN_MASK              0x1
+#define XSTORM_FCOE_CONN_AG_CTX_CF9EN_SHIFT             7
+	u8 flags9;
+#define XSTORM_FCOE_CONN_AG_CTX_CF10EN_MASK             0x1
+#define XSTORM_FCOE_CONN_AG_CTX_CF10EN_SHIFT            0
+#define XSTORM_FCOE_CONN_AG_CTX_CF11EN_MASK             0x1
+#define XSTORM_FCOE_CONN_AG_CTX_CF11EN_SHIFT            1
+#define XSTORM_FCOE_CONN_AG_CTX_CF12EN_MASK             0x1
+#define XSTORM_FCOE_CONN_AG_CTX_CF12EN_SHIFT            2
+#define XSTORM_FCOE_CONN_AG_CTX_CF13EN_MASK             0x1
+#define XSTORM_FCOE_CONN_AG_CTX_CF13EN_SHIFT            3
+#define XSTORM_FCOE_CONN_AG_CTX_CF14EN_MASK             0x1
+#define XSTORM_FCOE_CONN_AG_CTX_CF14EN_SHIFT            4
+#define XSTORM_FCOE_CONN_AG_CTX_CF15EN_MASK             0x1
+#define XSTORM_FCOE_CONN_AG_CTX_CF15EN_SHIFT            5
+#define XSTORM_FCOE_CONN_AG_CTX_CF16EN_MASK             0x1
+#define XSTORM_FCOE_CONN_AG_CTX_CF16EN_SHIFT            6
+#define XSTORM_FCOE_CONN_AG_CTX_CF17EN_MASK             0x1
+#define XSTORM_FCOE_CONN_AG_CTX_CF17EN_SHIFT            7
+	u8 flags10;
+#define XSTORM_FCOE_CONN_AG_CTX_CF18EN_MASK             0x1
+#define XSTORM_FCOE_CONN_AG_CTX_CF18EN_SHIFT            0
+#define XSTORM_FCOE_CONN_AG_CTX_DQ_CF_EN_MASK           0x1
+#define XSTORM_FCOE_CONN_AG_CTX_DQ_CF_EN_SHIFT          1
+#define XSTORM_FCOE_CONN_AG_CTX_FLUSH_Q0_EN_MASK        0x1
+#define XSTORM_FCOE_CONN_AG_CTX_FLUSH_Q0_EN_SHIFT       2
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED11_MASK         0x1
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED11_SHIFT        3
+#define XSTORM_FCOE_CONN_AG_CTX_SLOW_PATH_EN_MASK       0x1
+#define XSTORM_FCOE_CONN_AG_CTX_SLOW_PATH_EN_SHIFT      4
+#define XSTORM_FCOE_CONN_AG_CTX_CF23EN_MASK             0x1
+#define XSTORM_FCOE_CONN_AG_CTX_CF23EN_SHIFT            5
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED12_MASK         0x1
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED12_SHIFT        6
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED13_MASK         0x1
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED13_SHIFT        7
+	u8 flags11;
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED14_MASK         0x1
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED14_SHIFT        0
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED15_MASK         0x1
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED15_SHIFT        1
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED16_MASK         0x1
+#define XSTORM_FCOE_CONN_AG_CTX_RESERVED16_SHIFT        2
+#define XSTORM_FCOE_CONN_AG_CTX_RULE5EN_MASK            0x1
+#define XSTORM_FCOE_CONN_AG_CTX_RULE5EN_SHIFT           3
+#define XSTORM_FCOE_CONN_AG_CTX_RULE6EN_MASK            0x1
+#define XSTORM_FCOE_CONN_AG_CTX_RULE6EN_SHIFT           4
+#define XSTORM_FCOE_CONN_AG_CTX_RULE7EN_MASK            0x1
+#define XSTORM_FCOE_CONN_AG_CTX_RULE7EN_SHIFT           5
+#define XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED1_MASK       0x1
+#define XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED1_SHIFT      6
+#define XSTORM_FCOE_CONN_AG_CTX_XFERQ_DECISION_EN_MASK  0x1
+#define XSTORM_FCOE_CONN_AG_CTX_XFERQ_DECISION_EN_SHIFT 7
+	u8 flags12;
+#define XSTORM_FCOE_CONN_AG_CTX_SQ_DECISION_EN_MASK     0x1
+#define XSTORM_FCOE_CONN_AG_CTX_SQ_DECISION_EN_SHIFT    0
+#define XSTORM_FCOE_CONN_AG_CTX_RULE11EN_MASK           0x1
+#define XSTORM_FCOE_CONN_AG_CTX_RULE11EN_SHIFT          1
+#define XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED2_MASK       0x1
+#define XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED2_SHIFT      2
+#define XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED3_MASK       0x1
+#define XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED3_SHIFT      3
+#define XSTORM_FCOE_CONN_AG_CTX_RULE14EN_MASK           0x1
+#define XSTORM_FCOE_CONN_AG_CTX_RULE14EN_SHIFT          4
+#define XSTORM_FCOE_CONN_AG_CTX_RULE15EN_MASK           0x1
+#define XSTORM_FCOE_CONN_AG_CTX_RULE15EN_SHIFT          5
+#define XSTORM_FCOE_CONN_AG_CTX_RULE16EN_MASK           0x1
+#define XSTORM_FCOE_CONN_AG_CTX_RULE16EN_SHIFT          6
+#define XSTORM_FCOE_CONN_AG_CTX_RULE17EN_MASK           0x1
+#define XSTORM_FCOE_CONN_AG_CTX_RULE17EN_SHIFT          7
+	u8 flags13;
+#define XSTORM_FCOE_CONN_AG_CTX_RESPQ_DECISION_EN_MASK  0x1
+#define XSTORM_FCOE_CONN_AG_CTX_RESPQ_DECISION_EN_SHIFT 0
+#define XSTORM_FCOE_CONN_AG_CTX_RULE19EN_MASK           0x1
+#define XSTORM_FCOE_CONN_AG_CTX_RULE19EN_SHIFT          1
+#define XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED4_MASK       0x1
+#define XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED4_SHIFT      2
+#define XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED5_MASK       0x1
+#define XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED5_SHIFT      3
+#define XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED6_MASK       0x1
+#define XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED6_SHIFT      4
+#define XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED7_MASK       0x1
+#define XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED7_SHIFT      5
+#define XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED8_MASK       0x1
+#define XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED8_SHIFT      6
+#define XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED9_MASK       0x1
+#define XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED9_SHIFT      7
+	u8 flags14;
+#define XSTORM_FCOE_CONN_AG_CTX_BIT16_MASK              0x1
+#define XSTORM_FCOE_CONN_AG_CTX_BIT16_SHIFT             0
+#define XSTORM_FCOE_CONN_AG_CTX_BIT17_MASK              0x1
+#define XSTORM_FCOE_CONN_AG_CTX_BIT17_SHIFT             1
+#define XSTORM_FCOE_CONN_AG_CTX_BIT18_MASK              0x1
+#define XSTORM_FCOE_CONN_AG_CTX_BIT18_SHIFT             2
+#define XSTORM_FCOE_CONN_AG_CTX_BIT19_MASK              0x1
+#define XSTORM_FCOE_CONN_AG_CTX_BIT19_SHIFT             3
+#define XSTORM_FCOE_CONN_AG_CTX_BIT20_MASK              0x1
+#define XSTORM_FCOE_CONN_AG_CTX_BIT20_SHIFT             4
+#define XSTORM_FCOE_CONN_AG_CTX_BIT21_MASK              0x1
+#define XSTORM_FCOE_CONN_AG_CTX_BIT21_SHIFT             5
+#define XSTORM_FCOE_CONN_AG_CTX_CF23_MASK               0x3
+#define XSTORM_FCOE_CONN_AG_CTX_CF23_SHIFT              6
+	u8 byte2;
+	__le16 physical_q0;
+	__le16 word1;
+	__le16 word2;
+	__le16 sq_cons;
+	__le16 sq_prod;
+	__le16 xferq_prod;
+	__le16 xferq_cons;
+	u8 byte3;
+	u8 byte4;
+	u8 byte5;
+	u8 byte6;
+	__le32 remain_io;
+	__le32 reg1;
+	__le32 reg2;
+	__le32 reg3;
+	__le32 reg4;
+	__le32 reg5;
+	__le32 reg6;
+	__le16 respq_prod;
+	__le16 respq_cons;
+	__le16 word9;
+	__le16 word10;
+	__le32 reg7;
+	__le32 reg8;
+};
+
+struct ustorm_fcoe_conn_st_ctx {
+	struct regpair respq_pbl_addr;
+	__le16 num_pages_in_pbl;
+	u8 ptu_log_page_size;
+	u8 log_page_size;
+	__le16 respq_prod;
+	u8 reserved[2];
+};
+
+struct tstorm_fcoe_conn_ag_ctx {
+	u8 reserved0;
+	u8 fcoe_state;
+	u8 flags0;
+#define TSTORM_FCOE_CONN_AG_CTX_EXIST_IN_QM0_MASK          0x1
+#define TSTORM_FCOE_CONN_AG_CTX_EXIST_IN_QM0_SHIFT         0
+#define TSTORM_FCOE_CONN_AG_CTX_BIT1_MASK                  0x1
+#define TSTORM_FCOE_CONN_AG_CTX_BIT1_SHIFT                 1
+#define TSTORM_FCOE_CONN_AG_CTX_BIT2_MASK                  0x1
+#define TSTORM_FCOE_CONN_AG_CTX_BIT2_SHIFT                 2
+#define TSTORM_FCOE_CONN_AG_CTX_BIT3_MASK                  0x1
+#define TSTORM_FCOE_CONN_AG_CTX_BIT3_SHIFT                 3
+#define TSTORM_FCOE_CONN_AG_CTX_BIT4_MASK                  0x1
+#define TSTORM_FCOE_CONN_AG_CTX_BIT4_SHIFT                 4
+#define TSTORM_FCOE_CONN_AG_CTX_BIT5_MASK                  0x1
+#define TSTORM_FCOE_CONN_AG_CTX_BIT5_SHIFT                 5
+#define TSTORM_FCOE_CONN_AG_CTX_DUMMY_TIMER_CF_MASK        0x3
+#define TSTORM_FCOE_CONN_AG_CTX_DUMMY_TIMER_CF_SHIFT       6
+	u8 flags1;
+#define TSTORM_FCOE_CONN_AG_CTX_FLUSH_Q0_CF_MASK           0x3
+#define TSTORM_FCOE_CONN_AG_CTX_FLUSH_Q0_CF_SHIFT          0
+#define TSTORM_FCOE_CONN_AG_CTX_CF2_MASK                   0x3
+#define TSTORM_FCOE_CONN_AG_CTX_CF2_SHIFT                  2
+#define TSTORM_FCOE_CONN_AG_CTX_TIMER_STOP_ALL_CF_MASK     0x3
+#define TSTORM_FCOE_CONN_AG_CTX_TIMER_STOP_ALL_CF_SHIFT    4
+#define TSTORM_FCOE_CONN_AG_CTX_CF4_MASK                   0x3
+#define TSTORM_FCOE_CONN_AG_CTX_CF4_SHIFT                  6
+	u8 flags2;
+#define TSTORM_FCOE_CONN_AG_CTX_CF5_MASK                   0x3
+#define TSTORM_FCOE_CONN_AG_CTX_CF5_SHIFT                  0
+#define TSTORM_FCOE_CONN_AG_CTX_CF6_MASK                   0x3
+#define TSTORM_FCOE_CONN_AG_CTX_CF6_SHIFT                  2
+#define TSTORM_FCOE_CONN_AG_CTX_CF7_MASK                   0x3
+#define TSTORM_FCOE_CONN_AG_CTX_CF7_SHIFT                  4
+#define TSTORM_FCOE_CONN_AG_CTX_CF8_MASK                   0x3
+#define TSTORM_FCOE_CONN_AG_CTX_CF8_SHIFT                  6
+	u8 flags3;
+#define TSTORM_FCOE_CONN_AG_CTX_CF9_MASK                   0x3
+#define TSTORM_FCOE_CONN_AG_CTX_CF9_SHIFT                  0
+#define TSTORM_FCOE_CONN_AG_CTX_CF10_MASK                  0x3
+#define TSTORM_FCOE_CONN_AG_CTX_CF10_SHIFT                 2
+#define TSTORM_FCOE_CONN_AG_CTX_DUMMY_TIMER_CF_EN_MASK     0x1
+#define TSTORM_FCOE_CONN_AG_CTX_DUMMY_TIMER_CF_EN_SHIFT    4
+#define TSTORM_FCOE_CONN_AG_CTX_FLUSH_Q0_CF_EN_MASK        0x1
+#define TSTORM_FCOE_CONN_AG_CTX_FLUSH_Q0_CF_EN_SHIFT       5
+#define TSTORM_FCOE_CONN_AG_CTX_CF2EN_MASK                 0x1
+#define TSTORM_FCOE_CONN_AG_CTX_CF2EN_SHIFT                6
+#define TSTORM_FCOE_CONN_AG_CTX_TIMER_STOP_ALL_CF_EN_MASK  0x1
+#define TSTORM_FCOE_CONN_AG_CTX_TIMER_STOP_ALL_CF_EN_SHIFT 7
+	u8 flags4;
+#define TSTORM_FCOE_CONN_AG_CTX_CF4EN_MASK                 0x1
+#define TSTORM_FCOE_CONN_AG_CTX_CF4EN_SHIFT                0
+#define TSTORM_FCOE_CONN_AG_CTX_CF5EN_MASK                 0x1
+#define TSTORM_FCOE_CONN_AG_CTX_CF5EN_SHIFT                1
+#define TSTORM_FCOE_CONN_AG_CTX_CF6EN_MASK                 0x1
+#define TSTORM_FCOE_CONN_AG_CTX_CF6EN_SHIFT                2
+#define TSTORM_FCOE_CONN_AG_CTX_CF7EN_MASK                 0x1
+#define TSTORM_FCOE_CONN_AG_CTX_CF7EN_SHIFT                3
+#define TSTORM_FCOE_CONN_AG_CTX_CF8EN_MASK                 0x1
+#define TSTORM_FCOE_CONN_AG_CTX_CF8EN_SHIFT                4
+#define TSTORM_FCOE_CONN_AG_CTX_CF9EN_MASK                 0x1
+#define TSTORM_FCOE_CONN_AG_CTX_CF9EN_SHIFT                5
+#define TSTORM_FCOE_CONN_AG_CTX_CF10EN_MASK                0x1
+#define TSTORM_FCOE_CONN_AG_CTX_CF10EN_SHIFT               6
+#define TSTORM_FCOE_CONN_AG_CTX_RULE0EN_MASK               0x1
+#define TSTORM_FCOE_CONN_AG_CTX_RULE0EN_SHIFT              7
+	u8 flags5;
+#define TSTORM_FCOE_CONN_AG_CTX_RULE1EN_MASK               0x1
+#define TSTORM_FCOE_CONN_AG_CTX_RULE1EN_SHIFT              0
+#define TSTORM_FCOE_CONN_AG_CTX_RULE2EN_MASK               0x1
+#define TSTORM_FCOE_CONN_AG_CTX_RULE2EN_SHIFT              1
+#define TSTORM_FCOE_CONN_AG_CTX_RULE3EN_MASK               0x1
+#define TSTORM_FCOE_CONN_AG_CTX_RULE3EN_SHIFT              2
+#define TSTORM_FCOE_CONN_AG_CTX_RULE4EN_MASK               0x1
+#define TSTORM_FCOE_CONN_AG_CTX_RULE4EN_SHIFT              3
+#define TSTORM_FCOE_CONN_AG_CTX_RULE5EN_MASK               0x1
+#define TSTORM_FCOE_CONN_AG_CTX_RULE5EN_SHIFT              4
+#define TSTORM_FCOE_CONN_AG_CTX_RULE6EN_MASK               0x1
+#define TSTORM_FCOE_CONN_AG_CTX_RULE6EN_SHIFT              5
+#define TSTORM_FCOE_CONN_AG_CTX_RULE7EN_MASK               0x1
+#define TSTORM_FCOE_CONN_AG_CTX_RULE7EN_SHIFT              6
+#define TSTORM_FCOE_CONN_AG_CTX_RULE8EN_MASK               0x1
+#define TSTORM_FCOE_CONN_AG_CTX_RULE8EN_SHIFT              7
+	__le32 reg0;
+	__le32 reg1;
+};
+
+struct ustorm_fcoe_conn_ag_ctx {
+	u8 byte0;
+	u8 byte1;
+	u8 flags0;
+#define USTORM_FCOE_CONN_AG_CTX_BIT0_MASK     0x1
+#define USTORM_FCOE_CONN_AG_CTX_BIT0_SHIFT    0
+#define USTORM_FCOE_CONN_AG_CTX_BIT1_MASK     0x1
+#define USTORM_FCOE_CONN_AG_CTX_BIT1_SHIFT    1
+#define USTORM_FCOE_CONN_AG_CTX_CF0_MASK      0x3
+#define USTORM_FCOE_CONN_AG_CTX_CF0_SHIFT     2
+#define USTORM_FCOE_CONN_AG_CTX_CF1_MASK      0x3
+#define USTORM_FCOE_CONN_AG_CTX_CF1_SHIFT     4
+#define USTORM_FCOE_CONN_AG_CTX_CF2_MASK      0x3
+#define USTORM_FCOE_CONN_AG_CTX_CF2_SHIFT     6
+	u8 flags1;
+#define USTORM_FCOE_CONN_AG_CTX_CF3_MASK      0x3
+#define USTORM_FCOE_CONN_AG_CTX_CF3_SHIFT     0
+#define USTORM_FCOE_CONN_AG_CTX_CF4_MASK      0x3
+#define USTORM_FCOE_CONN_AG_CTX_CF4_SHIFT     2
+#define USTORM_FCOE_CONN_AG_CTX_CF5_MASK      0x3
+#define USTORM_FCOE_CONN_AG_CTX_CF5_SHIFT     4
+#define USTORM_FCOE_CONN_AG_CTX_CF6_MASK      0x3
+#define USTORM_FCOE_CONN_AG_CTX_CF6_SHIFT     6
+	u8 flags2;
+#define USTORM_FCOE_CONN_AG_CTX_CF0EN_MASK    0x1
+#define USTORM_FCOE_CONN_AG_CTX_CF0EN_SHIFT   0
+#define USTORM_FCOE_CONN_AG_CTX_CF1EN_MASK    0x1
+#define USTORM_FCOE_CONN_AG_CTX_CF1EN_SHIFT   1
+#define USTORM_FCOE_CONN_AG_CTX_CF2EN_MASK    0x1
+#define USTORM_FCOE_CONN_AG_CTX_CF2EN_SHIFT   2
+#define USTORM_FCOE_CONN_AG_CTX_CF3EN_MASK    0x1
+#define USTORM_FCOE_CONN_AG_CTX_CF3EN_SHIFT   3
+#define USTORM_FCOE_CONN_AG_CTX_CF4EN_MASK    0x1
+#define USTORM_FCOE_CONN_AG_CTX_CF4EN_SHIFT   4
+#define USTORM_FCOE_CONN_AG_CTX_CF5EN_MASK    0x1
+#define USTORM_FCOE_CONN_AG_CTX_CF5EN_SHIFT   5
+#define USTORM_FCOE_CONN_AG_CTX_CF6EN_MASK    0x1
+#define USTORM_FCOE_CONN_AG_CTX_CF6EN_SHIFT   6
+#define USTORM_FCOE_CONN_AG_CTX_RULE0EN_MASK  0x1
+#define USTORM_FCOE_CONN_AG_CTX_RULE0EN_SHIFT 7
+	u8 flags3;
+#define USTORM_FCOE_CONN_AG_CTX_RULE1EN_MASK  0x1
+#define USTORM_FCOE_CONN_AG_CTX_RULE1EN_SHIFT 0
+#define USTORM_FCOE_CONN_AG_CTX_RULE2EN_MASK  0x1
+#define USTORM_FCOE_CONN_AG_CTX_RULE2EN_SHIFT 1
+#define USTORM_FCOE_CONN_AG_CTX_RULE3EN_MASK  0x1
+#define USTORM_FCOE_CONN_AG_CTX_RULE3EN_SHIFT 2
+#define USTORM_FCOE_CONN_AG_CTX_RULE4EN_MASK  0x1
+#define USTORM_FCOE_CONN_AG_CTX_RULE4EN_SHIFT 3
+#define USTORM_FCOE_CONN_AG_CTX_RULE5EN_MASK  0x1
+#define USTORM_FCOE_CONN_AG_CTX_RULE5EN_SHIFT 4
+#define USTORM_FCOE_CONN_AG_CTX_RULE6EN_MASK  0x1
+#define USTORM_FCOE_CONN_AG_CTX_RULE6EN_SHIFT 5
+#define USTORM_FCOE_CONN_AG_CTX_RULE7EN_MASK  0x1
+#define USTORM_FCOE_CONN_AG_CTX_RULE7EN_SHIFT 6
+#define USTORM_FCOE_CONN_AG_CTX_RULE8EN_MASK  0x1
+#define USTORM_FCOE_CONN_AG_CTX_RULE8EN_SHIFT 7
+	u8 byte2;
+	u8 byte3;
+	__le16 word0;
+	__le16 word1;
+	__le32 reg0;
+	__le32 reg1;
+	__le32 reg2;
+	__le32 reg3;
+	__le16 word2;
+	__le16 word3;
+};
+
+struct tstorm_fcoe_conn_st_ctx {
+	__le16 stat_ram_addr;
+	__le16 rx_max_fc_payload_len;
+	__le16 e_d_tov_val;
+	u8 flags;
+#define TSTORM_FCOE_CONN_ST_CTX_INC_SEQ_CNT_MASK   0x1
+#define TSTORM_FCOE_CONN_ST_CTX_INC_SEQ_CNT_SHIFT  0
+#define TSTORM_FCOE_CONN_ST_CTX_SUPPORT_CONF_MASK  0x1
+#define TSTORM_FCOE_CONN_ST_CTX_SUPPORT_CONF_SHIFT 1
+#define TSTORM_FCOE_CONN_ST_CTX_DEF_Q_IDX_MASK     0x3F
+#define TSTORM_FCOE_CONN_ST_CTX_DEF_Q_IDX_SHIFT    2
+	u8 timers_cleanup_invocation_cnt;
+	__le32 reserved1[2];
+	__le32 dst_mac_address_bytes0to3;
+	__le16 dst_mac_address_bytes4to5;
+	__le16 ramrod_echo;
+	u8 flags1;
+#define TSTORM_FCOE_CONN_ST_CTX_MODE_MASK          0x3
+#define TSTORM_FCOE_CONN_ST_CTX_MODE_SHIFT         0
+#define TSTORM_FCOE_CONN_ST_CTX_RESERVED_MASK      0x3F
+#define TSTORM_FCOE_CONN_ST_CTX_RESERVED_SHIFT     2
+	u8 q_relative_offset;
+	u8 bdq_resource_id;
+	u8 reserved0[5];
+};
+
+struct mstorm_fcoe_conn_ag_ctx {
+	u8 byte0;
+	u8 byte1;
+	u8 flags0;
+#define MSTORM_FCOE_CONN_AG_CTX_BIT0_MASK     0x1
+#define MSTORM_FCOE_CONN_AG_CTX_BIT0_SHIFT    0
+#define MSTORM_FCOE_CONN_AG_CTX_BIT1_MASK     0x1
+#define MSTORM_FCOE_CONN_AG_CTX_BIT1_SHIFT    1
+#define MSTORM_FCOE_CONN_AG_CTX_CF0_MASK      0x3
+#define MSTORM_FCOE_CONN_AG_CTX_CF0_SHIFT     2
+#define MSTORM_FCOE_CONN_AG_CTX_CF1_MASK      0x3
+#define MSTORM_FCOE_CONN_AG_CTX_CF1_SHIFT     4
+#define MSTORM_FCOE_CONN_AG_CTX_CF2_MASK      0x3
+#define MSTORM_FCOE_CONN_AG_CTX_CF2_SHIFT     6
+	u8 flags1;
+#define MSTORM_FCOE_CONN_AG_CTX_CF0EN_MASK    0x1
+#define MSTORM_FCOE_CONN_AG_CTX_CF0EN_SHIFT   0
+#define MSTORM_FCOE_CONN_AG_CTX_CF1EN_MASK    0x1
+#define MSTORM_FCOE_CONN_AG_CTX_CF1EN_SHIFT   1
+#define MSTORM_FCOE_CONN_AG_CTX_CF2EN_MASK    0x1
+#define MSTORM_FCOE_CONN_AG_CTX_CF2EN_SHIFT   2
+#define MSTORM_FCOE_CONN_AG_CTX_RULE0EN_MASK  0x1
+#define MSTORM_FCOE_CONN_AG_CTX_RULE0EN_SHIFT 3
+#define MSTORM_FCOE_CONN_AG_CTX_RULE1EN_MASK  0x1
+#define MSTORM_FCOE_CONN_AG_CTX_RULE1EN_SHIFT 4
+#define MSTORM_FCOE_CONN_AG_CTX_RULE2EN_MASK  0x1
+#define MSTORM_FCOE_CONN_AG_CTX_RULE2EN_SHIFT 5
+#define MSTORM_FCOE_CONN_AG_CTX_RULE3EN_MASK  0x1
+#define MSTORM_FCOE_CONN_AG_CTX_RULE3EN_SHIFT 6
+#define MSTORM_FCOE_CONN_AG_CTX_RULE4EN_MASK  0x1
+#define MSTORM_FCOE_CONN_AG_CTX_RULE4EN_SHIFT 7
+	__le16 word0;
+	__le16 word1;
+	__le32 reg0;
+	__le32 reg1;
+};
+
+struct fcoe_mstorm_fcoe_conn_st_ctx_fp {
+	__le16 xfer_prod;
+	__le16 reserved1;
+	u8 protection_info;
+#define FCOE_MSTORM_FCOE_CONN_ST_CTX_FP_SUPPORT_PROTECTION_MASK  0x1
+#define FCOE_MSTORM_FCOE_CONN_ST_CTX_FP_SUPPORT_PROTECTION_SHIFT 0
+#define FCOE_MSTORM_FCOE_CONN_ST_CTX_FP_VALID_MASK               0x1
+#define FCOE_MSTORM_FCOE_CONN_ST_CTX_FP_VALID_SHIFT              1
+#define FCOE_MSTORM_FCOE_CONN_ST_CTX_FP_RESERVED0_MASK           0x3F
+#define FCOE_MSTORM_FCOE_CONN_ST_CTX_FP_RESERVED0_SHIFT          2
+	u8 q_relative_offset;
+	u8 reserved2[2];
+};
+
+struct fcoe_mstorm_fcoe_conn_st_ctx_non_fp {
+	__le16 conn_id;
+	__le16 stat_ram_addr;
+	__le16 num_pages_in_pbl;
+	u8 ptu_log_page_size;
+	u8 log_page_size;
+	__le16 unsolicited_cq_count;
+	__le16 cmdq_count;
+	u8 bdq_resource_id;
+	u8 reserved0[3];
+	struct regpair xferq_pbl_addr;
+	struct regpair reserved1;
+	struct regpair reserved2[3];
+};
+
+struct mstorm_fcoe_conn_st_ctx {
+	struct fcoe_mstorm_fcoe_conn_st_ctx_fp fp;
+	struct fcoe_mstorm_fcoe_conn_st_ctx_non_fp non_fp;
+};
+
+struct fcoe_conn_context {
+	struct ystorm_fcoe_conn_st_ctx ystorm_st_context;
+	struct pstorm_fcoe_conn_st_ctx pstorm_st_context;
+	struct regpair pstorm_st_padding[2];
+	struct xstorm_fcoe_conn_st_ctx xstorm_st_context;
+	struct xstorm_fcoe_conn_ag_ctx xstorm_ag_context;
+	struct regpair xstorm_ag_padding[6];
+	struct ustorm_fcoe_conn_st_ctx ustorm_st_context;
+	struct regpair ustorm_st_padding[2];
+	struct tstorm_fcoe_conn_ag_ctx tstorm_ag_context;
+	struct regpair tstorm_ag_padding[2];
+	struct timers_context timer_context;
+	struct ustorm_fcoe_conn_ag_ctx ustorm_ag_context;
+	struct tstorm_fcoe_conn_st_ctx tstorm_st_context;
+	struct mstorm_fcoe_conn_ag_ctx mstorm_ag_context;
+	struct mstorm_fcoe_conn_st_ctx mstorm_st_context;
+};
+
+struct fcoe_conn_offload_ramrod_params {
+	struct fcoe_conn_offload_ramrod_data offload_ramrod_data;
+};
+
+struct fcoe_conn_terminate_ramrod_params {
+	struct fcoe_conn_terminate_ramrod_data terminate_ramrod_data;
+};
+
+enum fcoe_event_type {
+	FCOE_EVENT_INIT_FUNC,
+	FCOE_EVENT_DESTROY_FUNC,
+	FCOE_EVENT_STAT_FUNC,
+	FCOE_EVENT_OFFLOAD_CONN,
+	FCOE_EVENT_TERMINATE_CONN,
+	FCOE_EVENT_ERROR,
+	MAX_FCOE_EVENT_TYPE
+};
+
+struct fcoe_init_ramrod_params {
+	struct fcoe_init_func_ramrod_data init_ramrod_data;
+};
+
+enum fcoe_ramrod_cmd_id {
+	FCOE_RAMROD_CMD_ID_INIT_FUNC,
+	FCOE_RAMROD_CMD_ID_DESTROY_FUNC,
+	FCOE_RAMROD_CMD_ID_STAT_FUNC,
+	FCOE_RAMROD_CMD_ID_OFFLOAD_CONN,
+	FCOE_RAMROD_CMD_ID_TERMINATE_CONN,
+	MAX_FCOE_RAMROD_CMD_ID
+};
+
+struct fcoe_stat_ramrod_params {
+	struct fcoe_stat_ramrod_data stat_ramrod_data;
+};
+
+struct ystorm_fcoe_conn_ag_ctx {
+	u8 byte0;
+	u8 byte1;
+	u8 flags0;
+#define YSTORM_FCOE_CONN_AG_CTX_BIT0_MASK     0x1
+#define YSTORM_FCOE_CONN_AG_CTX_BIT0_SHIFT    0
+#define YSTORM_FCOE_CONN_AG_CTX_BIT1_MASK     0x1
+#define YSTORM_FCOE_CONN_AG_CTX_BIT1_SHIFT    1
+#define YSTORM_FCOE_CONN_AG_CTX_CF0_MASK      0x3
+#define YSTORM_FCOE_CONN_AG_CTX_CF0_SHIFT     2
+#define YSTORM_FCOE_CONN_AG_CTX_CF1_MASK      0x3
+#define YSTORM_FCOE_CONN_AG_CTX_CF1_SHIFT     4
+#define YSTORM_FCOE_CONN_AG_CTX_CF2_MASK      0x3
+#define YSTORM_FCOE_CONN_AG_CTX_CF2_SHIFT     6
+	u8 flags1;
+#define YSTORM_FCOE_CONN_AG_CTX_CF0EN_MASK    0x1
+#define YSTORM_FCOE_CONN_AG_CTX_CF0EN_SHIFT   0
+#define YSTORM_FCOE_CONN_AG_CTX_CF1EN_MASK    0x1
+#define YSTORM_FCOE_CONN_AG_CTX_CF1EN_SHIFT   1
+#define YSTORM_FCOE_CONN_AG_CTX_CF2EN_MASK    0x1
+#define YSTORM_FCOE_CONN_AG_CTX_CF2EN_SHIFT   2
+#define YSTORM_FCOE_CONN_AG_CTX_RULE0EN_MASK  0x1
+#define YSTORM_FCOE_CONN_AG_CTX_RULE0EN_SHIFT 3
+#define YSTORM_FCOE_CONN_AG_CTX_RULE1EN_MASK  0x1
+#define YSTORM_FCOE_CONN_AG_CTX_RULE1EN_SHIFT 4
+#define YSTORM_FCOE_CONN_AG_CTX_RULE2EN_MASK  0x1
+#define YSTORM_FCOE_CONN_AG_CTX_RULE2EN_SHIFT 5
+#define YSTORM_FCOE_CONN_AG_CTX_RULE3EN_MASK  0x1
+#define YSTORM_FCOE_CONN_AG_CTX_RULE3EN_SHIFT 6
+#define YSTORM_FCOE_CONN_AG_CTX_RULE4EN_MASK  0x1
+#define YSTORM_FCOE_CONN_AG_CTX_RULE4EN_SHIFT 7
+	u8 byte2;
+	u8 byte3;
+	__le16 word0;
+	__le32 reg0;
+	__le32 reg1;
+	__le16 word1;
+	__le16 word2;
+	__le16 word3;
+	__le16 word4;
+	__le32 reg2;
+	__le32 reg3;
+};
+
 struct ystorm_iscsi_conn_st_ctx {
 	__le32 reserved[4];
 };
@@ -8435,6 +9204,7 @@ struct public_func {
 #define FUNC_MF_CFG_PROTOCOL_SHIFT	4
 #define FUNC_MF_CFG_PROTOCOL_ETHERNET	0x00000000
 #define FUNC_MF_CFG_PROTOCOL_ISCSI              0x00000010
+#define FUNC_MF_CFG_PROTOCOL_FCOE               0x00000020
 #define FUNC_MF_CFG_PROTOCOL_ROCE               0x00000030
 #define FUNC_MF_CFG_PROTOCOL_MAX	0x00000030
 
@@ -8529,6 +9299,13 @@ struct lan_stats_stc {
 	u32 rserved;
 };
 
+struct fcoe_stats_stc {
+	u64 rx_pkts;
+	u64 tx_pkts;
+	u32 fcs_err;
+	u32 login_failure;
+};
+
 struct ocbb_data_stc {
 	u32 ocbb_host_addr;
 	u32 ocsd_host_addr;
@@ -8602,6 +9379,7 @@ union drv_union_data {
 	struct drv_version_stc drv_version;
 
 	struct lan_stats_stc lan_stats;
+	struct fcoe_stats_stc fcoe_stats;
 	struct ocbb_data_stc ocbb_info;
 	struct temperature_status_stc temp_info;
 	struct resource_info resource;
@@ -8905,6 +9683,7 @@ struct nvm_cfg1_glob {
 	u32 misc_sig;
 	u32 device_capabilities;
 #define NVM_CFG1_GLOB_DEVICE_CAPABILITIES_ETHERNET	0x1
+#define NVM_CFG1_GLOB_DEVICE_CAPABILITIES_FCOE		0x2
 #define NVM_CFG1_GLOB_DEVICE_CAPABILITIES_ISCSI		0x4
 #define NVM_CFG1_GLOB_DEVICE_CAPABILITIES_ROCE		0x8
 	u32 power_dissipated;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hw.c b/drivers/net/ethernet/qlogic/qed/qed_hw.c
index 1f606516b6aa..899cad7f97ea 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hw.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_hw.c
@@ -841,6 +841,9 @@ u16 qed_get_qm_pq(struct qed_hwfn *p_hwfn,
 		if (pq_id > p_hwfn->qm_info.num_pf_rls)
 			pq_id = p_hwfn->qm_info.offload_pq;
 		break;
+	case PROTOCOLID_FCOE:
+		pq_id = p_hwfn->qm_info.offload_pq;
+		break;
 	default:
 		pq_id = 0;
 	}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
index 02c5d47cfc6d..9a0b9af10a57 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@@ -1130,6 +1130,9 @@ static int qed_sp_ll2_tx_queue_start(struct qed_hwfn *p_hwfn,
 	p_ramrod->qm_pq_id = cpu_to_le16(pq_id);
 
 	switch (conn_type) {
+	case QED_LL2_TYPE_FCOE:
+		p_ramrod->conn_type = PROTOCOLID_FCOE;
+		break;
 	case QED_LL2_TYPE_ISCSI:
 	case QED_LL2_TYPE_ISCSI_OOO:
 		p_ramrod->conn_type = PROTOCOLID_ISCSI;
@@ -1458,6 +1461,15 @@ int qed_ll2_establish_connection(struct qed_hwfn *p_hwfn, u8 connection_handle)
 
 	qed_ll2_establish_connection_ooo(p_hwfn, p_ll2_conn);
 
+	if (p_ll2_conn->conn.conn_type == QED_LL2_TYPE_FCOE) {
+		qed_llh_add_protocol_filter(p_hwfn, p_hwfn->p_main_ptt,
+					    0x8906, 0,
+					    QED_LLH_FILTER_ETHERTYPE);
+		qed_llh_add_protocol_filter(p_hwfn, p_hwfn->p_main_ptt,
+					    0x8914, 0,
+					    QED_LLH_FILTER_ETHERTYPE);
+	}
+
 	return rc;
 }
 
@@ -1831,6 +1843,15 @@ int qed_ll2_terminate_connection(struct qed_hwfn *p_hwfn, u8 connection_handle)
 	if (p_ll2_conn->conn.conn_type == QED_LL2_TYPE_ISCSI_OOO)
 		qed_ooo_release_all_isles(p_hwfn, p_hwfn->p_ooo_info);
 
+	if (p_ll2_conn->conn.conn_type == QED_LL2_TYPE_FCOE) {
+		qed_llh_remove_protocol_filter(p_hwfn, p_hwfn->p_main_ptt,
+					       0x8906, 0,
+					       QED_LLH_FILTER_ETHERTYPE);
+		qed_llh_remove_protocol_filter(p_hwfn, p_hwfn->p_main_ptt,
+					       0x8914, 0,
+					       QED_LLH_FILTER_ETHERTYPE);
+	}
+
 	return rc;
 }
 
@@ -2039,6 +2060,10 @@ static int qed_ll2_start(struct qed_dev *cdev, struct qed_ll2_params *params)
 	}
 
 	switch (QED_LEADING_HWFN(cdev)->hw_info.personality) {
+	case QED_PCI_FCOE:
+		conn_type = QED_LL2_TYPE_FCOE;
+		gsi_enable = 0;
+		break;
 	case QED_PCI_ISCSI:
 		conn_type = QED_LL2_TYPE_ISCSI;
 		gsi_enable = 0;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.h b/drivers/net/ethernet/qlogic/qed/qed_ll2.h
index db3e4fc78e09..31a409033c41 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.h
@@ -54,7 +54,7 @@ enum qed_ll2_roce_flavor_type {
 };
 
 enum qed_ll2_conn_type {
-	QED_LL2_TYPE_RESERVED,
+	QED_LL2_TYPE_FCOE,
 	QED_LL2_TYPE_ISCSI,
 	QED_LL2_TYPE_TEST,
 	QED_LL2_TYPE_ISCSI_OOO,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 592e104687a7..942c03ca3b59 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -53,9 +53,11 @@
 #include "qed_sp.h"
 #include "qed_dev_api.h"
 #include "qed_ll2.h"
+#include "qed_fcoe.h"
 #include "qed_mcp.h"
 #include "qed_hw.h"
 #include "qed_selftest.h"
+#include "qed_debug.h"
 
 #define QED_ROCE_QPS			(8192)
 #define QED_ROCE_DPIS			(8)
@@ -1603,6 +1605,8 @@ const struct qed_common_ops qed_common_ops_pass = {
 	.sb_release = &qed_sb_release,
 	.simd_handler_config = &qed_simd_handler_config,
 	.simd_handler_clean = &qed_simd_handler_clean,
+	.dbg_grc = &qed_dbg_grc,
+	.dbg_grc_size = &qed_dbg_grc_size,
 	.can_link_change = &qed_can_link_change,
 	.set_link = &qed_set_link,
 	.get_link = &qed_get_current_link,
@@ -1636,6 +1640,9 @@ void qed_get_protocol_stats(struct qed_dev *cdev,
 		stats->lan_stats.ucast_tx_pkts = eth_stats.tx_ucast_pkts;
 		stats->lan_stats.fcs_err = -1;
 		break;
+	case QED_MCP_FCOE_STATS:
+		qed_get_protocol_stats_fcoe(cdev, &stats->fcoe_stats);
+		break;
 	default:
 		DP_ERR(cdev, "Invalid protocol type = %d\n", type);
 		return;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index c8a877594032..7624a38cbd39 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -1130,6 +1130,9 @@ qed_mcp_get_shmem_proto(struct qed_hwfn *p_hwfn,
 	case FUNC_MF_CFG_PROTOCOL_ISCSI:
 		*p_proto = QED_PCI_ISCSI;
 		break;
+	case FUNC_MF_CFG_PROTOCOL_FCOE:
+		*p_proto = QED_PCI_FCOE;
+		break;
 	case FUNC_MF_CFG_PROTOCOL_ROCE:
 		DP_NOTICE(p_hwfn, "RoCE personality is not a valid value!\n");
 	/* Fallthrough */
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
index 363dce0f16b1..0792224ac219 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
@@ -37,6 +37,7 @@
 #include <linux/delay.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
+#include <linux/qed/qed_fcoe_if.h>
 #include "qed_hsi.h"
 
 struct qed_mcp_link_speed_params {
diff --git a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
index 3b7edf6ff234..d59d9df60cd2 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
@@ -110,6 +110,8 @@
 	0x1e80000UL
 #define  NIG_REG_RX_LLH_BRB_GATE_DNTFWD_PERPF \
 	0x5011f4UL
+#define PRS_REG_SEARCH_RESP_INITIATOR_TYPE \
+	0x1f0164UL
 #define  PRS_REG_SEARCH_TCP \
 	0x1f0400UL
 #define  PRS_REG_SEARCH_UDP \
@@ -120,6 +122,12 @@
 	0x1f040cUL
 #define  PRS_REG_SEARCH_OPENFLOW	\
 	0x1f0434UL
+#define PRS_REG_SEARCH_TAG1 \
+	0x1f0444UL
+#define PRS_REG_PKT_LEN_STAT_TAGS_NOT_COUNTED_FIRST \
+	0x1f0a0cUL
+#define PRS_REG_SEARCH_TCP_FIRST_FRAG \
+	0x1f0410UL
 #define  TM_REG_PF_ENABLE_CONN \
 	0x2c043cUL
 #define  TM_REG_PF_ENABLE_TASK \
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp.h b/drivers/net/ethernet/qlogic/qed/qed_sp.h
index 043882959606..30393ffaa8e5 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp.h
@@ -109,6 +109,10 @@ union ramrod_data {
 	struct rdma_srq_destroy_ramrod_data rdma_destroy_srq;
 	struct rdma_srq_modify_ramrod_data rdma_modify_srq;
 	struct roce_init_func_ramrod_data roce_init_func;
+	struct fcoe_init_ramrod_params fcoe_init;
+	struct fcoe_conn_offload_ramrod_params fcoe_conn_ofld;
+	struct fcoe_conn_terminate_ramrod_params fcoe_conn_terminate;
+	struct fcoe_stat_ramrod_params fcoe_stat;
 
 	struct iscsi_slow_path_hdr iscsi_empty;
 	struct iscsi_init_ramrod_params iscsi_init;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index 097a72987572..6fb80f9ef446 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -386,6 +386,9 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 	case QED_PCI_ETH:
 		p_ramrod->personality = PERSONALITY_ETH;
 		break;
+	case QED_PCI_FCOE:
+		p_ramrod->personality = PERSONALITY_FCOE;
+		break;
 	case QED_PCI_ISCSI:
 		p_ramrod->personality = PERSONALITY_ISCSI;
 		break;
diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h
index c33080baf38c..52966b9bfde3 100644
--- a/include/linux/qed/common_hsi.h
+++ b/include/linux/qed/common_hsi.h
@@ -62,6 +62,7 @@
 #define COMMON_QUEUE_ENTRY_MAX_BYTE_SIZE        64
 
 #define ISCSI_CDU_TASK_SEG_TYPE       0
+#define FCOE_CDU_TASK_SEG_TYPE        0
 #define RDMA_CDU_TASK_SEG_TYPE        1
 
 #define FW_ASSERT_GENERAL_ATTN_IDX    32
@@ -205,6 +206,9 @@
 #define	DQ_XCM_ETH_TX_BD_CONS_CMD	DQ_XCM_AGG_VAL_SEL_WORD3
 #define	DQ_XCM_ETH_TX_BD_PROD_CMD	DQ_XCM_AGG_VAL_SEL_WORD4
 #define	DQ_XCM_ETH_GO_TO_BD_CONS_CMD	DQ_XCM_AGG_VAL_SEL_WORD5
+#define DQ_XCM_FCOE_SQ_CONS_CMD             DQ_XCM_AGG_VAL_SEL_WORD3
+#define DQ_XCM_FCOE_SQ_PROD_CMD             DQ_XCM_AGG_VAL_SEL_WORD4
+#define DQ_XCM_FCOE_X_FERQ_PROD_CMD         DQ_XCM_AGG_VAL_SEL_WORD5
 #define DQ_XCM_ISCSI_SQ_CONS_CMD	DQ_XCM_AGG_VAL_SEL_WORD3
 #define DQ_XCM_ISCSI_SQ_PROD_CMD	DQ_XCM_AGG_VAL_SEL_WORD4
 #define DQ_XCM_ISCSI_MORE_TO_SEND_SEQ_CMD DQ_XCM_AGG_VAL_SEL_REG3
@@ -261,6 +265,7 @@
 #define DQ_XCM_ETH_TERMINATE_CMD	BIT(DQ_XCM_AGG_FLG_SHIFT_CF19)
 #define DQ_XCM_ETH_SLOW_PATH_CMD	BIT(DQ_XCM_AGG_FLG_SHIFT_CF22)
 #define DQ_XCM_ETH_TPH_EN_CMD		BIT(DQ_XCM_AGG_FLG_SHIFT_CF23)
+#define DQ_XCM_FCOE_SLOW_PATH_CMD           BIT(DQ_XCM_AGG_FLG_SHIFT_CF22)
 #define DQ_XCM_ISCSI_DQ_FLUSH_CMD	BIT(DQ_XCM_AGG_FLG_SHIFT_CF19)
 #define DQ_XCM_ISCSI_SLOW_PATH_CMD	BIT(DQ_XCM_AGG_FLG_SHIFT_CF22)
 #define DQ_XCM_ISCSI_PROC_ONLY_CLEANUP_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF23)
@@ -291,6 +296,9 @@
 #define DQ_TCM_AGG_FLG_SHIFT_CF6	6
 #define DQ_TCM_AGG_FLG_SHIFT_CF7	7
 /* TCM agg counter flag selection (FW) */
+#define DQ_TCM_FCOE_FLUSH_Q0_CMD            BIT(DQ_TCM_AGG_FLG_SHIFT_CF1)
+#define DQ_TCM_FCOE_DUMMY_TIMER_CMD         BIT(DQ_TCM_AGG_FLG_SHIFT_CF2)
+#define DQ_TCM_FCOE_TIMER_STOP_ALL_CMD      BIT(DQ_TCM_AGG_FLG_SHIFT_CF3)
 #define DQ_TCM_ISCSI_FLUSH_Q0_CMD	BIT(DQ_TCM_AGG_FLG_SHIFT_CF1)
 #define DQ_TCM_ISCSI_TIMER_STOP_ALL_CMD	BIT(DQ_TCM_AGG_FLG_SHIFT_CF3)
 
@@ -728,7 +736,7 @@ enum mf_mode {
 /* Per-protocol connection types */
 enum protocol_type {
 	PROTOCOLID_ISCSI,
-	PROTOCOLID_RESERVED2,
+	PROTOCOLID_FCOE,
 	PROTOCOLID_ROCE,
 	PROTOCOLID_CORE,
 	PROTOCOLID_ETH,
diff --git a/include/linux/qed/fcoe_common.h b/include/linux/qed/fcoe_common.h
new file mode 100644
index 000000000000..2e417a45c5f7
--- /dev/null
+++ b/include/linux/qed/fcoe_common.h
@@ -0,0 +1,715 @@
+/* QLogic qed NIC Driver
+ * Copyright (c) 2015 QLogic Corporation
+ *
+ * This software is available under the terms of the GNU General Public License
+ * (GPL) Version 2, available from the file COPYING in the main directory of
+ * this source tree.
+ */
+
+#ifndef __FCOE_COMMON__
+#define __FCOE_COMMON__
+/*********************/
+/* FCOE FW CONSTANTS */
+/*********************/
+
+#define FC_ABTS_REPLY_MAX_PAYLOAD_LEN	12
+#define FCOE_MAX_SIZE_FCP_DATA_SUPER	(8600)
+
+struct fcoe_abts_pkt {
+	__le32 abts_rsp_fc_payload_lo;
+	__le16 abts_rsp_rx_id;
+	u8 abts_rsp_rctl;
+	u8 reserved2;
+};
+
+/* FCoE additional WQE (Sq/XferQ) information */
+union fcoe_additional_info_union {
+	__le32 previous_tid;
+	__le32 parent_tid;
+	__le32 burst_length;
+	__le32 seq_rec_updated_offset;
+};
+
+struct fcoe_exp_ro {
+	__le32 data_offset;
+	__le32 reserved;
+};
+
+union fcoe_cleanup_addr_exp_ro_union {
+	struct regpair abts_rsp_fc_payload_hi;
+	struct fcoe_exp_ro exp_ro;
+};
+
+/* FCoE Ramrod Command IDs */
+enum fcoe_completion_status {
+	FCOE_COMPLETION_STATUS_SUCCESS,
+	FCOE_COMPLETION_STATUS_FCOE_VER_ERR,
+	FCOE_COMPLETION_STATUS_SRC_MAC_ADD_ARR_ERR,
+	MAX_FCOE_COMPLETION_STATUS
+};
+
+struct fc_addr_nw {
+	u8 addr_lo;
+	u8 addr_mid;
+	u8 addr_hi;
+};
+
+/* FCoE connection offload */
+struct fcoe_conn_offload_ramrod_data {
+	struct regpair sq_pbl_addr;
+	struct regpair sq_curr_page_addr;
+	struct regpair sq_next_page_addr;
+	struct regpair xferq_pbl_addr;
+	struct regpair xferq_curr_page_addr;
+	struct regpair xferq_next_page_addr;
+	struct regpair respq_pbl_addr;
+	struct regpair respq_curr_page_addr;
+	struct regpair respq_next_page_addr;
+	__le16 dst_mac_addr_lo;
+	__le16 dst_mac_addr_mid;
+	__le16 dst_mac_addr_hi;
+	__le16 src_mac_addr_lo;
+	__le16 src_mac_addr_mid;
+	__le16 src_mac_addr_hi;
+	__le16 tx_max_fc_pay_len;
+	__le16 e_d_tov_timer_val;
+	__le16 rx_max_fc_pay_len;
+	__le16 vlan_tag;
+#define FCOE_CONN_OFFLOAD_RAMROD_DATA_VLAN_ID_MASK              0xFFF
+#define FCOE_CONN_OFFLOAD_RAMROD_DATA_VLAN_ID_SHIFT             0
+#define FCOE_CONN_OFFLOAD_RAMROD_DATA_CFI_MASK                  0x1
+#define FCOE_CONN_OFFLOAD_RAMROD_DATA_CFI_SHIFT                 12
+#define FCOE_CONN_OFFLOAD_RAMROD_DATA_PRIORITY_MASK             0x7
+#define FCOE_CONN_OFFLOAD_RAMROD_DATA_PRIORITY_SHIFT            13
+	__le16 physical_q0;
+	__le16 rec_rr_tov_timer_val;
+	struct fc_addr_nw s_id;
+	u8 max_conc_seqs_c3;
+	struct fc_addr_nw d_id;
+	u8 flags;
+#define FCOE_CONN_OFFLOAD_RAMROD_DATA_B_CONT_INCR_SEQ_CNT_MASK  0x1
+#define FCOE_CONN_OFFLOAD_RAMROD_DATA_B_CONT_INCR_SEQ_CNT_SHIFT 0
+#define FCOE_CONN_OFFLOAD_RAMROD_DATA_B_CONF_REQ_MASK           0x1
+#define FCOE_CONN_OFFLOAD_RAMROD_DATA_B_CONF_REQ_SHIFT          1
+#define FCOE_CONN_OFFLOAD_RAMROD_DATA_B_REC_VALID_MASK          0x1
+#define FCOE_CONN_OFFLOAD_RAMROD_DATA_B_REC_VALID_SHIFT         2
+#define FCOE_CONN_OFFLOAD_RAMROD_DATA_B_VLAN_FLAG_MASK          0x1
+#define FCOE_CONN_OFFLOAD_RAMROD_DATA_B_VLAN_FLAG_SHIFT         3
+#define FCOE_CONN_OFFLOAD_RAMROD_DATA_MODE_MASK                 0x3
+#define FCOE_CONN_OFFLOAD_RAMROD_DATA_MODE_SHIFT                4
+#define FCOE_CONN_OFFLOAD_RAMROD_DATA_RESERVED0_MASK            0x3
+#define FCOE_CONN_OFFLOAD_RAMROD_DATA_RESERVED0_SHIFT           6
+	__le16 conn_id;
+	u8 def_q_idx;
+	u8 reserved[5];
+};
+
+/* FCoE terminate connection request */
+struct fcoe_conn_terminate_ramrod_data {
+	struct regpair terminate_params_addr;
+};
+
+struct fcoe_fast_sgl_ctx {
+	struct regpair sgl_start_addr;
+	__le32 sgl_byte_offset;
+	__le16 task_reuse_cnt;
+	__le16 init_offset_in_first_sge;
+};
+
+struct fcoe_slow_sgl_ctx {
+	struct regpair base_sgl_addr;
+	__le16 curr_sge_off;
+	__le16 remainder_num_sges;
+	__le16 curr_sgl_index;
+	__le16 reserved;
+};
+
+struct fcoe_sge {
+	struct regpair sge_addr;
+	__le16 size;
+	__le16 reserved0;
+	u8 reserved1[3];
+	u8 is_valid_sge;
+};
+
+union fcoe_data_desc_ctx {
+	struct fcoe_fast_sgl_ctx fast;
+	struct fcoe_slow_sgl_ctx slow;
+	struct fcoe_sge single_sge;
+};
+
+union fcoe_dix_desc_ctx {
+	struct fcoe_slow_sgl_ctx dix_sgl;
+	struct fcoe_sge cached_dix_sge;
+};
+
+struct fcoe_fcp_cmd_payload {
+	__le32 opaque[8];
+};
+
+struct fcoe_fcp_rsp_payload {
+	__le32 opaque[6];
+};
+
+struct fcoe_fcp_xfer_payload {
+	__le32 opaque[3];
+};
+
+/* FCoE firmware function init */
+struct fcoe_init_func_ramrod_data {
+	struct scsi_init_func_params func_params;
+	struct scsi_init_func_queues q_params;
+	__le16 mtu;
+	__le16 sq_num_pages_in_pbl;
+	__le32 reserved;
+};
+
+/* FCoE: Mode of the connection: Target or Initiator or both */
+enum fcoe_mode_type {
+	FCOE_INITIATOR_MODE = 0x0,
+	FCOE_TARGET_MODE = 0x1,
+	FCOE_BOTH_OR_NOT_CHOSEN = 0x3,
+	MAX_FCOE_MODE_TYPE
+};
+
+struct fcoe_mstorm_fcoe_task_st_ctx_fp {
+	__le16 flags;
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_FP_RSRV0_MASK                 0x7FFF
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_FP_RSRV0_SHIFT                0
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_FP_MP_INCLUDE_FC_HEADER_MASK  0x1
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_FP_MP_INCLUDE_FC_HEADER_SHIFT 15
+	__le16 difDataResidue;
+	__le16 parent_id;
+	__le16 single_sge_saved_offset;
+	__le32 data_2_trns_rem;
+	__le32 offset_in_io;
+	union fcoe_dix_desc_ctx dix_desc;
+	union fcoe_data_desc_ctx data_desc;
+};
+
+struct fcoe_mstorm_fcoe_task_st_ctx_non_fp {
+	__le16 flags;
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_HOST_INTERFACE_MASK            0x3
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_HOST_INTERFACE_SHIFT           0
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_DIF_TO_PEER_MASK               0x1
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_DIF_TO_PEER_SHIFT              2
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_VALIDATE_DIX_APP_TAG_MASK      0x1
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_VALIDATE_DIX_APP_TAG_SHIFT     3
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_INTERVAL_SIZE_LOG_MASK         0xF
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_INTERVAL_SIZE_LOG_SHIFT        4
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_DIX_BLOCK_SIZE_MASK            0x3
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_DIX_BLOCK_SIZE_SHIFT           8
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_RESERVED_MASK                  0x1
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_RESERVED_SHIFT                 10
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_HAS_FIRST_PACKET_ARRIVED_MASK  0x1
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_HAS_FIRST_PACKET_ARRIVED_SHIFT 11
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_VALIDATE_DIX_REF_TAG_MASK      0x1
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_VALIDATE_DIX_REF_TAG_SHIFT     12
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_DIX_CACHED_SGE_FLG_MASK        0x1
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_DIX_CACHED_SGE_FLG_SHIFT       13
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_OFFSET_IN_IO_VALID_MASK        0x1
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_OFFSET_IN_IO_VALID_SHIFT       14
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_DIF_SUPPORTED_MASK             0x1
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_DIF_SUPPORTED_SHIFT            15
+	u8 tx_rx_sgl_mode;
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_TX_SGL_MODE_MASK               0x7
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_TX_SGL_MODE_SHIFT              0
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_RX_SGL_MODE_MASK               0x7
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_RX_SGL_MODE_SHIFT              3
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_RSRV1_MASK                     0x3
+#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_RSRV1_SHIFT                    6
+	u8 rsrv2;
+	__le32 num_prm_zero_read;
+	struct regpair rsp_buf_addr;
+};
+
+struct fcoe_rx_stat {
+	struct regpair fcoe_rx_byte_cnt;
+	struct regpair fcoe_rx_data_pkt_cnt;
+	struct regpair fcoe_rx_xfer_pkt_cnt;
+	struct regpair fcoe_rx_other_pkt_cnt;
+	__le32 fcoe_silent_drop_pkt_cmdq_full_cnt;
+	__le32 fcoe_silent_drop_pkt_rq_full_cnt;
+	__le32 fcoe_silent_drop_pkt_crc_error_cnt;
+	__le32 fcoe_silent_drop_pkt_task_invalid_cnt;
+	__le32 fcoe_silent_drop_total_pkt_cnt;
+	__le32 rsrv;
+};
+
+enum fcoe_sgl_mode {
+	FCOE_SLOW_SGL,
+	FCOE_SINGLE_FAST_SGE,
+	FCOE_2_FAST_SGE,
+	FCOE_3_FAST_SGE,
+	FCOE_4_FAST_SGE,
+	FCOE_MUL_FAST_SGES,
+	MAX_FCOE_SGL_MODE
+};
+
+struct fcoe_stat_ramrod_data {
+	struct regpair stat_params_addr;
+};
+
+struct protection_info_ctx {
+	__le16 flags;
+#define PROTECTION_INFO_CTX_HOST_INTERFACE_MASK        0x3
+#define PROTECTION_INFO_CTX_HOST_INTERFACE_SHIFT       0
+#define PROTECTION_INFO_CTX_DIF_TO_PEER_MASK           0x1
+#define PROTECTION_INFO_CTX_DIF_TO_PEER_SHIFT          2
+#define PROTECTION_INFO_CTX_VALIDATE_DIX_APP_TAG_MASK  0x1
+#define PROTECTION_INFO_CTX_VALIDATE_DIX_APP_TAG_SHIFT 3
+#define PROTECTION_INFO_CTX_INTERVAL_SIZE_LOG_MASK     0xF
+#define PROTECTION_INFO_CTX_INTERVAL_SIZE_LOG_SHIFT    4
+#define PROTECTION_INFO_CTX_VALIDATE_DIX_REF_TAG_MASK  0x1
+#define PROTECTION_INFO_CTX_VALIDATE_DIX_REF_TAG_SHIFT 8
+#define PROTECTION_INFO_CTX_RESERVED0_MASK             0x7F
+#define PROTECTION_INFO_CTX_RESERVED0_SHIFT            9
+	u8 dix_block_size;
+	u8 dst_size;
+};
+
+union protection_info_union_ctx {
+	struct protection_info_ctx info;
+	__le32 value;
+};
+
+struct fcp_rsp_payload_padded {
+	struct fcoe_fcp_rsp_payload rsp_payload;
+	__le32 reserved[2];
+};
+
+struct fcp_xfer_payload_padded {
+	struct fcoe_fcp_xfer_payload xfer_payload;
+	__le32 reserved[5];
+};
+
+struct fcoe_tx_data_params {
+	__le32 data_offset;
+	__le32 offset_in_io;
+	u8 flags;
+#define FCOE_TX_DATA_PARAMS_OFFSET_IN_IO_VALID_MASK  0x1
+#define FCOE_TX_DATA_PARAMS_OFFSET_IN_IO_VALID_SHIFT 0
+#define FCOE_TX_DATA_PARAMS_DROP_DATA_MASK           0x1
+#define FCOE_TX_DATA_PARAMS_DROP_DATA_SHIFT          1
+#define FCOE_TX_DATA_PARAMS_AFTER_SEQ_REC_MASK       0x1
+#define FCOE_TX_DATA_PARAMS_AFTER_SEQ_REC_SHIFT      2
+#define FCOE_TX_DATA_PARAMS_RESERVED0_MASK           0x1F
+#define FCOE_TX_DATA_PARAMS_RESERVED0_SHIFT          3
+	u8 dif_residual;
+	__le16 seq_cnt;
+	__le16 single_sge_saved_offset;
+	__le16 next_dif_offset;
+	__le16 seq_id;
+	__le16 reserved3;
+};
+
+struct fcoe_tx_mid_path_params {
+	__le32 parameter;
+	u8 r_ctl;
+	u8 type;
+	u8 cs_ctl;
+	u8 df_ctl;
+	__le16 rx_id;
+	__le16 ox_id;
+};
+
+struct fcoe_tx_params {
+	struct fcoe_tx_data_params data;
+	struct fcoe_tx_mid_path_params mid_path;
+};
+
+union fcoe_tx_info_union_ctx {
+	struct fcoe_fcp_cmd_payload fcp_cmd_payload;
+	struct fcp_rsp_payload_padded fcp_rsp_payload;
+	struct fcp_xfer_payload_padded fcp_xfer_payload;
+	struct fcoe_tx_params tx_params;
+};
+
+struct ystorm_fcoe_task_st_ctx {
+	u8 task_type;
+	u8 sgl_mode;
+#define YSTORM_FCOE_TASK_ST_CTX_TX_SGL_MODE_MASK  0x7
+#define YSTORM_FCOE_TASK_ST_CTX_TX_SGL_MODE_SHIFT 0
+#define YSTORM_FCOE_TASK_ST_CTX_RSRV_MASK         0x1F
+#define YSTORM_FCOE_TASK_ST_CTX_RSRV_SHIFT        3
+	u8 cached_dix_sge;
+	u8 expect_first_xfer;
+	__le32 num_pbf_zero_write;
+	union protection_info_union_ctx protection_info_union;
+	__le32 data_2_trns_rem;
+	union fcoe_tx_info_union_ctx tx_info_union;
+	union fcoe_dix_desc_ctx dix_desc;
+	union fcoe_data_desc_ctx data_desc;
+	__le16 ox_id;
+	__le16 rx_id;
+	__le32 task_rety_identifier;
+	__le32 reserved1[2];
+};
+
+struct ystorm_fcoe_task_ag_ctx {
+	u8 byte0;
+	u8 byte1;
+	__le16 word0;
+	u8 flags0;
+#define YSTORM_FCOE_TASK_AG_CTX_NIBBLE0_MASK     0xF
+#define YSTORM_FCOE_TASK_AG_CTX_NIBBLE0_SHIFT    0
+#define YSTORM_FCOE_TASK_AG_CTX_BIT0_MASK        0x1
+#define YSTORM_FCOE_TASK_AG_CTX_BIT0_SHIFT       4
+#define YSTORM_FCOE_TASK_AG_CTX_BIT1_MASK        0x1
+#define YSTORM_FCOE_TASK_AG_CTX_BIT1_SHIFT       5
+#define YSTORM_FCOE_TASK_AG_CTX_BIT2_MASK        0x1
+#define YSTORM_FCOE_TASK_AG_CTX_BIT2_SHIFT       6
+#define YSTORM_FCOE_TASK_AG_CTX_BIT3_MASK        0x1
+#define YSTORM_FCOE_TASK_AG_CTX_BIT3_SHIFT       7
+	u8 flags1;
+#define YSTORM_FCOE_TASK_AG_CTX_CF0_MASK         0x3
+#define YSTORM_FCOE_TASK_AG_CTX_CF0_SHIFT        0
+#define YSTORM_FCOE_TASK_AG_CTX_CF1_MASK         0x3
+#define YSTORM_FCOE_TASK_AG_CTX_CF1_SHIFT        2
+#define YSTORM_FCOE_TASK_AG_CTX_CF2SPECIAL_MASK  0x3
+#define YSTORM_FCOE_TASK_AG_CTX_CF2SPECIAL_SHIFT 4
+#define YSTORM_FCOE_TASK_AG_CTX_CF0EN_MASK       0x1
+#define YSTORM_FCOE_TASK_AG_CTX_CF0EN_SHIFT      6
+#define YSTORM_FCOE_TASK_AG_CTX_CF1EN_MASK       0x1
+#define YSTORM_FCOE_TASK_AG_CTX_CF1EN_SHIFT      7
+	u8 flags2;
+#define YSTORM_FCOE_TASK_AG_CTX_BIT4_MASK        0x1
+#define YSTORM_FCOE_TASK_AG_CTX_BIT4_SHIFT       0
+#define YSTORM_FCOE_TASK_AG_CTX_RULE0EN_MASK     0x1
+#define YSTORM_FCOE_TASK_AG_CTX_RULE0EN_SHIFT    1
+#define YSTORM_FCOE_TASK_AG_CTX_RULE1EN_MASK     0x1
+#define YSTORM_FCOE_TASK_AG_CTX_RULE1EN_SHIFT    2
+#define YSTORM_FCOE_TASK_AG_CTX_RULE2EN_MASK     0x1
+#define YSTORM_FCOE_TASK_AG_CTX_RULE2EN_SHIFT    3
+#define YSTORM_FCOE_TASK_AG_CTX_RULE3EN_MASK     0x1
+#define YSTORM_FCOE_TASK_AG_CTX_RULE3EN_SHIFT    4
+#define YSTORM_FCOE_TASK_AG_CTX_RULE4EN_MASK     0x1
+#define YSTORM_FCOE_TASK_AG_CTX_RULE4EN_SHIFT    5
+#define YSTORM_FCOE_TASK_AG_CTX_RULE5EN_MASK     0x1
+#define YSTORM_FCOE_TASK_AG_CTX_RULE5EN_SHIFT    6
+#define YSTORM_FCOE_TASK_AG_CTX_RULE6EN_MASK     0x1
+#define YSTORM_FCOE_TASK_AG_CTX_RULE6EN_SHIFT    7
+	u8 byte2;
+	__le32 reg0;
+	u8 byte3;
+	u8 byte4;
+	__le16 rx_id;
+	__le16 word2;
+	__le16 word3;
+	__le16 word4;
+	__le16 word5;
+	__le32 reg1;
+	__le32 reg2;
+};
+
+struct tstorm_fcoe_task_ag_ctx {
+	u8 reserved;
+	u8 byte1;
+	__le16 icid;
+	u8 flags0;
+#define TSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_MASK     0xF
+#define TSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_SHIFT    0
+#define TSTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_MASK        0x1
+#define TSTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_SHIFT       4
+#define TSTORM_FCOE_TASK_AG_CTX_BIT1_MASK                0x1
+#define TSTORM_FCOE_TASK_AG_CTX_BIT1_SHIFT               5
+#define TSTORM_FCOE_TASK_AG_CTX_WAIT_ABTS_RSP_F_MASK     0x1
+#define TSTORM_FCOE_TASK_AG_CTX_WAIT_ABTS_RSP_F_SHIFT    6
+#define TSTORM_FCOE_TASK_AG_CTX_VALID_MASK               0x1
+#define TSTORM_FCOE_TASK_AG_CTX_VALID_SHIFT              7
+	u8 flags1;
+#define TSTORM_FCOE_TASK_AG_CTX_FALSE_RR_TOV_MASK        0x1
+#define TSTORM_FCOE_TASK_AG_CTX_FALSE_RR_TOV_SHIFT       0
+#define TSTORM_FCOE_TASK_AG_CTX_BIT5_MASK                0x1
+#define TSTORM_FCOE_TASK_AG_CTX_BIT5_SHIFT               1
+#define TSTORM_FCOE_TASK_AG_CTX_REC_RR_TOV_CF_MASK       0x3
+#define TSTORM_FCOE_TASK_AG_CTX_REC_RR_TOV_CF_SHIFT      2
+#define TSTORM_FCOE_TASK_AG_CTX_ED_TOV_CF_MASK           0x3
+#define TSTORM_FCOE_TASK_AG_CTX_ED_TOV_CF_SHIFT          4
+#define TSTORM_FCOE_TASK_AG_CTX_CF2_MASK                 0x3
+#define TSTORM_FCOE_TASK_AG_CTX_CF2_SHIFT                6
+	u8 flags2;
+#define TSTORM_FCOE_TASK_AG_CTX_TIMER_STOP_ALL_MASK      0x3
+#define TSTORM_FCOE_TASK_AG_CTX_TIMER_STOP_ALL_SHIFT     0
+#define TSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_MASK       0x3
+#define TSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_SHIFT      2
+#define TSTORM_FCOE_TASK_AG_CTX_SEQ_INIT_CF_MASK         0x3
+#define TSTORM_FCOE_TASK_AG_CTX_SEQ_INIT_CF_SHIFT        4
+#define TSTORM_FCOE_TASK_AG_CTX_SEQ_RECOVERY_CF_MASK     0x3
+#define TSTORM_FCOE_TASK_AG_CTX_SEQ_RECOVERY_CF_SHIFT    6
+	u8 flags3;
+#define TSTORM_FCOE_TASK_AG_CTX_UNSOL_COMP_CF_MASK       0x3
+#define TSTORM_FCOE_TASK_AG_CTX_UNSOL_COMP_CF_SHIFT      0
+#define TSTORM_FCOE_TASK_AG_CTX_REC_RR_TOV_CF_EN_MASK    0x1
+#define TSTORM_FCOE_TASK_AG_CTX_REC_RR_TOV_CF_EN_SHIFT   2
+#define TSTORM_FCOE_TASK_AG_CTX_ED_TOV_CF_EN_MASK        0x1
+#define TSTORM_FCOE_TASK_AG_CTX_ED_TOV_CF_EN_SHIFT       3
+#define TSTORM_FCOE_TASK_AG_CTX_CF2EN_MASK               0x1
+#define TSTORM_FCOE_TASK_AG_CTX_CF2EN_SHIFT              4
+#define TSTORM_FCOE_TASK_AG_CTX_TIMER_STOP_ALL_EN_MASK   0x1
+#define TSTORM_FCOE_TASK_AG_CTX_TIMER_STOP_ALL_EN_SHIFT  5
+#define TSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_EN_MASK    0x1
+#define TSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_EN_SHIFT   6
+#define TSTORM_FCOE_TASK_AG_CTX_SEQ_INIT_CF_EN_MASK      0x1
+#define TSTORM_FCOE_TASK_AG_CTX_SEQ_INIT_CF_EN_SHIFT     7
+	u8 flags4;
+#define TSTORM_FCOE_TASK_AG_CTX_SEQ_RECOVERY_CF_EN_MASK  0x1
+#define TSTORM_FCOE_TASK_AG_CTX_SEQ_RECOVERY_CF_EN_SHIFT 0
+#define TSTORM_FCOE_TASK_AG_CTX_UNSOL_COMP_CF_EN_MASK    0x1
+#define TSTORM_FCOE_TASK_AG_CTX_UNSOL_COMP_CF_EN_SHIFT   1
+#define TSTORM_FCOE_TASK_AG_CTX_RULE0EN_MASK             0x1
+#define TSTORM_FCOE_TASK_AG_CTX_RULE0EN_SHIFT            2
+#define TSTORM_FCOE_TASK_AG_CTX_RULE1EN_MASK             0x1
+#define TSTORM_FCOE_TASK_AG_CTX_RULE1EN_SHIFT            3
+#define TSTORM_FCOE_TASK_AG_CTX_RULE2EN_MASK             0x1
+#define TSTORM_FCOE_TASK_AG_CTX_RULE2EN_SHIFT            4
+#define TSTORM_FCOE_TASK_AG_CTX_RULE3EN_MASK             0x1
+#define TSTORM_FCOE_TASK_AG_CTX_RULE3EN_SHIFT            5
+#define TSTORM_FCOE_TASK_AG_CTX_RULE4EN_MASK             0x1
+#define TSTORM_FCOE_TASK_AG_CTX_RULE4EN_SHIFT            6
+#define TSTORM_FCOE_TASK_AG_CTX_RULE5EN_MASK             0x1
+#define TSTORM_FCOE_TASK_AG_CTX_RULE5EN_SHIFT            7
+	u8 cleanup_state;
+	__le16 last_sent_tid;
+	__le32 rec_rr_tov_exp_timeout;
+	u8 byte3;
+	u8 byte4;
+	__le16 word2;
+	__le16 word3;
+	__le16 word4;
+	__le32 data_offset_end_of_seq;
+	__le32 data_offset_next;
+};
+
+struct fcoe_tstorm_fcoe_task_st_ctx_read_write {
+	union fcoe_cleanup_addr_exp_ro_union cleanup_addr_exp_ro_union;
+	__le16 flags;
+#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RX_SGL_MODE_MASK       0x7
+#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RX_SGL_MODE_SHIFT      0
+#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_EXP_FIRST_FRAME_MASK   0x1
+#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_EXP_FIRST_FRAME_SHIFT  3
+#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_ACTIVE_MASK        0x1
+#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_ACTIVE_SHIFT       4
+#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_TIMEOUT_MASK       0x1
+#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_TIMEOUT_SHIFT      5
+#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SINGLE_PKT_IN_EX_MASK  0x1
+#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SINGLE_PKT_IN_EX_SHIFT 6
+#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_OOO_RX_SEQ_STAT_MASK   0x1
+#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_OOO_RX_SEQ_STAT_SHIFT  7
+#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_CQ_ADD_ADV_MASK        0x3
+#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_CQ_ADD_ADV_SHIFT       8
+#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RSRV1_MASK             0x3F
+#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RSRV1_SHIFT            10
+	__le16 seq_cnt;
+	u8 seq_id;
+	u8 ooo_rx_seq_id;
+	__le16 rx_id;
+	struct fcoe_abts_pkt abts_data;
+	__le32 e_d_tov_exp_timeout_val;
+	__le16 ooo_rx_seq_cnt;
+	__le16 reserved1;
+};
+
+struct fcoe_tstorm_fcoe_task_st_ctx_read_only {
+	u8 task_type;
+	u8 dev_type;
+	u8 conf_supported;
+	u8 glbl_q_num;
+	__le32 cid;
+	__le32 fcp_cmd_trns_size;
+	__le32 rsrv;
+};
+
+struct tstorm_fcoe_task_st_ctx {
+	struct fcoe_tstorm_fcoe_task_st_ctx_read_write read_write;
+	struct fcoe_tstorm_fcoe_task_st_ctx_read_only read_only;
+};
+
+struct mstorm_fcoe_task_ag_ctx {
+	u8 byte0;
+	u8 byte1;
+	__le16 icid;
+	u8 flags0;
+#define MSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_MASK    0xF
+#define MSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_SHIFT   0
+#define MSTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_MASK       0x1
+#define MSTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_SHIFT      4
+#define MSTORM_FCOE_TASK_AG_CTX_CQE_PLACED_MASK         0x1
+#define MSTORM_FCOE_TASK_AG_CTX_CQE_PLACED_SHIFT        5
+#define MSTORM_FCOE_TASK_AG_CTX_BIT2_MASK               0x1
+#define MSTORM_FCOE_TASK_AG_CTX_BIT2_SHIFT              6
+#define MSTORM_FCOE_TASK_AG_CTX_BIT3_MASK               0x1
+#define MSTORM_FCOE_TASK_AG_CTX_BIT3_SHIFT              7
+	u8 flags1;
+#define MSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_MASK      0x3
+#define MSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_SHIFT     0
+#define MSTORM_FCOE_TASK_AG_CTX_CF1_MASK                0x3
+#define MSTORM_FCOE_TASK_AG_CTX_CF1_SHIFT               2
+#define MSTORM_FCOE_TASK_AG_CTX_CF2_MASK                0x3
+#define MSTORM_FCOE_TASK_AG_CTX_CF2_SHIFT               4
+#define MSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_EN_MASK   0x1
+#define MSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_EN_SHIFT  6
+#define MSTORM_FCOE_TASK_AG_CTX_CF1EN_MASK              0x1
+#define MSTORM_FCOE_TASK_AG_CTX_CF1EN_SHIFT             7
+	u8 flags2;
+#define MSTORM_FCOE_TASK_AG_CTX_CF2EN_MASK              0x1
+#define MSTORM_FCOE_TASK_AG_CTX_CF2EN_SHIFT             0
+#define MSTORM_FCOE_TASK_AG_CTX_RULE0EN_MASK            0x1
+#define MSTORM_FCOE_TASK_AG_CTX_RULE0EN_SHIFT           1
+#define MSTORM_FCOE_TASK_AG_CTX_RULE1EN_MASK            0x1
+#define MSTORM_FCOE_TASK_AG_CTX_RULE1EN_SHIFT           2
+#define MSTORM_FCOE_TASK_AG_CTX_RULE2EN_MASK            0x1
+#define MSTORM_FCOE_TASK_AG_CTX_RULE2EN_SHIFT           3
+#define MSTORM_FCOE_TASK_AG_CTX_RULE3EN_MASK            0x1
+#define MSTORM_FCOE_TASK_AG_CTX_RULE3EN_SHIFT           4
+#define MSTORM_FCOE_TASK_AG_CTX_RULE4EN_MASK            0x1
+#define MSTORM_FCOE_TASK_AG_CTX_RULE4EN_SHIFT           5
+#define MSTORM_FCOE_TASK_AG_CTX_XFER_PLACEMENT_EN_MASK  0x1
+#define MSTORM_FCOE_TASK_AG_CTX_XFER_PLACEMENT_EN_SHIFT 6
+#define MSTORM_FCOE_TASK_AG_CTX_RULE6EN_MASK            0x1
+#define MSTORM_FCOE_TASK_AG_CTX_RULE6EN_SHIFT           7
+	u8 cleanup_state;
+	__le32 received_bytes;
+	u8 byte3;
+	u8 glbl_q_num;
+	__le16 word1;
+	__le16 tid_to_xfer;
+	__le16 word3;
+	__le16 word4;
+	__le16 word5;
+	__le32 expected_bytes;
+	__le32 reg2;
+};
+
+struct mstorm_fcoe_task_st_ctx {
+	struct fcoe_mstorm_fcoe_task_st_ctx_non_fp non_fp;
+	struct fcoe_mstorm_fcoe_task_st_ctx_fp fp;
+};
+
+struct ustorm_fcoe_task_ag_ctx {
+	u8 reserved;
+	u8 byte1;
+	__le16 icid;
+	u8 flags0;
+#define USTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_MASK  0xF
+#define USTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_SHIFT 0
+#define USTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_MASK     0x1
+#define USTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_SHIFT    4
+#define USTORM_FCOE_TASK_AG_CTX_BIT1_MASK             0x1
+#define USTORM_FCOE_TASK_AG_CTX_BIT1_SHIFT            5
+#define USTORM_FCOE_TASK_AG_CTX_CF0_MASK              0x3
+#define USTORM_FCOE_TASK_AG_CTX_CF0_SHIFT             6
+	u8 flags1;
+#define USTORM_FCOE_TASK_AG_CTX_CF1_MASK              0x3
+#define USTORM_FCOE_TASK_AG_CTX_CF1_SHIFT             0
+#define USTORM_FCOE_TASK_AG_CTX_CF2_MASK              0x3
+#define USTORM_FCOE_TASK_AG_CTX_CF2_SHIFT             2
+#define USTORM_FCOE_TASK_AG_CTX_CF3_MASK              0x3
+#define USTORM_FCOE_TASK_AG_CTX_CF3_SHIFT             4
+#define USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_CF_MASK     0x3
+#define USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_CF_SHIFT    6
+	u8 flags2;
+#define USTORM_FCOE_TASK_AG_CTX_CF0EN_MASK            0x1
+#define USTORM_FCOE_TASK_AG_CTX_CF0EN_SHIFT           0
+#define USTORM_FCOE_TASK_AG_CTX_CF1EN_MASK            0x1
+#define USTORM_FCOE_TASK_AG_CTX_CF1EN_SHIFT           1
+#define USTORM_FCOE_TASK_AG_CTX_CF2EN_MASK            0x1
+#define USTORM_FCOE_TASK_AG_CTX_CF2EN_SHIFT           2
+#define USTORM_FCOE_TASK_AG_CTX_CF3EN_MASK            0x1
+#define USTORM_FCOE_TASK_AG_CTX_CF3EN_SHIFT           3
+#define USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_CF_EN_MASK  0x1
+#define USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_CF_EN_SHIFT 4
+#define USTORM_FCOE_TASK_AG_CTX_RULE0EN_MASK          0x1
+#define USTORM_FCOE_TASK_AG_CTX_RULE0EN_SHIFT         5
+#define USTORM_FCOE_TASK_AG_CTX_RULE1EN_MASK          0x1
+#define USTORM_FCOE_TASK_AG_CTX_RULE1EN_SHIFT         6
+#define USTORM_FCOE_TASK_AG_CTX_RULE2EN_MASK          0x1
+#define USTORM_FCOE_TASK_AG_CTX_RULE2EN_SHIFT         7
+	u8 flags3;
+#define USTORM_FCOE_TASK_AG_CTX_RULE3EN_MASK          0x1
+#define USTORM_FCOE_TASK_AG_CTX_RULE3EN_SHIFT         0
+#define USTORM_FCOE_TASK_AG_CTX_RULE4EN_MASK          0x1
+#define USTORM_FCOE_TASK_AG_CTX_RULE4EN_SHIFT         1
+#define USTORM_FCOE_TASK_AG_CTX_RULE5EN_MASK          0x1
+#define USTORM_FCOE_TASK_AG_CTX_RULE5EN_SHIFT         2
+#define USTORM_FCOE_TASK_AG_CTX_RULE6EN_MASK          0x1
+#define USTORM_FCOE_TASK_AG_CTX_RULE6EN_SHIFT         3
+#define USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_TYPE_MASK   0xF
+#define USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_TYPE_SHIFT  4
+	__le32 dif_err_intervals;
+	__le32 dif_error_1st_interval;
+	__le32 global_cq_num;
+	__le32 reg3;
+	__le32 reg4;
+	__le32 reg5;
+};
+
+struct fcoe_task_context {
+	struct ystorm_fcoe_task_st_ctx ystorm_st_context;
+	struct tdif_task_context tdif_context;
+	struct ystorm_fcoe_task_ag_ctx ystorm_ag_context;
+	struct tstorm_fcoe_task_ag_ctx tstorm_ag_context;
+	struct timers_context timer_context;
+	struct tstorm_fcoe_task_st_ctx tstorm_st_context;
+	struct regpair tstorm_st_padding[2];
+	struct mstorm_fcoe_task_ag_ctx mstorm_ag_context;
+	struct mstorm_fcoe_task_st_ctx mstorm_st_context;
+	struct ustorm_fcoe_task_ag_ctx ustorm_ag_context;
+	struct rdif_task_context rdif_context;
+};
+
+struct fcoe_tx_stat {
+	struct regpair fcoe_tx_byte_cnt;
+	struct regpair fcoe_tx_data_pkt_cnt;
+	struct regpair fcoe_tx_xfer_pkt_cnt;
+	struct regpair fcoe_tx_other_pkt_cnt;
+};
+
+struct fcoe_wqe {
+	__le16 task_id;
+	__le16 flags;
+#define FCOE_WQE_REQ_TYPE_MASK        0xF
+#define FCOE_WQE_REQ_TYPE_SHIFT       0
+#define FCOE_WQE_SGL_MODE_MASK        0x7
+#define FCOE_WQE_SGL_MODE_SHIFT       4
+#define FCOE_WQE_CONTINUATION_MASK    0x1
+#define FCOE_WQE_CONTINUATION_SHIFT   7
+#define FCOE_WQE_INVALIDATE_PTU_MASK  0x1
+#define FCOE_WQE_INVALIDATE_PTU_SHIFT 8
+#define FCOE_WQE_SUPER_IO_MASK        0x1
+#define FCOE_WQE_SUPER_IO_SHIFT       9
+#define FCOE_WQE_SEND_AUTO_RSP_MASK   0x1
+#define FCOE_WQE_SEND_AUTO_RSP_SHIFT  10
+#define FCOE_WQE_RESERVED0_MASK       0x1F
+#define FCOE_WQE_RESERVED0_SHIFT      11
+	union fcoe_additional_info_union additional_info_union;
+};
+
+struct xfrqe_prot_flags {
+	u8 flags;
+#define XFRQE_PROT_FLAGS_PROT_INTERVAL_SIZE_LOG_MASK  0xF
+#define XFRQE_PROT_FLAGS_PROT_INTERVAL_SIZE_LOG_SHIFT 0
+#define XFRQE_PROT_FLAGS_DIF_TO_PEER_MASK             0x1
+#define XFRQE_PROT_FLAGS_DIF_TO_PEER_SHIFT            4
+#define XFRQE_PROT_FLAGS_HOST_INTERFACE_MASK          0x3
+#define XFRQE_PROT_FLAGS_HOST_INTERFACE_SHIFT         5
+#define XFRQE_PROT_FLAGS_RESERVED_MASK                0x1
+#define XFRQE_PROT_FLAGS_RESERVED_SHIFT               7
+};
+
+struct fcoe_db_data {
+	u8 params;
+#define FCOE_DB_DATA_DEST_MASK         0x3
+#define FCOE_DB_DATA_DEST_SHIFT        0
+#define FCOE_DB_DATA_AGG_CMD_MASK      0x3
+#define FCOE_DB_DATA_AGG_CMD_SHIFT     2
+#define FCOE_DB_DATA_BYPASS_EN_MASK    0x1
+#define FCOE_DB_DATA_BYPASS_EN_SHIFT   4
+#define FCOE_DB_DATA_RESERVED_MASK     0x1
+#define FCOE_DB_DATA_RESERVED_SHIFT    5
+#define FCOE_DB_DATA_AGG_VAL_SEL_MASK  0x3
+#define FCOE_DB_DATA_AGG_VAL_SEL_SHIFT 6
+	u8 agg_flags;
+	__le16 sq_prod;
+};
+#endif /* __FCOE_COMMON__ */
diff --git a/include/linux/qed/qed_fcoe_if.h b/include/linux/qed/qed_fcoe_if.h
new file mode 100644
index 000000000000..bd6bcb809415
--- /dev/null
+++ b/include/linux/qed/qed_fcoe_if.h
@@ -0,0 +1,145 @@
+#ifndef _QED_FCOE_IF_H
+#define _QED_FCOE_IF_H
+#include <linux/types.h>
+#include <linux/qed/qed_if.h>
+struct qed_fcoe_stats {
+	u64 fcoe_rx_byte_cnt;
+	u64 fcoe_rx_data_pkt_cnt;
+	u64 fcoe_rx_xfer_pkt_cnt;
+	u64 fcoe_rx_other_pkt_cnt;
+	u32 fcoe_silent_drop_pkt_cmdq_full_cnt;
+	u32 fcoe_silent_drop_pkt_rq_full_cnt;
+	u32 fcoe_silent_drop_pkt_crc_error_cnt;
+	u32 fcoe_silent_drop_pkt_task_invalid_cnt;
+	u32 fcoe_silent_drop_total_pkt_cnt;
+
+	u64 fcoe_tx_byte_cnt;
+	u64 fcoe_tx_data_pkt_cnt;
+	u64 fcoe_tx_xfer_pkt_cnt;
+	u64 fcoe_tx_other_pkt_cnt;
+};
+
+struct qed_dev_fcoe_info {
+	struct qed_dev_info common;
+
+	void __iomem *primary_dbq_rq_addr;
+	void __iomem *secondary_bdq_rq_addr;
+};
+
+struct qed_fcoe_params_offload {
+	dma_addr_t sq_pbl_addr;
+	dma_addr_t sq_curr_page_addr;
+	dma_addr_t sq_next_page_addr;
+
+	u8 src_mac[ETH_ALEN];
+	u8 dst_mac[ETH_ALEN];
+
+	u16 tx_max_fc_pay_len;
+	u16 e_d_tov_timer_val;
+	u16 rec_tov_timer_val;
+	u16 rx_max_fc_pay_len;
+	u16 vlan_tag;
+
+	struct fc_addr_nw s_id;
+	u8 max_conc_seqs_c3;
+	struct fc_addr_nw d_id;
+	u8 flags;
+	u8 def_q_idx;
+};
+
+#define MAX_TID_BLOCKS_FCOE (512)
+struct qed_fcoe_tid {
+	u32 size;		/* In bytes per task */
+	u32 num_tids_per_block;
+	u8 *blocks[MAX_TID_BLOCKS_FCOE];
+};
+
+struct qed_fcoe_cb_ops {
+	struct qed_common_cb_ops common;
+	 u32 (*get_login_failures)(void *cookie);
+};
+
+void qed_fcoe_set_pf_params(struct qed_dev *cdev,
+			    struct qed_fcoe_pf_params *params);
+
+/**
+ * struct qed_fcoe_ops - qed FCoE operations.
+ * @common:		common operations pointer
+ * @fill_dev_info:	fills FCoE specific information
+ *			@param cdev
+ *			@param info
+ *			@return 0 on sucesss, otherwise error value.
+ * @register_ops:	register FCoE operations
+ *			@param cdev
+ *			@param ops - specified using qed_iscsi_cb_ops
+ *			@param cookie - driver private
+ * @ll2:		light L2 operations pointer
+ * @start:		fcoe in FW
+ *			@param cdev
+ *			@param tasks - qed will fill information about tasks
+ *			return 0 on success, otherwise error value.
+ * @stop:		stops fcoe in FW
+ *			@param cdev
+ *			return 0 on success, otherwise error value.
+ * @acquire_conn:	acquire a new fcoe connection
+ *			@param cdev
+ *			@param handle - qed will fill handle that should be
+ *				used henceforth as identifier of the
+ *				connection.
+ *			@param p_doorbell - qed will fill the address of the
+ *				doorbell.
+ *			return 0 on sucesss, otherwise error value.
+ * @release_conn:	release a previously acquired fcoe connection
+ *			@param cdev
+ *			@param handle - the connection handle.
+ *			return 0 on success, otherwise error value.
+ * @offload_conn:	configures an offloaded connection
+ *			@param cdev
+ *			@param handle - the connection handle.
+ *			@param conn_info - the configuration to use for the
+ *				offload.
+ *			return 0 on success, otherwise error value.
+ * @destroy_conn:	stops an offloaded connection
+ *			@param cdev
+ *			@param handle - the connection handle.
+ *			@param terminate_params
+ *			return 0 on success, otherwise error value.
+ * @get_stats:		gets FCoE related statistics
+ *			@param cdev
+ *			@param stats - pointer to struck that would be filled
+ *				we stats
+ *			return 0 on success, error otherwise.
+ */
+struct qed_fcoe_ops {
+	const struct qed_common_ops *common;
+
+	int (*fill_dev_info)(struct qed_dev *cdev,
+			     struct qed_dev_fcoe_info *info);
+
+	void (*register_ops)(struct qed_dev *cdev,
+			     struct qed_fcoe_cb_ops *ops, void *cookie);
+
+	const struct qed_ll2_ops *ll2;
+
+	int (*start)(struct qed_dev *cdev, struct qed_fcoe_tid *tasks);
+
+	int (*stop)(struct qed_dev *cdev);
+
+	int (*acquire_conn)(struct qed_dev *cdev,
+			    u32 *handle,
+			    u32 *fw_cid, void __iomem **p_doorbell);
+
+	int (*release_conn)(struct qed_dev *cdev, u32 handle);
+
+	int (*offload_conn)(struct qed_dev *cdev,
+			    u32 handle,
+			    struct qed_fcoe_params_offload *conn_info);
+	int (*destroy_conn)(struct qed_dev *cdev,
+			    u32 handle, dma_addr_t terminate_params);
+
+	int (*get_stats)(struct qed_dev *cdev, struct qed_fcoe_stats *stats);
+};
+
+const struct qed_fcoe_ops *qed_get_fcoe_ops(void);
+void qed_put_fcoe_ops(void);
+#endif
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index d1576a2bcfc9..fde56c436f71 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -59,7 +59,6 @@ enum dcbx_protocol_type {
 
 #define QED_ROCE_PROTOCOL_INDEX (3)
 
-#ifdef CONFIG_DCB
 #define QED_LLDP_CHASSIS_ID_STAT_LEN 4
 #define QED_LLDP_PORT_ID_STAT_LEN 4
 #define QED_DCBX_MAX_APP_PROTOCOL 32
@@ -155,7 +154,6 @@ struct qed_dcbx_get {
 	struct qed_dcbx_remote_params remote;
 	struct qed_dcbx_admin_params local;
 };
-#endif
 
 enum qed_led_mode {
 	QED_LED_MODE_OFF,
@@ -182,6 +180,38 @@ struct qed_eth_pf_params {
 	u16 num_cons;
 };
 
+struct qed_fcoe_pf_params {
+	/* The following parameters are used during protocol-init */
+	u64 glbl_q_params_addr;
+	u64 bdq_pbl_base_addr[2];
+
+	/* The following parameters are used during HW-init
+	 * and these parameters need to be passed as arguments
+	 * to update_pf_params routine invoked before slowpath start
+	 */
+	u16 num_cons;
+	u16 num_tasks;
+
+	/* The following parameters are used during protocol-init */
+	u16 sq_num_pbl_pages;
+
+	u16 cq_num_entries;
+	u16 cmdq_num_entries;
+	u16 rq_buffer_log_size;
+	u16 mtu;
+	u16 dummy_icid;
+	u16 bdq_xoff_threshold[2];
+	u16 bdq_xon_threshold[2];
+	u16 rq_buffer_size;
+	u8 num_cqs;		/* num of global CQs */
+	u8 log_page_size;
+	u8 gl_rq_pi;
+	u8 gl_cmd_pi;
+	u8 debug_mode;
+	u8 is_target;
+	u8 bdq_pbl_num_entries[2];
+};
+
 /* Most of the the parameters below are described in the FW iSCSI / TCP HSI */
 struct qed_iscsi_pf_params {
 	u64 glbl_q_params_addr;
@@ -245,6 +275,7 @@ struct qed_rdma_pf_params {
 
 struct qed_pf_params {
 	struct qed_eth_pf_params eth_pf_params;
+	struct qed_fcoe_pf_params fcoe_pf_params;
 	struct qed_iscsi_pf_params iscsi_pf_params;
 	struct qed_rdma_pf_params rdma_pf_params;
 };
@@ -305,6 +336,7 @@ enum qed_sb_type {
 enum qed_protocol {
 	QED_PROTOCOL_ETH,
 	QED_PROTOCOL_ISCSI,
+	QED_PROTOCOL_FCOE,
 };
 
 enum qed_link_mode_bits {
@@ -391,6 +423,7 @@ struct qed_int_info {
 struct qed_common_cb_ops {
 	void	(*link_update)(void			*dev,
 			       struct qed_link_output	*link);
+	void	(*dcbx_aen)(void *dev, struct qed_dcbx_get *get, u32 mib_type);
 };
 
 struct qed_selftest_ops {
@@ -494,6 +527,10 @@ struct qed_common_ops {
 
 	void		(*simd_handler_clean)(struct qed_dev *cdev,
 					      int index);
+	int (*dbg_grc)(struct qed_dev *cdev,
+		       void *buffer, u32 *num_dumped_bytes);
+
+	int (*dbg_grc_size)(struct qed_dev *cdev);
 
 	int (*dbg_all_data) (struct qed_dev *cdev, void *buffer);
 
-- 
cgit v1.2.3


From bd4b9f8b4af7be15fd162276ec9a2a1d49f10270 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 17 Feb 2017 12:45:37 +0800
Subject: sctp: add support for generating stream reconf resp chunk

This patch is to define Re-configuration Response Parameter described
in rfc6525 section 4.4. As optional fields are only for SSN/TSN Reset
Request Parameter, it uses another function to make that.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h     | 24 ++++++++++++++++
 include/net/sctp/sm.h    |  7 +++++
 net/sctp/sm_make_chunk.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 105 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index b055788de0cf..7a4804c4a593 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -749,4 +749,28 @@ struct sctp_strreset_addstrm {
 	__u16 reserved;
 };
 
+enum {
+	SCTP_STRRESET_NOTHING_TO_DO	= 0x00,
+	SCTP_STRRESET_PERFORMED		= 0x01,
+	SCTP_STRRESET_DENIED		= 0x02,
+	SCTP_STRRESET_ERR_WRONG_SSN	= 0x03,
+	SCTP_STRRESET_ERR_IN_PROGRESS	= 0x04,
+	SCTP_STRRESET_ERR_BAD_SEQNO	= 0x05,
+	SCTP_STRRESET_IN_PROGRESS	= 0x06,
+};
+
+struct sctp_strreset_resp {
+	sctp_paramhdr_t param_hdr;
+	__u32 response_seq;
+	__u32 result;
+};
+
+struct sctp_strreset_resptsn {
+	sctp_paramhdr_t param_hdr;
+	__u32 response_seq;
+	__u32 result;
+	__u32 senders_next_tsn;
+	__u32 receivers_next_tsn;
+};
+
 #endif /* __LINUX_SCTP_H__ */
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 3675fde3a26e..8a85dd7fc1ec 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -270,6 +270,13 @@ struct sctp_chunk *sctp_make_strreset_tsnreq(
 struct sctp_chunk *sctp_make_strreset_addstrm(
 				const struct sctp_association *asoc,
 				__u16 out, __u16 in);
+struct sctp_chunk *sctp_make_strreset_resp(
+				const struct sctp_association *asoc,
+				__u32 result, __u32 sn);
+struct sctp_chunk *sctp_make_strreset_tsnresp(
+				struct sctp_association *asoc,
+				__u32 result, __u32 sn,
+				__u32 sender_tsn, __u32 receiver_tsn);
 void sctp_chunk_assign_tsn(struct sctp_chunk *);
 void sctp_chunk_assign_ssn(struct sctp_chunk *);
 
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 7f8dbf2c6cee..9680c580759b 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -3733,3 +3733,77 @@ struct sctp_chunk *sctp_make_strreset_addstrm(
 
 	return retval;
 }
+
+/* RE-CONFIG 4.4 (RESP)
+ *   0                   1                   2                   3
+ *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |     Parameter Type = 16       |      Parameter Length         |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |         Re-configuration Response Sequence Number             |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |                            Result                             |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+struct sctp_chunk *sctp_make_strreset_resp(
+				const struct sctp_association *asoc,
+				__u32 result, __u32 sn)
+{
+	struct sctp_strreset_resp resp;
+	__u16 length = sizeof(resp);
+	struct sctp_chunk *retval;
+
+	retval = sctp_make_reconf(asoc, length);
+	if (!retval)
+		return NULL;
+
+	resp.param_hdr.type = SCTP_PARAM_RESET_RESPONSE;
+	resp.param_hdr.length = htons(length);
+	resp.response_seq = htonl(sn);
+	resp.result = htonl(result);
+
+	sctp_addto_chunk(retval, sizeof(resp), &resp);
+
+	return retval;
+}
+
+/* RE-CONFIG 4.4 OPTIONAL (TSNRESP)
+ *   0                   1                   2                   3
+ *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |     Parameter Type = 16       |      Parameter Length         |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |         Re-configuration Response Sequence Number             |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |                            Result                             |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |                   Sender's Next TSN (optional)                |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |                  Receiver's Next TSN (optional)               |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+struct sctp_chunk *sctp_make_strreset_tsnresp(
+					struct sctp_association *asoc,
+					__u32 result, __u32 sn,
+					__u32 sender_tsn, __u32 receiver_tsn)
+{
+	struct sctp_strreset_resptsn tsnresp;
+	__u16 length = sizeof(tsnresp);
+	struct sctp_chunk *retval;
+
+	retval = sctp_make_reconf(asoc, length);
+	if (!retval)
+		return NULL;
+
+	tsnresp.param_hdr.type = SCTP_PARAM_RESET_RESPONSE;
+	tsnresp.param_hdr.length = htons(length);
+
+	tsnresp.response_seq = htonl(sn);
+	tsnresp.result = htonl(result);
+	tsnresp.senders_next_tsn = htonl(sender_tsn);
+	tsnresp.receivers_next_tsn = htonl(receiver_tsn);
+
+	sctp_addto_chunk(retval, sizeof(tsnresp), &tsnresp);
+
+	return retval;
+}
-- 
cgit v1.2.3


From d24cdcd3e40a6825135498e11c20c7976b9bf545 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 16 Jan 2017 12:06:09 +0100
Subject: libceph: use BUG() instead of BUG_ON(1)

I ran into this compile warning, which is the result of BUG_ON(1)
not always leading to the compiler treating the code path as
unreachable:

    include/linux/ceph/osdmap.h: In function 'ceph_can_shift_osds':
    include/linux/ceph/osdmap.h:62:1: error: control reaches end of non-void function [-Werror=return-type]

Using BUG() here avoids the warning.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osdmap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 9a9041784dcf..412906609954 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -57,7 +57,7 @@ static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
 	case CEPH_POOL_TYPE_EC:
 		return false;
 	default:
-		BUG_ON(1);
+		BUG();
 	}
 }
 
-- 
cgit v1.2.3


From 66a0e2d579dbec5c676cfe446234ffebb267c564 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Tue, 31 Jan 2017 15:55:06 +0100
Subject: crush: remove mutable part of CRUSH map

Then add it to the working state. It would be very nice if we didn't
have to take a lock to calculate a crush placement. By moving the
permutation array into the working data, we can treat the CRUSH map as
immutable.

Reflects ceph.git commit cbcd039651c0569551cb90d26ce27e1432671f2a.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osdmap.h  |   1 +
 include/linux/crush/crush.h  |  41 +++++++--
 include/linux/crush/mapper.h |   4 +-
 net/ceph/crush/crush.c       |   5 --
 net/ceph/crush/mapper.c      | 206 +++++++++++++++++++++++++++++--------------
 net/ceph/osdmap.c            |  47 ++++++++--
 6 files changed, 218 insertions(+), 86 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 412906609954..cef1cab789b9 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -175,6 +175,7 @@ struct ceph_osdmap {
 
 	struct mutex crush_scratch_mutex;
 	int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
+	void *crush_workspace;
 };
 
 static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index be8f12b8f195..fbecbd089d75 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -135,13 +135,6 @@ struct crush_bucket {
 	__u32 size;      /* num items */
 	__s32 *items;
 
-	/*
-	 * cached random permutation: used for uniform bucket and for
-	 * the linear search fallback for the other bucket types.
-	 */
-	__u32 perm_x;  /* @x for which *perm is defined */
-	__u32 perm_n;  /* num elements of *perm that are permuted/defined */
-	__u32 *perm;
 };
 
 struct crush_bucket_uniform {
@@ -211,6 +204,21 @@ struct crush_map {
 	 * device fails. */
 	__u8 chooseleaf_stable;
 
+	/*
+	 * This value is calculated after decode or construction by
+	 * the builder. It is exposed here (rather than having a
+	 * 'build CRUSH working space' function) so that callers can
+	 * reserve a static buffer, allocate space on the stack, or
+	 * otherwise avoid calling into the heap allocator if they
+	 * want to. The size of the working space depends on the map,
+	 * while the size of the scratch vector passed to the mapper
+	 * depends on the size of the desired result set.
+	 *
+	 * Nothing stops the caller from allocating both in one swell
+	 * foop and passing in two points, though.
+	 */
+	size_t working_size;
+
 #ifndef __KERNEL__
 	/*
 	 * version 0 (original) of straw_calc has various flaws.  version 1
@@ -248,4 +256,23 @@ static inline int crush_calc_tree_node(int i)
 	return ((i+1) << 1)-1;
 }
 
+/*
+ * These data structures are private to the CRUSH implementation. They
+ * are exposed in this header file because builder needs their
+ * definitions to calculate the total working size.
+ *
+ * Moving this out of the crush map allow us to treat the CRUSH map as
+ * immutable within the mapper and removes the requirement for a CRUSH
+ * map lock.
+ */
+struct crush_work_bucket {
+	__u32 perm_x; /* @x for which *perm is defined */
+	__u32 perm_n; /* num elements of *perm that are permuted/defined */
+	__u32 *perm;  /* Permutation of the bucket's items */
+};
+
+struct crush_work {
+	struct crush_work_bucket **work; /* Per-bucket working store */
+};
+
 #endif
diff --git a/include/linux/crush/mapper.h b/include/linux/crush/mapper.h
index 5dfd5b1125d2..3303c7fd8a31 100644
--- a/include/linux/crush/mapper.h
+++ b/include/linux/crush/mapper.h
@@ -15,6 +15,8 @@ extern int crush_do_rule(const struct crush_map *map,
 			 int ruleno,
 			 int x, int *result, int result_max,
 			 const __u32 *weights, int weight_max,
-			 int *scratch);
+			 void *cwin, int *scratch);
+
+void crush_init_workspace(const struct crush_map *map, void *v);
 
 #endif
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
index 80d7c3a97cb8..5bf94c04f645 100644
--- a/net/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -45,7 +45,6 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p)
 
 void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
 {
-	kfree(b->h.perm);
 	kfree(b->h.items);
 	kfree(b);
 }
@@ -54,14 +53,12 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b)
 {
 	kfree(b->item_weights);
 	kfree(b->sum_weights);
-	kfree(b->h.perm);
 	kfree(b->h.items);
 	kfree(b);
 }
 
 void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
 {
-	kfree(b->h.perm);
 	kfree(b->h.items);
 	kfree(b->node_weights);
 	kfree(b);
@@ -71,7 +68,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
 {
 	kfree(b->straws);
 	kfree(b->item_weights);
-	kfree(b->h.perm);
 	kfree(b->h.items);
 	kfree(b);
 }
@@ -79,7 +75,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
 void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b)
 {
 	kfree(b->item_weights);
-	kfree(b->h.perm);
 	kfree(b->h.items);
 	kfree(b);
 }
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 130ab407c5ec..9e75be5ec716 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -54,7 +54,6 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size
 	return -1;
 }
 
-
 /*
  * bucket choose methods
  *
@@ -72,59 +71,60 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size
  * Since this is expensive, we optimize for the r=0 case, which
  * captures the vast majority of calls.
  */
-static int bucket_perm_choose(struct crush_bucket *bucket,
+static int bucket_perm_choose(const struct crush_bucket *bucket,
+			      struct crush_work_bucket *work,
 			      int x, int r)
 {
 	unsigned int pr = r % bucket->size;
 	unsigned int i, s;
 
 	/* start a new permutation if @x has changed */
-	if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) {
+	if (work->perm_x != (__u32)x || work->perm_n == 0) {
 		dprintk("bucket %d new x=%d\n", bucket->id, x);
-		bucket->perm_x = x;
+		work->perm_x = x;
 
 		/* optimize common r=0 case */
 		if (pr == 0) {
 			s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
 				bucket->size;
-			bucket->perm[0] = s;
-			bucket->perm_n = 0xffff;   /* magic value, see below */
+			work->perm[0] = s;
+			work->perm_n = 0xffff;   /* magic value, see below */
 			goto out;
 		}
 
 		for (i = 0; i < bucket->size; i++)
-			bucket->perm[i] = i;
-		bucket->perm_n = 0;
-	} else if (bucket->perm_n == 0xffff) {
+			work->perm[i] = i;
+		work->perm_n = 0;
+	} else if (work->perm_n == 0xffff) {
 		/* clean up after the r=0 case above */
 		for (i = 1; i < bucket->size; i++)
-			bucket->perm[i] = i;
-		bucket->perm[bucket->perm[0]] = 0;
-		bucket->perm_n = 1;
+			work->perm[i] = i;
+		work->perm[work->perm[0]] = 0;
+		work->perm_n = 1;
 	}
 
 	/* calculate permutation up to pr */
-	for (i = 0; i < bucket->perm_n; i++)
+	for (i = 0; i < work->perm_n; i++)
 		dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
-	while (bucket->perm_n <= pr) {
-		unsigned int p = bucket->perm_n;
+	while (work->perm_n <= pr) {
+		unsigned int p = work->perm_n;
 		/* no point in swapping the final entry */
 		if (p < bucket->size - 1) {
 			i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
 				(bucket->size - p);
 			if (i) {
-				unsigned int t = bucket->perm[p + i];
-				bucket->perm[p + i] = bucket->perm[p];
-				bucket->perm[p] = t;
+				unsigned int t = work->perm[p + i];
+				work->perm[p + i] = work->perm[p];
+				work->perm[p] = t;
 			}
 			dprintk(" perm_choose swap %d with %d\n", p, p+i);
 		}
-		bucket->perm_n++;
+		work->perm_n++;
 	}
 	for (i = 0; i < bucket->size; i++)
 		dprintk(" perm_choose  %d: %d\n", i, bucket->perm[i]);
 
-	s = bucket->perm[pr];
+	s = work->perm[pr];
 out:
 	dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
 		bucket->size, x, r, pr, s);
@@ -132,14 +132,14 @@ out:
 }
 
 /* uniform */
-static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
-				 int x, int r)
+static int bucket_uniform_choose(const struct crush_bucket_uniform *bucket,
+				 struct crush_work_bucket *work, int x, int r)
 {
-	return bucket_perm_choose(&bucket->h, x, r);
+	return bucket_perm_choose(&bucket->h, work, x, r);
 }
 
 /* list */
-static int bucket_list_choose(struct crush_bucket_list *bucket,
+static int bucket_list_choose(const struct crush_bucket_list *bucket,
 			      int x, int r)
 {
 	int i;
@@ -155,8 +155,9 @@ static int bucket_list_choose(struct crush_bucket_list *bucket,
 		w *= bucket->sum_weights[i];
 		w = w >> 16;
 		/*dprintk(" scaled %llx\n", w);*/
-		if (w < bucket->item_weights[i])
+		if (w < bucket->item_weights[i]) {
 			return bucket->h.items[i];
+		}
 	}
 
 	dprintk("bad list sums for bucket %d\n", bucket->h.id);
@@ -192,7 +193,7 @@ static int terminal(int x)
 	return x & 1;
 }
 
-static int bucket_tree_choose(struct crush_bucket_tree *bucket,
+static int bucket_tree_choose(const struct crush_bucket_tree *bucket,
 			      int x, int r)
 {
 	int n;
@@ -224,7 +225,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket,
 
 /* straw */
 
-static int bucket_straw_choose(struct crush_bucket_straw *bucket,
+static int bucket_straw_choose(const struct crush_bucket_straw *bucket,
 			       int x, int r)
 {
 	__u32 i;
@@ -301,7 +302,7 @@ static __u64 crush_ln(unsigned int xin)
  *
  */
 
-static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
+static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
 				int x, int r)
 {
 	unsigned int i, high = 0;
@@ -344,37 +345,42 @@ static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
 			high_draw = draw;
 		}
 	}
+
 	return bucket->h.items[high];
 }
 
 
-static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
+static int crush_bucket_choose(const struct crush_bucket *in,
+			       struct crush_work_bucket *work,
+			       int x, int r)
 {
 	dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
 	BUG_ON(in->size == 0);
 	switch (in->alg) {
 	case CRUSH_BUCKET_UNIFORM:
-		return bucket_uniform_choose((struct crush_bucket_uniform *)in,
-					  x, r);
+		return bucket_uniform_choose(
+			(const struct crush_bucket_uniform *)in,
+			work, x, r);
 	case CRUSH_BUCKET_LIST:
-		return bucket_list_choose((struct crush_bucket_list *)in,
+		return bucket_list_choose((const struct crush_bucket_list *)in,
 					  x, r);
 	case CRUSH_BUCKET_TREE:
-		return bucket_tree_choose((struct crush_bucket_tree *)in,
+		return bucket_tree_choose((const struct crush_bucket_tree *)in,
 					  x, r);
 	case CRUSH_BUCKET_STRAW:
-		return bucket_straw_choose((struct crush_bucket_straw *)in,
-					   x, r);
+		return bucket_straw_choose(
+			(const struct crush_bucket_straw *)in,
+			x, r);
 	case CRUSH_BUCKET_STRAW2:
-		return bucket_straw2_choose((struct crush_bucket_straw2 *)in,
-					    x, r);
+		return bucket_straw2_choose(
+			(const struct crush_bucket_straw2 *)in,
+			x, r);
 	default:
 		dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
 		return in->items[0];
 	}
 }
 
-
 /*
  * true if device is marked "out" (failed, fully offloaded)
  * of the cluster
@@ -416,7 +422,8 @@ static int is_out(const struct crush_map *map,
  * @parent_r: r value passed from the parent
  */
 static int crush_choose_firstn(const struct crush_map *map,
-			       struct crush_bucket *bucket,
+			       struct crush_work *work,
+			       const struct crush_bucket *bucket,
 			       const __u32 *weight, int weight_max,
 			       int x, int numrep, int type,
 			       int *out, int outpos,
@@ -434,7 +441,7 @@ static int crush_choose_firstn(const struct crush_map *map,
 	int rep;
 	unsigned int ftotal, flocal;
 	int retry_descent, retry_bucket, skip_rep;
-	struct crush_bucket *in = bucket;
+	const struct crush_bucket *in = bucket;
 	int r;
 	int i;
 	int item = 0;
@@ -473,9 +480,13 @@ static int crush_choose_firstn(const struct crush_map *map,
 				if (local_fallback_retries > 0 &&
 				    flocal >= (in->size>>1) &&
 				    flocal > local_fallback_retries)
-					item = bucket_perm_choose(in, x, r);
+					item = bucket_perm_choose(
+						in, work->work[-1-in->id],
+						x, r);
 				else
-					item = crush_bucket_choose(in, x, r);
+					item = crush_bucket_choose(
+						in, work->work[-1-in->id],
+						x, r);
 				if (item >= map->max_devices) {
 					dprintk("   bad item %d\n", item);
 					skip_rep = 1;
@@ -518,19 +529,21 @@ static int crush_choose_firstn(const struct crush_map *map,
 							sub_r = r >> (vary_r-1);
 						else
 							sub_r = 0;
-						if (crush_choose_firstn(map,
-							 map->buckets[-1-item],
-							 weight, weight_max,
-							 x, stable ? 1 : outpos+1, 0,
-							 out2, outpos, count,
-							 recurse_tries, 0,
-							 local_retries,
-							 local_fallback_retries,
-							 0,
-							 vary_r,
-							 stable,
-							 NULL,
-							 sub_r) <= outpos)
+						if (crush_choose_firstn(
+							    map,
+							    work,
+							    map->buckets[-1-item],
+							    weight, weight_max,
+							    x, stable ? 1 : outpos+1, 0,
+							    out2, outpos, count,
+							    recurse_tries, 0,
+							    local_retries,
+							    local_fallback_retries,
+							    0,
+							    vary_r,
+							    stable,
+							    NULL,
+							    sub_r) <= outpos)
 							/* didn't get leaf */
 							reject = 1;
 					} else {
@@ -600,7 +613,8 @@ reject:
  *
  */
 static void crush_choose_indep(const struct crush_map *map,
-			       struct crush_bucket *bucket,
+			       struct crush_work *work,
+			       const struct crush_bucket *bucket,
 			       const __u32 *weight, int weight_max,
 			       int x, int left, int numrep, int type,
 			       int *out, int outpos,
@@ -610,7 +624,7 @@ static void crush_choose_indep(const struct crush_map *map,
 			       int *out2,
 			       int parent_r)
 {
-	struct crush_bucket *in = bucket;
+	const struct crush_bucket *in = bucket;
 	int endpos = outpos + left;
 	int rep;
 	unsigned int ftotal;
@@ -678,7 +692,9 @@ static void crush_choose_indep(const struct crush_map *map,
 					break;
 				}
 
-				item = crush_bucket_choose(in, x, r);
+				item = crush_bucket_choose(
+					in, work->work[-1-in->id],
+					x, r);
 				if (item >= map->max_devices) {
 					dprintk("   bad item %d\n", item);
 					out[rep] = CRUSH_ITEM_NONE;
@@ -724,13 +740,15 @@ static void crush_choose_indep(const struct crush_map *map,
 
 				if (recurse_to_leaf) {
 					if (item < 0) {
-						crush_choose_indep(map,
-						   map->buckets[-1-item],
-						   weight, weight_max,
-						   x, 1, numrep, 0,
-						   out2, rep,
-						   recurse_tries, 0,
-						   0, NULL, r);
+						crush_choose_indep(
+							map,
+							work,
+							map->buckets[-1-item],
+							weight, weight_max,
+							x, 1, numrep, 0,
+							out2, rep,
+							recurse_tries, 0,
+							0, NULL, r);
 						if (out2[rep] == CRUSH_ITEM_NONE) {
 							/* placed nothing; no leaf */
 							break;
@@ -781,6 +799,53 @@ static void crush_choose_indep(const struct crush_map *map,
 #endif
 }
 
+
+/*
+ * This takes a chunk of memory and sets it up to be a shiny new
+ * working area for a CRUSH placement computation. It must be called
+ * on any newly allocated memory before passing it in to
+ * crush_do_rule. It may be used repeatedly after that, so long as the
+ * map has not changed. If the map /has/ changed, you must make sure
+ * the working size is no smaller than what was allocated and re-run
+ * crush_init_workspace.
+ *
+ * If you do retain the working space between calls to crush, make it
+ * thread-local.
+ */
+void crush_init_workspace(const struct crush_map *map, void *v)
+{
+	struct crush_work *w = v;
+	__s32 b;
+
+	/*
+	 * We work by moving through the available space and setting
+	 * values and pointers as we go.
+	 *
+	 * It's a bit like Forth's use of the 'allot' word since we
+	 * set the pointer first and then reserve the space for it to
+	 * point to by incrementing the point.
+	 */
+	v += sizeof(struct crush_work *);
+	w->work = v;
+	v += map->max_buckets * sizeof(struct crush_work_bucket *);
+	for (b = 0; b < map->max_buckets; ++b) {
+		if (!map->buckets[b])
+			continue;
+
+		w->work[b] = v;
+		switch (map->buckets[b]->alg) {
+		default:
+			v += sizeof(struct crush_work_bucket);
+			break;
+		}
+		w->work[b]->perm_x = 0;
+		w->work[b]->perm_n = 0;
+		w->work[b]->perm = v;
+		v += map->buckets[b]->size * sizeof(__u32);
+	}
+	BUG_ON(v - (void *)w != map->working_size);
+}
+
 /**
  * crush_do_rule - calculate a mapping with the given input and rule
  * @map: the crush_map
@@ -790,14 +855,16 @@ static void crush_choose_indep(const struct crush_map *map,
  * @result_max: maximum result size
  * @weight: weight vector (for map leaves)
  * @weight_max: size of weight vector
+ * @cwin: pointer to at least map->working_size bytes of memory
  * @scratch: scratch vector for private use; must be >= 3 * result_max
  */
 int crush_do_rule(const struct crush_map *map,
 		  int ruleno, int x, int *result, int result_max,
 		  const __u32 *weight, int weight_max,
-		  int *scratch)
+		  void *cwin, int *scratch)
 {
 	int result_len;
+	struct crush_work *cw = cwin;
 	int *a = scratch;
 	int *b = scratch + result_max;
 	int *c = scratch + result_max*2;
@@ -807,7 +874,7 @@ int crush_do_rule(const struct crush_map *map,
 	int *o;
 	int osize;
 	int *tmp;
-	struct crush_rule *rule;
+	const struct crush_rule *rule;
 	__u32 step;
 	int i, j;
 	int numrep;
@@ -840,7 +907,7 @@ int crush_do_rule(const struct crush_map *map,
 
 	for (step = 0; step < rule->len; step++) {
 		int firstn = 0;
-		struct crush_rule_step *curstep = &rule->steps[step];
+		const struct crush_rule_step *curstep = &rule->steps[step];
 
 		switch (curstep->op) {
 		case CRUSH_RULE_TAKE:
@@ -936,6 +1003,7 @@ int crush_do_rule(const struct crush_map *map,
 						recurse_tries = choose_tries;
 					osize += crush_choose_firstn(
 						map,
+						cw,
 						map->buckets[bno],
 						weight, weight_max,
 						x, numrep,
@@ -956,6 +1024,7 @@ int crush_do_rule(const struct crush_map *map,
 						    numrep : (result_max-osize));
 					crush_choose_indep(
 						map,
+						cw,
 						map->buckets[bno],
 						weight, weight_max,
 						x, out_size, numrep,
@@ -997,5 +1066,6 @@ int crush_do_rule(const struct crush_map *map,
 			break;
 		}
 	}
+
 	return result_len;
 }
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 47df075b31e5..3892e7fa7747 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -153,6 +153,32 @@ bad:
         return -EINVAL;
 }
 
+static void crush_finalize(struct crush_map *c)
+{
+	__s32 b;
+
+	/* Space for the array of pointers to per-bucket workspace */
+	c->working_size = sizeof(struct crush_work) +
+	    c->max_buckets * sizeof(struct crush_work_bucket *);
+
+	for (b = 0; b < c->max_buckets; b++) {
+		if (!c->buckets[b])
+			continue;
+
+		switch (c->buckets[b]->alg) {
+		default:
+			/*
+			 * The base case, permutation variables and
+			 * the pointer to the permutation array.
+			 */
+			c->working_size += sizeof(struct crush_work_bucket);
+			break;
+		}
+		/* Every bucket has a permutation array. */
+		c->working_size += c->buckets[b]->size * sizeof(__u32);
+	}
+}
+
 static struct crush_map *crush_decode(void *pbyval, void *end)
 {
 	struct crush_map *c;
@@ -246,10 +272,6 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 		b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
 		if (b->items == NULL)
 			goto badmem;
-		b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
-		if (b->perm == NULL)
-			goto badmem;
-		b->perm_n = 0;
 
 		ceph_decode_need(p, end, b->size*sizeof(u32), bad);
 		for (j = 0; j < b->size; j++)
@@ -368,6 +390,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 	dout("crush decode tunable chooseleaf_stable = %d\n",
 	     c->chooseleaf_stable);
 
+	crush_finalize(c);
+
 done:
 	dout("crush_decode success\n");
 	return c;
@@ -753,6 +777,7 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
 	kfree(map->osd_weight);
 	kfree(map->osd_addr);
 	kfree(map->osd_primary_affinity);
+	kfree(map->crush_workspace);
 	kfree(map);
 }
 
@@ -810,12 +835,23 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
 
 static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
 {
+	void *workspace;
+
 	if (IS_ERR(crush))
 		return PTR_ERR(crush);
 
+	workspace = kmalloc(crush->working_size, GFP_NOIO);
+	if (!workspace) {
+		crush_destroy(crush);
+		return -ENOMEM;
+	}
+	crush_init_workspace(crush, workspace);
+
 	if (map->crush)
 		crush_destroy(map->crush);
+	kfree(map->crush_workspace);
 	map->crush = crush;
+	map->crush_workspace = workspace;
 	return 0;
 }
 
@@ -1940,7 +1976,8 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
 
 	mutex_lock(&map->crush_scratch_mutex);
 	r = crush_do_rule(map->crush, ruleno, x, result, result_max,
-			  weight, weight_max, map->crush_scratch_ary);
+			  weight, weight_max, map->crush_workspace,
+			  map->crush_scratch_ary);
 	mutex_unlock(&map->crush_scratch_mutex);
 
 	return r;
-- 
cgit v1.2.3


From 743efcffffc6620ab44ea9ec67c7e4e28dfa7742 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Tue, 31 Jan 2017 15:55:06 +0100
Subject: crush: merge working data and scratch

Much like Arlo Guthrie, I decided that one big pile is better than two
little piles.

Reflects ceph.git commit 95c2df6c7e0b22d2ea9d91db500cf8b9441c73ba.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osdmap.h  |  3 +--
 include/linux/crush/mapper.h | 14 +++++++++++++-
 net/ceph/crush/mapper.c      | 17 +++++++----------
 net/ceph/osdmap.c            | 14 ++++++++------
 4 files changed, 29 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index cef1cab789b9..8cebdc4158c3 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -173,8 +173,7 @@ struct ceph_osdmap {
 	 * the list of osds that store+replicate them. */
 	struct crush_map *crush;
 
-	struct mutex crush_scratch_mutex;
-	int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
+	struct mutex crush_workspace_mutex;
 	void *crush_workspace;
 };
 
diff --git a/include/linux/crush/mapper.h b/include/linux/crush/mapper.h
index 3303c7fd8a31..c95e19e1ff11 100644
--- a/include/linux/crush/mapper.h
+++ b/include/linux/crush/mapper.h
@@ -15,7 +15,19 @@ extern int crush_do_rule(const struct crush_map *map,
 			 int ruleno,
 			 int x, int *result, int result_max,
 			 const __u32 *weights, int weight_max,
-			 void *cwin, int *scratch);
+			 void *cwin);
+
+/*
+ * Returns the exact amount of workspace that will need to be used
+ * for a given combination of crush_map and result_max. The caller can
+ * then allocate this much on its own, either on the stack, in a
+ * per-thread long-lived buffer, or however it likes.
+ */
+static inline size_t crush_work_size(const struct crush_map *map,
+				     int result_max)
+{
+	return map->working_size + result_max * 3 * sizeof(__u32);
+}
 
 void crush_init_workspace(const struct crush_map *map, void *v);
 
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 9e75be5ec716..2e31217ccae3 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -855,23 +855,22 @@ void crush_init_workspace(const struct crush_map *map, void *v)
  * @result_max: maximum result size
  * @weight: weight vector (for map leaves)
  * @weight_max: size of weight vector
- * @cwin: pointer to at least map->working_size bytes of memory
- * @scratch: scratch vector for private use; must be >= 3 * result_max
+ * @cwin: pointer to at least crush_work_size() bytes of memory
  */
 int crush_do_rule(const struct crush_map *map,
 		  int ruleno, int x, int *result, int result_max,
 		  const __u32 *weight, int weight_max,
-		  void *cwin, int *scratch)
+		  void *cwin)
 {
 	int result_len;
 	struct crush_work *cw = cwin;
-	int *a = scratch;
-	int *b = scratch + result_max;
-	int *c = scratch + result_max*2;
+	int *a = cwin + map->working_size;
+	int *b = a + result_max;
+	int *c = b + result_max;
+	int *w = a;
+	int *o = b;
 	int recurse_to_leaf;
-	int *w;
 	int wsize = 0;
-	int *o;
 	int osize;
 	int *tmp;
 	const struct crush_rule *rule;
@@ -902,8 +901,6 @@ int crush_do_rule(const struct crush_map *map,
 
 	rule = map->rules[ruleno];
 	result_len = 0;
-	w = a;
-	o = b;
 
 	for (step = 0; step < rule->len; step++) {
 		int firstn = 0;
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 3892e7fa7747..2374956c4d40 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -743,7 +743,7 @@ struct ceph_osdmap *ceph_osdmap_alloc(void)
 	map->pool_max = -1;
 	map->pg_temp = RB_ROOT;
 	map->primary_temp = RB_ROOT;
-	mutex_init(&map->crush_scratch_mutex);
+	mutex_init(&map->crush_workspace_mutex);
 
 	return map;
 }
@@ -836,11 +836,14 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
 static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
 {
 	void *workspace;
+	size_t work_size;
 
 	if (IS_ERR(crush))
 		return PTR_ERR(crush);
 
-	workspace = kmalloc(crush->working_size, GFP_NOIO);
+	work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE);
+	dout("%s work_size %zu bytes\n", __func__, work_size);
+	workspace = kmalloc(work_size, GFP_NOIO);
 	if (!workspace) {
 		crush_destroy(crush);
 		return -ENOMEM;
@@ -1974,11 +1977,10 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
 
 	BUG_ON(result_max > CEPH_PG_MAX_SIZE);
 
-	mutex_lock(&map->crush_scratch_mutex);
+	mutex_lock(&map->crush_workspace_mutex);
 	r = crush_do_rule(map->crush, ruleno, x, result, result_max,
-			  weight, weight_max, map->crush_workspace,
-			  map->crush_scratch_ary);
-	mutex_unlock(&map->crush_scratch_mutex);
+			  weight, weight_max, map->crush_workspace);
+	mutex_unlock(&map->crush_workspace_mutex);
 
 	return r;
 }
-- 
cgit v1.2.3


From 083a51fbc57ca848ab087692f3cc97898fd88b54 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 9 Feb 2017 16:14:52 +0100
Subject: libceph: bump CEPH_PG_MAX_SIZE to 32

... to accommodate potentially very wide EC pools.  This increases the
size of a typical rbd ceph_osd_request by ~12% (from 1040 to 1168 bytes),
but I'd rather go future proof here.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Sage Weil <sage@redhat.com>
---
 include/linux/ceph/rados.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 5c0da61cb763..5d0018782d50 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -50,7 +50,7 @@ struct ceph_timespec {
 #define CEPH_PG_LAYOUT_LINEAR 2
 #define CEPH_PG_LAYOUT_HYBRID 3
 
-#define CEPH_PG_MAX_SIZE      16  /* max # osds in a single pg */
+#define CEPH_PG_MAX_SIZE      32  /* max # osds in a single pg */
 
 /*
  * placement group.
-- 
cgit v1.2.3


From 6c696d8560e74cd42458931375875d62ae88c6ae Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 25 Jan 2017 18:16:23 +0100
Subject: rbd: kill obj_request->object_name and rbd_segment_name_cache

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Jason Dillaman <dillaman@redhat.com>
---
 drivers/block/rbd.c         | 79 ++++-----------------------------------------
 include/linux/ceph/osdmap.h |  7 ----
 2 files changed, 7 insertions(+), 79 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 6c094b580eae..b6e269b4d534 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -231,7 +231,6 @@ enum obj_req_flags {
 };
 
 struct rbd_obj_request {
-	const char		*object_name;
 	u64			object_no;
 	u64			offset;		/* object start byte */
 	u64			length;		/* bytes from offset */
@@ -440,7 +439,6 @@ static DEFINE_SPINLOCK(rbd_client_list_lock);
 
 static struct kmem_cache	*rbd_img_request_cache;
 static struct kmem_cache	*rbd_obj_request_cache;
-static struct kmem_cache	*rbd_segment_name_cache;
 
 static int rbd_major;
 static DEFINE_IDA(rbd_dev_id_ida);
@@ -1249,37 +1247,6 @@ static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
 	rbd_dev->mapping.features = 0;
 }
 
-static void rbd_segment_name_free(const char *name)
-{
-	/* The explicit cast here is needed to drop the const qualifier */
-
-	kmem_cache_free(rbd_segment_name_cache, (void *)name);
-}
-
-static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
-{
-	const char *name_format = rbd_dev->image_format == 1 ?
-				      RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
-	char *name;
-	u64 segment;
-	int ret;
-
-	name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
-	if (!name)
-		return NULL;
-	segment = offset >> rbd_dev->header.obj_order;
-	ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
-			rbd_dev->header.object_prefix, segment);
-	if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
-		pr_err("error formatting segment name for #%llu (%d)\n",
-			segment, ret);
-		rbd_segment_name_free(name);
-		name = NULL;
-	}
-
-	return name;
-}
-
 static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
 {
 	u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
@@ -2050,29 +2017,17 @@ static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
 	ceph_osdc_put_request(osd_req);
 }
 
-/* object_name is assumed to be a non-null pointer and NUL-terminated */
-
-static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
-						enum obj_request_type type)
+static struct rbd_obj_request *
+rbd_obj_request_create(enum obj_request_type type)
 {
 	struct rbd_obj_request *obj_request;
-	size_t size;
-	char *name;
 
 	rbd_assert(obj_request_type_valid(type));
 
-	size = strlen(object_name) + 1;
-	name = kmalloc(size, GFP_NOIO);
-	if (!name)
-		return NULL;
-
 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
-	if (!obj_request) {
-		kfree(name);
+	if (!obj_request)
 		return NULL;
-	}
 
-	obj_request->object_name = memcpy(name, object_name, size);
 	obj_request->which = BAD_WHICH;
 	obj_request->type = type;
 	INIT_LIST_HEAD(&obj_request->links);
@@ -2114,8 +2069,6 @@ static void rbd_obj_request_destroy(struct kref *kref)
 		break;
 	}
 
-	kfree(obj_request->object_name);
-	obj_request->object_name = NULL;
 	kmem_cache_free(rbd_obj_request_cache, obj_request);
 }
 
@@ -2490,17 +2443,11 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
 
 	while (resid) {
 		struct ceph_osd_request *osd_req;
-		const char *object_name;
 		u64 object_no = img_offset >> rbd_dev->header.obj_order;
 		u64 offset = rbd_segment_offset(rbd_dev, img_offset);
 		u64 length = rbd_segment_length(rbd_dev, img_offset, resid);
 
-		object_name = rbd_segment_name(rbd_dev, img_offset);
-		if (!object_name)
-			goto out_unwind;
-		obj_request = rbd_obj_request_create(object_name, type);
-		/* object request has its own copy of the object name */
-		rbd_segment_name_free(object_name);
+		obj_request = rbd_obj_request_create(type);
 		if (!obj_request)
 			goto out_unwind;
 
@@ -2846,8 +2793,7 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
 	size_t size;
 	int ret;
 
-	stat_request = rbd_obj_request_create(obj_request->object_name,
-					      OBJ_REQUEST_PAGES);
+	stat_request = rbd_obj_request_create(OBJ_REQUEST_PAGES);
 	if (!stat_request)
 		return -ENOMEM;
 
@@ -6389,27 +6335,16 @@ static int rbd_slab_init(void)
 	if (!rbd_obj_request_cache)
 		goto out_err;
 
-	rbd_assert(!rbd_segment_name_cache);
-	rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
-					CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
-	if (rbd_segment_name_cache)
-		return 0;
-out_err:
-	kmem_cache_destroy(rbd_obj_request_cache);
-	rbd_obj_request_cache = NULL;
+	return 0;
 
+out_err:
 	kmem_cache_destroy(rbd_img_request_cache);
 	rbd_img_request_cache = NULL;
-
 	return -ENOMEM;
 }
 
 static void rbd_slab_exit(void)
 {
-	rbd_assert(rbd_segment_name_cache);
-	kmem_cache_destroy(rbd_segment_name_cache);
-	rbd_segment_name_cache = NULL;
-
 	rbd_assert(rbd_obj_request_cache);
 	kmem_cache_destroy(rbd_obj_request_cache);
 	rbd_obj_request_cache = NULL;
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 8cebdc4158c3..938656f70807 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -81,13 +81,6 @@ void ceph_oloc_copy(struct ceph_object_locator *dest,
 		    const struct ceph_object_locator *src);
 void ceph_oloc_destroy(struct ceph_object_locator *oloc);
 
-/*
- * Maximum supported by kernel client object name length
- *
- * (probably outdated: must be >= RBD_MAX_MD_NAME_LEN -- currently 100)
- */
-#define CEPH_MAX_OID_NAME_LEN 100
-
 /*
  * 51-char inline_name is long enough for all cephfs and all but one
  * rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be
-- 
cgit v1.2.3


From 42f82367df2cb2b71ceead6164e4415851c51fa4 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Mon, 20 Feb 2017 13:51:07 +0100
Subject: video: fbdev: fsl-diu-fb: fix spelling mistake "palette"

trivial fix to spelling mistakes of "palette"

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: Timur Tabi <timur@tabi.org>
Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
---
 drivers/video/fbdev/fsl-diu-fb.c | 4 ++--
 include/linux/fsl-diu-fb.h       | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/fbdev/fsl-diu-fb.c b/drivers/video/fbdev/fsl-diu-fb.c
index c48a59e35fae..ca3d6b366471 100644
--- a/drivers/video/fbdev/fsl-diu-fb.c
+++ b/drivers/video/fbdev/fsl-diu-fb.c
@@ -439,12 +439,12 @@ static struct mfb_info mfb_template[] = {
 static void __attribute__ ((unused)) fsl_diu_dump(struct diu __iomem *hw)
 {
 	mb();
-	pr_debug("DIU: desc=%08x,%08x,%08x, gamma=%08x pallete=%08x "
+	pr_debug("DIU: desc=%08x,%08x,%08x, gamma=%08x palette=%08x "
 		 "cursor=%08x curs_pos=%08x diu_mode=%08x bgnd=%08x "
 		 "disp_size=%08x hsyn_para=%08x vsyn_para=%08x syn_pol=%08x "
 		 "thresholds=%08x int_mask=%08x plut=%08x\n",
 		 hw->desc[0], hw->desc[1], hw->desc[2], hw->gamma,
-		 hw->pallete, hw->cursor, hw->curs_pos, hw->diu_mode,
+		 hw->palette, hw->cursor, hw->curs_pos, hw->diu_mode,
 		 hw->bgnd, hw->disp_size, hw->hsyn_para, hw->vsyn_para,
 		 hw->syn_pol, hw->thresholds, hw->int_mask, hw->plut);
 	rmb();
diff --git a/include/linux/fsl-diu-fb.h b/include/linux/fsl-diu-fb.h
index a1e8277120c7..c46eab5bc893 100644
--- a/include/linux/fsl-diu-fb.h
+++ b/include/linux/fsl-diu-fb.h
@@ -73,7 +73,7 @@ struct diu_ad {
 	/* Word 0(32-bit) in DDR memory */
 /* 	__u16 comp; */
 /* 	__u16 pixel_s:2; */
-/* 	__u16 pallete:1; */
+/* 	__u16 palette:1; */
 /* 	__u16 red_c:2; */
 /* 	__u16 green_c:2; */
 /* 	__u16 blue_c:2; */
@@ -142,7 +142,7 @@ struct diu_ad {
 struct diu {
 	__be32 desc[3];
 	__be32 gamma;
-	__be32 pallete;
+	__be32 palette;
 	__be32 cursor;
 	__be32 curs_pos;
 	__be32 diu_mode;
-- 
cgit v1.2.3


From 25149ef9d25cafc4f4fe9f4461f18f876f397417 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Fri, 17 Feb 2017 16:07:34 -0800
Subject: net: phy: Check phydev->drv

There are number of function calls, originating from user-space,
typically through the Ethernet driver that can make us crash by
dereferencing phydev->drv which will be NULL once we unbind the driver
from the PHY.

There are still functional issues that prevent an unbind then rebind to
work, but these will be addressed separately.

Suggested-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c        | 26 ++++++++++++++++++++++----
 drivers/net/phy/phy_device.c |  6 +++---
 include/linux/phy.h          |  3 +++
 3 files changed, 28 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 7cc1b7dcfe05..d6f7838455dd 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -580,7 +580,7 @@ int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd)
 		return 0;
 
 	case SIOCSHWTSTAMP:
-		if (phydev->drv->hwtstamp)
+		if (phydev->drv && phydev->drv->hwtstamp)
 			return phydev->drv->hwtstamp(phydev, ifr);
 		/* fall through */
 
@@ -603,6 +603,9 @@ int phy_start_aneg(struct phy_device *phydev)
 {
 	int err;
 
+	if (!phydev->drv)
+		return -EIO;
+
 	mutex_lock(&phydev->lock);
 
 	if (AUTONEG_DISABLE == phydev->autoneg)
@@ -975,7 +978,7 @@ void phy_state_machine(struct work_struct *work)
 
 	old_state = phydev->state;
 
-	if (phydev->drv->link_change_notify)
+	if (phydev->drv && phydev->drv->link_change_notify)
 		phydev->drv->link_change_notify(phydev);
 
 	switch (phydev->state) {
@@ -1286,6 +1289,9 @@ EXPORT_SYMBOL(phy_write_mmd_indirect);
  */
 int phy_init_eee(struct phy_device *phydev, bool clk_stop_enable)
 {
+	if (!phydev->drv)
+		return -EIO;
+
 	/* According to 802.3az,the EEE is supported only in full duplex-mode.
 	 * Also EEE feature is active when core is operating with MII, GMII
 	 * or RGMII (all kinds). Internal PHYs are also allowed to proceed and
@@ -1363,6 +1369,9 @@ EXPORT_SYMBOL(phy_init_eee);
  */
 int phy_get_eee_err(struct phy_device *phydev)
 {
+	if (!phydev->drv)
+		return -EIO;
+
 	return phy_read_mmd_indirect(phydev, MDIO_PCS_EEE_WK_ERR, MDIO_MMD_PCS);
 }
 EXPORT_SYMBOL(phy_get_eee_err);
@@ -1379,6 +1388,9 @@ int phy_ethtool_get_eee(struct phy_device *phydev, struct ethtool_eee *data)
 {
 	int val;
 
+	if (!phydev->drv)
+		return -EIO;
+
 	/* Get Supported EEE */
 	val = phy_read_mmd_indirect(phydev, MDIO_PCS_EEE_ABLE, MDIO_MMD_PCS);
 	if (val < 0)
@@ -1412,6 +1424,9 @@ int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_eee *data)
 {
 	int val = ethtool_adv_to_mmd_eee_adv_t(data->advertised);
 
+	if (!phydev->drv)
+		return -EIO;
+
 	/* Mask prohibited EEE modes */
 	val &= ~phydev->eee_broken_modes;
 
@@ -1423,7 +1438,7 @@ EXPORT_SYMBOL(phy_ethtool_set_eee);
 
 int phy_ethtool_set_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol)
 {
-	if (phydev->drv->set_wol)
+	if (phydev->drv && phydev->drv->set_wol)
 		return phydev->drv->set_wol(phydev, wol);
 
 	return -EOPNOTSUPP;
@@ -1432,7 +1447,7 @@ EXPORT_SYMBOL(phy_ethtool_set_wol);
 
 void phy_ethtool_get_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol)
 {
-	if (phydev->drv->get_wol)
+	if (phydev->drv && phydev->drv->get_wol)
 		phydev->drv->get_wol(phydev, wol);
 }
 EXPORT_SYMBOL(phy_ethtool_get_wol);
@@ -1468,6 +1483,9 @@ int phy_ethtool_nway_reset(struct net_device *ndev)
 	if (!phydev)
 		return -ENODEV;
 
+	if (!phydev->drv)
+		return -EIO;
+
 	return genphy_restart_aneg(phydev);
 }
 EXPORT_SYMBOL(phy_ethtool_nway_reset);
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 7f9319586b7e..daec6555f3b1 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1094,7 +1094,7 @@ int phy_suspend(struct phy_device *phydev)
 	if (wol.wolopts)
 		return -EBUSY;
 
-	if (phydrv->suspend)
+	if (phydev->drv && phydrv->suspend)
 		ret = phydrv->suspend(phydev);
 
 	if (ret)
@@ -1111,7 +1111,7 @@ int phy_resume(struct phy_device *phydev)
 	struct phy_driver *phydrv = to_phy_driver(phydev->mdio.dev.driver);
 	int ret = 0;
 
-	if (phydrv->resume)
+	if (phydev->drv && phydrv->resume)
 		ret = phydrv->resume(phydev);
 
 	if (ret)
@@ -1790,7 +1790,7 @@ static int phy_remove(struct device *dev)
 	phydev->state = PHY_DOWN;
 	mutex_unlock(&phydev->lock);
 
-	if (phydev->drv->remove)
+	if (phydev->drv && phydev->drv->remove)
 		phydev->drv->remove(phydev);
 	phydev->drv = NULL;
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index d9bdf53e0514..772476028a65 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -807,6 +807,9 @@ int phy_stop_interrupts(struct phy_device *phydev);
 
 static inline int phy_read_status(struct phy_device *phydev)
 {
+	if (!phydev->drv)
+		return -EIO;
+
 	return phydev->drv->read_status(phydev);
 }
 
-- 
cgit v1.2.3


From e71695307114335be1ed912f4a347396c2ed0e69 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Sun, 19 Feb 2017 07:17:17 +0200
Subject: ptr_ring: fix race conditions when resizing

Resizing currently drops consumer lock.  This can cause entries to be
reordered, which isn't good in itself.  More importantly, consumer can
detect a false ring empty condition and block forever.

Further, nesting of consumer within producer lock is problematic for
tun, since it produces entries in a BH, which causes a lock order
reversal:

       CPU0                    CPU1
       ----                    ----
  consume:
  lock(&(&r->consumer_lock)->rlock);
                               resize:
                               local_irq_disable();
                               lock(&(&r->producer_lock)->rlock);
                               lock(&(&r->consumer_lock)->rlock);
  <Interrupt>
  produce:
  lock(&(&r->producer_lock)->rlock);

To fix, nest producer lock within consumer lock during resize,
and keep consumer lock during the whole swap operation.

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Cc: stable@vger.kernel.org
Cc: "David S. Miller" <davem@davemloft.net>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ptr_ring.h | 36 +++++++++++++++++++++++++++++++-----
 1 file changed, 31 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 2052011bf9fb..6c70444da3b9 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -111,6 +111,11 @@ static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr)
 	return 0;
 }
 
+/*
+ * Note: resize (below) nests producer lock within consumer lock, so if you
+ * consume in interrupt or BH context, you must disable interrupts/BH when
+ * calling this.
+ */
 static inline int ptr_ring_produce(struct ptr_ring *r, void *ptr)
 {
 	int ret;
@@ -242,6 +247,11 @@ static inline void *__ptr_ring_consume(struct ptr_ring *r)
 	return ptr;
 }
 
+/*
+ * Note: resize (below) nests producer lock within consumer lock, so if you
+ * call this in interrupt or BH context, you must disable interrupts/BH when
+ * producing.
+ */
 static inline void *ptr_ring_consume(struct ptr_ring *r)
 {
 	void *ptr;
@@ -357,7 +367,7 @@ static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue,
 	void **old;
 	void *ptr;
 
-	while ((ptr = ptr_ring_consume(r)))
+	while ((ptr = __ptr_ring_consume(r)))
 		if (producer < size)
 			queue[producer++] = ptr;
 		else if (destroy)
@@ -372,6 +382,12 @@ static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue,
 	return old;
 }
 
+/*
+ * Note: producer lock is nested within consumer lock, so if you
+ * resize you must make sure all uses nest correctly.
+ * In particular if you consume ring in interrupt or BH context, you must
+ * disable interrupts/BH when doing so.
+ */
 static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp,
 				  void (*destroy)(void *))
 {
@@ -382,17 +398,25 @@ static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp,
 	if (!queue)
 		return -ENOMEM;
 
-	spin_lock_irqsave(&(r)->producer_lock, flags);
+	spin_lock_irqsave(&(r)->consumer_lock, flags);
+	spin_lock(&(r)->producer_lock);
 
 	old = __ptr_ring_swap_queue(r, queue, size, gfp, destroy);
 
-	spin_unlock_irqrestore(&(r)->producer_lock, flags);
+	spin_unlock(&(r)->producer_lock);
+	spin_unlock_irqrestore(&(r)->consumer_lock, flags);
 
 	kfree(old);
 
 	return 0;
 }
 
+/*
+ * Note: producer lock is nested within consumer lock, so if you
+ * resize you must make sure all uses nest correctly.
+ * In particular if you consume ring in interrupt or BH context, you must
+ * disable interrupts/BH when doing so.
+ */
 static inline int ptr_ring_resize_multiple(struct ptr_ring **rings, int nrings,
 					   int size,
 					   gfp_t gfp, void (*destroy)(void *))
@@ -412,10 +436,12 @@ static inline int ptr_ring_resize_multiple(struct ptr_ring **rings, int nrings,
 	}
 
 	for (i = 0; i < nrings; ++i) {
-		spin_lock_irqsave(&(rings[i])->producer_lock, flags);
+		spin_lock_irqsave(&(rings[i])->consumer_lock, flags);
+		spin_lock(&(rings[i])->producer_lock);
 		queues[i] = __ptr_ring_swap_queue(rings[i], queues[i],
 						  size, gfp, destroy);
-		spin_unlock_irqrestore(&(rings[i])->producer_lock, flags);
+		spin_unlock(&(rings[i])->producer_lock);
+		spin_unlock_irqrestore(&(rings[i])->consumer_lock, flags);
 	}
 
 	for (i = 0; i < nrings; ++i)
-- 
cgit v1.2.3


From bb7462b6fd64e40809a857223bf7f0e628969f87 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@redhat.com>
Date: Mon, 20 Feb 2017 16:51:23 +0100
Subject: vfs: use helpers for calling f_op->{read,write}_iter()

Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 drivers/block/loop.c |  4 ++--
 fs/aio.c             |  4 ++--
 fs/read_write.c      | 12 ++++++------
 fs/splice.c          |  2 +-
 include/linux/fs.h   | 12 ++++++++++++
 5 files changed, 23 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f347285c67ec..2cf2903a0715 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -501,9 +501,9 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 	cmd->iocb.ki_flags = IOCB_DIRECT;
 
 	if (rw == WRITE)
-		ret = file->f_op->write_iter(&cmd->iocb, &iter);
+		ret = call_write_iter(file, &cmd->iocb, &iter);
 	else
-		ret = file->f_op->read_iter(&cmd->iocb, &iter);
+		ret = call_read_iter(file, &cmd->iocb, &iter);
 
 	if (ret != -EIOCBQUEUED)
 		cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
diff --git a/fs/aio.c b/fs/aio.c
index 4ab67e8cb776..9f4a9a2e7ae4 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1494,7 +1494,7 @@ static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored,
 		return ret;
 	ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
 	if (!ret)
-		ret = aio_ret(req, file->f_op->read_iter(req, &iter));
+		ret = aio_ret(req, call_read_iter(file, req, &iter));
 	kfree(iovec);
 	return ret;
 }
@@ -1519,7 +1519,7 @@ static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
 	if (!ret) {
 		req->ki_flags |= IOCB_WRITE;
 		file_start_write(file);
-		ret = aio_ret(req, file->f_op->write_iter(req, &iter));
+		ret = aio_ret(req, call_write_iter(file, req, &iter));
 		/*
 		 * We release freeze protection in aio_complete().  Fool lockdep
 		 * by telling it the lock got released so that it doesn't
diff --git a/fs/read_write.c b/fs/read_write.c
index 198614f757fa..f2ed9fdc98fd 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -367,7 +367,7 @@ ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos)
 	kiocb.ki_pos = *ppos;
 
 	iter->type |= READ;
-	ret = file->f_op->read_iter(&kiocb, iter);
+	ret = call_read_iter(file, &kiocb, iter);
 	BUG_ON(ret == -EIOCBQUEUED);
 	if (ret > 0)
 		*ppos = kiocb.ki_pos;
@@ -387,7 +387,7 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
 	kiocb.ki_pos = *ppos;
 
 	iter->type |= WRITE;
-	ret = file->f_op->write_iter(&kiocb, iter);
+	ret = call_write_iter(file, &kiocb, iter);
 	BUG_ON(ret == -EIOCBQUEUED);
 	if (ret > 0)
 		*ppos = kiocb.ki_pos;
@@ -436,7 +436,7 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo
 	kiocb.ki_pos = *ppos;
 	iov_iter_init(&iter, READ, &iov, 1, len);
 
-	ret = filp->f_op->read_iter(&kiocb, &iter);
+	ret = call_read_iter(filp, &kiocb, &iter);
 	BUG_ON(ret == -EIOCBQUEUED);
 	*ppos = kiocb.ki_pos;
 	return ret;
@@ -493,7 +493,7 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
 	kiocb.ki_pos = *ppos;
 	iov_iter_init(&iter, WRITE, &iov, 1, len);
 
-	ret = filp->f_op->write_iter(&kiocb, &iter);
+	ret = call_write_iter(filp, &kiocb, &iter);
 	BUG_ON(ret == -EIOCBQUEUED);
 	if (ret > 0)
 		*ppos = kiocb.ki_pos;
@@ -690,9 +690,9 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
 	kiocb.ki_pos = *ppos;
 
 	if (type == READ)
-		ret = filp->f_op->read_iter(&kiocb, iter);
+		ret = call_read_iter(filp, &kiocb, iter);
 	else
-		ret = filp->f_op->write_iter(&kiocb, iter);
+		ret = call_write_iter(filp, &kiocb, iter);
 	BUG_ON(ret == -EIOCBQUEUED);
 	*ppos = kiocb.ki_pos;
 	return ret;
diff --git a/fs/splice.c b/fs/splice.c
index 873d83104e79..6518f058bd7f 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -306,7 +306,7 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
 	idx = to.idx;
 	init_sync_kiocb(&kiocb, in);
 	kiocb.ki_pos = *ppos;
-	ret = in->f_op->read_iter(&kiocb, &to);
+	ret = call_read_iter(in, &kiocb, &to);
 	if (ret > 0) {
 		*ppos = kiocb.ki_pos;
 		file_accessed(in);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 78c81e6f5d76..a3145cdff8f2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1715,6 +1715,18 @@ struct inode_operations {
 	int (*set_acl)(struct inode *, struct posix_acl *, int);
 } ____cacheline_aligned;
 
+static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
+				     struct iov_iter *iter)
+{
+	return file->f_op->read_iter(kio, iter);
+}
+
+static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio,
+				      struct iov_iter *iter)
+{
+	return file->f_op->write_iter(kio, iter);
+}
+
 ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 			      unsigned long nr_segs, unsigned long fast_segs,
 			      struct iovec *fast_pointer,
-- 
cgit v1.2.3


From f74ac01520c9f6d89bbc3c6931a72f757b742f86 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@redhat.com>
Date: Mon, 20 Feb 2017 16:51:23 +0100
Subject: mm: use helper for calling f_op->mmap()

Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 drivers/gpu/drm/i915/i915_gem_dmabuf.c | 2 +-
 drivers/gpu/drm/vgem/vgem_drv.c        | 2 +-
 fs/coda/file.c                         | 2 +-
 include/linux/fs.h                     | 5 +++++
 ipc/shm.c                              | 2 +-
 mm/mmap.c                              | 2 +-
 mm/nommu.c                             | 4 ++--
 7 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/i915/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/i915_gem_dmabuf.c
index 5e38299b5df6..84fc9f001576 100644
--- a/drivers/gpu/drm/i915/i915_gem_dmabuf.c
+++ b/drivers/gpu/drm/i915/i915_gem_dmabuf.c
@@ -141,7 +141,7 @@ static int i915_gem_dmabuf_mmap(struct dma_buf *dma_buf, struct vm_area_struct *
 	if (!obj->base.filp)
 		return -ENODEV;
 
-	ret = obj->base.filp->f_op->mmap(obj->base.filp, vma);
+	ret = call_mmap(obj->base.filp, vma);
 	if (ret)
 		return ret;
 
diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c
index 477e07f0ecb6..9f7e222b3e89 100644
--- a/drivers/gpu/drm/vgem/vgem_drv.c
+++ b/drivers/gpu/drm/vgem/vgem_drv.c
@@ -287,7 +287,7 @@ static int vgem_prime_mmap(struct drm_gem_object *obj,
 	if (!obj->filp)
 		return -ENODEV;
 
-	ret = obj->filp->f_op->mmap(obj->filp, vma);
+	ret = call_mmap(obj->filp, vma);
 	if (ret)
 		return ret;
 
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 6e0154eb6fcc..9d956cd6d46f 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -96,7 +96,7 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
 	cfi->cfi_mapcount++;
 	spin_unlock(&cii->c_lock);
 
-	return host_file->f_op->mmap(host_file, vma);
+	return call_mmap(host_file, vma);
 }
 
 int coda_open(struct inode *coda_inode, struct file *coda_file)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a3145cdff8f2..93691b59e476 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1727,6 +1727,11 @@ static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio,
 	return file->f_op->write_iter(kio, iter);
 }
 
+static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	return file->f_op->mmap(file, vma);
+}
+
 ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 			      unsigned long nr_segs, unsigned long fast_segs,
 			      struct iovec *fast_pointer,
diff --git a/ipc/shm.c b/ipc/shm.c
index 81203e8ba013..4329fe3ef594 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -423,7 +423,7 @@ static int shm_mmap(struct file *file, struct vm_area_struct *vma)
 	if (ret)
 		return ret;
 
-	ret = sfd->file->f_op->mmap(sfd->file, vma);
+	ret = call_mmap(sfd->file, vma);
 	if (ret) {
 		shm_close(vma);
 		return ret;
diff --git a/mm/mmap.c b/mm/mmap.c
index dc4291dcc99b..3714aa4e6f81 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1668,7 +1668,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 		 * new file must not have been exposed to user-space, yet.
 		 */
 		vma->vm_file = get_file(file);
-		error = file->f_op->mmap(file, vma);
+		error = call_mmap(file, vma);
 		if (error)
 			goto unmap_and_free_vma;
 
diff --git a/mm/nommu.c b/mm/nommu.c
index 24f9f5f39145..e366354f777d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1084,7 +1084,7 @@ static int do_mmap_shared_file(struct vm_area_struct *vma)
 {
 	int ret;
 
-	ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
+	ret = call_mmap(vma->vm_file, vma);
 	if (ret == 0) {
 		vma->vm_region->vm_top = vma->vm_region->vm_end;
 		return 0;
@@ -1115,7 +1115,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
 	 * - VM_MAYSHARE will be set if it may attempt to share
 	 */
 	if (capabilities & NOMMU_MAP_DIRECT) {
-		ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
+		ret = call_mmap(vma->vm_file, vma);
 		if (ret == 0) {
 			/* shouldn't return success if we're not sharing */
 			BUG_ON(!(vma->vm_flags & VM_MAYSHARE));
-- 
cgit v1.2.3


From 0eb8af4916a540c362a2950e5ab54eca32eb7d58 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@redhat.com>
Date: Mon, 20 Feb 2017 16:51:23 +0100
Subject: vfs: use helper for calling f_op->fsync()

Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 fs/sync.c          | 2 +-
 include/linux/fs.h | 6 ++++++
 ipc/shm.c          | 2 +-
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/sync.c b/fs/sync.c
index 2a54c1f22035..11ba023434b1 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -192,7 +192,7 @@ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
 		spin_unlock(&inode->i_lock);
 		mark_inode_dirty_sync(inode);
 	}
-	return file->f_op->fsync(file, start, end, datasync);
+	return call_fsync(file, start, end, datasync);
 }
 EXPORT_SYMBOL(vfs_fsync_range);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 93691b59e476..72a33007ec24 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1732,6 +1732,12 @@ static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
 	return file->f_op->mmap(file, vma);
 }
 
+static inline int call_fsync(struct file *file, loff_t start, loff_t end,
+			     int datasync)
+{
+	return file->f_op->fsync(file, start, end, datasync);
+}
+
 ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 			      unsigned long nr_segs, unsigned long fast_segs,
 			      struct iovec *fast_pointer,
diff --git a/ipc/shm.c b/ipc/shm.c
index 4329fe3ef594..258aff2e03bb 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -452,7 +452,7 @@ static int shm_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 
 	if (!sfd->file->f_op->fsync)
 		return -EINVAL;
-	return sfd->file->f_op->fsync(sfd->file, start, end, datasync);
+	return call_fsync(sfd->file, start, end, datasync);
 }
 
 static long shm_fallocate(struct file *file, int mode, loff_t offset,
-- 
cgit v1.2.3


From 3498d8694d41af701c51289e7caf3ffefc7bfb6b Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Tue, 21 Feb 2017 14:19:45 +0100
Subject: gpio: reintroduce devm_get_gpiod_from_child()

We need to keep this API around for the merge window to avoid
nasty build problems in the merges.

Cc: Lee Jones <lee.jones@linaro.org>
Suggested-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/consumer.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index ea9b01d40017..2484b2fcc6eb 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -143,6 +143,16 @@ struct gpio_desc *devm_fwnode_get_index_gpiod_from_child(struct device *dev,
 						struct fwnode_handle *child,
 						enum gpiod_flags flags,
 						const char *label);
+/* FIXME: delete this helper when users are switched over */
+static inline struct gpio_desc *devm_get_gpiod_from_child(struct device *dev,
+			  const char *con_id, struct fwnode_handle *child)
+{
+	return devm_fwnode_get_index_gpiod_from_child(dev, con_id,
+						      0, child,
+						      GPIOD_ASIS,
+						      "?");
+}
+
 #else /* CONFIG_GPIOLIB */
 
 static inline int gpiod_count(struct device *dev, const char *con_id)
@@ -434,6 +444,13 @@ struct gpio_desc *devm_fwnode_get_index_gpiod_from_child(struct device *dev,
 	return ERR_PTR(-ENOSYS);
 }
 
+/* FIXME: delete this when all users are switched over */
+static inline struct gpio_desc *devm_get_gpiod_from_child(struct device *dev,
+			  const char *con_id, struct fwnode_handle *child)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
 #endif /* CONFIG_GPIOLIB */
 
 static inline
-- 
cgit v1.2.3


From 9d876e79df6a2f364b9f2737eacd72ceb27da53a Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 21 Feb 2017 16:09:34 +0100
Subject: bpf: fix unlocking of jited image when module ronx not set

Eric and Willem reported that they recently saw random crashes when
JIT was in use and bisected this to 74451e66d516 ("bpf: make jited
programs visible in traces"). Issue was that the consolidation part
added bpf_jit_binary_unlock_ro() that would unlock previously made
read-only memory back to read-write. However, DEBUG_SET_MODULE_RONX
cannot be used for this to test for presence of set_memory_*()
functions. We need to use ARCH_HAS_SET_MEMORY instead to fix this;
also add the corresponding bpf_jit_binary_lock_ro() to filter.h.

Fixes: 74451e66d516 ("bpf: make jited programs visible in traces")
Reported-by: Eric Dumazet <edumazet@google.com>
Reported-by: Willem de Bruijn <willemb@google.com>
Bisected-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/arm64/net/bpf_jit_comp.c |  2 +-
 arch/s390/net/bpf_jit_comp.c  |  2 +-
 arch/x86/net/bpf_jit_comp.c   |  2 +-
 include/linux/filter.h        | 13 +++++++++++--
 4 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 05d12104d270..a785554916c0 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -898,7 +898,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 
 	bpf_flush_icache(header, ctx.image + ctx.idx);
 
-	set_memory_ro((unsigned long)header, header->pages);
+	bpf_jit_binary_lock_ro(header);
 	prog->bpf_func = (void *)ctx.image;
 	prog->jited = 1;
 
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index f1d0e62ec1dd..b49c52a02087 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -1327,7 +1327,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 			print_fn_code(jit.prg_buf, jit.size_prg);
 	}
 	if (jit.prg_buf) {
-		set_memory_ro((unsigned long)header, header->pages);
+		bpf_jit_binary_lock_ro(header);
 		fp->bpf_func = (void *) jit.prg_buf;
 		fp->jited = 1;
 	}
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 18a62e208826..32322ce9b405 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1165,7 +1165,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 
 	if (image) {
 		bpf_flush_icache(header, image + proglen);
-		set_memory_ro((unsigned long)header, header->pages);
+		bpf_jit_binary_lock_ro(header);
 		prog->bpf_func = (void *)image;
 		prog->jited = 1;
 	} else {
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 0c1cc9143cb2..0c167fdee5f7 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -551,7 +551,7 @@ static inline bool bpf_prog_was_classic(const struct bpf_prog *prog)
 
 #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0]))
 
-#ifdef CONFIG_DEBUG_SET_MODULE_RONX
+#ifdef CONFIG_ARCH_HAS_SET_MEMORY
 static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
 {
 	set_memory_ro((unsigned long)fp, fp->pages);
@@ -562,6 +562,11 @@ static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)
 	set_memory_rw((unsigned long)fp, fp->pages);
 }
 
+static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
+{
+	set_memory_ro((unsigned long)hdr, hdr->pages);
+}
+
 static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr)
 {
 	set_memory_rw((unsigned long)hdr, hdr->pages);
@@ -575,10 +580,14 @@ static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)
 {
 }
 
+static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
+{
+}
+
 static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr)
 {
 }
-#endif /* CONFIG_DEBUG_SET_MODULE_RONX */
+#endif /* CONFIG_ARCH_HAS_SET_MEMORY */
 
 static inline struct bpf_binary_header *
 bpf_jit_binary_hdr(const struct bpf_prog *fp)
-- 
cgit v1.2.3


From 0500ce589aa7b5325af161d3c992ffb6be138ff9 Mon Sep 17 00:00:00 2001
From: H Hartley Sweeten <hsweeten@visionengravers.com>
Date: Wed, 15 Feb 2017 09:35:27 -0700
Subject: rtc: m48t86: remove unused platform_data

All users of this driver have been updated to allow the driver to
manage it's own resources and do the read/write operations internally.
The m48t86_ops are no longer used.

Remove the platform_data header and the support code in the driver.

Signed-off-by: H Hartley Sweeten <hsweeten@visionengravers.com>
Signed-off-by: Alexandre Belloni <alexandre.belloni@free-electrons.com>
---
 drivers/rtc/rtc-m48t86.c                 | 51 ++++++++++++--------------------
 include/linux/platform_data/rtc-m48t86.h | 16 ----------
 2 files changed, 19 insertions(+), 48 deletions(-)
 delete mode 100644 include/linux/platform_data/rtc-m48t86.h

(limited to 'include/linux')

diff --git a/drivers/rtc/rtc-m48t86.c b/drivers/rtc/rtc-m48t86.c
index 491e8e4b300b..02af045305dd 100644
--- a/drivers/rtc/rtc-m48t86.c
+++ b/drivers/rtc/rtc-m48t86.c
@@ -16,7 +16,6 @@
 #include <linux/module.h>
 #include <linux/rtc.h>
 #include <linux/platform_device.h>
-#include <linux/platform_data/rtc-m48t86.h>
 #include <linux/bcd.h>
 #include <linux/io.h>
 
@@ -45,7 +44,6 @@ struct m48t86_rtc_info {
 	void __iomem *index_reg;
 	void __iomem *data_reg;
 	struct rtc_device *rtc;
-	struct m48t86_ops *ops;
 };
 
 static unsigned char m48t86_readb(struct device *dev, unsigned long addr)
@@ -53,12 +51,9 @@ static unsigned char m48t86_readb(struct device *dev, unsigned long addr)
 	struct m48t86_rtc_info *info = dev_get_drvdata(dev);
 	unsigned char value;
 
-	if (info->ops) {
-		value = info->ops->readbyte(addr);
-	} else {
-		writeb(addr, info->index_reg);
-		value = readb(info->data_reg);
-	}
+	writeb(addr, info->index_reg);
+	value = readb(info->data_reg);
+
 	return value;
 }
 
@@ -67,12 +62,8 @@ static void m48t86_writeb(struct device *dev,
 {
 	struct m48t86_rtc_info *info = dev_get_drvdata(dev);
 
-	if (info->ops) {
-		info->ops->writebyte(value, addr);
-	} else {
-		writeb(addr, info->index_reg);
-		writeb(value, info->data_reg);
-	}
+	writeb(addr, info->index_reg);
+	writeb(value, info->data_reg);
 }
 
 static int m48t86_rtc_read_time(struct device *dev, struct rtc_time *tm)
@@ -235,30 +226,26 @@ static bool m48t86_verify_chip(struct platform_device *pdev)
 static int m48t86_rtc_probe(struct platform_device *pdev)
 {
 	struct m48t86_rtc_info *info;
+	struct resource *res;
 	unsigned char reg;
 
 	info = devm_kzalloc(&pdev->dev, sizeof(*info), GFP_KERNEL);
 	if (!info)
 		return -ENOMEM;
 
-	info->ops = dev_get_platdata(&pdev->dev);
-	if (!info->ops) {
-		struct resource *res;
-
-		res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-		if (!res)
-			return -ENODEV;
-		info->index_reg = devm_ioremap_resource(&pdev->dev, res);
-		if (IS_ERR(info->index_reg))
-			return PTR_ERR(info->index_reg);
-
-		res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
-		if (!res)
-			return -ENODEV;
-		info->data_reg = devm_ioremap_resource(&pdev->dev, res);
-		if (IS_ERR(info->data_reg))
-			return PTR_ERR(info->data_reg);
-	}
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENODEV;
+	info->index_reg = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(info->index_reg))
+		return PTR_ERR(info->index_reg);
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+	if (!res)
+		return -ENODEV;
+	info->data_reg = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(info->data_reg))
+		return PTR_ERR(info->data_reg);
 
 	dev_set_drvdata(&pdev->dev, info);
 
diff --git a/include/linux/platform_data/rtc-m48t86.h b/include/linux/platform_data/rtc-m48t86.h
deleted file mode 100644
index 915d6b4f0f89..000000000000
--- a/include/linux/platform_data/rtc-m48t86.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/*
- * ST M48T86 / Dallas DS12887 RTC driver
- * Copyright (c) 2006 Tower Technologies
- *
- * Author: Alessandro Zummo <a.zummo@towertech.it>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-struct m48t86_ops
-{
-	void (*writebyte)(unsigned char value, unsigned long addr);
-	unsigned char (*readbyte)(unsigned long addr);
-};
-- 
cgit v1.2.3


From 0ae060ca2bdfe11181094699b0f76437ec210b17 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sun, 19 Feb 2017 16:08:25 -0500
Subject: SUNRPC: Add generic helpers for xdr_stream encode/decode

Add some generic helpers for encoding/decoding opaque structures and
basic u32/u64.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/xdr.h | 177 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 177 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 56c48c884a24..dc7e51a769ac 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -242,6 +242,183 @@ extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len);
 extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len);
 extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data);
 
+/**
+ * xdr_align_size - Calculate padded size of an object
+ * @n: Size of an object being XDR encoded (in bytes)
+ *
+ * Return value:
+ *   Size (in bytes) of the object including xdr padding
+ */
+static inline size_t
+xdr_align_size(size_t n)
+{
+	const size_t mask = sizeof(__u32) - 1;
+
+	return (n + mask) & ~mask;
+}
+
+/**
+ * xdr_stream_encode_u32 - Encode a 32-bit integer
+ * @xdr: pointer to xdr_stream
+ * @n: integer to encode
+ *
+ * Return values:
+ *   On success, returns length in bytes of XDR buffer consumed
+ *   %-EMSGSIZE on XDR buffer overflow
+ */
+static inline ssize_t
+xdr_stream_encode_u32(struct xdr_stream *xdr, __u32 n)
+{
+	const size_t len = sizeof(n);
+	__be32 *p = xdr_reserve_space(xdr, len);
+
+	if (unlikely(!p))
+		return -EMSGSIZE;
+	*p = cpu_to_be32(n);
+	return len;
+}
+
+/**
+ * xdr_stream_encode_u64 - Encode a 64-bit integer
+ * @xdr: pointer to xdr_stream
+ * @n: 64-bit integer to encode
+ *
+ * Return values:
+ *   On success, returns length in bytes of XDR buffer consumed
+ *   %-EMSGSIZE on XDR buffer overflow
+ */
+static inline ssize_t
+xdr_stream_encode_u64(struct xdr_stream *xdr, __u64 n)
+{
+	const size_t len = sizeof(n);
+	__be32 *p = xdr_reserve_space(xdr, len);
+
+	if (unlikely(!p))
+		return -EMSGSIZE;
+	xdr_encode_hyper(p, n);
+	return len;
+}
+
+/**
+ * xdr_stream_encode_opaque_fixed - Encode fixed length opaque xdr data
+ * @xdr: pointer to xdr_stream
+ * @ptr: pointer to opaque data object
+ * @len: size of object pointed to by @ptr
+ *
+ * Return values:
+ *   On success, returns length in bytes of XDR buffer consumed
+ *   %-EMSGSIZE on XDR buffer overflow
+ */
+static inline ssize_t
+xdr_stream_encode_opaque_fixed(struct xdr_stream *xdr, const void *ptr, size_t len)
+{
+	__be32 *p = xdr_reserve_space(xdr, len);
+
+	if (unlikely(!p))
+		return -EMSGSIZE;
+	xdr_encode_opaque_fixed(p, ptr, len);
+	return xdr_align_size(len);
+}
+
+/**
+ * xdr_stream_encode_opaque - Encode variable length opaque xdr data
+ * @xdr: pointer to xdr_stream
+ * @ptr: pointer to opaque data object
+ * @len: size of object pointed to by @ptr
+ *
+ * Return values:
+ *   On success, returns length in bytes of XDR buffer consumed
+ *   %-EMSGSIZE on XDR buffer overflow
+ */
+static inline ssize_t
+xdr_stream_encode_opaque(struct xdr_stream *xdr, const void *ptr, size_t len)
+{
+	size_t count = sizeof(__u32) + xdr_align_size(len);
+	__be32 *p = xdr_reserve_space(xdr, count);
+
+	if (unlikely(!p))
+		return -EMSGSIZE;
+	xdr_encode_opaque(p, ptr, len);
+	return count;
+}
+
+/**
+ * xdr_stream_decode_u32 - Decode a 32-bit integer
+ * @xdr: pointer to xdr_stream
+ * @ptr: location to store integer
+ *
+ * Return values:
+ *   %0 on success
+ *   %-EBADMSG on XDR buffer overflow
+ */
+static inline ssize_t
+xdr_stream_decode_u32(struct xdr_stream *xdr, __u32 *ptr)
+{
+	const size_t count = sizeof(*ptr);
+	__be32 *p = xdr_inline_decode(xdr, count);
+
+	if (unlikely(!p))
+		return -EBADMSG;
+	*ptr = be32_to_cpup(p);
+	return 0;
+}
+
+/**
+ * xdr_stream_decode_opaque_fixed - Decode fixed length opaque xdr data
+ * @xdr: pointer to xdr_stream
+ * @ptr: location to store data
+ * @len: size of buffer pointed to by @ptr
+ *
+ * Return values:
+ *   On success, returns size of object stored in @ptr
+ *   %-EBADMSG on XDR buffer overflow
+ */
+static inline ssize_t
+xdr_stream_decode_opaque_fixed(struct xdr_stream *xdr, void *ptr, size_t len)
+{
+	__be32 *p = xdr_inline_decode(xdr, len);
+
+	if (unlikely(!p))
+		return -EBADMSG;
+	xdr_decode_opaque_fixed(p, ptr, len);
+	return len;
+}
+
+/**
+ * xdr_stream_decode_opaque_inline - Decode variable length opaque xdr data
+ * @xdr: pointer to xdr_stream
+ * @ptr: location to store pointer to opaque data
+ * @maxlen: maximum acceptable object size
+ *
+ * Note: the pointer stored in @ptr cannot be assumed valid after the XDR
+ * buffer has been destroyed, or even after calling xdr_inline_decode()
+ * on @xdr. It is therefore expected that the object it points to should
+ * be processed immediately.
+ *
+ * Return values:
+ *   On success, returns size of object stored in *@ptr
+ *   %-EBADMSG on XDR buffer overflow
+ *   %-EMSGSIZE if the size of the object would exceed @maxlen
+ */
+static inline ssize_t
+xdr_stream_decode_opaque_inline(struct xdr_stream *xdr, void **ptr, size_t maxlen)
+{
+	__be32 *p;
+	__u32 len;
+
+	*ptr = NULL;
+	if (unlikely(xdr_stream_decode_u32(xdr, &len) < 0))
+		return -EBADMSG;
+	if (len != 0) {
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			return -EBADMSG;
+		if (unlikely(len > maxlen))
+			return -EMSGSIZE;
+		*ptr = p;
+	}
+	return len;
+}
 #endif /* __KERNEL__ */
 
 #endif /* _SUNRPC_XDR_H_ */
-- 
cgit v1.2.3


From 5c741d4f2215371ae17e3dbdf3d46da850662e13 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sun, 19 Feb 2017 16:08:31 -0500
Subject: SUNRPC: Add a helper function xdr_stream_decode_string_dup()

Create a helper function that decodes a xdr string object, allocates a memory
buffer and then store it as a NUL terminated string.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/xdr.h |  2 ++
 net/sunrpc/xdr.c           | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index dc7e51a769ac..054c8cde18f3 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -242,6 +242,8 @@ extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len);
 extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len);
 extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data);
 
+ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str,
+		size_t maxlen, gfp_t gfp_flags);
 /**
  * xdr_align_size - Calculate padded size of an object
  * @n: Size of an object being XDR encoded (in bytes)
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 7f1071e103ca..1f7082144e01 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1518,3 +1518,37 @@ out:
 }
 EXPORT_SYMBOL_GPL(xdr_process_buf);
 
+/**
+ * xdr_stream_decode_string_dup - Decode and duplicate variable length string
+ * @xdr: pointer to xdr_stream
+ * @str: location to store pointer to string
+ * @maxlen: maximum acceptable string length
+ * @gfp_flags: GFP mask to use
+ *
+ * Return values:
+ *   On success, returns length of NUL-terminated string stored in *@ptr
+ *   %-EBADMSG on XDR buffer overflow
+ *   %-EMSGSIZE if the size of the string would exceed @maxlen
+ *   %-ENOMEM on memory allocation failure
+ */
+ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str,
+		size_t maxlen, gfp_t gfp_flags)
+{
+	void *p;
+	ssize_t ret;
+
+	ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen);
+	if (ret > 0) {
+		char *s = kmalloc(ret + 1, gfp_flags);
+		if (s != NULL) {
+			memcpy(s, p, ret);
+			s[ret] = '\0';
+			*str = s;
+			return strlen(s);
+		}
+		ret = -ENOMEM;
+	}
+	*str = NULL;
+	return ret;
+}
+EXPORT_SYMBOL_GPL(xdr_stream_decode_string_dup);
-- 
cgit v1.2.3


From 986994a27587efd8ce4c595cab89b570f7475359 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Thu, 26 Jan 2017 17:17:28 +0200
Subject: nvme: Use CNS as 8-bit field and avoid endianness conversion

This patch defines CNS field as 8-bit field and avoids cpu_to/from_le
conversions.
Also initialize nvme_command cns value explicitly to NVME_ID_CNS_NS
for readability (don't rely on the fact that NVME_ID_CNS_NS = 0).

Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/core.c        | 5 +++--
 drivers/nvme/target/admin-cmd.c | 2 +-
 drivers/nvme/target/discovery.c | 4 ++--
 include/linux/nvme.h            | 4 +++-
 4 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index a68bf6954ead..8b1be128b66e 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -560,7 +560,7 @@ int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
 
 	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
 	c.identify.opcode = nvme_admin_identify;
-	c.identify.cns = cpu_to_le32(NVME_ID_CNS_CTRL);
+	c.identify.cns = NVME_ID_CNS_CTRL;
 
 	*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
 	if (!*id)
@@ -578,7 +578,7 @@ static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *n
 	struct nvme_command c = { };
 
 	c.identify.opcode = nvme_admin_identify;
-	c.identify.cns = cpu_to_le32(NVME_ID_CNS_NS_ACTIVE_LIST);
+	c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST;
 	c.identify.nsid = cpu_to_le32(nsid);
 	return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000);
 }
@@ -592,6 +592,7 @@ int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid,
 	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
 	c.identify.opcode = nvme_admin_identify;
 	c.identify.nsid = cpu_to_le32(nsid);
+	c.identify.cns = NVME_ID_CNS_NS;
 
 	*id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
 	if (!*id)
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 204e85efebab..94e524fea568 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -509,7 +509,7 @@ int nvmet_parse_admin_cmd(struct nvmet_req *req)
 		break;
 	case nvme_admin_identify:
 		req->data_len = 4096;
-		switch (le32_to_cpu(cmd->identify.cns)) {
+		switch (cmd->identify.cns) {
 		case NVME_ID_CNS_NS:
 			req->execute = nvmet_execute_identify_ns;
 			return 0;
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index 12f39eea569f..af8aabf05335 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -186,14 +186,14 @@ int nvmet_parse_discovery_cmd(struct nvmet_req *req)
 		}
 	case nvme_admin_identify:
 		req->data_len = 4096;
-		switch (le32_to_cpu(cmd->identify.cns)) {
+		switch (cmd->identify.cns) {
 		case NVME_ID_CNS_CTRL:
 			req->execute =
 				nvmet_execute_identify_disc_ctrl;
 			return 0;
 		default:
 			pr_err("nvmet: unsupported identify cns %d\n",
-				le32_to_cpu(cmd->identify.cns));
+				cmd->identify.cns);
 			return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
 		}
 	default:
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 0b676a02cf3e..5b32521456d6 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -644,7 +644,9 @@ struct nvme_identify {
 	__le32			nsid;
 	__u64			rsvd2[2];
 	union nvme_data_ptr	dptr;
-	__le32			cns;
+	__u8			cns;
+	__u8			rsvd3;
+	__le16			ctrlid;
 	__u32			rsvd11[5];
 };
 
-- 
cgit v1.2.3


From c5552fde102fcc3f2cf9e502b8ac90e3500d8fdf Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Tue, 7 Feb 2017 10:08:45 -0800
Subject: nvme: Enable autonomous power state transitions

NVMe devices can advertise multiple power states.  These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.

The hardware configuration is a table.  For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.

This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely.  On hardware without APST support, the sysfs file will
not be exposed.

The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.

In theory, the device can expose "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful.  There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset.  This can be configured from
userspace, but it doesn't seem useful to support in the driver.

On my laptop, enabling APST seems to save nearly 1W.

The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.

This feature is quirked off on a known-buggy Samsung device.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/core.c | 154 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/nvme.h |  11 ++++
 include/linux/nvme.h     |   6 ++
 3 files changed, 171 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 38efe4f752a1..849e85847ad4 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -26,6 +26,7 @@
 #include <linux/ptrace.h>
 #include <linux/nvme_ioctl.h>
 #include <linux/t10-pi.h>
+#include <linux/pm_qos.h>
 #include <scsi/sg.h>
 #include <asm/unaligned.h>
 
@@ -56,6 +57,11 @@ EXPORT_SYMBOL_GPL(nvme_max_retries);
 static int nvme_char_major;
 module_param(nvme_char_major, int, 0);
 
+static unsigned long default_ps_max_latency_us = 25000;
+module_param(default_ps_max_latency_us, ulong, 0644);
+MODULE_PARM_DESC(default_ps_max_latency_us,
+		 "max power saving latency for new devices; use PM QOS to change per device");
+
 static LIST_HEAD(nvme_ctrl_list);
 static DEFINE_SPINLOCK(dev_list_lock);
 
@@ -1252,6 +1258,122 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
 	blk_queue_write_cache(q, vwc, vwc);
 }
 
+static void nvme_configure_apst(struct nvme_ctrl *ctrl)
+{
+	/*
+	 * APST (Autonomous Power State Transition) lets us program a
+	 * table of power state transitions that the controller will
+	 * perform automatically.  We configure it with a simple
+	 * heuristic: we are willing to spend at most 2% of the time
+	 * transitioning between power states.  Therefore, when running
+	 * in any given state, we will enter the next lower-power
+	 * non-operational state after waiting 100 * (enlat + exlat)
+	 * microseconds, as long as that state's total latency is under
+	 * the requested maximum latency.
+	 *
+	 * We will not autonomously enter any non-operational state for
+	 * which the total latency exceeds ps_max_latency_us.  Users
+	 * can set ps_max_latency_us to zero to turn off APST.
+	 */
+
+	unsigned apste;
+	struct nvme_feat_auto_pst *table;
+	int ret;
+
+	/*
+	 * If APST isn't supported or if we haven't been initialized yet,
+	 * then don't do anything.
+	 */
+	if (!ctrl->apsta)
+		return;
+
+	if (ctrl->npss > 31) {
+		dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
+		return;
+	}
+
+	table = kzalloc(sizeof(*table), GFP_KERNEL);
+	if (!table)
+		return;
+
+	if (ctrl->ps_max_latency_us == 0) {
+		/* Turn off APST. */
+		apste = 0;
+	} else {
+		__le64 target = cpu_to_le64(0);
+		int state;
+
+		/*
+		 * Walk through all states from lowest- to highest-power.
+		 * According to the spec, lower-numbered states use more
+		 * power.  NPSS, despite the name, is the index of the
+		 * lowest-power state, not the number of states.
+		 */
+		for (state = (int)ctrl->npss; state >= 0; state--) {
+			u64 total_latency_us, transition_ms;
+
+			if (target)
+				table->entries[state] = target;
+
+			/*
+			 * Is this state a useful non-operational state for
+			 * higher-power states to autonomously transition to?
+			 */
+			if (!(ctrl->psd[state].flags &
+			      NVME_PS_FLAGS_NON_OP_STATE))
+				continue;
+
+			total_latency_us =
+				(u64)le32_to_cpu(ctrl->psd[state].entry_lat) +
+				+ le32_to_cpu(ctrl->psd[state].exit_lat);
+			if (total_latency_us > ctrl->ps_max_latency_us)
+				continue;
+
+			/*
+			 * This state is good.  Use it as the APST idle
+			 * target for higher power states.
+			 */
+			transition_ms = total_latency_us + 19;
+			do_div(transition_ms, 20);
+			if (transition_ms > (1 << 24) - 1)
+				transition_ms = (1 << 24) - 1;
+
+			target = cpu_to_le64((state << 3) |
+					     (transition_ms << 8));
+		}
+
+		apste = 1;
+	}
+
+	ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
+				table, sizeof(*table), NULL);
+	if (ret)
+		dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
+
+	kfree(table);
+}
+
+static void nvme_set_latency_tolerance(struct device *dev, s32 val)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	u64 latency;
+
+	switch (val) {
+	case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
+	case PM_QOS_LATENCY_ANY:
+		latency = U64_MAX;
+		break;
+
+	default:
+		latency = val;
+	}
+
+	if (ctrl->ps_max_latency_us != latency) {
+		ctrl->ps_max_latency_us = latency;
+		nvme_configure_apst(ctrl);
+	}
+}
+
 struct nvme_core_quirk_entry {
 	/*
 	 * NVMe model and firmware strings are padded with spaces.  For
@@ -1265,6 +1387,16 @@ struct nvme_core_quirk_entry {
 };
 
 static const struct nvme_core_quirk_entry core_quirks[] = {
+	/*
+	 * Seen on a Samsung "SM951 NVMe SAMSUNG 256GB": using APST causes
+	 * the controller to go out to lunch.  It dies when the watchdog
+	 * timer reads CSTS and gets 0xffffffff.
+	 */
+	{
+		.vid = 0x144d,
+		.fr = "BXW75D0Q",
+		.quirks = NVME_QUIRK_NO_APST,
+	},
 };
 
 /* match is null-terminated but idstr is space-padded. */
@@ -1307,6 +1439,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 	u64 cap;
 	int ret, page_shift;
 	u32 max_hw_sectors;
+	u8 prev_apsta;
 
 	ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
 	if (ret) {
@@ -1368,6 +1501,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 	ctrl->sgls = le32_to_cpu(id->sgls);
 	ctrl->kas = le16_to_cpu(id->kas);
 
+	ctrl->npss = id->npss;
+	prev_apsta = ctrl->apsta;
+	ctrl->apsta = (ctrl->quirks & NVME_QUIRK_NO_APST) ? 0 : id->apsta;
+	memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
+
 	if (ctrl->ops->is_fabrics) {
 		ctrl->icdoff = le16_to_cpu(id->icdoff);
 		ctrl->ioccsz = le32_to_cpu(id->ioccsz);
@@ -1392,7 +1530,15 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 
 	kfree(id);
 
+	if (ctrl->apsta && !prev_apsta)
+		dev_pm_qos_expose_latency_tolerance(ctrl->device);
+	else if (!ctrl->apsta && prev_apsta)
+		dev_pm_qos_hide_latency_tolerance(ctrl->device);
+
+	nvme_configure_apst(ctrl);
+
 	ctrl->identified = true;
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(nvme_init_identify);
@@ -2154,6 +2300,14 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	list_add_tail(&ctrl->node, &nvme_ctrl_list);
 	spin_unlock(&dev_list_lock);
 
+	/*
+	 * Initialize latency tolerance controls.  The sysfs files won't
+	 * be visible to userspace unless the device actually supports APST.
+	 */
+	ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
+	dev_pm_qos_update_user_latency_tolerance(ctrl->device,
+		min(default_ps_max_latency_us, (unsigned long)S32_MAX));
+
 	return 0;
 out_release_instance:
 	nvme_release_instance(ctrl);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 42ede67bfbc6..a3da1e90b99d 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -78,6 +78,11 @@ enum nvme_quirks {
 	 * readiness, which is done by reading the NVME_CSTS_RDY bit.
 	 */
 	NVME_QUIRK_DELAY_BEFORE_CHK_RDY		= (1 << 3),
+
+	/*
+	 * APST should not be used.
+	 */
+	NVME_QUIRK_NO_APST			= (1 << 4),
 };
 
 /*
@@ -148,13 +153,19 @@ struct nvme_ctrl {
 	u32 vs;
 	u32 sgls;
 	u16 kas;
+	u8 npss;
+	u8 apsta;
 	unsigned int kato;
 	bool subsystem;
 	unsigned long quirks;
+	struct nvme_id_power_state psd[32];
 	struct work_struct scan_work;
 	struct work_struct async_event_work;
 	struct delayed_work ka_work;
 
+	/* Power saving configuration */
+	u64 ps_max_latency_us;
+
 	/* Fabrics only */
 	u16 sqsize;
 	u32 ioccsz;
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 5b32521456d6..c43d435d4225 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -579,6 +579,12 @@ struct nvme_write_zeroes_cmd {
 	__le16			appmask;
 };
 
+/* Features */
+
+struct nvme_feat_auto_pst {
+	__le64 entries[32];
+};
+
 /* Admin commands */
 
 enum nvme_admin_opcode {
-- 
cgit v1.2.3


From 3ee80c3d15b61e61611903033df414a0590cfde9 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <maxg@mellanox.com>
Date: Mon, 20 Feb 2017 13:44:28 +0200
Subject: nvme-rdma: move nvme cm status helper to .h file

This will enable the usage for nvme rdma target.
Also move from a lookup array to a switch statement.

Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/rdma.c  | 22 ----------------------
 include/linux/nvme-rdma.h | 24 ++++++++++++++++++++++++
 2 files changed, 24 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index aedac6e28c6f..ee789b38bae5 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -42,28 +42,6 @@
 
 #define NVME_RDMA_MAX_INLINE_SEGMENTS	1
 
-static const char *const nvme_rdma_cm_status_strs[] = {
-	[NVME_RDMA_CM_INVALID_LEN]	= "invalid length",
-	[NVME_RDMA_CM_INVALID_RECFMT]	= "invalid record format",
-	[NVME_RDMA_CM_INVALID_QID]	= "invalid queue ID",
-	[NVME_RDMA_CM_INVALID_HSQSIZE]	= "invalid host SQ size",
-	[NVME_RDMA_CM_INVALID_HRQSIZE]	= "invalid host RQ size",
-	[NVME_RDMA_CM_NO_RSC]		= "resource not found",
-	[NVME_RDMA_CM_INVALID_IRD]	= "invalid IRD",
-	[NVME_RDMA_CM_INVALID_ORD]	= "Invalid ORD",
-};
-
-static const char *nvme_rdma_cm_msg(enum nvme_rdma_cm_status status)
-{
-	size_t index = status;
-
-	if (index < ARRAY_SIZE(nvme_rdma_cm_status_strs) &&
-	    nvme_rdma_cm_status_strs[index])
-		return nvme_rdma_cm_status_strs[index];
-	else
-		return "unrecognized reason";
-};
-
 /*
  * We handle AEN commands ourselves and don't even let the
  * block layer know about them.
diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h
index bf240a3cbf99..a72fd04aa5e1 100644
--- a/include/linux/nvme-rdma.h
+++ b/include/linux/nvme-rdma.h
@@ -29,6 +29,30 @@ enum nvme_rdma_cm_status {
 	NVME_RDMA_CM_INVALID_ORD	= 0x08,
 };
 
+static inline const char *nvme_rdma_cm_msg(enum nvme_rdma_cm_status status)
+{
+	switch (status) {
+	case NVME_RDMA_CM_INVALID_LEN:
+		return "invalid length";
+	case NVME_RDMA_CM_INVALID_RECFMT:
+		return "invalid record format";
+	case NVME_RDMA_CM_INVALID_QID:
+		return "invalid queue ID";
+	case NVME_RDMA_CM_INVALID_HSQSIZE:
+		return "invalid host SQ size";
+	case NVME_RDMA_CM_INVALID_HRQSIZE:
+		return "invalid host RQ size";
+	case NVME_RDMA_CM_NO_RSC:
+		return "resource not found";
+	case NVME_RDMA_CM_INVALID_IRD:
+		return "invalid IRD";
+	case NVME_RDMA_CM_INVALID_ORD:
+		return "Invalid ORD";
+	default:
+		return "unrecognized reason";
+	}
+}
+
 /**
  * struct nvme_rdma_cm_req - rdma connect request
  *
-- 
cgit v1.2.3


From d3213e8fd4b0f18dfd438268ff480406ba743abb Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Wed, 22 Feb 2017 15:39:47 -0800
Subject: tracing: add __print_flags_u64()

Patch series "DAX tracepoints, mm argument simplification", v4.

This contains both my DAX tracepoint code and Dave Jiang's MM argument
simplifications.  Dave's code was written with my tracepoint code as a
baseline, so it seemed simplest to keep them together in a single series.

This patch (of 7):

Add __print_flags_u64() and the helper trace_print_flags_seq_u64() in the
same spirit as __print_symbolic_u64() and trace_print_symbols_seq_u64().
These functions allow us to print symbols associated with flags that are
64 bits wide even on 32 bit machines.

These will be used by the DAX code so that we can print the flags set in a
pfn_t such as PFN_SG_CHAIN, PFN_SG_LAST, PFN_DEV and PFN_MAP.

Without this new function I was getting errors like the following when
compiling for i386:

  include/linux/pfn_t.h:13:22: warning: large integer implicitly truncated to unsigned type [-Woverflow]
   #define PFN_SG_CHAIN (1ULL << (BITS_PER_LONG_LONG - 1))
    ^

Link: http://lkml.kernel.org/r/1484085142-2297-2-git-send-email-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/trace_events.h |  4 ++++
 include/trace/trace_events.h | 11 +++++++++++
 kernel/trace/trace_output.c  | 38 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 0f165507495c..0af63c4381b9 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -23,6 +23,10 @@ const char *trace_print_symbols_seq(struct trace_seq *p, unsigned long val,
 				    const struct trace_print_flags *symbol_array);
 
 #if BITS_PER_LONG == 32
+const char *trace_print_flags_seq_u64(struct trace_seq *p, const char *delim,
+		      unsigned long long flags,
+		      const struct trace_print_flags_u64 *flag_array);
+
 const char *trace_print_symbols_seq_u64(struct trace_seq *p,
 					unsigned long long val,
 					const struct trace_print_flags_u64
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index 5c06f4af8323..00f643164ca2 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -283,8 +283,16 @@ TRACE_MAKE_SYSTEM_STR();
 		trace_print_symbols_seq(p, value, symbols);		\
 	})
 
+#undef __print_flags_u64
 #undef __print_symbolic_u64
 #if BITS_PER_LONG == 32
+#define __print_flags_u64(flag, delim, flag_array...)			\
+	({								\
+		static const struct trace_print_flags_u64 __flags[] =	\
+			{ flag_array, { -1, NULL } };			\
+		trace_print_flags_seq_u64(p, delim, flag, __flags);	\
+	})
+
 #define __print_symbolic_u64(value, symbol_array...)			\
 	({								\
 		static const struct trace_print_flags_u64 symbols[] =	\
@@ -292,6 +300,9 @@ TRACE_MAKE_SYSTEM_STR();
 		trace_print_symbols_seq_u64(p, value, symbols);	\
 	})
 #else
+#define __print_flags_u64(flag, delim, flag_array...)			\
+			__print_flags(flag, delim, flag_array)
+
 #define __print_symbolic_u64(value, symbol_array...)			\
 			__print_symbolic(value, symbol_array)
 #endif
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index aea6a1218c7d..070866c32eb9 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -123,6 +123,44 @@ trace_print_symbols_seq(struct trace_seq *p, unsigned long val,
 EXPORT_SYMBOL(trace_print_symbols_seq);
 
 #if BITS_PER_LONG == 32
+const char *
+trace_print_flags_seq_u64(struct trace_seq *p, const char *delim,
+		      unsigned long long flags,
+		      const struct trace_print_flags_u64 *flag_array)
+{
+	unsigned long long mask;
+	const char *str;
+	const char *ret = trace_seq_buffer_ptr(p);
+	int i, first = 1;
+
+	for (i = 0;  flag_array[i].name && flags; i++) {
+
+		mask = flag_array[i].mask;
+		if ((flags & mask) != mask)
+			continue;
+
+		str = flag_array[i].name;
+		flags &= ~mask;
+		if (!first && delim)
+			trace_seq_puts(p, delim);
+		else
+			first = 0;
+		trace_seq_puts(p, str);
+	}
+
+	/* check for left over flags */
+	if (flags) {
+		if (!first && delim)
+			trace_seq_puts(p, delim);
+		trace_seq_printf(p, "0x%llx", flags);
+	}
+
+	trace_seq_putc(p, 0);
+
+	return ret;
+}
+EXPORT_SYMBOL(trace_print_flags_seq_u64);
+
 const char *
 trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
 			 const struct trace_print_flags_u64 *symbol_array)
-- 
cgit v1.2.3


From 282a8e0391c377b64c7d065b18fc2f6267a24dad Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Wed, 22 Feb 2017 15:39:50 -0800
Subject: dax: add tracepoint infrastructure, PMD tracing

Tracepoints are the standard way to capture debugging and tracing
information in many parts of the kernel, including the XFS and ext4
filesystems.  Create a tracepoint header for FS DAX and add the first DAX
tracepoints to the PMD fault handler.  This allows the tracing for DAX to
be done in the same way as the filesystem tracing so that developers can
look at them together and get a coherent idea of what the system is doing.

I added both an entry and exit tracepoint because future patches will add
tracepoints to child functions of dax_iomap_pmd_fault() like
dax_pmd_load_hole() and dax_pmd_insert_mapping().  We want those messages
to be wrapped by the parent function tracepoints so the code flow is more
easily understood.  Having entry and exit tracepoints for faults also
allows us to easily see what filesystems functions were called during the
fault.  These filesystem functions get executed via iomap_begin() and
iomap_end() calls, for example, and will have their own tracepoints.

For PMD faults we primarily want to understand the type of mapping, the
fault flags, the faulting address and whether it fell back to 4k faults.
If it fell back to 4k faults the tracepoints should let us understand why.

I named the new tracepoint header file "fs_dax.h" to allow for device DAX
to have its own separate tracing header in the same directory at some
point.

Here is an example output for these events from a successful PMD fault:

  big-1441  [005] ....    32.582758: xfs_filemap_pmd_fault: dev 259:0 ino 0x1003

  big-1441  [005] ....    32.582776: dax_pmd_fault: dev 259:0 ino 0x1003
  shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200 max_pgoff 0x1400

  big-1441  [005] ....    32.583292: dax_pmd_fault_done: dev 259:0 ino 0x1003
  shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200 max_pgoff 0x1400 NOPAGE

Link: http://lkml.kernel.org/r/1484085142-2297-3-git-send-email-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Suggested-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dax.c                      | 30 ++++++++++++-------
 include/linux/mm.h            | 25 ++++++++++++++++
 include/trace/events/fs_dax.h | 68 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 113 insertions(+), 10 deletions(-)
 create mode 100644 include/trace/events/fs_dax.h

(limited to 'include/linux')

diff --git a/fs/dax.c b/fs/dax.c
index e9cf8b4cd234..dc11a2b90731 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -35,6 +35,9 @@
 #include <linux/iomap.h>
 #include "internal.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/fs_dax.h>
+
 /* We choose 4096 entries - same as per-zone page wait tables */
 #define DAX_WAIT_TABLE_BITS 12
 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
@@ -1341,6 +1344,16 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	loff_t pos;
 	int error;
 
+	/*
+	 * Check whether offset isn't beyond end of file now. Caller is
+	 * supposed to hold locks serializing us with truncate / punch hole so
+	 * this is a reliable test.
+	 */
+	pgoff = linear_page_index(vma, pmd_addr);
+	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
+
+	trace_dax_pmd_fault(inode, vma, address, flags, pgoff, max_pgoff, 0);
+
 	/* Fall back to PTEs if we're going to COW */
 	if (write && !(vma->vm_flags & VM_SHARED))
 		goto fallback;
@@ -1351,16 +1364,10 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
 		goto fallback;
 
-	/*
-	 * Check whether offset isn't beyond end of file now. Caller is
-	 * supposed to hold locks serializing us with truncate / punch hole so
-	 * this is a reliable test.
-	 */
-	pgoff = linear_page_index(vma, pmd_addr);
-	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
-
-	if (pgoff > max_pgoff)
-		return VM_FAULT_SIGBUS;
+	if (pgoff > max_pgoff) {
+		result = VM_FAULT_SIGBUS;
+		goto out;
+	}
 
 	/* If the PMD would extend beyond the file size */
 	if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
@@ -1432,6 +1439,9 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		split_huge_pmd(vma, pmd, address);
 		count_vm_event(THP_FAULT_FALLBACK);
 	}
+out:
+	trace_dax_pmd_fault_done(inode, vma, address, flags, pgoff, max_pgoff,
+			result);
 	return result;
 }
 EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6ff66d6fe8e2..d22f7837ad90 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -285,6 +285,17 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_REMOTE	0x80	/* faulting for non current tsk/mm */
 #define FAULT_FLAG_INSTRUCTION  0x100	/* The fault was during an instruction fetch */
 
+#define FAULT_FLAG_TRACE \
+	{ FAULT_FLAG_WRITE,		"WRITE" }, \
+	{ FAULT_FLAG_MKWRITE,		"MKWRITE" }, \
+	{ FAULT_FLAG_ALLOW_RETRY,	"ALLOW_RETRY" }, \
+	{ FAULT_FLAG_RETRY_NOWAIT,	"RETRY_NOWAIT" }, \
+	{ FAULT_FLAG_KILLABLE,		"KILLABLE" }, \
+	{ FAULT_FLAG_TRIED,		"TRIED" }, \
+	{ FAULT_FLAG_USER,		"USER" }, \
+	{ FAULT_FLAG_REMOTE,		"REMOTE" }, \
+	{ FAULT_FLAG_INSTRUCTION,	"INSTRUCTION" }
+
 /*
  * vm_fault is filled by the the pagefault handler and passed to the vma's
  * ->fault function. The vma's ->fault is responsible for returning a bitmask
@@ -1111,6 +1122,20 @@ static inline void clear_page_pfmemalloc(struct page *page)
 			 VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \
 			 VM_FAULT_FALLBACK)
 
+#define VM_FAULT_RESULT_TRACE \
+	{ VM_FAULT_OOM,			"OOM" }, \
+	{ VM_FAULT_SIGBUS,		"SIGBUS" }, \
+	{ VM_FAULT_MAJOR,		"MAJOR" }, \
+	{ VM_FAULT_WRITE,		"WRITE" }, \
+	{ VM_FAULT_HWPOISON,		"HWPOISON" }, \
+	{ VM_FAULT_HWPOISON_LARGE,	"HWPOISON_LARGE" }, \
+	{ VM_FAULT_SIGSEGV,		"SIGSEGV" }, \
+	{ VM_FAULT_NOPAGE,		"NOPAGE" }, \
+	{ VM_FAULT_LOCKED,		"LOCKED" }, \
+	{ VM_FAULT_RETRY,		"RETRY" }, \
+	{ VM_FAULT_FALLBACK,		"FALLBACK" }, \
+	{ VM_FAULT_DONE_COW,		"DONE_COW" }
+
 /* Encode hstate index for a hwpoisoned large page */
 #define VM_FAULT_SET_HINDEX(x) ((x) << 12)
 #define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf)
diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
new file mode 100644
index 000000000000..58b0b5612683
--- /dev/null
+++ b/include/trace/events/fs_dax.h
@@ -0,0 +1,68 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM fs_dax
+
+#if !defined(_TRACE_FS_DAX_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_FS_DAX_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(dax_pmd_fault_class,
+	TP_PROTO(struct inode *inode, struct vm_area_struct *vma,
+		unsigned long address, unsigned int flags, pgoff_t pgoff,
+		pgoff_t max_pgoff, int result),
+	TP_ARGS(inode, vma, address, flags, pgoff, max_pgoff, result),
+	TP_STRUCT__entry(
+		__field(unsigned long, ino)
+		__field(unsigned long, vm_start)
+		__field(unsigned long, vm_end)
+		__field(unsigned long, vm_flags)
+		__field(unsigned long, address)
+		__field(pgoff_t, pgoff)
+		__field(pgoff_t, max_pgoff)
+		__field(dev_t, dev)
+		__field(unsigned int, flags)
+		__field(int, result)
+	),
+	TP_fast_assign(
+		__entry->dev = inode->i_sb->s_dev;
+		__entry->ino = inode->i_ino;
+		__entry->vm_start = vma->vm_start;
+		__entry->vm_end = vma->vm_end;
+		__entry->vm_flags = vma->vm_flags;
+		__entry->address = address;
+		__entry->flags = flags;
+		__entry->pgoff = pgoff;
+		__entry->max_pgoff = max_pgoff;
+		__entry->result = result;
+	),
+	TP_printk("dev %d:%d ino %#lx %s %s address %#lx vm_start "
+			"%#lx vm_end %#lx pgoff %#lx max_pgoff %#lx %s",
+		MAJOR(__entry->dev),
+		MINOR(__entry->dev),
+		__entry->ino,
+		__entry->vm_flags & VM_SHARED ? "shared" : "private",
+		__print_flags(__entry->flags, "|", FAULT_FLAG_TRACE),
+		__entry->address,
+		__entry->vm_start,
+		__entry->vm_end,
+		__entry->pgoff,
+		__entry->max_pgoff,
+		__print_flags(__entry->result, "|", VM_FAULT_RESULT_TRACE)
+	)
+)
+
+#define DEFINE_PMD_FAULT_EVENT(name) \
+DEFINE_EVENT(dax_pmd_fault_class, name, \
+	TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \
+		unsigned long address, unsigned int flags, pgoff_t pgoff, \
+		pgoff_t max_pgoff, int result), \
+	TP_ARGS(inode, vma, address, flags, pgoff, max_pgoff, result))
+
+DEFINE_PMD_FAULT_EVENT(dax_pmd_fault);
+DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done);
+
+
+#endif /* _TRACE_FS_DAX_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
-- 
cgit v1.2.3


From 27a7ffaccd915675c88045ba83493804a0267ab7 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Wed, 22 Feb 2017 15:40:00 -0800
Subject: dax: add tracepoints to dax_pmd_insert_mapping()

Add tracepoints to dax_pmd_insert_mapping(), following the same logging
conventions as the tracepoints in dax_iomap_pmd_fault().

Here is an example PMD fault showing the new tracepoints:

big-1504  [001] ....   326.960743: xfs_filemap_pmd_fault: dev 259:0 ino 0x1003

big-1504  [001] ....   326.960753: dax_pmd_fault: dev 259:0 ino 0x1003 shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200 max_pgoff 0x1400

big-1504  [001] ....   326.960981: dax_pmd_insert_mapping: dev 259:0 ino 0x1003 shared write address 0x10505000 length 0x200000 pfn 0x100600 DEV|MAP radix_entry 0xc000e

big-1504  [001] ....   326.960986: dax_pmd_fault_done: dev 259:0 ino 0x1003 shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200 max_pgoff 0x1400 NOPAGE

Link: http://lkml.kernel.org/r/1484085142-2297-6-git-send-email-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dax.c                      | 12 +++++++---
 include/linux/pfn_t.h         |  6 +++++
 include/trace/events/fs_dax.h | 51 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dax.c b/fs/dax.c
index a622961a717c..06c5dffaa7bc 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1262,15 +1262,16 @@ static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
 {
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct block_device *bdev = iomap->bdev;
+	struct inode *inode = mapping->host;
 	struct blk_dax_ctl dax = {
 		.sector = dax_iomap_sector(iomap, pos),
 		.size = PMD_SIZE,
 	};
 	long length = dax_map_atomic(bdev, &dax);
-	void *ret;
+	void *ret = NULL;
 
 	if (length < 0) /* dax_map_atomic() failed */
-		return VM_FAULT_FALLBACK;
+		goto fallback;
 	if (length < PMD_SIZE)
 		goto unmap_fallback;
 	if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
@@ -1283,13 +1284,18 @@ static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
 	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
 			RADIX_DAX_PMD);
 	if (IS_ERR(ret))
-		return VM_FAULT_FALLBACK;
+		goto fallback;
 	*entryp = ret;
 
+	trace_dax_pmd_insert_mapping(inode, vma, address, write, length,
+			dax.pfn, ret);
 	return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);
 
  unmap_fallback:
 	dax_unmap_atomic(bdev, &dax);
+fallback:
+	trace_dax_pmd_insert_mapping_fallback(inode, vma, address, write,
+			length, dax.pfn, ret);
 	return VM_FAULT_FALLBACK;
 }
 
diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h
index a3d90b9da18d..033fc7bbcefa 100644
--- a/include/linux/pfn_t.h
+++ b/include/linux/pfn_t.h
@@ -15,6 +15,12 @@
 #define PFN_DEV (1ULL << (BITS_PER_LONG_LONG - 3))
 #define PFN_MAP (1ULL << (BITS_PER_LONG_LONG - 4))
 
+#define PFN_FLAGS_TRACE \
+	{ PFN_SG_CHAIN,	"SG_CHAIN" }, \
+	{ PFN_SG_LAST,	"SG_LAST" }, \
+	{ PFN_DEV,	"DEV" }, \
+	{ PFN_MAP,	"MAP" }
+
 static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, u64 flags)
 {
 	pfn_t pfn_t = { .val = pfn | (flags & PFN_FLAGS_MASK), };
diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
index 43f1263131a4..c3b0aae216dc 100644
--- a/include/trace/events/fs_dax.h
+++ b/include/trace/events/fs_dax.h
@@ -104,6 +104,57 @@ DEFINE_EVENT(dax_pmd_load_hole_class, name, \
 DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole);
 DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole_fallback);
 
+DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class,
+	TP_PROTO(struct inode *inode, struct vm_area_struct *vma,
+		unsigned long address, int write, long length, pfn_t pfn,
+		void *radix_entry),
+	TP_ARGS(inode, vma, address, write, length, pfn, radix_entry),
+	TP_STRUCT__entry(
+		__field(unsigned long, ino)
+		__field(unsigned long, vm_flags)
+		__field(unsigned long, address)
+		__field(long, length)
+		__field(u64, pfn_val)
+		__field(void *, radix_entry)
+		__field(dev_t, dev)
+		__field(int, write)
+	),
+	TP_fast_assign(
+		__entry->dev = inode->i_sb->s_dev;
+		__entry->ino = inode->i_ino;
+		__entry->vm_flags = vma->vm_flags;
+		__entry->address = address;
+		__entry->write = write;
+		__entry->length = length;
+		__entry->pfn_val = pfn.val;
+		__entry->radix_entry = radix_entry;
+	),
+	TP_printk("dev %d:%d ino %#lx %s %s address %#lx length %#lx "
+			"pfn %#llx %s radix_entry %#lx",
+		MAJOR(__entry->dev),
+		MINOR(__entry->dev),
+		__entry->ino,
+		__entry->vm_flags & VM_SHARED ? "shared" : "private",
+		__entry->write ? "write" : "read",
+		__entry->address,
+		__entry->length,
+		__entry->pfn_val & ~PFN_FLAGS_MASK,
+		__print_flags_u64(__entry->pfn_val & PFN_FLAGS_MASK, "|",
+			PFN_FLAGS_TRACE),
+		(unsigned long)__entry->radix_entry
+	)
+)
+
+#define DEFINE_PMD_INSERT_MAPPING_EVENT(name) \
+DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \
+	TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \
+		unsigned long address, int write, long length, pfn_t pfn, \
+		void *radix_entry), \
+	TP_ARGS(inode, vma, address, write, length, pfn, radix_entry))
+
+DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping);
+DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback);
+
 #endif /* _TRACE_FS_DAX_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From d8a849e1bc123790bbbf1facba94452a3aef5736 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Wed, 22 Feb 2017 15:40:03 -0800
Subject: mm, dax: make pmd_fault() and friends be the same as fault()

Instead of passing in multiple parameters in the pmd_fault() handler,
a vmf can be passed in just like a fault() handler. This will simplify
code and remove the need for the actual pmd fault handlers to allocate a
vmf. Related functions are also modified to do the same.

[dave.jiang@intel.com: fix issue with xfs_tests stall when DAX option is off]
  Link: http://lkml.kernel.org/r/148469861071.195597.3619476895250028518.stgit@djiang5-desk3.ch.intel.com
Link: http://lkml.kernel.org/r/1484085142-2297-7-git-send-email-ross.zwisler@linux.intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/dax/dax.c             | 16 +++++++---------
 fs/dax.c                      | 28 +++++++++++-----------------
 fs/ext4/file.c                |  9 ++++-----
 fs/xfs/xfs_file.c             | 10 ++++------
 include/linux/dax.h           |  7 +++----
 include/linux/mm.h            |  3 +--
 include/trace/events/fs_dax.h | 15 +++++++--------
 mm/memory.c                   |  6 ++----
 8 files changed, 39 insertions(+), 55 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index ed758b74ddf0..a8833cc35697 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -473,10 +473,9 @@ static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 }
 
 static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
-		struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd,
-		unsigned int flags)
+		struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	unsigned long pmd_addr = addr & PMD_MASK;
+	unsigned long pmd_addr = vmf->address & PMD_MASK;
 	struct device *dev = &dax_dev->dev;
 	struct dax_region *dax_region;
 	phys_addr_t phys;
@@ -508,23 +507,22 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
 
 	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
 
-	return vmf_insert_pfn_pmd(vma, addr, pmd, pfn,
-			flags & FAULT_FLAG_WRITE);
+	return vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
+			vmf->flags & FAULT_FLAG_WRITE);
 }
 
-static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
-		pmd_t *pmd, unsigned int flags)
+static int dax_dev_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	int rc;
 	struct file *filp = vma->vm_file;
 	struct dax_dev *dax_dev = filp->private_data;
 
 	dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
-			current->comm, (flags & FAULT_FLAG_WRITE)
+			current->comm, (vmf->flags & FAULT_FLAG_WRITE)
 			? "write" : "read", vma->vm_start, vma->vm_end);
 
 	rcu_read_lock();
-	rc = __dax_dev_pmd_fault(dax_dev, vma, addr, pmd, flags);
+	rc = __dax_dev_pmd_fault(dax_dev, vma, vmf);
 	rcu_read_unlock();
 
 	return rc;
diff --git a/fs/dax.c b/fs/dax.c
index 06c5dffaa7bc..01fdbc86ee8c 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1340,18 +1340,17 @@ fallback:
 	return VM_FAULT_FALLBACK;
 }
 
-int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
-		pmd_t *pmd, unsigned int flags, struct iomap_ops *ops)
+int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+		struct iomap_ops *ops)
 {
 	struct address_space *mapping = vma->vm_file->f_mapping;
-	unsigned long pmd_addr = address & PMD_MASK;
-	bool write = flags & FAULT_FLAG_WRITE;
+	unsigned long pmd_addr = vmf->address & PMD_MASK;
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
 	unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
 	struct inode *inode = mapping->host;
 	int result = VM_FAULT_FALLBACK;
 	struct iomap iomap = { 0 };
 	pgoff_t max_pgoff, pgoff;
-	struct vm_fault vmf;
 	void *entry;
 	loff_t pos;
 	int error;
@@ -1364,7 +1363,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	pgoff = linear_page_index(vma, pmd_addr);
 	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
 
-	trace_dax_pmd_fault(inode, vma, address, flags, pgoff, max_pgoff, 0);
+	trace_dax_pmd_fault(inode, vma, vmf, max_pgoff, 0);
 
 	/* Fall back to PTEs if we're going to COW */
 	if (write && !(vma->vm_flags & VM_SHARED))
@@ -1408,21 +1407,17 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	if (IS_ERR(entry))
 		goto finish_iomap;
 
-	vmf.pgoff = pgoff;
-	vmf.flags = flags;
-	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
-
 	switch (iomap.type) {
 	case IOMAP_MAPPED:
-		result = dax_pmd_insert_mapping(vma, pmd, &vmf, address,
-				&iomap, pos, write, &entry);
+		result = dax_pmd_insert_mapping(vma, vmf->pmd, vmf,
+				vmf->address, &iomap, pos, write, &entry);
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (WARN_ON_ONCE(write))
 			goto unlock_entry;
-		result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
-				&entry);
+		result = dax_pmd_load_hole(vma, vmf->pmd, vmf, vmf->address,
+				&iomap, &entry);
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -1448,12 +1443,11 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	}
  fallback:
 	if (result == VM_FAULT_FALLBACK) {
-		split_huge_pmd(vma, pmd, address);
+		split_huge_pmd(vma, vmf->pmd, vmf->address);
 		count_vm_event(THP_FAULT_FALLBACK);
 	}
 out:
-	trace_dax_pmd_fault_done(inode, vma, address, flags, pgoff, max_pgoff,
-			result);
+	trace_dax_pmd_fault_done(inode, vma, vmf, max_pgoff, result);
 	return result;
 }
 EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 87e11dfe3cde..d3f589b3602c 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -273,21 +273,20 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	return result;
 }
 
-static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
-						pmd_t *pmd, unsigned int flags)
+static int
+ext4_dax_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	int result;
 	struct inode *inode = file_inode(vma->vm_file);
 	struct super_block *sb = inode->i_sb;
-	bool write = flags & FAULT_FLAG_WRITE;
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
 
 	if (write) {
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
 	}
 	down_read(&EXT4_I(inode)->i_mmap_sem);
-	result = dax_iomap_pmd_fault(vma, addr, pmd, flags,
-				     &ext4_iomap_ops);
+	result = dax_iomap_pmd_fault(vma, vmf, &ext4_iomap_ops);
 	up_read(&EXT4_I(inode)->i_mmap_sem);
 	if (write)
 		sb_end_pagefault(sb);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index bbb9eb6811b2..44a8d2356e31 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1432,9 +1432,7 @@ xfs_filemap_fault(
 STATIC int
 xfs_filemap_pmd_fault(
 	struct vm_area_struct	*vma,
-	unsigned long		addr,
-	pmd_t			*pmd,
-	unsigned int		flags)
+	struct vm_fault		*vmf)
 {
 	struct inode		*inode = file_inode(vma->vm_file);
 	struct xfs_inode	*ip = XFS_I(inode);
@@ -1445,16 +1443,16 @@ xfs_filemap_pmd_fault(
 
 	trace_xfs_filemap_pmd_fault(ip);
 
-	if (flags & FAULT_FLAG_WRITE) {
+	if (vmf->flags & FAULT_FLAG_WRITE) {
 		sb_start_pagefault(inode->i_sb);
 		file_update_time(vma->vm_file);
 	}
 
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &xfs_iomap_ops);
+	ret = dax_iomap_pmd_fault(vma, vmf, &xfs_iomap_ops);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
-	if (flags & FAULT_FLAG_WRITE)
+	if (vmf->flags & FAULT_FLAG_WRITE)
 		sb_end_pagefault(inode->i_sb);
 
 	return ret;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 24ad71173995..a829fee2b42b 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -71,16 +71,15 @@ static inline unsigned int dax_radix_order(void *entry)
 		return PMD_SHIFT - PAGE_SHIFT;
 	return 0;
 }
-int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
-		pmd_t *pmd, unsigned int flags, struct iomap_ops *ops);
+int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+		struct iomap_ops *ops);
 #else
 static inline unsigned int dax_radix_order(void *entry)
 {
 	return 0;
 }
 static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma,
-		unsigned long address, pmd_t *pmd, unsigned int flags,
-		struct iomap_ops *ops)
+		struct vm_fault *vmf, struct iomap_ops *ops)
 {
 	return VM_FAULT_FALLBACK;
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d22f7837ad90..0961e95e904e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -351,8 +351,7 @@ struct vm_operations_struct {
 	void (*close)(struct vm_area_struct * area);
 	int (*mremap)(struct vm_area_struct * area);
 	int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
-	int (*pmd_fault)(struct vm_area_struct *, unsigned long address,
-						pmd_t *, unsigned int flags);
+	int (*pmd_fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
 	void (*map_pages)(struct vm_fault *vmf,
 			pgoff_t start_pgoff, pgoff_t end_pgoff);
 
diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
index c3b0aae216dc..a98665bfb38f 100644
--- a/include/trace/events/fs_dax.h
+++ b/include/trace/events/fs_dax.h
@@ -8,9 +8,8 @@
 
 DECLARE_EVENT_CLASS(dax_pmd_fault_class,
 	TP_PROTO(struct inode *inode, struct vm_area_struct *vma,
-		unsigned long address, unsigned int flags, pgoff_t pgoff,
-		pgoff_t max_pgoff, int result),
-	TP_ARGS(inode, vma, address, flags, pgoff, max_pgoff, result),
+		struct vm_fault *vmf, pgoff_t max_pgoff, int result),
+	TP_ARGS(inode, vma, vmf, max_pgoff, result),
 	TP_STRUCT__entry(
 		__field(unsigned long, ino)
 		__field(unsigned long, vm_start)
@@ -29,9 +28,9 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class,
 		__entry->vm_start = vma->vm_start;
 		__entry->vm_end = vma->vm_end;
 		__entry->vm_flags = vma->vm_flags;
-		__entry->address = address;
-		__entry->flags = flags;
-		__entry->pgoff = pgoff;
+		__entry->address = vmf->address;
+		__entry->flags = vmf->flags;
+		__entry->pgoff = vmf->pgoff;
 		__entry->max_pgoff = max_pgoff;
 		__entry->result = result;
 	),
@@ -54,9 +53,9 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class,
 #define DEFINE_PMD_FAULT_EVENT(name) \
 DEFINE_EVENT(dax_pmd_fault_class, name, \
 	TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \
-		unsigned long address, unsigned int flags, pgoff_t pgoff, \
+		struct vm_fault *vmf, \
 		pgoff_t max_pgoff, int result), \
-	TP_ARGS(inode, vma, address, flags, pgoff, max_pgoff, result))
+	TP_ARGS(inode, vma, vmf, max_pgoff, result))
 
 DEFINE_PMD_FAULT_EVENT(dax_pmd_fault);
 DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done);
diff --git a/mm/memory.c b/mm/memory.c
index 6bf2b471e30c..2376f8528800 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3475,8 +3475,7 @@ static int create_huge_pmd(struct vm_fault *vmf)
 	if (vma_is_anonymous(vma))
 		return do_huge_pmd_anonymous_page(vmf);
 	if (vma->vm_ops->pmd_fault)
-		return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd,
-				vmf->flags);
+		return vma->vm_ops->pmd_fault(vma, vmf);
 	return VM_FAULT_FALLBACK;
 }
 
@@ -3485,8 +3484,7 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
 	if (vma_is_anonymous(vmf->vma))
 		return do_huge_pmd_wp_page(vmf, orig_pmd);
 	if (vmf->vma->vm_ops->pmd_fault)
-		return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address,
-						   vmf->pmd, vmf->flags);
+		return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf);
 
 	/* COW handled on pte level: split pmd */
 	VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
-- 
cgit v1.2.3


From f42003917b4569a2f4f0c79c35e1e3df2859f81a Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Wed, 22 Feb 2017 15:40:06 -0800
Subject: mm, dax: change pmd_fault() to take only vmf parameter

pmd_fault() and related functions really only need the vmf parameter since
the additional parameters are all included in the vmf struct.  Remove the
additional parameter and simplify pmd_fault() and friends.

Link: http://lkml.kernel.org/r/1484085142-2297-8-git-send-email-ross.zwisler@linux.intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/dax/dax.c             | 18 +++++++--------
 fs/dax.c                      | 54 ++++++++++++++++++++-----------------------
 fs/ext4/file.c                |  8 +++----
 fs/xfs/xfs_file.c             |  7 +++---
 include/linux/dax.h           |  7 +++---
 include/linux/mm.h            |  2 +-
 include/trace/events/fs_dax.h | 54 ++++++++++++++++++++-----------------------
 mm/memory.c                   |  9 ++++----
 8 files changed, 74 insertions(+), 85 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index a8833cc35697..18e9875f6277 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -472,8 +472,7 @@ static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	return rc;
 }
 
-static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
-		struct vm_area_struct *vma, struct vm_fault *vmf)
+static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 {
 	unsigned long pmd_addr = vmf->address & PMD_MASK;
 	struct device *dev = &dax_dev->dev;
@@ -482,7 +481,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
 	pgoff_t pgoff;
 	pfn_t pfn;
 
-	if (check_vma(dax_dev, vma, __func__))
+	if (check_vma(dax_dev, vmf->vma, __func__))
 		return VM_FAULT_SIGBUS;
 
 	dax_region = dax_dev->region;
@@ -497,7 +496,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
 		return VM_FAULT_SIGBUS;
 	}
 
-	pgoff = linear_page_index(vma, pmd_addr);
+	pgoff = linear_page_index(vmf->vma, pmd_addr);
 	phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE);
 	if (phys == -1) {
 		dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__,
@@ -507,22 +506,23 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
 
 	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
 
-	return vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
+	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn,
 			vmf->flags & FAULT_FLAG_WRITE);
 }
 
-static int dax_dev_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int dax_dev_pmd_fault(struct vm_fault *vmf)
 {
 	int rc;
-	struct file *filp = vma->vm_file;
+	struct file *filp = vmf->vma->vm_file;
 	struct dax_dev *dax_dev = filp->private_data;
 
 	dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
 			current->comm, (vmf->flags & FAULT_FLAG_WRITE)
-			? "write" : "read", vma->vm_start, vma->vm_end);
+			? "write" : "read",
+			vmf->vma->vm_start, vmf->vma->vm_end);
 
 	rcu_read_lock();
-	rc = __dax_dev_pmd_fault(dax_dev, vma, vmf);
+	rc = __dax_dev_pmd_fault(dax_dev, vmf);
 	rcu_read_unlock();
 
 	return rc;
diff --git a/fs/dax.c b/fs/dax.c
index 01fdbc86ee8c..d800197aba34 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1256,11 +1256,10 @@ EXPORT_SYMBOL_GPL(dax_iomap_fault);
  */
 #define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
 
-static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
-		struct vm_fault *vmf, unsigned long address,
-		struct iomap *iomap, loff_t pos, bool write, void **entryp)
+static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
+		loff_t pos, void **entryp)
 {
-	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	struct block_device *bdev = iomap->bdev;
 	struct inode *inode = mapping->host;
 	struct blk_dax_ctl dax = {
@@ -1287,31 +1286,30 @@ static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
 		goto fallback;
 	*entryp = ret;
 
-	trace_dax_pmd_insert_mapping(inode, vma, address, write, length,
-			dax.pfn, ret);
-	return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);
+	trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret);
+	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
+			dax.pfn, vmf->flags & FAULT_FLAG_WRITE);
 
  unmap_fallback:
 	dax_unmap_atomic(bdev, &dax);
 fallback:
-	trace_dax_pmd_insert_mapping_fallback(inode, vma, address, write,
-			length, dax.pfn, ret);
+	trace_dax_pmd_insert_mapping_fallback(inode, vmf, length,
+			dax.pfn, ret);
 	return VM_FAULT_FALLBACK;
 }
 
-static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
-		struct vm_fault *vmf, unsigned long address,
-		struct iomap *iomap, void **entryp)
+static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
+		void **entryp)
 {
-	struct address_space *mapping = vma->vm_file->f_mapping;
-	unsigned long pmd_addr = address & PMD_MASK;
+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+	unsigned long pmd_addr = vmf->address & PMD_MASK;
 	struct inode *inode = mapping->host;
 	struct page *zero_page;
 	void *ret = NULL;
 	spinlock_t *ptl;
 	pmd_t pmd_entry;
 
-	zero_page = mm_get_huge_zero_page(vma->vm_mm);
+	zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
 
 	if (unlikely(!zero_page))
 		goto fallback;
@@ -1322,27 +1320,27 @@ static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
 		goto fallback;
 	*entryp = ret;
 
-	ptl = pmd_lock(vma->vm_mm, pmd);
-	if (!pmd_none(*pmd)) {
+	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
+	if (!pmd_none(*(vmf->pmd))) {
 		spin_unlock(ptl);
 		goto fallback;
 	}
 
-	pmd_entry = mk_pmd(zero_page, vma->vm_page_prot);
+	pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
 	pmd_entry = pmd_mkhuge(pmd_entry);
-	set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry);
+	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
 	spin_unlock(ptl);
-	trace_dax_pmd_load_hole(inode, vma, address, zero_page, ret);
+	trace_dax_pmd_load_hole(inode, vmf, zero_page, ret);
 	return VM_FAULT_NOPAGE;
 
 fallback:
-	trace_dax_pmd_load_hole_fallback(inode, vma, address, zero_page, ret);
+	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret);
 	return VM_FAULT_FALLBACK;
 }
 
-int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-		struct iomap_ops *ops)
+int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	unsigned long pmd_addr = vmf->address & PMD_MASK;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
@@ -1363,7 +1361,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	pgoff = linear_page_index(vma, pmd_addr);
 	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
 
-	trace_dax_pmd_fault(inode, vma, vmf, max_pgoff, 0);
+	trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
 
 	/* Fall back to PTEs if we're going to COW */
 	if (write && !(vma->vm_flags & VM_SHARED))
@@ -1409,15 +1407,13 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
 	switch (iomap.type) {
 	case IOMAP_MAPPED:
-		result = dax_pmd_insert_mapping(vma, vmf->pmd, vmf,
-				vmf->address, &iomap, pos, write, &entry);
+		result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (WARN_ON_ONCE(write))
 			goto unlock_entry;
-		result = dax_pmd_load_hole(vma, vmf->pmd, vmf, vmf->address,
-				&iomap, &entry);
+		result = dax_pmd_load_hole(vmf, &iomap, &entry);
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -1447,7 +1443,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		count_vm_event(THP_FAULT_FALLBACK);
 	}
 out:
-	trace_dax_pmd_fault_done(inode, vma, vmf, max_pgoff, result);
+	trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
 	return result;
 }
 EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d3f589b3602c..13021a054fc0 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -274,19 +274,19 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 }
 
 static int
-ext4_dax_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+ext4_dax_pmd_fault(struct vm_fault *vmf)
 {
 	int result;
-	struct inode *inode = file_inode(vma->vm_file);
+	struct inode *inode = file_inode(vmf->vma->vm_file);
 	struct super_block *sb = inode->i_sb;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
 
 	if (write) {
 		sb_start_pagefault(sb);
-		file_update_time(vma->vm_file);
+		file_update_time(vmf->vma->vm_file);
 	}
 	down_read(&EXT4_I(inode)->i_mmap_sem);
-	result = dax_iomap_pmd_fault(vma, vmf, &ext4_iomap_ops);
+	result = dax_iomap_pmd_fault(vmf, &ext4_iomap_ops);
 	up_read(&EXT4_I(inode)->i_mmap_sem);
 	if (write)
 		sb_end_pagefault(sb);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 44a8d2356e31..9d8440b07b53 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1431,10 +1431,9 @@ xfs_filemap_fault(
  */
 STATIC int
 xfs_filemap_pmd_fault(
-	struct vm_area_struct	*vma,
 	struct vm_fault		*vmf)
 {
-	struct inode		*inode = file_inode(vma->vm_file);
+	struct inode		*inode = file_inode(vmf->vma->vm_file);
 	struct xfs_inode	*ip = XFS_I(inode);
 	int			ret;
 
@@ -1445,11 +1444,11 @@ xfs_filemap_pmd_fault(
 
 	if (vmf->flags & FAULT_FLAG_WRITE) {
 		sb_start_pagefault(inode->i_sb);
-		file_update_time(vma->vm_file);
+		file_update_time(vmf->vma->vm_file);
 	}
 
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	ret = dax_iomap_pmd_fault(vma, vmf, &xfs_iomap_ops);
+	ret = dax_iomap_pmd_fault(vmf, &xfs_iomap_ops);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	if (vmf->flags & FAULT_FLAG_WRITE)
diff --git a/include/linux/dax.h b/include/linux/dax.h
index a829fee2b42b..c1bd6ab5e974 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -71,15 +71,14 @@ static inline unsigned int dax_radix_order(void *entry)
 		return PMD_SHIFT - PAGE_SHIFT;
 	return 0;
 }
-int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-		struct iomap_ops *ops);
+int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops);
 #else
 static inline unsigned int dax_radix_order(void *entry)
 {
 	return 0;
 }
-static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma,
-		struct vm_fault *vmf, struct iomap_ops *ops)
+static inline int dax_iomap_pmd_fault(struct vm_fault *vmf,
+		struct iomap_ops *ops)
 {
 	return VM_FAULT_FALLBACK;
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0961e95e904e..3787f047a098 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -351,7 +351,7 @@ struct vm_operations_struct {
 	void (*close)(struct vm_area_struct * area);
 	int (*mremap)(struct vm_area_struct * area);
 	int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
-	int (*pmd_fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
+	int (*pmd_fault)(struct vm_fault *vmf);
 	void (*map_pages)(struct vm_fault *vmf,
 			pgoff_t start_pgoff, pgoff_t end_pgoff);
 
diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
index a98665bfb38f..c566ddc87f73 100644
--- a/include/trace/events/fs_dax.h
+++ b/include/trace/events/fs_dax.h
@@ -7,9 +7,9 @@
 #include <linux/tracepoint.h>
 
 DECLARE_EVENT_CLASS(dax_pmd_fault_class,
-	TP_PROTO(struct inode *inode, struct vm_area_struct *vma,
-		struct vm_fault *vmf, pgoff_t max_pgoff, int result),
-	TP_ARGS(inode, vma, vmf, max_pgoff, result),
+	TP_PROTO(struct inode *inode, struct vm_fault *vmf,
+		pgoff_t max_pgoff, int result),
+	TP_ARGS(inode, vmf, max_pgoff, result),
 	TP_STRUCT__entry(
 		__field(unsigned long, ino)
 		__field(unsigned long, vm_start)
@@ -25,9 +25,9 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class,
 	TP_fast_assign(
 		__entry->dev = inode->i_sb->s_dev;
 		__entry->ino = inode->i_ino;
-		__entry->vm_start = vma->vm_start;
-		__entry->vm_end = vma->vm_end;
-		__entry->vm_flags = vma->vm_flags;
+		__entry->vm_start = vmf->vma->vm_start;
+		__entry->vm_end = vmf->vma->vm_end;
+		__entry->vm_flags = vmf->vma->vm_flags;
 		__entry->address = vmf->address;
 		__entry->flags = vmf->flags;
 		__entry->pgoff = vmf->pgoff;
@@ -52,19 +52,18 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class,
 
 #define DEFINE_PMD_FAULT_EVENT(name) \
 DEFINE_EVENT(dax_pmd_fault_class, name, \
-	TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \
-		struct vm_fault *vmf, \
+	TP_PROTO(struct inode *inode, struct vm_fault *vmf, \
 		pgoff_t max_pgoff, int result), \
-	TP_ARGS(inode, vma, vmf, max_pgoff, result))
+	TP_ARGS(inode, vmf, max_pgoff, result))
 
 DEFINE_PMD_FAULT_EVENT(dax_pmd_fault);
 DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done);
 
 DECLARE_EVENT_CLASS(dax_pmd_load_hole_class,
-	TP_PROTO(struct inode *inode, struct vm_area_struct *vma,
-		unsigned long address, struct page *zero_page,
+	TP_PROTO(struct inode *inode, struct vm_fault *vmf,
+		struct page *zero_page,
 		void *radix_entry),
-	TP_ARGS(inode, vma, address, zero_page, radix_entry),
+	TP_ARGS(inode, vmf, zero_page, radix_entry),
 	TP_STRUCT__entry(
 		__field(unsigned long, ino)
 		__field(unsigned long, vm_flags)
@@ -76,8 +75,8 @@ DECLARE_EVENT_CLASS(dax_pmd_load_hole_class,
 	TP_fast_assign(
 		__entry->dev = inode->i_sb->s_dev;
 		__entry->ino = inode->i_ino;
-		__entry->vm_flags = vma->vm_flags;
-		__entry->address = address;
+		__entry->vm_flags = vmf->vma->vm_flags;
+		__entry->address = vmf->address;
 		__entry->zero_page = zero_page;
 		__entry->radix_entry = radix_entry;
 	),
@@ -95,19 +94,17 @@ DECLARE_EVENT_CLASS(dax_pmd_load_hole_class,
 
 #define DEFINE_PMD_LOAD_HOLE_EVENT(name) \
 DEFINE_EVENT(dax_pmd_load_hole_class, name, \
-	TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \
-		unsigned long address, struct page *zero_page, \
-		void *radix_entry), \
-	TP_ARGS(inode, vma, address, zero_page, radix_entry))
+	TP_PROTO(struct inode *inode, struct vm_fault *vmf, \
+		struct page *zero_page, void *radix_entry), \
+	TP_ARGS(inode, vmf, zero_page, radix_entry))
 
 DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole);
 DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole_fallback);
 
 DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class,
-	TP_PROTO(struct inode *inode, struct vm_area_struct *vma,
-		unsigned long address, int write, long length, pfn_t pfn,
-		void *radix_entry),
-	TP_ARGS(inode, vma, address, write, length, pfn, radix_entry),
+	TP_PROTO(struct inode *inode, struct vm_fault *vmf,
+		long length, pfn_t pfn, void *radix_entry),
+	TP_ARGS(inode, vmf, length, pfn, radix_entry),
 	TP_STRUCT__entry(
 		__field(unsigned long, ino)
 		__field(unsigned long, vm_flags)
@@ -121,9 +118,9 @@ DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class,
 	TP_fast_assign(
 		__entry->dev = inode->i_sb->s_dev;
 		__entry->ino = inode->i_ino;
-		__entry->vm_flags = vma->vm_flags;
-		__entry->address = address;
-		__entry->write = write;
+		__entry->vm_flags = vmf->vma->vm_flags;
+		__entry->address = vmf->address;
+		__entry->write = vmf->flags & FAULT_FLAG_WRITE;
 		__entry->length = length;
 		__entry->pfn_val = pfn.val;
 		__entry->radix_entry = radix_entry;
@@ -146,10 +143,9 @@ DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class,
 
 #define DEFINE_PMD_INSERT_MAPPING_EVENT(name) \
 DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \
-	TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \
-		unsigned long address, int write, long length, pfn_t pfn, \
-		void *radix_entry), \
-	TP_ARGS(inode, vma, address, write, length, pfn, radix_entry))
+	TP_PROTO(struct inode *inode, struct vm_fault *vmf, \
+		long length, pfn_t pfn, void *radix_entry), \
+	TP_ARGS(inode, vmf, length, pfn, radix_entry))
 
 DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping);
 DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback);
diff --git a/mm/memory.c b/mm/memory.c
index 2376f8528800..ececdc4a2892 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3471,11 +3471,10 @@ out:
 
 static int create_huge_pmd(struct vm_fault *vmf)
 {
-	struct vm_area_struct *vma = vmf->vma;
-	if (vma_is_anonymous(vma))
+	if (vma_is_anonymous(vmf->vma))
 		return do_huge_pmd_anonymous_page(vmf);
-	if (vma->vm_ops->pmd_fault)
-		return vma->vm_ops->pmd_fault(vma, vmf);
+	if (vmf->vma->vm_ops->pmd_fault)
+		return vmf->vma->vm_ops->pmd_fault(vmf);
 	return VM_FAULT_FALLBACK;
 }
 
@@ -3484,7 +3483,7 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
 	if (vma_is_anonymous(vmf->vma))
 		return do_huge_pmd_wp_page(vmf, orig_pmd);
 	if (vmf->vma->vm_ops->pmd_fault)
-		return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf);
+		return vmf->vma->vm_ops->pmd_fault(vmf);
 
 	/* COW handled on pte level: split pmd */
 	VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
-- 
cgit v1.2.3


From bf5eb3de3847ebcfd1fea7bc14072ef9f21d4e8d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 22 Feb 2017 15:41:11 -0800
Subject: slub: separate out sysfs_slab_release() from sysfs_slab_remove()

Separate out slub sysfs removal and release, and call the former earlier
from __kmem_cache_shutdown().  There's no reason to defer sysfs removal
through RCU and this will later allow us to remove sysfs files way
earlier during memory cgroup offline instead of release.

Link: http://lkml.kernel.org/r/20170117235411.9408-3-tj@kernel.org
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slub_def.h |  4 ++--
 mm/slab_common.c         |  2 +-
 mm/slub.c                | 12 ++++++++++--
 3 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 75f56c2ef2d4..07ef550c6627 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -113,9 +113,9 @@ struct kmem_cache {
 
 #ifdef CONFIG_SYSFS
 #define SLAB_SUPPORTS_SYSFS
-void sysfs_slab_remove(struct kmem_cache *);
+void sysfs_slab_release(struct kmem_cache *);
 #else
-static inline void sysfs_slab_remove(struct kmem_cache *s)
+static inline void sysfs_slab_release(struct kmem_cache *s)
 {
 }
 #endif
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 4a999d749d2b..e6311b2dbba6 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -483,7 +483,7 @@ static void release_caches(struct list_head *release, bool need_rcu_barrier)
 
 	list_for_each_entry_safe(s, s2, release, list) {
 #ifdef SLAB_SUPPORTS_SYSFS
-		sysfs_slab_remove(s);
+		sysfs_slab_release(s);
 #else
 		slab_kmem_cache_release(s);
 #endif
diff --git a/mm/slub.c b/mm/slub.c
index 6de08005d9cd..caac5456f0ec 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -214,11 +214,13 @@ enum track_item { TRACK_ALLOC, TRACK_FREE };
 static int sysfs_slab_add(struct kmem_cache *);
 static int sysfs_slab_alias(struct kmem_cache *, const char *);
 static void memcg_propagate_slab_attrs(struct kmem_cache *s);
+static void sysfs_slab_remove(struct kmem_cache *s);
 #else
 static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
 static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
 							{ return 0; }
 static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
+static inline void sysfs_slab_remove(struct kmem_cache *s) { }
 #endif
 
 static inline void stat(const struct kmem_cache *s, enum stat_item si)
@@ -3687,6 +3689,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
 		if (n->nr_partial || slabs_node(s, node))
 			return 1;
 	}
+	sysfs_slab_remove(s);
 	return 0;
 }
 
@@ -5637,7 +5640,7 @@ out_del_kobj:
 	goto out;
 }
 
-void sysfs_slab_remove(struct kmem_cache *s)
+static void sysfs_slab_remove(struct kmem_cache *s)
 {
 	if (slab_state < FULL)
 		/*
@@ -5651,7 +5654,12 @@ void sysfs_slab_remove(struct kmem_cache *s)
 #endif
 	kobject_uevent(&s->kobj, KOBJ_REMOVE);
 	kobject_del(&s->kobj);
-	kobject_put(&s->kobj);
+}
+
+void sysfs_slab_release(struct kmem_cache *s)
+{
+	if (slab_state >= FULL)
+		kobject_put(&s->kobj);
 }
 
 /*
-- 
cgit v1.2.3


From 9eeadc8b6e0e31f9aea1f8886ef472f62c2b7f55 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 22 Feb 2017 15:41:17 -0800
Subject: slab: reorganize memcg_cache_params

We're going to change how memcg caches are iterated.  In preparation,
clean up and reorganize memcg_cache_params.

* The shared ->list is replaced by ->children in root and
  ->children_node in children.

* ->is_root_cache is removed.  Instead ->root_cache is moved out of
  the child union and now used by both root and children.  NULL
  indicates root cache.  Non-NULL a memcg one.

This patch doesn't cause any observable behavior changes.

Link: http://lkml.kernel.org/r/20170117235411.9408-5-tj@kernel.org
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h | 33 ++++++++++++++++++++++++---------
 mm/slab.h            |  6 +++---
 mm/slab_common.c     | 25 +++++++++++++------------
 3 files changed, 40 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 4c5363566815..1f611ba00f1d 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -545,22 +545,37 @@ struct memcg_cache_array {
  * array to be accessed without taking any locks, on relocation we free the old
  * version only after a grace period.
  *
- * Child caches will hold extra metadata needed for its operation. Fields are:
+ * Root and child caches hold different metadata.
  *
- * @memcg: pointer to the memcg this cache belongs to
- * @root_cache: pointer to the global, root cache, this cache was derived from
+ * @root_cache:	Common to root and child caches.  NULL for root, pointer to
+ *		the root cache for children.
  *
- * Both root and child caches of the same kind are linked into a list chained
- * through @list.
+ * The following fields are specific to root caches.
+ *
+ * @memcg_caches: kmemcg ID indexed table of child caches.  This table is
+ *		used to index child cachces during allocation and cleared
+ *		early during shutdown.
+ *
+ * @children:	List of all child caches.  While the child caches are also
+ *		reachable through @memcg_caches, a child cache remains on
+ *		this list until it is actually destroyed.
+ *
+ * The following fields are specific to child caches.
+ *
+ * @memcg:	Pointer to the memcg this cache belongs to.
+ *
+ * @children_node: List node for @root_cache->children list.
  */
 struct memcg_cache_params {
-	bool is_root_cache;
-	struct list_head list;
+	struct kmem_cache *root_cache;
 	union {
-		struct memcg_cache_array __rcu *memcg_caches;
+		struct {
+			struct memcg_cache_array __rcu *memcg_caches;
+			struct list_head children;
+		};
 		struct {
 			struct mem_cgroup *memcg;
-			struct kmem_cache *root_cache;
+			struct list_head children_node;
 		};
 	};
 };
diff --git a/mm/slab.h b/mm/slab.h
index d07563f37f33..3ed3336883ed 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -206,12 +206,12 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
  * slab_mutex.
  */
 #define for_each_memcg_cache(iter, root) \
-	list_for_each_entry(iter, &(root)->memcg_params.list, \
-			    memcg_params.list)
+	list_for_each_entry(iter, &(root)->memcg_params.children, \
+			    memcg_params.children_node)
 
 static inline bool is_root_cache(struct kmem_cache *s)
 {
-	return s->memcg_params.is_root_cache;
+	return !s->memcg_params.root_cache;
 }
 
 static inline bool slab_equal_or_root(struct kmem_cache *s,
diff --git a/mm/slab_common.c b/mm/slab_common.c
index ac469c815587..c3885032dbce 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -140,9 +140,9 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
 #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
 void slab_init_memcg_params(struct kmem_cache *s)
 {
-	s->memcg_params.is_root_cache = true;
-	INIT_LIST_HEAD(&s->memcg_params.list);
+	s->memcg_params.root_cache = NULL;
 	RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL);
+	INIT_LIST_HEAD(&s->memcg_params.children);
 }
 
 static int init_memcg_params(struct kmem_cache *s,
@@ -150,10 +150,10 @@ static int init_memcg_params(struct kmem_cache *s,
 {
 	struct memcg_cache_array *arr;
 
-	if (memcg) {
-		s->memcg_params.is_root_cache = false;
-		s->memcg_params.memcg = memcg;
+	if (root_cache) {
 		s->memcg_params.root_cache = root_cache;
+		s->memcg_params.memcg = memcg;
+		INIT_LIST_HEAD(&s->memcg_params.children_node);
 		return 0;
 	}
 
@@ -223,7 +223,7 @@ int memcg_update_all_caches(int num_memcgs)
 
 static void unlink_memcg_cache(struct kmem_cache *s)
 {
-	list_del(&s->memcg_params.list);
+	list_del(&s->memcg_params.children_node);
 }
 #else
 static inline int init_memcg_params(struct kmem_cache *s,
@@ -594,7 +594,8 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
 		goto out_unlock;
 	}
 
-	list_add(&s->memcg_params.list, &root_cache->memcg_params.list);
+	list_add(&s->memcg_params.children_node,
+		 &root_cache->memcg_params.children);
 
 	/*
 	 * Since readers won't lock (see cache_from_memcg_idx()), we need a
@@ -690,7 +691,7 @@ static int shutdown_memcg_caches(struct kmem_cache *s)
 			 * list so as not to try to destroy it for a second
 			 * time while iterating over inactive caches below.
 			 */
-			list_move(&c->memcg_params.list, &busy);
+			list_move(&c->memcg_params.children_node, &busy);
 		else
 			/*
 			 * The cache is empty and will be destroyed soon. Clear
@@ -705,17 +706,17 @@ static int shutdown_memcg_caches(struct kmem_cache *s)
 	 * Second, shutdown all caches left from memory cgroups that are now
 	 * offline.
 	 */
-	list_for_each_entry_safe(c, c2, &s->memcg_params.list,
-				 memcg_params.list)
+	list_for_each_entry_safe(c, c2, &s->memcg_params.children,
+				 memcg_params.children_node)
 		shutdown_cache(c);
 
-	list_splice(&busy, &s->memcg_params.list);
+	list_splice(&busy, &s->memcg_params.children);
 
 	/*
 	 * A cache being destroyed must be empty. In particular, this means
 	 * that all per memcg caches attached to it must be empty too.
 	 */
-	if (!list_empty(&s->memcg_params.list))
+	if (!list_empty(&s->memcg_params.children))
 		return -EBUSY;
 	return 0;
 }
-- 
cgit v1.2.3


From bc2791f857e1984b7548d2a2de2ffb1a913dee62 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 22 Feb 2017 15:41:21 -0800
Subject: slab: link memcg kmem_caches on their associated memory cgroup

With kmem cgroup support enabled, kmem_caches can be created and
destroyed frequently and a great number of near empty kmem_caches can
accumulate if there are a lot of transient cgroups and the system is not
under memory pressure.  When memory reclaim starts under such
conditions, it can lead to consecutive deactivation and destruction of
many kmem_caches, easily hundreds of thousands on moderately large
systems, exposing scalability issues in the current slab management
code.  This is one of the patches to address the issue.

While a memcg kmem_cache is listed on its root cache's ->children list,
there is no direct way to iterate all kmem_caches which are assocaited
with a memory cgroup.  The only way to iterate them is walking all
caches while filtering out caches which don't match, which would be most
of them.

This makes memcg destruction operations O(N^2) where N is the total
number of slab caches which can be huge.  This combined with the
synchronous RCU operations can tie up a CPU and affect the whole machine
for many hours when memory reclaim triggers offlining and destruction of
the stale memcgs.

This patch adds mem_cgroup->kmem_caches list which goes through
memcg_cache_params->kmem_caches_node of all kmem_caches which are
associated with the memcg.  All memcg specific iterations, including
stat file access, are updated to use the new list instead.

Link: http://lkml.kernel.org/r/20170117235411.9408-6-tj@kernel.org
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Jay Vana <jsvana@fb.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  1 +
 include/linux/slab.h       |  3 +++
 mm/memcontrol.c            |  7 ++++---
 mm/slab.h                  |  3 +++
 mm/slab_common.c           | 36 +++++++++++++++++++++++++++++-------
 5 files changed, 40 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 254698856b8f..9fcece9be85d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -253,6 +253,7 @@ struct mem_cgroup {
         /* Index in the kmem_cache->memcg_params.memcg_caches array */
 	int kmemcg_id;
 	enum memcg_kmem_state kmem_state;
+	struct list_head kmem_caches;
 #endif
 
 	int last_scanned_node;
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 1f611ba00f1d..a0cc7a77cda2 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -565,6 +565,8 @@ struct memcg_cache_array {
  * @memcg:	Pointer to the memcg this cache belongs to.
  *
  * @children_node: List node for @root_cache->children list.
+ *
+ * @kmem_caches_node: List node for @memcg->kmem_caches list.
  */
 struct memcg_cache_params {
 	struct kmem_cache *root_cache;
@@ -576,6 +578,7 @@ struct memcg_cache_params {
 		struct {
 			struct mem_cgroup *memcg;
 			struct list_head children_node;
+			struct list_head kmem_caches_node;
 		};
 	};
 };
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b822e158b319..834d641dfa8c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2837,6 +2837,7 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
 	 */
 	memcg->kmemcg_id = memcg_id;
 	memcg->kmem_state = KMEM_ONLINE;
+	INIT_LIST_HEAD(&memcg->kmem_caches);
 
 	return 0;
 }
@@ -4002,9 +4003,9 @@ static struct cftype mem_cgroup_legacy_files[] = {
 #ifdef CONFIG_SLABINFO
 	{
 		.name = "kmem.slabinfo",
-		.seq_start = slab_start,
-		.seq_next = slab_next,
-		.seq_stop = slab_stop,
+		.seq_start = memcg_slab_start,
+		.seq_next = memcg_slab_next,
+		.seq_stop = memcg_slab_stop,
 		.seq_show = memcg_slab_show,
 	},
 #endif
diff --git a/mm/slab.h b/mm/slab.h
index 3ed3336883ed..a08f01016a3f 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -494,6 +494,9 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
 void *slab_start(struct seq_file *m, loff_t *pos);
 void *slab_next(struct seq_file *m, void *p, loff_t *pos);
 void slab_stop(struct seq_file *m, void *p);
+void *memcg_slab_start(struct seq_file *m, loff_t *pos);
+void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos);
+void memcg_slab_stop(struct seq_file *m, void *p);
 int memcg_slab_show(struct seq_file *m, void *p);
 
 void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index c3885032dbce..c3bbeddaeaaf 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -154,6 +154,7 @@ static int init_memcg_params(struct kmem_cache *s,
 		s->memcg_params.root_cache = root_cache;
 		s->memcg_params.memcg = memcg;
 		INIT_LIST_HEAD(&s->memcg_params.children_node);
+		INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node);
 		return 0;
 	}
 
@@ -224,6 +225,7 @@ int memcg_update_all_caches(int num_memcgs)
 static void unlink_memcg_cache(struct kmem_cache *s)
 {
 	list_del(&s->memcg_params.children_node);
+	list_del(&s->memcg_params.kmem_caches_node);
 }
 #else
 static inline int init_memcg_params(struct kmem_cache *s,
@@ -596,6 +598,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
 
 	list_add(&s->memcg_params.children_node,
 		 &root_cache->memcg_params.children);
+	list_add(&s->memcg_params.kmem_caches_node, &memcg->kmem_caches);
 
 	/*
 	 * Since readers won't lock (see cache_from_memcg_idx()), we need a
@@ -651,9 +654,8 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
 	get_online_mems();
 
 	mutex_lock(&slab_mutex);
-	list_for_each_entry_safe(s, s2, &slab_caches, list) {
-		if (is_root_cache(s) || s->memcg_params.memcg != memcg)
-			continue;
+	list_for_each_entry_safe(s, s2, &memcg->kmem_caches,
+				 memcg_params.kmem_caches_node) {
 		/*
 		 * The cgroup is about to be freed and therefore has no charges
 		 * left. Hence, all its caches must be empty by now.
@@ -1201,15 +1203,35 @@ static int slab_show(struct seq_file *m, void *p)
 }
 
 #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+void *memcg_slab_start(struct seq_file *m, loff_t *pos)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+
+	mutex_lock(&slab_mutex);
+	return seq_list_start(&memcg->kmem_caches, *pos);
+}
+
+void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+
+	return seq_list_next(p, &memcg->kmem_caches, pos);
+}
+
+void memcg_slab_stop(struct seq_file *m, void *p)
+{
+	mutex_unlock(&slab_mutex);
+}
+
 int memcg_slab_show(struct seq_file *m, void *p)
 {
-	struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
+	struct kmem_cache *s = list_entry(p, struct kmem_cache,
+					  memcg_params.kmem_caches_node);
 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
 
-	if (p == slab_caches.next)
+	if (p == memcg->kmem_caches.next)
 		print_slabinfo_header(m);
-	if (!is_root_cache(s) && s->memcg_params.memcg == memcg)
-		cache_show(s, m);
+	cache_show(s, m);
 	return 0;
 }
 #endif
-- 
cgit v1.2.3


From 510ded33e075c2bd662b1efab0110f4240325fc9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 22 Feb 2017 15:41:24 -0800
Subject: slab: implement slab_root_caches list

With kmem cgroup support enabled, kmem_caches can be created and
destroyed frequently and a great number of near empty kmem_caches can
accumulate if there are a lot of transient cgroups and the system is not
under memory pressure.  When memory reclaim starts under such
conditions, it can lead to consecutive deactivation and destruction of
many kmem_caches, easily hundreds of thousands on moderately large
systems, exposing scalability issues in the current slab management
code.  This is one of the patches to address the issue.

slab_caches currently lists all caches including root and memcg ones.
This is the only data structure which lists the root caches and
iterating root caches can only be done by walking the list while
skipping over memcg caches.  As there can be a huge number of memcg
caches, this can become very expensive.

This also can make /proc/slabinfo behave very badly.  seq_file processes
reads in 4k chunks and seeks to the previous Nth position on slab_caches
list to resume after each chunk.  With a lot of memcg cache churns on
the list, reading /proc/slabinfo can become very slow and its content
often ends up with duplicate and/or missing entries.

This patch adds a new list slab_root_caches which lists only the root
caches.  When memcg is not enabled, it becomes just an alias of
slab_caches.  memcg specific list operations are collected into
memcg_[un]link_cache().

Link: http://lkml.kernel.org/r/20170117235411.9408-7-tj@kernel.org
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Jay Vana <jsvana@fb.com>
Acked-by: Vladimir Davydov <vdavydov@tarantool.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h |  3 +++
 mm/slab.h            | 15 +++++++++++++
 mm/slab_common.c     | 59 ++++++++++++++++++++++++++++++----------------------
 mm/slub.c            |  1 +
 4 files changed, 53 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index a0cc7a77cda2..af1a5bef80f4 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -556,6 +556,8 @@ struct memcg_cache_array {
  *		used to index child cachces during allocation and cleared
  *		early during shutdown.
  *
+ * @root_caches_node: List node for slab_root_caches list.
+ *
  * @children:	List of all child caches.  While the child caches are also
  *		reachable through @memcg_caches, a child cache remains on
  *		this list until it is actually destroyed.
@@ -573,6 +575,7 @@ struct memcg_cache_params {
 	union {
 		struct {
 			struct memcg_cache_array __rcu *memcg_caches;
+			struct list_head __root_caches_node;
 			struct list_head children;
 		};
 		struct {
diff --git a/mm/slab.h b/mm/slab.h
index a08f01016a3f..9631bb27c772 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -201,6 +201,11 @@ void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
 int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
 
 #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+
+/* List of all root caches. */
+extern struct list_head		slab_root_caches;
+#define root_caches_node	memcg_params.__root_caches_node
+
 /*
  * Iterate over all memcg caches of the given root cache. The caller must hold
  * slab_mutex.
@@ -300,9 +305,14 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order,
 }
 
 extern void slab_init_memcg_params(struct kmem_cache *);
+extern void memcg_link_cache(struct kmem_cache *s);
 
 #else /* CONFIG_MEMCG && !CONFIG_SLOB */
 
+/* If !memcg, all caches are root. */
+#define slab_root_caches	slab_caches
+#define root_caches_node	list
+
 #define for_each_memcg_cache(iter, root) \
 	for ((void)(iter), (void)(root); 0; )
 
@@ -347,6 +357,11 @@ static inline void memcg_uncharge_slab(struct page *page, int order,
 static inline void slab_init_memcg_params(struct kmem_cache *s)
 {
 }
+
+static inline void memcg_link_cache(struct kmem_cache *s)
+{
+}
+
 #endif /* CONFIG_MEMCG && !CONFIG_SLOB */
 
 static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index c3bbeddaeaaf..274697e1a42a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -138,6 +138,9 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
 }
 
 #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+
+LIST_HEAD(slab_root_caches);
+
 void slab_init_memcg_params(struct kmem_cache *s)
 {
 	s->memcg_params.root_cache = NULL;
@@ -183,9 +186,6 @@ static int update_memcg_params(struct kmem_cache *s, int new_array_size)
 {
 	struct memcg_cache_array *old, *new;
 
-	if (!is_root_cache(s))
-		return 0;
-
 	new = kzalloc(sizeof(struct memcg_cache_array) +
 		      new_array_size * sizeof(void *), GFP_KERNEL);
 	if (!new)
@@ -209,7 +209,7 @@ int memcg_update_all_caches(int num_memcgs)
 	int ret = 0;
 
 	mutex_lock(&slab_mutex);
-	list_for_each_entry(s, &slab_caches, list) {
+	list_for_each_entry(s, &slab_root_caches, root_caches_node) {
 		ret = update_memcg_params(s, num_memcgs);
 		/*
 		 * Instead of freeing the memory, we'll just leave the caches
@@ -222,10 +222,26 @@ int memcg_update_all_caches(int num_memcgs)
 	return ret;
 }
 
-static void unlink_memcg_cache(struct kmem_cache *s)
+void memcg_link_cache(struct kmem_cache *s)
+{
+	if (is_root_cache(s)) {
+		list_add(&s->root_caches_node, &slab_root_caches);
+	} else {
+		list_add(&s->memcg_params.children_node,
+			 &s->memcg_params.root_cache->memcg_params.children);
+		list_add(&s->memcg_params.kmem_caches_node,
+			 &s->memcg_params.memcg->kmem_caches);
+	}
+}
+
+static void memcg_unlink_cache(struct kmem_cache *s)
 {
-	list_del(&s->memcg_params.children_node);
-	list_del(&s->memcg_params.kmem_caches_node);
+	if (is_root_cache(s)) {
+		list_del(&s->root_caches_node);
+	} else {
+		list_del(&s->memcg_params.children_node);
+		list_del(&s->memcg_params.kmem_caches_node);
+	}
 }
 #else
 static inline int init_memcg_params(struct kmem_cache *s,
@@ -238,7 +254,7 @@ static inline void destroy_memcg_params(struct kmem_cache *s)
 {
 }
 
-static inline void unlink_memcg_cache(struct kmem_cache *s)
+static inline void memcg_unlink_cache(struct kmem_cache *s)
 {
 }
 #endif /* CONFIG_MEMCG && !CONFIG_SLOB */
@@ -285,7 +301,7 @@ struct kmem_cache *find_mergeable(size_t size, size_t align,
 	if (flags & SLAB_NEVER_MERGE)
 		return NULL;
 
-	list_for_each_entry_reverse(s, &slab_caches, list) {
+	list_for_each_entry_reverse(s, &slab_root_caches, root_caches_node) {
 		if (slab_unmergeable(s))
 			continue;
 
@@ -369,6 +385,7 @@ static struct kmem_cache *create_cache(const char *name,
 
 	s->refcount = 1;
 	list_add(&s->list, &slab_caches);
+	memcg_link_cache(s);
 out:
 	if (err)
 		return ERR_PTR(err);
@@ -514,9 +531,8 @@ static int shutdown_cache(struct kmem_cache *s)
 	if (__kmem_cache_shutdown(s) != 0)
 		return -EBUSY;
 
+	memcg_unlink_cache(s);
 	list_del(&s->list);
-	if (!is_root_cache(s))
-		unlink_memcg_cache(s);
 
 	if (s->flags & SLAB_DESTROY_BY_RCU) {
 		list_add_tail(&s->list, &slab_caches_to_rcu_destroy);
@@ -596,10 +612,6 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
 		goto out_unlock;
 	}
 
-	list_add(&s->memcg_params.children_node,
-		 &root_cache->memcg_params.children);
-	list_add(&s->memcg_params.kmem_caches_node, &memcg->kmem_caches);
-
 	/*
 	 * Since readers won't lock (see cache_from_memcg_idx()), we need a
 	 * barrier here to ensure nobody will see the kmem_cache partially
@@ -627,10 +639,7 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
 	get_online_mems();
 
 	mutex_lock(&slab_mutex);
-	list_for_each_entry(s, &slab_caches, list) {
-		if (!is_root_cache(s))
-			continue;
-
+	list_for_each_entry(s, &slab_root_caches, root_caches_node) {
 		arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
 						lockdep_is_held(&slab_mutex));
 		c = arr->entries[idx];
@@ -829,6 +838,7 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
 
 	create_boot_cache(s, name, size, flags);
 	list_add(&s->list, &slab_caches);
+	memcg_link_cache(s);
 	s->refcount = 1;
 	return s;
 }
@@ -1136,12 +1146,12 @@ static void print_slabinfo_header(struct seq_file *m)
 void *slab_start(struct seq_file *m, loff_t *pos)
 {
 	mutex_lock(&slab_mutex);
-	return seq_list_start(&slab_caches, *pos);
+	return seq_list_start(&slab_root_caches, *pos);
 }
 
 void *slab_next(struct seq_file *m, void *p, loff_t *pos)
 {
-	return seq_list_next(p, &slab_caches, pos);
+	return seq_list_next(p, &slab_root_caches, pos);
 }
 
 void slab_stop(struct seq_file *m, void *p)
@@ -1193,12 +1203,11 @@ static void cache_show(struct kmem_cache *s, struct seq_file *m)
 
 static int slab_show(struct seq_file *m, void *p)
 {
-	struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
+	struct kmem_cache *s = list_entry(p, struct kmem_cache, root_caches_node);
 
-	if (p == slab_caches.next)
+	if (p == slab_root_caches.next)
 		print_slabinfo_header(m);
-	if (is_root_cache(s))
-		cache_show(s, m);
+	cache_show(s, m);
 	return 0;
 }
 
diff --git a/mm/slub.c b/mm/slub.c
index caac5456f0ec..03b012bcb5fa 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4127,6 +4127,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
 	}
 	slab_init_memcg_params(s);
 	list_add(&s->list, &slab_caches);
+	memcg_link_cache(s);
 	return s;
 }
 
-- 
cgit v1.2.3


From 01fb58bcba63f8fba37581c24c99e9a515dd0335 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 22 Feb 2017 15:41:30 -0800
Subject: slab: remove synchronous synchronize_sched() from memcg cache
 deactivation path

With kmem cgroup support enabled, kmem_caches can be created and
destroyed frequently and a great number of near empty kmem_caches can
accumulate if there are a lot of transient cgroups and the system is not
under memory pressure.  When memory reclaim starts under such
conditions, it can lead to consecutive deactivation and destruction of
many kmem_caches, easily hundreds of thousands on moderately large
systems, exposing scalability issues in the current slab management
code.  This is one of the patches to address the issue.

slub uses synchronize_sched() to deactivate a memcg cache.
synchronize_sched() is an expensive and slow operation and doesn't scale
when a huge number of caches are destroyed back-to-back.  While there
used to be a simple batching mechanism, the batching was too restricted
to be helpful.

This patch implements slab_deactivate_memcg_cache_rcu_sched() which slub
can use to schedule sched RCU callback instead of performing
synchronize_sched() synchronously while holding cgroup_mutex.  While
this adds online cpus, mems and slab_mutex operations, operating on
these locks back-to-back from the same kworker, which is what's gonna
happen when there are many to deactivate, isn't expensive at all and
this gets rid of the scalability problem completely.

Link: http://lkml.kernel.org/r/20170117235411.9408-9-tj@kernel.org
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Jay Vana <jsvana@fb.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h |  6 ++++++
 mm/slab.h            |  2 ++
 mm/slab_common.c     | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 mm/slub.c            | 12 +++++++----
 4 files changed, 76 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index af1a5bef80f4..3c37a8c51921 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -582,6 +582,12 @@ struct memcg_cache_params {
 			struct mem_cgroup *memcg;
 			struct list_head children_node;
 			struct list_head kmem_caches_node;
+
+			void (*deact_fn)(struct kmem_cache *);
+			union {
+				struct rcu_head deact_rcu_head;
+				struct work_struct deact_work;
+			};
 		};
 	};
 };
diff --git a/mm/slab.h b/mm/slab.h
index 7bff1ee513c2..65e7c3fcac72 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -307,6 +307,8 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order,
 
 extern void slab_init_memcg_params(struct kmem_cache *);
 extern void memcg_link_cache(struct kmem_cache *s);
+extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
+				void (*deact_fn)(struct kmem_cache *));
 
 #else /* CONFIG_MEMCG && !CONFIG_SLOB */
 
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 59e41bb81575..c549296c7981 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -627,6 +627,66 @@ out_unlock:
 	put_online_cpus();
 }
 
+static void kmemcg_deactivate_workfn(struct work_struct *work)
+{
+	struct kmem_cache *s = container_of(work, struct kmem_cache,
+					    memcg_params.deact_work);
+
+	get_online_cpus();
+	get_online_mems();
+
+	mutex_lock(&slab_mutex);
+
+	s->memcg_params.deact_fn(s);
+
+	mutex_unlock(&slab_mutex);
+
+	put_online_mems();
+	put_online_cpus();
+
+	/* done, put the ref from slab_deactivate_memcg_cache_rcu_sched() */
+	css_put(&s->memcg_params.memcg->css);
+}
+
+static void kmemcg_deactivate_rcufn(struct rcu_head *head)
+{
+	struct kmem_cache *s = container_of(head, struct kmem_cache,
+					    memcg_params.deact_rcu_head);
+
+	/*
+	 * We need to grab blocking locks.  Bounce to ->deact_work.  The
+	 * work item shares the space with the RCU head and can't be
+	 * initialized eariler.
+	 */
+	INIT_WORK(&s->memcg_params.deact_work, kmemcg_deactivate_workfn);
+	schedule_work(&s->memcg_params.deact_work);
+}
+
+/**
+ * slab_deactivate_memcg_cache_rcu_sched - schedule deactivation after a
+ *					   sched RCU grace period
+ * @s: target kmem_cache
+ * @deact_fn: deactivation function to call
+ *
+ * Schedule @deact_fn to be invoked with online cpus, mems and slab_mutex
+ * held after a sched RCU grace period.  The slab is guaranteed to stay
+ * alive until @deact_fn is finished.  This is to be used from
+ * __kmemcg_cache_deactivate().
+ */
+void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
+					   void (*deact_fn)(struct kmem_cache *))
+{
+	if (WARN_ON_ONCE(is_root_cache(s)) ||
+	    WARN_ON_ONCE(s->memcg_params.deact_fn))
+		return;
+
+	/* pin memcg so that @s doesn't get destroyed in the middle */
+	css_get(&s->memcg_params.memcg->css);
+
+	s->memcg_params.deact_fn = deact_fn;
+	call_rcu_sched(&s->memcg_params.deact_rcu_head, kmemcg_deactivate_rcufn);
+}
+
 void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
 {
 	int idx;
diff --git a/mm/slub.c b/mm/slub.c
index 8a4591526f37..62d0b557a596 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3957,6 +3957,12 @@ int __kmem_cache_shrink(struct kmem_cache *s)
 }
 
 #ifdef CONFIG_MEMCG
+static void kmemcg_cache_deact_after_rcu(struct kmem_cache *s)
+{
+	/* called with all the locks held after a sched RCU grace period */
+	__kmem_cache_shrink(s);
+}
+
 void __kmemcg_cache_deactivate(struct kmem_cache *s)
 {
 	/*
@@ -3968,11 +3974,9 @@ void __kmemcg_cache_deactivate(struct kmem_cache *s)
 
 	/*
 	 * s->cpu_partial is checked locklessly (see put_cpu_partial), so
-	 * we have to make sure the change is visible.
+	 * we have to make sure the change is visible before shrinking.
 	 */
-	synchronize_sched();
-
-	__kmem_cache_shrink(s);
+	slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu);
 }
 #endif
 
-- 
cgit v1.2.3


From 17cc4dfeda97636d67e83de8cd41940b65a93bc7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 22 Feb 2017 15:41:36 -0800
Subject: slab: use memcg_kmem_cache_wq for slab destruction operations

If there's contention on slab_mutex, queueing the per-cache destruction
work item on the system_wq can unnecessarily create and tie up a lot of
kworkers.

Rename memcg_kmem_cache_create_wq to memcg_kmem_cache_wq and make it
global and use that workqueue for the destruction work items too.  While
at it, convert the workqueue from an unbound workqueue to a per-cpu one
with concurrency limited to 1.  It's generally preferable to use per-cpu
workqueues and concurrency limit of 1 is safe enough.

This is suggested by Joonsoo Kim.

Link: http://lkml.kernel.org/r/20170117235411.9408-11-tj@kernel.org
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Jay Vana <jsvana@fb.com>
Acked-by: Vladimir Davydov <vdavydov@tarantool.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  1 +
 mm/memcontrol.c            | 16 ++++++++--------
 mm/slab_common.c           |  2 +-
 3 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9fcece9be85d..5af377303880 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -830,6 +830,7 @@ void memcg_kmem_uncharge(struct page *page, int order);
 
 #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
 extern struct static_key_false memcg_kmem_enabled_key;
+extern struct workqueue_struct *memcg_kmem_cache_wq;
 
 extern int memcg_nr_cache_ids;
 void memcg_get_cache_ids(void);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 834d641dfa8c..1fd6affcdde7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -317,6 +317,8 @@ void memcg_put_cache_ids(void)
 DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
 EXPORT_SYMBOL(memcg_kmem_enabled_key);
 
+struct workqueue_struct *memcg_kmem_cache_wq;
+
 #endif /* !CONFIG_SLOB */
 
 /**
@@ -2143,8 +2145,6 @@ struct memcg_kmem_cache_create_work {
 	struct work_struct work;
 };
 
-static struct workqueue_struct *memcg_kmem_cache_create_wq;
-
 static void memcg_kmem_cache_create_func(struct work_struct *w)
 {
 	struct memcg_kmem_cache_create_work *cw =
@@ -2176,7 +2176,7 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
 	cw->cachep = cachep;
 	INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
 
-	queue_work(memcg_kmem_cache_create_wq, &cw->work);
+	queue_work(memcg_kmem_cache_wq, &cw->work);
 }
 
 static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
@@ -5778,12 +5778,12 @@ static int __init mem_cgroup_init(void)
 #ifndef CONFIG_SLOB
 	/*
 	 * Kmem cache creation is mostly done with the slab_mutex held,
-	 * so use a special workqueue to avoid stalling all worker
-	 * threads in case lots of cgroups are created simultaneously.
+	 * so use a workqueue with limited concurrency to avoid stalling
+	 * all worker threads in case lots of cgroups are created and
+	 * destroyed simultaneously.
 	 */
-	memcg_kmem_cache_create_wq =
-		alloc_ordered_workqueue("memcg_kmem_cache_create", 0);
-	BUG_ON(!memcg_kmem_cache_create_wq);
+	memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
+	BUG_ON(!memcg_kmem_cache_wq);
 #endif
 
 	cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
diff --git a/mm/slab_common.c b/mm/slab_common.c
index c549296c7981..23ff74e61838 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -659,7 +659,7 @@ static void kmemcg_deactivate_rcufn(struct rcu_head *head)
 	 * initialized eariler.
 	 */
 	INIT_WORK(&s->memcg_params.deact_work, kmemcg_deactivate_workfn);
-	schedule_work(&s->memcg_params.deact_work);
+	queue_work(memcg_kmem_cache_wq, &s->memcg_params.deact_work);
 }
 
 /**
-- 
cgit v1.2.3


From 893e26e61d04eac974ded0c11e1647b335c8cb7b Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Wed, 22 Feb 2017 15:42:27 -0800
Subject: userfaultfd: non-cooperative: Add fork() event

When the mm with uffd-ed vmas fork()-s the respective vmas notify their
uffds with the event which contains a descriptor with new uffd.  This
new descriptor can then be used to get events from the child and
populate its mm with data.  Note, that there can be different uffd-s
controlling different vmas within one mm, so first we should collect all
those uffds (and ctx-s) in a list and then notify them all one by one
but only once per fork().

The context is created at fork() time but the descriptor, file struct
and anon inode object is created at event read time.  So some trickery
is added to the userfaultfd_ctx_read() to handle the ctx queues' locking
vs file creation.

Another thing worth noticing is that the task that fork()-s waits for
the uffd event to get processed WITHOUT the mmap sem.

[aarcange@redhat.com: build warning fix]
  Link: http://lkml.kernel.org/r/20161216144821.5183-10-aarcange@redhat.com
Link: http://lkml.kernel.org/r/20161216144821.5183-9-aarcange@redhat.com
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Michael Rapoport <RAPOPORT@il.ibm.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/userfaultfd.c                 | 148 ++++++++++++++++++++++++++++++++++++++-
 include/linux/userfaultfd_k.h    |  13 ++++
 include/uapi/linux/userfaultfd.h |  15 ++--
 kernel/fork.c                    |  10 ++-
 4 files changed, 170 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 87d31921b66c..6046e0b552b2 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -64,6 +64,12 @@ struct userfaultfd_ctx {
 	struct mm_struct *mm;
 };
 
+struct userfaultfd_fork_ctx {
+	struct userfaultfd_ctx *orig;
+	struct userfaultfd_ctx *new;
+	struct list_head list;
+};
+
 struct userfaultfd_wait_queue {
 	struct uffd_msg msg;
 	wait_queue_t wq;
@@ -465,9 +471,8 @@ out:
 	return ret;
 }
 
-static int __maybe_unused userfaultfd_event_wait_completion(
-		struct userfaultfd_ctx *ctx,
-		struct userfaultfd_wait_queue *ewq)
+static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
+					     struct userfaultfd_wait_queue *ewq)
 {
 	int ret = 0;
 
@@ -518,6 +523,79 @@ static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
 	__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
 }
 
+int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
+{
+	struct userfaultfd_ctx *ctx = NULL, *octx;
+	struct userfaultfd_fork_ctx *fctx;
+
+	octx = vma->vm_userfaultfd_ctx.ctx;
+	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+		vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+		return 0;
+	}
+
+	list_for_each_entry(fctx, fcs, list)
+		if (fctx->orig == octx) {
+			ctx = fctx->new;
+			break;
+		}
+
+	if (!ctx) {
+		fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
+		if (!fctx)
+			return -ENOMEM;
+
+		ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
+		if (!ctx) {
+			kfree(fctx);
+			return -ENOMEM;
+		}
+
+		atomic_set(&ctx->refcount, 1);
+		ctx->flags = octx->flags;
+		ctx->state = UFFD_STATE_RUNNING;
+		ctx->features = octx->features;
+		ctx->released = false;
+		ctx->mm = vma->vm_mm;
+		atomic_inc(&ctx->mm->mm_users);
+
+		userfaultfd_ctx_get(octx);
+		fctx->orig = octx;
+		fctx->new = ctx;
+		list_add_tail(&fctx->list, fcs);
+	}
+
+	vma->vm_userfaultfd_ctx.ctx = ctx;
+	return 0;
+}
+
+static int dup_fctx(struct userfaultfd_fork_ctx *fctx)
+{
+	struct userfaultfd_ctx *ctx = fctx->orig;
+	struct userfaultfd_wait_queue ewq;
+
+	msg_init(&ewq.msg);
+
+	ewq.msg.event = UFFD_EVENT_FORK;
+	ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
+
+	return userfaultfd_event_wait_completion(ctx, &ewq);
+}
+
+void dup_userfaultfd_complete(struct list_head *fcs)
+{
+	int ret = 0;
+	struct userfaultfd_fork_ctx *fctx, *n;
+
+	list_for_each_entry_safe(fctx, n, fcs, list) {
+		if (!ret)
+			ret = dup_fctx(fctx);
+		list_del(&fctx->list);
+		kfree(fctx);
+	}
+}
+
 static int userfaultfd_release(struct inode *inode, struct file *file)
 {
 	struct userfaultfd_ctx *ctx = file->private_data;
@@ -653,12 +731,49 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
 	}
 }
 
+static const struct file_operations userfaultfd_fops;
+
+static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
+				  struct userfaultfd_ctx *new,
+				  struct uffd_msg *msg)
+{
+	int fd;
+	struct file *file;
+	unsigned int flags = new->flags & UFFD_SHARED_FCNTL_FLAGS;
+
+	fd = get_unused_fd_flags(flags);
+	if (fd < 0)
+		return fd;
+
+	file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, new,
+				  O_RDWR | flags);
+	if (IS_ERR(file)) {
+		put_unused_fd(fd);
+		return PTR_ERR(file);
+	}
+
+	fd_install(fd, file);
+	msg->arg.reserved.reserved1 = 0;
+	msg->arg.fork.ufd = fd;
+
+	return 0;
+}
+
 static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
 				    struct uffd_msg *msg)
 {
 	ssize_t ret;
 	DECLARE_WAITQUEUE(wait, current);
 	struct userfaultfd_wait_queue *uwq;
+	/*
+	 * Handling fork event requires sleeping operations, so
+	 * we drop the event_wqh lock, then do these ops, then
+	 * lock it back and wake up the waiter. While the lock is
+	 * dropped the ewq may go away so we keep track of it
+	 * carefully.
+	 */
+	LIST_HEAD(fork_event);
+	struct userfaultfd_ctx *fork_nctx = NULL;
 
 	/* always take the fd_wqh lock before the fault_pending_wqh lock */
 	spin_lock(&ctx->fd_wqh.lock);
@@ -716,6 +831,16 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
 		if (uwq) {
 			*msg = uwq->msg;
 
+			if (uwq->msg.event == UFFD_EVENT_FORK) {
+				fork_nctx = (struct userfaultfd_ctx *)
+					(unsigned long)
+					uwq->msg.arg.reserved.reserved1;
+				list_move(&uwq->wq.task_list, &fork_event);
+				spin_unlock(&ctx->event_wqh.lock);
+				ret = 0;
+				break;
+			}
+
 			userfaultfd_event_complete(ctx, uwq);
 			spin_unlock(&ctx->event_wqh.lock);
 			ret = 0;
@@ -739,6 +864,23 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
 	__set_current_state(TASK_RUNNING);
 	spin_unlock(&ctx->fd_wqh.lock);
 
+	if (!ret && msg->event == UFFD_EVENT_FORK) {
+		ret = resolve_userfault_fork(ctx, fork_nctx, msg);
+
+		if (!ret) {
+			spin_lock(&ctx->event_wqh.lock);
+			if (!list_empty(&fork_event)) {
+				uwq = list_first_entry(&fork_event,
+						       typeof(*uwq),
+						       wq.task_list);
+				list_del(&uwq->wq.task_list);
+				__add_wait_queue(&ctx->event_wqh, &uwq->wq);
+				userfaultfd_event_complete(ctx, uwq);
+			}
+			spin_unlock(&ctx->event_wqh.lock);
+		}
+	}
+
 	return ret;
 }
 
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 11b92b047a1e..79002bca1f43 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -52,6 +52,9 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
 	return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP);
 }
 
+extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *);
+extern void dup_userfaultfd_complete(struct list_head *);
+
 #else /* CONFIG_USERFAULTFD */
 
 /* mm helpers */
@@ -76,6 +79,16 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
 	return false;
 }
 
+static inline int dup_userfaultfd(struct vm_area_struct *vma,
+				  struct list_head *l)
+{
+	return 0;
+}
+
+static inline void dup_userfaultfd_complete(struct list_head *l)
+{
+}
+
 #endif /* CONFIG_USERFAULTFD */
 
 #endif /* _LINUX_USERFAULTFD_K_H */
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 94046b8aa6ad..c8953c84fdcc 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -18,12 +18,7 @@
  * means the userland is reading).
  */
 #define UFFD_API ((__u64)0xAA)
-/*
- * After implementing the respective features it will become:
- * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
- *			      UFFD_FEATURE_EVENT_FORK)
- */
-#define UFFD_API_FEATURES (0)
+#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK)
 #define UFFD_API_IOCTLS				\
 	((__u64)1 << _UFFDIO_REGISTER |		\
 	 (__u64)1 << _UFFDIO_UNREGISTER |	\
@@ -77,6 +72,10 @@ struct uffd_msg {
 			__u64	address;
 		} pagefault;
 
+		struct {
+			__u32	ufd;
+		} fork;
+
 		struct {
 			/* unused reserved fields */
 			__u64	reserved1;
@@ -90,9 +89,7 @@ struct uffd_msg {
  * Start at 0x12 and not at 0 to be more strict against bugs.
  */
 #define UFFD_EVENT_PAGEFAULT	0x12
-#if 0 /* not available yet */
 #define UFFD_EVENT_FORK		0x13
-#endif
 
 /* flags for UFFD_EVENT_PAGEFAULT */
 #define UFFD_PAGEFAULT_FLAG_WRITE	(1<<0)	/* If this was a write fault */
@@ -111,10 +108,8 @@ struct uffdio_api {
 	 * are to be considered implicitly always enabled in all kernels as
 	 * long as the uffdio_api.api requested matches UFFD_API.
 	 */
-#if 0 /* not available yet */
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
 #define UFFD_FEATURE_EVENT_FORK			(1<<1)
-#endif
 	__u64 features;
 
 	__u64 ioctls;
diff --git a/kernel/fork.c b/kernel/fork.c
index ff82e24573b6..d12fcc4db8a3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -55,6 +55,7 @@
 #include <linux/rmap.h>
 #include <linux/ksm.h>
 #include <linux/acct.h>
+#include <linux/userfaultfd_k.h>
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
 #include <linux/freezer.h>
@@ -561,6 +562,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 	struct rb_node **rb_link, *rb_parent;
 	int retval;
 	unsigned long charge;
+	LIST_HEAD(uf);
 
 	uprobe_start_dup_mmap();
 	if (down_write_killable(&oldmm->mmap_sem)) {
@@ -617,12 +619,13 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		if (retval)
 			goto fail_nomem_policy;
 		tmp->vm_mm = mm;
+		retval = dup_userfaultfd(tmp, &uf);
+		if (retval)
+			goto fail_nomem_anon_vma_fork;
 		if (anon_vma_fork(tmp, mpnt))
 			goto fail_nomem_anon_vma_fork;
-		tmp->vm_flags &=
-			~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP);
+		tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
 		tmp->vm_next = tmp->vm_prev = NULL;
-		tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 		file = tmp->vm_file;
 		if (file) {
 			struct inode *inode = file_inode(file);
@@ -678,6 +681,7 @@ out:
 	up_write(&mm->mmap_sem);
 	flush_tlb_mm(oldmm);
 	up_write(&oldmm->mmap_sem);
+	dup_userfaultfd_complete(&uf);
 fail_uprobe_end:
 	uprobe_end_dup_mmap();
 	return retval;
-- 
cgit v1.2.3


From 72f87654c69690ff4721bd9b4a39983f971de9a5 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Wed, 22 Feb 2017 15:42:34 -0800
Subject: userfaultfd: non-cooperative: add mremap() event

The event denotes that an area [start:end] moves to different location.
Length change isn't reported as "new" addresses, if they appear on the
uffd reader side they will not contain any data and the latter can just
zeromap them.

Waiting for the event ACK is also done outside of mmap sem, as for fork
event.

Link: http://lkml.kernel.org/r/20161216144821.5183-12-aarcange@redhat.com
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Michael Rapoport <RAPOPORT@il.ibm.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/userfaultfd.c                 | 37 +++++++++++++++++++++++++++++++++++++
 include/linux/userfaultfd_k.h    | 17 +++++++++++++++++
 include/uapi/linux/userfaultfd.h | 11 ++++++++++-
 mm/mremap.c                      | 17 ++++++++++++-----
 4 files changed, 76 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 27978f249016..68f978beefac 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -596,6 +596,43 @@ void dup_userfaultfd_complete(struct list_head *fcs)
 	}
 }
 
+void mremap_userfaultfd_prep(struct vm_area_struct *vma,
+			     struct vm_userfaultfd_ctx *vm_ctx)
+{
+	struct userfaultfd_ctx *ctx;
+
+	ctx = vma->vm_userfaultfd_ctx.ctx;
+	if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) {
+		vm_ctx->ctx = ctx;
+		userfaultfd_ctx_get(ctx);
+	}
+}
+
+void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx vm_ctx,
+				 unsigned long from, unsigned long to,
+				 unsigned long len)
+{
+	struct userfaultfd_ctx *ctx = vm_ctx.ctx;
+	struct userfaultfd_wait_queue ewq;
+
+	if (!ctx)
+		return;
+
+	if (to & ~PAGE_MASK) {
+		userfaultfd_ctx_put(ctx);
+		return;
+	}
+
+	msg_init(&ewq.msg);
+
+	ewq.msg.event = UFFD_EVENT_REMAP;
+	ewq.msg.arg.remap.from = from;
+	ewq.msg.arg.remap.to = to;
+	ewq.msg.arg.remap.len = len;
+
+	userfaultfd_event_wait_completion(ctx, &ewq);
+}
+
 static int userfaultfd_release(struct inode *inode, struct file *file)
 {
 	struct userfaultfd_ctx *ctx = file->private_data;
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 79002bca1f43..7f318a46044b 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -55,6 +55,12 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
 extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *);
 extern void dup_userfaultfd_complete(struct list_head *);
 
+extern void mremap_userfaultfd_prep(struct vm_area_struct *,
+				    struct vm_userfaultfd_ctx *);
+extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx,
+					unsigned long from, unsigned long to,
+					unsigned long len);
+
 #else /* CONFIG_USERFAULTFD */
 
 /* mm helpers */
@@ -89,6 +95,17 @@ static inline void dup_userfaultfd_complete(struct list_head *l)
 {
 }
 
+static inline void mremap_userfaultfd_prep(struct vm_area_struct *vma,
+					   struct vm_userfaultfd_ctx *ctx)
+{
+}
+
+static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx ctx,
+					       unsigned long from,
+					       unsigned long to,
+					       unsigned long len)
+{
+}
 #endif /* CONFIG_USERFAULTFD */
 
 #endif /* _LINUX_USERFAULTFD_K_H */
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index c8953c84fdcc..79a85e5bd388 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -18,7 +18,8 @@
  * means the userland is reading).
  */
 #define UFFD_API ((__u64)0xAA)
-#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK)
+#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK |	    \
+			   UFFD_FEATURE_EVENT_REMAP)
 #define UFFD_API_IOCTLS				\
 	((__u64)1 << _UFFDIO_REGISTER |		\
 	 (__u64)1 << _UFFDIO_UNREGISTER |	\
@@ -76,6 +77,12 @@ struct uffd_msg {
 			__u32	ufd;
 		} fork;
 
+		struct {
+			__u64	from;
+			__u64	to;
+			__u64	len;
+		} remap;
+
 		struct {
 			/* unused reserved fields */
 			__u64	reserved1;
@@ -90,6 +97,7 @@ struct uffd_msg {
  */
 #define UFFD_EVENT_PAGEFAULT	0x12
 #define UFFD_EVENT_FORK		0x13
+#define UFFD_EVENT_REMAP	0x14
 
 /* flags for UFFD_EVENT_PAGEFAULT */
 #define UFFD_PAGEFAULT_FLAG_WRITE	(1<<0)	/* If this was a write fault */
@@ -110,6 +118,7 @@ struct uffdio_api {
 	 */
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
 #define UFFD_FEATURE_EVENT_FORK			(1<<1)
+#define UFFD_FEATURE_EVENT_REMAP		(1<<2)
 	__u64 features;
 
 	__u64 ioctls;
diff --git a/mm/mremap.c b/mm/mremap.c
index 30d7d2482eea..504b560c013c 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -22,6 +22,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/uaccess.h>
 #include <linux/mm-arch-hooks.h>
+#include <linux/userfaultfd_k.h>
 
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
@@ -250,7 +251,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 
 static unsigned long move_vma(struct vm_area_struct *vma,
 		unsigned long old_addr, unsigned long old_len,
-		unsigned long new_len, unsigned long new_addr, bool *locked)
+		unsigned long new_len, unsigned long new_addr,
+		bool *locked, struct vm_userfaultfd_ctx *uf)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct vm_area_struct *new_vma;
@@ -309,6 +311,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 		old_addr = new_addr;
 		new_addr = err;
 	} else {
+		mremap_userfaultfd_prep(new_vma, uf);
 		arch_remap(mm, old_addr, old_addr + old_len,
 			   new_addr, new_addr + new_len);
 	}
@@ -413,7 +416,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
 }
 
 static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
-		unsigned long new_addr, unsigned long new_len, bool *locked)
+		unsigned long new_addr, unsigned long new_len, bool *locked,
+		struct vm_userfaultfd_ctx *uf)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
@@ -458,7 +462,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 	if (offset_in_page(ret))
 		goto out1;
 
-	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
+	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf);
 	if (!(offset_in_page(ret)))
 		goto out;
 out1:
@@ -497,6 +501,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	unsigned long ret = -EINVAL;
 	unsigned long charged = 0;
 	bool locked = false;
+	struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
 
 	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
 		return ret;
@@ -523,7 +528,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 
 	if (flags & MREMAP_FIXED) {
 		ret = mremap_to(addr, old_len, new_addr, new_len,
-				&locked);
+				&locked, &uf);
 		goto out;
 	}
 
@@ -592,7 +597,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 			goto out;
 		}
 
-		ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
+		ret = move_vma(vma, addr, old_len, new_len, new_addr,
+			       &locked, &uf);
 	}
 out:
 	if (offset_in_page(ret)) {
@@ -602,5 +608,6 @@ out:
 	up_write(&current->mm->mmap_sem);
 	if (locked && new_len > old_len)
 		mm_populate(new_addr + old_len, new_len - old_len);
+	mremap_userfaultfd_complete(uf, addr, new_addr, old_len);
 	return ret;
 }
-- 
cgit v1.2.3


From 90794bf19dc19691a16b71bcd75c04094d9e392d Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Wed, 22 Feb 2017 15:42:37 -0800
Subject: userfaultfd: non-cooperative: optimize mremap_userfaultfd_complete()

Optimize the mremap_userfaultfd_complete() interface to pass only the
vm_userfaultfd_ctx pointer through the stack as a microoptimization.

Link: http://lkml.kernel.org/r/20161216144821.5183-13-aarcange@redhat.com
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Reported-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Acked-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Michael Rapoport <RAPOPORT@il.ibm.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/userfaultfd.c              | 4 ++--
 include/linux/userfaultfd_k.h | 4 ++--
 mm/mremap.c                   | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 68f978beefac..5d37c37854b0 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -608,11 +608,11 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
 	}
 }
 
-void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx vm_ctx,
+void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
 				 unsigned long from, unsigned long to,
 				 unsigned long len)
 {
-	struct userfaultfd_ctx *ctx = vm_ctx.ctx;
+	struct userfaultfd_ctx *ctx = vm_ctx->ctx;
 	struct userfaultfd_wait_queue ewq;
 
 	if (!ctx)
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 7f318a46044b..78ec197e8b47 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -57,7 +57,7 @@ extern void dup_userfaultfd_complete(struct list_head *);
 
 extern void mremap_userfaultfd_prep(struct vm_area_struct *,
 				    struct vm_userfaultfd_ctx *);
-extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx,
+extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *,
 					unsigned long from, unsigned long to,
 					unsigned long len);
 
@@ -100,7 +100,7 @@ static inline void mremap_userfaultfd_prep(struct vm_area_struct *vma,
 {
 }
 
-static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx ctx,
+static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx,
 					       unsigned long from,
 					       unsigned long to,
 					       unsigned long len)
diff --git a/mm/mremap.c b/mm/mremap.c
index 504b560c013c..8779928d6a70 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -608,6 +608,6 @@ out:
 	up_write(&current->mm->mmap_sem);
 	if (locked && new_len > old_len)
 		mm_populate(new_addr + old_len, new_len - old_len);
-	mremap_userfaultfd_complete(uf, addr, new_addr, old_len);
+	mremap_userfaultfd_complete(&uf, addr, new_addr, old_len);
 	return ret;
 }
-- 
cgit v1.2.3


From 05ce77249d5068b057082d24ec22d3824f4816ac Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Wed, 22 Feb 2017 15:42:40 -0800
Subject: userfaultfd: non-cooperative: add madvise() event for MADV_DONTNEED
 request

If the page is punched out of the address space the uffd reader should
know this and zeromap the respective area in case of the #PF event.

Link: http://lkml.kernel.org/r/20161216144821.5183-14-aarcange@redhat.com
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Michael Rapoport <RAPOPORT@il.ibm.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/userfaultfd.c                 | 28 ++++++++++++++++++++++++++++
 include/linux/userfaultfd_k.h    | 12 ++++++++++++
 include/uapi/linux/userfaultfd.h | 10 +++++++++-
 mm/madvise.c                     |  2 ++
 4 files changed, 51 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 5d37c37854b0..ea9008254df4 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -633,6 +633,34 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
 	userfaultfd_event_wait_completion(ctx, &ewq);
 }
 
+void madvise_userfault_dontneed(struct vm_area_struct *vma,
+				struct vm_area_struct **prev,
+				unsigned long start, unsigned long end)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct userfaultfd_ctx *ctx;
+	struct userfaultfd_wait_queue ewq;
+
+	ctx = vma->vm_userfaultfd_ctx.ctx;
+	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_MADVDONTNEED))
+		return;
+
+	userfaultfd_ctx_get(ctx);
+	up_read(&mm->mmap_sem);
+
+	*prev = NULL; /* We wait for ACK w/o the mmap semaphore */
+
+	msg_init(&ewq.msg);
+
+	ewq.msg.event = UFFD_EVENT_MADVDONTNEED;
+	ewq.msg.arg.madv_dn.start = start;
+	ewq.msg.arg.madv_dn.end = end;
+
+	userfaultfd_event_wait_completion(ctx, &ewq);
+
+	down_read(&mm->mmap_sem);
+}
+
 static int userfaultfd_release(struct inode *inode, struct file *file)
 {
 	struct userfaultfd_ctx *ctx = file->private_data;
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 78ec197e8b47..f431861f22f1 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -61,6 +61,11 @@ extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *,
 					unsigned long from, unsigned long to,
 					unsigned long len);
 
+extern void madvise_userfault_dontneed(struct vm_area_struct *vma,
+				       struct vm_area_struct **prev,
+				       unsigned long start,
+				       unsigned long end);
+
 #else /* CONFIG_USERFAULTFD */
 
 /* mm helpers */
@@ -106,6 +111,13 @@ static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx,
 					       unsigned long len)
 {
 }
+
+static inline void madvise_userfault_dontneed(struct vm_area_struct *vma,
+					      struct vm_area_struct **prev,
+					      unsigned long start,
+					      unsigned long end)
+{
+}
 #endif /* CONFIG_USERFAULTFD */
 
 #endif /* _LINUX_USERFAULTFD_K_H */
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 79a85e5bd388..2bbf32319cf5 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -19,7 +19,8 @@
  */
 #define UFFD_API ((__u64)0xAA)
 #define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK |	    \
-			   UFFD_FEATURE_EVENT_REMAP)
+			   UFFD_FEATURE_EVENT_REMAP |	    \
+			   UFFD_FEATURE_EVENT_MADVDONTNEED)
 #define UFFD_API_IOCTLS				\
 	((__u64)1 << _UFFDIO_REGISTER |		\
 	 (__u64)1 << _UFFDIO_UNREGISTER |	\
@@ -83,6 +84,11 @@ struct uffd_msg {
 			__u64	len;
 		} remap;
 
+		struct {
+			__u64	start;
+			__u64	end;
+		} madv_dn;
+
 		struct {
 			/* unused reserved fields */
 			__u64	reserved1;
@@ -98,6 +104,7 @@ struct uffd_msg {
 #define UFFD_EVENT_PAGEFAULT	0x12
 #define UFFD_EVENT_FORK		0x13
 #define UFFD_EVENT_REMAP	0x14
+#define UFFD_EVENT_MADVDONTNEED	0x15
 
 /* flags for UFFD_EVENT_PAGEFAULT */
 #define UFFD_PAGEFAULT_FLAG_WRITE	(1<<0)	/* If this was a write fault */
@@ -119,6 +126,7 @@ struct uffdio_api {
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
 #define UFFD_FEATURE_EVENT_FORK			(1<<1)
 #define UFFD_FEATURE_EVENT_REMAP		(1<<2)
+#define UFFD_FEATURE_EVENT_MADVDONTNEED		(1<<3)
 	__u64 features;
 
 	__u64 ioctls;
diff --git a/mm/madvise.c b/mm/madvise.c
index 0e3828eae9f8..06ffb5a170de 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -10,6 +10,7 @@
 #include <linux/syscalls.h>
 #include <linux/mempolicy.h>
 #include <linux/page-isolation.h>
+#include <linux/userfaultfd_k.h>
 #include <linux/hugetlb.h>
 #include <linux/falloc.h>
 #include <linux/sched.h>
@@ -477,6 +478,7 @@ static long madvise_dontneed(struct vm_area_struct *vma,
 		return -EINVAL;
 
 	zap_page_range(vma, start, end - start, NULL);
+	madvise_userfault_dontneed(vma, prev, start, end);
 	return 0;
 }
 
-- 
cgit v1.2.3


From fa4d75c1de13299c61b5e18a1ae46bc00888b599 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 22 Feb 2017 15:42:49 -0800
Subject: userfaultfd: hugetlbfs: add copy_huge_page_from_user for hugetlb
 userfaultfd support

userfaultfd UFFDIO_COPY allows user level code to copy data to a page at
fault time.  The data is copied from user space to a newly allocated
huge page.  The new routine copy_huge_page_from_user performs this copy.

Link: http://lkml.kernel.org/r/20161216144821.5183-17-aarcange@redhat.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Michael Rapoport <RAPOPORT@il.ibm.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  3 +++
 mm/memory.c        | 25 +++++++++++++++++++++++++
 2 files changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3787f047a098..6bc7f2c1a6d1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2424,6 +2424,9 @@ extern void clear_huge_page(struct page *page,
 extern void copy_user_huge_page(struct page *dst, struct page *src,
 				unsigned long addr, struct vm_area_struct *vma,
 				unsigned int pages_per_huge_page);
+extern long copy_huge_page_from_user(struct page *dst_page,
+				const void __user *usr_src,
+				unsigned int pages_per_huge_page);
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
 
 extern struct page_ext_operations debug_guardpage_ops;
diff --git a/mm/memory.c b/mm/memory.c
index ececdc4a2892..4ade940d105c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4152,6 +4152,31 @@ void copy_user_huge_page(struct page *dst, struct page *src,
 		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
 	}
 }
+
+long copy_huge_page_from_user(struct page *dst_page,
+				const void __user *usr_src,
+				unsigned int pages_per_huge_page)
+{
+	void *src = (void *)usr_src;
+	void *page_kaddr;
+	unsigned long i, rc = 0;
+	unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
+
+	for (i = 0; i < pages_per_huge_page; i++) {
+		page_kaddr = kmap_atomic(dst_page + i);
+		rc = copy_from_user(page_kaddr,
+				(const void __user *)(src + i * PAGE_SIZE),
+				PAGE_SIZE);
+		kunmap_atomic(page_kaddr);
+
+		ret_val -= (PAGE_SIZE - rc);
+		if (rc)
+			break;
+
+		cond_resched();
+	}
+	return ret_val;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
 
 #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
-- 
cgit v1.2.3


From 8fb5debc5fcd450470cdd789c2d80ef95ebb8cf4 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 22 Feb 2017 15:42:52 -0800
Subject: userfaultfd: hugetlbfs: add hugetlb_mcopy_atomic_pte for userfaultfd
 support

hugetlb_mcopy_atomic_pte is the low level routine that implements the
userfaultfd UFFDIO_COPY command.  It is based on the existing
mcopy_atomic_pte routine with modifications for huge pages.

Link: http://lkml.kernel.org/r/20161216144821.5183-18-aarcange@redhat.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Michael Rapoport <RAPOPORT@il.ibm.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h |  7 +++++
 mm/hugetlb.c            | 81 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 88 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 48c76d612d40..aab2fff3e269 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -81,6 +81,11 @@ void hugetlb_show_meminfo(void);
 unsigned long hugetlb_total_pages(void);
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, unsigned int flags);
+int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
+				struct vm_area_struct *dst_vma,
+				unsigned long dst_addr,
+				unsigned long src_addr,
+				struct page **pagep);
 int hugetlb_reserve_pages(struct inode *inode, long from, long to,
 						struct vm_area_struct *vma,
 						vm_flags_t vm_flags);
@@ -149,6 +154,8 @@ static inline void hugetlb_show_meminfo(void)
 #define is_hugepage_only_range(mm, addr, len)	0
 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
 #define hugetlb_fault(mm, vma, addr, flags)	({ BUG(); 0; })
+#define hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
+				src_addr, pagep)	({ BUG(); 0; })
 #define huge_pte_offset(mm, address)	0
 static inline int dequeue_hwpoisoned_huge_page(struct page *page)
 {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c7025c132670..dec628b26f59 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3948,6 +3948,87 @@ out_mutex:
 	return ret;
 }
 
+/*
+ * Used by userfaultfd UFFDIO_COPY.  Based on mcopy_atomic_pte with
+ * modifications for huge pages.
+ */
+int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
+			    pte_t *dst_pte,
+			    struct vm_area_struct *dst_vma,
+			    unsigned long dst_addr,
+			    unsigned long src_addr,
+			    struct page **pagep)
+{
+	struct hstate *h = hstate_vma(dst_vma);
+	pte_t _dst_pte;
+	spinlock_t *ptl;
+	int ret;
+	struct page *page;
+
+	if (!*pagep) {
+		ret = -ENOMEM;
+		page = alloc_huge_page(dst_vma, dst_addr, 0);
+		if (IS_ERR(page))
+			goto out;
+
+		ret = copy_huge_page_from_user(page,
+						(const void __user *) src_addr,
+						pages_per_huge_page(h));
+
+		/* fallback to copy_from_user outside mmap_sem */
+		if (unlikely(ret)) {
+			ret = -EFAULT;
+			*pagep = page;
+			/* don't free the page */
+			goto out;
+		}
+	} else {
+		page = *pagep;
+		*pagep = NULL;
+	}
+
+	/*
+	 * The memory barrier inside __SetPageUptodate makes sure that
+	 * preceding stores to the page contents become visible before
+	 * the set_pte_at() write.
+	 */
+	__SetPageUptodate(page);
+	set_page_huge_active(page);
+
+	ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
+	spin_lock(ptl);
+
+	ret = -EEXIST;
+	if (!huge_pte_none(huge_ptep_get(dst_pte)))
+		goto out_release_unlock;
+
+	ClearPagePrivate(page);
+	hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
+
+	_dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
+	if (dst_vma->vm_flags & VM_WRITE)
+		_dst_pte = huge_pte_mkdirty(_dst_pte);
+	_dst_pte = pte_mkyoung(_dst_pte);
+
+	set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+	(void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
+					dst_vma->vm_flags & VM_WRITE);
+	hugetlb_count_add(pages_per_huge_page(h), dst_mm);
+
+	/* No need to invalidate - it was non-present before */
+	update_mmu_cache(dst_vma, dst_addr, dst_pte);
+
+	spin_unlock(ptl);
+	ret = 0;
+out:
+	return ret;
+out_release_unlock:
+	spin_unlock(ptl);
+	put_page(page);
+	goto out;
+}
+
 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			 struct page **pages, struct vm_area_struct **vmas,
 			 unsigned long *position, unsigned long *nr_pages,
-- 
cgit v1.2.3


From 810a56b943e265bbabfcd5a8e54cb8d3b16cd6e4 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 22 Feb 2017 15:42:58 -0800
Subject: userfaultfd: hugetlbfs: fix __mcopy_atomic_hugetlb retry/error
 processing

The new routine copy_huge_page_from_user() uses kmap_atomic() to map
PAGE_SIZE pages.  However, this prevents page faults in the subsequent
call to copy_from_user().  This is OK in the case where the routine is
copied with mmap_sema held.  However, in another case we want to allow
page faults.  So, add a new argument allow_pagefault to indicate if the
routine should allow page faults.

[dan.carpenter@oracle.com: unmap the correct pointer]
  Link: http://lkml.kernel.org/r/20170113082608.GA3548@mwanda
[akpm@linux-foundation.org: kunmap() takes a page*, per Hugh]
Link: http://lkml.kernel.org/r/20161216144821.5183-20-aarcange@redhat.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Michael Rapoport <RAPOPORT@il.ibm.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  3 ++-
 mm/hugetlb.c       |  2 +-
 mm/memory.c        | 13 ++++++++++---
 mm/userfaultfd.c   |  2 +-
 4 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6bc7f2c1a6d1..c3e2be2b3296 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2426,7 +2426,8 @@ extern void copy_user_huge_page(struct page *dst, struct page *src,
 				unsigned int pages_per_huge_page);
 extern long copy_huge_page_from_user(struct page *dst_page,
 				const void __user *usr_src,
-				unsigned int pages_per_huge_page);
+				unsigned int pages_per_huge_page,
+				bool allow_pagefault);
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
 
 extern struct page_ext_operations debug_guardpage_ops;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dec628b26f59..5d20af921a30 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3973,7 +3973,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 
 		ret = copy_huge_page_from_user(page,
 						(const void __user *) src_addr,
-						pages_per_huge_page(h));
+						pages_per_huge_page(h), false);
 
 		/* fallback to copy_from_user outside mmap_sem */
 		if (unlikely(ret)) {
diff --git a/mm/memory.c b/mm/memory.c
index 4ade940d105c..d7676a68c80a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4155,7 +4155,8 @@ void copy_user_huge_page(struct page *dst, struct page *src,
 
 long copy_huge_page_from_user(struct page *dst_page,
 				const void __user *usr_src,
-				unsigned int pages_per_huge_page)
+				unsigned int pages_per_huge_page,
+				bool allow_pagefault)
 {
 	void *src = (void *)usr_src;
 	void *page_kaddr;
@@ -4163,11 +4164,17 @@ long copy_huge_page_from_user(struct page *dst_page,
 	unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
 
 	for (i = 0; i < pages_per_huge_page; i++) {
-		page_kaddr = kmap_atomic(dst_page + i);
+		if (allow_pagefault)
+			page_kaddr = kmap(dst_page + i);
+		else
+			page_kaddr = kmap_atomic(dst_page + i);
 		rc = copy_from_user(page_kaddr,
 				(const void __user *)(src + i * PAGE_SIZE),
 				PAGE_SIZE);
-		kunmap_atomic(page_kaddr);
+		if (allow_pagefault)
+			kunmap(dst_page + i);
+		else
+			kunmap_atomic(page_kaddr);
 
 		ret_val -= (PAGE_SIZE - rc);
 		if (rc)
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index ef0495bfd17a..09976745be23 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -274,7 +274,7 @@ retry:
 
 			err = copy_huge_page_from_user(page,
 						(const void __user *)src_addr,
-						pages_per_huge_page(h));
+						pages_per_huge_page(h), true);
 			if (unlikely(err)) {
 				err = -EFAULT;
 				goto out;
-- 
cgit v1.2.3


From 87ffc118b54dcd4cc642723603d944673248152f Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Wed, 22 Feb 2017 15:43:13 -0800
Subject: userfaultfd: hugetlbfs: gup: support VM_FAULT_RETRY

Add support for VM_FAULT_RETRY to follow_hugetlb_page() so that
get_user_pages_unlocked/locked and "nonblocking/FOLL_NOWAIT" features
will work on hugetlbfs.

This is required for fully functional userfaultfd non-present support on
hugetlbfs.

Link: http://lkml.kernel.org/r/20161216144821.5183-25-aarcange@redhat.com
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Michael Rapoport <RAPOPORT@il.ibm.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h |  5 +++--
 mm/gup.c                |  2 +-
 mm/hugetlb.c            | 48 ++++++++++++++++++++++++++++++++++++++++--------
 3 files changed, 44 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index aab2fff3e269..503099d8aada 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -65,7 +65,8 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int,
 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
 long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
 			 struct page **, struct vm_area_struct **,
-			 unsigned long *, unsigned long *, long, unsigned int);
+			 unsigned long *, unsigned long *, long, unsigned int,
+			 int *);
 void unmap_hugepage_range(struct vm_area_struct *,
 			  unsigned long, unsigned long, struct page *);
 void __unmap_hugepage_range_final(struct mmu_gather *tlb,
@@ -136,7 +137,7 @@ static inline unsigned long hugetlb_total_pages(void)
 	return 0;
 }
 
-#define follow_hugetlb_page(m,v,p,vs,a,b,i,w)	({ BUG(); 0; })
+#define follow_hugetlb_page(m,v,p,vs,a,b,i,w,n)	({ BUG(); 0; })
 #define follow_huge_addr(mm, addr, write)	ERR_PTR(-EINVAL)
 #define copy_hugetlb_page_range(src, dst, vma)	({ BUG(); 0; })
 static inline void hugetlb_report_meminfo(struct seq_file *m)
diff --git a/mm/gup.c b/mm/gup.c
index 55315555489d..40abe4c90383 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -572,7 +572,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			if (is_vm_hugetlb_page(vma)) {
 				i = follow_hugetlb_page(mm, vma, pages, vmas,
 						&start, &nr_pages, i,
-						gup_flags);
+						gup_flags, nonblocking);
 				continue;
 			}
 		}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a4b29054cc3f..f6c7ff316daf 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4065,7 +4065,7 @@ out_release_unlock:
 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			 struct page **pages, struct vm_area_struct **vmas,
 			 unsigned long *position, unsigned long *nr_pages,
-			 long i, unsigned int flags)
+			 long i, unsigned int flags, int *nonblocking)
 {
 	unsigned long pfn_offset;
 	unsigned long vaddr = *position;
@@ -4128,16 +4128,43 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		    ((flags & FOLL_WRITE) &&
 		      !huge_pte_write(huge_ptep_get(pte)))) {
 			int ret;
+			unsigned int fault_flags = 0;
 
 			if (pte)
 				spin_unlock(ptl);
-			ret = hugetlb_fault(mm, vma, vaddr,
-				(flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
-			if (!(ret & VM_FAULT_ERROR))
-				continue;
-
-			remainder = 0;
-			break;
+			if (flags & FOLL_WRITE)
+				fault_flags |= FAULT_FLAG_WRITE;
+			if (nonblocking)
+				fault_flags |= FAULT_FLAG_ALLOW_RETRY;
+			if (flags & FOLL_NOWAIT)
+				fault_flags |= FAULT_FLAG_ALLOW_RETRY |
+					FAULT_FLAG_RETRY_NOWAIT;
+			if (flags & FOLL_TRIED) {
+				VM_WARN_ON_ONCE(fault_flags &
+						FAULT_FLAG_ALLOW_RETRY);
+				fault_flags |= FAULT_FLAG_TRIED;
+			}
+			ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
+			if (ret & VM_FAULT_ERROR) {
+				remainder = 0;
+				break;
+			}
+			if (ret & VM_FAULT_RETRY) {
+				if (nonblocking)
+					*nonblocking = 0;
+				*nr_pages = 0;
+				/*
+				 * VM_FAULT_RETRY must not return an
+				 * error, it will return zero
+				 * instead.
+				 *
+				 * No need to update "position" as the
+				 * caller will not check it after
+				 * *nr_pages is set to 0.
+				 */
+				return i;
+			}
+			continue;
 		}
 
 		pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
@@ -4166,6 +4193,11 @@ same_page:
 		spin_unlock(ptl);
 	}
 	*nr_pages = remainder;
+	/*
+	 * setting position is actually required only if remainder is
+	 * not zero but it's faster not to add a "if (remainder)"
+	 * branch.
+	 */
 	*position = vaddr;
 
 	return i ? i : -EFAULT;
-- 
cgit v1.2.3


From 4c27fe4c4c84f3afd504ecff2420cc1ad420d38e Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.vnet.ibm.com>
Date: Wed, 22 Feb 2017 15:43:25 -0800
Subject: userfaultfd: shmem: add shmem_mcopy_atomic_pte for userfaultfd
 support

shmem_mcopy_atomic_pte is the low level routine that implements the
userfaultfd UFFDIO_COPY command.  It is based on the existing
mcopy_atomic_pte routine with modifications for shared memory pages.

Link: http://lkml.kernel.org/r/20161216144821.5183-29-aarcange@redhat.com
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Michael Rapoport <RAPOPORT@il.ibm.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/shmem_fs.h |  11 +++++
 mm/shmem.c               | 110 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 121 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index ff078e7043b6..fdaac9d4d46d 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -124,4 +124,15 @@ static inline bool shmem_huge_enabled(struct vm_area_struct *vma)
 }
 #endif
 
+#ifdef CONFIG_SHMEM
+extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+				  struct vm_area_struct *dst_vma,
+				  unsigned long dst_addr,
+				  unsigned long src_addr,
+				  struct page **pagep);
+#else
+#define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
+			       src_addr, pagep)        ({ BUG(); 0; })
+#endif
+
 #endif
diff --git a/mm/shmem.c b/mm/shmem.c
index 7d52cd4b504d..14de2a9e5083 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -70,6 +70,7 @@ static struct vfsmount *shm_mnt;
 #include <linux/syscalls.h>
 #include <linux/fcntl.h>
 #include <uapi/linux/memfd.h>
+#include <linux/rmap.h>
 
 #include <linux/uaccess.h>
 #include <asm/pgtable.h>
@@ -2178,6 +2179,115 @@ bool shmem_mapping(struct address_space *mapping)
 	return mapping->a_ops == &shmem_aops;
 }
 
+int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
+			   pmd_t *dst_pmd,
+			   struct vm_area_struct *dst_vma,
+			   unsigned long dst_addr,
+			   unsigned long src_addr,
+			   struct page **pagep)
+{
+	struct inode *inode = file_inode(dst_vma->vm_file);
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+	struct address_space *mapping = inode->i_mapping;
+	gfp_t gfp = mapping_gfp_mask(mapping);
+	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
+	struct mem_cgroup *memcg;
+	spinlock_t *ptl;
+	void *page_kaddr;
+	struct page *page;
+	pte_t _dst_pte, *dst_pte;
+	int ret;
+
+	if (!*pagep) {
+		ret = -ENOMEM;
+		if (shmem_acct_block(info->flags, 1))
+			goto out;
+		if (sbinfo->max_blocks) {
+			if (percpu_counter_compare(&sbinfo->used_blocks,
+						   sbinfo->max_blocks) >= 0)
+				goto out_unacct_blocks;
+			percpu_counter_inc(&sbinfo->used_blocks);
+		}
+
+		page = shmem_alloc_page(gfp, info, pgoff);
+		if (!page)
+			goto out_dec_used_blocks;
+
+		page_kaddr = kmap_atomic(page);
+		ret = copy_from_user(page_kaddr, (const void __user *)src_addr,
+				     PAGE_SIZE);
+		kunmap_atomic(page_kaddr);
+
+		/* fallback to copy_from_user outside mmap_sem */
+		if (unlikely(ret)) {
+			*pagep = page;
+			/* don't free the page */
+			return -EFAULT;
+		}
+	} else {
+		page = *pagep;
+		*pagep = NULL;
+	}
+
+	ret = mem_cgroup_try_charge(page, dst_mm, gfp, &memcg, false);
+	if (ret)
+		goto out_release;
+
+	ret = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
+	if (!ret) {
+		ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL);
+		radix_tree_preload_end();
+	}
+	if (ret)
+		goto out_release_uncharge;
+
+	mem_cgroup_commit_charge(page, memcg, false, false);
+
+	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
+	if (dst_vma->vm_flags & VM_WRITE)
+		_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
+
+	ret = -EEXIST;
+	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+	if (!pte_none(*dst_pte))
+		goto out_release_uncharge_unlock;
+
+	__SetPageUptodate(page);
+
+	lru_cache_add_anon(page);
+
+	spin_lock(&info->lock);
+	info->alloced++;
+	inode->i_blocks += BLOCKS_PER_PAGE;
+	shmem_recalc_inode(inode);
+	spin_unlock(&info->lock);
+
+	inc_mm_counter(dst_mm, mm_counter_file(page));
+	page_add_file_rmap(page, false);
+	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+	/* No need to invalidate - it was non-present before */
+	update_mmu_cache(dst_vma, dst_addr, dst_pte);
+	unlock_page(page);
+	pte_unmap_unlock(dst_pte, ptl);
+	ret = 0;
+out:
+	return ret;
+out_release_uncharge_unlock:
+	pte_unmap_unlock(dst_pte, ptl);
+out_release_uncharge:
+	mem_cgroup_cancel_charge(page, memcg, false);
+out_release:
+	put_page(page);
+out_dec_used_blocks:
+	if (sbinfo->max_blocks)
+		percpu_counter_add(&sbinfo->used_blocks, -1);
+out_unacct_blocks:
+	shmem_unacct_blocks(info->flags, 1);
+	goto out;
+}
+
 #ifdef CONFIG_TMPFS
 static const struct inode_operations shmem_symlink_inode_operations;
 static const struct inode_operations shmem_short_symlink_operations;
-- 
cgit v1.2.3


From b0506e488da5cf2f07f3a4f6d7acaa8f459ad714 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.vnet.ibm.com>
Date: Wed, 22 Feb 2017 15:43:28 -0800
Subject: userfaultfd: shmem: introduce vma_is_shmem

Currently userfault relies on vma_is_anonymous and vma_is_hugetlb to
ensure compatibility of a VMA with userfault.  Introduction of
vma_is_shmem allows detection if tmpfs backed VMAs, so that they may be
used with userfaultfd.  Current implementation presumes usage of
vma_is_shmem only by slow path routines in userfaultfd, therefore the
vma_is_shmem is not made inline to leave the few remaining free bits in
vm_flags.

Link: http://lkml.kernel.org/r/20161216144821.5183-30-aarcange@redhat.com
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Michael Rapoport <RAPOPORT@il.ibm.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 10 ++++++++++
 mm/shmem.c         |  5 +++++
 2 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c3e2be2b3296..bb997493e15d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1383,6 +1383,16 @@ static inline bool vma_is_anonymous(struct vm_area_struct *vma)
 	return !vma->vm_ops;
 }
 
+#ifdef CONFIG_SHMEM
+/*
+ * The vma_is_shmem is not inline because it is used only by slow
+ * paths in userfault.
+ */
+bool vma_is_shmem(struct vm_area_struct *vma);
+#else
+static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; }
+#endif
+
 static inline int stack_guard_page_start(struct vm_area_struct *vma,
 					     unsigned long addr)
 {
diff --git a/mm/shmem.c b/mm/shmem.c
index 14de2a9e5083..8412baec17cd 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -191,6 +191,11 @@ static const struct inode_operations shmem_special_inode_operations;
 static const struct vm_operations_struct shmem_vm_ops;
 static struct file_system_type shmem_fs_type;
 
+bool vma_is_shmem(struct vm_area_struct *vma)
+{
+	return vma->vm_ops == &shmem_vm_ops;
+}
+
 static LIST_HEAD(shmem_swaplist);
 static DEFINE_MUTEX(shmem_swaplist_mutex);
 
-- 
cgit v1.2.3


From 74d81bfae8e3f52e956367f6ed764db269b87091 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 22 Feb 2017 15:44:41 -0800
Subject: mm: un-export wake_up_page functions

These are no longer used outside mm/filemap.c, so un-export them and
make them static where possible.  These were exported specifically for
NFS use in commit a4796e37c12e ("MM: export page_wakeup functions").

Link: http://lkml.kernel.org/r/20170103182234.30141-3-npiggin@gmail.com
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Cc: Trond Myklebust <trond.myklebust@primarydata.com>
Cc: Anna Schumaker <anna.schumaker@netapp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h | 12 ++----------
 mm/filemap.c            | 10 ++++++++--
 2 files changed, 10 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 324c8dbad1e1..b572f5530392 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -482,19 +482,11 @@ static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm,
 }
 
 /*
- * This is exported only for wait_on_page_locked/wait_on_page_writeback,
- * and for filesystems which need to wait on PG_private.
+ * This is exported only for wait_on_page_locked/wait_on_page_writeback, etc.,
+ * and should not be used directly.
  */
 extern void wait_on_page_bit(struct page *page, int bit_nr);
 extern int wait_on_page_bit_killable(struct page *page, int bit_nr);
-extern void wake_up_page_bit(struct page *page, int bit_nr);
-
-static inline void wake_up_page(struct page *page, int bit)
-{
-	if (!PageWaiters(page))
-		return;
-	wake_up_page_bit(page, bit);
-}
 
 /* 
  * Wait for a page to be unlocked.
diff --git a/mm/filemap.c b/mm/filemap.c
index 3f9afded581b..a3dbe9eff213 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -788,7 +788,7 @@ static int wake_page_function(wait_queue_t *wait, unsigned mode, int sync, void
 	return autoremove_wake_function(wait, mode, sync, key);
 }
 
-void wake_up_page_bit(struct page *page, int bit_nr)
+static void wake_up_page_bit(struct page *page, int bit_nr)
 {
 	wait_queue_head_t *q = page_waitqueue(page);
 	struct wait_page_key key;
@@ -821,7 +821,13 @@ void wake_up_page_bit(struct page *page, int bit_nr)
 	}
 	spin_unlock_irqrestore(&q->lock, flags);
 }
-EXPORT_SYMBOL(wake_up_page_bit);
+
+static void wake_up_page(struct page *page, int bit)
+{
+	if (!PageWaiters(page))
+		return;
+	wake_up_page_bit(page, bit);
+}
 
 static inline int wait_on_page_bit_common(wait_queue_head_t *q,
 		struct page *page, int bit_nr, int state, bool lock)
-- 
cgit v1.2.3


From 7f354a548d1cb6bb01b6ee74aee9264aa152f1ec Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Wed, 22 Feb 2017 15:44:50 -0800
Subject: mm, compaction: add vmstats for kcompactd work

A "compact_daemon_wake" vmstat exists that represents the number of
times kcompactd has woken up.  This doesn't represent how much work it
actually did, though.

It's useful to understand how much compaction work is being done by
kcompactd versus other methods such as direct compaction and explicitly
triggered per-node (or system) compaction.

This adds two new vmstats: "compact_daemon_migrate_scanned" and
"compact_daemon_free_scanned" to represent the number of pages kcompactd
has scanned as part of its migration scanner and freeing scanner,
respectively.

These values are still accounted for in the general
"compact_migrate_scanned" and "compact_free_scanned" for compatibility.

It could be argued that explicitly triggered compaction could also be
tracked separately, and that could be added if others find it useful.

Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1612071749390.69852@chino.kir.corp.google.com
Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vm_event_item.h |  1 +
 mm/compaction.c               | 22 +++++++++++++++++++---
 mm/internal.h                 |  2 ++
 mm/vmstat.c                   |  2 ++
 4 files changed, 24 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 4d6ec58a8d45..6aa1b6cb5828 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -56,6 +56,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		COMPACTISOLATED,
 		COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS,
 		KCOMPACTD_WAKE,
+		KCOMPACTD_MIGRATE_SCANNED, KCOMPACTD_FREE_SCANNED,
 #endif
 #ifdef CONFIG_HUGETLB_PAGE
 		HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL,
diff --git a/mm/compaction.c b/mm/compaction.c
index 949198d01260..c6178bbd3e04 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -548,7 +548,7 @@ isolate_fail:
 	if (blockpfn == end_pfn)
 		update_pageblock_skip(cc, valid_page, total_isolated, false);
 
-	count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
+	cc->total_free_scanned += nr_scanned;
 	if (total_isolated)
 		count_compact_events(COMPACTISOLATED, total_isolated);
 	return total_isolated;
@@ -931,7 +931,7 @@ isolate_fail:
 	trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
 						nr_scanned, nr_isolated);
 
-	count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
+	cc->total_migrate_scanned += nr_scanned;
 	if (nr_isolated)
 		count_compact_events(COMPACTISOLATED, nr_isolated);
 
@@ -1631,6 +1631,9 @@ out:
 			zone->compact_cached_free_pfn = free_pfn;
 	}
 
+	count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned);
+	count_compact_events(COMPACTFREE_SCANNED, cc->total_free_scanned);
+
 	trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
 				cc->free_pfn, end_pfn, sync, ret);
 
@@ -1645,6 +1648,8 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
 	struct compact_control cc = {
 		.nr_freepages = 0,
 		.nr_migratepages = 0,
+		.total_migrate_scanned = 0,
+		.total_free_scanned = 0,
 		.order = order,
 		.gfp_mask = gfp_mask,
 		.zone = zone,
@@ -1757,6 +1762,8 @@ static void compact_node(int nid)
 	struct zone *zone;
 	struct compact_control cc = {
 		.order = -1,
+		.total_migrate_scanned = 0,
+		.total_free_scanned = 0,
 		.mode = MIGRATE_SYNC,
 		.ignore_skip_hint = true,
 		.whole_zone = true,
@@ -1883,6 +1890,8 @@ static void kcompactd_do_work(pg_data_t *pgdat)
 	struct zone *zone;
 	struct compact_control cc = {
 		.order = pgdat->kcompactd_max_order,
+		.total_migrate_scanned = 0,
+		.total_free_scanned = 0,
 		.classzone_idx = pgdat->kcompactd_classzone_idx,
 		.mode = MIGRATE_SYNC_LIGHT,
 		.ignore_skip_hint = true,
@@ -1891,7 +1900,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
 	};
 	trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
 							cc.classzone_idx);
-	count_vm_event(KCOMPACTD_WAKE);
+	count_compact_event(KCOMPACTD_WAKE);
 
 	for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) {
 		int status;
@@ -1909,6 +1918,8 @@ static void kcompactd_do_work(pg_data_t *pgdat)
 
 		cc.nr_freepages = 0;
 		cc.nr_migratepages = 0;
+		cc.total_migrate_scanned = 0;
+		cc.total_free_scanned = 0;
 		cc.zone = zone;
 		INIT_LIST_HEAD(&cc.freepages);
 		INIT_LIST_HEAD(&cc.migratepages);
@@ -1927,6 +1938,11 @@ static void kcompactd_do_work(pg_data_t *pgdat)
 			defer_compaction(zone, cc.order);
 		}
 
+		count_compact_events(KCOMPACTD_MIGRATE_SCANNED,
+				     cc.total_migrate_scanned);
+		count_compact_events(KCOMPACTD_FREE_SCANNED,
+				     cc.total_free_scanned);
+
 		VM_BUG_ON(!list_empty(&cc.freepages));
 		VM_BUG_ON(!list_empty(&cc.migratepages));
 	}
diff --git a/mm/internal.h b/mm/internal.h
index bfad3b5d2665..9ad04fc6eefe 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -175,6 +175,8 @@ struct compact_control {
 	struct list_head migratepages;	/* List of pages being migrated */
 	unsigned long nr_freepages;	/* Number of isolated free pages */
 	unsigned long nr_migratepages;	/* Number of pages to migrate */
+	unsigned long total_migrate_scanned;
+	unsigned long total_free_scanned;
 	unsigned long free_pfn;		/* isolate_freepages search base */
 	unsigned long migrate_pfn;	/* isolate_migratepages search base */
 	unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7c28df36f50f..69f9aff39a2e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1038,6 +1038,8 @@ const char * const vmstat_text[] = {
 	"compact_fail",
 	"compact_success",
 	"compact_daemon_wake",
+	"compact_daemon_migrate_scanned",
+	"compact_daemon_free_scanned",
 #endif
 
 #ifdef CONFIG_HUGETLB_PAGE
-- 
cgit v1.2.3


From b92df1de5d289c0b5d653e72414bf0850b8511e0 Mon Sep 17 00:00:00 2001
From: Paul Burton <paul.burton@imgtec.com>
Date: Wed, 22 Feb 2017 15:44:53 -0800
Subject: mm: page_alloc: skip over regions of invalid pfns where possible

When using a sparse memory model memmap_init_zone() when invoked with
the MEMMAP_EARLY context will skip over pages which aren't valid - ie.
which aren't in a populated region of the sparse memory map.  However if
the memory map is extremely sparse then it can spend a long time
linearly checking each PFN in a large non-populated region of the memory
map & skipping it in turn.

When CONFIG_HAVE_MEMBLOCK_NODE_MAP is enabled, we have sufficient
information to quickly discover the next valid PFN given an invalid one
by searching through the list of memory regions & skipping forwards to
the first PFN covered by the memory region to the right of the
non-populated region.  Implement this in order to speed up
memmap_init_zone() for systems with extremely sparse memory maps.

James said "I have tested this patch on a virtual model of a Samurai CPU
with a sparse memory map.  The kernel boot time drops from 109 to
62 seconds. "

Link: http://lkml.kernel.org/r/20161125185518.29885-1-paul.burton@imgtec.com
Signed-off-by: Paul Burton <paul.burton@imgtec.com>
Tested-by: James Hartley <james.hartley@imgtec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h |  1 +
 mm/memblock.c            | 25 +++++++++++++++++++++++++
 mm/page_alloc.c          | 11 ++++++++++-
 3 files changed, 36 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 5b759c9acf97..38bcf00cbed3 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -203,6 +203,7 @@ int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
 			    unsigned long  *end_pfn);
 void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
 			  unsigned long *out_end_pfn, int *out_nid);
+unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn);
 
 /**
  * for_each_mem_pfn_range - early memory pfn range iterator
diff --git a/mm/memblock.c b/mm/memblock.c
index 7608bc305936..a476d28e0733 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1105,6 +1105,31 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
 		*out_nid = r->nid;
 }
 
+unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn,
+						      unsigned long max_pfn)
+{
+	struct memblock_type *type = &memblock.memory;
+	unsigned int right = type->cnt;
+	unsigned int mid, left = 0;
+	phys_addr_t addr = PFN_PHYS(pfn + 1);
+
+	do {
+		mid = (right + left) / 2;
+
+		if (addr < type->regions[mid].base)
+			right = mid;
+		else if (addr >= (type->regions[mid].base +
+				  type->regions[mid].size))
+			left = mid + 1;
+		else {
+			/* addr is within the region, so pfn + 1 is valid */
+			return min(pfn + 1, max_pfn);
+		}
+	} while (left < right);
+
+	return min(PHYS_PFN(type->regions[right].base), max_pfn);
+}
+
 /**
  * memblock_set_node - set node ID on memblock regions
  * @base: base of area to set node ID for
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 05c0a59323bd..6da3169d3750 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5103,8 +5103,17 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		if (context != MEMMAP_EARLY)
 			goto not_early;
 
-		if (!early_pfn_valid(pfn))
+		if (!early_pfn_valid(pfn)) {
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+			/*
+			 * Skip to the pfn preceding the next valid one (or
+			 * end_pfn), such that we hit a valid pfn (or end_pfn)
+			 * on our next iteration of the loop.
+			 */
+			pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1;
+#endif
 			continue;
+		}
 		if (!early_pfn_in_nid(pfn, nid))
 			continue;
 		if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
-- 
cgit v1.2.3


From 16e72e9b30986ee15f17fbb68189ca842c32af58 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <dvlasenk@redhat.com>
Date: Wed, 22 Feb 2017 15:45:16 -0800
Subject: powerpc: do not make the entire heap executable

On 32-bit powerpc the ELF PLT sections of binaries (built with
--bss-plt, or with a toolchain which defaults to it) look like this:

  [17] .sbss             NOBITS          0002aff8 01aff8 000014 00  WA  0   0  4
  [18] .plt              NOBITS          0002b00c 01aff8 000084 00 WAX  0   0  4
  [19] .bss              NOBITS          0002b090 01aff8 0000a4 00  WA  0   0  4

Which results in an ELF load header:

  Type           Offset   VirtAddr   PhysAddr   FileSiz MemSiz  Flg Align
  LOAD           0x019c70 0x00029c70 0x00029c70 0x01388 0x014c4 RWE 0x10000

This is all correct, the load region containing the PLT is marked as
executable.  Note that the PLT starts at 0002b00c but the file mapping
ends at 0002aff8, so the PLT falls in the 0 fill section described by
the load header, and after a page boundary.

Unfortunately the generic ELF loader ignores the X bit in the load
headers when it creates the 0 filled non-file backed mappings.  It
assumes all of these mappings are RW BSS sections, which is not the case
for PPC.

gcc/ld has an option (--secure-plt) to not do this, this is said to
incur a small performance penalty.

Currently, to support 32-bit binaries with PLT in BSS kernel maps
*entire brk area* with executable rights for all binaries, even
--secure-plt ones.

Stop doing that.

Teach the ELF loader to check the X bit in the relevant load header and
create 0 filled anonymous mappings that are executable if the load
header requests that.

Test program showing the difference in /proc/$PID/maps:

int main() {
	char buf[16*1024];
	char *p = malloc(123); /* make "[heap]" mapping appear */
	int fd = open("/proc/self/maps", O_RDONLY);
	int len = read(fd, buf, sizeof(buf));
	write(1, buf, len);
	printf("%p\n", p);
	return 0;
}

Compiled using: gcc -mbss-plt -m32 -Os test.c -otest

Unpatched ppc64 kernel:
00100000-00120000 r-xp 00000000 00:00 0                                  [vdso]
0fe10000-0ffd0000 r-xp 00000000 fd:00 67898094                           /usr/lib/libc-2.17.so
0ffd0000-0ffe0000 r--p 001b0000 fd:00 67898094                           /usr/lib/libc-2.17.so
0ffe0000-0fff0000 rw-p 001c0000 fd:00 67898094                           /usr/lib/libc-2.17.so
10000000-10010000 r-xp 00000000 fd:00 100674505                          /home/user/test
10010000-10020000 r--p 00000000 fd:00 100674505                          /home/user/test
10020000-10030000 rw-p 00010000 fd:00 100674505                          /home/user/test
10690000-106c0000 rwxp 00000000 00:00 0                                  [heap]
f7f70000-f7fa0000 r-xp 00000000 fd:00 67898089                           /usr/lib/ld-2.17.so
f7fa0000-f7fb0000 r--p 00020000 fd:00 67898089                           /usr/lib/ld-2.17.so
f7fb0000-f7fc0000 rw-p 00030000 fd:00 67898089                           /usr/lib/ld-2.17.so
ffa90000-ffac0000 rw-p 00000000 00:00 0                                  [stack]
0x10690008

Patched ppc64 kernel:
00100000-00120000 r-xp 00000000 00:00 0                                  [vdso]
0fe10000-0ffd0000 r-xp 00000000 fd:00 67898094                           /usr/lib/libc-2.17.so
0ffd0000-0ffe0000 r--p 001b0000 fd:00 67898094                           /usr/lib/libc-2.17.so
0ffe0000-0fff0000 rw-p 001c0000 fd:00 67898094                           /usr/lib/libc-2.17.so
10000000-10010000 r-xp 00000000 fd:00 100674505                          /home/user/test
10010000-10020000 r--p 00000000 fd:00 100674505                          /home/user/test
10020000-10030000 rw-p 00010000 fd:00 100674505                          /home/user/test
10180000-101b0000 rw-p 00000000 00:00 0                                  [heap]
                  ^^^^ this has changed
f7c60000-f7c90000 r-xp 00000000 fd:00 67898089                           /usr/lib/ld-2.17.so
f7c90000-f7ca0000 r--p 00020000 fd:00 67898089                           /usr/lib/ld-2.17.so
f7ca0000-f7cb0000 rw-p 00030000 fd:00 67898089                           /usr/lib/ld-2.17.so
ff860000-ff890000 rw-p 00000000 00:00 0                                  [stack]
0x10180008

The patch was originally posted in 2012 by Jason Gunthorpe
and apparently ignored:

https://lkml.org/lkml/2012/9/30/138

Lightly run-tested.

Link: http://lkml.kernel.org/r/20161215131950.23054-1-dvlasenk@redhat.com
Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Tested-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Florian Weimer <fweimer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/include/asm/page.h |  4 +++-
 fs/binfmt_elf.c                 | 30 ++++++++++++++++++++++--------
 include/linux/mm.h              |  1 +
 mm/mmap.c                       | 24 +++++++++++++++++++-----
 4 files changed, 45 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index 47120bf2670c..2a32483c7b6c 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -230,7 +230,9 @@ extern long long virt_phys_offset;
  * and needs to be executable.  This means the whole heap ends
  * up being executable.
  */
-#define VM_DATA_DEFAULT_FLAGS32	(VM_READ | VM_WRITE | VM_EXEC | \
+#define VM_DATA_DEFAULT_FLAGS32 \
+	(((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) | \
+				 VM_READ | VM_WRITE | \
 				 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
 
 #define VM_DATA_DEFAULT_FLAGS64	(VM_READ | VM_WRITE | \
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index e7bf01373bc4..443a6f537d56 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -91,12 +91,18 @@ static struct linux_binfmt elf_format = {
 
 #define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
 
-static int set_brk(unsigned long start, unsigned long end)
+static int set_brk(unsigned long start, unsigned long end, int prot)
 {
 	start = ELF_PAGEALIGN(start);
 	end = ELF_PAGEALIGN(end);
 	if (end > start) {
-		int error = vm_brk(start, end - start);
+		/*
+		 * Map the last of the bss segment.
+		 * If the header is requesting these pages to be
+		 * executable, honour that (ppc32 needs this).
+		 */
+		int error = vm_brk_flags(start, end - start,
+				prot & PROT_EXEC ? VM_EXEC : 0);
 		if (error)
 			return error;
 	}
@@ -524,6 +530,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 	unsigned long load_addr = 0;
 	int load_addr_set = 0;
 	unsigned long last_bss = 0, elf_bss = 0;
+	int bss_prot = 0;
 	unsigned long error = ~0UL;
 	unsigned long total_size;
 	int i;
@@ -606,8 +613,10 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 			 * elf_bss and last_bss is the bss section.
 			 */
 			k = load_addr + eppnt->p_vaddr + eppnt->p_memsz;
-			if (k > last_bss)
+			if (k > last_bss) {
 				last_bss = k;
+				bss_prot = elf_prot;
+			}
 		}
 	}
 
@@ -623,13 +632,14 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 	/*
 	 * Next, align both the file and mem bss up to the page size,
 	 * since this is where elf_bss was just zeroed up to, and where
-	 * last_bss will end after the vm_brk() below.
+	 * last_bss will end after the vm_brk_flags() below.
 	 */
 	elf_bss = ELF_PAGEALIGN(elf_bss);
 	last_bss = ELF_PAGEALIGN(last_bss);
 	/* Finally, if there is still more bss to allocate, do it. */
 	if (last_bss > elf_bss) {
-		error = vm_brk(elf_bss, last_bss - elf_bss);
+		error = vm_brk_flags(elf_bss, last_bss - elf_bss,
+				bss_prot & PROT_EXEC ? VM_EXEC : 0);
 		if (error)
 			goto out;
 	}
@@ -674,6 +684,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
 	unsigned long error;
 	struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
 	unsigned long elf_bss, elf_brk;
+	int bss_prot = 0;
 	int retval, i;
 	unsigned long elf_entry;
 	unsigned long interp_load_addr = 0;
@@ -882,7 +893,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
 			   before this one. Map anonymous pages, if needed,
 			   and clear the area.  */
 			retval = set_brk(elf_bss + load_bias,
-					 elf_brk + load_bias);
+					 elf_brk + load_bias,
+					 bss_prot);
 			if (retval)
 				goto out_free_dentry;
 			nbyte = ELF_PAGEOFFSET(elf_bss);
@@ -976,8 +988,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
 		if (end_data < k)
 			end_data = k;
 		k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz;
-		if (k > elf_brk)
+		if (k > elf_brk) {
+			bss_prot = elf_prot;
 			elf_brk = k;
+		}
 	}
 
 	loc->elf_ex.e_entry += load_bias;
@@ -993,7 +1007,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
 	 * mapping in the interpreter, to make sure it doesn't wind
 	 * up getting placed where the bss needs to go.
 	 */
-	retval = set_brk(elf_bss, elf_brk);
+	retval = set_brk(elf_bss, elf_brk, bss_prot);
 	if (retval)
 		goto out_free_dentry;
 	if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bb997493e15d..dae6f58d67c8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2083,6 +2083,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len) {}
 
 /* These take the mm semaphore themselves */
 extern int __must_check vm_brk(unsigned long, unsigned long);
+extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long);
 extern int vm_munmap(unsigned long, size_t);
 extern unsigned long __must_check vm_mmap(struct file *, unsigned long,
         unsigned long, unsigned long,
diff --git a/mm/mmap.c b/mm/mmap.c
index dc4291dcc99b..b729084eea90 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2806,11 +2806,11 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
  *  anonymous maps.  eventually we may be able to do some
  *  brk-specific accounting here.
  */
-static int do_brk(unsigned long addr, unsigned long request)
+static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev;
-	unsigned long flags, len;
+	unsigned long len;
 	struct rb_node **rb_link, *rb_parent;
 	pgoff_t pgoff = addr >> PAGE_SHIFT;
 	int error;
@@ -2821,7 +2821,10 @@ static int do_brk(unsigned long addr, unsigned long request)
 	if (!len)
 		return 0;
 
-	flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
+	/* Until we need other flags, refuse anything except VM_EXEC. */
+	if ((flags & (~VM_EXEC)) != 0)
+		return -EINVAL;
+	flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
 
 	error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
 	if (offset_in_page(error))
@@ -2889,7 +2892,12 @@ out:
 	return 0;
 }
 
-int vm_brk(unsigned long addr, unsigned long len)
+static int do_brk(unsigned long addr, unsigned long len)
+{
+	return do_brk_flags(addr, len, 0);
+}
+
+int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags)
 {
 	struct mm_struct *mm = current->mm;
 	int ret;
@@ -2898,13 +2906,19 @@ int vm_brk(unsigned long addr, unsigned long len)
 	if (down_write_killable(&mm->mmap_sem))
 		return -EINTR;
 
-	ret = do_brk(addr, len);
+	ret = do_brk_flags(addr, len, flags);
 	populate = ((mm->def_flags & VM_LOCKED) != 0);
 	up_write(&mm->mmap_sem);
 	if (populate && !ret)
 		mm_populate(addr, len);
 	return ret;
 }
+EXPORT_SYMBOL(vm_brk_flags);
+
+int vm_brk(unsigned long addr, unsigned long len)
+{
+	return vm_brk_flags(addr, len, 0);
+}
 EXPORT_SYMBOL(vm_brk);
 
 /* Release all mmaps. */
-- 
cgit v1.2.3


From 235b62176712b970c815923e36b9a9cc05d4d901 Mon Sep 17 00:00:00 2001
From: "Huang, Ying" <ying.huang@intel.com>
Date: Wed, 22 Feb 2017 15:45:22 -0800
Subject: mm/swap: add cluster lock

This patch is to reduce the lock contention of swap_info_struct->lock
via using a more fine grained lock in swap_cluster_info for some swap
operations.  swap_info_struct->lock is heavily contended if multiple
processes reclaim pages simultaneously.  Because there is only one lock
for each swap device.  While in common configuration, there is only one
or several swap devices in the system.  The lock protects almost all
swap related operations.

In fact, many swap operations only access one element of
swap_info_struct->swap_map array.  And there is no dependency between
different elements of swap_info_struct->swap_map.  So a fine grained
lock can be used to allow parallel access to the different elements of
swap_info_struct->swap_map.

In this patch, a spinlock is added to swap_cluster_info to protect the
elements of swap_info_struct->swap_map in the swap cluster and the
fields of swap_cluster_info.  This reduced locking contention for
swap_info_struct->swap_map access greatly.

Because of the added spinlock, the size of swap_cluster_info increases
from 4 bytes to 8 bytes on the 64 bit and 32 bit system.  This will use
additional 4k RAM for every 1G swap space.

Because the size of swap_cluster_info is much smaller than the size of
the cache line (8 vs 64 on x86_64 architecture), there may be false
cache line sharing between spinlocks in swap_cluster_info.  To avoid the
false sharing in the first round of the swap cluster allocation, the
order of the swap clusters in the free clusters list is changed.  So
that, the swap_cluster_info sharing the same cache line will be placed
as far as possible.  After the first round of allocation, the order of
the clusters in free clusters list is expected to be random.  So the
false sharing should be not serious.

Compared with a previous implementation using bit_spin_lock, the
sequential swap out throughput improved about 3.2%.  Test was done on a
Xeon E5 v3 system.  The swap device used is a RAM simulated PMEM
(persistent memory) device.  To test the sequential swapping out, the
test case created 32 processes, which sequentially allocate and write to
the anonymous pages until the RAM and part of the swap device is used.

[ying.huang@intel.com: v5]
  Link: http://lkml.kernel.org/r/878tqeuuic.fsf_-_@yhuang-dev.intel.com
[minchan@kernel.org: initialize spinlock for swap_cluster_info]
  Link: http://lkml.kernel.org/r/1486434945-29753-1-git-send-email-minchan@kernel.org
[hughd@google.com: annotate nested locking for cluster lock]
  Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1702161050540.21773@eggly.anvils
Link: http://lkml.kernel.org/r/dbb860bbd825b1aaba18988015e8963f263c3f0d.1484082593.git.tim.c.chen@linux.intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net> escreveu:
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h |   6 ++
 mm/swapfile.c        | 215 +++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 179 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7f47b7098b1b..279c7b84bd63 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -176,6 +176,12 @@ enum {
  * protected by swap_info_struct.lock.
  */
 struct swap_cluster_info {
+	spinlock_t lock;	/*
+				 * Protect swap_cluster_info fields
+				 * and swap_info_struct->swap_map
+				 * elements correspond to the swap
+				 * cluster
+				 */
 	unsigned int data:24;
 	unsigned int flags:8;
 };
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2001ce427a1d..eb71b5d9430b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -257,6 +257,47 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
 	info->data = 0;
 }
 
+static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
+						     unsigned long offset)
+{
+	struct swap_cluster_info *ci;
+
+	ci = si->cluster_info;
+	if (ci) {
+		ci += offset / SWAPFILE_CLUSTER;
+		spin_lock(&ci->lock);
+	}
+	return ci;
+}
+
+static inline void unlock_cluster(struct swap_cluster_info *ci)
+{
+	if (ci)
+		spin_unlock(&ci->lock);
+}
+
+static inline struct swap_cluster_info *lock_cluster_or_swap_info(
+	struct swap_info_struct *si,
+	unsigned long offset)
+{
+	struct swap_cluster_info *ci;
+
+	ci = lock_cluster(si, offset);
+	if (!ci)
+		spin_lock(&si->lock);
+
+	return ci;
+}
+
+static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
+					       struct swap_cluster_info *ci)
+{
+	if (ci)
+		unlock_cluster(ci);
+	else
+		spin_unlock(&si->lock);
+}
+
 static inline bool cluster_list_empty(struct swap_cluster_list *list)
 {
 	return cluster_is_null(&list->head);
@@ -281,9 +322,17 @@ static void cluster_list_add_tail(struct swap_cluster_list *list,
 		cluster_set_next_flag(&list->head, idx, 0);
 		cluster_set_next_flag(&list->tail, idx, 0);
 	} else {
+		struct swap_cluster_info *ci_tail;
 		unsigned int tail = cluster_next(&list->tail);
 
-		cluster_set_next(&ci[tail], idx);
+		/*
+		 * Nested cluster lock, but both cluster locks are
+		 * only acquired when we held swap_info_struct->lock
+		 */
+		ci_tail = ci + tail;
+		spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
+		cluster_set_next(ci_tail, idx);
+		unlock_cluster(ci_tail);
 		cluster_set_next_flag(&list->tail, idx, 0);
 	}
 }
@@ -328,7 +377,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
 */
 static void swap_do_scheduled_discard(struct swap_info_struct *si)
 {
-	struct swap_cluster_info *info;
+	struct swap_cluster_info *info, *ci;
 	unsigned int idx;
 
 	info = si->cluster_info;
@@ -341,10 +390,14 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si)
 				SWAPFILE_CLUSTER);
 
 		spin_lock(&si->lock);
-		cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE);
+		ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
+		cluster_set_flag(ci, CLUSTER_FLAG_FREE);
+		unlock_cluster(ci);
 		cluster_list_add_tail(&si->free_clusters, info, idx);
+		ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
 		memset(si->swap_map + idx * SWAPFILE_CLUSTER,
 				0, SWAPFILE_CLUSTER);
+		unlock_cluster(ci);
 	}
 }
 
@@ -447,8 +500,9 @@ static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
 	unsigned long *offset, unsigned long *scan_base)
 {
 	struct percpu_cluster *cluster;
+	struct swap_cluster_info *ci;
 	bool found_free;
-	unsigned long tmp;
+	unsigned long tmp, max;
 
 new_cluster:
 	cluster = this_cpu_ptr(si->percpu_cluster);
@@ -476,14 +530,21 @@ new_cluster:
 	 * check if there is still free entry in the cluster
 	 */
 	tmp = cluster->next;
-	while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) *
-	       SWAPFILE_CLUSTER) {
+	max = min_t(unsigned long, si->max,
+		    (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
+	if (tmp >= max) {
+		cluster_set_null(&cluster->index);
+		goto new_cluster;
+	}
+	ci = lock_cluster(si, tmp);
+	while (tmp < max) {
 		if (!si->swap_map[tmp]) {
 			found_free = true;
 			break;
 		}
 		tmp++;
 	}
+	unlock_cluster(ci);
 	if (!found_free) {
 		cluster_set_null(&cluster->index);
 		goto new_cluster;
@@ -496,6 +557,7 @@ new_cluster:
 static unsigned long scan_swap_map(struct swap_info_struct *si,
 				   unsigned char usage)
 {
+	struct swap_cluster_info *ci;
 	unsigned long offset;
 	unsigned long scan_base;
 	unsigned long last_in_cluster = 0;
@@ -572,9 +634,11 @@ checks:
 	if (offset > si->highest_bit)
 		scan_base = offset = si->lowest_bit;
 
+	ci = lock_cluster(si, offset);
 	/* reuse swap entry of cache-only swap if not busy. */
 	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
 		int swap_was_freed;
+		unlock_cluster(ci);
 		spin_unlock(&si->lock);
 		swap_was_freed = __try_to_reclaim_swap(si, offset);
 		spin_lock(&si->lock);
@@ -584,8 +648,10 @@ checks:
 		goto scan; /* check next one */
 	}
 
-	if (si->swap_map[offset])
+	if (si->swap_map[offset]) {
+		unlock_cluster(ci);
 		goto scan;
+	}
 
 	if (offset == si->lowest_bit)
 		si->lowest_bit++;
@@ -601,6 +667,7 @@ checks:
 	}
 	si->swap_map[offset] = usage;
 	inc_cluster_info_page(si, si->cluster_info, offset);
+	unlock_cluster(ci);
 	si->cluster_next = offset + 1;
 	si->flags -= SWP_SCANNING;
 
@@ -731,7 +798,7 @@ swp_entry_t get_swap_page_of_type(int type)
 	return (swp_entry_t) {0};
 }
 
-static struct swap_info_struct *swap_info_get(swp_entry_t entry)
+static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
 {
 	struct swap_info_struct *p;
 	unsigned long offset, type;
@@ -749,7 +816,6 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry)
 		goto bad_offset;
 	if (!p->swap_map[offset])
 		goto bad_free;
-	spin_lock(&p->lock);
 	return p;
 
 bad_free:
@@ -767,14 +833,45 @@ out:
 	return NULL;
 }
 
+static struct swap_info_struct *swap_info_get(swp_entry_t entry)
+{
+	struct swap_info_struct *p;
+
+	p = _swap_info_get(entry);
+	if (p)
+		spin_lock(&p->lock);
+	return p;
+}
+
 static unsigned char swap_entry_free(struct swap_info_struct *p,
-				     swp_entry_t entry, unsigned char usage)
+				     swp_entry_t entry, unsigned char usage,
+				     bool swap_info_locked)
 {
+	struct swap_cluster_info *ci;
 	unsigned long offset = swp_offset(entry);
 	unsigned char count;
 	unsigned char has_cache;
+	bool lock_swap_info = false;
+
+	if (!swap_info_locked) {
+		count = p->swap_map[offset];
+		if (!p->cluster_info || count == usage || count == SWAP_MAP_SHMEM) {
+lock_swap_info:
+			swap_info_locked = true;
+			lock_swap_info = true;
+			spin_lock(&p->lock);
+		}
+	}
+
+	ci = lock_cluster(p, offset);
 
 	count = p->swap_map[offset];
+
+	if (!swap_info_locked && (count == usage || count == SWAP_MAP_SHMEM)) {
+		unlock_cluster(ci);
+		goto lock_swap_info;
+	}
+
 	has_cache = count & SWAP_HAS_CACHE;
 	count &= ~SWAP_HAS_CACHE;
 
@@ -800,10 +897,15 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
 	usage = count | has_cache;
 	p->swap_map[offset] = usage;
 
+	unlock_cluster(ci);
+
 	/* free if no reference */
 	if (!usage) {
+		VM_BUG_ON(!swap_info_locked);
 		mem_cgroup_uncharge_swap(entry);
+		ci = lock_cluster(p, offset);
 		dec_cluster_info_page(p, p->cluster_info, offset);
+		unlock_cluster(ci);
 		if (offset < p->lowest_bit)
 			p->lowest_bit = offset;
 		if (offset > p->highest_bit) {
@@ -829,6 +931,9 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
 		}
 	}
 
+	if (lock_swap_info)
+		spin_unlock(&p->lock);
+
 	return usage;
 }
 
@@ -840,11 +945,9 @@ void swap_free(swp_entry_t entry)
 {
 	struct swap_info_struct *p;
 
-	p = swap_info_get(entry);
-	if (p) {
-		swap_entry_free(p, entry, 1);
-		spin_unlock(&p->lock);
-	}
+	p = _swap_info_get(entry);
+	if (p)
+		swap_entry_free(p, entry, 1, false);
 }
 
 /*
@@ -854,11 +957,9 @@ void swapcache_free(swp_entry_t entry)
 {
 	struct swap_info_struct *p;
 
-	p = swap_info_get(entry);
-	if (p) {
-		swap_entry_free(p, entry, SWAP_HAS_CACHE);
-		spin_unlock(&p->lock);
-	}
+	p = _swap_info_get(entry);
+	if (p)
+		swap_entry_free(p, entry, SWAP_HAS_CACHE, false);
 }
 
 /*
@@ -870,13 +971,17 @@ int page_swapcount(struct page *page)
 {
 	int count = 0;
 	struct swap_info_struct *p;
+	struct swap_cluster_info *ci;
 	swp_entry_t entry;
+	unsigned long offset;
 
 	entry.val = page_private(page);
-	p = swap_info_get(entry);
+	p = _swap_info_get(entry);
 	if (p) {
-		count = swap_count(p->swap_map[swp_offset(entry)]);
-		spin_unlock(&p->lock);
+		offset = swp_offset(entry);
+		ci = lock_cluster_or_swap_info(p, offset);
+		count = swap_count(p->swap_map[offset]);
+		unlock_cluster_or_swap_info(p, ci);
 	}
 	return count;
 }
@@ -889,22 +994,26 @@ int swp_swapcount(swp_entry_t entry)
 {
 	int count, tmp_count, n;
 	struct swap_info_struct *p;
+	struct swap_cluster_info *ci;
 	struct page *page;
 	pgoff_t offset;
 	unsigned char *map;
 
-	p = swap_info_get(entry);
+	p = _swap_info_get(entry);
 	if (!p)
 		return 0;
 
-	count = swap_count(p->swap_map[swp_offset(entry)]);
+	offset = swp_offset(entry);
+
+	ci = lock_cluster_or_swap_info(p, offset);
+
+	count = swap_count(p->swap_map[offset]);
 	if (!(count & COUNT_CONTINUED))
 		goto out;
 
 	count &= ~COUNT_CONTINUED;
 	n = SWAP_MAP_MAX + 1;
 
-	offset = swp_offset(entry);
 	page = vmalloc_to_page(p->swap_map + offset);
 	offset &= ~PAGE_MASK;
 	VM_BUG_ON(page_private(page) != SWP_CONTINUED);
@@ -919,7 +1028,7 @@ int swp_swapcount(swp_entry_t entry)
 		n *= (SWAP_CONT_MAX + 1);
 	} while (tmp_count & COUNT_CONTINUED);
 out:
-	spin_unlock(&p->lock);
+	unlock_cluster_or_swap_info(p, ci);
 	return count;
 }
 
@@ -1017,7 +1126,7 @@ int free_swap_and_cache(swp_entry_t entry)
 
 	p = swap_info_get(entry);
 	if (p) {
-		if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
+		if (swap_entry_free(p, entry, 1, true) == SWAP_HAS_CACHE) {
 			page = find_get_page(swap_address_space(entry),
 					     swp_offset(entry));
 			if (page && !trylock_page(page)) {
@@ -2298,6 +2407,9 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
 	return maxpages;
 }
 
+#define SWAP_CLUSTER_COLS						\
+	DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
+
 static int setup_swap_map_and_extents(struct swap_info_struct *p,
 					union swap_header *swap_header,
 					unsigned char *swap_map,
@@ -2305,11 +2417,12 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
 					unsigned long maxpages,
 					sector_t *span)
 {
-	int i;
+	unsigned int j, k;
 	unsigned int nr_good_pages;
 	int nr_extents;
 	unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
-	unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER;
+	unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
+	unsigned long i, idx;
 
 	nr_good_pages = maxpages - 1;	/* omit header page */
 
@@ -2357,15 +2470,20 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
 	if (!cluster_info)
 		return nr_extents;
 
-	for (i = 0; i < nr_clusters; i++) {
-		if (!cluster_count(&cluster_info[idx])) {
+
+	/* Reduce false cache line sharing between cluster_info */
+	for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
+		j = (k + col) % SWAP_CLUSTER_COLS;
+		for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
+			idx = i * SWAP_CLUSTER_COLS + j;
+			if (idx >= nr_clusters)
+				continue;
+			if (cluster_count(&cluster_info[idx]))
+				continue;
 			cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
 			cluster_list_add_tail(&p->free_clusters, cluster_info,
 					      idx);
 		}
-		idx++;
-		if (idx == nr_clusters)
-			idx = 0;
 	}
 	return nr_extents;
 }
@@ -2468,6 +2586,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 
 	if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
 		int cpu;
+		unsigned long ci, nr_cluster;
 
 		p->flags |= SWP_SOLIDSTATE;
 		/*
@@ -2475,13 +2594,17 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 		 * SSD
 		 */
 		p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
+		nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
 
-		cluster_info = vzalloc(DIV_ROUND_UP(maxpages,
-			SWAPFILE_CLUSTER) * sizeof(*cluster_info));
+		cluster_info = vzalloc(nr_cluster * sizeof(*cluster_info));
 		if (!cluster_info) {
 			error = -ENOMEM;
 			goto bad_swap;
 		}
+
+		for (ci = 0; ci < nr_cluster; ci++)
+			spin_lock_init(&((cluster_info + ci)->lock));
+
 		p->percpu_cluster = alloc_percpu(struct percpu_cluster);
 		if (!p->percpu_cluster) {
 			error = -ENOMEM;
@@ -2627,6 +2750,7 @@ void si_swapinfo(struct sysinfo *val)
 static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 {
 	struct swap_info_struct *p;
+	struct swap_cluster_info *ci;
 	unsigned long offset, type;
 	unsigned char count;
 	unsigned char has_cache;
@@ -2640,10 +2764,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 		goto bad_file;
 	p = swap_info[type];
 	offset = swp_offset(entry);
-
-	spin_lock(&p->lock);
 	if (unlikely(offset >= p->max))
-		goto unlock_out;
+		goto out;
+
+	ci = lock_cluster_or_swap_info(p, offset);
 
 	count = p->swap_map[offset];
 
@@ -2686,7 +2810,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 	p->swap_map[offset] = count | has_cache;
 
 unlock_out:
-	spin_unlock(&p->lock);
+	unlock_cluster_or_swap_info(p, ci);
 out:
 	return err;
 
@@ -2775,6 +2899,7 @@ EXPORT_SYMBOL_GPL(__page_file_index);
 int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 {
 	struct swap_info_struct *si;
+	struct swap_cluster_info *ci;
 	struct page *head;
 	struct page *page;
 	struct page *list_page;
@@ -2798,6 +2923,9 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 	}
 
 	offset = swp_offset(entry);
+
+	ci = lock_cluster(si, offset);
+
 	count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
 
 	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
@@ -2810,6 +2938,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 	}
 
 	if (!page) {
+		unlock_cluster(ci);
 		spin_unlock(&si->lock);
 		return -ENOMEM;
 	}
@@ -2858,6 +2987,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 	list_add_tail(&page->lru, &head->lru);
 	page = NULL;			/* now it's attached, don't free it */
 out:
+	unlock_cluster(ci);
 	spin_unlock(&si->lock);
 outer:
 	if (page)
@@ -2871,7 +3001,8 @@ outer:
  * into, carry if so, or else fail until a new continuation page is allocated;
  * when the original swap_map count is decremented from 0 with continuation,
  * borrow from the continuation and report whether it still holds more.
- * Called while __swap_duplicate() or swap_entry_free() holds swap_lock.
+ * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster
+ * lock.
  */
 static bool swap_count_continued(struct swap_info_struct *si,
 				 pgoff_t offset, unsigned char count)
-- 
cgit v1.2.3


From 4b3ef9daa4fc0bba742a79faecb17fdaaead083b Mon Sep 17 00:00:00 2001
From: "Huang, Ying" <ying.huang@intel.com>
Date: Wed, 22 Feb 2017 15:45:26 -0800
Subject: mm/swap: split swap cache into 64MB trunks

The patch is to improve the scalability of the swap out/in via using
fine grained locks for the swap cache.  In current kernel, one address
space will be used for each swap device.  And in the common
configuration, the number of the swap device is very small (one is
typical).  This causes the heavy lock contention on the radix tree of
the address space if multiple tasks swap out/in concurrently.

But in fact, there is no dependency between pages in the swap cache.  So
that, we can split the one shared address space for each swap device
into several address spaces to reduce the lock contention.  In the
patch, the shared address space is split into 64MB trunks.  64MB is
chosen to balance the memory space usage and effect of lock contention
reduction.

The size of struct address_space on x86_64 architecture is 408B, so with
the patch, 6528B more memory will be used for every 1GB swap space on
x86_64 architecture.

One address space is still shared for the swap entries in the same 64M
trunks.  To avoid lock contention for the first round of swap space
allocation, the order of the swap clusters in the initial free clusters
list is changed.  The swap space distance between the consecutive swap
clusters in the free cluster list is at least 64M.  After the first
round of allocation, the swap clusters are expected to be freed
randomly, so the lock contention should be reduced effectively.

Link: http://lkml.kernel.org/r/735bab895e64c930581ffb0a05b661e01da82bc5.1484082593.git.tim.c.chen@linux.intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net> escreveu:
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h | 11 +++++++--
 mm/swap.c            |  6 -----
 mm/swap_state.c      | 68 ++++++++++++++++++++++++++++++++++++++++++----------
 mm/swapfile.c        | 16 +++++++++++--
 4 files changed, 79 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 279c7b84bd63..648a32b58e3e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -343,8 +343,13 @@ int generic_swapfile_activate(struct swap_info_struct *, struct file *,
 		sector_t *);
 
 /* linux/mm/swap_state.c */
-extern struct address_space swapper_spaces[];
-#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)])
+/* One swap address space for each 64M swap space */
+#define SWAP_ADDRESS_SPACE_SHIFT	14
+#define SWAP_ADDRESS_SPACE_PAGES	(1 << SWAP_ADDRESS_SPACE_SHIFT)
+extern struct address_space *swapper_spaces[];
+#define swap_address_space(entry)			    \
+	(&swapper_spaces[swp_type(entry)][swp_offset(entry) \
+		>> SWAP_ADDRESS_SPACE_SHIFT])
 extern unsigned long total_swapcache_pages(void);
 extern void show_swap_cache_info(void);
 extern int add_to_swap(struct page *, struct list_head *list);
@@ -398,6 +403,8 @@ extern struct swap_info_struct *page_swap_info(struct page *);
 extern bool reuse_swap_page(struct page *, int *);
 extern int try_to_free_swap(struct page *);
 struct backing_dev_info;
+extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
+extern void exit_swap_address_space(unsigned int type);
 
 #else /* CONFIG_SWAP */
 
diff --git a/mm/swap.c b/mm/swap.c
index 844baedd2429..aabf2e90fe32 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -971,12 +971,6 @@ EXPORT_SYMBOL(pagevec_lookup_tag);
 void __init swap_setup(void)
 {
 	unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
-#ifdef CONFIG_SWAP
-	int i;
-
-	for (i = 0; i < MAX_SWAPFILES; i++)
-		spin_lock_init(&swapper_spaces[i].tree_lock);
-#endif
 
 	/* Use a smaller cluster for small-memory machines */
 	if (megs < 16)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 35d7e0ee1c77..3863acd6189c 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -17,6 +17,7 @@
 #include <linux/blkdev.h>
 #include <linux/pagevec.h>
 #include <linux/migrate.h>
+#include <linux/vmalloc.h>
 
 #include <asm/pgtable.h>
 
@@ -32,15 +33,8 @@ static const struct address_space_operations swap_aops = {
 #endif
 };
 
-struct address_space swapper_spaces[MAX_SWAPFILES] = {
-	[0 ... MAX_SWAPFILES - 1] = {
-		.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
-		.i_mmap_writable = ATOMIC_INIT(0),
-		.a_ops		= &swap_aops,
-		/* swap cache doesn't use writeback related tags */
-		.flags		= 1 << AS_NO_WRITEBACK_TAGS,
-	}
-};
+struct address_space *swapper_spaces[MAX_SWAPFILES];
+static unsigned int nr_swapper_spaces[MAX_SWAPFILES];
 
 #define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)
 
@@ -53,11 +47,26 @@ static struct {
 
 unsigned long total_swapcache_pages(void)
 {
-	int i;
+	unsigned int i, j, nr;
 	unsigned long ret = 0;
+	struct address_space *spaces;
 
-	for (i = 0; i < MAX_SWAPFILES; i++)
-		ret += swapper_spaces[i].nrpages;
+	rcu_read_lock();
+	for (i = 0; i < MAX_SWAPFILES; i++) {
+		/*
+		 * The corresponding entries in nr_swapper_spaces and
+		 * swapper_spaces will be reused only after at least
+		 * one grace period.  So it is impossible for them
+		 * belongs to different usage.
+		 */
+		nr = nr_swapper_spaces[i];
+		spaces = rcu_dereference(swapper_spaces[i]);
+		if (!nr || !spaces)
+			continue;
+		for (j = 0; j < nr; j++)
+			ret += spaces[j].nrpages;
+	}
+	rcu_read_unlock();
 	return ret;
 }
 
@@ -505,3 +514,38 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
 skip:
 	return read_swap_cache_async(entry, gfp_mask, vma, addr);
 }
+
+int init_swap_address_space(unsigned int type, unsigned long nr_pages)
+{
+	struct address_space *spaces, *space;
+	unsigned int i, nr;
+
+	nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
+	spaces = vzalloc(sizeof(struct address_space) * nr);
+	if (!spaces)
+		return -ENOMEM;
+	for (i = 0; i < nr; i++) {
+		space = spaces + i;
+		INIT_RADIX_TREE(&space->page_tree, GFP_ATOMIC|__GFP_NOWARN);
+		atomic_set(&space->i_mmap_writable, 0);
+		space->a_ops = &swap_aops;
+		/* swap cache doesn't use writeback related tags */
+		mapping_set_no_writeback_tags(space);
+		spin_lock_init(&space->tree_lock);
+	}
+	nr_swapper_spaces[type] = nr;
+	rcu_assign_pointer(swapper_spaces[type], spaces);
+
+	return 0;
+}
+
+void exit_swap_address_space(unsigned int type)
+{
+	struct address_space *spaces;
+
+	spaces = swapper_spaces[type];
+	nr_swapper_spaces[type] = 0;
+	rcu_assign_pointer(swapper_spaces[type], NULL);
+	synchronize_rcu();
+	kvfree(spaces);
+}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index eb71b5d9430b..66e95eb73040 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2084,6 +2084,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	vfree(frontswap_map);
 	/* Destroy swap account information */
 	swap_cgroup_swapoff(p->type);
+	exit_swap_address_space(p->type);
 
 	inode = mapping->host;
 	if (S_ISBLK(inode->i_mode)) {
@@ -2407,8 +2408,12 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
 	return maxpages;
 }
 
-#define SWAP_CLUSTER_COLS						\
+#define SWAP_CLUSTER_INFO_COLS						\
 	DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
+#define SWAP_CLUSTER_SPACE_COLS						\
+	DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
+#define SWAP_CLUSTER_COLS						\
+	max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
 
 static int setup_swap_map_and_extents(struct swap_info_struct *p,
 					union swap_header *swap_header,
@@ -2471,7 +2476,10 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
 		return nr_extents;
 
 
-	/* Reduce false cache line sharing between cluster_info */
+	/*
+	 * Reduce false cache line sharing between cluster_info and
+	 * sharing same address space.
+	 */
 	for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
 		j = (k + col) % SWAP_CLUSTER_COLS;
 		for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
@@ -2661,6 +2669,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 		}
 	}
 
+	error = init_swap_address_space(p->type, maxpages);
+	if (error)
+		goto bad_swap;
+
 	mutex_lock(&swapon_mutex);
 	prio = -1;
 	if (swap_flags & SWAP_FLAG_PREFER)
-- 
cgit v1.2.3


From e8c26ab60598558ec3a626e7925b06e7417d7710 Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.chen@linux.intel.com>
Date: Wed, 22 Feb 2017 15:45:29 -0800
Subject: mm/swap: skip readahead for unreferenced swap slots

We can avoid needlessly allocating page for swap slots that are not used
by anyone.  No pages have to be read in for these slots.

Link: http://lkml.kernel.org/r/0784b3f20b9bd3aa5552219624cb78dc4ae710c9.1484082593.git.tim.c.chen@linux.intel.com
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net> escreveu:
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h |  6 ++++++
 mm/swap_state.c      |  4 ++++
 mm/swapfile.c        | 47 +++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 51 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 648a32b58e3e..3116382067cd 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -398,6 +398,7 @@ extern unsigned int count_swap_pages(int, int);
 extern sector_t map_swap_page(struct page *, struct block_device **);
 extern sector_t swapdev_block(int, pgoff_t);
 extern int page_swapcount(struct page *);
+extern int __swp_swapcount(swp_entry_t entry);
 extern int swp_swapcount(swp_entry_t entry);
 extern struct swap_info_struct *page_swap_info(struct page *);
 extern bool reuse_swap_page(struct page *, int *);
@@ -492,6 +493,11 @@ static inline int page_swapcount(struct page *page)
 	return 0;
 }
 
+static inline int __swp_swapcount(swp_entry_t entry)
+{
+	return 0;
+}
+
 static inline int swp_swapcount(swp_entry_t entry)
 {
 	return 0;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3863acd6189c..3d76d80c07d6 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -323,6 +323,10 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		if (found_page)
 			break;
 
+		/* Just skip read ahead for unused swap slot */
+		if (!__swp_swapcount(entry))
+			return NULL;
+
 		/*
 		 * Get a new page to read into from swap.
 		 */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 66e95eb73040..7e888de35c41 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -798,7 +798,7 @@ swp_entry_t get_swap_page_of_type(int type)
 	return (swp_entry_t) {0};
 }
 
-static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
+static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
 {
 	struct swap_info_struct *p;
 	unsigned long offset, type;
@@ -814,13 +814,8 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
 	offset = swp_offset(entry);
 	if (offset >= p->max)
 		goto bad_offset;
-	if (!p->swap_map[offset])
-		goto bad_free;
 	return p;
 
-bad_free:
-	pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
-	goto out;
 bad_offset:
 	pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val);
 	goto out;
@@ -833,6 +828,24 @@ out:
 	return NULL;
 }
 
+static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
+{
+	struct swap_info_struct *p;
+
+	p = __swap_info_get(entry);
+	if (!p)
+		goto out;
+	if (!p->swap_map[swp_offset(entry)])
+		goto bad_free;
+	return p;
+
+bad_free:
+	pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
+	goto out;
+out:
+	return NULL;
+}
+
 static struct swap_info_struct *swap_info_get(swp_entry_t entry)
 {
 	struct swap_info_struct *p;
@@ -986,6 +999,28 @@ int page_swapcount(struct page *page)
 	return count;
 }
 
+/*
+ * How many references to @entry are currently swapped out?
+ * This does not give an exact answer when swap count is continued,
+ * but does include the high COUNT_CONTINUED flag to allow for that.
+ */
+int __swp_swapcount(swp_entry_t entry)
+{
+	int count = 0;
+	pgoff_t offset;
+	struct swap_info_struct *si;
+	struct swap_cluster_info *ci;
+
+	si = __swap_info_get(entry);
+	if (si) {
+		offset = swp_offset(entry);
+		ci = lock_cluster_or_swap_info(si, offset);
+		count = swap_count(si->swap_map[offset]);
+		unlock_cluster_or_swap_info(si, ci);
+	}
+	return count;
+}
+
 /*
  * How many references to @entry are currently swapped out?
  * This considers COUNT_CONTINUED so it returns exact answer.
-- 
cgit v1.2.3


From 36005bae205da3eef0016a5c96a34f10a68afa1e Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.chen@linux.intel.com>
Date: Wed, 22 Feb 2017 15:45:33 -0800
Subject: mm/swap: allocate swap slots in batches

Currently, the swap slots are allocated one page at a time, causing
contention to the swap_info lock protecting the swap partition on every
page being swapped.

This patch adds new functions get_swap_pages and scan_swap_map_slots to
request multiple swap slots at once.  This will reduces the lock
contention on the swap_info lock.  Also scan_swap_map_slots can operate
more efficiently as swap slots often occurs in clusters close to each
other on a swap device and it is quicker to allocate them together.

Link: http://lkml.kernel.org/r/9fec2845544371f62c3763d43510045e33d286a6.1484082593.git.tim.c.chen@linux.intel.com
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net> escreveu:
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h |   2 +
 mm/swapfile.c        | 136 +++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 113 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3116382067cd..956eae8a8edf 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -27,6 +27,7 @@ struct bio;
 #define SWAP_FLAGS_VALID	(SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \
 				 SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE | \
 				 SWAP_FLAG_DISCARD_PAGES)
+#define SWAP_BATCH 64
 
 static inline int current_is_kswapd(void)
 {
@@ -386,6 +387,7 @@ static inline long get_nr_swap_pages(void)
 extern void si_swapinfo(struct sysinfo *);
 extern swp_entry_t get_swap_page(void);
 extern swp_entry_t get_swap_page_of_type(int);
+extern int get_swap_pages(int n, swp_entry_t swp_entries[]);
 extern int add_swap_count_continuation(swp_entry_t, gfp_t);
 extern void swap_shmem_alloc(swp_entry_t);
 extern int swap_duplicate(swp_entry_t);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 7e888de35c41..e73b5441055b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -496,7 +496,7 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
  * Try to get a swap entry from current cpu's swap entry pool (a cluster). This
  * might involve allocating a new cluster for current CPU too.
  */
-static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
+static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
 	unsigned long *offset, unsigned long *scan_base)
 {
 	struct percpu_cluster *cluster;
@@ -520,7 +520,7 @@ new_cluster:
 			*scan_base = *offset = si->cluster_next;
 			goto new_cluster;
 		} else
-			return;
+			return false;
 	}
 
 	found_free = false;
@@ -552,16 +552,22 @@ new_cluster:
 	cluster->next = tmp + 1;
 	*offset = tmp;
 	*scan_base = tmp;
+	return found_free;
 }
 
-static unsigned long scan_swap_map(struct swap_info_struct *si,
-				   unsigned char usage)
+static int scan_swap_map_slots(struct swap_info_struct *si,
+			       unsigned char usage, int nr,
+			       swp_entry_t slots[])
 {
 	struct swap_cluster_info *ci;
 	unsigned long offset;
 	unsigned long scan_base;
 	unsigned long last_in_cluster = 0;
 	int latency_ration = LATENCY_LIMIT;
+	int n_ret = 0;
+
+	if (nr > SWAP_BATCH)
+		nr = SWAP_BATCH;
 
 	/*
 	 * We try to cluster swap pages by allocating them sequentially
@@ -579,8 +585,10 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
 
 	/* SSD algorithm */
 	if (si->cluster_info) {
-		scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
-		goto checks;
+		if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
+			goto checks;
+		else
+			goto scan;
 	}
 
 	if (unlikely(!si->cluster_nr--)) {
@@ -624,8 +632,14 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
 
 checks:
 	if (si->cluster_info) {
-		while (scan_swap_map_ssd_cluster_conflict(si, offset))
-			scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
+		while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
+		/* take a break if we already got some slots */
+			if (n_ret)
+				goto done;
+			if (!scan_swap_map_try_ssd_cluster(si, &offset,
+							&scan_base))
+				goto scan;
+		}
 	}
 	if (!(si->flags & SWP_WRITEOK))
 		goto no_page;
@@ -650,7 +664,10 @@ checks:
 
 	if (si->swap_map[offset]) {
 		unlock_cluster(ci);
-		goto scan;
+		if (!n_ret)
+			goto scan;
+		else
+			goto done;
 	}
 
 	if (offset == si->lowest_bit)
@@ -669,9 +686,43 @@ checks:
 	inc_cluster_info_page(si, si->cluster_info, offset);
 	unlock_cluster(ci);
 	si->cluster_next = offset + 1;
-	si->flags -= SWP_SCANNING;
+	slots[n_ret++] = swp_entry(si->type, offset);
+
+	/* got enough slots or reach max slots? */
+	if ((n_ret == nr) || (offset >= si->highest_bit))
+		goto done;
+
+	/* search for next available slot */
+
+	/* time to take a break? */
+	if (unlikely(--latency_ration < 0)) {
+		if (n_ret)
+			goto done;
+		spin_unlock(&si->lock);
+		cond_resched();
+		spin_lock(&si->lock);
+		latency_ration = LATENCY_LIMIT;
+	}
 
-	return offset;
+	/* try to get more slots in cluster */
+	if (si->cluster_info) {
+		if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
+			goto checks;
+		else
+			goto done;
+	}
+	/* non-ssd case */
+	++offset;
+
+	/* non-ssd case, still more slots in cluster? */
+	if (si->cluster_nr && !si->swap_map[offset]) {
+		--si->cluster_nr;
+		goto checks;
+	}
+
+done:
+	si->flags -= SWP_SCANNING;
+	return n_ret;
 
 scan:
 	spin_unlock(&si->lock);
@@ -709,17 +760,41 @@ scan:
 
 no_page:
 	si->flags -= SWP_SCANNING;
-	return 0;
+	return n_ret;
 }
 
-swp_entry_t get_swap_page(void)
+static unsigned long scan_swap_map(struct swap_info_struct *si,
+				   unsigned char usage)
+{
+	swp_entry_t entry;
+	int n_ret;
+
+	n_ret = scan_swap_map_slots(si, usage, 1, &entry);
+
+	if (n_ret)
+		return swp_offset(entry);
+	else
+		return 0;
+
+}
+
+int get_swap_pages(int n_goal, swp_entry_t swp_entries[])
 {
 	struct swap_info_struct *si, *next;
-	pgoff_t offset;
+	long avail_pgs;
+	int n_ret = 0;
 
-	if (atomic_long_read(&nr_swap_pages) <= 0)
+	avail_pgs = atomic_long_read(&nr_swap_pages);
+	if (avail_pgs <= 0)
 		goto noswap;
-	atomic_long_dec(&nr_swap_pages);
+
+	if (n_goal > SWAP_BATCH)
+		n_goal = SWAP_BATCH;
+
+	if (n_goal > avail_pgs)
+		n_goal = avail_pgs;
+
+	atomic_long_sub(n_goal, &nr_swap_pages);
 
 	spin_lock(&swap_avail_lock);
 
@@ -745,14 +820,14 @@ start_over:
 			spin_unlock(&si->lock);
 			goto nextsi;
 		}
-
-		/* This is called for allocating swap entry for cache */
-		offset = scan_swap_map(si, SWAP_HAS_CACHE);
+		n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
+					    n_goal, swp_entries);
 		spin_unlock(&si->lock);
-		if (offset)
-			return swp_entry(si->type, offset);
+		if (n_ret)
+			goto check_out;
 		pr_debug("scan_swap_map of si %d failed to find offset\n",
-		       si->type);
+			si->type);
+
 		spin_lock(&swap_avail_lock);
 nextsi:
 		/*
@@ -763,7 +838,8 @@ nextsi:
 		 * up between us dropping swap_avail_lock and taking si->lock.
 		 * Since we dropped the swap_avail_lock, the swap_avail_head
 		 * list may have been modified; so if next is still in the
-		 * swap_avail_head list then try it, otherwise start over.
+		 * swap_avail_head list then try it, otherwise start over
+		 * if we have not gotten any slots.
 		 */
 		if (plist_node_empty(&next->avail_list))
 			goto start_over;
@@ -771,9 +847,19 @@ nextsi:
 
 	spin_unlock(&swap_avail_lock);
 
-	atomic_long_inc(&nr_swap_pages);
+check_out:
+	if (n_ret < n_goal)
+		atomic_long_add((long) (n_goal-n_ret), &nr_swap_pages);
 noswap:
-	return (swp_entry_t) {0};
+	return n_ret;
+}
+
+swp_entry_t get_swap_page(void)
+{
+	swp_entry_t entry;
+
+	get_swap_pages(1, &entry);
+	return entry;
 }
 
 /* The only caller of this function is now suspend routine */
-- 
cgit v1.2.3


From 7c00bafee87c7bac7ed9eced7c161f8e5332cb4e Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.chen@linux.intel.com>
Date: Wed, 22 Feb 2017 15:45:36 -0800
Subject: mm/swap: free swap slots in batch

Add new functions that free unused swap slots in batches without the
need to reacquire swap info lock.  This improves scalability and reduce
lock contention.

Link: http://lkml.kernel.org/r/c25e0fcdfd237ec4ca7db91631d3b9f6ed23824e.1484082593.git.tim.c.chen@linux.intel.com
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net> escreveu:
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h |   1 +
 mm/swapfile.c        | 155 +++++++++++++++++++++++++++++++--------------------
 2 files changed, 95 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 956eae8a8edf..bcc0b18f96d2 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -394,6 +394,7 @@ extern int swap_duplicate(swp_entry_t);
 extern int swapcache_prepare(swp_entry_t);
 extern void swap_free(swp_entry_t);
 extern void swapcache_free(swp_entry_t);
+extern void swapcache_free_entries(swp_entry_t *entries, int n);
 extern int free_swap_and_cache(swp_entry_t);
 extern int swap_type_of(dev_t, sector_t, struct block_device **);
 extern unsigned int count_swap_pages(int, int);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e73b5441055b..8b5bd34b1a00 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -942,35 +942,34 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry)
 	return p;
 }
 
-static unsigned char swap_entry_free(struct swap_info_struct *p,
-				     swp_entry_t entry, unsigned char usage,
-				     bool swap_info_locked)
+static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
+					struct swap_info_struct *q)
+{
+	struct swap_info_struct *p;
+
+	p = _swap_info_get(entry);
+
+	if (p != q) {
+		if (q != NULL)
+			spin_unlock(&q->lock);
+		if (p != NULL)
+			spin_lock(&p->lock);
+	}
+	return p;
+}
+
+static unsigned char __swap_entry_free(struct swap_info_struct *p,
+				       swp_entry_t entry, unsigned char usage)
 {
 	struct swap_cluster_info *ci;
 	unsigned long offset = swp_offset(entry);
 	unsigned char count;
 	unsigned char has_cache;
-	bool lock_swap_info = false;
-
-	if (!swap_info_locked) {
-		count = p->swap_map[offset];
-		if (!p->cluster_info || count == usage || count == SWAP_MAP_SHMEM) {
-lock_swap_info:
-			swap_info_locked = true;
-			lock_swap_info = true;
-			spin_lock(&p->lock);
-		}
-	}
 
-	ci = lock_cluster(p, offset);
+	ci = lock_cluster_or_swap_info(p, offset);
 
 	count = p->swap_map[offset];
 
-	if (!swap_info_locked && (count == usage || count == SWAP_MAP_SHMEM)) {
-		unlock_cluster(ci);
-		goto lock_swap_info;
-	}
-
 	has_cache = count & SWAP_HAS_CACHE;
 	count &= ~SWAP_HAS_CACHE;
 
@@ -994,46 +993,52 @@ lock_swap_info:
 	}
 
 	usage = count | has_cache;
-	p->swap_map[offset] = usage;
+	p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
+
+	unlock_cluster_or_swap_info(p, ci);
+
+	return usage;
+}
 
+static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
+{
+	struct swap_cluster_info *ci;
+	unsigned long offset = swp_offset(entry);
+	unsigned char count;
+
+	ci = lock_cluster(p, offset);
+	count = p->swap_map[offset];
+	VM_BUG_ON(count != SWAP_HAS_CACHE);
+	p->swap_map[offset] = 0;
+	dec_cluster_info_page(p, p->cluster_info, offset);
 	unlock_cluster(ci);
 
-	/* free if no reference */
-	if (!usage) {
-		VM_BUG_ON(!swap_info_locked);
-		mem_cgroup_uncharge_swap(entry);
-		ci = lock_cluster(p, offset);
-		dec_cluster_info_page(p, p->cluster_info, offset);
-		unlock_cluster(ci);
-		if (offset < p->lowest_bit)
-			p->lowest_bit = offset;
-		if (offset > p->highest_bit) {
-			bool was_full = !p->highest_bit;
-			p->highest_bit = offset;
-			if (was_full && (p->flags & SWP_WRITEOK)) {
-				spin_lock(&swap_avail_lock);
-				WARN_ON(!plist_node_empty(&p->avail_list));
-				if (plist_node_empty(&p->avail_list))
-					plist_add(&p->avail_list,
-						  &swap_avail_head);
-				spin_unlock(&swap_avail_lock);
-			}
-		}
-		atomic_long_inc(&nr_swap_pages);
-		p->inuse_pages--;
-		frontswap_invalidate_page(p->type, offset);
-		if (p->flags & SWP_BLKDEV) {
-			struct gendisk *disk = p->bdev->bd_disk;
-			if (disk->fops->swap_slot_free_notify)
-				disk->fops->swap_slot_free_notify(p->bdev,
-								  offset);
+	mem_cgroup_uncharge_swap(entry);
+	if (offset < p->lowest_bit)
+		p->lowest_bit = offset;
+	if (offset > p->highest_bit) {
+		bool was_full = !p->highest_bit;
+
+		p->highest_bit = offset;
+		if (was_full && (p->flags & SWP_WRITEOK)) {
+			spin_lock(&swap_avail_lock);
+			WARN_ON(!plist_node_empty(&p->avail_list));
+			if (plist_node_empty(&p->avail_list))
+				plist_add(&p->avail_list,
+					  &swap_avail_head);
+			spin_unlock(&swap_avail_lock);
 		}
 	}
+	atomic_long_inc(&nr_swap_pages);
+	p->inuse_pages--;
+	frontswap_invalidate_page(p->type, offset);
+	if (p->flags & SWP_BLKDEV) {
+		struct gendisk *disk = p->bdev->bd_disk;
 
-	if (lock_swap_info)
-		spin_unlock(&p->lock);
-
-	return usage;
+		if (disk->fops->swap_slot_free_notify)
+			disk->fops->swap_slot_free_notify(p->bdev,
+							  offset);
+	}
 }
 
 /*
@@ -1045,8 +1050,10 @@ void swap_free(swp_entry_t entry)
 	struct swap_info_struct *p;
 
 	p = _swap_info_get(entry);
-	if (p)
-		swap_entry_free(p, entry, 1, false);
+	if (p) {
+		if (!__swap_entry_free(p, entry, 1))
+			swapcache_free_entries(&entry, 1);
+	}
 }
 
 /*
@@ -1057,8 +1064,32 @@ void swapcache_free(swp_entry_t entry)
 	struct swap_info_struct *p;
 
 	p = _swap_info_get(entry);
+	if (p) {
+		if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE))
+			swapcache_free_entries(&entry, 1);
+	}
+}
+
+void swapcache_free_entries(swp_entry_t *entries, int n)
+{
+	struct swap_info_struct *p, *prev;
+	int i;
+
+	if (n <= 0)
+		return;
+
+	prev = NULL;
+	p = NULL;
+	for (i = 0; i < n; ++i) {
+		p = swap_info_get_cont(entries[i], prev);
+		if (p)
+			swap_entry_free(p, entries[i]);
+		else
+			break;
+		prev = p;
+	}
 	if (p)
-		swap_entry_free(p, entry, SWAP_HAS_CACHE, false);
+		spin_unlock(&p->lock);
 }
 
 /*
@@ -1241,21 +1272,23 @@ int free_swap_and_cache(swp_entry_t entry)
 {
 	struct swap_info_struct *p;
 	struct page *page = NULL;
+	unsigned char count;
 
 	if (non_swap_entry(entry))
 		return 1;
 
-	p = swap_info_get(entry);
+	p = _swap_info_get(entry);
 	if (p) {
-		if (swap_entry_free(p, entry, 1, true) == SWAP_HAS_CACHE) {
+		count = __swap_entry_free(p, entry, 1);
+		if (count == SWAP_HAS_CACHE) {
 			page = find_get_page(swap_address_space(entry),
 					     swp_offset(entry));
 			if (page && !trylock_page(page)) {
 				put_page(page);
 				page = NULL;
 			}
-		}
-		spin_unlock(&p->lock);
+		} else if (!count)
+			swapcache_free_entries(&entry, 1);
 	}
 	if (page) {
 		/*
-- 
cgit v1.2.3


From 67afa38e012e9581b9b42f2a41dfc56b1280794d Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.chen@linux.intel.com>
Date: Wed, 22 Feb 2017 15:45:39 -0800
Subject: mm/swap: add cache for swap slots allocation

We add per cpu caches for swap slots that can be allocated and freed
quickly without the need to touch the swap info lock.

Two separate caches are maintained for swap slots allocated and swap
slots returned.  This is to allow the swap slots to be returned to the
global pool in a batch so they will have a chance to be coaelesced with
other slots in a cluster.  We do not reuse the slots that are returned
right away, as it may increase fragmentation of the slots.

The swap allocation cache is protected by a mutex as we may sleep when
searching for empty slots in cache.  The swap free cache is protected by
a spin lock as we cannot sleep in the free path.

We refill the swap slots cache when we run out of slots, and we disable
the swap slots cache and drain the slots if the global number of slots
fall below a low watermark threshold.  We re-enable the cache agian when
the slots available are above a high watermark.

[ying.huang@intel.com: use raw_cpu_ptr over this_cpu_ptr for swap slots access]
[tim.c.chen@linux.intel.com: add comments on locks in swap_slots.h]
  Link: http://lkml.kernel.org/r/20170118180327.GA24225@linux.intel.com
Link: http://lkml.kernel.org/r/35de301a4eaa8daa2977de6e987f2c154385eb66.1484082593.git.tim.c.chen@linux.intel.com
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Reviewed-by: Michal Hocko <mhocko@suse.com>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net> escreveu:
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h       |   4 +
 include/linux/swap_slots.h |  28 ++++
 mm/Makefile                |   2 +-
 mm/swap_slots.c            | 342 +++++++++++++++++++++++++++++++++++++++++++++
 mm/swap_state.c            |   1 +
 mm/swapfile.c              |  26 ++--
 6 files changed, 391 insertions(+), 12 deletions(-)
 create mode 100644 include/linux/swap_slots.h
 create mode 100644 mm/swap_slots.c

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index bcc0b18f96d2..45e91dd6716d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -372,6 +372,7 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t,
 /* linux/mm/swapfile.c */
 extern atomic_long_t nr_swap_pages;
 extern long total_swap_pages;
+extern bool has_usable_swap(void);
 
 /* Swap 50% full? Release swapcache more aggressively.. */
 static inline bool vm_swap_full(void)
@@ -410,6 +411,9 @@ struct backing_dev_info;
 extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
 extern void exit_swap_address_space(unsigned int type);
 
+extern int get_swap_slots(int n, swp_entry_t *slots);
+extern void swapcache_free_batch(swp_entry_t *entries, int n);
+
 #else /* CONFIG_SWAP */
 
 #define swap_address_space(entry)		(NULL)
diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h
new file mode 100644
index 000000000000..ba5623b27c60
--- /dev/null
+++ b/include/linux/swap_slots.h
@@ -0,0 +1,28 @@
+#ifndef _LINUX_SWAP_SLOTS_H
+#define _LINUX_SWAP_SLOTS_H
+
+#include <linux/swap.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+
+#define SWAP_SLOTS_CACHE_SIZE			SWAP_BATCH
+#define THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE	(5*SWAP_SLOTS_CACHE_SIZE)
+#define THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE	(2*SWAP_SLOTS_CACHE_SIZE)
+
+struct swap_slots_cache {
+	bool		lock_initialized;
+	struct mutex	alloc_lock; /* protects slots, nr, cur */
+	swp_entry_t	*slots;
+	int		nr;
+	int		cur;
+	spinlock_t	free_lock;  /* protects slots_ret, n_ret */
+	swp_entry_t	*slots_ret;
+	int		n_ret;
+};
+
+void disable_swap_slots_cache_lock(void);
+void reenable_swap_slots_cache_unlock(void);
+int enable_swap_slots_cache(void);
+int free_swap_slot(swp_entry_t entry);
+
+#endif /* _LINUX_SWAP_SLOTS_H */
diff --git a/mm/Makefile b/mm/Makefile
index 295bd7a9f76b..433eaf9a876e 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -35,7 +35,7 @@ obj-y			:= filemap.o mempool.o oom_kill.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   util.o mmzone.o vmstat.o backing-dev.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
-			   compaction.o vmacache.o \
+			   compaction.o vmacache.o swap_slots.o \
 			   interval_tree.o list_lru.o workingset.o \
 			   debug.o $(mmu-y)
 
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
new file mode 100644
index 000000000000..ebf4f1cbac04
--- /dev/null
+++ b/mm/swap_slots.c
@@ -0,0 +1,342 @@
+/*
+ * Manage cache of swap slots to be used for and returned from
+ * swap.
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * We allocate the swap slots from the global pool and put
+ * it into local per cpu caches.  This has the advantage
+ * of no needing to acquire the swap_info lock every time
+ * we need a new slot.
+ *
+ * There is also opportunity to simply return the slot
+ * to local caches without needing to acquire swap_info
+ * lock.  We do not reuse the returned slots directly but
+ * move them back to the global pool in a batch.  This
+ * allows the slots to coaellesce and reduce fragmentation.
+ *
+ * The swap entry allocated is marked with SWAP_HAS_CACHE
+ * flag in map_count that prevents it from being allocated
+ * again from the global pool.
+ *
+ * The swap slots cache is protected by a mutex instead of
+ * a spin lock as when we search for slots with scan_swap_map,
+ * we can possibly sleep.
+ */
+
+#include <linux/swap_slots.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/vmalloc.h>
+#include <linux/mutex.h>
+
+#ifdef CONFIG_SWAP
+
+static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots);
+static bool	swap_slot_cache_active;
+static bool	swap_slot_cache_enabled;
+static bool	swap_slot_cache_initialized;
+DEFINE_MUTEX(swap_slots_cache_mutex);
+/* Serialize swap slots cache enable/disable operations */
+DEFINE_MUTEX(swap_slots_cache_enable_mutex);
+
+static void __drain_swap_slots_cache(unsigned int type);
+static void deactivate_swap_slots_cache(void);
+static void reactivate_swap_slots_cache(void);
+
+#define use_swap_slot_cache (swap_slot_cache_active && \
+		swap_slot_cache_enabled && swap_slot_cache_initialized)
+#define SLOTS_CACHE 0x1
+#define SLOTS_CACHE_RET 0x2
+
+static void deactivate_swap_slots_cache(void)
+{
+	mutex_lock(&swap_slots_cache_mutex);
+	swap_slot_cache_active = false;
+	__drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET);
+	mutex_unlock(&swap_slots_cache_mutex);
+}
+
+static void reactivate_swap_slots_cache(void)
+{
+	mutex_lock(&swap_slots_cache_mutex);
+	swap_slot_cache_active = true;
+	mutex_unlock(&swap_slots_cache_mutex);
+}
+
+/* Must not be called with cpu hot plug lock */
+void disable_swap_slots_cache_lock(void)
+{
+	mutex_lock(&swap_slots_cache_enable_mutex);
+	swap_slot_cache_enabled = false;
+	if (swap_slot_cache_initialized) {
+		/* serialize with cpu hotplug operations */
+		get_online_cpus();
+		__drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET);
+		put_online_cpus();
+	}
+}
+
+static void __reenable_swap_slots_cache(void)
+{
+	swap_slot_cache_enabled = has_usable_swap();
+}
+
+void reenable_swap_slots_cache_unlock(void)
+{
+	__reenable_swap_slots_cache();
+	mutex_unlock(&swap_slots_cache_enable_mutex);
+}
+
+static bool check_cache_active(void)
+{
+	long pages;
+
+	if (!swap_slot_cache_enabled || !swap_slot_cache_initialized)
+		return false;
+
+	pages = get_nr_swap_pages();
+	if (!swap_slot_cache_active) {
+		if (pages > num_online_cpus() *
+		    THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE)
+			reactivate_swap_slots_cache();
+		goto out;
+	}
+
+	/* if global pool of slot caches too low, deactivate cache */
+	if (pages < num_online_cpus() * THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE)
+		deactivate_swap_slots_cache();
+out:
+	return swap_slot_cache_active;
+}
+
+static int alloc_swap_slot_cache(unsigned int cpu)
+{
+	struct swap_slots_cache *cache;
+	swp_entry_t *slots, *slots_ret;
+
+	/*
+	 * Do allocation outside swap_slots_cache_mutex
+	 * as vzalloc could trigger reclaim and get_swap_page,
+	 * which can lock swap_slots_cache_mutex.
+	 */
+	slots = vzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE);
+	if (!slots)
+		return -ENOMEM;
+
+	slots_ret = vzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE);
+	if (!slots_ret) {
+		vfree(slots);
+		return -ENOMEM;
+	}
+
+	mutex_lock(&swap_slots_cache_mutex);
+	cache = &per_cpu(swp_slots, cpu);
+	if (cache->slots || cache->slots_ret)
+		/* cache already allocated */
+		goto out;
+	if (!cache->lock_initialized) {
+		mutex_init(&cache->alloc_lock);
+		spin_lock_init(&cache->free_lock);
+		cache->lock_initialized = true;
+	}
+	cache->nr = 0;
+	cache->cur = 0;
+	cache->n_ret = 0;
+	cache->slots = slots;
+	slots = NULL;
+	cache->slots_ret = slots_ret;
+	slots_ret = NULL;
+out:
+	mutex_unlock(&swap_slots_cache_mutex);
+	if (slots)
+		vfree(slots);
+	if (slots_ret)
+		vfree(slots_ret);
+	return 0;
+}
+
+static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
+				  bool free_slots)
+{
+	struct swap_slots_cache *cache;
+	swp_entry_t *slots = NULL;
+
+	cache = &per_cpu(swp_slots, cpu);
+	if ((type & SLOTS_CACHE) && cache->slots) {
+		mutex_lock(&cache->alloc_lock);
+		swapcache_free_entries(cache->slots + cache->cur, cache->nr);
+		cache->cur = 0;
+		cache->nr = 0;
+		if (free_slots && cache->slots) {
+			vfree(cache->slots);
+			cache->slots = NULL;
+		}
+		mutex_unlock(&cache->alloc_lock);
+	}
+	if ((type & SLOTS_CACHE_RET) && cache->slots_ret) {
+		spin_lock_irq(&cache->free_lock);
+		swapcache_free_entries(cache->slots_ret, cache->n_ret);
+		cache->n_ret = 0;
+		if (free_slots && cache->slots_ret) {
+			slots = cache->slots_ret;
+			cache->slots_ret = NULL;
+		}
+		spin_unlock_irq(&cache->free_lock);
+		if (slots)
+			vfree(slots);
+	}
+}
+
+static void __drain_swap_slots_cache(unsigned int type)
+{
+	unsigned int cpu;
+
+	/*
+	 * This function is called during
+	 *	1) swapoff, when we have to make sure no
+	 *	   left over slots are in cache when we remove
+	 *	   a swap device;
+	 *      2) disabling of swap slot cache, when we run low
+	 *	   on swap slots when allocating memory and need
+	 *	   to return swap slots to global pool.
+	 *
+	 * We cannot acquire cpu hot plug lock here as
+	 * this function can be invoked in the cpu
+	 * hot plug path:
+	 * cpu_up -> lock cpu_hotplug -> cpu hotplug state callback
+	 *   -> memory allocation -> direct reclaim -> get_swap_page
+	 *   -> drain_swap_slots_cache
+	 *
+	 * Hence the loop over current online cpu below could miss cpu that
+	 * is being brought online but not yet marked as online.
+	 * That is okay as we do not schedule and run anything on a
+	 * cpu before it has been marked online. Hence, we will not
+	 * fill any swap slots in slots cache of such cpu.
+	 * There are no slots on such cpu that need to be drained.
+	 */
+	for_each_online_cpu(cpu)
+		drain_slots_cache_cpu(cpu, type, false);
+}
+
+static int free_slot_cache(unsigned int cpu)
+{
+	mutex_lock(&swap_slots_cache_mutex);
+	drain_slots_cache_cpu(cpu, SLOTS_CACHE | SLOTS_CACHE_RET, true);
+	mutex_unlock(&swap_slots_cache_mutex);
+	return 0;
+}
+
+int enable_swap_slots_cache(void)
+{
+	int ret = 0;
+
+	mutex_lock(&swap_slots_cache_enable_mutex);
+	if (swap_slot_cache_initialized) {
+		__reenable_swap_slots_cache();
+		goto out_unlock;
+	}
+
+	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache",
+				alloc_swap_slot_cache, free_slot_cache);
+	if (ret < 0)
+		goto out_unlock;
+	swap_slot_cache_initialized = true;
+	__reenable_swap_slots_cache();
+out_unlock:
+	mutex_unlock(&swap_slots_cache_enable_mutex);
+	return 0;
+}
+
+/* called with swap slot cache's alloc lock held */
+static int refill_swap_slots_cache(struct swap_slots_cache *cache)
+{
+	if (!use_swap_slot_cache || cache->nr)
+		return 0;
+
+	cache->cur = 0;
+	if (swap_slot_cache_active)
+		cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, cache->slots);
+
+	return cache->nr;
+}
+
+int free_swap_slot(swp_entry_t entry)
+{
+	struct swap_slots_cache *cache;
+
+	BUG_ON(!swap_slot_cache_initialized);
+
+	cache = &get_cpu_var(swp_slots);
+	if (use_swap_slot_cache && cache->slots_ret) {
+		spin_lock_irq(&cache->free_lock);
+		/* Swap slots cache may be deactivated before acquiring lock */
+		if (!use_swap_slot_cache) {
+			spin_unlock_irq(&cache->free_lock);
+			goto direct_free;
+		}
+		if (cache->n_ret >= SWAP_SLOTS_CACHE_SIZE) {
+			/*
+			 * Return slots to global pool.
+			 * The current swap_map value is SWAP_HAS_CACHE.
+			 * Set it to 0 to indicate it is available for
+			 * allocation in global pool
+			 */
+			swapcache_free_entries(cache->slots_ret, cache->n_ret);
+			cache->n_ret = 0;
+		}
+		cache->slots_ret[cache->n_ret++] = entry;
+		spin_unlock_irq(&cache->free_lock);
+	} else {
+direct_free:
+		swapcache_free_entries(&entry, 1);
+	}
+	put_cpu_var(swp_slots);
+
+	return 0;
+}
+
+swp_entry_t get_swap_page(void)
+{
+	swp_entry_t entry, *pentry;
+	struct swap_slots_cache *cache;
+
+	/*
+	 * Preemption is allowed here, because we may sleep
+	 * in refill_swap_slots_cache().  But it is safe, because
+	 * accesses to the per-CPU data structure are protected by the
+	 * mutex cache->alloc_lock.
+	 *
+	 * The alloc path here does not touch cache->slots_ret
+	 * so cache->free_lock is not taken.
+	 */
+	cache = raw_cpu_ptr(&swp_slots);
+
+	entry.val = 0;
+	if (check_cache_active()) {
+		mutex_lock(&cache->alloc_lock);
+		if (cache->slots) {
+repeat:
+			if (cache->nr) {
+				pentry = &cache->slots[cache->cur++];
+				entry = *pentry;
+				pentry->val = 0;
+				cache->nr--;
+			} else {
+				if (refill_swap_slots_cache(cache))
+					goto repeat;
+			}
+		}
+		mutex_unlock(&cache->alloc_lock);
+		if (entry.val)
+			return entry;
+	}
+
+	get_swap_pages(1, &entry);
+
+	return entry;
+}
+
+#endif /* CONFIG_SWAP */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3d76d80c07d6..e1f07cafecaa 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -18,6 +18,7 @@
 #include <linux/pagevec.h>
 #include <linux/migrate.h>
 #include <linux/vmalloc.h>
+#include <linux/swap_slots.h>
 
 #include <asm/pgtable.h>
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8b5bd34b1a00..30a90fd140b7 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -34,6 +34,7 @@
 #include <linux/frontswap.h>
 #include <linux/swapfile.h>
 #include <linux/export.h>
+#include <linux/swap_slots.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -854,14 +855,6 @@ noswap:
 	return n_ret;
 }
 
-swp_entry_t get_swap_page(void)
-{
-	swp_entry_t entry;
-
-	get_swap_pages(1, &entry);
-	return entry;
-}
-
 /* The only caller of this function is now suspend routine */
 swp_entry_t get_swap_page_of_type(int type)
 {
@@ -1052,7 +1045,7 @@ void swap_free(swp_entry_t entry)
 	p = _swap_info_get(entry);
 	if (p) {
 		if (!__swap_entry_free(p, entry, 1))
-			swapcache_free_entries(&entry, 1);
+			free_swap_slot(entry);
 	}
 }
 
@@ -1066,7 +1059,7 @@ void swapcache_free(swp_entry_t entry)
 	p = _swap_info_get(entry);
 	if (p) {
 		if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE))
-			swapcache_free_entries(&entry, 1);
+			free_swap_slot(entry);
 	}
 }
 
@@ -1288,7 +1281,7 @@ int free_swap_and_cache(swp_entry_t entry)
 				page = NULL;
 			}
 		} else if (!count)
-			swapcache_free_entries(&entry, 1);
+			free_swap_slot(entry);
 	}
 	if (page) {
 		/*
@@ -2116,6 +2109,17 @@ static void reinsert_swap_info(struct swap_info_struct *p)
 	spin_unlock(&swap_lock);
 }
 
+bool has_usable_swap(void)
+{
+	bool ret = true;
+
+	spin_lock(&swap_lock);
+	if (plist_head_empty(&swap_active_head))
+		ret = false;
+	spin_unlock(&swap_lock);
+	return ret;
+}
+
 SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 {
 	struct swap_info_struct *p = NULL;
-- 
cgit v1.2.3


From ba81f83842549871cbd7226fc11530dc464500bb Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 22 Feb 2017 15:45:46 -0800
Subject: mm/swap: skip readahead only when swap slot cache is enabled

Because during swap off, a swap entry may have swap_map[] ==
SWAP_HAS_CACHE (for example, just allocated).  If we return NULL in
__read_swap_cache_async(), the swap off will abort.  So when swap slot
cache is disabled, (for swap off), we will wait for page to be put into
swap cache in such race condition.  This should not be a problem for swap
slot cache, because swap slot cache should be drained after clearing
swap_slot_cache_enabled.

[ying.huang@intel.com: fix memory leak in __read_swap_cache_async()]
  Link: http://lkml.kernel.org/r/874lzt6znd.fsf@yhuang-dev.intel.com
Link: http://lkml.kernel.org/r/5e2c5f6abe8e6eb0797408897b1bba80938e9b9d.1484082593.git.tim.c.chen@linux.intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net> escreveu:
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap_slots.h |  2 ++
 mm/swap_slots.c            |  2 +-
 mm/swap_state.c            | 13 ++++++++++---
 3 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h
index ba5623b27c60..6ef92d17633d 100644
--- a/include/linux/swap_slots.h
+++ b/include/linux/swap_slots.h
@@ -25,4 +25,6 @@ void reenable_swap_slots_cache_unlock(void);
 int enable_swap_slots_cache(void);
 int free_swap_slot(swp_entry_t entry);
 
+extern bool swap_slot_cache_enabled;
+
 #endif /* _LINUX_SWAP_SLOTS_H */
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index ebf4f1cbac04..9b5bc86f96ad 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -36,7 +36,7 @@
 
 static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots);
 static bool	swap_slot_cache_active;
-static bool	swap_slot_cache_enabled;
+bool	swap_slot_cache_enabled;
 static bool	swap_slot_cache_initialized;
 DEFINE_MUTEX(swap_slots_cache_mutex);
 /* Serialize swap slots cache enable/disable operations */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e1f07cafecaa..473b71e052a8 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -324,9 +324,16 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		if (found_page)
 			break;
 
-		/* Just skip read ahead for unused swap slot */
-		if (!__swp_swapcount(entry))
-			return NULL;
+		/*
+		 * Just skip read ahead for unused swap slot.
+		 * During swap_off when swap_slot_cache is disabled,
+		 * we have to handle the race between putting
+		 * swap entry in swap cache and marking swap slot
+		 * as SWAP_HAS_CACHE.  That's done in later part of code or
+		 * else swap_off will be aborted if we return NULL.
+		 */
+		if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
+			break;
 
 		/*
 		 * Get a new page to read into from swap.
-- 
cgit v1.2.3


From 21440d7eb9044001b7fdb71d0163689f60a0f2a1 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Wed, 22 Feb 2017 15:45:49 -0800
Subject: mm, thp: add new defer+madvise defrag option

There is no thp defrag option that currently allows MADV_HUGEPAGE
regions to do direct compaction and reclaim while all other thp
allocations simply trigger kswapd and kcompactd in the background and
fail immediately.

The "defer" setting simply triggers background reclaim and compaction
for all regions, regardless of MADV_HUGEPAGE, which makes it unusable
for our userspace where MADV_HUGEPAGE is being used to indicate the
application is willing to wait for work for thp memory to be available.

The "madvise" setting will do direct compaction and reclaim for these
MADV_HUGEPAGE regions, but does not trigger kswapd and kcompactd in the
background for anybody else.

For reasonable usage, there needs to be a mesh between the two options.
This patch introduces a fifth mode, "defer+madvise", that will do direct
reclaim and compaction for MADV_HUGEPAGE regions and trigger background
reclaim and compaction for everybody else so that hugepages may be
available in the near future.

A proposal to allow direct reclaim and compaction for MADV_HUGEPAGE
regions as part of the "defer" mode, making it a very powerful setting
and avoids breaking userspace, was offered:
     http://marc.info/?t=148236612700003
This additional mode is a compromise.

A second proposal to allow both "defer" and "madvise" to be selected at
the same time was also offered:
     http://marc.info/?t=148357345300001.
This is possible, but there was a concern that it might break existing
userspaces the parse the output of the defrag mode, so the fifth option
was introduced instead.

This patch also cleans up the helper function for storing to "enabled"
and "defrag" since the former supports three modes while the latter
supports five and triple_flag_store() was getting unnecessarily messy.

Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1701101614330.41805@chino.kir.corp.google.com
Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/vm/transhuge.txt |   8 ++-
 include/linux/huge_mm.h        |   1 +
 mm/huge_memory.c               | 146 +++++++++++++++++++++--------------------
 3 files changed, 82 insertions(+), 73 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index c4171e4519c2..8fda5b7b24e9 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -110,6 +110,7 @@ MADV_HUGEPAGE region.
 
 echo always >/sys/kernel/mm/transparent_hugepage/defrag
 echo defer >/sys/kernel/mm/transparent_hugepage/defrag
+echo defer+madvise >/sys/kernel/mm/transparent_hugepage/defrag
 echo madvise >/sys/kernel/mm/transparent_hugepage/defrag
 echo never >/sys/kernel/mm/transparent_hugepage/defrag
 
@@ -120,10 +121,15 @@ that benefit heavily from THP use and are willing to delay the VM start
 to utilise them.
 
 "defer" means that an application will wake kswapd in the background
-to reclaim pages and wake kcompact to compact memory so that THP is
+to reclaim pages and wake kcompactd to compact memory so that THP is
 available in the near future. It's the responsibility of khugepaged
 to then install the THP pages later.
 
+"defer+madvise" will enter direct reclaim and compaction like "always", but
+only for regions that have used madvise(MADV_HUGEPAGE); all other regions
+will wake kswapd in the background to reclaim pages and wake kcompactd to
+compact memory so that THP is available in the near future.
+
 "madvise" will enter direct reclaim like "always" but only for regions
 that are have used madvise(MADV_HUGEPAGE). This is the default behaviour.
 
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 97e478d6b690..f0029e786205 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -33,6 +33,7 @@ enum transparent_hugepage_flag {
 	TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
 	TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
 	TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
+	TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
 	TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
 	TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
 	TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5f3ad65c85de..f9ecc2aeadfc 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -142,42 +142,6 @@ static struct shrinker huge_zero_page_shrinker = {
 };
 
 #ifdef CONFIG_SYSFS
-
-static ssize_t triple_flag_store(struct kobject *kobj,
-				 struct kobj_attribute *attr,
-				 const char *buf, size_t count,
-				 enum transparent_hugepage_flag enabled,
-				 enum transparent_hugepage_flag deferred,
-				 enum transparent_hugepage_flag req_madv)
-{
-	if (!memcmp("defer", buf,
-		    min(sizeof("defer")-1, count))) {
-		if (enabled == deferred)
-			return -EINVAL;
-		clear_bit(enabled, &transparent_hugepage_flags);
-		clear_bit(req_madv, &transparent_hugepage_flags);
-		set_bit(deferred, &transparent_hugepage_flags);
-	} else if (!memcmp("always", buf,
-		    min(sizeof("always")-1, count))) {
-		clear_bit(deferred, &transparent_hugepage_flags);
-		clear_bit(req_madv, &transparent_hugepage_flags);
-		set_bit(enabled, &transparent_hugepage_flags);
-	} else if (!memcmp("madvise", buf,
-			   min(sizeof("madvise")-1, count))) {
-		clear_bit(enabled, &transparent_hugepage_flags);
-		clear_bit(deferred, &transparent_hugepage_flags);
-		set_bit(req_madv, &transparent_hugepage_flags);
-	} else if (!memcmp("never", buf,
-			   min(sizeof("never")-1, count))) {
-		clear_bit(enabled, &transparent_hugepage_flags);
-		clear_bit(req_madv, &transparent_hugepage_flags);
-		clear_bit(deferred, &transparent_hugepage_flags);
-	} else
-		return -EINVAL;
-
-	return count;
-}
-
 static ssize_t enabled_show(struct kobject *kobj,
 			    struct kobj_attribute *attr, char *buf)
 {
@@ -193,19 +157,28 @@ static ssize_t enabled_store(struct kobject *kobj,
 			     struct kobj_attribute *attr,
 			     const char *buf, size_t count)
 {
-	ssize_t ret;
+	ssize_t ret = count;
 
-	ret = triple_flag_store(kobj, attr, buf, count,
-				TRANSPARENT_HUGEPAGE_FLAG,
-				TRANSPARENT_HUGEPAGE_FLAG,
-				TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
+	if (!memcmp("always", buf,
+		    min(sizeof("always")-1, count))) {
+		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
+		set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
+	} else if (!memcmp("madvise", buf,
+			   min(sizeof("madvise")-1, count))) {
+		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
+		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
+	} else if (!memcmp("never", buf,
+			   min(sizeof("never")-1, count))) {
+		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
+		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
+	} else
+		ret = -EINVAL;
 
 	if (ret > 0) {
 		int err = start_stop_khugepaged();
 		if (err)
 			ret = err;
 	}
-
 	return ret;
 }
 static struct kobj_attribute enabled_attr =
@@ -241,32 +214,58 @@ ssize_t single_hugepage_flag_store(struct kobject *kobj,
 	return count;
 }
 
-/*
- * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
- * __GFP_REPEAT is too aggressive, it's never worth swapping tons of
- * memory just to allocate one more hugepage.
- */
 static ssize_t defrag_show(struct kobject *kobj,
 			   struct kobj_attribute *attr, char *buf)
 {
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
-		return sprintf(buf, "[always] defer madvise never\n");
+		return sprintf(buf, "[always] defer defer+madvise madvise never\n");
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
-		return sprintf(buf, "always [defer] madvise never\n");
-	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
-		return sprintf(buf, "always defer [madvise] never\n");
-	else
-		return sprintf(buf, "always defer madvise [never]\n");
-
+		return sprintf(buf, "always [defer] defer+madvise madvise never\n");
+	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
+		return sprintf(buf, "always defer [defer+madvise] madvise never\n");
+	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
+		return sprintf(buf, "always defer defer+madvise [madvise] never\n");
+	return sprintf(buf, "always defer defer+madvise madvise [never]\n");
 }
+
 static ssize_t defrag_store(struct kobject *kobj,
 			    struct kobj_attribute *attr,
 			    const char *buf, size_t count)
 {
-	return triple_flag_store(kobj, attr, buf, count,
-				 TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
-				 TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
-				 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
+	if (!memcmp("always", buf,
+		    min(sizeof("always")-1, count))) {
+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+	} else if (!memcmp("defer", buf,
+		    min(sizeof("defer")-1, count))) {
+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+	} else if (!memcmp("defer+madvise", buf,
+		    min(sizeof("defer+madvise")-1, count))) {
+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+	} else if (!memcmp("madvise", buf,
+			   min(sizeof("madvise")-1, count))) {
+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+	} else if (!memcmp("never", buf,
+			   min(sizeof("never")-1, count))) {
+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+	} else
+		return -EINVAL;
+
+	return count;
 }
 static struct kobj_attribute defrag_attr =
 	__ATTR(defrag, 0644, defrag_show, defrag_store);
@@ -612,25 +611,28 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 }
 
 /*
- * If THP defrag is set to always then directly reclaim/compact as necessary
- * If set to defer then do only background reclaim/compact and defer to khugepaged
- * If set to madvise and the VMA is flagged then directly reclaim/compact
- * When direct reclaim/compact is allowed, don't retry except for flagged VMA's
+ * always: directly stall for all thp allocations
+ * defer: wake kswapd and fail if not immediately available
+ * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
+ *		  fail if not immediately available
+ * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
+ *	    available
+ * never: never stall for any thp allocation
  */
 static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
 {
-	bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+	const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
 
-	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
-				&transparent_hugepage_flags) && vma_madvised)
-		return GFP_TRANSHUGE;
-	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
-						&transparent_hugepage_flags))
-		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
-	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
-						&transparent_hugepage_flags))
+	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
 		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
-
+	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
+		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
+		return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
+							     __GFP_KSWAPD_RECLAIM);
+	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
+		return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
+							     0);
 	return GFP_TRANSHUGE_LIGHT;
 }
 
-- 
cgit v1.2.3


From fd538803731e50367b7c59ce4ad3454426a3d671 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 22 Feb 2017 15:45:58 -0800
Subject: mm, vmscan: cleanup lru size claculations

lruvec_lru_size returns the full size of the LRU list while we sometimes
need a value reduced only to eligible zones (e.g.  for lowmem requests).
inactive_list_is_low is one such user.  Later patches will add more of
them.  Add a new parameter to lruvec_lru_size and allow it filter out
zones which are not eligible for the given context.

Link: http://lkml.kernel.org/r/20170117103702.28542-2-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h |  2 +-
 mm/vmscan.c            | 89 +++++++++++++++++++++++++-------------------------
 mm/workingset.c        |  2 +-
 3 files changed, 46 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index f4aac87adcc3..82fc632fd11d 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -779,7 +779,7 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
 #endif
 }
 
-extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru);
+extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx);
 
 #ifdef CONFIG_HAVE_MEMORY_PRESENT
 void memory_present(int nid, unsigned long start, unsigned long end);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 277e105646a5..e7c75a3c6b53 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -234,22 +234,39 @@ bool pgdat_reclaimable(struct pglist_data *pgdat)
 		pgdat_reclaimable_pages(pgdat) * 6;
 }
 
-unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
+/**
+ * lruvec_lru_size -  Returns the number of pages on the given LRU list.
+ * @lruvec: lru vector
+ * @lru: lru to use
+ * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
+ */
+unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
 {
+	unsigned long lru_size;
+	int zid;
+
 	if (!mem_cgroup_disabled())
-		return mem_cgroup_get_lru_size(lruvec, lru);
+		lru_size = mem_cgroup_get_lru_size(lruvec, lru);
+	else
+		lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
 
-	return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
-}
+	for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
+		struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
+		unsigned long size;
 
-unsigned long lruvec_zone_lru_size(struct lruvec *lruvec, enum lru_list lru,
-				   int zone_idx)
-{
-	if (!mem_cgroup_disabled())
-		return mem_cgroup_get_zone_lru_size(lruvec, lru, zone_idx);
+		if (!managed_zone(zone))
+			continue;
+
+		if (!mem_cgroup_disabled())
+			size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
+		else
+			size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid],
+				       NR_ZONE_LRU_BASE + lru);
+		lru_size -= min(size, lru_size);
+	}
+
+	return lru_size;
 
-	return zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zone_idx],
-			       NR_ZONE_LRU_BASE + lru);
 }
 
 /*
@@ -2049,11 +2066,10 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
 						struct scan_control *sc, bool trace)
 {
 	unsigned long inactive_ratio;
-	unsigned long total_inactive, inactive;
-	unsigned long total_active, active;
+	unsigned long inactive, active;
+	enum lru_list inactive_lru = file * LRU_FILE;
+	enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
 	unsigned long gb;
-	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
-	int zid;
 
 	/*
 	 * If we don't have swap space, anonymous page deactivation
@@ -2062,27 +2078,8 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
 	if (!file && !total_swap_pages)
 		return false;
 
-	total_inactive = inactive = lruvec_lru_size(lruvec, file * LRU_FILE);
-	total_active = active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);
-
-	/*
-	 * For zone-constrained allocations, it is necessary to check if
-	 * deactivations are required for lowmem to be reclaimed. This
-	 * calculates the inactive/active pages available in eligible zones.
-	 */
-	for (zid = sc->reclaim_idx + 1; zid < MAX_NR_ZONES; zid++) {
-		struct zone *zone = &pgdat->node_zones[zid];
-		unsigned long inactive_zone, active_zone;
-
-		if (!managed_zone(zone))
-			continue;
-
-		inactive_zone = lruvec_zone_lru_size(lruvec, file * LRU_FILE, zid);
-		active_zone = lruvec_zone_lru_size(lruvec, (file * LRU_FILE) + LRU_ACTIVE, zid);
-
-		inactive -= min(inactive, inactive_zone);
-		active -= min(active, active_zone);
-	}
+	inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
+	active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
 
 	gb = (inactive + active) >> (30 - PAGE_SHIFT);
 	if (gb)
@@ -2091,10 +2088,12 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
 		inactive_ratio = 1;
 
 	if (trace)
-		trace_mm_vmscan_inactive_list_is_low(pgdat->node_id,
+		trace_mm_vmscan_inactive_list_is_low(lruvec_pgdat(lruvec)->node_id,
 				sc->reclaim_idx,
-				total_inactive, inactive,
-				total_active, active, inactive_ratio, file);
+				lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
+				lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
+				inactive_ratio, file);
+
 	return inactive * inactive_ratio < active;
 }
 
@@ -2234,7 +2233,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	 * system is under heavy pressure.
 	 */
 	if (!inactive_list_is_low(lruvec, true, sc, false) &&
-	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
+	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES) >> sc->priority) {
 		scan_balance = SCAN_FILE;
 		goto out;
 	}
@@ -2260,10 +2259,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	 * anon in [0], file in [1]
 	 */
 
-	anon  = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) +
-		lruvec_lru_size(lruvec, LRU_INACTIVE_ANON);
-	file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
-		lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
+	anon  = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
+		lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
+	file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
+		lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);
 
 	spin_lock_irq(&pgdat->lru_lock);
 	if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
@@ -2301,7 +2300,7 @@ out:
 			unsigned long size;
 			unsigned long scan;
 
-			size = lruvec_lru_size(lruvec, lru);
+			size = lruvec_lru_size(lruvec, lru, MAX_NR_ZONES);
 			scan = size >> sc->priority;
 
 			if (!scan && pass && force_scan)
diff --git a/mm/workingset.c b/mm/workingset.c
index abb58ffa3c64..a67f5796b995 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -267,7 +267,7 @@ bool workingset_refault(void *shadow)
 	}
 	lruvec = mem_cgroup_lruvec(pgdat, memcg);
 	refault = atomic_long_read(&lruvec->inactive_age);
-	active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
+	active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);
 	rcu_read_unlock();
 
 	/*
-- 
cgit v1.2.3


From a8e99259e7e32b67af2b447f0a570813c0c283ec Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 22 Feb 2017 15:46:10 -0800
Subject: mm, page_alloc: warn_alloc print nodemask

warn_alloc is currently used for to report an allocation failure or an
allocation stall.  We print some details of the allocation request like
the gfp mask and the request order.  We do not print the allocation
nodemask which is important when debugging the reason for the allocation
failure as well.  We alreaddy print the nodemask in the OOM report.

Add nodemask to warn_alloc and print it in warn_alloc as well.

Link: http://lkml.kernel.org/r/20170117091543.25850-3-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  4 ++--
 mm/page_alloc.c    | 10 ++++++----
 mm/vmalloc.c       |  4 ++--
 3 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index dae6f58d67c8..28b6c3f8a7f3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1942,8 +1942,8 @@ extern void si_meminfo_node(struct sysinfo *val, int nid);
 extern unsigned long arch_reserved_kernel_pages(void);
 #endif
 
-extern __printf(2, 3)
-void warn_alloc(gfp_t gfp_mask, const char *fmt, ...);
+extern __printf(3, 4)
+void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...);
 
 extern void setup_per_cpu_pageset(void);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cf6b53dc08f9..96c8fe602dfb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3028,12 +3028,13 @@ static void warn_alloc_show_mem(gfp_t gfp_mask)
 	show_mem(filter);
 }
 
-void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
+void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 {
 	struct va_format vaf;
 	va_list args;
 	static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
 				      DEFAULT_RATELIMIT_BURST);
+	nodemask_t *nm = (nodemask) ? nodemask : &cpuset_current_mems_allowed;
 
 	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
 	    debug_guardpage_minorder() > 0)
@@ -3047,7 +3048,8 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 	pr_cont("%pV", &vaf);
 	va_end(args);
 
-	pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask);
+	pr_cont(", mode:%#x(%pGg), nodemask=%*pbl\n", gfp_mask, &gfp_mask, nodemask_pr_args(nm));
+	cpuset_print_current_mems_allowed();
 
 	dump_stack();
 	warn_alloc_show_mem(gfp_mask);
@@ -3724,7 +3726,7 @@ retry:
 
 	/* Make sure we know about allocations which stall for too long */
 	if (time_after(jiffies, alloc_start + stall_timeout)) {
-		warn_alloc(gfp_mask,
+		warn_alloc(gfp_mask, ac->nodemask,
 			"page allocation stalls for %ums, order:%u",
 			jiffies_to_msecs(jiffies-alloc_start), order);
 		stall_timeout += 10 * HZ;
@@ -3775,7 +3777,7 @@ nopage:
 	if (read_mems_allowed_retry(cpuset_mems_cookie))
 		goto retry_cpuset;
 
-	warn_alloc(gfp_mask,
+	warn_alloc(gfp_mask, ac->nodemask,
 			"page allocation failure: order:%u", order);
 got_pg:
 	return page;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5f5b09e9dccd..d89034a393f2 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1662,7 +1662,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	return area->addr;
 
 fail:
-	warn_alloc(gfp_mask,
+	warn_alloc(gfp_mask, NULL,
 			  "vmalloc: allocation failure, allocated %ld of %ld bytes",
 			  (area->nr_pages*PAGE_SIZE), area->size);
 	vfree(area->addr);
@@ -1724,7 +1724,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
 	return addr;
 
 fail:
-	warn_alloc(gfp_mask,
+	warn_alloc(gfp_mask, NULL,
 			  "vmalloc: allocation failure: %lu bytes", real_size);
 	return NULL;
 }
-- 
cgit v1.2.3


From 9af744d743170b5f5ef70031dea8d772d166ab28 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 22 Feb 2017 15:46:16 -0800
Subject: lib/show_mem.c: teach show_mem to work with the given nodemask

show_mem() allows to filter out node specific data which is irrelevant
to the allocation request via SHOW_MEM_FILTER_NODES.  The filtering is
done in skip_free_areas_node which skips all nodes which are not in the
mems_allowed of the current process.  This works most of the time as
expected because the nodemask shouldn't be outside of the allocating
task but there are some exceptions.  E.g.  memory hotplug might want to
request allocations from outside of the allowed nodes (see
new_node_page).

Get rid of this hardcoded behavior and push the allocation mask down the
show_mem path and use it instead of cpuset_current_mems_allowed.  NULL
nodemask is interpreted as cpuset_current_mems_allowed.

[akpm@linux-foundation.org: coding-style fixes]
Link: http://lkml.kernel.org/r/20170117091543.25850-5-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/xmon/xmon.c            |  2 +-
 arch/sparc/kernel/setup_32.c        |  2 +-
 drivers/net/ethernet/sgi/ioc3-eth.c |  2 +-
 drivers/tty/sysrq.c                 |  2 +-
 drivers/tty/vt/keyboard.c           |  2 +-
 include/linux/mm.h                  |  5 ++---
 lib/show_mem.c                      |  4 ++--
 mm/nommu.c                          |  6 +++---
 mm/oom_kill.c                       |  2 +-
 mm/page_alloc.c                     | 38 ++++++++++++++++++-------------------
 10 files changed, 32 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 1be0499f5397..5720236d0266 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -916,7 +916,7 @@ cmds(struct pt_regs *excp)
 				memzcan();
 				break;
 			case 'i':
-				show_mem(0);
+				show_mem(0, NULL);
 				break;
 			default:
 				termch = cmd;
diff --git a/arch/sparc/kernel/setup_32.c b/arch/sparc/kernel/setup_32.c
index c4e65cb3280f..6f06058c5ae7 100644
--- a/arch/sparc/kernel/setup_32.c
+++ b/arch/sparc/kernel/setup_32.c
@@ -82,7 +82,7 @@ static void prom_sync_me(void)
 			     "nop\n\t" : : "r" (&trapbase));
 
 	prom_printf("PROM SYNC COMMAND...\n");
-	show_free_areas(0);
+	show_free_areas(0, NULL);
 	if (!is_idle_task(current)) {
 		local_irq_enable();
 		sys_sync();
diff --git a/drivers/net/ethernet/sgi/ioc3-eth.c b/drivers/net/ethernet/sgi/ioc3-eth.c
index d390b9663dc3..57e6cef81ebe 100644
--- a/drivers/net/ethernet/sgi/ioc3-eth.c
+++ b/drivers/net/ethernet/sgi/ioc3-eth.c
@@ -914,7 +914,7 @@ static void ioc3_alloc_rings(struct net_device *dev)
 
 			skb = ioc3_alloc_skb(RX_BUF_ALLOC_SIZE, GFP_ATOMIC);
 			if (!skb) {
-				show_free_areas(0);
+				show_free_areas(0, NULL);
 				continue;
 			}
 
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index 701c085bb19b..71136742e606 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -317,7 +317,7 @@ static struct sysrq_key_op sysrq_ftrace_dump_op = {
 
 static void sysrq_handle_showmem(int key)
 {
-	show_mem(0);
+	show_mem(0, NULL);
 }
 static struct sysrq_key_op sysrq_showmem_op = {
 	.handler	= sysrq_handle_showmem,
diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c
index 3dd6a491cdba..397e1509fe51 100644
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -572,7 +572,7 @@ static void fn_scroll_back(struct vc_data *vc)
 
 static void fn_show_mem(struct vc_data *vc)
 {
-	show_mem(0);
+	show_mem(0, NULL);
 }
 
 static void fn_show_state(struct vc_data *vc)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 28b6c3f8a7f3..8a67cae5a07c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1152,8 +1152,7 @@ extern void pagefault_out_of_memory(void);
  */
 #define SHOW_MEM_FILTER_NODES		(0x0001u)	/* disallowed nodes */
 
-extern void show_free_areas(unsigned int flags);
-extern bool skip_free_areas_node(unsigned int flags, int nid);
+extern void show_free_areas(unsigned int flags, nodemask_t *nodemask);
 
 int shmem_zero_setup(struct vm_area_struct *);
 #ifdef CONFIG_SHMEM
@@ -1934,7 +1933,7 @@ extern void setup_per_zone_wmarks(void);
 extern int __meminit init_per_zone_wmark_min(void);
 extern void mem_init(void);
 extern void __init mmap_init(void);
-extern void show_mem(unsigned int flags);
+extern void show_mem(unsigned int flags, nodemask_t *nodemask);
 extern long si_mem_available(void);
 extern void si_meminfo(struct sysinfo * val);
 extern void si_meminfo_node(struct sysinfo *val, int nid);
diff --git a/lib/show_mem.c b/lib/show_mem.c
index 1feed6a2b12a..0beaa1d899aa 100644
--- a/lib/show_mem.c
+++ b/lib/show_mem.c
@@ -9,13 +9,13 @@
 #include <linux/quicklist.h>
 #include <linux/cma.h>
 
-void show_mem(unsigned int filter)
+void show_mem(unsigned int filter, nodemask_t *nodemask)
 {
 	pg_data_t *pgdat;
 	unsigned long total = 0, reserved = 0, highmem = 0;
 
 	printk("Mem-Info:\n");
-	show_free_areas(filter);
+	show_free_areas(filter, nodemask);
 
 	for_each_online_pgdat(pgdat) {
 		unsigned long flags;
diff --git a/mm/nommu.c b/mm/nommu.c
index 24f9f5f39145..bc964c26be8c 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1191,7 +1191,7 @@ error_free:
 enomem:
 	pr_err("Allocation of length %lu from process %d (%s) failed\n",
 	       len, current->pid, current->comm);
-	show_free_areas(0);
+	show_free_areas(0, NULL);
 	return -ENOMEM;
 }
 
@@ -1412,13 +1412,13 @@ error_getting_vma:
 	kmem_cache_free(vm_region_jar, region);
 	pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n",
 			len, current->pid);
-	show_free_areas(0);
+	show_free_areas(0, NULL);
 	return -ENOMEM;
 
 error_getting_region:
 	pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n",
 			len, current->pid);
-	show_free_areas(0);
+	show_free_areas(0, NULL);
 	return -ENOMEM;
 }
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ec9f11d4f094..7176b6a754cf 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -417,7 +417,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p)
 	if (oc->memcg)
 		mem_cgroup_print_oom_info(oc->memcg, p);
 	else
-		show_mem(SHOW_MEM_FILTER_NODES);
+		show_mem(SHOW_MEM_FILTER_NODES, nm);
 	if (sysctl_oom_dump_tasks)
 		dump_tasks(oc->memcg, oc->nodemask);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 96c8fe602dfb..644fb75f6f24 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3005,7 +3005,7 @@ static inline bool should_suppress_show_mem(void)
 	return ret;
 }
 
-static void warn_alloc_show_mem(gfp_t gfp_mask)
+static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
 {
 	unsigned int filter = SHOW_MEM_FILTER_NODES;
 	static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
@@ -3025,7 +3025,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask)
 	if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
 		filter &= ~SHOW_MEM_FILTER_NODES;
 
-	show_mem(filter);
+	show_mem(filter, nodemask);
 }
 
 void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
@@ -3052,7 +3052,7 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 	cpuset_print_current_mems_allowed();
 
 	dump_stack();
-	warn_alloc_show_mem(gfp_mask);
+	warn_alloc_show_mem(gfp_mask, nm);
 }
 
 static inline struct page *
@@ -4274,20 +4274,20 @@ void si_meminfo_node(struct sysinfo *val, int nid)
  * Determine whether the node should be displayed or not, depending on whether
  * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
  */
-bool skip_free_areas_node(unsigned int flags, int nid)
+static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
 {
-	bool ret = false;
-	unsigned int cpuset_mems_cookie;
-
 	if (!(flags & SHOW_MEM_FILTER_NODES))
-		goto out;
+		return false;
 
-	do {
-		cpuset_mems_cookie = read_mems_allowed_begin();
-		ret = !node_isset(nid, cpuset_current_mems_allowed);
-	} while (read_mems_allowed_retry(cpuset_mems_cookie));
-out:
-	return ret;
+	/*
+	 * no node mask - aka implicit memory numa policy. Do not bother with
+	 * the synchronization - read_mems_allowed_begin - because we do not
+	 * have to be precise here.
+	 */
+	if (!nodemask)
+		nodemask = &cpuset_current_mems_allowed;
+
+	return !node_isset(nid, *nodemask);
 }
 
 #define K(x) ((x) << (PAGE_SHIFT-10))
@@ -4328,7 +4328,7 @@ static void show_migration_types(unsigned char type)
  * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
  *   cpuset.
  */
-void show_free_areas(unsigned int filter)
+void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 {
 	unsigned long free_pcp = 0;
 	int cpu;
@@ -4336,7 +4336,7 @@ void show_free_areas(unsigned int filter)
 	pg_data_t *pgdat;
 
 	for_each_populated_zone(zone) {
-		if (skip_free_areas_node(filter, zone_to_nid(zone)))
+		if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
 			continue;
 
 		for_each_online_cpu(cpu)
@@ -4370,7 +4370,7 @@ void show_free_areas(unsigned int filter)
 		global_page_state(NR_FREE_CMA_PAGES));
 
 	for_each_online_pgdat(pgdat) {
-		if (skip_free_areas_node(filter, pgdat->node_id))
+		if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
 			continue;
 
 		printk("Node %d"
@@ -4422,7 +4422,7 @@ void show_free_areas(unsigned int filter)
 	for_each_populated_zone(zone) {
 		int i;
 
-		if (skip_free_areas_node(filter, zone_to_nid(zone)))
+		if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
 			continue;
 
 		free_pcp = 0;
@@ -4487,7 +4487,7 @@ void show_free_areas(unsigned int filter)
 		unsigned long nr[MAX_ORDER], flags, total = 0;
 		unsigned char types[MAX_ORDER];
 
-		if (skip_free_areas_node(filter, zone_to_nid(zone)))
+		if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
 			continue;
 		show_node(zone);
 		printk(KERN_CONT "%s: ", zone->name);
-- 
cgit v1.2.3


From da162e9368990ed747075e2ab427da0759fc4a59 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 22 Feb 2017 15:46:31 -0800
Subject: mm: drop zap_details::ignore_dirty

The only user of ignore_dirty is oom-reaper.  But it doesn't really use
it.

ignore_dirty only has effect on file pages mapped with dirty pte.  But
oom-repear skips shared VMAs, so there's no way we can dirty file pte in
them.

Link: http://lkml.kernel.org/r/20170118122429.43661-1-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 1 -
 mm/memory.c        | 6 ------
 mm/oom_kill.c      | 3 +--
 3 files changed, 1 insertion(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8a67cae5a07c..eae478a42f26 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1175,7 +1175,6 @@ struct zap_details {
 	struct address_space *check_mapping;	/* Check page->mapping if set */
 	pgoff_t	first_index;			/* Lowest page->index to unmap */
 	pgoff_t last_index;			/* Highest page->index to unmap */
-	bool ignore_dirty;			/* Ignore dirty pages */
 	bool check_swap_entries;		/* Check also swap entries */
 };
 
diff --git a/mm/memory.c b/mm/memory.c
index d7676a68c80a..88872a93c3ca 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1155,12 +1155,6 @@ again:
 
 			if (!PageAnon(page)) {
 				if (pte_dirty(ptent)) {
-					/*
-					 * oom_reaper cannot tear down dirty
-					 * pages
-					 */
-					if (unlikely(details && details->ignore_dirty))
-						continue;
 					force_flush = 1;
 					set_page_dirty(page);
 				}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c7b48b4282d9..33fcc8a40aeb 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -465,8 +465,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 {
 	struct mmu_gather tlb;
 	struct vm_area_struct *vma;
-	struct zap_details details = {.check_swap_entries = true,
-				      .ignore_dirty = true};
+	struct zap_details details = {.check_swap_entries = true};
 	bool ret = true;
 
 	/*
-- 
cgit v1.2.3


From 3e8715fdc03e8df4d26d8e436166e44e3e416d3b Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 22 Feb 2017 15:46:34 -0800
Subject: mm: drop zap_details::check_swap_entries

detail == NULL would give the same functionality as
.check_swap_entries==true.

Link: http://lkml.kernel.org/r/20170118122429.43661-2-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 1 -
 mm/memory.c        | 4 ++--
 mm/oom_kill.c      | 3 +--
 3 files changed, 3 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index eae478a42f26..062936e8b832 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1175,7 +1175,6 @@ struct zap_details {
 	struct address_space *check_mapping;	/* Check page->mapping if set */
 	pgoff_t	first_index;			/* Lowest page->index to unmap */
 	pgoff_t last_index;			/* Highest page->index to unmap */
-	bool check_swap_entries;		/* Check also swap entries */
 };
 
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
diff --git a/mm/memory.c b/mm/memory.c
index 88872a93c3ca..e9035a0afee2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1173,8 +1173,8 @@ again:
 			}
 			continue;
 		}
-		/* only check swap_entries if explicitly asked for in details */
-		if (unlikely(details && !details->check_swap_entries))
+		/* If details->check_mapping, we leave swap entries. */
+		if (unlikely(details))
 			continue;
 
 		entry = pte_to_swp_entry(ptent);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 33fcc8a40aeb..a1977247c7ea 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -465,7 +465,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 {
 	struct mmu_gather tlb;
 	struct vm_area_struct *vma;
-	struct zap_details details = {.check_swap_entries = true};
 	bool ret = true;
 
 	/*
@@ -531,7 +530,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 		 */
 		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED))
 			unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
-					 &details);
+					 NULL);
 	}
 	tlb_finish_mmu(&tlb, 0, -1);
 	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
-- 
cgit v1.2.3


From ecf1385d72f0491400a8ceca7001196ca369aa8c Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 22 Feb 2017 15:46:37 -0800
Subject: mm: drop unused argument of zap_page_range()

There's no users of zap_page_range() who wants non-NULL 'details'.
Let's drop it.

Link: http://lkml.kernel.org/r/20170118122429.43661-3-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/s390/mm/gmap.c               | 2 +-
 arch/x86/mm/mpx.c                 | 2 +-
 drivers/android/binder.c          | 2 +-
 drivers/staging/android/ion/ion.c | 3 +--
 include/linux/mm.h                | 2 +-
 mm/madvise.c                      | 2 +-
 mm/memory.c                       | 5 ++---
 7 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index ec1f0dedb948..59ac93714fa4 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -687,7 +687,7 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
 		/* Find vma in the parent mm */
 		vma = find_vma(gmap->mm, vmaddr);
 		size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
-		zap_page_range(vma, vmaddr, size, NULL);
+		zap_page_range(vma, vmaddr, size);
 	}
 	up_read(&gmap->mm->mmap_sem);
 }
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index af59f808742f..aad4ac386f98 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -796,7 +796,7 @@ static noinline int zap_bt_entries_mapping(struct mm_struct *mm,
 			return -EINVAL;
 
 		len = min(vma->vm_end, end) - addr;
-		zap_page_range(vma, addr, len, NULL);
+		zap_page_range(vma, addr, len);
 		trace_mpx_unmap_zap(addr, addr+len);
 
 		vma = vma->vm_next;
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 9451b762fa1c..15b263a420e8 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -657,7 +657,7 @@ free_range:
 		page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE];
 		if (vma)
 			zap_page_range(vma, (uintptr_t)page_addr +
-				proc->user_buffer_offset, PAGE_SIZE, NULL);
+				proc->user_buffer_offset, PAGE_SIZE);
 err_vm_insert_page_failed:
 		unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE);
 err_map_kernel_failed:
diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c
index 937c2d5d7ec3..969600779e44 100644
--- a/drivers/staging/android/ion/ion.c
+++ b/drivers/staging/android/ion/ion.c
@@ -865,8 +865,7 @@ static void ion_buffer_sync_for_device(struct ion_buffer *buffer,
 	list_for_each_entry(vma_list, &buffer->vmas, list) {
 		struct vm_area_struct *vma = vma_list->vma;
 
-		zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start,
-			       NULL);
+		zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start);
 	}
 	mutex_unlock(&buffer->lock);
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 062936e8b832..574bc157a27c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1185,7 +1185,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
 int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
 		unsigned long size);
 void zap_page_range(struct vm_area_struct *vma, unsigned long address,
-		unsigned long size, struct zap_details *);
+		unsigned long size);
 void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
 		unsigned long start, unsigned long end);
 
diff --git a/mm/madvise.c b/mm/madvise.c
index ca75b8a01ba0..7f1490f0d3a6 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -478,7 +478,7 @@ static long madvise_dontneed(struct vm_area_struct *vma,
 		return -EINVAL;
 
 	madvise_userfault_dontneed(vma, prev, start, end);
-	zap_page_range(vma, start, end - start, NULL);
+	zap_page_range(vma, start, end - start);
 	return 0;
 }
 
diff --git a/mm/memory.c b/mm/memory.c
index e9035a0afee2..7663068a33c6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1370,12 +1370,11 @@ void unmap_vmas(struct mmu_gather *tlb,
  * @vma: vm_area_struct holding the applicable pages
  * @start: starting address of pages to zap
  * @size: number of bytes to zap
- * @details: details of shared cache invalidation
  *
  * Caller must protect the VMA list
  */
 void zap_page_range(struct vm_area_struct *vma, unsigned long start,
-		unsigned long size, struct zap_details *details)
+		unsigned long size)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct mmu_gather tlb;
@@ -1386,7 +1385,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
 	update_hiwater_rss(mm);
 	mmu_notifier_invalidate_range_start(mm, start, end);
 	for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
-		unmap_single_vma(&tlb, vma, start, end, details);
+		unmap_single_vma(&tlb, vma, start, end, NULL);
 	mmu_notifier_invalidate_range_end(mm, start, end);
 	tlb_finish_mmu(&tlb, start, end);
 }
-- 
cgit v1.2.3


From 083fb8edda0487d192e8c117f625563b920cf7a4 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Wed, 22 Feb 2017 15:46:48 -0800
Subject: mm: fix <linux/pagemap.h> stray kernel-doc notation

Delete stray (second) function description in find_lock_page()
kernel-doc notation.

Note: scripts/kernel-doc just ignores the second function description.

Fixes: 2457aec63745e ("mm: non-atomically mark page accessed during page cache allocation where possible")
Link: http://lkml.kernel.org/r/b037e9a3-516c-ec02-6c8e-fa5479747ba6@infradead.org
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reported-by: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index b572f5530392..84943e8057ef 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -266,7 +266,6 @@ static inline struct page *find_get_page_flags(struct address_space *mapping,
 
 /**
  * find_lock_page - locate, pin and lock a pagecache page
- * pagecache_get_page - find and get a page reference
  * @mapping: the address_space to search
  * @offset: the page index
  *
-- 
cgit v1.2.3


From 745d8ae4622c6808b22e33a944c7decb30074be4 Mon Sep 17 00:00:00 2001
From: Eugenia Emantayev <eugenia@mellanox.com>
Date: Thu, 23 Feb 2017 12:02:42 +0200
Subject: net/mlx4: Spoofcheck and zero MAC can't coexist

Spoofcheck can't be enabled if VF MAC is zero.
Vice versa, can't zero MAC if spoofcheck is on.

Fixes: 8f7ba3ca12f6 ('net/mlx4: Add set VF mac address support')
Signed-off-by: Eugenia Emantayev <eugenia@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/cmd.c       | 22 ++++++++++++++++++++--
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c |  6 +-----
 include/linux/mlx4/cmd.h                       |  2 +-
 include/linux/mlx4/driver.h                    | 10 ++++++++++
 4 files changed, 32 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index a49072b4fa52..e8c105164931 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -43,6 +43,7 @@
 #include <linux/semaphore.h>
 #include <rdma/ib_smi.h>
 #include <linux/delay.h>
+#include <linux/etherdevice.h>
 
 #include <asm/io.h>
 
@@ -2955,7 +2956,7 @@ static bool mlx4_valid_vf_state_change(struct mlx4_dev *dev, int port,
 	return false;
 }
 
-int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u64 mac)
+int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u8 *mac)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_vport_state *s_info;
@@ -2964,13 +2965,22 @@ int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u64 mac)
 	if (!mlx4_is_master(dev))
 		return -EPROTONOSUPPORT;
 
+	if (is_multicast_ether_addr(mac))
+		return -EINVAL;
+
 	slave = mlx4_get_slave_indx(dev, vf);
 	if (slave < 0)
 		return -EINVAL;
 
 	port = mlx4_slaves_closest_port(dev, slave, port);
 	s_info = &priv->mfunc.master.vf_admin[slave].vport[port];
-	s_info->mac = mac;
+
+	if (s_info->spoofchk && is_zero_ether_addr(mac)) {
+		mlx4_info(dev, "MAC invalidation is not allowed when spoofchk is on\n");
+		return -EPERM;
+	}
+
+	s_info->mac = mlx4_mac_to_u64(mac);
 	mlx4_info(dev, "default mac on vf %d port %d to %llX will take effect only after vf restart\n",
 		  vf, port, s_info->mac);
 	return 0;
@@ -3143,6 +3153,7 @@ int mlx4_set_vf_spoofchk(struct mlx4_dev *dev, int port, int vf, bool setting)
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_vport_state *s_info;
 	int slave;
+	u8 mac[ETH_ALEN];
 
 	if ((!mlx4_is_master(dev)) ||
 	    !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FSM))
@@ -3154,6 +3165,13 @@ int mlx4_set_vf_spoofchk(struct mlx4_dev *dev, int port, int vf, bool setting)
 
 	port = mlx4_slaves_closest_port(dev, slave, port);
 	s_info = &priv->mfunc.master.vf_admin[slave].vport[port];
+
+	mlx4_u64_to_mac(mac, s_info->mac);
+	if (setting && !is_valid_ether_addr(mac)) {
+		mlx4_info(dev, "Illegal MAC with spoofchk\n");
+		return -EPERM;
+	}
+
 	s_info->spoofchk = setting;
 
 	return 0;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index afe4444e5434..61420473fe5f 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2485,12 +2485,8 @@ static int mlx4_en_set_vf_mac(struct net_device *dev, int queue, u8 *mac)
 {
 	struct mlx4_en_priv *en_priv = netdev_priv(dev);
 	struct mlx4_en_dev *mdev = en_priv->mdev;
-	u64 mac_u64 = mlx4_mac_to_u64(mac);
 
-	if (is_multicast_ether_addr(mac))
-		return -EINVAL;
-
-	return mlx4_set_vf_mac(mdev->dev, en_priv->port, queue, mac_u64);
+	return mlx4_set_vf_mac(mdev->dev, en_priv->port, queue, mac);
 }
 
 static int mlx4_en_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos,
diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h
index 1f3568694a57..7b74afcbbab2 100644
--- a/include/linux/mlx4/cmd.h
+++ b/include/linux/mlx4/cmd.h
@@ -308,7 +308,7 @@ int mlx4_get_counter_stats(struct mlx4_dev *dev, int counter_index,
 int mlx4_get_vf_stats(struct mlx4_dev *dev, int port, int vf_idx,
 		      struct ifla_vf_stats *vf_stats);
 u32 mlx4_comm_get_version(void);
-int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u64 mac);
+int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u8 *mac);
 int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan,
 		     u8 qos, __be16 proto);
 int mlx4_set_vf_rate(struct mlx4_dev *dev, int port, int vf, int min_tx_rate,
diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h
index bd0e7075ea6d..e965e5090d96 100644
--- a/include/linux/mlx4/driver.h
+++ b/include/linux/mlx4/driver.h
@@ -104,4 +104,14 @@ static inline u64 mlx4_mac_to_u64(u8 *addr)
 	return mac;
 }
 
+static inline void mlx4_u64_to_mac(u8 *addr, u64 mac)
+{
+	int i;
+
+	for (i = ETH_ALEN; i > 0; i--) {
+		addr[i - 1] = mac && 0xFF;
+		mac >>= 8;
+	}
+}
+
 #endif /* MLX4_DRIVER_H */
-- 
cgit v1.2.3


From 7d6d15789d69856f1c5405e106a773c87277eb8c Mon Sep 17 00:00:00 2001
From: Scott Bauer <scott.bauer@intel.com>
Date: Wed, 22 Feb 2017 10:15:06 -0700
Subject: block/sed-opal: Introduce free_opal_dev to free the structure and
 clean up state

Before we free the opal structure we need to clean up any saved
locking ranges that the user had told us to unlock from a suspend.

Signed-off-by: Scott Bauer <scott.bauer@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/sed-opal.c         | 30 ++++++++++++++++++++++++++++++
 include/linux/sed-opal.h |  5 +++++
 2 files changed, 35 insertions(+)

(limited to 'include/linux')

diff --git a/block/sed-opal.c b/block/sed-opal.c
index 893557554cc5..020bf3e28e38 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -1987,6 +1987,28 @@ static int check_opal_support(struct opal_dev *dev)
 	return ret;
 }
 
+static void clean_opal_dev(struct opal_dev *dev)
+{
+
+	struct opal_suspend_data *suspend, *next;
+
+	mutex_lock(&dev->dev_lock);
+	list_for_each_entry_safe(suspend, next, &dev->unlk_lst, node) {
+		list_del(&suspend->node);
+		kfree(suspend);
+	}
+	mutex_unlock(&dev->dev_lock);
+}
+
+void free_opal_dev(struct opal_dev *dev)
+{
+	if (!dev)
+		return;
+	clean_opal_dev(dev);
+	kfree(dev);
+}
+EXPORT_SYMBOL(free_opal_dev);
+
 struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv)
 {
 	struct opal_dev *dev;
@@ -2141,6 +2163,14 @@ static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal)
 	setup_opal_dev(dev, revert_steps);
 	ret = next(dev);
 	mutex_unlock(&dev->dev_lock);
+
+	/*
+	 * If we successfully reverted lets clean
+	 * any saved locking ranges.
+	 */
+	if (!ret)
+		clean_opal_dev(dev);
+
 	return ret;
 }
 
diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h
index deee23d012e7..04b124fca51e 100644
--- a/include/linux/sed-opal.h
+++ b/include/linux/sed-opal.h
@@ -27,6 +27,7 @@ typedef int (sec_send_recv)(void *data, u16 spsp, u8 secp, void *buffer,
 		size_t len, bool send);
 
 #ifdef CONFIG_BLK_SED_OPAL
+void free_opal_dev(struct opal_dev *dev);
 bool opal_unlock_from_suspend(struct opal_dev *dev);
 struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv);
 int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *ioctl_ptr);
@@ -51,6 +52,10 @@ static inline bool is_sed_ioctl(unsigned int cmd)
 	return false;
 }
 #else
+static inline void free_opal_dev(struct opal_dev *dev)
+{
+}
+
 static inline bool is_sed_ioctl(unsigned int cmd)
 {
 	return false;
-- 
cgit v1.2.3


From da55f2cc78418dee88400aafbbaed19d7ac8188e Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Wed, 22 Feb 2017 10:58:29 -0800
Subject: blk-mq: use sbq wait queues instead of restart for driver tags

Commit 50e1dab86aa2 ("blk-mq-sched: fix starvation for multiple hardware
queues and shared tags") fixed one starvation issue for shared tags.
However, we can still get into a situation where we fail to allocate a
tag because all tags are allocated but we don't have any pending
requests on any hardware queue.

One solution for this would be to restart all queues that share a tag
map, but that really sucks. Ideally, we could just block and wait for a
tag, but that isn't always possible from blk_mq_dispatch_rq_list().

However, we can still use the struct sbitmap_queue wait queues with a
custom callback instead of blocking. This has a few benefits:

1. It avoids iterating over all hardware queues when completing an I/O,
   which the current restart code has to do.
2. It benefits from the existing rolling wakeup code.
3. It avoids punting to another thread just to have it block.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c         | 64 +++++++++++++++++++++++++++++++++++++++++++-------
 include/linux/blk-mq.h |  2 ++
 2 files changed, 57 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index b29e7dc7b309..9e6b064e5339 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -904,6 +904,44 @@ static bool reorder_tags_to_front(struct list_head *list)
 	return first != NULL;
 }
 
+static int blk_mq_dispatch_wake(wait_queue_t *wait, unsigned mode, int flags,
+				void *key)
+{
+	struct blk_mq_hw_ctx *hctx;
+
+	hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
+
+	list_del(&wait->task_list);
+	clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
+	blk_mq_run_hw_queue(hctx, true);
+	return 1;
+}
+
+static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
+{
+	struct sbq_wait_state *ws;
+
+	/*
+	 * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait.
+	 * The thread which wins the race to grab this bit adds the hardware
+	 * queue to the wait queue.
+	 */
+	if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) ||
+	    test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state))
+		return false;
+
+	init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
+	ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx);
+
+	/*
+	 * As soon as this returns, it's no longer safe to fiddle with
+	 * hctx->dispatch_wait, since a completion can wake up the wait queue
+	 * and unlock the bit.
+	 */
+	add_wait_queue(&ws->wait, &hctx->dispatch_wait);
+	return true;
+}
+
 bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 {
 	struct request_queue *q = hctx->queue;
@@ -931,15 +969,22 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 				continue;
 
 			/*
-			 * We failed getting a driver tag. Mark the queue(s)
-			 * as needing a restart. Retry getting a tag again,
-			 * in case the needed IO completed right before we
-			 * marked the queue as needing a restart.
+			 * The initial allocation attempt failed, so we need to
+			 * rerun the hardware queue when a tag is freed.
 			 */
-			blk_mq_sched_mark_restart(hctx);
-			if (!blk_mq_get_driver_tag(rq, &hctx, false))
+			if (blk_mq_dispatch_wait_add(hctx)) {
+				/*
+				 * It's possible that a tag was freed in the
+				 * window between the allocation failure and
+				 * adding the hardware queue to the wait queue.
+				 */
+				if (!blk_mq_get_driver_tag(rq, &hctx, false))
+					break;
+			} else {
 				break;
+			}
 		}
+
 		list_del_init(&rq->queuelist);
 
 		bd.rq = rq;
@@ -995,10 +1040,11 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 		 *
 		 * blk_mq_run_hw_queue() already checks the STOPPED bit
 		 *
-		 * If RESTART is set, then let completion restart the queue
-		 * instead of potentially looping here.
+		 * If RESTART or TAG_WAITING is set, then let completion restart
+		 * the queue instead of potentially looping here.
 		 */
-		if (!blk_mq_sched_needs_restart(hctx))
+		if (!blk_mq_sched_needs_restart(hctx) &&
+		    !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
 			blk_mq_run_hw_queue(hctx, true);
 	}
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 8e4df3d6c8cd..001d30d727c5 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -33,6 +33,7 @@ struct blk_mq_hw_ctx {
 	struct blk_mq_ctx	**ctxs;
 	unsigned int		nr_ctx;
 
+	wait_queue_t		dispatch_wait;
 	atomic_t		wait_index;
 
 	struct blk_mq_tags	*tags;
@@ -160,6 +161,7 @@ enum {
 	BLK_MQ_S_STOPPED	= 0,
 	BLK_MQ_S_TAG_ACTIVE	= 1,
 	BLK_MQ_S_SCHED_RESTART	= 2,
+	BLK_MQ_S_TAG_WAITING	= 3,
 
 	BLK_MQ_MAX_DEPTH	= 10240,
 
-- 
cgit v1.2.3


From d08d1b27fe2a7f6923952613f5fab56ae47a6f5b Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 22 Feb 2017 13:58:52 +0530
Subject: PM / QoS: Remove global notifiers

They were never used in the kernel, so get rid of them.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/power/pm_qos_interface.txt | 13 +--------
 drivers/base/power/qos.c                 | 50 +++-----------------------------
 include/linux/pm_qos.h                   |  8 -----
 3 files changed, 5 insertions(+), 66 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt
index 129f7c0e1483..21d2d48f87a2 100644
--- a/Documentation/power/pm_qos_interface.txt
+++ b/Documentation/power/pm_qos_interface.txt
@@ -163,8 +163,7 @@ of flags and remove sysfs attributes pm_qos_no_power_off and pm_qos_remote_wakeu
 under the device's power directory.
 
 Notification mechanisms:
-The per-device PM QoS framework has 2 different and distinct notification trees:
-a per-device notification tree and a global notification tree.
+The per-device PM QoS framework has a per-device notification tree.
 
 int dev_pm_qos_add_notifier(device, notifier):
 Adds a notification callback function for the device.
@@ -174,16 +173,6 @@ is changed (for resume latency device PM QoS only).
 int dev_pm_qos_remove_notifier(device, notifier):
 Removes the notification callback function for the device.
 
-int dev_pm_qos_add_global_notifier(notifier):
-Adds a notification callback function in the global notification tree of the
-framework.
-The callback is called when the aggregated value for any device is changed
-(for resume latency device PM QoS only).
-
-int dev_pm_qos_remove_global_notifier(notifier):
-Removes the notification callback function from the global notification tree
-of the framework.
-
 
 Active state latency tolerance
 
diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c
index d888d9869b6a..271bec73185e 100644
--- a/drivers/base/power/qos.c
+++ b/drivers/base/power/qos.c
@@ -17,12 +17,9 @@
  *
  * This QoS design is best effort based. Dependents register their QoS needs.
  * Watchers register to keep track of the current QoS needs of the system.
- * Watchers can register different types of notification callbacks:
- *  . a per-device notification callback using the dev_pm_qos_*_notifier API.
- *    The notification chain data is stored in the per-device constraint
- *    data struct.
- *  . a system-wide notification callback using the dev_pm_qos_*_global_notifier
- *    API. The notification chain data is stored in a static variable.
+ * Watchers can register a per-device notification callback using the
+ * dev_pm_qos_*_notifier API. The notification chain data is stored in the
+ * per-device constraint data struct.
  *
  * Note about the per-device constraint data struct allocation:
  * . The per-device constraints data struct ptr is tored into the device
@@ -49,8 +46,6 @@
 static DEFINE_MUTEX(dev_pm_qos_mtx);
 static DEFINE_MUTEX(dev_pm_qos_sysfs_mtx);
 
-static BLOCKING_NOTIFIER_HEAD(dev_pm_notifiers);
-
 /**
  * __dev_pm_qos_flags - Check PM QoS flags for a given device.
  * @dev: Device to check the PM QoS flags for.
@@ -135,8 +130,7 @@ s32 dev_pm_qos_read_value(struct device *dev)
  * @value: Value to assign to the QoS request.
  *
  * Internal function to update the constraints list using the PM QoS core
- * code and if needed call the per-device and the global notification
- * callbacks
+ * code and if needed call the per-device callbacks.
  */
 static int apply_constraint(struct dev_pm_qos_request *req,
 			    enum pm_qos_req_action action, s32 value)
@@ -148,12 +142,6 @@ static int apply_constraint(struct dev_pm_qos_request *req,
 	case DEV_PM_QOS_RESUME_LATENCY:
 		ret = pm_qos_update_target(&qos->resume_latency,
 					   &req->data.pnode, action, value);
-		if (ret) {
-			value = pm_qos_read_value(&qos->resume_latency);
-			blocking_notifier_call_chain(&dev_pm_notifiers,
-						     (unsigned long)value,
-						     req);
-		}
 		break;
 	case DEV_PM_QOS_LATENCY_TOLERANCE:
 		ret = pm_qos_update_target(&qos->latency_tolerance,
@@ -535,36 +523,6 @@ int dev_pm_qos_remove_notifier(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(dev_pm_qos_remove_notifier);
 
-/**
- * dev_pm_qos_add_global_notifier - sets notification entry for changes to
- * target value of the PM QoS constraints for any device
- *
- * @notifier: notifier block managed by caller.
- *
- * Will register the notifier into a notification chain that gets called
- * upon changes to the target value for any device.
- */
-int dev_pm_qos_add_global_notifier(struct notifier_block *notifier)
-{
-	return blocking_notifier_chain_register(&dev_pm_notifiers, notifier);
-}
-EXPORT_SYMBOL_GPL(dev_pm_qos_add_global_notifier);
-
-/**
- * dev_pm_qos_remove_global_notifier - deletes notification for changes to
- * target value of PM QoS constraints for any device
- *
- * @notifier: notifier block to be removed.
- *
- * Will remove the notifier from the notification chain that gets called
- * upon changes to the target value for any device.
- */
-int dev_pm_qos_remove_global_notifier(struct notifier_block *notifier)
-{
-	return blocking_notifier_chain_unregister(&dev_pm_notifiers, notifier);
-}
-EXPORT_SYMBOL_GPL(dev_pm_qos_remove_global_notifier);
-
 /**
  * dev_pm_qos_add_ancestor_request - Add PM QoS request for device's ancestor.
  * @dev: Device whose ancestor to add the request for.
diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h
index d4d34791e463..3e2547d6e207 100644
--- a/include/linux/pm_qos.h
+++ b/include/linux/pm_qos.h
@@ -146,8 +146,6 @@ int dev_pm_qos_add_notifier(struct device *dev,
 			    struct notifier_block *notifier);
 int dev_pm_qos_remove_notifier(struct device *dev,
 			       struct notifier_block *notifier);
-int dev_pm_qos_add_global_notifier(struct notifier_block *notifier);
-int dev_pm_qos_remove_global_notifier(struct notifier_block *notifier);
 void dev_pm_qos_constraints_init(struct device *dev);
 void dev_pm_qos_constraints_destroy(struct device *dev);
 int dev_pm_qos_add_ancestor_request(struct device *dev,
@@ -199,12 +197,6 @@ static inline int dev_pm_qos_add_notifier(struct device *dev,
 static inline int dev_pm_qos_remove_notifier(struct device *dev,
 					     struct notifier_block *notifier)
 			{ return 0; }
-static inline int dev_pm_qos_add_global_notifier(
-					struct notifier_block *notifier)
-			{ return 0; }
-static inline int dev_pm_qos_remove_global_notifier(
-					struct notifier_block *notifier)
-			{ return 0; }
 static inline void dev_pm_qos_constraints_init(struct device *dev)
 {
 	dev->power.power_state = PMSG_ON;
-- 
cgit v1.2.3


From 29dee3c03abce04cd527878ef5f9e5f91b7b83f4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 10 Feb 2017 16:27:52 +0100
Subject: locking/refcounts: Out-of-line everything

Linus asked to please make this real C code.

And since size then isn't an issue what so ever anymore, remove the
debug knob and make all WARN()s unconditional.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dwindsor@gmail.com
Cc: elena.reshetova@intel.com
Cc: gregkh@linuxfoundation.org
Cc: ishkamiel@gmail.com
Cc: keescook@chromium.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/refcount.h | 277 ++---------------------------------------------
 lib/Kconfig.debug        |  13 ---
 lib/Makefile             |   2 +-
 lib/refcount.c           | 267 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 280 insertions(+), 279 deletions(-)
 create mode 100644 lib/refcount.c

(limited to 'include/linux')

diff --git a/include/linux/refcount.h b/include/linux/refcount.h
index 600aadf9cca4..0e8cfb2ce91e 100644
--- a/include/linux/refcount.h
+++ b/include/linux/refcount.h
@@ -1,55 +1,10 @@
 #ifndef _LINUX_REFCOUNT_H
 #define _LINUX_REFCOUNT_H
 
-/*
- * Variant of atomic_t specialized for reference counts.
- *
- * The interface matches the atomic_t interface (to aid in porting) but only
- * provides the few functions one should use for reference counting.
- *
- * It differs in that the counter saturates at UINT_MAX and will not move once
- * there. This avoids wrapping the counter and causing 'spurious'
- * use-after-free issues.
- *
- * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
- * and provide only what is strictly required for refcounts.
- *
- * The increments are fully relaxed; these will not provide ordering. The
- * rationale is that whatever is used to obtain the object we're increasing the
- * reference count on will provide the ordering. For locked data structures,
- * its the lock acquire, for RCU/lockless data structures its the dependent
- * load.
- *
- * Do note that inc_not_zero() provides a control dependency which will order
- * future stores against the inc, this ensures we'll never modify the object
- * if we did not in fact acquire a reference.
- *
- * The decrements will provide release order, such that all the prior loads and
- * stores will be issued before, it also provides a control dependency, which
- * will order us against the subsequent free().
- *
- * The control dependency is against the load of the cmpxchg (ll/sc) that
- * succeeded. This means the stores aren't fully ordered, but this is fine
- * because the 1->0 transition indicates no concurrency.
- *
- * Note that the allocator is responsible for ordering things between free()
- * and alloc().
- *
- */
-
 #include <linux/atomic.h>
-#include <linux/bug.h>
 #include <linux/mutex.h>
 #include <linux/spinlock.h>
 
-#ifdef CONFIG_DEBUG_REFCOUNT
-#define REFCOUNT_WARN(cond, str) WARN_ON(cond)
-#define __refcount_check	__must_check
-#else
-#define REFCOUNT_WARN(cond, str) (void)(cond)
-#define __refcount_check
-#endif
-
 typedef struct refcount_struct {
 	atomic_t refs;
 } refcount_t;
@@ -66,229 +21,21 @@ static inline unsigned int refcount_read(const refcount_t *r)
 	return atomic_read(&r->refs);
 }
 
-static inline __refcount_check
-bool refcount_add_not_zero(unsigned int i, refcount_t *r)
-{
-	unsigned int old, new, val = atomic_read(&r->refs);
-
-	for (;;) {
-		if (!val)
-			return false;
-
-		if (unlikely(val == UINT_MAX))
-			return true;
-
-		new = val + i;
-		if (new < val)
-			new = UINT_MAX;
-		old = atomic_cmpxchg_relaxed(&r->refs, val, new);
-		if (old == val)
-			break;
-
-		val = old;
-	}
-
-	REFCOUNT_WARN(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n");
-
-	return true;
-}
-
-static inline void refcount_add(unsigned int i, refcount_t *r)
-{
-	REFCOUNT_WARN(!refcount_add_not_zero(i, r), "refcount_t: addition on 0; use-after-free.\n");
-}
-
-/*
- * Similar to atomic_inc_not_zero(), will saturate at UINT_MAX and WARN.
- *
- * Provides no memory ordering, it is assumed the caller has guaranteed the
- * object memory to be stable (RCU, etc.). It does provide a control dependency
- * and thereby orders future stores. See the comment on top.
- */
-static inline __refcount_check
-bool refcount_inc_not_zero(refcount_t *r)
-{
-	unsigned int old, new, val = atomic_read(&r->refs);
-
-	for (;;) {
-		new = val + 1;
-
-		if (!val)
-			return false;
-
-		if (unlikely(!new))
-			return true;
-
-		old = atomic_cmpxchg_relaxed(&r->refs, val, new);
-		if (old == val)
-			break;
-
-		val = old;
-	}
-
-	REFCOUNT_WARN(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n");
-
-	return true;
-}
-
-/*
- * Similar to atomic_inc(), will saturate at UINT_MAX and WARN.
- *
- * Provides no memory ordering, it is assumed the caller already has a
- * reference on the object, will WARN when this is not so.
- */
-static inline void refcount_inc(refcount_t *r)
-{
-	REFCOUNT_WARN(!refcount_inc_not_zero(r), "refcount_t: increment on 0; use-after-free.\n");
-}
-
-/*
- * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to
- * decrement when saturated at UINT_MAX.
- *
- * Provides release memory ordering, such that prior loads and stores are done
- * before, and provides a control dependency such that free() must come after.
- * See the comment on top.
- */
-static inline __refcount_check
-bool refcount_sub_and_test(unsigned int i, refcount_t *r)
-{
-	unsigned int old, new, val = atomic_read(&r->refs);
-
-	for (;;) {
-		if (unlikely(val == UINT_MAX))
-			return false;
-
-		new = val - i;
-		if (new > val) {
-			REFCOUNT_WARN(new > val, "refcount_t: underflow; use-after-free.\n");
-			return false;
-		}
-
-		old = atomic_cmpxchg_release(&r->refs, val, new);
-		if (old == val)
-			break;
-
-		val = old;
-	}
-
-	return !new;
-}
-
-static inline __refcount_check
-bool refcount_dec_and_test(refcount_t *r)
-{
-	return refcount_sub_and_test(1, r);
-}
+extern __must_check bool refcount_add_not_zero(unsigned int i, refcount_t *r);
+extern void refcount_add(unsigned int i, refcount_t *r);
 
-/*
- * Similar to atomic_dec(), it will WARN on underflow and fail to decrement
- * when saturated at UINT_MAX.
- *
- * Provides release memory ordering, such that prior loads and stores are done
- * before.
- */
-static inline
-void refcount_dec(refcount_t *r)
-{
-	REFCOUNT_WARN(refcount_dec_and_test(r), "refcount_t: decrement hit 0; leaking memory.\n");
-}
-
-/*
- * No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the
- * success thereof.
- *
- * Like all decrement operations, it provides release memory order and provides
- * a control dependency.
- *
- * It can be used like a try-delete operator; this explicit case is provided
- * and not cmpxchg in generic, because that would allow implementing unsafe
- * operations.
- */
-static inline __refcount_check
-bool refcount_dec_if_one(refcount_t *r)
-{
-	return atomic_cmpxchg_release(&r->refs, 1, 0) == 1;
-}
-
-/*
- * No atomic_t counterpart, it decrements unless the value is 1, in which case
- * it will return false.
- *
- * Was often done like: atomic_add_unless(&var, -1, 1)
- */
-static inline __refcount_check
-bool refcount_dec_not_one(refcount_t *r)
-{
-	unsigned int old, new, val = atomic_read(&r->refs);
+extern __must_check bool refcount_inc_not_zero(refcount_t *r);
+extern void refcount_inc(refcount_t *r);
 
-	for (;;) {
-		if (unlikely(val == UINT_MAX))
-			return true;
+extern __must_check bool refcount_sub_and_test(unsigned int i, refcount_t *r);
+extern void refcount_sub(unsigned int i, refcount_t *r);
 
-		if (val == 1)
-			return false;
+extern __must_check bool refcount_dec_and_test(refcount_t *r);
+extern void refcount_dec(refcount_t *r);
 
-		new = val - 1;
-		if (new > val) {
-			REFCOUNT_WARN(new > val, "refcount_t: underflow; use-after-free.\n");
-			return true;
-		}
-
-		old = atomic_cmpxchg_release(&r->refs, val, new);
-		if (old == val)
-			break;
-
-		val = old;
-	}
-
-	return true;
-}
-
-/*
- * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail
- * to decrement when saturated at UINT_MAX.
- *
- * Provides release memory ordering, such that prior loads and stores are done
- * before, and provides a control dependency such that free() must come after.
- * See the comment on top.
- */
-static inline __refcount_check
-bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock)
-{
-	if (refcount_dec_not_one(r))
-		return false;
-
-	mutex_lock(lock);
-	if (!refcount_dec_and_test(r)) {
-		mutex_unlock(lock);
-		return false;
-	}
-
-	return true;
-}
-
-/*
- * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to
- * decrement when saturated at UINT_MAX.
- *
- * Provides release memory ordering, such that prior loads and stores are done
- * before, and provides a control dependency such that free() must come after.
- * See the comment on top.
- */
-static inline __refcount_check
-bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock)
-{
-	if (refcount_dec_not_one(r))
-		return false;
-
-	spin_lock(lock);
-	if (!refcount_dec_and_test(r)) {
-		spin_unlock(lock);
-		return false;
-	}
-
-	return true;
-}
+extern __must_check bool refcount_dec_if_one(refcount_t *r);
+extern __must_check bool refcount_dec_not_one(refcount_t *r);
+extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock);
+extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock);
 
 #endif /* _LINUX_REFCOUNT_H */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index acedbe626d47..0dbce99d8433 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -716,19 +716,6 @@ source "lib/Kconfig.kmemcheck"
 
 source "lib/Kconfig.kasan"
 
-config DEBUG_REFCOUNT
-	bool "Verbose refcount checks"
-	help
-	  Say Y here if you want reference counters (refcount_t and kref) to
-	  generate WARNs on dubious usage. Without this refcount_t will still
-	  be a saturating counter and avoid Use-After-Free by turning it into
-	  a resource leak Denial-Of-Service.
-
-	  Use of this option will increase kernel text size but will alert the
-	  admin of potential abuse.
-
-	  If in doubt, say "N".
-
 endmenu # "Memory Debugging"
 
 config ARCH_HAS_KCOV
diff --git a/lib/Makefile b/lib/Makefile
index 19ea76149a37..192e4d03caf9 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -36,7 +36,7 @@ obj-y += bcd.o div64.o sort.o parser.o debug_locks.o random32.o \
 	 gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
 	 bsearch.o find_bit.o llist.o memweight.o kfifo.o \
 	 percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o \
-	 once.o
+	 once.o refcount.o
 obj-y += string_helpers.o
 obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
 obj-y += hexdump.o
diff --git a/lib/refcount.c b/lib/refcount.c
new file mode 100644
index 000000000000..1d33366189d1
--- /dev/null
+++ b/lib/refcount.c
@@ -0,0 +1,267 @@
+/*
+ * Variant of atomic_t specialized for reference counts.
+ *
+ * The interface matches the atomic_t interface (to aid in porting) but only
+ * provides the few functions one should use for reference counting.
+ *
+ * It differs in that the counter saturates at UINT_MAX and will not move once
+ * there. This avoids wrapping the counter and causing 'spurious'
+ * use-after-free issues.
+ *
+ * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
+ * and provide only what is strictly required for refcounts.
+ *
+ * The increments are fully relaxed; these will not provide ordering. The
+ * rationale is that whatever is used to obtain the object we're increasing the
+ * reference count on will provide the ordering. For locked data structures,
+ * its the lock acquire, for RCU/lockless data structures its the dependent
+ * load.
+ *
+ * Do note that inc_not_zero() provides a control dependency which will order
+ * future stores against the inc, this ensures we'll never modify the object
+ * if we did not in fact acquire a reference.
+ *
+ * The decrements will provide release order, such that all the prior loads and
+ * stores will be issued before, it also provides a control dependency, which
+ * will order us against the subsequent free().
+ *
+ * The control dependency is against the load of the cmpxchg (ll/sc) that
+ * succeeded. This means the stores aren't fully ordered, but this is fine
+ * because the 1->0 transition indicates no concurrency.
+ *
+ * Note that the allocator is responsible for ordering things between free()
+ * and alloc().
+ *
+ */
+
+#include <linux/refcount.h>
+#include <linux/bug.h>
+
+bool refcount_add_not_zero(unsigned int i, refcount_t *r)
+{
+	unsigned int old, new, val = atomic_read(&r->refs);
+
+	for (;;) {
+		if (!val)
+			return false;
+
+		if (unlikely(val == UINT_MAX))
+			return true;
+
+		new = val + i;
+		if (new < val)
+			new = UINT_MAX;
+		old = atomic_cmpxchg_relaxed(&r->refs, val, new);
+		if (old == val)
+			break;
+
+		val = old;
+	}
+
+	WARN(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n");
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(refcount_add_not_zero);
+
+void refcount_add(unsigned int i, refcount_t *r)
+{
+	WARN(!refcount_add_not_zero(i, r), "refcount_t: addition on 0; use-after-free.\n");
+}
+EXPORT_SYMBOL_GPL(refcount_add);
+
+/*
+ * Similar to atomic_inc_not_zero(), will saturate at UINT_MAX and WARN.
+ *
+ * Provides no memory ordering, it is assumed the caller has guaranteed the
+ * object memory to be stable (RCU, etc.). It does provide a control dependency
+ * and thereby orders future stores. See the comment on top.
+ */
+bool refcount_inc_not_zero(refcount_t *r)
+{
+	unsigned int old, new, val = atomic_read(&r->refs);
+
+	for (;;) {
+		new = val + 1;
+
+		if (!val)
+			return false;
+
+		if (unlikely(!new))
+			return true;
+
+		old = atomic_cmpxchg_relaxed(&r->refs, val, new);
+		if (old == val)
+			break;
+
+		val = old;
+	}
+
+	WARN(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n");
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(refcount_inc_not_zero);
+
+/*
+ * Similar to atomic_inc(), will saturate at UINT_MAX and WARN.
+ *
+ * Provides no memory ordering, it is assumed the caller already has a
+ * reference on the object, will WARN when this is not so.
+ */
+void refcount_inc(refcount_t *r)
+{
+	WARN(!refcount_inc_not_zero(r), "refcount_t: increment on 0; use-after-free.\n");
+}
+EXPORT_SYMBOL_GPL(refcount_inc);
+
+bool refcount_sub_and_test(unsigned int i, refcount_t *r)
+{
+	unsigned int old, new, val = atomic_read(&r->refs);
+
+	for (;;) {
+		if (unlikely(val == UINT_MAX))
+			return false;
+
+		new = val - i;
+		if (new > val) {
+			WARN(new > val, "refcount_t: underflow; use-after-free.\n");
+			return false;
+		}
+
+		old = atomic_cmpxchg_release(&r->refs, val, new);
+		if (old == val)
+			break;
+
+		val = old;
+	}
+
+	return !new;
+}
+EXPORT_SYMBOL_GPL(refcount_sub_and_test);
+
+/*
+ * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to
+ * decrement when saturated at UINT_MAX.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides a control dependency such that free() must come after.
+ * See the comment on top.
+ */
+bool refcount_dec_and_test(refcount_t *r)
+{
+	return refcount_sub_and_test(1, r);
+}
+EXPORT_SYMBOL_GPL(refcount_dec_and_test);
+
+/*
+ * Similar to atomic_dec(), it will WARN on underflow and fail to decrement
+ * when saturated at UINT_MAX.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before.
+ */
+
+void refcount_dec(refcount_t *r)
+{
+	WARN(refcount_dec_and_test(r), "refcount_t: decrement hit 0; leaking memory.\n");
+}
+EXPORT_SYMBOL_GPL(refcount_dec);
+
+/*
+ * No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the
+ * success thereof.
+ *
+ * Like all decrement operations, it provides release memory order and provides
+ * a control dependency.
+ *
+ * It can be used like a try-delete operator; this explicit case is provided
+ * and not cmpxchg in generic, because that would allow implementing unsafe
+ * operations.
+ */
+bool refcount_dec_if_one(refcount_t *r)
+{
+	return atomic_cmpxchg_release(&r->refs, 1, 0) == 1;
+}
+EXPORT_SYMBOL_GPL(refcount_dec_if_one);
+
+/*
+ * No atomic_t counterpart, it decrements unless the value is 1, in which case
+ * it will return false.
+ *
+ * Was often done like: atomic_add_unless(&var, -1, 1)
+ */
+bool refcount_dec_not_one(refcount_t *r)
+{
+	unsigned int old, new, val = atomic_read(&r->refs);
+
+	for (;;) {
+		if (unlikely(val == UINT_MAX))
+			return true;
+
+		if (val == 1)
+			return false;
+
+		new = val - 1;
+		if (new > val) {
+			WARN(new > val, "refcount_t: underflow; use-after-free.\n");
+			return true;
+		}
+
+		old = atomic_cmpxchg_release(&r->refs, val, new);
+		if (old == val)
+			break;
+
+		val = old;
+	}
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(refcount_dec_not_one);
+
+/*
+ * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail
+ * to decrement when saturated at UINT_MAX.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides a control dependency such that free() must come after.
+ * See the comment on top.
+ */
+bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock)
+{
+	if (refcount_dec_not_one(r))
+		return false;
+
+	mutex_lock(lock);
+	if (!refcount_dec_and_test(r)) {
+		mutex_unlock(lock);
+		return false;
+	}
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(refcount_dec_and_mutex_lock);
+
+/*
+ * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to
+ * decrement when saturated at UINT_MAX.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides a control dependency such that free() must come after.
+ * See the comment on top.
+ */
+bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock)
+{
+	if (refcount_dec_not_one(r))
+		return false;
+
+	spin_lock(lock);
+	if (!refcount_dec_and_test(r)) {
+		spin_unlock(lock);
+		return false;
+	}
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(refcount_dec_and_lock);
+
-- 
cgit v1.2.3


From 318b1dedcd39012624f466d281627553e9fa2570 Mon Sep 17 00:00:00 2001
From: Elena Reshetova <elena.reshetova@intel.com>
Date: Thu, 23 Feb 2017 15:09:34 +0200
Subject: locking/refcounts: Add missing kernel.h header to have UINT_MAX
 defined

Fix header dependency.

Suggested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1487855374-21993-1-git-send-email-elena.reshetova@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/refcount.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/refcount.h b/include/linux/refcount.h
index 0e8cfb2ce91e..0023fee4bbbc 100644
--- a/include/linux/refcount.h
+++ b/include/linux/refcount.h
@@ -4,6 +4,7 @@
 #include <linux/atomic.h>
 #include <linux/mutex.h>
 #include <linux/spinlock.h>
+#include <linux/kernel.h>
 
 typedef struct refcount_struct {
 	atomic_t refs;
-- 
cgit v1.2.3


From d1091c7fa3d52ebce4dd3f15d04155b3469b2f90 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Tue, 21 Feb 2017 15:35:32 -0600
Subject: objtool: Improve detection of BUG() and other dead ends

The BUG() macro's use of __builtin_unreachable() via the unreachable()
macro tells gcc that the instruction is a dead end, and that it's safe
to assume the current code path will not execute past the previous
instruction.

On x86, the BUG() macro is implemented with the 'ud2' instruction.  When
objtool's branch analysis sees that instruction, it knows the current
code path has come to a dead end.

Peter Zijlstra has been working on a patch to change the WARN macros to
use 'ud2'.  That patch will break objtool's assumption that 'ud2' is
always a dead end.

Generally it's best for objtool to avoid making those kinds of
assumptions anyway.  The more ignorant it is of kernel code internals,
the better.

So create a more generic way for objtool to detect dead ends by adding
an annotation to the unreachable() macro.  The annotation stores a
pointer to the end of the unreachable code path in an '__unreachable'
section.  Objtool can read that section to find the dead ends.

Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/41a6d33971462ebd944a1c60ad4bf5be86c17b77.1487712920.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/vmlinux.lds.S   |  1 +
 include/linux/compiler-gcc.h    | 13 ++++++++-
 tools/objtool/arch.h            |  5 ++--
 tools/objtool/arch/x86/decode.c |  3 ---
 tools/objtool/builtin-check.c   | 60 ++++++++++++++++++++++++++++++++++++++---
 5 files changed, 71 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index e79f15f108a8..ad0118fbce90 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -346,6 +346,7 @@ SECTIONS
 	/DISCARD/ : {
 		*(.eh_frame)
 		*(__func_stack_frame_non_standard)
+		*(__unreachable)
 	}
 }
 
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 0444b1336268..8ea159fc489d 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -195,6 +195,17 @@
 #endif
 #endif
 
+#ifdef CONFIG_STACK_VALIDATION
+#define annotate_unreachable() ({					\
+	asm("1:\t\n"							\
+	    ".pushsection __unreachable, \"a\"\t\n"			\
+	    ".long 1b\t\n"						\
+	    ".popsection\t\n");						\
+})
+#else
+#define annotate_unreachable()
+#endif
+
 /*
  * Mark a position in code as unreachable.  This can be used to
  * suppress control flow warnings after asm blocks that transfer
@@ -204,7 +215,7 @@
  * this in the preprocessor, but we can live with this because they're
  * unreleased.  Really, we need to have autoconf for the kernel.
  */
-#define unreachable() __builtin_unreachable()
+#define unreachable() annotate_unreachable(); __builtin_unreachable()
 
 /* Mark a function definition as prohibited from being cloned. */
 #define __noclone	__attribute__((__noclone__, __optimize__("no-tracer")))
diff --git a/tools/objtool/arch.h b/tools/objtool/arch.h
index f7350fcedc70..a59e061c0b4a 100644
--- a/tools/objtool/arch.h
+++ b/tools/objtool/arch.h
@@ -31,9 +31,8 @@
 #define INSN_CALL_DYNAMIC	8
 #define INSN_RETURN		9
 #define INSN_CONTEXT_SWITCH	10
-#define INSN_BUG		11
-#define INSN_NOP		12
-#define INSN_OTHER		13
+#define INSN_NOP		11
+#define INSN_OTHER		12
 #define INSN_LAST		INSN_OTHER
 
 int arch_decode_instruction(struct elf *elf, struct section *sec,
diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
index 039636ffb6c8..6ac99e3266eb 100644
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -118,9 +118,6 @@ int arch_decode_instruction(struct elf *elf, struct section *sec,
 			 op2 == 0x35)
 			/* sysenter, sysret */
 			*type = INSN_CONTEXT_SWITCH;
-		else if (op2 == 0x0b || op2 == 0xb9)
-			/* ud2 */
-			*type = INSN_BUG;
 		else if (op2 == 0x0d || op2 == 0x1f)
 			/* nopl/nopw */
 			*type = INSN_NOP;
diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c
index e8a1f699058a..5fc52ee3264c 100644
--- a/tools/objtool/builtin-check.c
+++ b/tools/objtool/builtin-check.c
@@ -51,7 +51,7 @@ struct instruction {
 	unsigned int len, state;
 	unsigned char type;
 	unsigned long immediate;
-	bool alt_group, visited;
+	bool alt_group, visited, dead_end;
 	struct symbol *call_dest;
 	struct instruction *jump_dest;
 	struct list_head alts;
@@ -329,6 +329,54 @@ static int decode_instructions(struct objtool_file *file)
 	return 0;
 }
 
+/*
+ * Find all uses of the unreachable() macro, which are code path dead ends.
+ */
+static int add_dead_ends(struct objtool_file *file)
+{
+	struct section *sec;
+	struct rela *rela;
+	struct instruction *insn;
+	bool found;
+
+	sec = find_section_by_name(file->elf, ".rela__unreachable");
+	if (!sec)
+		return 0;
+
+	list_for_each_entry(rela, &sec->rela_list, list) {
+		if (rela->sym->type != STT_SECTION) {
+			WARN("unexpected relocation symbol type in .rela__unreachable");
+			return -1;
+		}
+		insn = find_insn(file, rela->sym->sec, rela->addend);
+		if (insn)
+			insn = list_prev_entry(insn, list);
+		else if (rela->addend == rela->sym->sec->len) {
+			found = false;
+			list_for_each_entry_reverse(insn, &file->insn_list, list) {
+				if (insn->sec == rela->sym->sec) {
+					found = true;
+					break;
+				}
+			}
+
+			if (!found) {
+				WARN("can't find unreachable insn at %s+0x%x",
+				     rela->sym->sec->name, rela->addend);
+				return -1;
+			}
+		} else {
+			WARN("can't find unreachable insn at %s+0x%x",
+			     rela->sym->sec->name, rela->addend);
+			return -1;
+		}
+
+		insn->dead_end = true;
+	}
+
+	return 0;
+}
+
 /*
  * Warnings shouldn't be reported for ignored functions.
  */
@@ -843,6 +891,10 @@ static int decode_sections(struct objtool_file *file)
 	if (ret)
 		return ret;
 
+	ret = add_dead_ends(file);
+	if (ret)
+		return ret;
+
 	add_ignores(file);
 
 	ret = add_jump_destinations(file);
@@ -1037,13 +1089,13 @@ static int validate_branch(struct objtool_file *file,
 
 			return 0;
 
-		case INSN_BUG:
-			return 0;
-
 		default:
 			break;
 		}
 
+		if (insn->dead_end)
+			return 0;
+
 		insn = next_insn_same_sec(file, insn);
 		if (!insn) {
 			WARN("%s: unexpected end of section", sec->name);
-- 
cgit v1.2.3


From b18b9550e4059ceea0393c518eb323b95243f92f Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Sat, 11 Feb 2017 18:46:08 +0100
Subject: libceph: get rid of ack vs commit

- CEPH_OSD_FLAG_ACK shouldn't be set anymore, so assert on it
- remove support for handling ack replies (OSDs will send ack replies
  only if clients request them)
- drop the "do lingering callbacks under osd->lock" logic from
  handle_reply() -- lreq->lock is sufficient in all three cases

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Sage Weil <sage@redhat.com>
---
 include/linux/ceph/osd_client.h |   6 +--
 net/ceph/osd_client.c           | 113 +++++++++-------------------------------
 2 files changed, 27 insertions(+), 92 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 03a6653d329a..2ea0c282f3dc 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -22,7 +22,6 @@ struct ceph_osd_client;
  * completion callback for async writepages
  */
 typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
-typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool);
 
 #define CEPH_HOMELESS_OSD	-1
 
@@ -170,15 +169,12 @@ struct ceph_osd_request {
 	unsigned int		r_num_ops;
 
 	int               r_result;
-	bool              r_got_reply;
 
 	struct ceph_osd_client *r_osdc;
 	struct kref       r_kref;
 	bool              r_mempool;
-	struct completion r_completion;
-	struct completion r_done_completion;  /* fsync waiter */
+	struct completion r_completion;       /* private to osd_client.c */
 	ceph_osdc_callback_t r_callback;
-	ceph_osdc_unsafe_callback_t r_unsafe_callback;
 	struct list_head  r_unsafe_item;
 
 	struct inode *r_inode;         	      /* for use by callbacks */
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index ac4753421d0c..e1c6c2b4a295 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -460,7 +460,6 @@ static void request_init(struct ceph_osd_request *req)
 
 	kref_init(&req->r_kref);
 	init_completion(&req->r_completion);
-	init_completion(&req->r_done_completion);
 	RB_CLEAR_NODE(&req->r_node);
 	RB_CLEAR_NODE(&req->r_mc_node);
 	INIT_LIST_HEAD(&req->r_unsafe_item);
@@ -1637,7 +1636,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
 	bool need_send = false;
 	bool promoted = false;
 
-	WARN_ON(req->r_tid || req->r_got_reply);
+	WARN_ON(req->r_tid);
 	dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
 
 again:
@@ -1705,17 +1704,10 @@ promote:
 
 static void account_request(struct ceph_osd_request *req)
 {
-	unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+	WARN_ON(req->r_flags & CEPH_OSD_FLAG_ACK);
+	WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE)));
 
-	if (req->r_flags & CEPH_OSD_FLAG_READ) {
-		WARN_ON(req->r_flags & mask);
-		req->r_flags |= CEPH_OSD_FLAG_ACK;
-	} else if (req->r_flags & CEPH_OSD_FLAG_WRITE)
-		WARN_ON(!(req->r_flags & mask));
-	else
-		WARN_ON(1);
-
-	WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask);
+	req->r_flags |= CEPH_OSD_FLAG_ONDISK;
 	atomic_inc(&req->r_osdc->num_requests);
 }
 
@@ -1750,15 +1742,15 @@ static void finish_request(struct ceph_osd_request *req)
 
 static void __complete_request(struct ceph_osd_request *req)
 {
-	if (req->r_callback)
+	if (req->r_callback) {
+		dout("%s req %p tid %llu cb %pf result %d\n", __func__, req,
+		     req->r_tid, req->r_callback, req->r_result);
 		req->r_callback(req);
-	else
-		complete_all(&req->r_completion);
+	}
 }
 
 /*
- * Note that this is open-coded in handle_reply(), which has to deal
- * with ack vs commit, dup acks, etc.
+ * This is open-coded in handle_reply().
  */
 static void complete_request(struct ceph_osd_request *req, int err)
 {
@@ -1767,7 +1759,7 @@ static void complete_request(struct ceph_osd_request *req, int err)
 	req->r_result = err;
 	finish_request(req);
 	__complete_request(req);
-	complete_all(&req->r_done_completion);
+	complete_all(&req->r_completion);
 	ceph_osdc_put_request(req);
 }
 
@@ -1793,7 +1785,7 @@ static void cancel_request(struct ceph_osd_request *req)
 
 	cancel_map_check(req);
 	finish_request(req);
-	complete_all(&req->r_done_completion);
+	complete_all(&req->r_completion);
 	ceph_osdc_put_request(req);
 }
 
@@ -2170,7 +2162,6 @@ static void linger_commit_cb(struct ceph_osd_request *req)
 	mutex_lock(&lreq->lock);
 	dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
 	     lreq->linger_id, req->r_result);
-	WARN_ON(!__linger_registered(lreq));
 	linger_reg_commit_complete(lreq, req->r_result);
 	lreq->committed = true;
 
@@ -2786,31 +2777,8 @@ e_inval:
 }
 
 /*
- * We are done with @req if
- *   - @m is a safe reply, or
- *   - @m is an unsafe reply and we didn't want a safe one
- */
-static bool done_request(const struct ceph_osd_request *req,
-			 const struct MOSDOpReply *m)
-{
-	return (m->result < 0 ||
-		(m->flags & CEPH_OSD_FLAG_ONDISK) ||
-		!(req->r_flags & CEPH_OSD_FLAG_ONDISK));
-}
-
-/*
- * handle osd op reply.  either call the callback if it is specified,
- * or do the completion to wake up the waiting thread.
- *
- * ->r_unsafe_callback is set?	yes			no
- *
- * first reply is OK (needed	r_cb/r_completion,	r_cb/r_completion,
- * any or needed/got safe)	r_done_completion	r_done_completion
- *
- * first reply is unsafe	r_unsafe_cb(true)	(nothing)
- *
- * when we get the safe reply	r_unsafe_cb(false),	r_cb/r_completion,
- *				r_done_completion	r_done_completion
+ * Handle MOSDOpReply.  Set ->r_result and call the callback if it is
+ * specified.
  */
 static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
 {
@@ -2819,7 +2787,6 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
 	struct MOSDOpReply m;
 	u64 tid = le64_to_cpu(msg->hdr.tid);
 	u32 data_len = 0;
-	bool already_acked;
 	int ret;
 	int i;
 
@@ -2898,50 +2865,22 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
 		       le32_to_cpu(msg->hdr.data_len), req->r_tid);
 		goto fail_request;
 	}
-	dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__,
-	     req, req->r_tid, req->r_got_reply, m.result, data_len);
-
-	already_acked = req->r_got_reply;
-	if (!already_acked) {
-		req->r_result = m.result ?: data_len;
-		req->r_replay_version = m.replay_version; /* struct */
-		req->r_got_reply = true;
-	} else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) {
-		dout("req %p tid %llu dup ack\n", req, req->r_tid);
-		goto out_unlock_session;
-	}
-
-	if (done_request(req, &m)) {
-		finish_request(req);
-		if (req->r_linger) {
-			WARN_ON(req->r_unsafe_callback);
-			dout("req %p tid %llu cb (locked)\n", req, req->r_tid);
-			__complete_request(req);
-		}
-	}
+	dout("%s req %p tid %llu result %d data_len %u\n", __func__,
+	     req, req->r_tid, m.result, data_len);
 
+	/*
+	 * Since we only ever request ONDISK, we should only ever get
+	 * one (type of) reply back.
+	 */
+	WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK));
+	req->r_result = m.result ?: data_len;
+	finish_request(req);
 	mutex_unlock(&osd->lock);
 	up_read(&osdc->lock);
 
-	if (done_request(req, &m)) {
-		if (already_acked && req->r_unsafe_callback) {
-			dout("req %p tid %llu safe-cb\n", req, req->r_tid);
-			req->r_unsafe_callback(req, false);
-		} else if (!req->r_linger) {
-			dout("req %p tid %llu cb\n", req, req->r_tid);
-			__complete_request(req);
-		}
-		complete_all(&req->r_done_completion);
-		ceph_osdc_put_request(req);
-	} else {
-		if (req->r_unsafe_callback) {
-			dout("req %p tid %llu unsafe-cb\n", req, req->r_tid);
-			req->r_unsafe_callback(req, true);
-		} else {
-			WARN_ON(1);
-		}
-	}
-
+	__complete_request(req);
+	complete_all(&req->r_completion);
+	ceph_osdc_put_request(req);
 	return;
 
 fail_request:
@@ -3541,7 +3480,7 @@ again:
 			up_read(&osdc->lock);
 			dout("%s waiting on req %p tid %llu last_tid %llu\n",
 			     __func__, req, req->r_tid, last_tid);
-			wait_for_completion(&req->r_done_completion);
+			wait_for_completion(&req->r_completion);
 			ceph_osdc_put_request(req);
 			goto again;
 		}
-- 
cgit v1.2.3


From 05a45a2db42543c5f1a32e08f545aebbd7cb4790 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 24 Feb 2017 13:25:22 -0500
Subject: sunrpc: turn bitfield flags in svc_version into bools

It's just simpler to read this way, IMO. Also, no need to explicitly
set vs_hidden to false in the nfsacl ones.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfs/callback_xdr.c      | 4 ++--
 fs/nfsd/nfs2acl.c          | 1 -
 fs/nfsd/nfs3acl.c          | 1 -
 fs/nfsd/nfs4proc.c         | 2 +-
 include/linux/sunrpc/svc.h | 9 +++++----
 net/sunrpc/svc.c           | 2 +-
 6 files changed, 9 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index eb094c6011d8..e9836f611d9c 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -1083,7 +1083,7 @@ struct svc_version nfs4_callback_version1 = {
 	.vs_proc = nfs4_callback_procedures1,
 	.vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
 	.vs_dispatch = NULL,
-	.vs_hidden = 1,
+	.vs_hidden = true,
 };
 
 struct svc_version nfs4_callback_version4 = {
@@ -1092,5 +1092,5 @@ struct svc_version nfs4_callback_version4 = {
 	.vs_proc = nfs4_callback_procedures1,
 	.vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
 	.vs_dispatch = NULL,
-	.vs_hidden = 1,
+	.vs_hidden = true,
 };
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index d08cd88155c7..838f90f3f890 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -376,5 +376,4 @@ struct svc_version	nfsd_acl_version2 = {
 		.vs_proc	= nfsd_acl_procedures2,
 		.vs_dispatch	= nfsd_dispatch,
 		.vs_xdrsize	= NFS3_SVC_XDRSIZE,
-		.vs_hidden	= 0,
 };
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 0c890347cde3..dcb5f79076c0 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -266,6 +266,5 @@ struct svc_version	nfsd_acl_version3 = {
 		.vs_proc	= nfsd_acl_procedures3,
 		.vs_dispatch	= nfsd_dispatch,
 		.vs_xdrsize	= NFS3_SVC_XDRSIZE,
-		.vs_hidden	= 0,
 };
 
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 02750a4c8770..89e582fa58cd 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -2542,7 +2542,7 @@ struct svc_version	nfsd_version4 = {
 		.vs_proc	= nfsd_procedures4,
 		.vs_dispatch	= nfsd_dispatch,
 		.vs_xdrsize	= NFS4_SVC_XDRSIZE,
-		.vs_rpcb_optnl	= 1,
+		.vs_rpcb_optnl	= true,
 };
 
 /*
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 7321ae933867..96467c95f02e 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -400,10 +400,11 @@ struct svc_version {
 	struct svc_procedure *	vs_proc;	/* per-procedure info */
 	u32			vs_xdrsize;	/* xdrsize needed for this version */
 
-	unsigned int		vs_hidden : 1,	/* Don't register with portmapper.
-						 * Only used for nfsacl so far. */
-				vs_rpcb_optnl:1;/* Don't care the result of register.
-						 * Only used for nfsv4. */
+	/* Don't register with rpcbind */
+	bool			vs_hidden;
+
+	/* Don't care if the rpcbind registration fails */
+	bool			vs_rpcb_optnl;
 
 	/* Override dispatch function (e.g. when caching replies).
 	 * A return value of 0 means drop the request. 
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 75f290bddca1..85bcdea67a3f 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -385,7 +385,7 @@ static int svc_uses_rpcbind(struct svc_serv *serv)
 		for (i = 0; i < progp->pg_nvers; i++) {
 			if (progp->pg_vers[i] == NULL)
 				continue;
-			if (progp->pg_vers[i]->vs_hidden == 0)
+			if (!progp->pg_vers[i]->vs_hidden)
 				return 1;
 		}
 	}
-- 
cgit v1.2.3


From 362142b25843fb059941aaa01b91501d5d8652cc Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 24 Feb 2017 13:25:23 -0500
Subject: sunrpc: flag transports as having congestion control

NFSv4 requires a transport protocol with congestion control in most
cases.

On an IP network, that means that NFSv4 over UDP should be forbidden.

The situation with RDMA is a bit more nuanced, but most RDMA transports
are suitable for this. For now, we assume that all RDMA transports are
suitable, but we may need to revise that at some point.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc_xprt.h          | 1 +
 net/sunrpc/svcsock.c                     | 1 +
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 8 ++++++++
 3 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index 7440290f64ac..ddb7f94a9d06 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -67,6 +67,7 @@ struct svc_xprt {
 #define XPT_CACHE_AUTH	11		/* cache auth info */
 #define XPT_LOCAL	12		/* connection from loopback interface */
 #define XPT_KILL_TEMP   13		/* call xpo_kill_temp_xprt before closing */
+#define XPT_CONG_CTRL	14		/* has congestion control */
 
 	struct svc_serv		*xpt_server;	/* service for transport */
 	atomic_t    	    	xpt_reserved;	/* space on outq that is rsvd */
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index de066acdb34e..1956b8b96b2d 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1306,6 +1306,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
 	svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_tcp_class,
 		      &svsk->sk_xprt, serv);
 	set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
+	set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags);
 	if (sk->sk_state == TCP_LISTEN) {
 		dprintk("setting up TCP socket for listening\n");
 		set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index ab2fd5377f94..9d9e4e16080f 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -569,6 +569,14 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
 	spin_lock_init(&cma_xprt->sc_ctxt_lock);
 	spin_lock_init(&cma_xprt->sc_map_lock);
 
+	/*
+	 * Note that this implies that the underlying transport support
+	 * has some form of congestion control (see RFC 7530 section 3.1
+	 * paragraph 2). For now, we assume that all supported RDMA
+	 * transports are suitable here.
+	 */
+	set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags);
+
 	if (listener)
 		set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
 
-- 
cgit v1.2.3


From bb292ac1c6028344013309a309b44dc691581825 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Wed, 25 Jan 2017 14:21:10 -0800
Subject: watchdog: Introduce watchdog_stop_on_unregister helper

Many watchdog drivers explicitly stop the watchdog when unregistering it.
While it is unclear if this is actually needed (the whatdog should not be
running at that time if it can be stopped), introduce a helper to
explicitly stop the watchdog in the watchdog core when unregistering it.
This helps reducing driver code size while retaining functionality.

Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 Documentation/watchdog/watchdog-kernel-api.txt | 6 ++++++
 drivers/watchdog/watchdog_dev.c                | 5 +++++
 include/linux/watchdog.h                       | 7 +++++++
 3 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/watchdog/watchdog-kernel-api.txt b/Documentation/watchdog/watchdog-kernel-api.txt
index ea277478982f..9b93953f69cf 100644
--- a/Documentation/watchdog/watchdog-kernel-api.txt
+++ b/Documentation/watchdog/watchdog-kernel-api.txt
@@ -280,6 +280,12 @@ To disable the watchdog on reboot, the user must call the following helper:
 
 static inline void watchdog_stop_on_reboot(struct watchdog_device *wdd);
 
+To disable the watchdog when unregistering the watchdog, the user must call
+the following helper. Note that this will only stop the watchdog if the
+nowayout flag is not set.
+
+static inline void watchdog_stop_on_unregister(struct watchdog_device *wdd);
+
 To change the priority of the restart handler the following helper should be
 used:
 
diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c
index 32930a073a12..d5d2bbd8f428 100644
--- a/drivers/watchdog/watchdog_dev.c
+++ b/drivers/watchdog/watchdog_dev.c
@@ -987,6 +987,11 @@ static void watchdog_cdev_unregister(struct watchdog_device *wdd)
 	wdd->wd_data = NULL;
 	mutex_unlock(&wd_data->lock);
 
+	if (watchdog_active(wdd) &&
+	    test_bit(WDOG_STOP_ON_UNREGISTER, &wdd->status)) {
+		watchdog_stop(wdd);
+	}
+
 	cancel_delayed_work_sync(&wd_data->work);
 
 	kref_put(&wd_data->kref, watchdog_core_data_release);
diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h
index 35a4d8185b51..a786e5e8973b 100644
--- a/include/linux/watchdog.h
+++ b/include/linux/watchdog.h
@@ -117,6 +117,7 @@ struct watchdog_device {
 #define WDOG_NO_WAY_OUT		1	/* Is 'nowayout' feature set ? */
 #define WDOG_STOP_ON_REBOOT	2	/* Should be stopped on reboot */
 #define WDOG_HW_RUNNING		3	/* True if HW watchdog running */
+#define WDOG_STOP_ON_UNREGISTER	4	/* Should be stopped on unregister */
 	struct list_head deferred;
 };
 
@@ -151,6 +152,12 @@ static inline void watchdog_stop_on_reboot(struct watchdog_device *wdd)
 	set_bit(WDOG_STOP_ON_REBOOT, &wdd->status);
 }
 
+/* Use the following function to stop the watchdog when unregistering it */
+static inline void watchdog_stop_on_unregister(struct watchdog_device *wdd)
+{
+	set_bit(WDOG_STOP_ON_UNREGISTER, &wdd->status);
+}
+
 /* Use the following function to check if a timeout value is invalid */
 static inline bool watchdog_timeout_invalid(struct watchdog_device *wdd, unsigned int t)
 {
-- 
cgit v1.2.3


From 5283b03ee5cd28d516646298bead09b238d92ddc Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 24 Feb 2017 13:25:24 -0500
Subject: nfs/nfsd/sunrpc: enforce transport requirements for NFSv4

NFSv4 requires a transport "that is specified to avoid network
congestion" (RFC 7530, section 3.1, paragraph 2).  In practical terms,
that means that you should not run NFSv4 over UDP. The server has never
enforced that requirement, however.

This patchset fixes this by adding a new flag to the svc_version that
states that it has these transport requirements. With that, we can check
that the transport has XPT_CONG_CTRL set before processing an RPC. If it
doesn't we reject it with RPC_PROG_MISMATCH.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfs/callback_xdr.c      |  2 ++
 fs/nfsd/nfs4proc.c         | 13 +++++++------
 include/linux/sunrpc/svc.h |  3 +++
 net/sunrpc/svc.c           | 15 +++++++++++++++
 4 files changed, 27 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index e9836f611d9c..fd0284c1dc32 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -1084,6 +1084,7 @@ struct svc_version nfs4_callback_version1 = {
 	.vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
 	.vs_dispatch = NULL,
 	.vs_hidden = true,
+	.vs_need_cong_ctrl = true,
 };
 
 struct svc_version nfs4_callback_version4 = {
@@ -1093,4 +1094,5 @@ struct svc_version nfs4_callback_version4 = {
 	.vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
 	.vs_dispatch = NULL,
 	.vs_hidden = true,
+	.vs_need_cong_ctrl = true,
 };
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 89e582fa58cd..cbeeda1e94a2 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -2537,12 +2537,13 @@ static struct svc_procedure		nfsd_procedures4[2] = {
 };
 
 struct svc_version	nfsd_version4 = {
-		.vs_vers	= 4,
-		.vs_nproc	= 2,
-		.vs_proc	= nfsd_procedures4,
-		.vs_dispatch	= nfsd_dispatch,
-		.vs_xdrsize	= NFS4_SVC_XDRSIZE,
-		.vs_rpcb_optnl	= true,
+	.vs_vers		= 4,
+	.vs_nproc		= 2,
+	.vs_proc		= nfsd_procedures4,
+	.vs_dispatch		= nfsd_dispatch,
+	.vs_xdrsize		= NFS4_SVC_XDRSIZE,
+	.vs_rpcb_optnl		= true,
+	.vs_need_cong_ctrl	= true,
 };
 
 /*
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 96467c95f02e..e770abeed32d 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -406,6 +406,9 @@ struct svc_version {
 	/* Don't care if the rpcbind registration fails */
 	bool			vs_rpcb_optnl;
 
+	/* Need xprt with congestion control */
+	bool			vs_need_cong_ctrl;
+
 	/* Override dispatch function (e.g. when caching replies).
 	 * A return value of 0 means drop the request. 
 	 * vs_dispatch == NULL means use default dispatcher.
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 85bcdea67a3f..1fc3ff822168 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1169,6 +1169,21 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
 	  !(versp = progp->pg_vers[vers]))
 		goto err_bad_vers;
 
+	/*
+	 * Some protocol versions (namely NFSv4) require some form of
+	 * congestion control.  (See RFC 7530 section 3.1 paragraph 2)
+	 * In other words, UDP is not allowed. We mark those when setting
+	 * up the svc_xprt, and verify that here.
+	 *
+	 * The spec is not very clear about what error should be returned
+	 * when someone tries to access a server that is listening on UDP
+	 * for lower versions. RPC_PROG_MISMATCH seems to be the closest
+	 * fit.
+	 */
+	if (versp->vs_need_cong_ctrl &&
+	    !test_bit(XPT_CONG_CTRL, &rqstp->rq_xprt->xpt_flags))
+		goto err_bad_vers;
+
 	procp = versp->vs_proc + proc;
 	if (proc >= versp->vs_nproc || !procp->pc_func)
 		goto err_bad_proc;
-- 
cgit v1.2.3


From 3fc21924100b13f73c734d0ce8dfcfe913fcf7a8 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 24 Feb 2017 14:55:48 -0800
Subject: mm: validate device_hotplug is held for memory hotplug

mem_hotplug_begin() assumes that it can set mem_hotplug.active_writer
and run the hotplug process without racing another thread.  Validate
this assumption with a lockdep assertion.

Link: http://lkml.kernel.org/r/148693886229.16345.1770484669403334689.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reported-by: Ben Hutchings <ben@decadent.org.uk>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Masayoshi Mizuma <m.mizuma@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/core.c    | 5 +++++
 include/linux/device.h | 1 +
 mm/memory_hotplug.c    | 2 ++
 3 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 8c25e68e67d7..3050e6f99403 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -638,6 +638,11 @@ int lock_device_hotplug_sysfs(void)
 	return restart_syscall();
 }
 
+void assert_held_device_hotplug(void)
+{
+	lockdep_assert_held(&device_hotplug_lock);
+}
+
 #ifdef CONFIG_BLOCK
 static inline int device_is_not_partition(struct device *dev)
 {
diff --git a/include/linux/device.h b/include/linux/device.h
index bd684fc8ec1d..a48a7ff70164 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1139,6 +1139,7 @@ static inline bool device_supports_offline(struct device *dev)
 extern void lock_device_hotplug(void);
 extern void unlock_device_hotplug(void);
 extern int lock_device_hotplug_sysfs(void);
+void assert_held_device_hotplug(void);
 extern int device_offline(struct device *dev);
 extern int device_online(struct device *dev);
 extern void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d67787d10ff0..92a242af5a91 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -126,6 +126,8 @@ void put_online_mems(void)
 
 void mem_hotplug_begin(void)
 {
+	assert_held_device_hotplug();
+
 	mem_hotplug.active_writer = current;
 
 	memhp_lock_acquire();
-- 
cgit v1.2.3


From 0262d9c845ec349edf93f69688a5129c36cc2232 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Fri, 24 Feb 2017 14:55:59 -0800
Subject: memblock: embed memblock type name within struct memblock_type

Provide the name of each memblock type with struct memblock_type.  This
allows to get rid of the function memblock_type_name() and duplicating
the type names in __memblock_dump_all().

The only memblock_type usage out of mm/memblock.c seems to be
arch/s390/kernel/crash_dump.c.  While at it, give it a name.

Link: http://lkml.kernel.org/r/20170120123456.46508-4-heiko.carstens@de.ibm.com
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Philipp Hachtmann <phacht@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/s390/kernel/crash_dump.c |  1 +
 include/linux/memblock.h      |  1 +
 mm/memblock.c                 | 35 +++++++++++------------------------
 3 files changed, 13 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c
index e2293c662bdf..dd1d5c62c374 100644
--- a/arch/s390/kernel/crash_dump.c
+++ b/arch/s390/kernel/crash_dump.c
@@ -32,6 +32,7 @@ static struct memblock_type oldmem_type = {
 	.max = 1,
 	.total_size = 0,
 	.regions = &oldmem_region,
+	.name = "oldmem",
 };
 
 struct save_area {
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 38bcf00cbed3..bdfc65af4152 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -42,6 +42,7 @@ struct memblock_type {
 	unsigned long max;	/* size of the allocated array */
 	phys_addr_t total_size;	/* size of all regions */
 	struct memblock_region *regions;
+	char *name;
 };
 
 struct memblock {
diff --git a/mm/memblock.c b/mm/memblock.c
index 647d79ed07a3..b64b47803e52 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -35,15 +35,18 @@ struct memblock memblock __initdata_memblock = {
 	.memory.regions		= memblock_memory_init_regions,
 	.memory.cnt		= 1,	/* empty dummy entry */
 	.memory.max		= INIT_MEMBLOCK_REGIONS,
+	.memory.name		= "memory",
 
 	.reserved.regions	= memblock_reserved_init_regions,
 	.reserved.cnt		= 1,	/* empty dummy entry */
 	.reserved.max		= INIT_MEMBLOCK_REGIONS,
+	.reserved.name		= "reserved",
 
 #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
 	.physmem.regions	= memblock_physmem_init_regions,
 	.physmem.cnt		= 1,	/* empty dummy entry */
 	.physmem.max		= INIT_PHYSMEM_REGIONS,
+	.physmem.name		= "physmem",
 #endif
 
 	.bottom_up		= false,
@@ -64,22 +67,6 @@ ulong __init_memblock choose_memblock_flags(void)
 	return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
 }
 
-/* inline so we don't get a warning when pr_debug is compiled out */
-static __init_memblock const char *
-memblock_type_name(struct memblock_type *type)
-{
-	if (type == &memblock.memory)
-		return "memory";
-	else if (type == &memblock.reserved)
-		return "reserved";
-#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
-	else if (type == &memblock.physmem)
-		return "physmem";
-#endif
-	else
-		return "unknown";
-}
-
 /* adjust *@size so that (@base + *@size) doesn't overflow, return new size */
 static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size)
 {
@@ -406,12 +393,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
 	}
 	if (!addr) {
 		pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
-		       memblock_type_name(type), type->max, type->max * 2);
+		       type->name, type->max, type->max * 2);
 		return -1;
 	}
 
 	memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]",
-			memblock_type_name(type), type->max * 2, (u64)addr,
+			type->name, type->max * 2, (u64)addr,
 			(u64)addr + new_size - 1);
 
 	/*
@@ -1697,14 +1684,14 @@ phys_addr_t __init_memblock memblock_get_current_limit(void)
 	return memblock.current_limit;
 }
 
-static void __init_memblock memblock_dump(struct memblock_type *type, char *name)
+static void __init_memblock memblock_dump(struct memblock_type *type)
 {
 	phys_addr_t base, end, size;
 	unsigned long flags;
 	int idx;
 	struct memblock_region *rgn;
 
-	pr_info(" %s.cnt  = 0x%lx\n", name, type->cnt);
+	pr_info(" %s.cnt  = 0x%lx\n", type->name, type->cnt);
 
 	for_each_memblock_type(type, rgn) {
 		char nid_buf[32] = "";
@@ -1719,7 +1706,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name
 				 memblock_get_region_node(rgn));
 #endif
 		pr_info(" %s[%#x]\t[%pa-%pa], %pa bytes%s flags: %#lx\n",
-			name, idx, &base, &end, &size, nid_buf, flags);
+			type->name, idx, &base, &end, &size, nid_buf, flags);
 	}
 }
 
@@ -1730,10 +1717,10 @@ void __init_memblock __memblock_dump_all(void)
 		&memblock.memory.total_size,
 		&memblock.reserved.total_size);
 
-	memblock_dump(&memblock.memory, "memory");
-	memblock_dump(&memblock.reserved, "reserved");
+	memblock_dump(&memblock.memory);
+	memblock_dump(&memblock.reserved);
 #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
-	memblock_dump(&memblock.physmem, "physmem");
+	memblock_dump(&memblock.physmem);
 #endif
 }
 
-- 
cgit v1.2.3


From d811914d87576c562e849c00d9f9beff45038801 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.vnet.ibm.com>
Date: Fri, 24 Feb 2017 14:56:02 -0800
Subject: userfaultfd: non-cooperative: rename *EVENT_MADVDONTNEED to
 *EVENT_REMOVE

Patch series "userfaultfd: non-cooperative: add madvise() event for
MADV_REMOVE request".

These patches add notification of madvise(MADV_REMOVE) event to
non-cooperative userfaultfd monitor.

The first pacth renames EVENT_MADVDONTNEED to EVENT_REMOVE along with
relevant functions and structures.  Using _REMOVE instead of
_MADVDONTNEED describes the event semantics more clearly and I hope it's
not too late for such change in the ABI.

This patch (of 3):

The UFFD_EVENT_MADVDONTNEED purpose is to notify uffd monitor about
removal of certain range from address space tracked by userfaultfd.
Hence, UFFD_EVENT_REMOVE seems to better reflect the operation
semantics.  Respectively, 'madv_dn' field of uffd_msg is renamed to
'remove' and the madvise_userfault_dontneed callback is renamed to
userfaultfd_remove.

Link: http://lkml.kernel.org/r/1484814154-1557-2-git-send-email-rppt@linux.vnet.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/userfaultfd.c                         | 14 +++++++-------
 include/linux/userfaultfd_k.h            | 16 ++++++++--------
 include/uapi/linux/userfaultfd.h         |  8 ++++----
 mm/madvise.c                             |  2 +-
 tools/testing/selftests/vm/userfaultfd.c | 16 ++++++++--------
 5 files changed, 28 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 18406158e13f..8fe601b4875e 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -681,16 +681,16 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
 	userfaultfd_event_wait_completion(ctx, &ewq);
 }
 
-void madvise_userfault_dontneed(struct vm_area_struct *vma,
-				struct vm_area_struct **prev,
-				unsigned long start, unsigned long end)
+void userfaultfd_remove(struct vm_area_struct *vma,
+			struct vm_area_struct **prev,
+			unsigned long start, unsigned long end)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct userfaultfd_ctx *ctx;
 	struct userfaultfd_wait_queue ewq;
 
 	ctx = vma->vm_userfaultfd_ctx.ctx;
-	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_MADVDONTNEED))
+	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
 		return;
 
 	userfaultfd_ctx_get(ctx);
@@ -700,9 +700,9 @@ void madvise_userfault_dontneed(struct vm_area_struct *vma,
 
 	msg_init(&ewq.msg);
 
-	ewq.msg.event = UFFD_EVENT_MADVDONTNEED;
-	ewq.msg.arg.madv_dn.start = start;
-	ewq.msg.arg.madv_dn.end = end;
+	ewq.msg.event = UFFD_EVENT_REMOVE;
+	ewq.msg.arg.remove.start = start;
+	ewq.msg.arg.remove.end = end;
 
 	userfaultfd_event_wait_completion(ctx, &ewq);
 
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index f431861f22f1..2521542f6c07 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -61,10 +61,10 @@ extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *,
 					unsigned long from, unsigned long to,
 					unsigned long len);
 
-extern void madvise_userfault_dontneed(struct vm_area_struct *vma,
-				       struct vm_area_struct **prev,
-				       unsigned long start,
-				       unsigned long end);
+extern void userfaultfd_remove(struct vm_area_struct *vma,
+			       struct vm_area_struct **prev,
+			       unsigned long start,
+			       unsigned long end);
 
 #else /* CONFIG_USERFAULTFD */
 
@@ -112,10 +112,10 @@ static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx,
 {
 }
 
-static inline void madvise_userfault_dontneed(struct vm_area_struct *vma,
-					      struct vm_area_struct **prev,
-					      unsigned long start,
-					      unsigned long end)
+static inline void userfaultfd_remove(struct vm_area_struct *vma,
+				      struct vm_area_struct **prev,
+				      unsigned long start,
+				      unsigned long end)
 {
 }
 #endif /* CONFIG_USERFAULTFD */
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 9ac4b68c54d1..b742c40c2880 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -20,7 +20,7 @@
 #define UFFD_API ((__u64)0xAA)
 #define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK |		\
 			   UFFD_FEATURE_EVENT_REMAP |		\
-			   UFFD_FEATURE_EVENT_MADVDONTNEED |	\
+			   UFFD_FEATURE_EVENT_REMOVE |	\
 			   UFFD_FEATURE_MISSING_HUGETLBFS |	\
 			   UFFD_FEATURE_MISSING_SHMEM)
 #define UFFD_API_IOCTLS				\
@@ -92,7 +92,7 @@ struct uffd_msg {
 		struct {
 			__u64	start;
 			__u64	end;
-		} madv_dn;
+		} remove;
 
 		struct {
 			/* unused reserved fields */
@@ -109,7 +109,7 @@ struct uffd_msg {
 #define UFFD_EVENT_PAGEFAULT	0x12
 #define UFFD_EVENT_FORK		0x13
 #define UFFD_EVENT_REMAP	0x14
-#define UFFD_EVENT_MADVDONTNEED	0x15
+#define UFFD_EVENT_REMOVE	0x15
 
 /* flags for UFFD_EVENT_PAGEFAULT */
 #define UFFD_PAGEFAULT_FLAG_WRITE	(1<<0)	/* If this was a write fault */
@@ -155,7 +155,7 @@ struct uffdio_api {
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
 #define UFFD_FEATURE_EVENT_FORK			(1<<1)
 #define UFFD_FEATURE_EVENT_REMAP		(1<<2)
-#define UFFD_FEATURE_EVENT_MADVDONTNEED		(1<<3)
+#define UFFD_FEATURE_EVENT_REMOVE		(1<<3)
 #define UFFD_FEATURE_MISSING_HUGETLBFS		(1<<4)
 #define UFFD_FEATURE_MISSING_SHMEM		(1<<5)
 	__u64 features;
diff --git a/mm/madvise.c b/mm/madvise.c
index b530a4986035..ab5ef141cc9b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -479,7 +479,7 @@ static long madvise_dontneed(struct vm_area_struct *vma,
 	if (!can_madv_dontneed_vma(vma))
 		return -EINVAL;
 
-	madvise_userfault_dontneed(vma, prev, start, end);
+	userfaultfd_remove(vma, prev, start, end);
 	zap_page_range(vma, start, end - start);
 	return 0;
 }
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index 5a840a605a16..9eb77df568f7 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -398,12 +398,12 @@ static void *uffd_poll_thread(void *arg)
 			uffd = msg.arg.fork.ufd;
 			pollfd[0].fd = uffd;
 			break;
-		case UFFD_EVENT_MADVDONTNEED:
-			uffd_reg.range.start = msg.arg.madv_dn.start;
-			uffd_reg.range.len = msg.arg.madv_dn.end -
-				msg.arg.madv_dn.start;
+		case UFFD_EVENT_REMOVE:
+			uffd_reg.range.start = msg.arg.remove.start;
+			uffd_reg.range.len = msg.arg.remove.end -
+				msg.arg.remove.start;
 			if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
-				fprintf(stderr, "madv_dn failure\n"), exit(1);
+				fprintf(stderr, "remove failure\n"), exit(1);
 			break;
 		case UFFD_EVENT_REMAP:
 			area_dst = (char *)(unsigned long)msg.arg.remap.to;
@@ -570,7 +570,7 @@ static int userfaultfd_open(int features)
  * mremap, the entire monitored area is accessed in a single pass for
  * HUGETLB_TEST.
  * The release of the pages currently generates event only for
- * anonymous memory (UFFD_EVENT_MADVDONTNEED), hence it is not checked
+ * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
  * for hugetlb and shmem.
  */
 static int faulting_process(void)
@@ -715,14 +715,14 @@ static int userfaultfd_events_test(void)
 	pid_t pid;
 	char c;
 
-	printf("testing events (fork, remap, madv_dn): ");
+	printf("testing events (fork, remap, remove): ");
 	fflush(stdout);
 
 	if (release_pages(area_dst))
 		return 1;
 
 	features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
-		UFFD_FEATURE_EVENT_MADVDONTNEED;
+		UFFD_FEATURE_EVENT_REMOVE;
 	if (userfaultfd_open(features) < 0)
 		return 1;
 	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
-- 
cgit v1.2.3


From 1276ad68e2491d1ceeb65f55d790f9277593c459 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Fri, 24 Feb 2017 14:56:11 -0800
Subject: mm: vmscan: scan dirty pages even in laptop mode

Patch series "mm: vmscan: fix kswapd writeback regression".

We noticed a regression on multiple hadoop workloads when moving from
3.10 to 4.0 and 4.6, which involves kswapd getting tangled up in page
writeout, causing direct reclaim herds that also don't make progress.

I tracked it down to the thrash avoidance efforts after 3.10 that make
the kernel better at keeping use-once cache and use-many cache sorted on
the inactive and active list, with more aggressive protection of the
active list as long as there is inactive cache.  Unfortunately, our
workload's use-once cache is mostly from streaming writes.  Waiting for
writes to avoid potential reloads in the future is not a good tradeoff.

These patches do the following:

1. Wake the flushers when kswapd sees a lump of dirty pages. It's
   possible to be below the dirty background limit and still have cache
   velocity push them through the LRU. So start a-flushin'.

2. Let kswapd only write pages that have been rotated twice. This makes
   sure we really tried to get all the clean pages on the inactive list
   before resorting to horrible LRU-order writeback.

3. Move rotating dirty pages off the inactive list. Instead of churning
   or waiting on page writeback, we'll go after clean active cache. This
   might lead to thrashing, but in this state memory demand outstrips IO
   speed anyway, and reads are faster than writes.

Mel backported the series to 4.10-rc5 with one minor conflict and ran a
couple of tests on it.  Mix of read/write random workload didn't show
anything interesting.  Write-only database didn't show much difference
in performance but there were slight reductions in IO -- probably in the
noise.

simoop did show big differences although not as big as Mel expected.
This is Chris Mason's workload that similate the VM activity of hadoop.
Mel won't go through the full details but over the samples measured
during an hour it reported

                                         4.10.0-rc5            4.10.0-rc5
                                            vanilla         johannes-v1r1
Amean    p50-Read             21346531.56 (  0.00%) 21697513.24 ( -1.64%)
Amean    p95-Read             24700518.40 (  0.00%) 25743268.98 ( -4.22%)
Amean    p99-Read             27959842.13 (  0.00%) 28963271.11 ( -3.59%)
Amean    p50-Write                1138.04 (  0.00%)      989.82 ( 13.02%)
Amean    p95-Write             1106643.48 (  0.00%)    12104.00 ( 98.91%)
Amean    p99-Write             1569213.22 (  0.00%)    36343.38 ( 97.68%)
Amean    p50-Allocation          85159.82 (  0.00%)    79120.70 (  7.09%)
Amean    p95-Allocation         204222.58 (  0.00%)   129018.43 ( 36.82%)
Amean    p99-Allocation         278070.04 (  0.00%)   183354.43 ( 34.06%)
Amean    final-p50-Read       21266432.00 (  0.00%) 21921792.00 ( -3.08%)
Amean    final-p95-Read       24870912.00 (  0.00%) 26116096.00 ( -5.01%)
Amean    final-p99-Read       28147712.00 (  0.00%) 29523968.00 ( -4.89%)
Amean    final-p50-Write          1130.00 (  0.00%)      977.00 ( 13.54%)
Amean    final-p95-Write       1033216.00 (  0.00%)     2980.00 ( 99.71%)
Amean    final-p99-Write       1517568.00 (  0.00%)    32672.00 ( 97.85%)
Amean    final-p50-Allocation    86656.00 (  0.00%)    78464.00 (  9.45%)
Amean    final-p95-Allocation   211712.00 (  0.00%)   116608.00 ( 44.92%)
Amean    final-p99-Allocation   287232.00 (  0.00%)   168704.00 ( 41.27%)

The latencies are actually completely horrific in comparison to 4.4 (and
4.10-rc5 is worse than 4.9 according to historical data for reasons Mel
hasn't analysed yet).

Still, 95% of write latency (p95-write) is halved by the series and
allocation latency is way down.  Direct reclaim activity is one fifth of
what it was according to vmstats.  Kswapd activity is higher but this is
not necessarily surprising.  Kswapd efficiency is unchanged at 99% (99%
of pages scanned were reclaimed) but direct reclaim efficiency went from
77% to 99%

In the vanilla kernel, 627MB of data was written back from reclaim
context.  With the series, no data was written back.  With or without
the patch, pages are being immediately reclaimed after writeback
completes.  However, with the patch, only 1/8th of the pages are
reclaimed like this.

This patch (of 5):

We have an elaborate dirty/writeback throttling mechanism inside the
reclaim scanner, but for that to work the pages have to go through
shrink_page_list() and get counted for what they are.  Otherwise, we
mess up the LRU order and don't match reclaim speed to writeback.

Especially during deactivation, there is never a reason to skip dirty
pages; nothing is even trying to write them out from there.  Don't mess
up the LRU order for nothing, shuffle these pages along.

Link: http://lkml.kernel.org/r/20170123181641.23938-2-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Minchan Kim <minchan@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h |  2 --
 mm/vmscan.c            | 14 ++------------
 2 files changed, 2 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 82fc632fd11d..8e02b3750fe0 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -236,8 +236,6 @@ struct lruvec {
 #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
 #define LRU_ALL	     ((1 << NR_LRU_LISTS) - 1)
 
-/* Isolate clean file */
-#define ISOLATE_CLEAN		((__force isolate_mode_t)0x1)
 /* Isolate unmapped file */
 #define ISOLATE_UNMAPPED	((__force isolate_mode_t)0x2)
 /* Isolate for asynchronous migration */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7bb23ff229b6..0d05f7f3b532 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -87,6 +87,7 @@ struct scan_control {
 	/* The highest zone to isolate pages for reclaim from */
 	enum zone_type reclaim_idx;
 
+	/* Writepage batching in laptop mode; RECLAIM_WRITE */
 	unsigned int may_writepage:1;
 
 	/* Can mapped pages be reclaimed? */
@@ -1373,13 +1374,10 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
 	 * wants to isolate pages it will be able to operate on without
 	 * blocking - clean pages for the most part.
 	 *
-	 * ISOLATE_CLEAN means that only clean pages should be isolated. This
-	 * is used by reclaim when it is cannot write to backing storage
-	 *
 	 * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages
 	 * that it is possible to migrate without blocking
 	 */
-	if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
+	if (mode & ISOLATE_ASYNC_MIGRATE) {
 		/* All the caller can do on PageWriteback is block */
 		if (PageWriteback(page))
 			return ret;
@@ -1387,10 +1385,6 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
 		if (PageDirty(page)) {
 			struct address_space *mapping;
 
-			/* ISOLATE_CLEAN means only clean pages */
-			if (mode & ISOLATE_CLEAN)
-				return ret;
-
 			/*
 			 * Only pages without mappings or that have a
 			 * ->migratepage callback are possible to migrate
@@ -1731,8 +1725,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 
 	if (!sc->may_unmap)
 		isolate_mode |= ISOLATE_UNMAPPED;
-	if (!sc->may_writepage)
-		isolate_mode |= ISOLATE_CLEAN;
 
 	spin_lock_irq(&pgdat->lru_lock);
 
@@ -1929,8 +1921,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
 
 	if (!sc->may_unmap)
 		isolate_mode |= ISOLATE_UNMAPPED;
-	if (!sc->may_writepage)
-		isolate_mode |= ISOLATE_CLEAN;
 
 	spin_lock_irq(&pgdat->lru_lock);
 
-- 
cgit v1.2.3


From 726d061fbd3658e4bfeffa1b8e82da97de2ca4dd Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Fri, 24 Feb 2017 14:56:14 -0800
Subject: mm: vmscan: kick flushers when we encounter dirty pages on the LRU

Memory pressure can put dirty pages at the end of the LRU without
anybody running into dirty limits.  Don't start writing individual pages
from kswapd while the flushers might be asleep.

Unlike the old direct reclaim flusher wakeup (removed in the next patch)
that flushes the number of pages just scanned, this patch wakes the
flushers for all outstanding dirty pages.  That seemed to perform better
in a synthetic test that pushes dirty pages to the end of the LRU and
into reclaim, because we know LRU aging outstrips writeback already, and
this way we give younger dirty pages a headstart rather than wait until
reclaim runs into them as well.  It also means less plugging and risk of
exhausting the struct request pool from reclaim.

There is a concern that this will cause temporary files that used to get
dirtied and truncated before writeback to now get written to disk under
memory pressure.  If this turns out to be a real problem, we'll have to
revisit this and tame the reclaim flusher wakeups.

[hannes@cmpxchg.org: mention dirty expiration as a condition]
  Link: http://lkml.kernel.org/r/20170126174739.GA30636@cmpxchg.org
Link: http://lkml.kernel.org/r/20170123181641.23938-3-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Minchan Kim <minchan@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/writeback.h        |  2 +-
 include/trace/events/writeback.h |  2 +-
 mm/vmscan.c                      | 18 +++++++++++++-----
 3 files changed, 15 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 5527d910ba3d..a3c0cbd7c888 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -46,7 +46,7 @@ enum writeback_sync_modes {
  */
 enum wb_reason {
 	WB_REASON_BACKGROUND,
-	WB_REASON_TRY_TO_FREE_PAGES,
+	WB_REASON_VMSCAN,
 	WB_REASON_SYNC,
 	WB_REASON_PERIODIC,
 	WB_REASON_LAPTOP_TIMER,
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 2ccd9ccbf9ef..7bd8783a590f 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -31,7 +31,7 @@
 
 #define WB_WORK_REASON							\
 	EM( WB_REASON_BACKGROUND,		"background")		\
-	EM( WB_REASON_TRY_TO_FREE_PAGES,	"try_to_free_pages")	\
+	EM( WB_REASON_VMSCAN,			"vmscan")		\
 	EM( WB_REASON_SYNC,			"sync")			\
 	EM( WB_REASON_PERIODIC,			"periodic")		\
 	EM( WB_REASON_LAPTOP_TIMER,		"laptop_timer")		\
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0d05f7f3b532..83c92b866afe 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1798,12 +1798,20 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 
 		/*
 		 * If dirty pages are scanned that are not queued for IO, it
-		 * implies that flushers are not keeping up. In this case, flag
-		 * the pgdat PGDAT_DIRTY and kswapd will start writing pages from
-		 * reclaim context.
+		 * implies that flushers are not doing their job. This can
+		 * happen when memory pressure pushes dirty pages to the end of
+		 * the LRU before the dirty limits are breached and the dirty
+		 * data has expired. It can also happen when the proportion of
+		 * dirty pages grows not through writes but through memory
+		 * pressure reclaiming all the clean cache. And in some cases,
+		 * the flushers simply cannot keep up with the allocation
+		 * rate. Nudge the flusher threads in case they are asleep, but
+		 * also allow kswapd to start writing pages during reclaim.
 		 */
-		if (stat.nr_unqueued_dirty == nr_taken)
+		if (stat.nr_unqueued_dirty == nr_taken) {
+			wakeup_flusher_threads(0, WB_REASON_VMSCAN);
 			set_bit(PGDAT_DIRTY, &pgdat->flags);
+		}
 
 		/*
 		 * If kswapd scans pages marked marked for immediate
@@ -2787,7 +2795,7 @@ retry:
 		writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
 		if (total_scanned > writeback_threshold) {
 			wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
-						WB_REASON_TRY_TO_FREE_PAGES);
+						WB_REASON_VMSCAN);
 			sc->may_writepage = 1;
 		}
 	} while (--sc->priority >= 0);
-- 
cgit v1.2.3


From c55e8d035b28b2867e68b0e2d0eee2c0f1016b43 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Fri, 24 Feb 2017 14:56:23 -0800
Subject: mm: vmscan: move dirty pages out of the way until they're flushed

We noticed a performance regression when moving hadoop workloads from
3.10 kernels to 4.0 and 4.6.  This is accompanied by increased pageout
activity initiated by kswapd as well as frequent bursts of allocation
stalls and direct reclaim scans.  Even lowering the dirty ratios to the
equivalent of less than 1% of memory would not eliminate the issue,
suggesting that dirty pages concentrate where the scanner is looking.

This can be traced back to recent efforts of thrash avoidance.  Where
3.10 would not detect refaulting pages and continuously supply clean
cache to the inactive list, a thrashing workload on 4.0+ will detect and
activate refaulting pages right away, distilling used-once pages on the
inactive list much more effectively.  This is by design, and it makes
sense for clean cache.  But for the most part our workload's cache
faults are refaults and its use-once cache is from streaming writes.  We
end up with most of the inactive list dirty, and we don't go after the
active cache as long as we have use-once pages around.

But waiting for writes to avoid reclaiming clean cache that *might*
refault is a bad trade-off.  Even if the refaults happen, reads are
faster than writes.  Before getting bogged down on writeback, reclaim
should first look at *all* cache in the system, even active cache.

To accomplish this, activate pages that are dirty or under writeback
when they reach the end of the inactive LRU.  The pages are marked for
immediate reclaim, meaning they'll get moved back to the inactive LRU
tail as soon as they're written back and become reclaimable.  But in the
meantime, by reducing the inactive list to only immediately reclaimable
pages, we allow the scanner to deactivate and refill the inactive list
with clean cache from the active list tail to guarantee forward
progress.

[hannes@cmpxchg.org: update comment]
  Link: http://lkml.kernel.org/r/20170202191957.22872-8-hannes@cmpxchg.org
Link: http://lkml.kernel.org/r/20170123181641.23938-6-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Minchan Kim <minchan@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_inline.h |  7 +++++++
 mm/swap.c                 |  9 +++++----
 mm/vmscan.c               | 15 ++++++++++++---
 3 files changed, 24 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 41d376e7116d..e030a68ead7e 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -50,6 +50,13 @@ static __always_inline void add_page_to_lru_list(struct page *page,
 	list_add(&page->lru, &lruvec->lists[lru]);
 }
 
+static __always_inline void add_page_to_lru_list_tail(struct page *page,
+				struct lruvec *lruvec, enum lru_list lru)
+{
+	update_lru_size(lruvec, lru, page_zonenum(page), hpage_nr_pages(page));
+	list_add_tail(&page->lru, &lruvec->lists[lru]);
+}
+
 static __always_inline void del_page_from_lru_list(struct page *page,
 				struct lruvec *lruvec, enum lru_list lru)
 {
diff --git a/mm/swap.c b/mm/swap.c
index aabf2e90fe32..c4910f14f957 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -209,9 +209,10 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
 {
 	int *pgmoved = arg;
 
-	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
-		enum lru_list lru = page_lru_base_type(page);
-		list_move_tail(&page->lru, &lruvec->lists[lru]);
+	if (PageLRU(page) && !PageUnevictable(page)) {
+		del_page_from_lru_list(page, lruvec, page_lru(page));
+		ClearPageActive(page);
+		add_page_to_lru_list_tail(page, lruvec, page_lru(page));
 		(*pgmoved)++;
 	}
 }
@@ -235,7 +236,7 @@ static void pagevec_move_tail(struct pagevec *pvec)
  */
 void rotate_reclaimable_page(struct page *page)
 {
-	if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
+	if (!PageLocked(page) && !PageDirty(page) &&
 	    !PageUnevictable(page) && PageLRU(page)) {
 		struct pagevec *pvec;
 		unsigned long flags;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 92e56cadceae..ae3d982216b5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1056,6 +1056,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 *    throttling so we could easily OOM just because too many
 		 *    pages are in writeback and there is nothing else to
 		 *    reclaim. Wait for the writeback to complete.
+		 *
+		 * In cases 1) and 2) we activate the pages to get them out of
+		 * the way while we continue scanning for clean pages on the
+		 * inactive list and refilling from the active list. The
+		 * observation here is that waiting for disk writes is more
+		 * expensive than potentially causing reloads down the line.
+		 * Since they're marked for immediate reclaim, they won't put
+		 * memory pressure on the cache working set any longer than it
+		 * takes to write them to disk.
 		 */
 		if (PageWriteback(page)) {
 			/* Case 1 above */
@@ -1063,7 +1072,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			    PageReclaim(page) &&
 			    test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
 				nr_immediate++;
-				goto keep_locked;
+				goto activate_locked;
 
 			/* Case 2 above */
 			} else if (sane_reclaim(sc) ||
@@ -1081,7 +1090,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				 */
 				SetPageReclaim(page);
 				nr_writeback++;
-				goto keep_locked;
+				goto activate_locked;
 
 			/* Case 3 above */
 			} else {
@@ -1174,7 +1183,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
 				SetPageReclaim(page);
 
-				goto keep_locked;
+				goto activate_locked;
 			}
 
 			if (references == PAGEREF_RECLAIM_CLEAN)
-- 
cgit v1.2.3


From 11bac80004499ea59f361ef2a5516c84b6eab675 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Fri, 24 Feb 2017 14:56:41 -0800
Subject: mm, fs: reduce fault, page_mkwrite, and pfn_mkwrite to take only vmf

->fault(), ->page_mkwrite(), and ->pfn_mkwrite() calls do not need to
take a vma and vmf parameter when the vma already resides in vmf.

Remove the vma parameter to simplify things.

[arnd@arndb.de: fix ARM build]
  Link: http://lkml.kernel.org/r/20170125223558.1451224-1-arnd@arndb.de
Link: http://lkml.kernel.org/r/148521301778.19116.10840599906674778980.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Darrick J. Wong <darrick.wong@oracle.com>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Jan Kara <jack@suse.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/kvm/book3s_64_vio.c                 |  4 +--
 arch/powerpc/platforms/cell/spufs/file.c         | 39 ++++++++++++------------
 drivers/android/binder.c                         |  2 +-
 drivers/char/agp/alpha-agp.c                     |  5 ++-
 drivers/char/mspec.c                             |  6 ++--
 drivers/dax/dax.c                                | 12 ++++----
 drivers/gpu/drm/armada/armada_gem.c              |  9 +++---
 drivers/gpu/drm/drm_vm.c                         | 32 ++++++++++---------
 drivers/gpu/drm/etnaviv/etnaviv_drv.h            |  2 +-
 drivers/gpu/drm/etnaviv/etnaviv_gem.c            |  3 +-
 drivers/gpu/drm/exynos/exynos_drm_gem.c          |  3 +-
 drivers/gpu/drm/exynos/exynos_drm_gem.h          |  2 +-
 drivers/gpu/drm/gma500/framebuffer.c             |  3 +-
 drivers/gpu/drm/gma500/gem.c                     |  3 +-
 drivers/gpu/drm/gma500/psb_drv.h                 |  2 +-
 drivers/gpu/drm/i915/i915_drv.h                  |  2 +-
 drivers/gpu/drm/i915/i915_gem.c                  |  4 +--
 drivers/gpu/drm/msm/msm_drv.h                    |  2 +-
 drivers/gpu/drm/msm/msm_gem.c                    |  3 +-
 drivers/gpu/drm/omapdrm/omap_drv.h               |  2 +-
 drivers/gpu/drm/omapdrm/omap_gem.c               |  4 +--
 drivers/gpu/drm/qxl/qxl_ttm.c                    |  6 ++--
 drivers/gpu/drm/radeon/radeon_ttm.c              |  6 ++--
 drivers/gpu/drm/tegra/gem.c                      |  3 +-
 drivers/gpu/drm/ttm/ttm_bo_vm.c                  | 10 +++---
 drivers/gpu/drm/udl/udl_drv.h                    |  2 +-
 drivers/gpu/drm/udl/udl_gem.c                    |  3 +-
 drivers/gpu/drm/vgem/vgem_drv.c                  |  3 +-
 drivers/gpu/drm/virtio/virtgpu_ttm.c             |  7 ++---
 drivers/hsi/clients/cmt_speech.c                 |  4 +--
 drivers/hwtracing/intel_th/msu.c                 |  6 ++--
 drivers/infiniband/hw/hfi1/file_ops.c            |  4 +--
 drivers/infiniband/hw/qib/qib_file_ops.c         |  2 +-
 drivers/media/v4l2-core/videobuf-dma-sg.c        |  3 +-
 drivers/misc/cxl/context.c                       |  3 +-
 drivers/misc/sgi-gru/grumain.c                   |  3 +-
 drivers/misc/sgi-gru/grutables.h                 |  2 +-
 drivers/scsi/cxlflash/superpipe.c                |  6 ++--
 drivers/scsi/sg.c                                |  3 +-
 drivers/staging/android/ion/ion.c                |  6 ++--
 drivers/staging/lustre/lustre/llite/llite_mmap.c |  7 +++--
 drivers/staging/lustre/lustre/llite/vvp_io.c     |  2 +-
 drivers/target/target_core_user.c                |  6 ++--
 drivers/uio/uio.c                                |  6 ++--
 drivers/usb/mon/mon_bin.c                        |  4 +--
 drivers/video/fbdev/core/fb_defio.c              | 16 +++++-----
 drivers/xen/privcmd.c                            |  4 +--
 fs/9p/vfs_file.c                                 |  4 +--
 fs/btrfs/ctree.h                                 |  2 +-
 fs/btrfs/inode.c                                 |  6 ++--
 fs/ceph/addr.c                                   |  8 +++--
 fs/cifs/file.c                                   |  2 +-
 fs/dax.c                                         | 15 ++++-----
 fs/ext2/file.c                                   | 17 +++++------
 fs/ext4/ext4.h                                   |  4 +--
 fs/ext4/file.c                                   | 17 +++++------
 fs/ext4/inode.c                                  |  9 +++---
 fs/f2fs/file.c                                   |  7 ++---
 fs/fuse/file.c                                   |  6 ++--
 fs/gfs2/file.c                                   |  8 ++---
 fs/iomap.c                                       |  5 ++-
 fs/kernfs/file.c                                 | 13 ++++----
 fs/ncpfs/mmap.c                                  |  7 ++---
 fs/nfs/file.c                                    |  4 +--
 fs/nilfs2/file.c                                 |  3 +-
 fs/ocfs2/mmap.c                                  | 15 ++++-----
 fs/proc/vmcore.c                                 |  4 +--
 fs/ubifs/file.c                                  |  5 ++-
 fs/xfs/xfs_file.c                                | 25 +++++++--------
 include/linux/dax.h                              |  5 ++-
 include/linux/iomap.h                            |  3 +-
 include/linux/mm.h                               | 10 +++---
 ipc/shm.c                                        |  6 ++--
 kernel/events/core.c                             |  6 ++--
 kernel/relay.c                                   |  4 +--
 mm/filemap.c                                     | 19 ++++++------
 mm/hugetlb.c                                     |  2 +-
 mm/memory.c                                      |  6 ++--
 mm/mmap.c                                        |  9 +++---
 mm/nommu.c                                       |  2 +-
 mm/shmem.c                                       |  3 +-
 security/selinux/selinuxfs.c                     |  5 ++-
 sound/core/pcm_native.c                          | 15 ++++-----
 sound/usb/usx2y/us122l.c                         |  5 ++-
 sound/usb/usx2y/usX2Yhwdep.c                     |  7 ++---
 sound/usb/usx2y/usx2yhwdeppcm.c                  |  5 ++-
 virt/kvm/kvm_main.c                              |  4 +--
 87 files changed, 283 insertions(+), 290 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 491c5d8120f7..ab9d14c0e460 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -102,9 +102,9 @@ static void release_spapr_tce_table(struct rcu_head *head)
 	kfree(stt);
 }
 
-static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int kvm_spapr_tce_fault(struct vm_fault *vmf)
 {
-	struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data;
+	struct kvmppc_spapr_tce_table *stt = vmf->vma->vm_file->private_data;
 	struct page *page;
 
 	if (vmf->pgoff >= kvmppc_tce_pages(stt->size))
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index a35e2c29d7ee..e5ec1368f0cd 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -233,8 +233,9 @@ spufs_mem_write(struct file *file, const char __user *buffer,
 }
 
 static int
-spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+spufs_mem_mmap_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct spu_context *ctx	= vma->vm_file->private_data;
 	unsigned long pfn, offset;
 
@@ -311,12 +312,11 @@ static const struct file_operations spufs_mem_fops = {
 	.mmap			= spufs_mem_mmap,
 };
 
-static int spufs_ps_fault(struct vm_area_struct *vma,
-				    struct vm_fault *vmf,
+static int spufs_ps_fault(struct vm_fault *vmf,
 				    unsigned long ps_offs,
 				    unsigned long ps_size)
 {
-	struct spu_context *ctx = vma->vm_file->private_data;
+	struct spu_context *ctx = vmf->vma->vm_file->private_data;
 	unsigned long area, offset = vmf->pgoff << PAGE_SHIFT;
 	int ret = 0;
 
@@ -354,7 +354,7 @@ static int spufs_ps_fault(struct vm_area_struct *vma,
 		down_read(&current->mm->mmap_sem);
 	} else {
 		area = ctx->spu->problem_phys + ps_offs;
-		vm_insert_pfn(vma, vmf->address, (area + offset) >> PAGE_SHIFT);
+		vm_insert_pfn(vmf->vma, vmf->address, (area + offset) >> PAGE_SHIFT);
 		spu_context_trace(spufs_ps_fault__insert, ctx, ctx->spu);
 	}
 
@@ -367,10 +367,9 @@ refault:
 }
 
 #if SPUFS_MMAP_4K
-static int spufs_cntl_mmap_fault(struct vm_area_struct *vma,
-					   struct vm_fault *vmf)
+static int spufs_cntl_mmap_fault(struct vm_fault *vmf)
 {
-	return spufs_ps_fault(vma, vmf, 0x4000, SPUFS_CNTL_MAP_SIZE);
+	return spufs_ps_fault(vmf, 0x4000, SPUFS_CNTL_MAP_SIZE);
 }
 
 static const struct vm_operations_struct spufs_cntl_mmap_vmops = {
@@ -1067,15 +1066,15 @@ static ssize_t spufs_signal1_write(struct file *file, const char __user *buf,
 }
 
 static int
-spufs_signal1_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+spufs_signal1_mmap_fault(struct vm_fault *vmf)
 {
 #if SPUFS_SIGNAL_MAP_SIZE == 0x1000
-	return spufs_ps_fault(vma, vmf, 0x14000, SPUFS_SIGNAL_MAP_SIZE);
+	return spufs_ps_fault(vmf, 0x14000, SPUFS_SIGNAL_MAP_SIZE);
 #elif SPUFS_SIGNAL_MAP_SIZE == 0x10000
 	/* For 64k pages, both signal1 and signal2 can be used to mmap the whole
 	 * signal 1 and 2 area
 	 */
-	return spufs_ps_fault(vma, vmf, 0x10000, SPUFS_SIGNAL_MAP_SIZE);
+	return spufs_ps_fault(vmf, 0x10000, SPUFS_SIGNAL_MAP_SIZE);
 #else
 #error unsupported page size
 #endif
@@ -1205,15 +1204,15 @@ static ssize_t spufs_signal2_write(struct file *file, const char __user *buf,
 
 #if SPUFS_MMAP_4K
 static int
-spufs_signal2_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+spufs_signal2_mmap_fault(struct vm_fault *vmf)
 {
 #if SPUFS_SIGNAL_MAP_SIZE == 0x1000
-	return spufs_ps_fault(vma, vmf, 0x1c000, SPUFS_SIGNAL_MAP_SIZE);
+	return spufs_ps_fault(vmf, 0x1c000, SPUFS_SIGNAL_MAP_SIZE);
 #elif SPUFS_SIGNAL_MAP_SIZE == 0x10000
 	/* For 64k pages, both signal1 and signal2 can be used to mmap the whole
 	 * signal 1 and 2 area
 	 */
-	return spufs_ps_fault(vma, vmf, 0x10000, SPUFS_SIGNAL_MAP_SIZE);
+	return spufs_ps_fault(vmf, 0x10000, SPUFS_SIGNAL_MAP_SIZE);
 #else
 #error unsupported page size
 #endif
@@ -1334,9 +1333,9 @@ DEFINE_SPUFS_ATTRIBUTE(spufs_signal2_type, spufs_signal2_type_get,
 
 #if SPUFS_MMAP_4K
 static int
-spufs_mss_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+spufs_mss_mmap_fault(struct vm_fault *vmf)
 {
-	return spufs_ps_fault(vma, vmf, 0x0000, SPUFS_MSS_MAP_SIZE);
+	return spufs_ps_fault(vmf, 0x0000, SPUFS_MSS_MAP_SIZE);
 }
 
 static const struct vm_operations_struct spufs_mss_mmap_vmops = {
@@ -1396,9 +1395,9 @@ static const struct file_operations spufs_mss_fops = {
 };
 
 static int
-spufs_psmap_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+spufs_psmap_mmap_fault(struct vm_fault *vmf)
 {
-	return spufs_ps_fault(vma, vmf, 0x0000, SPUFS_PS_MAP_SIZE);
+	return spufs_ps_fault(vmf, 0x0000, SPUFS_PS_MAP_SIZE);
 }
 
 static const struct vm_operations_struct spufs_psmap_mmap_vmops = {
@@ -1456,9 +1455,9 @@ static const struct file_operations spufs_psmap_fops = {
 
 #if SPUFS_MMAP_4K
 static int
-spufs_mfc_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+spufs_mfc_mmap_fault(struct vm_fault *vmf)
 {
-	return spufs_ps_fault(vma, vmf, 0x3000, SPUFS_MFC_MAP_SIZE);
+	return spufs_ps_fault(vmf, 0x3000, SPUFS_MFC_MAP_SIZE);
 }
 
 static const struct vm_operations_struct spufs_mfc_mmap_vmops = {
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 15b263a420e8..2bbcdc6fdfee 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -3342,7 +3342,7 @@ static void binder_vma_close(struct vm_area_struct *vma)
 	binder_defer_work(proc, BINDER_DEFERRED_PUT_FILES);
 }
 
-static int binder_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int binder_vm_fault(struct vm_fault *vmf)
 {
 	return VM_FAULT_SIGBUS;
 }
diff --git a/drivers/char/agp/alpha-agp.c b/drivers/char/agp/alpha-agp.c
index 737187865269..53fe633df1e8 100644
--- a/drivers/char/agp/alpha-agp.c
+++ b/drivers/char/agp/alpha-agp.c
@@ -11,15 +11,14 @@
 
 #include "agp.h"
 
-static int alpha_core_agp_vm_fault(struct vm_area_struct *vma,
-					struct vm_fault *vmf)
+static int alpha_core_agp_vm_fault(struct vm_fault *vmf)
 {
 	alpha_agp_info *agp = agp_bridge->dev_private_data;
 	dma_addr_t dma_addr;
 	unsigned long pa;
 	struct page *page;
 
-	dma_addr = vmf->address - vma->vm_start + agp->aperture.bus_base;
+	dma_addr = vmf->address - vmf->vma->vm_start + agp->aperture.bus_base;
 	pa = agp->ops->translate(agp, dma_addr);
 
 	if (pa == (unsigned long)-EINVAL)
diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c
index a697ca0cab1e..a9c2fa3c81e5 100644
--- a/drivers/char/mspec.c
+++ b/drivers/char/mspec.c
@@ -191,12 +191,12 @@ mspec_close(struct vm_area_struct *vma)
  * Creates a mspec page and maps it to user space.
  */
 static int
-mspec_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+mspec_fault(struct vm_fault *vmf)
 {
 	unsigned long paddr, maddr;
 	unsigned long pfn;
 	pgoff_t index = vmf->pgoff;
-	struct vma_data *vdata = vma->vm_private_data;
+	struct vma_data *vdata = vmf->vma->vm_private_data;
 
 	maddr = (volatile unsigned long) vdata->maddr[index];
 	if (maddr == 0) {
@@ -227,7 +227,7 @@ mspec_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	 * be because another thread has installed the pte first, so it
 	 * is no problem.
 	 */
-	vm_insert_pfn(vma, vmf->address, pfn);
+	vm_insert_pfn(vmf->vma, vmf->address, pfn);
 
 	return VM_FAULT_NOPAGE;
 }
diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index 18e9875f6277..0261f332bf3e 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -419,8 +419,7 @@ static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
 	return -1;
 }
 
-static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma,
-		struct vm_fault *vmf)
+static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 {
 	struct device *dev = &dax_dev->dev;
 	struct dax_region *dax_region;
@@ -428,7 +427,7 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma,
 	phys_addr_t phys;
 	pfn_t pfn;
 
-	if (check_vma(dax_dev, vma, __func__))
+	if (check_vma(dax_dev, vmf->vma, __func__))
 		return VM_FAULT_SIGBUS;
 
 	dax_region = dax_dev->region;
@@ -446,7 +445,7 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma,
 
 	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
 
-	rc = vm_insert_mixed(vma, vmf->address, pfn);
+	rc = vm_insert_mixed(vmf->vma, vmf->address, pfn);
 
 	if (rc == -ENOMEM)
 		return VM_FAULT_OOM;
@@ -456,8 +455,9 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma,
 	return VM_FAULT_NOPAGE;
 }
 
-static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int dax_dev_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	int rc;
 	struct file *filp = vma->vm_file;
 	struct dax_dev *dax_dev = filp->private_data;
@@ -466,7 +466,7 @@ static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 			current->comm, (vmf->flags & FAULT_FLAG_WRITE)
 			? "write" : "read", vma->vm_start, vma->vm_end);
 	rcu_read_lock();
-	rc = __dax_dev_fault(dax_dev, vma, vmf);
+	rc = __dax_dev_fault(dax_dev, vmf);
 	rcu_read_unlock();
 
 	return rc;
diff --git a/drivers/gpu/drm/armada/armada_gem.c b/drivers/gpu/drm/armada/armada_gem.c
index 560d416deab2..1597458d884e 100644
--- a/drivers/gpu/drm/armada/armada_gem.c
+++ b/drivers/gpu/drm/armada/armada_gem.c
@@ -14,14 +14,15 @@
 #include <drm/armada_drm.h>
 #include "armada_ioctlP.h"
 
-static int armada_gem_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int armada_gem_vm_fault(struct vm_fault *vmf)
 {
-	struct armada_gem_object *obj = drm_to_armada_gem(vma->vm_private_data);
+	struct drm_gem_object *gobj = vmf->vma->vm_private_data;
+	struct armada_gem_object *obj = drm_to_armada_gem(gobj);
 	unsigned long pfn = obj->phys_addr >> PAGE_SHIFT;
 	int ret;
 
-	pfn += (vmf->address - vma->vm_start) >> PAGE_SHIFT;
-	ret = vm_insert_pfn(vma, vmf->address, pfn);
+	pfn += (vmf->address - vmf->vma->vm_start) >> PAGE_SHIFT;
+	ret = vm_insert_pfn(vmf->vma, vmf->address, pfn);
 
 	switch (ret) {
 	case 0:
diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c
index bd311c77c254..bae6e26038ee 100644
--- a/drivers/gpu/drm/drm_vm.c
+++ b/drivers/gpu/drm/drm_vm.c
@@ -96,8 +96,9 @@ static pgprot_t drm_dma_prot(uint32_t map_type, struct vm_area_struct *vma)
  * map, get the page, increment the use count and return it.
  */
 #if IS_ENABLED(CONFIG_AGP)
-static int drm_do_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int drm_do_vm_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct drm_file *priv = vma->vm_file->private_data;
 	struct drm_device *dev = priv->minor->dev;
 	struct drm_local_map *map = NULL;
@@ -168,7 +169,7 @@ vm_fault_error:
 	return VM_FAULT_SIGBUS;	/* Disallow mremap */
 }
 #else
-static int drm_do_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int drm_do_vm_fault(struct vm_fault *vmf)
 {
 	return VM_FAULT_SIGBUS;
 }
@@ -184,8 +185,9 @@ static int drm_do_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
  * Get the mapping, find the real physical page to map, get the page, and
  * return it.
  */
-static int drm_do_vm_shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int drm_do_vm_shm_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct drm_local_map *map = vma->vm_private_data;
 	unsigned long offset;
 	unsigned long i;
@@ -280,14 +282,14 @@ static void drm_vm_shm_close(struct vm_area_struct *vma)
 /**
  * \c fault method for DMA virtual memory.
  *
- * \param vma virtual memory area.
  * \param address access address.
  * \return pointer to the page structure.
  *
  * Determine the page number from the page offset and get it from drm_device_dma::pagelist.
  */
-static int drm_do_vm_dma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int drm_do_vm_dma_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct drm_file *priv = vma->vm_file->private_data;
 	struct drm_device *dev = priv->minor->dev;
 	struct drm_device_dma *dma = dev->dma;
@@ -315,14 +317,14 @@ static int drm_do_vm_dma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 /**
  * \c fault method for scatter-gather virtual memory.
  *
- * \param vma virtual memory area.
  * \param address access address.
  * \return pointer to the page structure.
  *
  * Determine the map offset from the page offset and get it from drm_sg_mem::pagelist.
  */
-static int drm_do_vm_sg_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int drm_do_vm_sg_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct drm_local_map *map = vma->vm_private_data;
 	struct drm_file *priv = vma->vm_file->private_data;
 	struct drm_device *dev = priv->minor->dev;
@@ -347,24 +349,24 @@ static int drm_do_vm_sg_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	return 0;
 }
 
-static int drm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int drm_vm_fault(struct vm_fault *vmf)
 {
-	return drm_do_vm_fault(vma, vmf);
+	return drm_do_vm_fault(vmf);
 }
 
-static int drm_vm_shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int drm_vm_shm_fault(struct vm_fault *vmf)
 {
-	return drm_do_vm_shm_fault(vma, vmf);
+	return drm_do_vm_shm_fault(vmf);
 }
 
-static int drm_vm_dma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int drm_vm_dma_fault(struct vm_fault *vmf)
 {
-	return drm_do_vm_dma_fault(vma, vmf);
+	return drm_do_vm_dma_fault(vmf);
 }
 
-static int drm_vm_sg_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int drm_vm_sg_fault(struct vm_fault *vmf)
 {
-	return drm_do_vm_sg_fault(vma, vmf);
+	return drm_do_vm_sg_fault(vmf);
 }
 
 /** AGP virtual memory operations */
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_drv.h b/drivers/gpu/drm/etnaviv/etnaviv_drv.h
index c255eda40526..e41f38667c1c 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_drv.h
+++ b/drivers/gpu/drm/etnaviv/etnaviv_drv.h
@@ -73,7 +73,7 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data,
 		struct drm_file *file);
 
 int etnaviv_gem_mmap(struct file *filp, struct vm_area_struct *vma);
-int etnaviv_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
+int etnaviv_gem_fault(struct vm_fault *vmf);
 int etnaviv_gem_mmap_offset(struct drm_gem_object *obj, u64 *offset);
 struct sg_table *etnaviv_gem_prime_get_sg_table(struct drm_gem_object *obj);
 void *etnaviv_gem_prime_vmap(struct drm_gem_object *obj);
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
index aa6e35ddc87f..e78f1406885d 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
@@ -175,8 +175,9 @@ int etnaviv_gem_mmap(struct file *filp, struct vm_area_struct *vma)
 	return obj->ops->mmap(obj, vma);
 }
 
-int etnaviv_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+int etnaviv_gem_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct drm_gem_object *obj = vma->vm_private_data;
 	struct etnaviv_gem_object *etnaviv_obj = to_etnaviv_bo(obj);
 	struct page **pages, *page;
diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c
index 57b81460fec8..4c28f7ffcc4d 100644
--- a/drivers/gpu/drm/exynos/exynos_drm_gem.c
+++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c
@@ -447,8 +447,9 @@ int exynos_drm_gem_dumb_map_offset(struct drm_file *file_priv,
 	return ret;
 }
 
-int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+int exynos_drm_gem_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct drm_gem_object *obj = vma->vm_private_data;
 	struct exynos_drm_gem *exynos_gem = to_exynos_gem(obj);
 	unsigned long pfn;
diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.h b/drivers/gpu/drm/exynos/exynos_drm_gem.h
index df7c543d6558..85457255fcd1 100644
--- a/drivers/gpu/drm/exynos/exynos_drm_gem.h
+++ b/drivers/gpu/drm/exynos/exynos_drm_gem.h
@@ -116,7 +116,7 @@ int exynos_drm_gem_dumb_map_offset(struct drm_file *file_priv,
 				   uint64_t *offset);
 
 /* page fault handler and mmap fault address(virtual) to physical memory. */
-int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
+int exynos_drm_gem_fault(struct vm_fault *vmf);
 
 /* set vm_flags and we can change the vm attribute to other one at here. */
 int exynos_drm_gem_mmap(struct file *filp, struct vm_area_struct *vma);
diff --git a/drivers/gpu/drm/gma500/framebuffer.c b/drivers/gpu/drm/gma500/framebuffer.c
index da42d2e1d397..ffe6b4ffa1a8 100644
--- a/drivers/gpu/drm/gma500/framebuffer.c
+++ b/drivers/gpu/drm/gma500/framebuffer.c
@@ -111,8 +111,9 @@ static int psbfb_pan(struct fb_var_screeninfo *var, struct fb_info *info)
         return 0;
 }
 
-static int psbfb_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int psbfb_vm_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct psb_framebuffer *psbfb = vma->vm_private_data;
 	struct drm_device *dev = psbfb->base.dev;
 	struct drm_psb_private *dev_priv = dev->dev_private;
diff --git a/drivers/gpu/drm/gma500/gem.c b/drivers/gpu/drm/gma500/gem.c
index 527c62917660..7da061aab729 100644
--- a/drivers/gpu/drm/gma500/gem.c
+++ b/drivers/gpu/drm/gma500/gem.c
@@ -164,8 +164,9 @@ int psb_gem_dumb_create(struct drm_file *file, struct drm_device *dev,
  *	vma->vm_private_data points to the GEM object that is backing this
  *	mapping.
  */
-int psb_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+int psb_gem_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct drm_gem_object *obj;
 	struct gtt_range *r;
 	int ret;
diff --git a/drivers/gpu/drm/gma500/psb_drv.h b/drivers/gpu/drm/gma500/psb_drv.h
index 05d7aaf47eea..83e22fd4cfc0 100644
--- a/drivers/gpu/drm/gma500/psb_drv.h
+++ b/drivers/gpu/drm/gma500/psb_drv.h
@@ -752,7 +752,7 @@ extern int psb_gem_dumb_create(struct drm_file *file, struct drm_device *dev,
 			struct drm_mode_create_dumb *args);
 extern int psb_gem_dumb_map_gtt(struct drm_file *file, struct drm_device *dev,
 			uint32_t handle, uint64_t *offset);
-extern int psb_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
+extern int psb_gem_fault(struct vm_fault *vmf);
 
 /* psb_device.c */
 extern const struct psb_ops psb_chip_ops;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index bcc81912b5e5..0a4b42d31391 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3352,7 +3352,7 @@ int __must_check i915_gem_wait_for_idle(struct drm_i915_private *dev_priv,
 					unsigned int flags);
 int __must_check i915_gem_suspend(struct drm_i915_private *dev_priv);
 void i915_gem_resume(struct drm_i915_private *dev_priv);
-int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
+int i915_gem_fault(struct vm_fault *vmf);
 int i915_gem_object_wait(struct drm_i915_gem_object *obj,
 			 unsigned int flags,
 			 long timeout,
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 88f3628b4e29..6908123162d1 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1772,7 +1772,6 @@ compute_partial_view(struct drm_i915_gem_object *obj,
 
 /**
  * i915_gem_fault - fault a page into the GTT
- * @area: CPU VMA in question
  * @vmf: fault info
  *
  * The fault handler is set up by drm_gem_mmap() when a object is GTT mapped
@@ -1789,9 +1788,10 @@ compute_partial_view(struct drm_i915_gem_object *obj,
  * The current feature set supported by i915_gem_fault() and thus GTT mmaps
  * is exposed via I915_PARAM_MMAP_GTT_VERSION (see i915_gem_mmap_gtt_version).
  */
-int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf)
+int i915_gem_fault(struct vm_fault *vmf)
 {
 #define MIN_CHUNK_PAGES ((1 << 20) >> PAGE_SHIFT) /* 1 MiB */
+	struct vm_area_struct *area = vmf->vma;
 	struct drm_i915_gem_object *obj = to_intel_bo(area->vm_private_data);
 	struct drm_device *dev = obj->base.dev;
 	struct drm_i915_private *dev_priv = to_i915(dev);
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
index cdd7b2f8e977..c3b14876edaa 100644
--- a/drivers/gpu/drm/msm/msm_drv.h
+++ b/drivers/gpu/drm/msm/msm_drv.h
@@ -206,7 +206,7 @@ void msm_gem_shrinker_cleanup(struct drm_device *dev);
 int msm_gem_mmap_obj(struct drm_gem_object *obj,
 			struct vm_area_struct *vma);
 int msm_gem_mmap(struct file *filp, struct vm_area_struct *vma);
-int msm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
+int msm_gem_fault(struct vm_fault *vmf);
 uint64_t msm_gem_mmap_offset(struct drm_gem_object *obj);
 int msm_gem_get_iova_locked(struct drm_gem_object *obj, int id,
 		uint64_t *iova);
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
index e140b05af134..59811f29607d 100644
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -191,8 +191,9 @@ int msm_gem_mmap(struct file *filp, struct vm_area_struct *vma)
 	return msm_gem_mmap_obj(vma->vm_private_data, vma);
 }
 
-int msm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+int msm_gem_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct drm_gem_object *obj = vma->vm_private_data;
 	struct drm_device *dev = obj->dev;
 	struct msm_drm_private *priv = dev->dev_private;
diff --git a/drivers/gpu/drm/omapdrm/omap_drv.h b/drivers/gpu/drm/omapdrm/omap_drv.h
index 36d93ce84a29..65977982f15f 100644
--- a/drivers/gpu/drm/omapdrm/omap_drv.h
+++ b/drivers/gpu/drm/omapdrm/omap_drv.h
@@ -188,7 +188,7 @@ int omap_gem_dumb_create(struct drm_file *file, struct drm_device *dev,
 int omap_gem_mmap(struct file *filp, struct vm_area_struct *vma);
 int omap_gem_mmap_obj(struct drm_gem_object *obj,
 		struct vm_area_struct *vma);
-int omap_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
+int omap_gem_fault(struct vm_fault *vmf);
 int omap_gem_op_start(struct drm_gem_object *obj, enum omap_gem_op op);
 int omap_gem_op_finish(struct drm_gem_object *obj, enum omap_gem_op op);
 int omap_gem_op_sync(struct drm_gem_object *obj, enum omap_gem_op op);
diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c
index 74a9968df421..5d5a9f517c30 100644
--- a/drivers/gpu/drm/omapdrm/omap_gem.c
+++ b/drivers/gpu/drm/omapdrm/omap_gem.c
@@ -518,7 +518,6 @@ static int fault_2d(struct drm_gem_object *obj,
 
 /**
  * omap_gem_fault		-	pagefault handler for GEM objects
- * @vma: the VMA of the GEM object
  * @vmf: fault detail
  *
  * Invoked when a fault occurs on an mmap of a GEM managed area. GEM
@@ -529,8 +528,9 @@ static int fault_2d(struct drm_gem_object *obj,
  * vma->vm_private_data points to the GEM object that is backing this
  * mapping.
  */
-int omap_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+int omap_gem_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct drm_gem_object *obj = vma->vm_private_data;
 	struct omap_gem_object *omap_obj = to_omap_bo(obj);
 	struct drm_device *dev = obj->dev;
diff --git a/drivers/gpu/drm/qxl/qxl_ttm.c b/drivers/gpu/drm/qxl/qxl_ttm.c
index 4e1a40389964..7d1cab57c89e 100644
--- a/drivers/gpu/drm/qxl/qxl_ttm.c
+++ b/drivers/gpu/drm/qxl/qxl_ttm.c
@@ -105,15 +105,15 @@ static void qxl_ttm_global_fini(struct qxl_device *qdev)
 static struct vm_operations_struct qxl_ttm_vm_ops;
 static const struct vm_operations_struct *ttm_vm_ops;
 
-static int qxl_ttm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int qxl_ttm_fault(struct vm_fault *vmf)
 {
 	struct ttm_buffer_object *bo;
 	int r;
 
-	bo = (struct ttm_buffer_object *)vma->vm_private_data;
+	bo = (struct ttm_buffer_object *)vmf->vma->vm_private_data;
 	if (bo == NULL)
 		return VM_FAULT_NOPAGE;
-	r = ttm_vm_ops->fault(vma, vmf);
+	r = ttm_vm_ops->fault(vmf);
 	return r;
 }
 
diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c
index 7a10b3852970..684f1703aa5c 100644
--- a/drivers/gpu/drm/radeon/radeon_ttm.c
+++ b/drivers/gpu/drm/radeon/radeon_ttm.c
@@ -979,19 +979,19 @@ void radeon_ttm_set_active_vram_size(struct radeon_device *rdev, u64 size)
 static struct vm_operations_struct radeon_ttm_vm_ops;
 static const struct vm_operations_struct *ttm_vm_ops = NULL;
 
-static int radeon_ttm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int radeon_ttm_fault(struct vm_fault *vmf)
 {
 	struct ttm_buffer_object *bo;
 	struct radeon_device *rdev;
 	int r;
 
-	bo = (struct ttm_buffer_object *)vma->vm_private_data;	
+	bo = (struct ttm_buffer_object *)vmf->vma->vm_private_data;
 	if (bo == NULL) {
 		return VM_FAULT_NOPAGE;
 	}
 	rdev = radeon_get_rdev(bo->bdev);
 	down_read(&rdev->pm.mclk_lock);
-	r = ttm_vm_ops->fault(vma, vmf);
+	r = ttm_vm_ops->fault(vmf);
 	up_read(&rdev->pm.mclk_lock);
 	return r;
 }
diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c
index b523a5d4a38c..17e62ecb5d4d 100644
--- a/drivers/gpu/drm/tegra/gem.c
+++ b/drivers/gpu/drm/tegra/gem.c
@@ -441,8 +441,9 @@ int tegra_bo_dumb_map_offset(struct drm_file *file, struct drm_device *drm,
 	return 0;
 }
 
-static int tegra_bo_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int tegra_bo_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct drm_gem_object *gem = vma->vm_private_data;
 	struct tegra_bo *bo = to_tegra_bo(gem);
 	struct page *page;
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index 88169141bef5..35ffb3754feb 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -43,7 +43,6 @@
 #define TTM_BO_VM_NUM_PREFAULT 16
 
 static int ttm_bo_vm_fault_idle(struct ttm_buffer_object *bo,
-				struct vm_area_struct *vma,
 				struct vm_fault *vmf)
 {
 	int ret = 0;
@@ -67,7 +66,7 @@ static int ttm_bo_vm_fault_idle(struct ttm_buffer_object *bo,
 			goto out_unlock;
 
 		ttm_bo_reference(bo);
-		up_read(&vma->vm_mm->mmap_sem);
+		up_read(&vmf->vma->vm_mm->mmap_sem);
 		(void) dma_fence_wait(bo->moving, true);
 		ttm_bo_unreserve(bo);
 		ttm_bo_unref(&bo);
@@ -92,8 +91,9 @@ out_unlock:
 	return ret;
 }
 
-static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int ttm_bo_vm_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
 	    vma->vm_private_data;
 	struct ttm_bo_device *bdev = bo->bdev;
@@ -124,7 +124,7 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		if (vmf->flags & FAULT_FLAG_ALLOW_RETRY) {
 			if (!(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
 				ttm_bo_reference(bo);
-				up_read(&vma->vm_mm->mmap_sem);
+				up_read(&vmf->vma->vm_mm->mmap_sem);
 				(void) ttm_bo_wait_unreserved(bo);
 				ttm_bo_unref(&bo);
 			}
@@ -168,7 +168,7 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	 * Wait for buffer data in transit, due to a pipelined
 	 * move.
 	 */
-	ret = ttm_bo_vm_fault_idle(bo, vma, vmf);
+	ret = ttm_bo_vm_fault_idle(bo, vmf);
 	if (unlikely(ret != 0)) {
 		retval = ret;
 
diff --git a/drivers/gpu/drm/udl/udl_drv.h b/drivers/gpu/drm/udl/udl_drv.h
index 6c4286e57362..2a75ab80527a 100644
--- a/drivers/gpu/drm/udl/udl_drv.h
+++ b/drivers/gpu/drm/udl/udl_drv.h
@@ -134,7 +134,7 @@ void udl_gem_put_pages(struct udl_gem_object *obj);
 int udl_gem_vmap(struct udl_gem_object *obj);
 void udl_gem_vunmap(struct udl_gem_object *obj);
 int udl_drm_gem_mmap(struct file *filp, struct vm_area_struct *vma);
-int udl_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
+int udl_gem_fault(struct vm_fault *vmf);
 
 int udl_handle_damage(struct udl_framebuffer *fb, int x, int y,
 		      int width, int height);
diff --git a/drivers/gpu/drm/udl/udl_gem.c b/drivers/gpu/drm/udl/udl_gem.c
index 3c0c4bd3f750..775c50e4f02c 100644
--- a/drivers/gpu/drm/udl/udl_gem.c
+++ b/drivers/gpu/drm/udl/udl_gem.c
@@ -100,8 +100,9 @@ int udl_drm_gem_mmap(struct file *filp, struct vm_area_struct *vma)
 	return ret;
 }
 
-int udl_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+int udl_gem_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct udl_gem_object *obj = to_udl_bo(vma->vm_private_data);
 	struct page *page;
 	unsigned int page_offset;
diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c
index 477e07f0ecb6..7ccbb03e98de 100644
--- a/drivers/gpu/drm/vgem/vgem_drv.c
+++ b/drivers/gpu/drm/vgem/vgem_drv.c
@@ -50,8 +50,9 @@ static void vgem_gem_free_object(struct drm_gem_object *obj)
 	kfree(vgem_obj);
 }
 
-static int vgem_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int vgem_gem_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct drm_vgem_gem_object *obj = vma->vm_private_data;
 	/* We don't use vmf->pgoff since that has the fake offset */
 	unsigned long vaddr = vmf->address;
diff --git a/drivers/gpu/drm/virtio/virtgpu_ttm.c b/drivers/gpu/drm/virtio/virtgpu_ttm.c
index 9cc7079f7aca..70ec8ca8d9b1 100644
--- a/drivers/gpu/drm/virtio/virtgpu_ttm.c
+++ b/drivers/gpu/drm/virtio/virtgpu_ttm.c
@@ -114,18 +114,17 @@ static void virtio_gpu_ttm_global_fini(struct virtio_gpu_device *vgdev)
 static struct vm_operations_struct virtio_gpu_ttm_vm_ops;
 static const struct vm_operations_struct *ttm_vm_ops;
 
-static int virtio_gpu_ttm_fault(struct vm_area_struct *vma,
-				struct vm_fault *vmf)
+static int virtio_gpu_ttm_fault(struct vm_fault *vmf)
 {
 	struct ttm_buffer_object *bo;
 	struct virtio_gpu_device *vgdev;
 	int r;
 
-	bo = (struct ttm_buffer_object *)vma->vm_private_data;
+	bo = (struct ttm_buffer_object *)vmf->vma->vm_private_data;
 	if (bo == NULL)
 		return VM_FAULT_NOPAGE;
 	vgdev = virtio_gpu_get_vgdev(bo->bdev);
-	r = ttm_vm_ops->fault(vma, vmf);
+	r = ttm_vm_ops->fault(vmf);
 	return r;
 }
 #endif
diff --git a/drivers/hsi/clients/cmt_speech.c b/drivers/hsi/clients/cmt_speech.c
index 3deef6cc7d7c..7175e6bedf21 100644
--- a/drivers/hsi/clients/cmt_speech.c
+++ b/drivers/hsi/clients/cmt_speech.c
@@ -1098,9 +1098,9 @@ static void cs_hsi_stop(struct cs_hsi_iface *hi)
 	kfree(hi);
 }
 
-static int cs_char_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int cs_char_vma_fault(struct vm_fault *vmf)
 {
-	struct cs_char *csdata = vma->vm_private_data;
+	struct cs_char *csdata = vmf->vma->vm_private_data;
 	struct page *page;
 
 	page = virt_to_page(csdata->mmap_base);
diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c
index e8d55a153a65..e88afe1a435c 100644
--- a/drivers/hwtracing/intel_th/msu.c
+++ b/drivers/hwtracing/intel_th/msu.c
@@ -1188,9 +1188,9 @@ static void msc_mmap_close(struct vm_area_struct *vma)
 	mutex_unlock(&msc->buf_mutex);
 }
 
-static int msc_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int msc_mmap_fault(struct vm_fault *vmf)
 {
-	struct msc_iter *iter = vma->vm_file->private_data;
+	struct msc_iter *iter = vmf->vma->vm_file->private_data;
 	struct msc *msc = iter->msc;
 
 	vmf->page = msc_buffer_get_page(msc, vmf->pgoff);
@@ -1198,7 +1198,7 @@ static int msc_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		return VM_FAULT_SIGBUS;
 
 	get_page(vmf->page);
-	vmf->page->mapping = vma->vm_file->f_mapping;
+	vmf->page->mapping = vmf->vma->vm_file->f_mapping;
 	vmf->page->index = vmf->pgoff;
 
 	return 0;
diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
index bd786b7bd30b..f46033984d07 100644
--- a/drivers/infiniband/hw/hfi1/file_ops.c
+++ b/drivers/infiniband/hw/hfi1/file_ops.c
@@ -92,7 +92,7 @@ static unsigned int poll_next(struct file *, struct poll_table_struct *);
 static int user_event_ack(struct hfi1_ctxtdata *, int, unsigned long);
 static int set_ctxt_pkey(struct hfi1_ctxtdata *, unsigned, u16);
 static int manage_rcvq(struct hfi1_ctxtdata *, unsigned, int);
-static int vma_fault(struct vm_area_struct *, struct vm_fault *);
+static int vma_fault(struct vm_fault *);
 static long hfi1_file_ioctl(struct file *fp, unsigned int cmd,
 			    unsigned long arg);
 
@@ -695,7 +695,7 @@ done:
  * Local (non-chip) user memory is not mapped right away but as it is
  * accessed by the user-level code.
  */
-static int vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int vma_fault(struct vm_fault *vmf)
 {
 	struct page *page;
 
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
index 2d1eacf1dfed..9396c1807cc3 100644
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -893,7 +893,7 @@ bail:
 /*
  * qib_file_vma_fault - handle a VMA page fault.
  */
-static int qib_file_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int qib_file_vma_fault(struct vm_fault *vmf)
 {
 	struct page *page;
 
diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c
index ba63ca57ed7e..36bd904946bd 100644
--- a/drivers/media/v4l2-core/videobuf-dma-sg.c
+++ b/drivers/media/v4l2-core/videobuf-dma-sg.c
@@ -434,8 +434,9 @@ static void videobuf_vm_close(struct vm_area_struct *vma)
  * now ...).  Bounce buffers don't work very well for the data rates
  * video capture has.
  */
-static int videobuf_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int videobuf_vm_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct page *page;
 
 	dprintk(3, "fault: fault @ %08lx [vma %08lx-%08lx]\n",
diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
index 3907387b6d15..062bf6ca2625 100644
--- a/drivers/misc/cxl/context.c
+++ b/drivers/misc/cxl/context.c
@@ -121,8 +121,9 @@ void cxl_context_set_mapping(struct cxl_context *ctx,
 	mutex_unlock(&ctx->mapping_lock);
 }
 
-static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int cxl_mmap_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct cxl_context *ctx = vma->vm_file->private_data;
 	u64 area, offset;
 
diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c
index af2e077da4b8..3641f1334cf0 100644
--- a/drivers/misc/sgi-gru/grumain.c
+++ b/drivers/misc/sgi-gru/grumain.c
@@ -926,8 +926,9 @@ again:
  *
  * 	Note: gru segments alway mmaped on GRU_GSEG_PAGESIZE boundaries.
  */
-int gru_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+int gru_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct gru_thread_state *gts;
 	unsigned long paddr, vaddr;
 	unsigned long expires;
diff --git a/drivers/misc/sgi-gru/grutables.h b/drivers/misc/sgi-gru/grutables.h
index 5c3ce2459675..b5e308b50ed1 100644
--- a/drivers/misc/sgi-gru/grutables.h
+++ b/drivers/misc/sgi-gru/grutables.h
@@ -665,7 +665,7 @@ extern unsigned long gru_reserve_cb_resources(struct gru_state *gru,
 		int cbr_au_count, char *cbmap);
 extern unsigned long gru_reserve_ds_resources(struct gru_state *gru,
 		int dsr_au_count, char *dsmap);
-extern int gru_fault(struct vm_area_struct *, struct vm_fault *vmf);
+extern int gru_fault(struct vm_fault *vmf);
 extern struct gru_mm_struct *gru_register_mmu_notifier(void);
 extern void gru_drop_mmu_notifier(struct gru_mm_struct *gms);
 
diff --git a/drivers/scsi/cxlflash/superpipe.c b/drivers/scsi/cxlflash/superpipe.c
index 90869cee2b20..ef5bf55f08a4 100644
--- a/drivers/scsi/cxlflash/superpipe.c
+++ b/drivers/scsi/cxlflash/superpipe.c
@@ -1053,7 +1053,6 @@ out:
 
 /**
  * cxlflash_mmap_fault() - mmap fault handler for adapter file descriptor
- * @vma:	VM area associated with mapping.
  * @vmf:	VM fault associated with current fault.
  *
  * To support error notification via MMIO, faults are 'caught' by this routine
@@ -1067,8 +1066,9 @@ out:
  *
  * Return: 0 on success, VM_FAULT_SIGBUS on failure
  */
-static int cxlflash_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int cxlflash_mmap_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct file *file = vma->vm_file;
 	struct cxl_context *ctx = cxl_fops_get_context(file);
 	struct cxlflash_cfg *cfg = container_of(file->f_op, struct cxlflash_cfg,
@@ -1097,7 +1097,7 @@ static int cxlflash_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 	if (likely(!ctxi->err_recovery_active)) {
 		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-		rc = ctxi->cxl_mmap_vmops->fault(vma, vmf);
+		rc = ctxi->cxl_mmap_vmops->fault(vmf);
 	} else {
 		dev_dbg(dev, "%s: err recovery active, use err_page\n",
 			__func__);
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index e831e01f9fa6..29b86505f796 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1185,8 +1185,9 @@ sg_fasync(int fd, struct file *filp, int mode)
 }
 
 static int
-sg_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+sg_vma_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	Sg_fd *sfp;
 	unsigned long offset, len, sa;
 	Sg_scatter_hold *rsv_schp;
diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c
index 969600779e44..2c3ffbcbd621 100644
--- a/drivers/staging/android/ion/ion.c
+++ b/drivers/staging/android/ion/ion.c
@@ -870,9 +870,9 @@ static void ion_buffer_sync_for_device(struct ion_buffer *buffer,
 	mutex_unlock(&buffer->lock);
 }
 
-static int ion_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int ion_vm_fault(struct vm_fault *vmf)
 {
-	struct ion_buffer *buffer = vma->vm_private_data;
+	struct ion_buffer *buffer = vmf->vma->vm_private_data;
 	unsigned long pfn;
 	int ret;
 
@@ -881,7 +881,7 @@ static int ion_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	BUG_ON(!buffer->pages || !buffer->pages[vmf->pgoff]);
 
 	pfn = page_to_pfn(ion_buffer_page(buffer->pages[vmf->pgoff]));
-	ret = vm_insert_pfn(vma, vmf->address, pfn);
+	ret = vm_insert_pfn(vmf->vma, vmf->address, pfn);
 	mutex_unlock(&buffer->lock);
 	if (ret)
 		return VM_FAULT_ERROR;
diff --git a/drivers/staging/lustre/lustre/llite/llite_mmap.c b/drivers/staging/lustre/lustre/llite/llite_mmap.c
index 9afa6bec3e6f..896196c74cd2 100644
--- a/drivers/staging/lustre/lustre/llite/llite_mmap.c
+++ b/drivers/staging/lustre/lustre/llite/llite_mmap.c
@@ -321,7 +321,7 @@ out:
 	return fault_ret;
 }
 
-static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int ll_fault(struct vm_fault *vmf)
 {
 	int count = 0;
 	bool printed = false;
@@ -335,7 +335,7 @@ static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));
 
 restart:
-	result = ll_fault0(vma, vmf);
+	result = ll_fault0(vmf->vma, vmf);
 	LASSERT(!(result & VM_FAULT_LOCKED));
 	if (result == 0) {
 		struct page *vmpage = vmf->page;
@@ -362,8 +362,9 @@ restart:
 	return result;
 }
 
-static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int ll_page_mkwrite(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	int count = 0;
 	bool printed = false;
 	bool retry;
diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c
index 3e9cf710501b..4c57755e06e7 100644
--- a/drivers/staging/lustre/lustre/llite/vvp_io.c
+++ b/drivers/staging/lustre/lustre/llite/vvp_io.c
@@ -1014,7 +1014,7 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
 {
 	struct vm_fault *vmf = cfio->ft_vmf;
 
-	cfio->ft_flags = filemap_fault(cfio->ft_vma, vmf);
+	cfio->ft_flags = filemap_fault(vmf);
 	cfio->ft_flags_valid = 1;
 
 	if (vmf->page) {
diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 8041710b6972..5c1cb2df3a54 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -783,15 +783,15 @@ static int tcmu_find_mem_index(struct vm_area_struct *vma)
 	return -1;
 }
 
-static int tcmu_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int tcmu_vma_fault(struct vm_fault *vmf)
 {
-	struct tcmu_dev *udev = vma->vm_private_data;
+	struct tcmu_dev *udev = vmf->vma->vm_private_data;
 	struct uio_info *info = &udev->uio_info;
 	struct page *page;
 	unsigned long offset;
 	void *addr;
 
-	int mi = tcmu_find_mem_index(vma);
+	int mi = tcmu_find_mem_index(vmf->vma);
 	if (mi < 0)
 		return VM_FAULT_SIGBUS;
 
diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c
index fba021f5736a..31d95dc9c202 100644
--- a/drivers/uio/uio.c
+++ b/drivers/uio/uio.c
@@ -597,14 +597,14 @@ static int uio_find_mem_index(struct vm_area_struct *vma)
 	return -1;
 }
 
-static int uio_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int uio_vma_fault(struct vm_fault *vmf)
 {
-	struct uio_device *idev = vma->vm_private_data;
+	struct uio_device *idev = vmf->vma->vm_private_data;
 	struct page *page;
 	unsigned long offset;
 	void *addr;
 
-	int mi = uio_find_mem_index(vma);
+	int mi = uio_find_mem_index(vmf->vma);
 	if (mi < 0)
 		return VM_FAULT_SIGBUS;
 
diff --git a/drivers/usb/mon/mon_bin.c b/drivers/usb/mon/mon_bin.c
index 91c22276c03b..9fb8b1e6ecc2 100644
--- a/drivers/usb/mon/mon_bin.c
+++ b/drivers/usb/mon/mon_bin.c
@@ -1223,9 +1223,9 @@ static void mon_bin_vma_close(struct vm_area_struct *vma)
 /*
  * Map ring pages to user space.
  */
-static int mon_bin_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int mon_bin_vma_fault(struct vm_fault *vmf)
 {
-	struct mon_reader_bin *rp = vma->vm_private_data;
+	struct mon_reader_bin *rp = vmf->vma->vm_private_data;
 	unsigned long offset, chunk_idx;
 	struct page *pageptr;
 
diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c
index 74b5bcac8bf2..37f69c061210 100644
--- a/drivers/video/fbdev/core/fb_defio.c
+++ b/drivers/video/fbdev/core/fb_defio.c
@@ -37,12 +37,11 @@ static struct page *fb_deferred_io_page(struct fb_info *info, unsigned long offs
 }
 
 /* this is to find and return the vmalloc-ed fb pages */
-static int fb_deferred_io_fault(struct vm_area_struct *vma,
-				struct vm_fault *vmf)
+static int fb_deferred_io_fault(struct vm_fault *vmf)
 {
 	unsigned long offset;
 	struct page *page;
-	struct fb_info *info = vma->vm_private_data;
+	struct fb_info *info = vmf->vma->vm_private_data;
 
 	offset = vmf->pgoff << PAGE_SHIFT;
 	if (offset >= info->fix.smem_len)
@@ -54,8 +53,8 @@ static int fb_deferred_io_fault(struct vm_area_struct *vma,
 
 	get_page(page);
 
-	if (vma->vm_file)
-		page->mapping = vma->vm_file->f_mapping;
+	if (vmf->vma->vm_file)
+		page->mapping = vmf->vma->vm_file->f_mapping;
 	else
 		printk(KERN_ERR "no mapping available\n");
 
@@ -91,11 +90,10 @@ int fb_deferred_io_fsync(struct file *file, loff_t start, loff_t end, int datasy
 EXPORT_SYMBOL_GPL(fb_deferred_io_fsync);
 
 /* vm_ops->page_mkwrite handler */
-static int fb_deferred_io_mkwrite(struct vm_area_struct *vma,
-				  struct vm_fault *vmf)
+static int fb_deferred_io_mkwrite(struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
-	struct fb_info *info = vma->vm_private_data;
+	struct fb_info *info = vmf->vma->vm_private_data;
 	struct fb_deferred_io *fbdefio = info->fbdefio;
 	struct page *cur;
 
@@ -105,7 +103,7 @@ static int fb_deferred_io_mkwrite(struct vm_area_struct *vma,
 	deferred framebuffer IO. then if userspace touches a page
 	again, we repeat the same scheme */
 
-	file_update_time(vma->vm_file);
+	file_update_time(vmf->vma->vm_file);
 
 	/* protect against the workqueue changing the page list */
 	mutex_lock(&fbdefio->lock);
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index 2077a3ac7c0c..7a92a5e1d40c 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -804,10 +804,10 @@ static void privcmd_close(struct vm_area_struct *vma)
 	kfree(pages);
 }
 
-static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int privcmd_fault(struct vm_fault *vmf)
 {
 	printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
-	       vma, vma->vm_start, vma->vm_end,
+	       vmf->vma, vmf->vma->vm_start, vmf->vma->vm_end,
 	       vmf->pgoff, (void *)vmf->address);
 
 	return VM_FAULT_SIGBUS;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 6a0f3fa85ef7..3de3b4a89d89 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -534,11 +534,11 @@ v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma)
 }
 
 static int
-v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+v9fs_vm_page_mkwrite(struct vm_fault *vmf)
 {
 	struct v9fs_inode *v9inode;
 	struct page *page = vmf->page;
-	struct file *filp = vma->vm_file;
+	struct file *filp = vmf->vma->vm_file;
 	struct inode *inode = file_inode(filp);
 
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6a823719b6c5..adf16307842a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3147,7 +3147,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 			 size_t size, struct bio *bio,
 			 unsigned long bio_flags);
-int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
+int btrfs_page_mkwrite(struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_evict_inode(struct inode *inode);
 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1e861a063721..ea1d500cfba6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8964,10 +8964,10 @@ again:
  * beyond EOF, then the page is guaranteed safe against truncation until we
  * unlock the page.
  */
-int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+int btrfs_page_mkwrite(struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
-	struct inode *inode = file_inode(vma->vm_file);
+	struct inode *inode = file_inode(vmf->vma->vm_file);
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_ordered_extent *ordered;
@@ -9000,7 +9000,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	ret = btrfs_delalloc_reserve_space(inode, page_start,
 					   reserved_space);
 	if (!ret) {
-		ret = file_update_time(vma->vm_file);
+		ret = file_update_time(vmf->vma->vm_file);
 		reserved = 1;
 	}
 	if (ret) {
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index e4b066cd912a..09860c0ec7c1 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1386,8 +1386,9 @@ static void ceph_restore_sigs(sigset_t *oldset)
 /*
  * vm ops
  */
-static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int ceph_filemap_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct inode *inode = file_inode(vma->vm_file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_file_info *fi = vma->vm_file->private_data;
@@ -1416,7 +1417,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
 	    ci->i_inline_version == CEPH_INLINE_NONE) {
 		current->journal_info = vma->vm_file;
-		ret = filemap_fault(vma, vmf);
+		ret = filemap_fault(vmf);
 		current->journal_info = NULL;
 	} else
 		ret = -EAGAIN;
@@ -1477,8 +1478,9 @@ out_restore:
 /*
  * Reuse write_begin here for simplicity.
  */
-static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int ceph_page_mkwrite(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct inode *inode = file_inode(vma->vm_file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_file_info *fi = vma->vm_file->private_data;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 98dc842e7245..aa3debbba826 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3282,7 +3282,7 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset)
  * sure that it doesn't change while being written back.
  */
 static int
-cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+cifs_page_mkwrite(struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
 
diff --git a/fs/dax.c b/fs/dax.c
index 3f1181563fb1..f955c0df33bb 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -925,12 +925,11 @@ static int dax_insert_mapping(struct address_space *mapping,
 
 /**
  * dax_pfn_mkwrite - handle first write to DAX page
- * @vma: The virtual memory area where the fault occurred
  * @vmf: The description of the fault
  */
-int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+int dax_pfn_mkwrite(struct vm_fault *vmf)
 {
-	struct file *file = vma->vm_file;
+	struct file *file = vmf->vma->vm_file;
 	struct address_space *mapping = file->f_mapping;
 	void *entry, **slot;
 	pgoff_t index = vmf->pgoff;
@@ -1121,7 +1120,6 @@ static int dax_fault_return(int error)
 
 /**
  * dax_iomap_fault - handle a page fault on a DAX file
- * @vma: The virtual memory area where the fault occurred
  * @vmf: The description of the fault
  * @ops: iomap ops passed from the file system
  *
@@ -1129,10 +1127,9 @@ static int dax_fault_return(int error)
  * or mkwrite handler for DAX files. Assumes the caller has done all the
  * necessary locking for the page fault to proceed successfully.
  */
-int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-			const struct iomap_ops *ops)
+int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops)
 {
-	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
 	unsigned long vaddr = vmf->address;
 	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
@@ -1205,11 +1202,11 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	case IOMAP_MAPPED:
 		if (iomap.flags & IOMAP_F_NEW) {
 			count_vm_event(PGMAJFAULT);
-			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+			mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT);
 			major = VM_FAULT_MAJOR;
 		}
 		error = dax_insert_mapping(mapping, iomap.bdev, sector,
-				PAGE_SIZE, &entry, vma, vmf);
+				PAGE_SIZE, &entry, vmf->vma, vmf);
 		/* -EBUSY is fine, somebody else faulted on the same PTE */
 		if (error == -EBUSY)
 			error = 0;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index b0f241528a30..0bf0d971205a 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -87,19 +87,19 @@ out_unlock:
  * The default page_lock and i_size verification done by non-DAX fault paths
  * is sufficient because ext2 doesn't support hole punching.
  */
-static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int ext2_dax_fault(struct vm_fault *vmf)
 {
-	struct inode *inode = file_inode(vma->vm_file);
+	struct inode *inode = file_inode(vmf->vma->vm_file);
 	struct ext2_inode_info *ei = EXT2_I(inode);
 	int ret;
 
 	if (vmf->flags & FAULT_FLAG_WRITE) {
 		sb_start_pagefault(inode->i_sb);
-		file_update_time(vma->vm_file);
+		file_update_time(vmf->vma->vm_file);
 	}
 	down_read(&ei->dax_sem);
 
-	ret = dax_iomap_fault(vma, vmf, &ext2_iomap_ops);
+	ret = dax_iomap_fault(vmf, &ext2_iomap_ops);
 
 	up_read(&ei->dax_sem);
 	if (vmf->flags & FAULT_FLAG_WRITE)
@@ -107,16 +107,15 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	return ret;
 }
 
-static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
-		struct vm_fault *vmf)
+static int ext2_dax_pfn_mkwrite(struct vm_fault *vmf)
 {
-	struct inode *inode = file_inode(vma->vm_file);
+	struct inode *inode = file_inode(vmf->vma->vm_file);
 	struct ext2_inode_info *ei = EXT2_I(inode);
 	loff_t size;
 	int ret;
 
 	sb_start_pagefault(inode->i_sb);
-	file_update_time(vma->vm_file);
+	file_update_time(vmf->vma->vm_file);
 	down_read(&ei->dax_sem);
 
 	/* check that the faulting page hasn't raced with truncate */
@@ -124,7 +123,7 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
 	if (vmf->pgoff >= size)
 		ret = VM_FAULT_SIGBUS;
 	else
-		ret = dax_pfn_mkwrite(vma, vmf);
+		ret = dax_pfn_mkwrite(vmf);
 
 	up_read(&ei->dax_sem);
 	sb_end_pagefault(inode->i_sb);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index cee23b684f47..2fd17e8e4984 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2483,8 +2483,8 @@ extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
 			     loff_t lstart, loff_t lend);
-extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
-extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
+extern int ext4_page_mkwrite(struct vm_fault *vmf);
+extern int ext4_filemap_fault(struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
 extern void ext4_da_update_reserve_space(struct inode *inode,
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 13021a054fc0..21e1f17fe36d 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -253,19 +253,19 @@ out:
 }
 
 #ifdef CONFIG_FS_DAX
-static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int ext4_dax_fault(struct vm_fault *vmf)
 {
 	int result;
-	struct inode *inode = file_inode(vma->vm_file);
+	struct inode *inode = file_inode(vmf->vma->vm_file);
 	struct super_block *sb = inode->i_sb;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
 
 	if (write) {
 		sb_start_pagefault(sb);
-		file_update_time(vma->vm_file);
+		file_update_time(vmf->vma->vm_file);
 	}
 	down_read(&EXT4_I(inode)->i_mmap_sem);
-	result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops);
+	result = dax_iomap_fault(vmf, &ext4_iomap_ops);
 	up_read(&EXT4_I(inode)->i_mmap_sem);
 	if (write)
 		sb_end_pagefault(sb);
@@ -303,22 +303,21 @@ ext4_dax_pmd_fault(struct vm_fault *vmf)
  * wp_pfn_shared() fails. Thus fault gets retried and things work out as
  * desired.
  */
-static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma,
-				struct vm_fault *vmf)
+static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf)
 {
-	struct inode *inode = file_inode(vma->vm_file);
+	struct inode *inode = file_inode(vmf->vma->vm_file);
 	struct super_block *sb = inode->i_sb;
 	loff_t size;
 	int ret;
 
 	sb_start_pagefault(sb);
-	file_update_time(vma->vm_file);
+	file_update_time(vmf->vma->vm_file);
 	down_read(&EXT4_I(inode)->i_mmap_sem);
 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	if (vmf->pgoff >= size)
 		ret = VM_FAULT_SIGBUS;
 	else
-		ret = dax_pfn_mkwrite(vma, vmf);
+		ret = dax_pfn_mkwrite(vmf);
 	up_read(&EXT4_I(inode)->i_mmap_sem);
 	sb_end_pagefault(sb);
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 75212a6e69f8..41d8e53e5a7f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5821,8 +5821,9 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
 	return !buffer_mapped(bh);
 }
 
-int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+int ext4_page_mkwrite(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct page *page = vmf->page;
 	loff_t size;
 	unsigned long len;
@@ -5912,13 +5913,13 @@ out:
 	return ret;
 }
 
-int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+int ext4_filemap_fault(struct vm_fault *vmf)
 {
-	struct inode *inode = file_inode(vma->vm_file);
+	struct inode *inode = file_inode(vmf->vma->vm_file);
 	int err;
 
 	down_read(&EXT4_I(inode)->i_mmap_sem);
-	err = filemap_fault(vma, vmf);
+	err = filemap_fault(vmf);
 	up_read(&EXT4_I(inode)->i_mmap_sem);
 
 	return err;
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 49f10dce817d..1edc86e874e3 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -32,11 +32,10 @@
 #include "trace.h"
 #include <trace/events/f2fs.h>
 
-static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
-						struct vm_fault *vmf)
+static int f2fs_vm_page_mkwrite(struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
-	struct inode *inode = file_inode(vma->vm_file);
+	struct inode *inode = file_inode(vmf->vma->vm_file);
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct dnode_of_data dn;
 	int err;
@@ -58,7 +57,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
 
 	f2fs_balance_fs(sbi, dn.node_changed);
 
-	file_update_time(vma->vm_file);
+	file_update_time(vmf->vma->vm_file);
 	lock_page(page);
 	if (unlikely(page->mapping != inode->i_mapping ||
 			page_offset(page) > i_size_read(inode) ||
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 2401c5dabb2a..e80bfd06daf5 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2043,12 +2043,12 @@ static void fuse_vma_close(struct vm_area_struct *vma)
  * - sync(2)
  * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
  */
-static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int fuse_page_mkwrite(struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
-	struct inode *inode = file_inode(vma->vm_file);
+	struct inode *inode = file_inode(vmf->vma->vm_file);
 
-	file_update_time(vma->vm_file);
+	file_update_time(vmf->vma->vm_file);
 	lock_page(page);
 	if (page->mapping != inode->i_mapping) {
 		unlock_page(page);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 016c11eaca7c..6fe2a59c6a9a 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -379,10 +379,10 @@ static int gfs2_allocate_page_backing(struct page *page)
  * blocks allocated on disk to back that page.
  */
 
-static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int gfs2_page_mkwrite(struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
-	struct inode *inode = file_inode(vma->vm_file);
+	struct inode *inode = file_inode(vmf->vma->vm_file);
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct gfs2_alloc_parms ap = { .aflags = 0, };
@@ -399,7 +399,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (ret)
 		goto out;
 
-	gfs2_size_hint(vma->vm_file, pos, PAGE_SIZE);
+	gfs2_size_hint(vmf->vma->vm_file, pos, PAGE_SIZE);
 
 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
 	ret = gfs2_glock_nq(&gh);
@@ -407,7 +407,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		goto out_uninit;
 
 	/* Update file times before taking page lock */
-	file_update_time(vma->vm_file);
+	file_update_time(vmf->vma->vm_file);
 
 	set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
 	set_bit(GIF_SW_PAGED, &ip->i_flags);
diff --git a/fs/iomap.c b/fs/iomap.c
index d89f70bbb952..d209f42cdcb8 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -445,11 +445,10 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
 	return length;
 }
 
-int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
-		const struct iomap_ops *ops)
+int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
 {
 	struct page *page = vmf->page;
-	struct inode *inode = file_inode(vma->vm_file);
+	struct inode *inode = file_inode(vmf->vma->vm_file);
 	unsigned long length;
 	loff_t offset, size;
 	ssize_t ret;
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 78219d5644e9..4f0535890b30 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -348,9 +348,9 @@ static void kernfs_vma_open(struct vm_area_struct *vma)
 	kernfs_put_active(of->kn);
 }
 
-static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int kernfs_vma_fault(struct vm_fault *vmf)
 {
-	struct file *file = vma->vm_file;
+	struct file *file = vmf->vma->vm_file;
 	struct kernfs_open_file *of = kernfs_of(file);
 	int ret;
 
@@ -362,16 +362,15 @@ static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 	ret = VM_FAULT_SIGBUS;
 	if (of->vm_ops->fault)
-		ret = of->vm_ops->fault(vma, vmf);
+		ret = of->vm_ops->fault(vmf);
 
 	kernfs_put_active(of->kn);
 	return ret;
 }
 
-static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma,
-				   struct vm_fault *vmf)
+static int kernfs_vma_page_mkwrite(struct vm_fault *vmf)
 {
-	struct file *file = vma->vm_file;
+	struct file *file = vmf->vma->vm_file;
 	struct kernfs_open_file *of = kernfs_of(file);
 	int ret;
 
@@ -383,7 +382,7 @@ static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma,
 
 	ret = 0;
 	if (of->vm_ops->page_mkwrite)
-		ret = of->vm_ops->page_mkwrite(vma, vmf);
+		ret = of->vm_ops->page_mkwrite(vmf);
 	else
 		file_update_time(file);
 
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 39f57bef8531..0c3905e0542e 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -27,10 +27,9 @@
  * XXX: how are we excluding truncate/invalidate here? Maybe need to lock
  * page?
  */
-static int ncp_file_mmap_fault(struct vm_area_struct *area,
-					struct vm_fault *vmf)
+static int ncp_file_mmap_fault(struct vm_fault *vmf)
 {
-	struct inode *inode = file_inode(area->vm_file);
+	struct inode *inode = file_inode(vmf->vma->vm_file);
 	char *pg_addr;
 	unsigned int already_read;
 	unsigned int count;
@@ -90,7 +89,7 @@ static int ncp_file_mmap_fault(struct vm_area_struct *area,
 	 * -- nyc
 	 */
 	count_vm_event(PGMAJFAULT);
-	mem_cgroup_count_vm_event(area->vm_mm, PGMAJFAULT);
+	mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT);
 	return VM_FAULT_MAJOR;
 }
 
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 26dbe8b0c10d..668213984d68 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -528,10 +528,10 @@ const struct address_space_operations nfs_file_aops = {
  * writable, implying that someone is about to modify the page through a
  * shared-writable mapping
  */
-static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int nfs_vm_page_mkwrite(struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
-	struct file *filp = vma->vm_file;
+	struct file *filp = vmf->vma->vm_file;
 	struct inode *inode = file_inode(filp);
 	unsigned pagelen;
 	int ret = VM_FAULT_NOPAGE;
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 547381f3ce13..c5fa3dee72fc 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -51,8 +51,9 @@ int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	return err;
 }
 
-static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int nilfs_page_mkwrite(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct page *page = vmf->page;
 	struct inode *inode = file_inode(vma->vm_file);
 	struct nilfs_transaction_info ti;
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 429088786e93..098f5c712569 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -44,17 +44,18 @@
 #include "ocfs2_trace.h"
 
 
-static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
+static int ocfs2_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	sigset_t oldset;
 	int ret;
 
 	ocfs2_block_signals(&oldset);
-	ret = filemap_fault(area, vmf);
+	ret = filemap_fault(vmf);
 	ocfs2_unblock_signals(&oldset);
 
-	trace_ocfs2_fault(OCFS2_I(area->vm_file->f_mapping->host)->ip_blkno,
-			  area, vmf->page, vmf->pgoff);
+	trace_ocfs2_fault(OCFS2_I(vma->vm_file->f_mapping->host)->ip_blkno,
+			  vma, vmf->page, vmf->pgoff);
 	return ret;
 }
 
@@ -127,10 +128,10 @@ out:
 	return ret;
 }
 
-static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int ocfs2_page_mkwrite(struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
-	struct inode *inode = file_inode(vma->vm_file);
+	struct inode *inode = file_inode(vmf->vma->vm_file);
 	struct buffer_head *di_bh = NULL;
 	sigset_t oldset;
 	int ret;
@@ -160,7 +161,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	 */
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 
-	ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page);
+	ret = __ocfs2_page_mkwrite(vmf->vma->vm_file, di_bh, page);
 
 	up_write(&OCFS2_I(inode)->ip_alloc_sem);
 
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 5105b1599981..f52d8e857ff7 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -265,10 +265,10 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
  * On s390 the fault handler is used for memory regions that can't be mapped
  * directly with remap_pfn_range().
  */
-static int mmap_vmcore_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int mmap_vmcore_fault(struct vm_fault *vmf)
 {
 #ifdef CONFIG_S390
-	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	pgoff_t index = vmf->pgoff;
 	struct page *page;
 	loff_t offset;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index b0d783774c96..d9ae86f96df7 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1506,11 +1506,10 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
  * mmap()d file has taken write protection fault and is being made writable.
  * UBIFS must ensure page is budgeted for.
  */
-static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma,
-				 struct vm_fault *vmf)
+static int ubifs_vm_page_mkwrite(struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
-	struct inode *inode = file_inode(vma->vm_file);
+	struct inode *inode = file_inode(vmf->vma->vm_file);
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
 	struct timespec now = ubifs_current_time(inode);
 	struct ubifs_budget_req req = { .new_page = 1 };
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 022014016d80..9cc10136ba0b 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1379,22 +1379,21 @@ xfs_file_llseek(
  */
 STATIC int
 xfs_filemap_page_mkwrite(
-	struct vm_area_struct	*vma,
 	struct vm_fault		*vmf)
 {
-	struct inode		*inode = file_inode(vma->vm_file);
+	struct inode		*inode = file_inode(vmf->vma->vm_file);
 	int			ret;
 
 	trace_xfs_filemap_page_mkwrite(XFS_I(inode));
 
 	sb_start_pagefault(inode->i_sb);
-	file_update_time(vma->vm_file);
+	file_update_time(vmf->vma->vm_file);
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	if (IS_DAX(inode)) {
-		ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
+		ret = dax_iomap_fault(vmf, &xfs_iomap_ops);
 	} else {
-		ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
+		ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
 		ret = block_page_mkwrite_return(ret);
 	}
 
@@ -1406,23 +1405,22 @@ xfs_filemap_page_mkwrite(
 
 STATIC int
 xfs_filemap_fault(
-	struct vm_area_struct	*vma,
 	struct vm_fault		*vmf)
 {
-	struct inode		*inode = file_inode(vma->vm_file);
+	struct inode		*inode = file_inode(vmf->vma->vm_file);
 	int			ret;
 
 	trace_xfs_filemap_fault(XFS_I(inode));
 
 	/* DAX can shortcut the normal fault path on write faults! */
 	if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode))
-		return xfs_filemap_page_mkwrite(vma, vmf);
+		return xfs_filemap_page_mkwrite(vmf);
 
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 	if (IS_DAX(inode))
-		ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
+		ret = dax_iomap_fault(vmf, &xfs_iomap_ops);
 	else
-		ret = filemap_fault(vma, vmf);
+		ret = filemap_fault(vmf);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	return ret;
@@ -1471,11 +1469,10 @@ xfs_filemap_pmd_fault(
  */
 static int
 xfs_filemap_pfn_mkwrite(
-	struct vm_area_struct	*vma,
 	struct vm_fault		*vmf)
 {
 
-	struct inode		*inode = file_inode(vma->vm_file);
+	struct inode		*inode = file_inode(vmf->vma->vm_file);
 	struct xfs_inode	*ip = XFS_I(inode);
 	int			ret = VM_FAULT_NOPAGE;
 	loff_t			size;
@@ -1483,7 +1480,7 @@ xfs_filemap_pfn_mkwrite(
 	trace_xfs_filemap_pfn_mkwrite(ip);
 
 	sb_start_pagefault(inode->i_sb);
-	file_update_time(vma->vm_file);
+	file_update_time(vmf->vma->vm_file);
 
 	/* check if the faulting page hasn't raced with truncate */
 	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
@@ -1491,7 +1488,7 @@ xfs_filemap_pfn_mkwrite(
 	if (vmf->pgoff >= size)
 		ret = VM_FAULT_SIGBUS;
 	else if (IS_DAX(inode))
-		ret = dax_pfn_mkwrite(vma, vmf);
+		ret = dax_pfn_mkwrite(vmf);
 	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
 	sb_end_pagefault(inode->i_sb);
 	return ret;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 1e77ff5818f1..eeb02421c848 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -38,8 +38,7 @@ static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
 
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops);
-int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-			const struct iomap_ops *ops);
+int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops);
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index);
 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
@@ -83,7 +82,7 @@ static inline int dax_iomap_pmd_fault(struct vm_fault *vmf,
 	return VM_FAULT_FALLBACK;
 }
 #endif
-int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
+int dax_pfn_mkwrite(struct vm_fault *vmf);
 
 static inline bool vma_is_dax(struct vm_area_struct *vma)
 {
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 891459caa278..7291810067eb 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -79,8 +79,7 @@ int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
 		bool *did_zero, const struct iomap_ops *ops);
 int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
 		const struct iomap_ops *ops);
-int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
-		const struct iomap_ops *ops);
+int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops);
 int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		loff_t start, loff_t len, const struct iomap_ops *ops);
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 574bc157a27c..3dd80ba6568a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -350,17 +350,17 @@ struct vm_operations_struct {
 	void (*open)(struct vm_area_struct * area);
 	void (*close)(struct vm_area_struct * area);
 	int (*mremap)(struct vm_area_struct * area);
-	int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
+	int (*fault)(struct vm_fault *vmf);
 	int (*pmd_fault)(struct vm_fault *vmf);
 	void (*map_pages)(struct vm_fault *vmf,
 			pgoff_t start_pgoff, pgoff_t end_pgoff);
 
 	/* notification that a previously read-only page is about to become
 	 * writable, if an error is returned it will cause a SIGBUS */
-	int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf);
+	int (*page_mkwrite)(struct vm_fault *vmf);
 
 	/* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */
-	int (*pfn_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf);
+	int (*pfn_mkwrite)(struct vm_fault *vmf);
 
 	/* called by access_process_vm when get_user_pages() fails, typically
 	 * for use by special VMAs that can switch between memory and hardware
@@ -2124,10 +2124,10 @@ extern void truncate_inode_pages_range(struct address_space *,
 extern void truncate_inode_pages_final(struct address_space *);
 
 /* generic vm_area_ops exported for stackable file systems */
-extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
+extern int filemap_fault(struct vm_fault *vmf);
 extern void filemap_map_pages(struct vm_fault *vmf,
 		pgoff_t start_pgoff, pgoff_t end_pgoff);
-extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
+extern int filemap_page_mkwrite(struct vm_fault *vmf);
 
 /* mm/page-writeback.c */
 int write_one_page(struct page *page, int wait);
diff --git a/ipc/shm.c b/ipc/shm.c
index 81203e8ba013..7f6537b84ef5 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -374,12 +374,12 @@ void exit_shm(struct task_struct *task)
 	up_write(&shm_ids(ns).rwsem);
 }
 
-static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int shm_fault(struct vm_fault *vmf)
 {
-	struct file *file = vma->vm_file;
+	struct file *file = vmf->vma->vm_file;
 	struct shm_file_data *sfd = shm_file_data(file);
 
-	return sfd->vm_ops->fault(vma, vmf);
+	return sfd->vm_ops->fault(vmf);
 }
 
 #ifdef CONFIG_NUMA
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 77a932b54a64..b2eb3542e829 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4925,9 +4925,9 @@ unlock:
 	rcu_read_unlock();
 }
 
-static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int perf_mmap_fault(struct vm_fault *vmf)
 {
-	struct perf_event *event = vma->vm_file->private_data;
+	struct perf_event *event = vmf->vma->vm_file->private_data;
 	struct ring_buffer *rb;
 	int ret = VM_FAULT_SIGBUS;
 
@@ -4950,7 +4950,7 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		goto unlock;
 
 	get_page(vmf->page);
-	vmf->page->mapping = vma->vm_file->f_mapping;
+	vmf->page->mapping = vmf->vma->vm_file->f_mapping;
 	vmf->page->index   = vmf->pgoff;
 
 	ret = 0;
diff --git a/kernel/relay.c b/kernel/relay.c
index 8f18d314a96a..8f8dc91db680 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -39,10 +39,10 @@ static void relay_file_mmap_close(struct vm_area_struct *vma)
 /*
  * fault() vm_op implementation for relay file mapping.
  */
-static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int relay_buf_fault(struct vm_fault *vmf)
 {
 	struct page *page;
-	struct rchan_buf *buf = vma->vm_private_data;
+	struct rchan_buf *buf = vmf->vma->vm_private_data;
 	pgoff_t pgoff = vmf->pgoff;
 
 	if (!buf)
diff --git a/mm/filemap.c b/mm/filemap.c
index 416d563468a3..2ba46f410c7c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2169,7 +2169,6 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma,
 
 /**
  * filemap_fault - read in file data for page fault handling
- * @vma:	vma in which the fault was taken
  * @vmf:	struct vm_fault containing details of the fault
  *
  * filemap_fault() is invoked via the vma operations vector for a
@@ -2191,10 +2190,10 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma,
  *
  * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
  */
-int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+int filemap_fault(struct vm_fault *vmf)
 {
 	int error;
-	struct file *file = vma->vm_file;
+	struct file *file = vmf->vma->vm_file;
 	struct address_space *mapping = file->f_mapping;
 	struct file_ra_state *ra = &file->f_ra;
 	struct inode *inode = mapping->host;
@@ -2216,12 +2215,12 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		 * We found the page, so try async readahead before
 		 * waiting for the lock.
 		 */
-		do_async_mmap_readahead(vma, ra, file, page, offset);
+		do_async_mmap_readahead(vmf->vma, ra, file, page, offset);
 	} else if (!page) {
 		/* No page in the page cache at all */
-		do_sync_mmap_readahead(vma, ra, file, offset);
+		do_sync_mmap_readahead(vmf->vma, ra, file, offset);
 		count_vm_event(PGMAJFAULT);
-		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+		mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT);
 		ret = VM_FAULT_MAJOR;
 retry_find:
 		page = find_get_page(mapping, offset);
@@ -2229,7 +2228,7 @@ retry_find:
 			goto no_cached_page;
 	}
 
-	if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
+	if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) {
 		put_page(page);
 		return ret | VM_FAULT_RETRY;
 	}
@@ -2396,14 +2395,14 @@ next:
 }
 EXPORT_SYMBOL(filemap_map_pages);
 
-int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+int filemap_page_mkwrite(struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
-	struct inode *inode = file_inode(vma->vm_file);
+	struct inode *inode = file_inode(vmf->vma->vm_file);
 	int ret = VM_FAULT_LOCKED;
 
 	sb_start_pagefault(inode->i_sb);
-	file_update_time(vma->vm_file);
+	file_update_time(vmf->vma->vm_file);
 	lock_page(page);
 	if (page->mapping != inode->i_mapping) {
 		unlock_page(page);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 30e7709a5121..167fd0722c15 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3142,7 +3142,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
  * hugegpage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
  * this far.
  */
-static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int hugetlb_vm_op_fault(struct vm_fault *vmf)
 {
 	BUG();
 	return 0;
diff --git a/mm/memory.c b/mm/memory.c
index 7663068a33c6..cf97d88158cd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2035,7 +2035,7 @@ static int do_page_mkwrite(struct vm_fault *vmf)
 
 	vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
 
-	ret = vmf->vma->vm_ops->page_mkwrite(vmf->vma, vmf);
+	ret = vmf->vma->vm_ops->page_mkwrite(vmf);
 	/* Restore original flags so that caller is not surprised */
 	vmf->flags = old_flags;
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
@@ -2307,7 +2307,7 @@ static int wp_pfn_shared(struct vm_fault *vmf)
 
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
 		vmf->flags |= FAULT_FLAG_MKWRITE;
-		ret = vma->vm_ops->pfn_mkwrite(vma, vmf);
+		ret = vma->vm_ops->pfn_mkwrite(vmf);
 		if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
 			return ret;
 		return finish_mkwrite_fault(vmf);
@@ -2861,7 +2861,7 @@ static int __do_fault(struct vm_fault *vmf)
 	struct vm_area_struct *vma = vmf->vma;
 	int ret;
 
-	ret = vma->vm_ops->fault(vma, vmf);
+	ret = vma->vm_ops->fault(vmf);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
 			    VM_FAULT_DONE_COW)))
 		return ret;
diff --git a/mm/mmap.c b/mm/mmap.c
index b729084eea90..1cd70010edf0 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3125,8 +3125,7 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
 		mm->data_vm += npages;
 }
 
-static int special_mapping_fault(struct vm_area_struct *vma,
-				 struct vm_fault *vmf);
+static int special_mapping_fault(struct vm_fault *vmf);
 
 /*
  * Having a close hook prevents vma merging regardless of flags.
@@ -3161,9 +3160,9 @@ static const struct vm_operations_struct legacy_special_mapping_vmops = {
 	.fault = special_mapping_fault,
 };
 
-static int special_mapping_fault(struct vm_area_struct *vma,
-				struct vm_fault *vmf)
+static int special_mapping_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	pgoff_t pgoff;
 	struct page **pages;
 
@@ -3173,7 +3172,7 @@ static int special_mapping_fault(struct vm_area_struct *vma,
 		struct vm_special_mapping *sm = vma->vm_private_data;
 
 		if (sm->fault)
-			return sm->fault(sm, vma, vmf);
+			return sm->fault(sm, vmf->vma, vmf);
 
 		pages = sm->pages;
 	}
diff --git a/mm/nommu.c b/mm/nommu.c
index bc964c26be8c..62600ba0d1d8 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1794,7 +1794,7 @@ void unmap_mapping_range(struct address_space *mapping,
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
-int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+int filemap_fault(struct vm_fault *vmf)
 {
 	BUG();
 	return 0;
diff --git a/mm/shmem.c b/mm/shmem.c
index 9c6d22ff44e2..f7f2330c6cb6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1908,8 +1908,9 @@ static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int sync
 	return ret;
 }
 
-static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int shmem_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct inode *inode = file_inode(vma->vm_file);
 	gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
 	enum sgp_type sgp;
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index c354807381c1..c9e8a9898ce4 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -424,10 +424,9 @@ out:
 	return ret;
 }
 
-static int sel_mmap_policy_fault(struct vm_area_struct *vma,
-				 struct vm_fault *vmf)
+static int sel_mmap_policy_fault(struct vm_fault *vmf)
 {
-	struct policy_load_memory *plm = vma->vm_file->private_data;
+	struct policy_load_memory *plm = vmf->vma->vm_file->private_data;
 	unsigned long offset;
 	struct page *page;
 
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 9d33c1e85c79..aec9c92250fd 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -3245,10 +3245,9 @@ static unsigned int snd_pcm_capture_poll(struct file *file, poll_table * wait)
 /*
  * mmap status record
  */
-static int snd_pcm_mmap_status_fault(struct vm_area_struct *area,
-						struct vm_fault *vmf)
+static int snd_pcm_mmap_status_fault(struct vm_fault *vmf)
 {
-	struct snd_pcm_substream *substream = area->vm_private_data;
+	struct snd_pcm_substream *substream = vmf->vma->vm_private_data;
 	struct snd_pcm_runtime *runtime;
 	
 	if (substream == NULL)
@@ -3282,10 +3281,9 @@ static int snd_pcm_mmap_status(struct snd_pcm_substream *substream, struct file
 /*
  * mmap control record
  */
-static int snd_pcm_mmap_control_fault(struct vm_area_struct *area,
-						struct vm_fault *vmf)
+static int snd_pcm_mmap_control_fault(struct vm_fault *vmf)
 {
-	struct snd_pcm_substream *substream = area->vm_private_data;
+	struct snd_pcm_substream *substream = vmf->vma->vm_private_data;
 	struct snd_pcm_runtime *runtime;
 	
 	if (substream == NULL)
@@ -3341,10 +3339,9 @@ snd_pcm_default_page_ops(struct snd_pcm_substream *substream, unsigned long ofs)
 /*
  * fault callback for mmapping a RAM page
  */
-static int snd_pcm_mmap_data_fault(struct vm_area_struct *area,
-						struct vm_fault *vmf)
+static int snd_pcm_mmap_data_fault(struct vm_fault *vmf)
 {
-	struct snd_pcm_substream *substream = area->vm_private_data;
+	struct snd_pcm_substream *substream = vmf->vma->vm_private_data;
 	struct snd_pcm_runtime *runtime;
 	unsigned long offset;
 	struct page * page;
diff --git a/sound/usb/usx2y/us122l.c b/sound/usb/usx2y/us122l.c
index cf5dc33f4a6d..cf45bf1f7ee0 100644
--- a/sound/usb/usx2y/us122l.c
+++ b/sound/usb/usx2y/us122l.c
@@ -137,13 +137,12 @@ static void usb_stream_hwdep_vm_open(struct vm_area_struct *area)
 	snd_printdd(KERN_DEBUG "%i\n", atomic_read(&us122l->mmap_count));
 }
 
-static int usb_stream_hwdep_vm_fault(struct vm_area_struct *area,
-				     struct vm_fault *vmf)
+static int usb_stream_hwdep_vm_fault(struct vm_fault *vmf)
 {
 	unsigned long offset;
 	struct page *page;
 	void *vaddr;
-	struct us122l *us122l = area->vm_private_data;
+	struct us122l *us122l = vmf->vma->vm_private_data;
 	struct usb_stream *s;
 
 	mutex_lock(&us122l->mutex);
diff --git a/sound/usb/usx2y/usX2Yhwdep.c b/sound/usb/usx2y/usX2Yhwdep.c
index 0b34dbc8f302..605e1047c01d 100644
--- a/sound/usb/usx2y/usX2Yhwdep.c
+++ b/sound/usb/usx2y/usX2Yhwdep.c
@@ -31,19 +31,18 @@
 #include "usbusx2y.h"
 #include "usX2Yhwdep.h"
 
-static int snd_us428ctls_vm_fault(struct vm_area_struct *area,
-				  struct vm_fault *vmf)
+static int snd_us428ctls_vm_fault(struct vm_fault *vmf)
 {
 	unsigned long offset;
 	struct page * page;
 	void *vaddr;
 
 	snd_printdd("ENTER, start %lXh, pgoff %ld\n",
-		   area->vm_start,
+		   vmf->vma->vm_start,
 		   vmf->pgoff);
 	
 	offset = vmf->pgoff << PAGE_SHIFT;
-	vaddr = (char*)((struct usX2Ydev *)area->vm_private_data)->us428ctls_sharedmem + offset;
+	vaddr = (char *)((struct usX2Ydev *)vmf->vma->vm_private_data)->us428ctls_sharedmem + offset;
 	page = virt_to_page(vaddr);
 	get_page(page);
 	vmf->page = page;
diff --git a/sound/usb/usx2y/usx2yhwdeppcm.c b/sound/usb/usx2y/usx2yhwdeppcm.c
index 90766a92e7fd..f95164b91152 100644
--- a/sound/usb/usx2y/usx2yhwdeppcm.c
+++ b/sound/usb/usx2y/usx2yhwdeppcm.c
@@ -652,14 +652,13 @@ static void snd_usX2Y_hwdep_pcm_vm_close(struct vm_area_struct *area)
 }
 
 
-static int snd_usX2Y_hwdep_pcm_vm_fault(struct vm_area_struct *area,
-					struct vm_fault *vmf)
+static int snd_usX2Y_hwdep_pcm_vm_fault(struct vm_fault *vmf)
 {
 	unsigned long offset;
 	void *vaddr;
 
 	offset = vmf->pgoff << PAGE_SHIFT;
-	vaddr = (char*)((struct usX2Ydev *)area->vm_private_data)->hwdep_pcm_shm + offset;
+	vaddr = (char *)((struct usX2Ydev *)vmf->vma->vm_private_data)->hwdep_pcm_shm + offset;
 	vmf->page = virt_to_page(vaddr);
 	get_page(vmf->page);
 	return 0;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index cc4d6e0dd2a2..5b0dd4a9b2cb 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2350,9 +2350,9 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
 
-static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int kvm_vcpu_fault(struct vm_fault *vmf)
 {
-	struct kvm_vcpu *vcpu = vma->vm_file->private_data;
+	struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
 	struct page *page;
 
 	if (vmf->pgoff == 0)
-- 
cgit v1.2.3


From a2d581675d485eb7188f521f36efc114639a3096 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Fri, 24 Feb 2017 14:56:59 -0800
Subject: mm,fs,dax: change ->pmd_fault to ->huge_fault

Patch series "1G transparent hugepage support for device dax", v2.

The following series implements support for 1G trasparent hugepage on
x86 for device dax.  The bulk of the code was written by Mathew Wilcox a
while back supporting transparent 1G hugepage for fs DAX.  I have
forward ported the relevant bits to 4.10-rc.  The current submission has
only the necessary code to support device DAX.

Comments from Dan Williams: So the motivation and intended user of this
functionality mirrors the motivation and users of 1GB page support in
hugetlbfs.  Given expected capacities of persistent memory devices an
in-memory database may want to reduce tlb pressure beyond what they can
already achieve with 2MB mappings of a device-dax file.  We have
customer feedback to that effect as Willy mentioned in his previous
version of these patches [1].

[1]: https://lkml.org/lkml/2016/1/31/52

Comments from Nilesh @ Oracle:

There are applications which have a process model; and if you assume
10,000 processes attempting to mmap all the 6TB memory available on a
server; we are looking at the following:

processes         : 10,000
memory            :    6TB
pte @ 4k page size: 8 bytes / 4K of memory * #processes = 6TB / 4k * 8 * 10000 = 1.5GB * 80000 = 120,000GB
pmd @ 2M page size: 120,000 / 512 = ~240GB
pud @ 1G page size: 240GB / 512 = ~480MB

As you can see with 2M pages, this system will use up an exorbitant
amount of DRAM to hold the page tables; but the 1G pages finally brings
it down to a reasonable level.  Memory sizes will keep increasing; so
this number will keep increasing.

An argument can be made to convert the applications from process model
to thread model, but in the real world that may not be always practical.
Hopefully this helps explain the use case where this is valuable.

This patch (of 3):

In preparation for adding the ability to handle PUD pages, convert
vm_operations_struct.pmd_fault to vm_operations_struct.huge_fault.  The
vm_fault structure is extended to include a union of the different page
table pointers that may be needed, and three flag bits are reserved to
indicate which type of pointer is in the union.

[ross.zwisler@linux.intel.com: remove unused function ext4_dax_huge_fault()]
  Link: http://lkml.kernel.org/r/1485813172-7284-1-git-send-email-ross.zwisler@linux.intel.com
[dave.jiang@intel.com: clear PMD or PUD size flags when in fall through path]
  Link: http://lkml.kernel.org/r/148589842696.5820.16078080610311444794.stgit@djiang5-desk3.ch.intel.com
Link: http://lkml.kernel.org/r/148545058784.17912.6353162518188733642.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jan Kara <jack@suse.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Nilesh Choudhury <nilesh.choudhury@oracle.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/dax/dax.c   | 34 +++++++++++++---------------------
 fs/dax.c            | 45 ++++++++++++++++++++++++++++++++-------------
 fs/ext2/file.c      |  2 +-
 fs/ext4/file.c      | 23 +----------------------
 fs/xfs/xfs_file.c   | 10 +++++-----
 fs/xfs/xfs_trace.h  |  2 +-
 include/linux/dax.h |  6 ------
 include/linux/mm.h  | 10 +++++++++-
 mm/memory.c         | 18 ++++++++++++------
 9 files changed, 74 insertions(+), 76 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index 0261f332bf3e..922ec461dcaa 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -419,7 +419,7 @@ static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
 	return -1;
 }
 
-static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
+static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 {
 	struct device *dev = &dax_dev->dev;
 	struct dax_region *dax_region;
@@ -455,23 +455,6 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 	return VM_FAULT_NOPAGE;
 }
 
-static int dax_dev_fault(struct vm_fault *vmf)
-{
-	struct vm_area_struct *vma = vmf->vma;
-	int rc;
-	struct file *filp = vma->vm_file;
-	struct dax_dev *dax_dev = filp->private_data;
-
-	dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
-			current->comm, (vmf->flags & FAULT_FLAG_WRITE)
-			? "write" : "read", vma->vm_start, vma->vm_end);
-	rcu_read_lock();
-	rc = __dax_dev_fault(dax_dev, vmf);
-	rcu_read_unlock();
-
-	return rc;
-}
-
 static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 {
 	unsigned long pmd_addr = vmf->address & PMD_MASK;
@@ -510,7 +493,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 			vmf->flags & FAULT_FLAG_WRITE);
 }
 
-static int dax_dev_pmd_fault(struct vm_fault *vmf)
+static int dax_dev_fault(struct vm_fault *vmf)
 {
 	int rc;
 	struct file *filp = vmf->vma->vm_file;
@@ -522,7 +505,16 @@ static int dax_dev_pmd_fault(struct vm_fault *vmf)
 			vmf->vma->vm_start, vmf->vma->vm_end);
 
 	rcu_read_lock();
-	rc = __dax_dev_pmd_fault(dax_dev, vmf);
+	switch (vmf->flags & FAULT_FLAG_SIZE_MASK) {
+	case FAULT_FLAG_SIZE_PTE:
+		rc = __dax_dev_pte_fault(dax_dev, vmf);
+		break;
+	case FAULT_FLAG_SIZE_PMD:
+		rc = __dax_dev_pmd_fault(dax_dev, vmf);
+		break;
+	default:
+		return VM_FAULT_FALLBACK;
+	}
 	rcu_read_unlock();
 
 	return rc;
@@ -530,7 +522,7 @@ static int dax_dev_pmd_fault(struct vm_fault *vmf)
 
 static const struct vm_operations_struct dax_dev_vm_ops = {
 	.fault = dax_dev_fault,
-	.pmd_fault = dax_dev_pmd_fault,
+	.huge_fault = dax_dev_fault,
 };
 
 static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
diff --git a/fs/dax.c b/fs/dax.c
index f955c0df33bb..c3c29fbf64be 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1118,16 +1118,8 @@ static int dax_fault_return(int error)
 	return VM_FAULT_SIGBUS;
 }
 
-/**
- * dax_iomap_fault - handle a page fault on a DAX file
- * @vmf: The description of the fault
- * @ops: iomap ops passed from the file system
- *
- * When a page fault occurs, filesystems may call this helper in their fault
- * or mkwrite handler for DAX files. Assumes the caller has done all the
- * necessary locking for the page fault to proceed successfully.
- */
-int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops)
+static int dax_iomap_pte_fault(struct vm_fault *vmf,
+			       const struct iomap_ops *ops)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
@@ -1244,7 +1236,6 @@ int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops)
 	}
 	return vmf_ret;
 }
-EXPORT_SYMBOL_GPL(dax_iomap_fault);
 
 #ifdef CONFIG_FS_DAX_PMD
 /*
@@ -1335,7 +1326,8 @@ fallback:
 	return VM_FAULT_FALLBACK;
 }
 
-int dax_iomap_pmd_fault(struct vm_fault *vmf, const struct iomap_ops *ops)
+static int dax_iomap_pmd_fault(struct vm_fault *vmf,
+			       const struct iomap_ops *ops)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct address_space *mapping = vma->vm_file->f_mapping;
@@ -1443,5 +1435,32 @@ out:
 	trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
 	return result;
 }
-EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
+#else
+static int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops)
+{
+	return VM_FAULT_FALLBACK;
+}
 #endif /* CONFIG_FS_DAX_PMD */
+
+/**
+ * dax_iomap_fault - handle a page fault on a DAX file
+ * @vmf: The description of the fault
+ * @ops: iomap ops passed from the file system
+ *
+ * When a page fault occurs, filesystems may call this helper in
+ * their fault handler for DAX files. dax_iomap_fault() assumes the caller
+ * has done all the necessary locking for page fault to proceed
+ * successfully.
+ */
+int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops)
+{
+	switch (vmf->flags & FAULT_FLAG_SIZE_MASK) {
+	case FAULT_FLAG_SIZE_PTE:
+		return dax_iomap_pte_fault(vmf, ops);
+	case FAULT_FLAG_SIZE_PMD:
+		return dax_iomap_pmd_fault(vmf, ops);
+	default:
+		return VM_FAULT_FALLBACK;
+	}
+}
+EXPORT_SYMBOL_GPL(dax_iomap_fault);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 0bf0d971205a..68738832beda 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -133,7 +133,7 @@ static int ext2_dax_pfn_mkwrite(struct vm_fault *vmf)
 static const struct vm_operations_struct ext2_dax_vm_ops = {
 	.fault		= ext2_dax_fault,
 	/*
-	 * .pmd_fault is not supported for DAX because allocation in ext2
+	 * .huge_fault is not supported for DAX because allocation in ext2
 	 * cannot be reliably aligned to huge page sizes and so pmd faults
 	 * will always fail and fail back to regular faults.
 	 */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 21e1f17fe36d..502d2d07d191 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -273,27 +273,6 @@ static int ext4_dax_fault(struct vm_fault *vmf)
 	return result;
 }
 
-static int
-ext4_dax_pmd_fault(struct vm_fault *vmf)
-{
-	int result;
-	struct inode *inode = file_inode(vmf->vma->vm_file);
-	struct super_block *sb = inode->i_sb;
-	bool write = vmf->flags & FAULT_FLAG_WRITE;
-
-	if (write) {
-		sb_start_pagefault(sb);
-		file_update_time(vmf->vma->vm_file);
-	}
-	down_read(&EXT4_I(inode)->i_mmap_sem);
-	result = dax_iomap_pmd_fault(vmf, &ext4_iomap_ops);
-	up_read(&EXT4_I(inode)->i_mmap_sem);
-	if (write)
-		sb_end_pagefault(sb);
-
-	return result;
-}
-
 /*
  * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
  * handler we check for races agaist truncate. Note that since we cycle through
@@ -326,7 +305,7 @@ static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf)
 
 static const struct vm_operations_struct ext4_dax_vm_ops = {
 	.fault		= ext4_dax_fault,
-	.pmd_fault	= ext4_dax_pmd_fault,
+	.huge_fault	= ext4_dax_fault,
 	.page_mkwrite	= ext4_dax_fault,
 	.pfn_mkwrite	= ext4_dax_pfn_mkwrite,
 };
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 9cc10136ba0b..990e03819370 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1429,12 +1429,12 @@ xfs_filemap_fault(
 /*
  * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
  * both read and write faults. Hence we need to handle both cases. There is no
- * ->pmd_mkwrite callout for huge pages, so we have a single function here to
+ * ->huge_mkwrite callout for huge pages, so we have a single function here to
  * handle both cases here. @flags carries the information on the type of fault
  * occuring.
  */
 STATIC int
-xfs_filemap_pmd_fault(
+xfs_filemap_huge_fault(
 	struct vm_fault		*vmf)
 {
 	struct inode		*inode = file_inode(vmf->vma->vm_file);
@@ -1444,7 +1444,7 @@ xfs_filemap_pmd_fault(
 	if (!IS_DAX(inode))
 		return VM_FAULT_FALLBACK;
 
-	trace_xfs_filemap_pmd_fault(ip);
+	trace_xfs_filemap_huge_fault(ip);
 
 	if (vmf->flags & FAULT_FLAG_WRITE) {
 		sb_start_pagefault(inode->i_sb);
@@ -1452,7 +1452,7 @@ xfs_filemap_pmd_fault(
 	}
 
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	ret = dax_iomap_pmd_fault(vmf, &xfs_iomap_ops);
+	ret = dax_iomap_fault(vmf, &xfs_iomap_ops);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	if (vmf->flags & FAULT_FLAG_WRITE)
@@ -1497,7 +1497,7 @@ xfs_filemap_pfn_mkwrite(
 
 static const struct vm_operations_struct xfs_file_vm_ops = {
 	.fault		= xfs_filemap_fault,
-	.pmd_fault	= xfs_filemap_pmd_fault,
+	.huge_fault	= xfs_filemap_huge_fault,
 	.map_pages	= filemap_map_pages,
 	.page_mkwrite	= xfs_filemap_page_mkwrite,
 	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index fb7555e73a62..383ac227ce2c 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -687,7 +687,7 @@ DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag);
 DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid);
 
 DEFINE_INODE_EVENT(xfs_filemap_fault);
-DEFINE_INODE_EVENT(xfs_filemap_pmd_fault);
+DEFINE_INODE_EVENT(xfs_filemap_huge_fault);
 DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
 DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite);
 
diff --git a/include/linux/dax.h b/include/linux/dax.h
index eeb02421c848..cf9af225962b 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -70,17 +70,11 @@ static inline unsigned int dax_radix_order(void *entry)
 		return PMD_SHIFT - PAGE_SHIFT;
 	return 0;
 }
-int dax_iomap_pmd_fault(struct vm_fault *vmf, const struct iomap_ops *ops);
 #else
 static inline unsigned int dax_radix_order(void *entry)
 {
 	return 0;
 }
-static inline int dax_iomap_pmd_fault(struct vm_fault *vmf,
-		const struct iomap_ops *ops)
-{
-	return VM_FAULT_FALLBACK;
-}
 #endif
 int dax_pfn_mkwrite(struct vm_fault *vmf);
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3dd80ba6568a..035a688e5472 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -285,6 +285,11 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_REMOTE	0x80	/* faulting for non current tsk/mm */
 #define FAULT_FLAG_INSTRUCTION  0x100	/* The fault was during an instruction fetch */
 
+#define FAULT_FLAG_SIZE_MASK	0x7000	/* Support up to 8-level page tables */
+#define FAULT_FLAG_SIZE_PTE	0x0000	/* First level (eg 4k) */
+#define FAULT_FLAG_SIZE_PMD	0x1000	/* Second level (eg 2MB) */
+#define FAULT_FLAG_SIZE_PUD	0x2000	/* Third level (eg 1GB) */
+
 #define FAULT_FLAG_TRACE \
 	{ FAULT_FLAG_WRITE,		"WRITE" }, \
 	{ FAULT_FLAG_MKWRITE,		"MKWRITE" }, \
@@ -314,6 +319,9 @@ struct vm_fault {
 	unsigned long address;		/* Faulting virtual address */
 	pmd_t *pmd;			/* Pointer to pmd entry matching
 					 * the 'address' */
+	pud_t *pud;			/* Pointer to pud entry matching
+					 * the 'address'
+					 */
 	pte_t orig_pte;			/* Value of PTE at the time of fault */
 
 	struct page *cow_page;		/* Page handler may use for COW fault */
@@ -351,7 +359,7 @@ struct vm_operations_struct {
 	void (*close)(struct vm_area_struct * area);
 	int (*mremap)(struct vm_area_struct * area);
 	int (*fault)(struct vm_fault *vmf);
-	int (*pmd_fault)(struct vm_fault *vmf);
+	int (*huge_fault)(struct vm_fault *vmf);
 	void (*map_pages)(struct vm_fault *vmf,
 			pgoff_t start_pgoff, pgoff_t end_pgoff);
 
diff --git a/mm/memory.c b/mm/memory.c
index cf97d88158cd..e721e8eba570 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3466,8 +3466,8 @@ static int create_huge_pmd(struct vm_fault *vmf)
 {
 	if (vma_is_anonymous(vmf->vma))
 		return do_huge_pmd_anonymous_page(vmf);
-	if (vmf->vma->vm_ops->pmd_fault)
-		return vmf->vma->vm_ops->pmd_fault(vmf);
+	if (vmf->vma->vm_ops->huge_fault)
+		return vmf->vma->vm_ops->huge_fault(vmf);
 	return VM_FAULT_FALLBACK;
 }
 
@@ -3475,8 +3475,8 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
 {
 	if (vma_is_anonymous(vmf->vma))
 		return do_huge_pmd_wp_page(vmf, orig_pmd);
-	if (vmf->vma->vm_ops->pmd_fault)
-		return vmf->vma->vm_ops->pmd_fault(vmf);
+	if (vmf->vma->vm_ops->huge_fault)
+		return vmf->vma->vm_ops->huge_fault(vmf);
 
 	/* COW handled on pte level: split pmd */
 	VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
@@ -3606,6 +3606,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 	struct mm_struct *mm = vma->vm_mm;
 	pgd_t *pgd;
 	pud_t *pud;
+	int ret;
 
 	pgd = pgd_offset(mm, address);
 	pud = pud_alloc(mm, pgd, address);
@@ -3615,15 +3616,18 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 	if (!vmf.pmd)
 		return VM_FAULT_OOM;
 	if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
-		int ret = create_huge_pmd(&vmf);
+		vmf.flags |= FAULT_FLAG_SIZE_PMD;
+		ret = create_huge_pmd(&vmf);
 		if (!(ret & VM_FAULT_FALLBACK))
 			return ret;
+		/* fall through path, remove PMD flag */
+		vmf.flags &= ~FAULT_FLAG_SIZE_PMD;
 	} else {
 		pmd_t orig_pmd = *vmf.pmd;
-		int ret;
 
 		barrier();
 		if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
+			vmf.flags |= FAULT_FLAG_SIZE_PMD;
 			if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
 				return do_huge_pmd_numa_page(&vmf, orig_pmd);
 
@@ -3632,6 +3636,8 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 				ret = wp_huge_pmd(&vmf, orig_pmd);
 				if (!(ret & VM_FAULT_FALLBACK))
 					return ret;
+				/* fall through path, remove PUD flag */
+				vmf.flags &= ~FAULT_FLAG_SIZE_PUD;
 			} else {
 				huge_pmd_set_accessed(&vmf, orig_pmd);
 				return 0;
-- 
cgit v1.2.3


From a00cc7d9dd93d66a3fb83fc52aa57a4bec51c517 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@linux.intel.com>
Date: Fri, 24 Feb 2017 14:57:02 -0800
Subject: mm, x86: add support for PUD-sized transparent hugepages

The current transparent hugepage code only supports PMDs.  This patch
adds support for transparent use of PUDs with DAX.  It does not include
support for anonymous pages.  x86 support code also added.

Most of this patch simply parallels the work that was done for huge
PMDs.  The only major difference is how the new ->pud_entry method in
mm_walk works.  The ->pmd_entry method replaces the ->pte_entry method,
whereas the ->pud_entry method works along with either ->pmd_entry or
->pte_entry.  The pagewalk code takes care of locking the PUD before
calling ->pud_walk, so handlers do not need to worry whether the PUD is
stable.

[dave.jiang@intel.com: fix SMP x86 32bit build for native_pud_clear()]
  Link: http://lkml.kernel.org/r/148719066814.31111.3239231168815337012.stgit@djiang5-desk3.ch.intel.com
[dave.jiang@intel.com: native_pud_clear missing on i386 build]
  Link: http://lkml.kernel.org/r/148640375195.69754.3315433724330910314.stgit@djiang5-desk3.ch.intel.com
Link: http://lkml.kernel.org/r/148545059381.17912.8602162635537598445.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Tested-by: Alexander Kapshuk <alexander.kapshuk@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jan Kara <jack@suse.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Nilesh Choudhury <nilesh.choudhury@oracle.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/Kconfig                          |   3 +
 arch/x86/Kconfig                      |   1 +
 arch/x86/include/asm/paravirt.h       |  11 ++
 arch/x86/include/asm/paravirt_types.h |   2 +
 arch/x86/include/asm/pgtable-2level.h |  17 +++
 arch/x86/include/asm/pgtable-3level.h |  30 ++++
 arch/x86/include/asm/pgtable.h        | 140 +++++++++++++++++++
 arch/x86/include/asm/pgtable_64.h     |  15 ++
 arch/x86/kernel/paravirt.c            |   1 +
 arch/x86/mm/pgtable.c                 |  31 +++++
 include/asm-generic/pgtable.h         |  80 ++++++++++-
 include/asm-generic/tlb.h             |  14 ++
 include/linux/huge_mm.h               |  83 +++++++++++-
 include/linux/mm.h                    |  30 +++-
 include/linux/mmu_notifier.h          |  14 ++
 include/linux/pfn_t.h                 |  12 ++
 mm/gup.c                              |   7 +
 mm/huge_memory.c                      | 249 ++++++++++++++++++++++++++++++++++
 mm/memory.c                           |  88 +++++++++++-
 mm/pagewalk.c                         |  20 ++-
 mm/pgtable-generic.c                  |  14 ++
 21 files changed, 844 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index f761142976e5..d0012add6b19 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -571,6 +571,9 @@ config HAVE_IRQ_TIME_ACCOUNTING
 config HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	bool
 
+config HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+	bool
+
 config HAVE_ARCH_HUGE_VMAP
 	bool
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 874c1238dffd..33007aa74111 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -109,6 +109,7 @@ config X86
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+	select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
 	select HAVE_ARCH_VMAP_STACK		if X86_64
 	select HAVE_ARCH_WITHIN_STACK_FRAMES
 	select HAVE_CC_STACKPROTECTOR
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index f75fbfe550f2..0489884fdc44 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -475,6 +475,17 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 			    native_pmd_val(pmd));
 }
 
+static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
+			      pud_t *pudp, pud_t pud)
+{
+	if (sizeof(pudval_t) > sizeof(long))
+		/* 5 arg words */
+		pv_mmu_ops.set_pud_at(mm, addr, pudp, pud);
+	else
+		PVOP_VCALL4(pv_mmu_ops.set_pud_at, mm, addr, pudp,
+			    native_pud_val(pud));
+}
+
 static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
 {
 	pmdval_t val = native_pmd_val(pmd);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index bb2de45a60f2..b060f962d581 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -249,6 +249,8 @@ struct pv_mmu_ops {
 	void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
 	void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr,
 			   pmd_t *pmdp, pmd_t pmdval);
+	void (*set_pud_at)(struct mm_struct *mm, unsigned long addr,
+			   pud_t *pudp, pud_t pudval);
 	void (*pte_update)(struct mm_struct *mm, unsigned long addr,
 			   pte_t *ptep);
 
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index fd74a11959de..a8b96e708c2b 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -21,6 +21,10 @@ static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
 	*pmdp = pmd;
 }
 
+static inline void native_set_pud(pud_t *pudp, pud_t pud)
+{
+}
+
 static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
 {
 	native_set_pte(ptep, pte);
@@ -31,6 +35,10 @@ static inline void native_pmd_clear(pmd_t *pmdp)
 	native_set_pmd(pmdp, __pmd(0));
 }
 
+static inline void native_pud_clear(pud_t *pudp)
+{
+}
+
 static inline void native_pte_clear(struct mm_struct *mm,
 				    unsigned long addr, pte_t *xp)
 {
@@ -55,6 +63,15 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
 #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
 #endif
 
+#ifdef CONFIG_SMP
+static inline pud_t native_pudp_get_and_clear(pud_t *xp)
+{
+	return __pud(xchg((pudval_t *)xp, 0));
+}
+#else
+#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp)
+#endif
+
 /* Bit manipulation helper on pte/pgoff entry */
 static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshift,
 				      unsigned long mask, unsigned int leftshift)
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index cdaa58c9b39e..8f50fb3f04e1 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -121,6 +121,12 @@ static inline void native_pmd_clear(pmd_t *pmd)
 	*(tmp + 1) = 0;
 }
 
+#ifndef CONFIG_SMP
+static inline void native_pud_clear(pud_t *pudp)
+{
+}
+#endif
+
 static inline void pud_clear(pud_t *pudp)
 {
 	set_pud(pudp, __pud(0));
@@ -176,6 +182,30 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
 #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
 #endif
 
+#ifdef CONFIG_SMP
+union split_pud {
+	struct {
+		u32 pud_low;
+		u32 pud_high;
+	};
+	pud_t pud;
+};
+
+static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
+{
+	union split_pud res, *orig = (union split_pud *)pudp;
+
+	/* xchg acts as a barrier before setting of the high bits */
+	res.pud_low = xchg(&orig->pud_low, 0);
+	res.pud_high = orig->pud_high;
+	orig->pud_high = 0;
+
+	return res.pud;
+}
+#else
+#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp)
+#endif
+
 /* Encode and de-code a swap entry */
 #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
 #define __swp_type(x)			(((x).val) & 0x1f)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 437feb436efa..1cfb36b8c024 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -46,6 +46,7 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
 #define set_pte(ptep, pte)		native_set_pte(ptep, pte)
 #define set_pte_at(mm, addr, ptep, pte)	native_set_pte_at(mm, addr, ptep, pte)
 #define set_pmd_at(mm, addr, pmdp, pmd)	native_set_pmd_at(mm, addr, pmdp, pmd)
+#define set_pud_at(mm, addr, pudp, pud)	native_set_pud_at(mm, addr, pudp, pud)
 
 #define set_pte_atomic(ptep, pte)					\
 	native_set_pte_atomic(ptep, pte)
@@ -128,6 +129,16 @@ static inline int pmd_young(pmd_t pmd)
 	return pmd_flags(pmd) & _PAGE_ACCESSED;
 }
 
+static inline int pud_dirty(pud_t pud)
+{
+	return pud_flags(pud) & _PAGE_DIRTY;
+}
+
+static inline int pud_young(pud_t pud)
+{
+	return pud_flags(pud) & _PAGE_ACCESSED;
+}
+
 static inline int pte_write(pte_t pte)
 {
 	return pte_flags(pte) & _PAGE_RW;
@@ -181,6 +192,13 @@ static inline int pmd_trans_huge(pmd_t pmd)
 	return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
 }
 
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static inline int pud_trans_huge(pud_t pud)
+{
+	return (pud_val(pud) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
+}
+#endif
+
 #define has_transparent_hugepage has_transparent_hugepage
 static inline int has_transparent_hugepage(void)
 {
@@ -192,6 +210,18 @@ static inline int pmd_devmap(pmd_t pmd)
 {
 	return !!(pmd_val(pmd) & _PAGE_DEVMAP);
 }
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static inline int pud_devmap(pud_t pud)
+{
+	return !!(pud_val(pud) & _PAGE_DEVMAP);
+}
+#else
+static inline int pud_devmap(pud_t pud)
+{
+	return 0;
+}
+#endif
 #endif
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
@@ -333,6 +363,65 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd)
 	return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE);
 }
 
+static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
+{
+	pudval_t v = native_pud_val(pud);
+
+	return __pud(v | set);
+}
+
+static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
+{
+	pudval_t v = native_pud_val(pud);
+
+	return __pud(v & ~clear);
+}
+
+static inline pud_t pud_mkold(pud_t pud)
+{
+	return pud_clear_flags(pud, _PAGE_ACCESSED);
+}
+
+static inline pud_t pud_mkclean(pud_t pud)
+{
+	return pud_clear_flags(pud, _PAGE_DIRTY);
+}
+
+static inline pud_t pud_wrprotect(pud_t pud)
+{
+	return pud_clear_flags(pud, _PAGE_RW);
+}
+
+static inline pud_t pud_mkdirty(pud_t pud)
+{
+	return pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+}
+
+static inline pud_t pud_mkdevmap(pud_t pud)
+{
+	return pud_set_flags(pud, _PAGE_DEVMAP);
+}
+
+static inline pud_t pud_mkhuge(pud_t pud)
+{
+	return pud_set_flags(pud, _PAGE_PSE);
+}
+
+static inline pud_t pud_mkyoung(pud_t pud)
+{
+	return pud_set_flags(pud, _PAGE_ACCESSED);
+}
+
+static inline pud_t pud_mkwrite(pud_t pud)
+{
+	return pud_set_flags(pud, _PAGE_RW);
+}
+
+static inline pud_t pud_mknotpresent(pud_t pud)
+{
+	return pud_clear_flags(pud, _PAGE_PRESENT | _PAGE_PROTNONE);
+}
+
 #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
 static inline int pte_soft_dirty(pte_t pte)
 {
@@ -344,6 +433,11 @@ static inline int pmd_soft_dirty(pmd_t pmd)
 	return pmd_flags(pmd) & _PAGE_SOFT_DIRTY;
 }
 
+static inline int pud_soft_dirty(pud_t pud)
+{
+	return pud_flags(pud) & _PAGE_SOFT_DIRTY;
+}
+
 static inline pte_t pte_mksoft_dirty(pte_t pte)
 {
 	return pte_set_flags(pte, _PAGE_SOFT_DIRTY);
@@ -354,6 +448,11 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
 	return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);
 }
 
+static inline pud_t pud_mksoft_dirty(pud_t pud)
+{
+	return pud_set_flags(pud, _PAGE_SOFT_DIRTY);
+}
+
 static inline pte_t pte_clear_soft_dirty(pte_t pte)
 {
 	return pte_clear_flags(pte, _PAGE_SOFT_DIRTY);
@@ -364,6 +463,11 @@ static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
 	return pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
 }
 
+static inline pud_t pud_clear_soft_dirty(pud_t pud)
+{
+	return pud_clear_flags(pud, _PAGE_SOFT_DIRTY);
+}
+
 #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
 
 /*
@@ -392,6 +496,12 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
 		     massage_pgprot(pgprot));
 }
 
+static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot)
+{
+	return __pud(((phys_addr_t)page_nr << PAGE_SHIFT) |
+		     massage_pgprot(pgprot));
+}
+
 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 {
 	pteval_t val = pte_val(pte);
@@ -771,6 +881,14 @@ static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp)
 	return res;
 }
 
+static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp)
+{
+	pud_t res = *pudp;
+
+	native_pud_clear(pudp);
+	return res;
+}
+
 static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
 				     pte_t *ptep , pte_t pte)
 {
@@ -783,6 +901,12 @@ static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr,
 	native_set_pmd(pmdp, pmd);
 }
 
+static inline void native_set_pud_at(struct mm_struct *mm, unsigned long addr,
+				     pud_t *pudp, pud_t pud)
+{
+	native_set_pud(pudp, pud);
+}
+
 #ifndef CONFIG_PARAVIRT
 /*
  * Rules for using pte_update - it must be called after any PTE update which
@@ -861,10 +985,15 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
 extern int pmdp_set_access_flags(struct vm_area_struct *vma,
 				 unsigned long address, pmd_t *pmdp,
 				 pmd_t entry, int dirty);
+extern int pudp_set_access_flags(struct vm_area_struct *vma,
+				 unsigned long address, pud_t *pudp,
+				 pud_t entry, int dirty);
 
 #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
 extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 				     unsigned long addr, pmd_t *pmdp);
+extern int pudp_test_and_clear_young(struct vm_area_struct *vma,
+				     unsigned long addr, pud_t *pudp);
 
 #define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
 extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
@@ -884,6 +1013,13 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long
 	return native_pmdp_get_and_clear(pmdp);
 }
 
+#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
+static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
+					unsigned long addr, pud_t *pudp)
+{
+	return native_pudp_get_and_clear(pudp);
+}
+
 #define __HAVE_ARCH_PMDP_SET_WRPROTECT
 static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 				      unsigned long addr, pmd_t *pmdp)
@@ -932,6 +1068,10 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
 		unsigned long addr, pmd_t *pmd)
 {
 }
+static inline void update_mmu_cache_pud(struct vm_area_struct *vma,
+		unsigned long addr, pud_t *pud)
+{
+}
 
 #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
 static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 62b775926045..73c7ccc38912 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -106,6 +106,21 @@ static inline void native_pud_clear(pud_t *pud)
 	native_set_pud(pud, native_make_pud(0));
 }
 
+static inline pud_t native_pudp_get_and_clear(pud_t *xp)
+{
+#ifdef CONFIG_SMP
+	return native_make_pud(xchg(&xp->pud, 0));
+#else
+	/* native_local_pudp_get_and_clear,
+	 * but duplicated because of cyclic dependency
+	 */
+	pud_t ret = *xp;
+
+	native_pud_clear(xp);
+	return ret;
+#endif
+}
+
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
 	*pgdp = pgd;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index a1bfba0f7234..4797e87b0fb6 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -425,6 +425,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
 	.pmd_clear = native_pmd_clear,
 #endif
 	.set_pud = native_set_pud,
+	.set_pud_at = native_set_pud_at,
 
 	.pmd_val = PTE_IDENT,
 	.make_pmd = PTE_IDENT,
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 3feec5af4e67..6cbdff26bb96 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -445,6 +445,26 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
 
 	return changed;
 }
+
+int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+			  pud_t *pudp, pud_t entry, int dirty)
+{
+	int changed = !pud_same(*pudp, entry);
+
+	VM_BUG_ON(address & ~HPAGE_PUD_MASK);
+
+	if (changed && dirty) {
+		*pudp = entry;
+		/*
+		 * We had a write-protection fault here and changed the pud
+		 * to to more permissive. No need to flush the TLB for that,
+		 * #PF is architecturally guaranteed to do that and in the
+		 * worst-case we'll generate a spurious fault.
+		 */
+	}
+
+	return changed;
+}
 #endif
 
 int ptep_test_and_clear_young(struct vm_area_struct *vma,
@@ -474,6 +494,17 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 
 	return ret;
 }
+int pudp_test_and_clear_young(struct vm_area_struct *vma,
+			      unsigned long addr, pud_t *pudp)
+{
+	int ret = 0;
+
+	if (pud_young(*pudp))
+		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+					 (unsigned long *)pudp);
+
+	return ret;
+}
 #endif
 
 int ptep_clear_flush_young(struct vm_area_struct *vma,
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 18af2bcefe6a..a0aba0f9c57b 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -36,6 +36,9 @@ extern int ptep_set_access_flags(struct vm_area_struct *vma,
 extern int pmdp_set_access_flags(struct vm_area_struct *vma,
 				 unsigned long address, pmd_t *pmdp,
 				 pmd_t entry, int dirty);
+extern int pudp_set_access_flags(struct vm_area_struct *vma,
+				 unsigned long address, pud_t *pudp,
+				 pud_t entry, int dirty);
 #else
 static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
 					unsigned long address, pmd_t *pmdp,
@@ -44,6 +47,13 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
 	BUILD_BUG();
 	return 0;
 }
+static inline int pudp_set_access_flags(struct vm_area_struct *vma,
+					unsigned long address, pud_t *pudp,
+					pud_t entry, int dirty)
+{
+	BUILD_BUG();
+	return 0;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
 
@@ -121,8 +131,8 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 }
 #endif
 
-#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
 static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 					    unsigned long address,
 					    pmd_t *pmdp)
@@ -131,20 +141,40 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 	pmd_clear(pmdp);
 	return pmd;
 }
+#endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */
+#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
+static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
+					    unsigned long address,
+					    pud_t *pudp)
+{
+	pud_t pud = *pudp;
+
+	pud_clear(pudp);
+	return pud;
+}
+#endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-#endif
 
-#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
 static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
 					    unsigned long address, pmd_t *pmdp,
 					    int full)
 {
 	return pmdp_huge_get_and_clear(mm, address, pmdp);
 }
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
 
+#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL
+static inline pud_t pudp_huge_get_and_clear_full(struct mm_struct *mm,
+					    unsigned long address, pud_t *pudp,
+					    int full)
+{
+	return pudp_huge_get_and_clear(mm, address, pudp);
+}
+#endif
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
 #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
 static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
 					    unsigned long address, pte_t *ptep,
@@ -181,6 +211,9 @@ extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
 extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
 			      unsigned long address,
 			      pmd_t *pmdp);
+extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma,
+			      unsigned long address,
+			      pud_t *pudp);
 #endif
 
 #ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT
@@ -208,6 +241,23 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
+#ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static inline void pudp_set_wrprotect(struct mm_struct *mm,
+				      unsigned long address, pud_t *pudp)
+{
+	pud_t old_pud = *pudp;
+
+	set_pud_at(mm, address, pudp, pud_wrprotect(old_pud));
+}
+#else
+static inline void pudp_set_wrprotect(struct mm_struct *mm,
+				      unsigned long address, pud_t *pudp)
+{
+	BUILD_BUG();
+}
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+#endif
 
 #ifndef pmdp_collapse_flush
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -273,12 +323,23 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
 {
 	return pmd_val(pmd_a) == pmd_val(pmd_b);
 }
+
+static inline int pud_same(pud_t pud_a, pud_t pud_b)
+{
+	return pud_val(pud_a) == pud_val(pud_b);
+}
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
 {
 	BUILD_BUG();
 	return 0;
 }
+
+static inline int pud_same(pud_t pud_a, pud_t pud_b)
+{
+	BUILD_BUG();
+	return 0;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
 
@@ -640,6 +701,15 @@ static inline int pmd_write(pmd_t pmd)
 #endif /* __HAVE_ARCH_PMD_WRITE */
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \
+	(defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+	 !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD))
+static inline int pud_trans_huge(pud_t pud)
+{
+	return 0;
+}
+#endif
+
 #ifndef pmd_read_atomic
 static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
 {
@@ -785,8 +855,10 @@ static inline int pmd_clear_huge(pmd_t *pmd)
  * e.g. see arch/arc: flush_pmd_tlb_range
  */
 #define flush_pmd_tlb_range(vma, addr, end)	flush_tlb_range(vma, addr, end)
+#define flush_pud_tlb_range(vma, addr, end)	flush_tlb_range(vma, addr, end)
 #else
 #define flush_pmd_tlb_range(vma, addr, end)	BUILD_BUG()
+#define flush_pud_tlb_range(vma, addr, end)	BUILD_BUG()
 #endif
 #endif
 
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 7eed8cf3130a..4329bc6ef04b 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -232,6 +232,20 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
 		__tlb_remove_pmd_tlb_entry(tlb, pmdp, address);		\
 	} while (0)
 
+/**
+ * tlb_remove_pud_tlb_entry - remember a pud mapping for later tlb
+ * invalidation. This is a nop so far, because only x86 needs it.
+ */
+#ifndef __tlb_remove_pud_tlb_entry
+#define __tlb_remove_pud_tlb_entry(tlb, pudp, address) do {} while (0)
+#endif
+
+#define tlb_remove_pud_tlb_entry(tlb, pudp, address)			\
+	do {								\
+		__tlb_adjust_range(tlb, address, HPAGE_PUD_SIZE);	\
+		__tlb_remove_pud_tlb_entry(tlb, pudp, address);		\
+	} while (0)
+
 /*
  * For things like page tables caches (ie caching addresses "inside" the
  * page tables, like x86 does), for legacy reasons, flushing an
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index f0029e786205..a3762d49ba39 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -6,6 +6,18 @@ extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
 			 struct vm_area_struct *vma);
 extern void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd);
+extern int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+			 pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
+			 struct vm_area_struct *vma);
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+extern void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud);
+#else
+static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
+{
+}
+#endif
+
 extern int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd);
 extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 					  unsigned long addr,
@@ -17,6 +29,9 @@ extern bool madvise_free_huge_pmd(struct mmu_gather *tlb,
 extern int zap_huge_pmd(struct mmu_gather *tlb,
 			struct vm_area_struct *vma,
 			pmd_t *pmd, unsigned long addr);
+extern int zap_huge_pud(struct mmu_gather *tlb,
+			struct vm_area_struct *vma,
+			pud_t *pud, unsigned long addr);
 extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			unsigned long addr, unsigned long end,
 			unsigned char *vec);
@@ -26,8 +41,10 @@ extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			unsigned long addr, pgprot_t newprot,
 			int prot_numa);
-int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *,
-			pfn_t pfn, bool write);
+int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
+			pmd_t *pmd, pfn_t pfn, bool write);
+int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
+			pud_t *pud, pfn_t pfn, bool write);
 enum transparent_hugepage_flag {
 	TRANSPARENT_HUGEPAGE_FLAG,
 	TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
@@ -58,13 +75,14 @@ extern struct kobj_attribute shmem_enabled_attr;
 #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
-		pmd_t *pmd, int flags);
-
 #define HPAGE_PMD_SHIFT PMD_SHIFT
 #define HPAGE_PMD_SIZE	((1UL) << HPAGE_PMD_SHIFT)
 #define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))
 
+#define HPAGE_PUD_SHIFT PUD_SHIFT
+#define HPAGE_PUD_SIZE	((1UL) << HPAGE_PUD_SHIFT)
+#define HPAGE_PUD_MASK	(~(HPAGE_PUD_SIZE - 1))
+
 extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
 
 #define transparent_hugepage_enabled(__vma)				\
@@ -118,6 +136,17 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
 		bool freeze, struct page *page);
 
+void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
+		unsigned long address);
+
+#define split_huge_pud(__vma, __pud, __address)				\
+	do {								\
+		pud_t *____pud = (__pud);				\
+		if (pud_trans_huge(*____pud)				\
+					|| pud_devmap(*____pud))	\
+			__split_huge_pud(__vma, __pud, __address);	\
+	}  while (0)
+
 extern int hugepage_madvise(struct vm_area_struct *vma,
 			    unsigned long *vm_flags, int advice);
 extern void vma_adjust_trans_huge(struct vm_area_struct *vma,
@@ -126,6 +155,8 @@ extern void vma_adjust_trans_huge(struct vm_area_struct *vma,
 				    long adjust_next);
 extern spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd,
 		struct vm_area_struct *vma);
+extern spinlock_t *__pud_trans_huge_lock(pud_t *pud,
+		struct vm_area_struct *vma);
 /* mmap_sem must be held on entry */
 static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
 		struct vm_area_struct *vma)
@@ -136,6 +167,15 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
 	else
 		return NULL;
 }
+static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
+		struct vm_area_struct *vma)
+{
+	VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma);
+	if (pud_trans_huge(*pud) || pud_devmap(*pud))
+		return __pud_trans_huge_lock(pud, vma);
+	else
+		return NULL;
+}
 static inline int hpage_nr_pages(struct page *page)
 {
 	if (unlikely(PageTransHuge(page)))
@@ -143,6 +183,11 @@ static inline int hpage_nr_pages(struct page *page)
 	return 1;
 }
 
+struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
+		pmd_t *pmd, int flags);
+struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
+		pud_t *pud, int flags);
+
 extern int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
 
 extern struct page *huge_zero_page;
@@ -157,6 +202,11 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
 	return is_huge_zero_page(pmd_page(pmd));
 }
 
+static inline bool is_huge_zero_pud(pud_t pud)
+{
+	return false;
+}
+
 struct page *mm_get_huge_zero_page(struct mm_struct *mm);
 void mm_put_huge_zero_page(struct mm_struct *mm);
 
@@ -167,6 +217,10 @@ void mm_put_huge_zero_page(struct mm_struct *mm);
 #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
 #define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; })
 
+#define HPAGE_PUD_SHIFT ({ BUILD_BUG(); 0; })
+#define HPAGE_PUD_MASK ({ BUILD_BUG(); 0; })
+#define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; })
+
 #define hpage_nr_pages(x) 1
 
 #define transparent_hugepage_enabled(__vma) 0
@@ -195,6 +249,9 @@ static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 static inline void split_huge_pmd_address(struct vm_area_struct *vma,
 		unsigned long address, bool freeze, struct page *page) {}
 
+#define split_huge_pud(__vma, __pmd, __address)	\
+	do { } while (0)
+
 static inline int hugepage_madvise(struct vm_area_struct *vma,
 				   unsigned long *vm_flags, int advice)
 {
@@ -212,6 +269,11 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
 {
 	return NULL;
 }
+static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
+		struct vm_area_struct *vma)
+{
+	return NULL;
+}
 
 static inline int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd)
 {
@@ -223,6 +285,11 @@ static inline bool is_huge_zero_page(struct page *page)
 	return false;
 }
 
+static inline bool is_huge_zero_pud(pud_t pud)
+{
+	return false;
+}
+
 static inline void mm_put_huge_zero_page(struct mm_struct *mm)
 {
 	return;
@@ -233,6 +300,12 @@ static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
 {
 	return NULL;
 }
+
+static inline struct page *follow_devmap_pud(struct vm_area_struct *vma,
+		unsigned long addr, pud_t *pud, int flags)
+{
+	return NULL;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 035a688e5472..d8b75d7d6a9e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -424,6 +424,10 @@ static inline int pmd_devmap(pmd_t pmd)
 {
 	return 0;
 }
+static inline int pud_devmap(pud_t pud)
+{
+	return 0;
+}
 #endif
 
 /*
@@ -1199,6 +1203,10 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
 
 /**
  * mm_walk - callbacks for walk_page_range
+ * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry
+ *	       this handler should only handle pud_trans_huge() puds.
+ *	       the pmd_entry or pte_entry callbacks will be used for
+ *	       regular PUDs.
  * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry
  *	       this handler is required to be able to handle
  *	       pmd_trans_huge() pmds.  They may simply choose to
@@ -1218,6 +1226,8 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
  * (see the comment on walk_page_range() for more details)
  */
 struct mm_walk {
+	int (*pud_entry)(pud_t *pud, unsigned long addr,
+			 unsigned long next, struct mm_walk *walk);
 	int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
 			 unsigned long next, struct mm_walk *walk);
 	int (*pte_entry)(pte_t *pte, unsigned long addr,
@@ -1801,8 +1811,26 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
 	return ptl;
 }
 
-extern void __init pagecache_init(void);
+/*
+ * No scalability reason to split PUD locks yet, but follow the same pattern
+ * as the PMD locks to make it easier if we decide to.  The VM should not be
+ * considered ready to switch to split PUD locks yet; there may be places
+ * which need to be converted from page_table_lock.
+ */
+static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud)
+{
+	return &mm->page_table_lock;
+}
+
+static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
+{
+	spinlock_t *ptl = pud_lockptr(mm, pud);
+
+	spin_lock(ptl);
+	return ptl;
+}
 
+extern void __init pagecache_init(void);
 extern void free_area_init(unsigned long * zones_size);
 extern void free_area_init_node(int nid, unsigned long * zones_size,
 		unsigned long zone_start_pfn, unsigned long *zholes_size);
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index a1a210d59961..51891fb0d3ce 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -381,6 +381,19 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 	___pmd;								\
 })
 
+#define pudp_huge_clear_flush_notify(__vma, __haddr, __pud)		\
+({									\
+	unsigned long ___haddr = __haddr & HPAGE_PUD_MASK;		\
+	struct mm_struct *___mm = (__vma)->vm_mm;			\
+	pud_t ___pud;							\
+									\
+	___pud = pudp_huge_clear_flush(__vma, __haddr, __pud);		\
+	mmu_notifier_invalidate_range(___mm, ___haddr,			\
+				      ___haddr + HPAGE_PUD_SIZE);	\
+									\
+	___pud;								\
+})
+
 #define pmdp_huge_get_and_clear_notify(__mm, __haddr, __pmd)		\
 ({									\
 	unsigned long ___haddr = __haddr & HPAGE_PMD_MASK;		\
@@ -475,6 +488,7 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 #define pmdp_clear_young_notify pmdp_test_and_clear_young
 #define	ptep_clear_flush_notify ptep_clear_flush
 #define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
+#define pudp_huge_clear_flush_notify pudp_huge_clear_flush
 #define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear
 #define set_pte_at_notify set_pte_at
 
diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h
index 033fc7bbcefa..a49b3259cad7 100644
--- a/include/linux/pfn_t.h
+++ b/include/linux/pfn_t.h
@@ -90,6 +90,13 @@ static inline pmd_t pfn_t_pmd(pfn_t pfn, pgprot_t pgprot)
 {
 	return pfn_pmd(pfn_t_to_pfn(pfn), pgprot);
 }
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static inline pud_t pfn_t_pud(pfn_t pfn, pgprot_t pgprot)
+{
+	return pfn_pud(pfn_t_to_pfn(pfn), pgprot);
+}
+#endif
 #endif
 
 #ifdef __HAVE_ARCH_PTE_DEVMAP
@@ -106,5 +113,10 @@ static inline bool pfn_t_devmap(pfn_t pfn)
 }
 pte_t pte_mkdevmap(pte_t pte);
 pmd_t pmd_mkdevmap(pmd_t pmd);
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+pud_t pud_mkdevmap(pud_t pud);
 #endif
+#endif /* __HAVE_ARCH_PTE_DEVMAP */
+
 #endif /* _LINUX_PFN_T_H_ */
diff --git a/mm/gup.c b/mm/gup.c
index 40abe4c90383..1e67461b2733 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -253,6 +253,13 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
 			return page;
 		return no_page_table(vma, flags);
 	}
+	if (pud_devmap(*pud)) {
+		ptl = pud_lock(mm, pud);
+		page = follow_devmap_pud(vma, address, pud, flags);
+		spin_unlock(ptl);
+		if (page)
+			return page;
+	}
 	if (unlikely(pud_bad(*pud)))
 		return no_page_table(vma, flags);
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f9ecc2aeadfc..85742ac5b32e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -757,6 +757,60 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 }
 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
 
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
+{
+	if (likely(vma->vm_flags & VM_WRITE))
+		pud = pud_mkwrite(pud);
+	return pud;
+}
+
+static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
+		pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pud_t entry;
+	spinlock_t *ptl;
+
+	ptl = pud_lock(mm, pud);
+	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
+	if (pfn_t_devmap(pfn))
+		entry = pud_mkdevmap(entry);
+	if (write) {
+		entry = pud_mkyoung(pud_mkdirty(entry));
+		entry = maybe_pud_mkwrite(entry, vma);
+	}
+	set_pud_at(mm, addr, pud, entry);
+	update_mmu_cache_pud(vma, addr, pud);
+	spin_unlock(ptl);
+}
+
+int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
+			pud_t *pud, pfn_t pfn, bool write)
+{
+	pgprot_t pgprot = vma->vm_page_prot;
+	/*
+	 * If we had pud_special, we could avoid all these restrictions,
+	 * but we need to be consistent with PTEs and architectures that
+	 * can't support a 'special' bit.
+	 */
+	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
+						(VM_PFNMAP|VM_MIXEDMAP));
+	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
+	BUG_ON(!pfn_t_devmap(pfn));
+
+	if (addr < vma->vm_start || addr >= vma->vm_end)
+		return VM_FAULT_SIGBUS;
+
+	track_pfn_insert(vma, &pgprot, pfn);
+
+	insert_pfn_pud(vma, addr, pud, pfn, pgprot, write);
+	return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+
 static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
 		pmd_t *pmd)
 {
@@ -887,6 +941,123 @@ out:
 	return ret;
 }
 
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
+		pud_t *pud)
+{
+	pud_t _pud;
+
+	/*
+	 * We should set the dirty bit only for FOLL_WRITE but for now
+	 * the dirty bit in the pud is meaningless.  And if the dirty
+	 * bit will become meaningful and we'll only set it with
+	 * FOLL_WRITE, an atomic set_bit will be required on the pud to
+	 * set the young bit, instead of the current set_pud_at.
+	 */
+	_pud = pud_mkyoung(pud_mkdirty(*pud));
+	if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
+				pud, _pud,  1))
+		update_mmu_cache_pud(vma, addr, pud);
+}
+
+struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
+		pud_t *pud, int flags)
+{
+	unsigned long pfn = pud_pfn(*pud);
+	struct mm_struct *mm = vma->vm_mm;
+	struct dev_pagemap *pgmap;
+	struct page *page;
+
+	assert_spin_locked(pud_lockptr(mm, pud));
+
+	if (flags & FOLL_WRITE && !pud_write(*pud))
+		return NULL;
+
+	if (pud_present(*pud) && pud_devmap(*pud))
+		/* pass */;
+	else
+		return NULL;
+
+	if (flags & FOLL_TOUCH)
+		touch_pud(vma, addr, pud);
+
+	/*
+	 * device mapped pages can only be returned if the
+	 * caller will manage the page reference count.
+	 */
+	if (!(flags & FOLL_GET))
+		return ERR_PTR(-EEXIST);
+
+	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
+	pgmap = get_dev_pagemap(pfn, NULL);
+	if (!pgmap)
+		return ERR_PTR(-EFAULT);
+	page = pfn_to_page(pfn);
+	get_page(page);
+	put_dev_pagemap(pgmap);
+
+	return page;
+}
+
+int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+		  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
+		  struct vm_area_struct *vma)
+{
+	spinlock_t *dst_ptl, *src_ptl;
+	pud_t pud;
+	int ret;
+
+	dst_ptl = pud_lock(dst_mm, dst_pud);
+	src_ptl = pud_lockptr(src_mm, src_pud);
+	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+
+	ret = -EAGAIN;
+	pud = *src_pud;
+	if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
+		goto out_unlock;
+
+	/*
+	 * When page table lock is held, the huge zero pud should not be
+	 * under splitting since we don't split the page itself, only pud to
+	 * a page table.
+	 */
+	if (is_huge_zero_pud(pud)) {
+		/* No huge zero pud yet */
+	}
+
+	pudp_set_wrprotect(src_mm, addr, src_pud);
+	pud = pud_mkold(pud_wrprotect(pud));
+	set_pud_at(dst_mm, addr, dst_pud, pud);
+
+	ret = 0;
+out_unlock:
+	spin_unlock(src_ptl);
+	spin_unlock(dst_ptl);
+	return ret;
+}
+
+void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
+{
+	pud_t entry;
+	unsigned long haddr;
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
+
+	vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
+	if (unlikely(!pud_same(*vmf->pud, orig_pud)))
+		goto unlock;
+
+	entry = pud_mkyoung(orig_pud);
+	if (write)
+		entry = pud_mkdirty(entry);
+	haddr = vmf->address & HPAGE_PUD_MASK;
+	if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write))
+		update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud);
+
+unlock:
+	spin_unlock(vmf->ptl);
+}
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+
 void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
 {
 	pmd_t entry;
@@ -1601,6 +1772,84 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
 	return NULL;
 }
 
+/*
+ * Returns true if a given pud maps a thp, false otherwise.
+ *
+ * Note that if it returns true, this routine returns without unlocking page
+ * table lock. So callers must unlock it.
+ */
+spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
+{
+	spinlock_t *ptl;
+
+	ptl = pud_lock(vma->vm_mm, pud);
+	if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
+		return ptl;
+	spin_unlock(ptl);
+	return NULL;
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
+		 pud_t *pud, unsigned long addr)
+{
+	pud_t orig_pud;
+	spinlock_t *ptl;
+
+	ptl = __pud_trans_huge_lock(pud, vma);
+	if (!ptl)
+		return 0;
+	/*
+	 * For architectures like ppc64 we look at deposited pgtable
+	 * when calling pudp_huge_get_and_clear. So do the
+	 * pgtable_trans_huge_withdraw after finishing pudp related
+	 * operations.
+	 */
+	orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud,
+			tlb->fullmm);
+	tlb_remove_pud_tlb_entry(tlb, pud, addr);
+	if (vma_is_dax(vma)) {
+		spin_unlock(ptl);
+		/* No zero page support yet */
+	} else {
+		/* No support for anonymous PUD pages yet */
+		BUG();
+	}
+	return 1;
+}
+
+static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
+		unsigned long haddr)
+{
+	VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
+	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
+	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
+	VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
+
+	count_vm_event(THP_SPLIT_PMD);
+
+	pudp_huge_clear_flush_notify(vma, haddr, pud);
+}
+
+void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
+		unsigned long address)
+{
+	spinlock_t *ptl;
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long haddr = address & HPAGE_PUD_MASK;
+
+	mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE);
+	ptl = pud_lock(mm, pud);
+	if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
+		goto out;
+	__split_huge_pud_locked(vma, pud, haddr);
+
+out:
+	spin_unlock(ptl);
+	mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PUD_SIZE);
+}
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+
 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 		unsigned long haddr, pmd_t *pmd)
 {
diff --git a/mm/memory.c b/mm/memory.c
index e721e8eba570..41e2a2d4b2a6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1001,7 +1001,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
 		next = pmd_addr_end(addr, end);
 		if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) {
 			int err;
-			VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
+			VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
 			err = copy_huge_pmd(dst_mm, src_mm,
 					    dst_pmd, src_pmd, addr, vma);
 			if (err == -ENOMEM)
@@ -1032,6 +1032,18 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src
 	src_pud = pud_offset(src_pgd, addr);
 	do {
 		next = pud_addr_end(addr, end);
+		if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
+			int err;
+
+			VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma);
+			err = copy_huge_pud(dst_mm, src_mm,
+					    dst_pud, src_pud, addr, vma);
+			if (err == -ENOMEM)
+				return -ENOMEM;
+			if (!err)
+				continue;
+			/* fall through */
+		}
 		if (pud_none_or_clear_bad(src_pud))
 			continue;
 		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
@@ -1263,9 +1275,19 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
 	pud = pud_offset(pgd, addr);
 	do {
 		next = pud_addr_end(addr, end);
+		if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
+			if (next - addr != HPAGE_PUD_SIZE) {
+				VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
+				split_huge_pud(vma, pud, addr);
+			} else if (zap_huge_pud(tlb, vma, pud, addr))
+				goto next;
+			/* fall through */
+		}
 		if (pud_none_or_clear_bad(pud))
 			continue;
 		next = zap_pmd_range(tlb, vma, pud, addr, next, details);
+next:
+		cond_resched();
 	} while (pud++, addr = next, addr != end);
 
 	return addr;
@@ -3490,6 +3512,30 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
 	return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
 }
 
+static int create_huge_pud(struct vm_fault *vmf)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	/* No support for anonymous transparent PUD pages yet */
+	if (vma_is_anonymous(vmf->vma))
+		return VM_FAULT_FALLBACK;
+	if (vmf->vma->vm_ops->huge_fault)
+		return vmf->vma->vm_ops->huge_fault(vmf);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+	return VM_FAULT_FALLBACK;
+}
+
+static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	/* No support for anonymous transparent PUD pages yet */
+	if (vma_is_anonymous(vmf->vma))
+		return VM_FAULT_FALLBACK;
+	if (vmf->vma->vm_ops->huge_fault)
+		return vmf->vma->vm_ops->huge_fault(vmf);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+	return VM_FAULT_FALLBACK;
+}
+
 /*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most
@@ -3605,14 +3651,41 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 	};
 	struct mm_struct *mm = vma->vm_mm;
 	pgd_t *pgd;
-	pud_t *pud;
 	int ret;
 
 	pgd = pgd_offset(mm, address);
-	pud = pud_alloc(mm, pgd, address);
-	if (!pud)
+
+	vmf.pud = pud_alloc(mm, pgd, address);
+	if (!vmf.pud)
 		return VM_FAULT_OOM;
-	vmf.pmd = pmd_alloc(mm, pud, address);
+	if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) {
+		vmf.flags |= FAULT_FLAG_SIZE_PUD;
+		ret = create_huge_pud(&vmf);
+		if (!(ret & VM_FAULT_FALLBACK))
+			return ret;
+	} else {
+		pud_t orig_pud = *vmf.pud;
+
+		barrier();
+		if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
+			unsigned int dirty = flags & FAULT_FLAG_WRITE;
+
+			vmf.flags |= FAULT_FLAG_SIZE_PUD;
+
+			/* NUMA case for anonymous PUDs would go here */
+
+			if (dirty && !pud_write(orig_pud)) {
+				ret = wp_huge_pud(&vmf, orig_pud);
+				if (!(ret & VM_FAULT_FALLBACK))
+					return ret;
+			} else {
+				huge_pud_set_accessed(&vmf, orig_pud);
+				return 0;
+			}
+		}
+	}
+
+	vmf.pmd = pmd_alloc(mm, vmf.pud, address);
 	if (!vmf.pmd)
 		return VM_FAULT_OOM;
 	if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
@@ -3743,13 +3816,14 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
  */
 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 {
+	spinlock_t *ptl;
 	pmd_t *new = pmd_alloc_one(mm, address);
 	if (!new)
 		return -ENOMEM;
 
 	smp_wmb(); /* See comment in __pte_alloc */
 
-	spin_lock(&mm->page_table_lock);
+	ptl = pud_lock(mm, pud);
 #ifndef __ARCH_HAS_4LEVEL_HACK
 	if (!pud_present(*pud)) {
 		mm_inc_nr_pmds(mm);
@@ -3763,7 +3837,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 	} else /* Another has populated it */
 		pmd_free(mm, new);
 #endif /* __ARCH_HAS_4LEVEL_HACK */
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
 	return 0;
 }
 #endif /* __PAGETABLE_PMD_FOLDED */
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 207244489a68..03761577ae86 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -78,14 +78,32 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
 
 	pud = pud_offset(pgd, addr);
 	do {
+ again:
 		next = pud_addr_end(addr, end);
-		if (pud_none_or_clear_bad(pud)) {
+		if (pud_none(*pud) || !walk->vma) {
 			if (walk->pte_hole)
 				err = walk->pte_hole(addr, next, walk);
 			if (err)
 				break;
 			continue;
 		}
+
+		if (walk->pud_entry) {
+			spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);
+
+			if (ptl) {
+				err = walk->pud_entry(pud, addr, next, walk);
+				spin_unlock(ptl);
+				if (err)
+					break;
+				continue;
+			}
+		}
+
+		split_huge_pud(walk->vma, pud, addr);
+		if (pud_none(*pud))
+			goto again;
+
 		if (walk->pmd_entry || walk->pte_entry)
 			err = walk_pmd_range(pud, addr, next, walk);
 		if (err)
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 71c5f9109f2a..4ed5908c65b0 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -123,6 +123,20 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
 	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 	return pmd;
 }
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
+			    pud_t *pudp)
+{
+	pud_t pud;
+
+	VM_BUG_ON(address & ~HPAGE_PUD_MASK);
+	VM_BUG_ON(!pud_trans_huge(*pudp) && !pud_devmap(*pudp));
+	pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp);
+	flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
+	return pud;
+}
+#endif
 #endif
 
 #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
-- 
cgit v1.2.3


From c791ace1e747371658237f0d30234fef56c39669 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Fri, 24 Feb 2017 14:57:08 -0800
Subject: mm: replace FAULT_FLAG_SIZE with parameter to huge_fault

Since the introduction of FAULT_FLAG_SIZE to the vm_fault flag, it has
been somewhat painful with getting the flags set and removed at the
correct locations.  More than one kernel oops was introduced due to
difficulties of getting the placement correctly.

Remove the flag values and introduce an input parameter to huge_fault
that indicates the size of the page entry.  This makes the code easier
to trace and should avoid the issues we see with the fault flags where
removal of the flag was necessary in the fallback paths.

Link: http://lkml.kernel.org/r/148615748258.43180.1690152053774975329.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Tested-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Nilesh Choudhury <nilesh.choudhury@oracle.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/dax/dax.c   | 18 ++++++++++++------
 fs/dax.c            |  9 +++++----
 fs/ext2/file.c      |  2 +-
 fs/ext4/file.c      | 12 +++++++++---
 fs/xfs/xfs_file.c   |  9 +++++----
 include/linux/dax.h |  3 ++-
 include/linux/mm.h  | 14 ++++++++------
 mm/memory.c         | 17 ++++-------------
 8 files changed, 46 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index b90bb301bda0..b75c77254fdb 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -538,7 +538,8 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 }
 #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
-static int dax_dev_fault(struct vm_fault *vmf)
+static int dax_dev_huge_fault(struct vm_fault *vmf,
+		enum page_entry_size pe_size)
 {
 	int rc;
 	struct file *filp = vmf->vma->vm_file;
@@ -550,14 +551,14 @@ static int dax_dev_fault(struct vm_fault *vmf)
 			vmf->vma->vm_start, vmf->vma->vm_end);
 
 	rcu_read_lock();
-	switch (vmf->flags & FAULT_FLAG_SIZE_MASK) {
-	case FAULT_FLAG_SIZE_PTE:
+	switch (pe_size) {
+	case PE_SIZE_PTE:
 		rc = __dax_dev_pte_fault(dax_dev, vmf);
 		break;
-	case FAULT_FLAG_SIZE_PMD:
+	case PE_SIZE_PMD:
 		rc = __dax_dev_pmd_fault(dax_dev, vmf);
 		break;
-	case FAULT_FLAG_SIZE_PUD:
+	case PE_SIZE_PUD:
 		rc = __dax_dev_pud_fault(dax_dev, vmf);
 		break;
 	default:
@@ -568,9 +569,14 @@ static int dax_dev_fault(struct vm_fault *vmf)
 	return rc;
 }
 
+static int dax_dev_fault(struct vm_fault *vmf)
+{
+	return dax_dev_huge_fault(vmf, PE_SIZE_PTE);
+}
+
 static const struct vm_operations_struct dax_dev_vm_ops = {
 	.fault = dax_dev_fault,
-	.huge_fault = dax_dev_fault,
+	.huge_fault = dax_dev_huge_fault,
 };
 
 static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
diff --git a/fs/dax.c b/fs/dax.c
index c3c29fbf64be..5ae8b71ebadc 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1452,12 +1452,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops)
  * has done all the necessary locking for page fault to proceed
  * successfully.
  */
-int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops)
+int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+		    const struct iomap_ops *ops)
 {
-	switch (vmf->flags & FAULT_FLAG_SIZE_MASK) {
-	case FAULT_FLAG_SIZE_PTE:
+	switch (pe_size) {
+	case PE_SIZE_PTE:
 		return dax_iomap_pte_fault(vmf, ops);
-	case FAULT_FLAG_SIZE_PMD:
+	case PE_SIZE_PMD:
 		return dax_iomap_pmd_fault(vmf, ops);
 	default:
 		return VM_FAULT_FALLBACK;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 68738832beda..b21891a6bfca 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -99,7 +99,7 @@ static int ext2_dax_fault(struct vm_fault *vmf)
 	}
 	down_read(&ei->dax_sem);
 
-	ret = dax_iomap_fault(vmf, &ext2_iomap_ops);
+	ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &ext2_iomap_ops);
 
 	up_read(&ei->dax_sem);
 	if (vmf->flags & FAULT_FLAG_WRITE)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 502d2d07d191..8210c1f43556 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -253,7 +253,8 @@ out:
 }
 
 #ifdef CONFIG_FS_DAX
-static int ext4_dax_fault(struct vm_fault *vmf)
+static int ext4_dax_huge_fault(struct vm_fault *vmf,
+		enum page_entry_size pe_size)
 {
 	int result;
 	struct inode *inode = file_inode(vmf->vma->vm_file);
@@ -265,7 +266,7 @@ static int ext4_dax_fault(struct vm_fault *vmf)
 		file_update_time(vmf->vma->vm_file);
 	}
 	down_read(&EXT4_I(inode)->i_mmap_sem);
-	result = dax_iomap_fault(vmf, &ext4_iomap_ops);
+	result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops);
 	up_read(&EXT4_I(inode)->i_mmap_sem);
 	if (write)
 		sb_end_pagefault(sb);
@@ -273,6 +274,11 @@ static int ext4_dax_fault(struct vm_fault *vmf)
 	return result;
 }
 
+static int ext4_dax_fault(struct vm_fault *vmf)
+{
+	return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
+}
+
 /*
  * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
  * handler we check for races agaist truncate. Note that since we cycle through
@@ -305,7 +311,7 @@ static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf)
 
 static const struct vm_operations_struct ext4_dax_vm_ops = {
 	.fault		= ext4_dax_fault,
-	.huge_fault	= ext4_dax_fault,
+	.huge_fault	= ext4_dax_huge_fault,
 	.page_mkwrite	= ext4_dax_fault,
 	.pfn_mkwrite	= ext4_dax_pfn_mkwrite,
 };
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 990e03819370..a50eca676670 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1391,7 +1391,7 @@ xfs_filemap_page_mkwrite(
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	if (IS_DAX(inode)) {
-		ret = dax_iomap_fault(vmf, &xfs_iomap_ops);
+		ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
 	} else {
 		ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
 		ret = block_page_mkwrite_return(ret);
@@ -1418,7 +1418,7 @@ xfs_filemap_fault(
 
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 	if (IS_DAX(inode))
-		ret = dax_iomap_fault(vmf, &xfs_iomap_ops);
+		ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
 	else
 		ret = filemap_fault(vmf);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1435,7 +1435,8 @@ xfs_filemap_fault(
  */
 STATIC int
 xfs_filemap_huge_fault(
-	struct vm_fault		*vmf)
+	struct vm_fault		*vmf,
+	enum page_entry_size	pe_size)
 {
 	struct inode		*inode = file_inode(vmf->vma->vm_file);
 	struct xfs_inode	*ip = XFS_I(inode);
@@ -1452,7 +1453,7 @@ xfs_filemap_huge_fault(
 	}
 
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	ret = dax_iomap_fault(vmf, &xfs_iomap_ops);
+	ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	if (vmf->flags & FAULT_FLAG_WRITE)
diff --git a/include/linux/dax.h b/include/linux/dax.h
index cf9af225962b..d8a3dc042e1c 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -38,7 +38,8 @@ static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
 
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops);
-int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops);
+int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+		    const struct iomap_ops *ops);
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index);
 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d8b75d7d6a9e..c65aa43b5712 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -285,11 +285,6 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_REMOTE	0x80	/* faulting for non current tsk/mm */
 #define FAULT_FLAG_INSTRUCTION  0x100	/* The fault was during an instruction fetch */
 
-#define FAULT_FLAG_SIZE_MASK	0x7000	/* Support up to 8-level page tables */
-#define FAULT_FLAG_SIZE_PTE	0x0000	/* First level (eg 4k) */
-#define FAULT_FLAG_SIZE_PMD	0x1000	/* Second level (eg 2MB) */
-#define FAULT_FLAG_SIZE_PUD	0x2000	/* Third level (eg 1GB) */
-
 #define FAULT_FLAG_TRACE \
 	{ FAULT_FLAG_WRITE,		"WRITE" }, \
 	{ FAULT_FLAG_MKWRITE,		"MKWRITE" }, \
@@ -349,6 +344,13 @@ struct vm_fault {
 					 */
 };
 
+/* page entry size for vm->huge_fault() */
+enum page_entry_size {
+	PE_SIZE_PTE = 0,
+	PE_SIZE_PMD,
+	PE_SIZE_PUD,
+};
+
 /*
  * These are the virtual MM functions - opening of an area, closing and
  * unmapping it (needed to keep files on disk up-to-date etc), pointer
@@ -359,7 +361,7 @@ struct vm_operations_struct {
 	void (*close)(struct vm_area_struct * area);
 	int (*mremap)(struct vm_area_struct * area);
 	int (*fault)(struct vm_fault *vmf);
-	int (*huge_fault)(struct vm_fault *vmf);
+	int (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size);
 	void (*map_pages)(struct vm_fault *vmf,
 			pgoff_t start_pgoff, pgoff_t end_pgoff);
 
diff --git a/mm/memory.c b/mm/memory.c
index 41e2a2d4b2a6..6040b74d02a2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3489,7 +3489,7 @@ static int create_huge_pmd(struct vm_fault *vmf)
 	if (vma_is_anonymous(vmf->vma))
 		return do_huge_pmd_anonymous_page(vmf);
 	if (vmf->vma->vm_ops->huge_fault)
-		return vmf->vma->vm_ops->huge_fault(vmf);
+		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
 	return VM_FAULT_FALLBACK;
 }
 
@@ -3498,7 +3498,7 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
 	if (vma_is_anonymous(vmf->vma))
 		return do_huge_pmd_wp_page(vmf, orig_pmd);
 	if (vmf->vma->vm_ops->huge_fault)
-		return vmf->vma->vm_ops->huge_fault(vmf);
+		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
 
 	/* COW handled on pte level: split pmd */
 	VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
@@ -3519,7 +3519,7 @@ static int create_huge_pud(struct vm_fault *vmf)
 	if (vma_is_anonymous(vmf->vma))
 		return VM_FAULT_FALLBACK;
 	if (vmf->vma->vm_ops->huge_fault)
-		return vmf->vma->vm_ops->huge_fault(vmf);
+		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 	return VM_FAULT_FALLBACK;
 }
@@ -3531,7 +3531,7 @@ static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
 	if (vma_is_anonymous(vmf->vma))
 		return VM_FAULT_FALLBACK;
 	if (vmf->vma->vm_ops->huge_fault)
-		return vmf->vma->vm_ops->huge_fault(vmf);
+		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 	return VM_FAULT_FALLBACK;
 }
@@ -3659,7 +3659,6 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 	if (!vmf.pud)
 		return VM_FAULT_OOM;
 	if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) {
-		vmf.flags |= FAULT_FLAG_SIZE_PUD;
 		ret = create_huge_pud(&vmf);
 		if (!(ret & VM_FAULT_FALLBACK))
 			return ret;
@@ -3670,8 +3669,6 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 		if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
 			unsigned int dirty = flags & FAULT_FLAG_WRITE;
 
-			vmf.flags |= FAULT_FLAG_SIZE_PUD;
-
 			/* NUMA case for anonymous PUDs would go here */
 
 			if (dirty && !pud_write(orig_pud)) {
@@ -3689,18 +3686,14 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 	if (!vmf.pmd)
 		return VM_FAULT_OOM;
 	if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
-		vmf.flags |= FAULT_FLAG_SIZE_PMD;
 		ret = create_huge_pmd(&vmf);
 		if (!(ret & VM_FAULT_FALLBACK))
 			return ret;
-		/* fall through path, remove PMD flag */
-		vmf.flags &= ~FAULT_FLAG_SIZE_PMD;
 	} else {
 		pmd_t orig_pmd = *vmf.pmd;
 
 		barrier();
 		if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
-			vmf.flags |= FAULT_FLAG_SIZE_PMD;
 			if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
 				return do_huge_pmd_numa_page(&vmf, orig_pmd);
 
@@ -3709,8 +3702,6 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 				ret = wp_huge_pmd(&vmf, orig_pmd);
 				if (!(ret & VM_FAULT_FALLBACK))
 					return ret;
-				/* fall through path, remove PUD flag */
-				vmf.flags &= ~FAULT_FLAG_SIZE_PUD;
 			} else {
 				huge_pmd_set_accessed(&vmf, orig_pmd);
 				return 0;
-- 
cgit v1.2.3


From 9e5bcd610ffcedf5e485e78a72762810b25c7f25 Mon Sep 17 00:00:00 2001
From: Yisheng Xie <xieyisheng1@huawei.com>
Date: Fri, 24 Feb 2017 14:57:29 -0800
Subject: mm/migration: make isolate_movable_page() return int type

Patch series "HWPOISON: soft offlining for non-lru movable page", v6.

After Minchan's commit bda807d44454 ("mm: migrate: support non-lru
movable page migration"), some type of non-lru page like zsmalloc and
virtio-balloon page also support migration.

Therefore, we can:

1) soft offlining no-lru movable pages, which means when memory
   corrected errors occur on a non-lru movable page, we can stop to use
   it by migrating data onto another page and disable the original
   (maybe half-broken) one.

2) enable memory hotplug for non-lru movable pages, i.e. we may offline
   blocks, which include such pages, by using non-lru page migration.

This patchset is heavily dependent on non-lru movable page migration.

This patch (of 4):

Change the return type of isolate_movable_page() from bool to int.  It
will return 0 when isolate movable page successfully, and return -EBUSY
when it isolates failed.

There is no functional change within this patch but prepare for later
patch.

[xieyisheng1@huawei.com: v6]
  Link: http://lkml.kernel.org/r/1486108770-630-2-git-send-email-xieyisheng1@huawei.com
Link: http://lkml.kernel.org/r/1485867981-16037-2-git-send-email-ysxie@foxmail.com
Signed-off-by: Yisheng Xie <xieyisheng1@huawei.com>
Suggested-by: Michal Hocko <mhocko@kernel.org>
Acked-by: Minchan Kim <minchan@kernel.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Reza Arbab <arbab@linux.vnet.ibm.com>
Cc: Taku Izumi <izumi.taku@jp.fujitsu.com>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/migrate.h | 2 +-
 mm/compaction.c         | 2 +-
 mm/migrate.c            | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index ae8d475a9385..43d5deb0c4cc 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -37,7 +37,7 @@ extern int migrate_page(struct address_space *,
 			struct page *, struct page *, enum migrate_mode);
 extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
 		unsigned long private, enum migrate_mode mode, int reason);
-extern bool isolate_movable_page(struct page *page, isolate_mode_t mode);
+extern int isolate_movable_page(struct page *page, isolate_mode_t mode);
 extern void putback_movable_page(struct page *page);
 
 extern int migrate_prep(void);
diff --git a/mm/compaction.c b/mm/compaction.c
index 0aa2757399ee..0fdfde016ee2 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -802,7 +802,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 					locked = false;
 				}
 
-				if (isolate_movable_page(page, isolate_mode))
+				if (!isolate_movable_page(page, isolate_mode))
 					goto isolate_success;
 			}
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 87f4d0f81819..6807174e0715 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -74,7 +74,7 @@ int migrate_prep_local(void)
 	return 0;
 }
 
-bool isolate_movable_page(struct page *page, isolate_mode_t mode)
+int isolate_movable_page(struct page *page, isolate_mode_t mode)
 {
 	struct address_space *mapping;
 
@@ -125,14 +125,14 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode)
 	__SetPageIsolated(page);
 	unlock_page(page);
 
-	return true;
+	return 0;
 
 out_no_isolated:
 	unlock_page(page);
 out_putpage:
 	put_page(page);
 out:
-	return false;
+	return -EBUSY;
 }
 
 /* It should be called on page which is PG_movable */
-- 
cgit v1.2.3


From cbae0170e5329c79c0a36a81fedd2800146aca12 Mon Sep 17 00:00:00 2001
From: Yisheng Xie <xieyisheng1@huawei.com>
Date: Fri, 24 Feb 2017 14:57:32 -0800
Subject: mm/migration: make isolate_movable_page always defined

Define isolate_movable_page as a static inline function when
CONFIG_MIGRATION is not enable.  It should return -EBUSY here which
means failed to isolate movable pages.

This patch do not have any functional change but prepare for later
patch.

Link: http://lkml.kernel.org/r/1485867981-16037-3-git-send-email-ysxie@foxmail.com
Signed-off-by: Yisheng Xie <xieyisheng1@huawei.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Suggested-by: Michal Hocko <mhocko@kernel.org>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Reza Arbab <arbab@linux.vnet.ibm.com>
Cc: Taku Izumi <izumi.taku@jp.fujitsu.com>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/migrate.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 43d5deb0c4cc..fa76b516fa47 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -56,6 +56,8 @@ static inline int migrate_pages(struct list_head *l, new_page_t new,
 		free_page_t free, unsigned long private, enum migrate_mode mode,
 		int reason)
 	{ return -ENOSYS; }
+static inline int isolate_movable_page(struct page *page, isolate_mode_t mode)
+	{ return -EBUSY; }
 
 static inline int migrate_prep(void) { return -ENOSYS; }
 static inline int migrate_prep_local(void) { return -ENOSYS; }
-- 
cgit v1.2.3


From ace71a19cec5eb430207c3269d8a2683f0574306 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Fri, 24 Feb 2017 14:57:45 -0800
Subject: mm: introduce page_vma_mapped_walk()

Introduce a new interface to check if a page is mapped into a vma.  It
aims to address shortcomings of page_check_address{,_transhuge}.

Existing interface is not able to handle PTE-mapped THPs: it only finds
the first PTE.  The rest lefted unnoticed.

page_vma_mapped_walk() iterates over all possible mapping of the page in
the vma.

Link: http://lkml.kernel.org/r/20170129173858.45174-3-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rmap.h |  26 +++++++
 mm/Makefile          |   6 +-
 mm/huge_memory.c     |   9 ++-
 mm/page_vma_mapped.c | 188 +++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 224 insertions(+), 5 deletions(-)
 create mode 100644 mm/page_vma_mapped.c

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 15321fb1df6b..b76343610653 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -9,6 +9,7 @@
 #include <linux/mm.h>
 #include <linux/rwsem.h>
 #include <linux/memcontrol.h>
+#include <linux/highmem.h>
 
 /*
  * The anon_vma heads a list of private "related" vmas, to scan if
@@ -232,6 +233,31 @@ static inline bool page_check_address_transhuge(struct page *page,
 }
 #endif
 
+/* Avoid racy checks */
+#define PVMW_SYNC		(1 << 0)
+/* Look for migarion entries rather than present PTEs */
+#define PVMW_MIGRATION		(1 << 1)
+
+struct page_vma_mapped_walk {
+	struct page *page;
+	struct vm_area_struct *vma;
+	unsigned long address;
+	pmd_t *pmd;
+	pte_t *pte;
+	spinlock_t *ptl;
+	unsigned int flags;
+};
+
+static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
+{
+	if (pvmw->pte)
+		pte_unmap(pvmw->pte);
+	if (pvmw->ptl)
+		spin_unlock(pvmw->ptl);
+}
+
+bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
+
 /*
  * Used by swapoff to help locate where page is expected in vma.
  */
diff --git a/mm/Makefile b/mm/Makefile
index 433eaf9a876e..aa0aa17cb413 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -23,8 +23,10 @@ KCOV_INSTRUMENT_vmstat.o := n
 
 mmu-y			:= nommu.o
 mmu-$(CONFIG_MMU)	:= gup.o highmem.o memory.o mincore.o \
-			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
-			   vmalloc.o pagewalk.o pgtable-generic.o
+			   mlock.o mmap.o mprotect.o mremap.o msync.o \
+			   page_vma_mapped.o pagewalk.o pgtable-generic.o \
+			   rmap.o vmalloc.o
+
 
 ifdef CONFIG_CROSS_MEMORY_ATTACH
 mmu-$(CONFIG_MMU)	+= process_vm_access.o
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 85742ac5b32e..a7bac4f2b78a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2129,9 +2129,12 @@ static void freeze_page(struct page *page)
 static void unfreeze_page(struct page *page)
 {
 	int i;
-
-	for (i = 0; i < HPAGE_PMD_NR; i++)
-		remove_migration_ptes(page + i, page + i, true);
+	if (PageTransHuge(page)) {
+		remove_migration_ptes(page, page, true);
+	} else {
+		for (i = 0; i < HPAGE_PMD_NR; i++)
+			remove_migration_ptes(page + i, page + i, true);
+	}
 }
 
 static void __split_huge_page_tail(struct page *head, int tail,
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
new file mode 100644
index 000000000000..dc1a54826cf2
--- /dev/null
+++ b/mm/page_vma_mapped.c
@@ -0,0 +1,188 @@
+#include <linux/mm.h>
+#include <linux/rmap.h>
+#include <linux/hugetlb.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+
+#include "internal.h"
+
+static inline bool check_pmd(struct page_vma_mapped_walk *pvmw)
+{
+	pmd_t pmde;
+	/*
+	 * Make sure we don't re-load pmd between present and !trans_huge check.
+	 * We need a consistent view.
+	 */
+	pmde = READ_ONCE(*pvmw->pmd);
+	return pmd_present(pmde) && !pmd_trans_huge(pmde);
+}
+
+static inline bool not_found(struct page_vma_mapped_walk *pvmw)
+{
+	page_vma_mapped_walk_done(pvmw);
+	return false;
+}
+
+static bool map_pte(struct page_vma_mapped_walk *pvmw)
+{
+	pvmw->pte = pte_offset_map(pvmw->pmd, pvmw->address);
+	if (!(pvmw->flags & PVMW_SYNC)) {
+		if (pvmw->flags & PVMW_MIGRATION) {
+			if (!is_swap_pte(*pvmw->pte))
+				return false;
+		} else {
+			if (!pte_present(*pvmw->pte))
+				return false;
+		}
+	}
+	pvmw->ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd);
+	spin_lock(pvmw->ptl);
+	return true;
+}
+
+static bool check_pte(struct page_vma_mapped_walk *pvmw)
+{
+	if (pvmw->flags & PVMW_MIGRATION) {
+#ifdef CONFIG_MIGRATION
+		swp_entry_t entry;
+		if (!is_swap_pte(*pvmw->pte))
+			return false;
+		entry = pte_to_swp_entry(*pvmw->pte);
+		if (!is_migration_entry(entry))
+			return false;
+		if (migration_entry_to_page(entry) - pvmw->page >=
+				hpage_nr_pages(pvmw->page)) {
+			return false;
+		}
+		if (migration_entry_to_page(entry) < pvmw->page)
+			return false;
+#else
+		WARN_ON_ONCE(1);
+#endif
+	} else {
+		if (!pte_present(*pvmw->pte))
+			return false;
+
+		/* THP can be referenced by any subpage */
+		if (pte_page(*pvmw->pte) - pvmw->page >=
+				hpage_nr_pages(pvmw->page)) {
+			return false;
+		}
+		if (pte_page(*pvmw->pte) < pvmw->page)
+			return false;
+	}
+
+	return true;
+}
+
+/**
+ * page_vma_mapped_walk - check if @pvmw->page is mapped in @pvmw->vma at
+ * @pvmw->address
+ * @pvmw: pointer to struct page_vma_mapped_walk. page, vma, address and flags
+ * must be set. pmd, pte and ptl must be NULL.
+ *
+ * Returns true if the page is mapped in the vma. @pvmw->pmd and @pvmw->pte point
+ * to relevant page table entries. @pvmw->ptl is locked. @pvmw->address is
+ * adjusted if needed (for PTE-mapped THPs).
+ *
+ * If @pvmw->pmd is set but @pvmw->pte is not, you have found PMD-mapped page
+ * (usually THP). For PTE-mapped THP, you should run page_vma_mapped_walk() in
+ * a loop to find all PTEs that map the THP.
+ *
+ * For HugeTLB pages, @pvmw->pte is set to the relevant page table entry
+ * regardless of which page table level the page is mapped at. @pvmw->pmd is
+ * NULL.
+ *
+ * Retruns false if there are no more page table entries for the page in
+ * the vma. @pvmw->ptl is unlocked and @pvmw->pte is unmapped.
+ *
+ * If you need to stop the walk before page_vma_mapped_walk() returned false,
+ * use page_vma_mapped_walk_done(). It will do the housekeeping.
+ */
+bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
+{
+	struct mm_struct *mm = pvmw->vma->vm_mm;
+	struct page *page = pvmw->page;
+	pgd_t *pgd;
+	pud_t *pud;
+
+	/* The only possible pmd mapping has been handled on last iteration */
+	if (pvmw->pmd && !pvmw->pte)
+		return not_found(pvmw);
+
+	/* Only for THP, seek to next pte entry makes sense */
+	if (pvmw->pte) {
+		if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page))
+			return not_found(pvmw);
+		goto next_pte;
+	}
+
+	if (unlikely(PageHuge(pvmw->page))) {
+		/* when pud is not present, pte will be NULL */
+		pvmw->pte = huge_pte_offset(mm, pvmw->address);
+		if (!pvmw->pte)
+			return false;
+
+		pvmw->ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte);
+		spin_lock(pvmw->ptl);
+		if (!check_pte(pvmw))
+			return not_found(pvmw);
+		return true;
+	}
+restart:
+	pgd = pgd_offset(mm, pvmw->address);
+	if (!pgd_present(*pgd))
+		return false;
+	pud = pud_offset(pgd, pvmw->address);
+	if (!pud_present(*pud))
+		return false;
+	pvmw->pmd = pmd_offset(pud, pvmw->address);
+	if (pmd_trans_huge(*pvmw->pmd)) {
+		pvmw->ptl = pmd_lock(mm, pvmw->pmd);
+		if (!pmd_present(*pvmw->pmd))
+			return not_found(pvmw);
+		if (likely(pmd_trans_huge(*pvmw->pmd))) {
+			if (pvmw->flags & PVMW_MIGRATION)
+				return not_found(pvmw);
+			if (pmd_page(*pvmw->pmd) != page)
+				return not_found(pvmw);
+			return true;
+		} else {
+			/* THP pmd was split under us: handle on pte level */
+			spin_unlock(pvmw->ptl);
+			pvmw->ptl = NULL;
+		}
+	} else {
+		if (!check_pmd(pvmw))
+			return false;
+	}
+	if (!map_pte(pvmw))
+		goto next_pte;
+	while (1) {
+		if (check_pte(pvmw))
+			return true;
+next_pte:	do {
+			pvmw->address += PAGE_SIZE;
+			if (pvmw->address >=
+					__vma_address(pvmw->page, pvmw->vma) +
+					hpage_nr_pages(pvmw->page) * PAGE_SIZE)
+				return not_found(pvmw);
+			/* Did we cross page table boundary? */
+			if (pvmw->address % PMD_SIZE == 0) {
+				pte_unmap(pvmw->pte);
+				if (pvmw->ptl) {
+					spin_unlock(pvmw->ptl);
+					pvmw->ptl = NULL;
+				}
+				goto restart;
+			} else {
+				pvmw->pte++;
+			}
+		} while (pte_none(*pvmw->pte));
+
+		if (!pvmw->ptl) {
+			pvmw->ptl = pte_lockptr(mm, pvmw->pmd);
+			spin_lock(pvmw->ptl);
+		}
+	}
+}
-- 
cgit v1.2.3


From d53a8b49a626fdfce4390710da6d04b4314db25f Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Fri, 24 Feb 2017 14:58:13 -0800
Subject: mm: drop page_check_address{,_transhuge}

All users are gone. Let's drop them.

Link: http://lkml.kernel.org/r/20170129173858.45174-12-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rmap.h |  36 --------------
 mm/rmap.c            | 138 ---------------------------------------------------
 2 files changed, 174 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b76343610653..8c89e902df3e 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -197,42 +197,6 @@ int page_referenced(struct page *, int is_locked,
 
 int try_to_unmap(struct page *, enum ttu_flags flags);
 
-/*
- * Used by uprobes to replace a userspace page safely
- */
-pte_t *__page_check_address(struct page *, struct mm_struct *,
-				unsigned long, spinlock_t **, int);
-
-static inline pte_t *page_check_address(struct page *page, struct mm_struct *mm,
-					unsigned long address,
-					spinlock_t **ptlp, int sync)
-{
-	pte_t *ptep;
-
-	__cond_lock(*ptlp, ptep = __page_check_address(page, mm, address,
-						       ptlp, sync));
-	return ptep;
-}
-
-/*
- * Used by idle page tracking to check if a page was referenced via page
- * tables.
- */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-bool page_check_address_transhuge(struct page *page, struct mm_struct *mm,
-				  unsigned long address, pmd_t **pmdp,
-				  pte_t **ptep, spinlock_t **ptlp);
-#else
-static inline bool page_check_address_transhuge(struct page *page,
-				struct mm_struct *mm, unsigned long address,
-				pmd_t **pmdp, pte_t **ptep, spinlock_t **ptlp)
-{
-	*ptep = page_check_address(page, mm, address, ptlp, 0);
-	*pmdp = NULL;
-	return !!*ptep;
-}
-#endif
-
 /* Avoid racy checks */
 #define PVMW_SYNC		(1 << 0)
 /* Look for migarion entries rather than present PTEs */
diff --git a/mm/rmap.c b/mm/rmap.c
index 80525820aada..8774791e2809 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -708,144 +708,6 @@ out:
 	return pmd;
 }
 
-/*
- * Check that @page is mapped at @address into @mm.
- *
- * If @sync is false, page_check_address may perform a racy check to avoid
- * the page table lock when the pte is not present (helpful when reclaiming
- * highly shared pages).
- *
- * On success returns with pte mapped and locked.
- */
-pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
-			  unsigned long address, spinlock_t **ptlp, int sync)
-{
-	pmd_t *pmd;
-	pte_t *pte;
-	spinlock_t *ptl;
-
-	if (unlikely(PageHuge(page))) {
-		/* when pud is not present, pte will be NULL */
-		pte = huge_pte_offset(mm, address);
-		if (!pte)
-			return NULL;
-
-		ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
-		goto check;
-	}
-
-	pmd = mm_find_pmd(mm, address);
-	if (!pmd)
-		return NULL;
-
-	pte = pte_offset_map(pmd, address);
-	/* Make a quick check before getting the lock */
-	if (!sync && !pte_present(*pte)) {
-		pte_unmap(pte);
-		return NULL;
-	}
-
-	ptl = pte_lockptr(mm, pmd);
-check:
-	spin_lock(ptl);
-	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
-		*ptlp = ptl;
-		return pte;
-	}
-	pte_unmap_unlock(pte, ptl);
-	return NULL;
-}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-/*
- * Check that @page is mapped at @address into @mm. In contrast to
- * page_check_address(), this function can handle transparent huge pages.
- *
- * On success returns true with pte mapped and locked. For PMD-mapped
- * transparent huge pages *@ptep is set to NULL.
- */
-bool page_check_address_transhuge(struct page *page, struct mm_struct *mm,
-				  unsigned long address, pmd_t **pmdp,
-				  pte_t **ptep, spinlock_t **ptlp)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-	spinlock_t *ptl;
-
-	if (unlikely(PageHuge(page))) {
-		/* when pud is not present, pte will be NULL */
-		pte = huge_pte_offset(mm, address);
-		if (!pte)
-			return false;
-
-		ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
-		pmd = NULL;
-		goto check_pte;
-	}
-
-	pgd = pgd_offset(mm, address);
-	if (!pgd_present(*pgd))
-		return false;
-	pud = pud_offset(pgd, address);
-	if (!pud_present(*pud))
-		return false;
-	pmd = pmd_offset(pud, address);
-
-	if (pmd_trans_huge(*pmd)) {
-		ptl = pmd_lock(mm, pmd);
-		if (!pmd_present(*pmd))
-			goto unlock_pmd;
-		if (unlikely(!pmd_trans_huge(*pmd))) {
-			spin_unlock(ptl);
-			goto map_pte;
-		}
-
-		if (pmd_page(*pmd) != page)
-			goto unlock_pmd;
-
-		pte = NULL;
-		goto found;
-unlock_pmd:
-		spin_unlock(ptl);
-		return false;
-	} else {
-		pmd_t pmde = *pmd;
-
-		barrier();
-		if (!pmd_present(pmde) || pmd_trans_huge(pmde))
-			return false;
-	}
-map_pte:
-	pte = pte_offset_map(pmd, address);
-	if (!pte_present(*pte)) {
-		pte_unmap(pte);
-		return false;
-	}
-
-	ptl = pte_lockptr(mm, pmd);
-check_pte:
-	spin_lock(ptl);
-
-	if (!pte_present(*pte)) {
-		pte_unmap_unlock(pte, ptl);
-		return false;
-	}
-
-	/* THP can be referenced by any subpage */
-	if (pte_pfn(*pte) - page_to_pfn(page) >= hpage_nr_pages(page)) {
-		pte_unmap_unlock(pte, ptl);
-		return false;
-	}
-found:
-	*ptep = pte;
-	*pmdp = pmd;
-	*ptlp = ptl;
-	return true;
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
 struct page_referenced_arg {
 	int mapcount;
 	int referenced;
-- 
cgit v1.2.3


From 897ab3e0c49e24b62e2d54d165c7afec6bbca65b Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.vnet.ibm.com>
Date: Fri, 24 Feb 2017 14:58:22 -0800
Subject: userfaultfd: non-cooperative: add event for memory unmaps

When a non-cooperative userfaultfd monitor copies pages in the
background, it may encounter regions that were already unmapped.
Addition of UFFD_EVENT_UNMAP allows the uffd monitor to track precisely
changes in the virtual memory layout.

Since there might be different uffd contexts for the affected VMAs, we
first should create a temporary representation for the unmap event for
each uffd context and then notify them one by one to the appropriate
userfault file descriptors.

The event notification occurs after the mmap_sem has been released.

[arnd@arndb.de: fix nommu build]
  Link: http://lkml.kernel.org/r/20170203165141.3665284-1-arnd@arndb.de
[mhocko@suse.com: fix nommu build]
  Link: http://lkml.kernel.org/r/20170202091503.GA22823@dhcp22.suse.cz
Link: http://lkml.kernel.org/r/1485542673-24387-3-git-send-email-rppt@linux.vnet.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Pavel Emelyanov <xemul@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/mips/kernel/vdso.c          |  2 +-
 arch/tile/mm/elf.c               |  2 +-
 arch/x86/entry/vdso/vma.c        |  2 +-
 arch/x86/mm/mpx.c                |  4 +--
 fs/aio.c                         |  2 +-
 fs/proc/vmcore.c                 |  4 +--
 fs/userfaultfd.c                 | 65 ++++++++++++++++++++++++++++++++++++++++
 include/linux/mm.h               | 14 +++++----
 include/linux/userfaultfd_k.h    | 18 +++++++++++
 include/uapi/linux/userfaultfd.h |  3 ++
 ipc/shm.c                        |  8 ++---
 mm/mmap.c                        | 46 ++++++++++++++++++----------
 mm/mremap.c                      | 23 ++++++++------
 mm/nommu.c                       |  7 +++--
 mm/util.c                        |  5 +++-
 15 files changed, 160 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c
index f9dbfb14af33..093517e85a6c 100644
--- a/arch/mips/kernel/vdso.c
+++ b/arch/mips/kernel/vdso.c
@@ -111,7 +111,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 	base = mmap_region(NULL, STACK_TOP, PAGE_SIZE,
 			   VM_READ|VM_WRITE|VM_EXEC|
 			   VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
-			   0);
+			   0, NULL);
 	if (IS_ERR_VALUE(base)) {
 		ret = base;
 		goto out;
diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c
index 6225cc998db1..889901824400 100644
--- a/arch/tile/mm/elf.c
+++ b/arch/tile/mm/elf.c
@@ -143,7 +143,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm,
 		unsigned long addr = MEM_USER_INTRPT;
 		addr = mmap_region(NULL, addr, INTRPT_SIZE,
 				   VM_READ|VM_EXEC|
-				   VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0);
+				   VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0, NULL);
 		if (addr > (unsigned long) -PAGE_SIZE)
 			retval = (int) addr;
 	}
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 10820f6cefbf..572cee3fccff 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -186,7 +186,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr)
 
 	if (IS_ERR(vma)) {
 		ret = PTR_ERR(vma);
-		do_munmap(mm, text_start, image->size);
+		do_munmap(mm, text_start, image->size, NULL);
 	} else {
 		current->mm->context.vdso = (void __user *)text_start;
 		current->mm->context.vdso_image = image;
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index aad4ac386f98..c98079684bdb 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -51,7 +51,7 @@ static unsigned long mpx_mmap(unsigned long len)
 
 	down_write(&mm->mmap_sem);
 	addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE,
-			MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate);
+		       MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate, NULL);
 	up_write(&mm->mmap_sem);
 	if (populate)
 		mm_populate(addr, populate);
@@ -893,7 +893,7 @@ static int unmap_entire_bt(struct mm_struct *mm,
 	 * avoid recursion, do_munmap() will check whether it comes
 	 * from one bounds table through VM_MPX flag.
 	 */
-	return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm));
+	return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm), NULL);
 }
 
 static int try_unmap_single_bt(struct mm_struct *mm,
diff --git a/fs/aio.c b/fs/aio.c
index 873b4ca82ccb..7e2ab9c8e39c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -512,7 +512,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 
 	ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
 				       PROT_READ | PROT_WRITE,
-				       MAP_SHARED, 0, &unused);
+				       MAP_SHARED, 0, &unused, NULL);
 	up_write(&mm->mmap_sem);
 	if (IS_ERR((void *)ctx->mmap_base)) {
 		ctx->mmap_size = 0;
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index f52d8e857ff7..885d445afa0d 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -388,7 +388,7 @@ static int remap_oldmem_pfn_checked(struct vm_area_struct *vma,
 	}
 	return 0;
 fail:
-	do_munmap(vma->vm_mm, from, len);
+	do_munmap(vma->vm_mm, from, len, NULL);
 	return -EAGAIN;
 }
 
@@ -481,7 +481,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
 
 	return 0;
 fail:
-	do_munmap(vma->vm_mm, vma->vm_start, len);
+	do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
 	return -EAGAIN;
 }
 #else
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 8fe601b4875e..4c78458ea78d 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -71,6 +71,13 @@ struct userfaultfd_fork_ctx {
 	struct list_head list;
 };
 
+struct userfaultfd_unmap_ctx {
+	struct userfaultfd_ctx *ctx;
+	unsigned long start;
+	unsigned long end;
+	struct list_head list;
+};
+
 struct userfaultfd_wait_queue {
 	struct uffd_msg msg;
 	wait_queue_t wq;
@@ -709,6 +716,64 @@ void userfaultfd_remove(struct vm_area_struct *vma,
 	down_read(&mm->mmap_sem);
 }
 
+static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
+			  unsigned long start, unsigned long end)
+{
+	struct userfaultfd_unmap_ctx *unmap_ctx;
+
+	list_for_each_entry(unmap_ctx, unmaps, list)
+		if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
+		    unmap_ctx->end == end)
+			return true;
+
+	return false;
+}
+
+int userfaultfd_unmap_prep(struct vm_area_struct *vma,
+			   unsigned long start, unsigned long end,
+			   struct list_head *unmaps)
+{
+	for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
+		struct userfaultfd_unmap_ctx *unmap_ctx;
+		struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
+
+		if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
+		    has_unmap_ctx(ctx, unmaps, start, end))
+			continue;
+
+		unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL);
+		if (!unmap_ctx)
+			return -ENOMEM;
+
+		userfaultfd_ctx_get(ctx);
+		unmap_ctx->ctx = ctx;
+		unmap_ctx->start = start;
+		unmap_ctx->end = end;
+		list_add_tail(&unmap_ctx->list, unmaps);
+	}
+
+	return 0;
+}
+
+void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
+{
+	struct userfaultfd_unmap_ctx *ctx, *n;
+	struct userfaultfd_wait_queue ewq;
+
+	list_for_each_entry_safe(ctx, n, uf, list) {
+		msg_init(&ewq.msg);
+
+		ewq.msg.event = UFFD_EVENT_UNMAP;
+		ewq.msg.arg.remove.start = ctx->start;
+		ewq.msg.arg.remove.end = ctx->end;
+
+		userfaultfd_event_wait_completion(ctx->ctx, &ewq);
+
+		list_del(&ctx->list);
+		kfree(ctx);
+	}
+}
+
 static int userfaultfd_release(struct inode *inode, struct file *file)
 {
 	struct userfaultfd_ctx *ctx = file->private_data;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c65aa43b5712..c6fcba1d1ae5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2090,18 +2090,22 @@ extern int install_special_mapping(struct mm_struct *mm,
 extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
 
 extern unsigned long mmap_region(struct file *file, unsigned long addr,
-	unsigned long len, vm_flags_t vm_flags, unsigned long pgoff);
+	unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
+	struct list_head *uf);
 extern unsigned long do_mmap(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot, unsigned long flags,
-	vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate);
-extern int do_munmap(struct mm_struct *, unsigned long, size_t);
+	vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate,
+	struct list_head *uf);
+extern int do_munmap(struct mm_struct *, unsigned long, size_t,
+		     struct list_head *uf);
 
 static inline unsigned long
 do_mmap_pgoff(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot, unsigned long flags,
-	unsigned long pgoff, unsigned long *populate)
+	unsigned long pgoff, unsigned long *populate,
+	struct list_head *uf)
 {
-	return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate);
+	return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate, uf);
 }
 
 #ifdef CONFIG_MMU
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 2521542f6c07..a40be5d0661b 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -66,6 +66,12 @@ extern void userfaultfd_remove(struct vm_area_struct *vma,
 			       unsigned long start,
 			       unsigned long end);
 
+extern int userfaultfd_unmap_prep(struct vm_area_struct *vma,
+				  unsigned long start, unsigned long end,
+				  struct list_head *uf);
+extern void userfaultfd_unmap_complete(struct mm_struct *mm,
+				       struct list_head *uf);
+
 #else /* CONFIG_USERFAULTFD */
 
 /* mm helpers */
@@ -118,6 +124,18 @@ static inline void userfaultfd_remove(struct vm_area_struct *vma,
 				      unsigned long end)
 {
 }
+
+static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma,
+					 unsigned long start, unsigned long end,
+					 struct list_head *uf)
+{
+	return 0;
+}
+
+static inline void userfaultfd_unmap_complete(struct mm_struct *mm,
+					      struct list_head *uf)
+{
+}
 #endif /* CONFIG_USERFAULTFD */
 
 #endif /* _LINUX_USERFAULTFD_K_H */
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index b742c40c2880..3b059530dac9 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -21,6 +21,7 @@
 #define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK |		\
 			   UFFD_FEATURE_EVENT_REMAP |		\
 			   UFFD_FEATURE_EVENT_REMOVE |	\
+			   UFFD_FEATURE_EVENT_UNMAP |		\
 			   UFFD_FEATURE_MISSING_HUGETLBFS |	\
 			   UFFD_FEATURE_MISSING_SHMEM)
 #define UFFD_API_IOCTLS				\
@@ -110,6 +111,7 @@ struct uffd_msg {
 #define UFFD_EVENT_FORK		0x13
 #define UFFD_EVENT_REMAP	0x14
 #define UFFD_EVENT_REMOVE	0x15
+#define UFFD_EVENT_UNMAP	0x16
 
 /* flags for UFFD_EVENT_PAGEFAULT */
 #define UFFD_PAGEFAULT_FLAG_WRITE	(1<<0)	/* If this was a write fault */
@@ -158,6 +160,7 @@ struct uffdio_api {
 #define UFFD_FEATURE_EVENT_REMOVE		(1<<3)
 #define UFFD_FEATURE_MISSING_HUGETLBFS		(1<<4)
 #define UFFD_FEATURE_MISSING_SHMEM		(1<<5)
+#define UFFD_FEATURE_EVENT_UNMAP		(1<<6)
 	__u64 features;
 
 	__u64 ioctls;
diff --git a/ipc/shm.c b/ipc/shm.c
index 7f6537b84ef5..d7805acb44fd 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -1222,7 +1222,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
 			goto invalid;
 	}
 
-	addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate);
+	addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate, NULL);
 	*raddr = addr;
 	err = 0;
 	if (IS_ERR_VALUE(addr))
@@ -1329,7 +1329,7 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
 			 */
 			file = vma->vm_file;
 			size = i_size_read(file_inode(vma->vm_file));
-			do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
+			do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
 			/*
 			 * We discovered the size of the shm segment, so
 			 * break out of here and fall through to the next
@@ -1356,7 +1356,7 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
 		if ((vma->vm_ops == &shm_vm_ops) &&
 		    ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) &&
 		    (vma->vm_file == file))
-			do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
+			do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
 		vma = next;
 	}
 
@@ -1365,7 +1365,7 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
 	 * given
 	 */
 	if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) {
-		do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
+		do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
 		retval = 0;
 	}
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 13d16a2b7623..1cec28d20583 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -176,7 +176,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 	return next;
 }
 
-static int do_brk(unsigned long addr, unsigned long len);
+static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf);
 
 SYSCALL_DEFINE1(brk, unsigned long, brk)
 {
@@ -185,6 +185,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 	struct mm_struct *mm = current->mm;
 	unsigned long min_brk;
 	bool populate;
+	LIST_HEAD(uf);
 
 	if (down_write_killable(&mm->mmap_sem))
 		return -EINTR;
@@ -222,7 +223,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 
 	/* Always allow shrinking brk. */
 	if (brk <= mm->brk) {
-		if (!do_munmap(mm, newbrk, oldbrk-newbrk))
+		if (!do_munmap(mm, newbrk, oldbrk-newbrk, &uf))
 			goto set_brk;
 		goto out;
 	}
@@ -232,13 +233,14 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 		goto out;
 
 	/* Ok, looks good - let it rip. */
-	if (do_brk(oldbrk, newbrk-oldbrk) < 0)
+	if (do_brk(oldbrk, newbrk-oldbrk, &uf) < 0)
 		goto out;
 
 set_brk:
 	mm->brk = brk;
 	populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
 	up_write(&mm->mmap_sem);
+	userfaultfd_unmap_complete(mm, &uf);
 	if (populate)
 		mm_populate(oldbrk, newbrk - oldbrk);
 	return brk;
@@ -1304,7 +1306,8 @@ static inline int mlock_future_check(struct mm_struct *mm,
 unsigned long do_mmap(struct file *file, unsigned long addr,
 			unsigned long len, unsigned long prot,
 			unsigned long flags, vm_flags_t vm_flags,
-			unsigned long pgoff, unsigned long *populate)
+			unsigned long pgoff, unsigned long *populate,
+			struct list_head *uf)
 {
 	struct mm_struct *mm = current->mm;
 	int pkey = 0;
@@ -1447,7 +1450,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 			vm_flags |= VM_NORESERVE;
 	}
 
-	addr = mmap_region(file, addr, len, vm_flags, pgoff);
+	addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
 	if (!IS_ERR_VALUE(addr) &&
 	    ((vm_flags & VM_LOCKED) ||
 	     (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
@@ -1583,7 +1586,8 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
 }
 
 unsigned long mmap_region(struct file *file, unsigned long addr,
-		unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
+		unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
+		struct list_head *uf)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev;
@@ -1609,7 +1613,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 	/* Clear old maps */
 	while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
 			      &rb_parent)) {
-		if (do_munmap(mm, addr, len))
+		if (do_munmap(mm, addr, len, uf))
 			return -ENOMEM;
 	}
 
@@ -2579,7 +2583,8 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
  * work.  This now handles partial unmappings.
  * Jeremy Fitzhardinge <jeremy@goop.org>
  */
-int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
+int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
+	      struct list_head *uf)
 {
 	unsigned long end;
 	struct vm_area_struct *vma, *prev, *last;
@@ -2603,6 +2608,13 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
 	if (vma->vm_start >= end)
 		return 0;
 
+	if (uf) {
+		int error = userfaultfd_unmap_prep(vma, start, end, uf);
+
+		if (error)
+			return error;
+	}
+
 	/*
 	 * If we need to split any vma, do it now to save pain later.
 	 *
@@ -2668,12 +2680,14 @@ int vm_munmap(unsigned long start, size_t len)
 {
 	int ret;
 	struct mm_struct *mm = current->mm;
+	LIST_HEAD(uf);
 
 	if (down_write_killable(&mm->mmap_sem))
 		return -EINTR;
 
-	ret = do_munmap(mm, start, len);
+	ret = do_munmap(mm, start, len, &uf);
 	up_write(&mm->mmap_sem);
+	userfaultfd_unmap_complete(mm, &uf);
 	return ret;
 }
 EXPORT_SYMBOL(vm_munmap);
@@ -2773,7 +2787,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 
 	file = get_file(vma->vm_file);
 	ret = do_mmap_pgoff(vma->vm_file, start, size,
-			prot, flags, pgoff, &populate);
+			prot, flags, pgoff, &populate, NULL);
 	fput(file);
 out:
 	up_write(&mm->mmap_sem);
@@ -2799,7 +2813,7 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
  *  anonymous maps.  eventually we may be able to do some
  *  brk-specific accounting here.
  */
-static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
+static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags, struct list_head *uf)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev;
@@ -2838,7 +2852,7 @@ static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long
 	 */
 	while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
 			      &rb_parent)) {
-		if (do_munmap(mm, addr, len))
+		if (do_munmap(mm, addr, len, uf))
 			return -ENOMEM;
 	}
 
@@ -2885,9 +2899,9 @@ out:
 	return 0;
 }
 
-static int do_brk(unsigned long addr, unsigned long len)
+static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf)
 {
-	return do_brk_flags(addr, len, 0);
+	return do_brk_flags(addr, len, 0, uf);
 }
 
 int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags)
@@ -2895,13 +2909,15 @@ int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags)
 	struct mm_struct *mm = current->mm;
 	int ret;
 	bool populate;
+	LIST_HEAD(uf);
 
 	if (down_write_killable(&mm->mmap_sem))
 		return -EINTR;
 
-	ret = do_brk_flags(addr, len, flags);
+	ret = do_brk_flags(addr, len, flags, &uf);
 	populate = ((mm->def_flags & VM_LOCKED) != 0);
 	up_write(&mm->mmap_sem);
+	userfaultfd_unmap_complete(mm, &uf);
 	if (populate && !ret)
 		mm_populate(addr, len);
 	return ret;
diff --git a/mm/mremap.c b/mm/mremap.c
index 8779928d6a70..8233b0105c82 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -252,7 +252,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 static unsigned long move_vma(struct vm_area_struct *vma,
 		unsigned long old_addr, unsigned long old_len,
 		unsigned long new_len, unsigned long new_addr,
-		bool *locked, struct vm_userfaultfd_ctx *uf)
+		bool *locked, struct vm_userfaultfd_ctx *uf,
+		struct list_head *uf_unmap)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct vm_area_struct *new_vma;
@@ -341,7 +342,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	if (unlikely(vma->vm_flags & VM_PFNMAP))
 		untrack_pfn_moved(vma);
 
-	if (do_munmap(mm, old_addr, old_len) < 0) {
+	if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
 		/* OOM: unable to split vma, just get accounts right */
 		vm_unacct_memory(excess >> PAGE_SHIFT);
 		excess = 0;
@@ -417,7 +418,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
 
 static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 		unsigned long new_addr, unsigned long new_len, bool *locked,
-		struct vm_userfaultfd_ctx *uf)
+		struct vm_userfaultfd_ctx *uf,
+		struct list_head *uf_unmap)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
@@ -435,12 +437,12 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 	if (addr + old_len > new_addr && new_addr + new_len > addr)
 		goto out;
 
-	ret = do_munmap(mm, new_addr, new_len);
+	ret = do_munmap(mm, new_addr, new_len, NULL);
 	if (ret)
 		goto out;
 
 	if (old_len >= new_len) {
-		ret = do_munmap(mm, addr+new_len, old_len - new_len);
+		ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
 		if (ret && old_len != new_len)
 			goto out;
 		old_len = new_len;
@@ -462,7 +464,8 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 	if (offset_in_page(ret))
 		goto out1;
 
-	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf);
+	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf,
+		       uf_unmap);
 	if (!(offset_in_page(ret)))
 		goto out;
 out1:
@@ -502,6 +505,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	unsigned long charged = 0;
 	bool locked = false;
 	struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
+	LIST_HEAD(uf_unmap);
 
 	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
 		return ret;
@@ -528,7 +532,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 
 	if (flags & MREMAP_FIXED) {
 		ret = mremap_to(addr, old_len, new_addr, new_len,
-				&locked, &uf);
+				&locked, &uf, &uf_unmap);
 		goto out;
 	}
 
@@ -538,7 +542,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	 * do_munmap does all the needed commit accounting
 	 */
 	if (old_len >= new_len) {
-		ret = do_munmap(mm, addr+new_len, old_len - new_len);
+		ret = do_munmap(mm, addr+new_len, old_len - new_len, &uf_unmap);
 		if (ret && old_len != new_len)
 			goto out;
 		ret = addr;
@@ -598,7 +602,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 		}
 
 		ret = move_vma(vma, addr, old_len, new_len, new_addr,
-			       &locked, &uf);
+			       &locked, &uf, &uf_unmap);
 	}
 out:
 	if (offset_in_page(ret)) {
@@ -609,5 +613,6 @@ out:
 	if (locked && new_len > old_len)
 		mm_populate(new_addr + old_len, new_len - old_len);
 	mremap_userfaultfd_complete(&uf, addr, new_addr, old_len);
+	userfaultfd_unmap_complete(mm, &uf_unmap);
 	return ret;
 }
diff --git a/mm/nommu.c b/mm/nommu.c
index 215c62296028..fe9f4fa4a7a7 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1205,7 +1205,8 @@ unsigned long do_mmap(struct file *file,
 			unsigned long flags,
 			vm_flags_t vm_flags,
 			unsigned long pgoff,
-			unsigned long *populate)
+			unsigned long *populate,
+			struct list_head *uf)
 {
 	struct vm_area_struct *vma;
 	struct vm_region *region;
@@ -1577,7 +1578,7 @@ static int shrink_vma(struct mm_struct *mm,
  * - under NOMMU conditions the chunk to be unmapped must be backed by a single
  *   VMA, though it need not cover the whole VMA
  */
-int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
+int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf)
 {
 	struct vm_area_struct *vma;
 	unsigned long end;
@@ -1643,7 +1644,7 @@ int vm_munmap(unsigned long addr, size_t len)
 	int ret;
 
 	down_write(&mm->mmap_sem);
-	ret = do_munmap(mm, addr, len);
+	ret = do_munmap(mm, addr, len, NULL);
 	up_write(&mm->mmap_sem);
 	return ret;
 }
diff --git a/mm/util.c b/mm/util.c
index 3cb2164f4099..b8f538863b5a 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -11,6 +11,7 @@
 #include <linux/mman.h>
 #include <linux/hugetlb.h>
 #include <linux/vmalloc.h>
+#include <linux/userfaultfd_k.h>
 
 #include <asm/sections.h>
 #include <linux/uaccess.h>
@@ -297,14 +298,16 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 	unsigned long ret;
 	struct mm_struct *mm = current->mm;
 	unsigned long populate;
+	LIST_HEAD(uf);
 
 	ret = security_mmap_file(file, prot, flag);
 	if (!ret) {
 		if (down_write_killable(&mm->mmap_sem))
 			return -EINTR;
 		ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
-				    &populate);
+				    &populate, &uf);
 		up_write(&mm->mmap_sem);
+		userfaultfd_unmap_complete(mm, &uf);
 		if (populate)
 			mm_populate(ret, populate);
 	}
-- 
cgit v1.2.3


From ca49ca7114553587736fe78319e22f073b631380 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.vnet.ibm.com>
Date: Fri, 24 Feb 2017 14:58:25 -0800
Subject: userfaultfd: non-cooperative: add event for exit() notification

Allow userfaultfd monitor track termination of the processes that have
memory backed by the uffd.

[rppt@linux.vnet.ibm.com: add comment]
  Link: http://lkml.kernel.org/r/20170202135448.GB19804@rapoport-lnxLink: http://lkml.kernel.org/r/1485542673-24387-4-git-send-email-rppt@linux.vnet.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Pavel Emelyanov <xemul@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/userfaultfd.c                 | 28 ++++++++++++++++++++++++++++
 include/linux/userfaultfd_k.h    |  7 +++++++
 include/uapi/linux/userfaultfd.h |  5 ++++-
 kernel/exit.c                    |  2 ++
 4 files changed, 41 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 4c78458ea78d..b676575f2268 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -774,6 +774,34 @@ void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
 	}
 }
 
+void userfaultfd_exit(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma = mm->mmap;
+
+	/*
+	 * We can do the vma walk without locking because the caller
+	 * (exit_mm) knows it now has exclusive access
+	 */
+	while (vma) {
+		struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
+
+		if (ctx && (ctx->features & UFFD_FEATURE_EVENT_EXIT)) {
+			struct userfaultfd_wait_queue ewq;
+
+			userfaultfd_ctx_get(ctx);
+
+			msg_init(&ewq.msg);
+			ewq.msg.event = UFFD_EVENT_EXIT;
+
+			userfaultfd_event_wait_completion(ctx, &ewq);
+
+			ctx->features &= ~UFFD_FEATURE_EVENT_EXIT;
+		}
+
+		vma = vma->vm_next;
+	}
+}
+
 static int userfaultfd_release(struct inode *inode, struct file *file)
 {
 	struct userfaultfd_ctx *ctx = file->private_data;
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index a40be5d0661b..0468548acebf 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -72,6 +72,8 @@ extern int userfaultfd_unmap_prep(struct vm_area_struct *vma,
 extern void userfaultfd_unmap_complete(struct mm_struct *mm,
 				       struct list_head *uf);
 
+extern void userfaultfd_exit(struct mm_struct *mm);
+
 #else /* CONFIG_USERFAULTFD */
 
 /* mm helpers */
@@ -136,6 +138,11 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm,
 					      struct list_head *uf)
 {
 }
+
+static inline void userfaultfd_exit(struct mm_struct *mm)
+{
+}
+
 #endif /* CONFIG_USERFAULTFD */
 
 #endif /* _LINUX_USERFAULTFD_K_H */
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 3b059530dac9..c055947c5c98 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -18,7 +18,8 @@
  * means the userland is reading).
  */
 #define UFFD_API ((__u64)0xAA)
-#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK |		\
+#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_EXIT |		\
+			   UFFD_FEATURE_EVENT_FORK |		\
 			   UFFD_FEATURE_EVENT_REMAP |		\
 			   UFFD_FEATURE_EVENT_REMOVE |	\
 			   UFFD_FEATURE_EVENT_UNMAP |		\
@@ -112,6 +113,7 @@ struct uffd_msg {
 #define UFFD_EVENT_REMAP	0x14
 #define UFFD_EVENT_REMOVE	0x15
 #define UFFD_EVENT_UNMAP	0x16
+#define UFFD_EVENT_EXIT		0x17
 
 /* flags for UFFD_EVENT_PAGEFAULT */
 #define UFFD_PAGEFAULT_FLAG_WRITE	(1<<0)	/* If this was a write fault */
@@ -161,6 +163,7 @@ struct uffdio_api {
 #define UFFD_FEATURE_MISSING_HUGETLBFS		(1<<4)
 #define UFFD_FEATURE_MISSING_SHMEM		(1<<5)
 #define UFFD_FEATURE_EVENT_UNMAP		(1<<6)
+#define UFFD_FEATURE_EVENT_EXIT			(1<<7)
 	__u64 features;
 
 	__u64 ioctls;
diff --git a/kernel/exit.c b/kernel/exit.c
index 9960accbf2ab..90b09ca35c84 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -45,6 +45,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/tracehook.h>
 #include <linux/fs_struct.h>
+#include <linux/userfaultfd_k.h>
 #include <linux/init_task.h>
 #include <linux/perf_event.h>
 #include <trace/events/sched.h>
@@ -547,6 +548,7 @@ static void exit_mm(void)
 	enter_lazy_tlb(mm, current);
 	task_unlock(current);
 	mm_update_next_owner(mm);
+	userfaultfd_exit(mm);
 	mmput(mm);
 	if (test_thread_flag(TIF_MEMDIE))
 		exit_oom_victim();
-- 
cgit v1.2.3


From ca96b625341027f611c3e61351a70311077ebcf5 Mon Sep 17 00:00:00 2001
From: Lucas Stach <l.stach@pengutronix.de>
Date: Fri, 24 Feb 2017 14:58:37 -0800
Subject: mm: alloc_contig_range: allow to specify GFP mask

Currently alloc_contig_range assumes that the compaction should be done
with the default GFP_KERNEL flags.  This is probably right for all
current uses of this interface, but may change as CMA is used in more
use-cases (including being the default DMA memory allocator on some
platforms).

Change the function prototype, to allow for passing through the GFP mask
set by upper layers.

Also respect global restrictions by applying memalloc_noio_flags to the
passed in flags.

Link: http://lkml.kernel.org/r/20170127172328.18574-1-l.stach@pengutronix.de
Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Radim Krcmar <rkrcmar@redhat.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Alexander Graf <agraf@suse.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 2 +-
 mm/cma.c            | 3 ++-
 mm/hugetlb.c        | 3 ++-
 mm/page_alloc.c     | 5 +++--
 4 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 0fe0b6295ab5..db373b9d3223 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -541,7 +541,7 @@ static inline bool pm_suspended_storage(void)
 #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
 /* The below functions must be run on a range from a single zone. */
 extern int alloc_contig_range(unsigned long start, unsigned long end,
-			      unsigned migratetype);
+			      unsigned migratetype, gfp_t gfp_mask);
 extern void free_contig_range(unsigned long pfn, unsigned nr_pages);
 #endif
 
diff --git a/mm/cma.c b/mm/cma.c
index 94b3460cd608..c6aed23ca6df 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -402,7 +402,8 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align)
 
 		pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
 		mutex_lock(&cma_mutex);
-		ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA);
+		ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA,
+					 GFP_KERNEL);
 		mutex_unlock(&cma_mutex);
 		if (ret == 0) {
 			page = pfn_to_page(pfn);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 167fd0722c15..2e0e8159ce8e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1052,7 +1052,8 @@ static int __alloc_gigantic_page(unsigned long start_pfn,
 				unsigned long nr_pages)
 {
 	unsigned long end_pfn = start_pfn + nr_pages;
-	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
+	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
+				  GFP_KERNEL);
 }
 
 static bool pfn_range_valid_gigantic(struct zone *z,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2d34cdb70f1d..8a0f33624335 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7399,6 +7399,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
  *			#MIGRATE_MOVABLE or #MIGRATE_CMA).  All pageblocks
  *			in range must have the same migratetype and it must
  *			be either of the two.
+ * @gfp_mask:	GFP mask to use during compaction
  *
  * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
  * aligned, however it's the caller's responsibility to guarantee that
@@ -7412,7 +7413,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
  * need to be freed with free_contig_range().
  */
 int alloc_contig_range(unsigned long start, unsigned long end,
-		       unsigned migratetype)
+		       unsigned migratetype, gfp_t gfp_mask)
 {
 	unsigned long outer_start, outer_end;
 	unsigned int order;
@@ -7424,7 +7425,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 		.zone = page_zone(pfn_to_page(start)),
 		.mode = MIGRATE_SYNC,
 		.ignore_skip_hint = true,
-		.gfp_mask = GFP_KERNEL,
+		.gfp_mask = memalloc_noio_flags(gfp_mask),
 	};
 	INIT_LIST_HEAD(&cc.migratepages);
 
-- 
cgit v1.2.3


From e2f466e32f56c8b3ee0d1a40cc39e1c779dfd598 Mon Sep 17 00:00:00 2001
From: Lucas Stach <l.stach@pengutronix.de>
Date: Fri, 24 Feb 2017 14:58:41 -0800
Subject: mm: cma_alloc: allow to specify GFP mask

Most users of this interface just want to use it with the default
GFP_KERNEL flags, but for cases where DMA memory is allocated it may be
called from a different context.

No functional change yet, just passing through the flag to the
underlying alloc_contig_range function.

Link: http://lkml.kernel.org/r/20170127172328.18574-2-l.stach@pengutronix.de
Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Radim Krcmar <rkrcmar@redhat.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Alexander Graf <agraf@suse.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/kvm/book3s_hv_builtin.c | 3 ++-
 drivers/base/dma-contiguous.c        | 2 +-
 include/linux/cma.h                  | 3 ++-
 mm/cma.c                             | 5 +++--
 mm/cma_debug.c                       | 2 +-
 5 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index c42a7e63b39e..4d6c64b3041c 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -56,7 +56,8 @@ struct page *kvm_alloc_hpt_cma(unsigned long nr_pages)
 {
 	VM_BUG_ON(order_base_2(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
 
-	return cma_alloc(kvm_cma, nr_pages, order_base_2(HPT_ALIGN_PAGES));
+	return cma_alloc(kvm_cma, nr_pages, order_base_2(HPT_ALIGN_PAGES),
+			 GFP_KERNEL);
 }
 EXPORT_SYMBOL_GPL(kvm_alloc_hpt_cma);
 
diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c
index e167a1e1bccb..d1a9cbabc627 100644
--- a/drivers/base/dma-contiguous.c
+++ b/drivers/base/dma-contiguous.c
@@ -193,7 +193,7 @@ struct page *dma_alloc_from_contiguous(struct device *dev, size_t count,
 	if (align > CONFIG_CMA_ALIGNMENT)
 		align = CONFIG_CMA_ALIGNMENT;
 
-	return cma_alloc(dev_get_cma_area(dev), count, align);
+	return cma_alloc(dev_get_cma_area(dev), count, align, GFP_KERNEL);
 }
 
 /**
diff --git a/include/linux/cma.h b/include/linux/cma.h
index 6f0a91b37f68..03f32d0bd1d8 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -29,6 +29,7 @@ extern int __init cma_declare_contiguous(phys_addr_t base,
 extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
 					unsigned int order_per_bit,
 					struct cma **res_cma);
-extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align);
+extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
+			      gfp_t gfp_mask);
 extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count);
 #endif
diff --git a/mm/cma.c b/mm/cma.c
index c6aed23ca6df..2906ae5a83ff 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -357,7 +357,8 @@ err:
  * This function allocates part of contiguous memory on specific
  * contiguous memory area.
  */
-struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align)
+struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
+		       gfp_t gfp_mask)
 {
 	unsigned long mask, offset;
 	unsigned long pfn = -1;
@@ -403,7 +404,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align)
 		pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
 		mutex_lock(&cma_mutex);
 		ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA,
-					 GFP_KERNEL);
+					 gfp_mask);
 		mutex_unlock(&cma_mutex);
 		if (ret == 0) {
 			page = pfn_to_page(pfn);
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index f8e4b60db167..ffc0c3d0ae64 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -138,7 +138,7 @@ static int cma_alloc_mem(struct cma *cma, int count)
 	if (!mem)
 		return -ENOMEM;
 
-	p = cma_alloc(cma, count, 0);
+	p = cma_alloc(cma, count, 0, GFP_KERNEL);
 	if (!p) {
 		kfree(mem);
 		return -ENOMEM;
-- 
cgit v1.2.3


From 712c604dcdf8186295e2af694adf52c6842ad100 Mon Sep 17 00:00:00 2001
From: Lucas Stach <l.stach@pengutronix.de>
Date: Fri, 24 Feb 2017 14:58:44 -0800
Subject: mm: wire up GFP flag passing in dma_alloc_from_contiguous

The callers of the DMA alloc functions already provide the proper
context GFP flags.  Make sure to pass them through to the CMA allocator,
to make the CMA compaction context aware.

Link: http://lkml.kernel.org/r/20170127172328.18574-3-l.stach@pengutronix.de
Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Radim Krcmar <rkrcmar@redhat.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Alexander Graf <agraf@suse.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/mm/dma-mapping.c      | 16 +++++++++-------
 arch/arm64/mm/dma-mapping.c    |  4 ++--
 arch/mips/mm/dma-default.c     |  4 ++--
 arch/x86/kernel/pci-dma.c      |  3 ++-
 arch/xtensa/kernel/pci-dma.c   |  3 ++-
 drivers/base/dma-contiguous.c  |  5 +++--
 drivers/iommu/amd_iommu.c      |  2 +-
 drivers/iommu/intel-iommu.c    |  2 +-
 include/linux/dma-contiguous.h |  4 ++--
 9 files changed, 24 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 82d3e79ec82b..6ffdf17e0d5c 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -349,7 +349,7 @@ static void __dma_free_buffer(struct page *page, size_t size)
 static void *__alloc_from_contiguous(struct device *dev, size_t size,
 				     pgprot_t prot, struct page **ret_page,
 				     const void *caller, bool want_vaddr,
-				     int coherent_flag);
+				     int coherent_flag, gfp_t gfp);
 
 static void *__alloc_remap_buffer(struct device *dev, size_t size, gfp_t gfp,
 				 pgprot_t prot, struct page **ret_page,
@@ -420,7 +420,8 @@ static int __init atomic_pool_init(void)
 	 */
 	if (dev_get_cma_area(NULL))
 		ptr = __alloc_from_contiguous(NULL, atomic_pool_size, prot,
-				      &page, atomic_pool_init, true, NORMAL);
+				      &page, atomic_pool_init, true, NORMAL,
+				      GFP_KERNEL);
 	else
 		ptr = __alloc_remap_buffer(NULL, atomic_pool_size, gfp, prot,
 					   &page, atomic_pool_init, true);
@@ -594,14 +595,14 @@ static int __free_from_pool(void *start, size_t size)
 static void *__alloc_from_contiguous(struct device *dev, size_t size,
 				     pgprot_t prot, struct page **ret_page,
 				     const void *caller, bool want_vaddr,
-				     int coherent_flag)
+				     int coherent_flag, gfp_t gfp)
 {
 	unsigned long order = get_order(size);
 	size_t count = size >> PAGE_SHIFT;
 	struct page *page;
 	void *ptr = NULL;
 
-	page = dma_alloc_from_contiguous(dev, count, order);
+	page = dma_alloc_from_contiguous(dev, count, order, gfp);
 	if (!page)
 		return NULL;
 
@@ -655,7 +656,7 @@ static inline pgprot_t __get_dma_pgprot(unsigned long attrs, pgprot_t prot)
 #define __get_dma_pgprot(attrs, prot)				__pgprot(0)
 #define __alloc_remap_buffer(dev, size, gfp, prot, ret, c, wv)	NULL
 #define __alloc_from_pool(size, ret_page)			NULL
-#define __alloc_from_contiguous(dev, size, prot, ret, c, wv, coherent_flag)	NULL
+#define __alloc_from_contiguous(dev, size, prot, ret, c, wv, coherent_flag, gfp)	NULL
 #define __free_from_pool(cpu_addr, size)			do { } while (0)
 #define __free_from_contiguous(dev, page, cpu_addr, size, wv)	do { } while (0)
 #define __dma_free_remap(cpu_addr, size)			do { } while (0)
@@ -697,7 +698,8 @@ static void *cma_allocator_alloc(struct arm_dma_alloc_args *args,
 {
 	return __alloc_from_contiguous(args->dev, args->size, args->prot,
 				       ret_page, args->caller,
-				       args->want_vaddr, args->coherent_flag);
+				       args->want_vaddr, args->coherent_flag,
+				       args->gfp);
 }
 
 static void cma_allocator_free(struct arm_dma_free_args *args)
@@ -1312,7 +1314,7 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
 		unsigned long order = get_order(size);
 		struct page *page;
 
-		page = dma_alloc_from_contiguous(dev, count, order);
+		page = dma_alloc_from_contiguous(dev, count, order, gfp);
 		if (!page)
 			goto error;
 
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 351f7595cb3e..aff1d0afeb1e 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -107,7 +107,7 @@ static void *__dma_alloc_coherent(struct device *dev, size_t size,
 		void *addr;
 
 		page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
-							get_order(size));
+						 get_order(size), flags);
 		if (!page)
 			return NULL;
 
@@ -390,7 +390,7 @@ static int __init atomic_pool_init(void)
 
 	if (dev_get_cma_area(NULL))
 		page = dma_alloc_from_contiguous(NULL, nr_pages,
-							pool_size_order);
+						 pool_size_order, GFP_KERNEL);
 	else
 		page = alloc_pages(GFP_DMA, pool_size_order);
 
diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c
index a39c36af97ad..1895a692efd4 100644
--- a/arch/mips/mm/dma-default.c
+++ b/arch/mips/mm/dma-default.c
@@ -148,8 +148,8 @@ static void *mips_dma_alloc_coherent(struct device *dev, size_t size,
 	gfp = massage_gfp_flags(dev, gfp);
 
 	if (IS_ENABLED(CONFIG_DMA_CMA) && gfpflags_allow_blocking(gfp))
-		page = dma_alloc_from_contiguous(dev,
-					count, get_order(size));
+		page = dma_alloc_from_contiguous(dev, count, get_order(size),
+						 gfp);
 	if (!page)
 		page = alloc_pages(gfp, get_order(size));
 
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index d30c37750765..d5c223c9cf11 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -91,7 +91,8 @@ again:
 	page = NULL;
 	/* CMA can be used only in the context which permits sleeping */
 	if (gfpflags_allow_blocking(flag)) {
-		page = dma_alloc_from_contiguous(dev, count, get_order(size));
+		page = dma_alloc_from_contiguous(dev, count, get_order(size),
+						 flag);
 		if (page && page_to_phys(page) + size > dma_mask) {
 			dma_release_from_contiguous(dev, page, count);
 			page = NULL;
diff --git a/arch/xtensa/kernel/pci-dma.c b/arch/xtensa/kernel/pci-dma.c
index 70e362e6038e..34c1f9fa6acc 100644
--- a/arch/xtensa/kernel/pci-dma.c
+++ b/arch/xtensa/kernel/pci-dma.c
@@ -158,7 +158,8 @@ static void *xtensa_dma_alloc(struct device *dev, size_t size,
 		flag |= GFP_DMA;
 
 	if (gfpflags_allow_blocking(flag))
-		page = dma_alloc_from_contiguous(dev, count, get_order(size));
+		page = dma_alloc_from_contiguous(dev, count, get_order(size),
+						 flag);
 
 	if (!page)
 		page = alloc_pages(flag, get_order(size));
diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c
index d1a9cbabc627..b55804cac4c4 100644
--- a/drivers/base/dma-contiguous.c
+++ b/drivers/base/dma-contiguous.c
@@ -181,6 +181,7 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base,
  * @dev:   Pointer to device for which the allocation is performed.
  * @count: Requested number of pages.
  * @align: Requested alignment of pages (in PAGE_SIZE order).
+ * @gfp_mask: GFP flags to use for this allocation.
  *
  * This function allocates memory buffer for specified device. It uses
  * device specific contiguous memory area if available or the default
@@ -188,12 +189,12 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base,
  * function.
  */
 struct page *dma_alloc_from_contiguous(struct device *dev, size_t count,
-				       unsigned int align)
+				       unsigned int align, gfp_t gfp_mask)
 {
 	if (align > CONFIG_CMA_ALIGNMENT)
 		align = CONFIG_CMA_ALIGNMENT;
 
-	return cma_alloc(dev_get_cma_area(dev), count, align, GFP_KERNEL);
+	return cma_alloc(dev_get_cma_area(dev), count, align, gfp_mask);
 }
 
 /**
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 1b5b8c5361c5..09bd3b290bb8 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -2672,7 +2672,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
 			return NULL;
 
 		page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
-						 get_order(size));
+						 get_order(size), flag);
 		if (!page)
 			return NULL;
 	}
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index f5e02f8e7371..a8f7ae0eb7a4 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -3829,7 +3829,7 @@ static void *intel_alloc_coherent(struct device *dev, size_t size,
 	if (gfpflags_allow_blocking(flags)) {
 		unsigned int count = size >> PAGE_SHIFT;
 
-		page = dma_alloc_from_contiguous(dev, count, order);
+		page = dma_alloc_from_contiguous(dev, count, order, flags);
 		if (page && iommu_no_mapping(dev) &&
 		    page_to_phys(page) + size > dev->coherent_dma_mask) {
 			dma_release_from_contiguous(dev, page, count);
diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h
index fec734df1524..b67bf6ac907d 100644
--- a/include/linux/dma-contiguous.h
+++ b/include/linux/dma-contiguous.h
@@ -112,7 +112,7 @@ static inline int dma_declare_contiguous(struct device *dev, phys_addr_t size,
 }
 
 struct page *dma_alloc_from_contiguous(struct device *dev, size_t count,
-				       unsigned int order);
+				       unsigned int order, gfp_t gfp_mask);
 bool dma_release_from_contiguous(struct device *dev, struct page *pages,
 				 int count);
 
@@ -145,7 +145,7 @@ int dma_declare_contiguous(struct device *dev, phys_addr_t size,
 
 static inline
 struct page *dma_alloc_from_contiguous(struct device *dev, size_t count,
-				       unsigned int order)
+				       unsigned int order, gfp_t gfp_mask)
 {
 	return NULL;
 }
-- 
cgit v1.2.3


From def5efe0376501ef7bd6b53ed061512c142e59aa Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Fri, 24 Feb 2017 14:58:47 -0800
Subject: mm, madvise: fail with ENOMEM when splitting vma will hit
 max_map_count

If madvise(2) advice will result in the underlying vma being split and
the number of areas mapped by the process will exceed
/proc/sys/vm/max_map_count as a result, return ENOMEM instead of EAGAIN.

EAGAIN is returned by madvise(2) when a kernel resource, such as slab,
is temporarily unavailable.  It indicates that userspace should retry
the advice in the near future.  This is important for advice such as
MADV_DONTNEED which is often used by malloc implementations to free
memory back to the system: we really do want to free memory back when
madvise(2) returns EAGAIN because slab allocations (for vmas, anon_vmas,
or mempolicies) cannot be allocated.

Encountering /proc/sys/vm/max_map_count is not a temporary failure,
however, so return ENOMEM to indicate this is a more serious issue.  A
followup patch to the man page will specify this behavior.

Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1701241431120.42507@chino.kir.corp.google.com
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Michael Kerrisk <mtk.manpages@googlemail.com>
Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/sysctl/vm.txt |  4 ++--
 Documentation/vm/ksm.txt    |  4 ++++
 include/linux/mm.h          |  6 ++++--
 mm/madvise.c                | 51 +++++++++++++++++++++++++++++++++++++--------
 mm/mmap.c                   |  8 +++----
 5 files changed, 56 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 95ccbe6d79ce..b4ad97f10b8e 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -376,8 +376,8 @@ max_map_count:
 
 This file contains the maximum number of memory map areas a process
 may have. Memory map areas are used as a side-effect of calling
-malloc, directly by mmap and mprotect, and also when loading shared
-libraries.
+malloc, directly by mmap, mprotect, and madvise, and also when loading
+shared libraries.
 
 While most applications need less than a thousand maps, certain
 programs, particularly malloc debuggers, may consume lots of them,
diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt
index 0c64a81d6808..6b0ca7feb135 100644
--- a/Documentation/vm/ksm.txt
+++ b/Documentation/vm/ksm.txt
@@ -38,6 +38,10 @@ the range for whenever the KSM daemon is started; even if the range
 cannot contain any pages which KSM could actually merge; even if
 MADV_UNMERGEABLE is applied to a range which was never MADV_MERGEABLE.
 
+If a region of memory must be split into at least one new MADV_MERGEABLE
+or MADV_UNMERGEABLE region, the madvise may return ENOMEM if the process
+will exceed vm.max_map_count (see Documentation/sysctl/vm.txt).
+
 Like other madvise calls, they are intended for use on mapped areas of
 the user address space: they will report ENOMEM if the specified range
 includes unmapped gaps (though working on the intervening mapped areas),
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c6fcba1d1ae5..ed233b002e33 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2041,8 +2041,10 @@ extern struct vm_area_struct *vma_merge(struct mm_struct *,
 	unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
 	struct mempolicy *, struct vm_userfaultfd_ctx);
 extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
-extern int split_vma(struct mm_struct *,
-	struct vm_area_struct *, unsigned long addr, int new_below);
+extern int __split_vma(struct mm_struct *, struct vm_area_struct *,
+	unsigned long addr, int new_below);
+extern int split_vma(struct mm_struct *, struct vm_area_struct *,
+	unsigned long addr, int new_below);
 extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
 extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
 	struct rb_node **, struct rb_node *);
diff --git a/mm/madvise.c b/mm/madvise.c
index 0012071a6e50..11fc65f81ecd 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -92,14 +92,28 @@ static long madvise_behavior(struct vm_area_struct *vma,
 	case MADV_MERGEABLE:
 	case MADV_UNMERGEABLE:
 		error = ksm_madvise(vma, start, end, behavior, &new_flags);
-		if (error)
+		if (error) {
+			/*
+			 * madvise() returns EAGAIN if kernel resources, such as
+			 * slab, are temporarily unavailable.
+			 */
+			if (error == -ENOMEM)
+				error = -EAGAIN;
 			goto out;
+		}
 		break;
 	case MADV_HUGEPAGE:
 	case MADV_NOHUGEPAGE:
 		error = hugepage_madvise(vma, &new_flags, behavior);
-		if (error)
+		if (error) {
+			/*
+			 * madvise() returns EAGAIN if kernel resources, such as
+			 * slab, are temporarily unavailable.
+			 */
+			if (error == -ENOMEM)
+				error = -EAGAIN;
 			goto out;
+		}
 		break;
 	}
 
@@ -120,15 +134,37 @@ static long madvise_behavior(struct vm_area_struct *vma,
 	*prev = vma;
 
 	if (start != vma->vm_start) {
-		error = split_vma(mm, vma, start, 1);
-		if (error)
+		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
+			error = -ENOMEM;
 			goto out;
+		}
+		error = __split_vma(mm, vma, start, 1);
+		if (error) {
+			/*
+			 * madvise() returns EAGAIN if kernel resources, such as
+			 * slab, are temporarily unavailable.
+			 */
+			if (error == -ENOMEM)
+				error = -EAGAIN;
+			goto out;
+		}
 	}
 
 	if (end != vma->vm_end) {
-		error = split_vma(mm, vma, end, 0);
-		if (error)
+		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
+			error = -ENOMEM;
+			goto out;
+		}
+		error = __split_vma(mm, vma, end, 0);
+		if (error) {
+			/*
+			 * madvise() returns EAGAIN if kernel resources, such as
+			 * slab, are temporarily unavailable.
+			 */
+			if (error == -ENOMEM)
+				error = -EAGAIN;
 			goto out;
+		}
 	}
 
 success:
@@ -136,10 +172,7 @@ success:
 	 * vm_flags is protected by the mmap_sem held in write mode.
 	 */
 	vma->vm_flags = new_flags;
-
 out:
-	if (error == -ENOMEM)
-		error = -EAGAIN;
 	return error;
 }
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 1cec28d20583..499b988b1639 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2499,11 +2499,11 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 /*
- * __split_vma() bypasses sysctl_max_map_count checking.  We use this on the
- * munmap path where it doesn't make sense to fail.
+ * __split_vma() bypasses sysctl_max_map_count checking.  We use this where it
+ * has already been checked or doesn't make sense to fail.
  */
-static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
-	      unsigned long addr, int new_below)
+int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long addr, int new_below)
 {
 	struct vm_area_struct *new;
 	int err;
-- 
cgit v1.2.3


From 3a4f8a0b3ffa733ffbb327685e83b63383127cf6 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Fri, 24 Feb 2017 14:59:36 -0800
Subject: mm: remove shmem_mapping() shmem_zero_setup() duplicates

Remove the prototypes for shmem_mapping() and shmem_zero_setup() from
linux/mm.h, since they are already provided in linux/shmem_fs.h.  But
shmem_fs.h must then provide the inline stub for shmem_mapping() when
CONFIG_SHMEM is not set, and a few more cfiles now need to #include it.

Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1702081658250.1549@eggly.anvils
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/microblaze/pci/pci-common.c |  1 +
 arch/powerpc/kernel/pci-common.c |  1 +
 include/linux/mm.h               | 10 ----------
 include/linux/shmem_fs.h         |  7 +++++++
 mm/madvise.c                     |  1 +
 mm/memcontrol.c                  |  1 +
 mm/mincore.c                     |  1 +
 mm/truncate.c                    |  1 +
 mm/workingset.c                  |  1 +
 9 files changed, 14 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c
index 7f696f97f9dd..13bc93242c0c 100644
--- a/arch/microblaze/pci/pci-common.c
+++ b/arch/microblaze/pci/pci-common.c
@@ -22,6 +22,7 @@
 #include <linux/init.h>
 #include <linux/bootmem.h>
 #include <linux/mm.h>
+#include <linux/shmem_fs.h>
 #include <linux/list.h>
 #include <linux/syscalls.h>
 #include <linux/irq.h>
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 74bec5498972..a3f5334f5d8c 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -25,6 +25,7 @@
 #include <linux/of_address.h>
 #include <linux/of_pci.h>
 #include <linux/mm.h>
+#include <linux/shmem_fs.h>
 #include <linux/list.h>
 #include <linux/syscalls.h>
 #include <linux/irq.h>
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ed233b002e33..0d65dd72c0f4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1168,16 +1168,6 @@ extern void pagefault_out_of_memory(void);
 
 extern void show_free_areas(unsigned int flags, nodemask_t *nodemask);
 
-int shmem_zero_setup(struct vm_area_struct *);
-#ifdef CONFIG_SHMEM
-bool shmem_mapping(struct address_space *mapping);
-#else
-static inline bool shmem_mapping(struct address_space *mapping)
-{
-	return false;
-}
-#endif
-
 extern bool can_do_mlock(void);
 extern int user_shm_lock(size_t, struct user_struct *);
 extern void user_shm_unlock(size_t, struct user_struct *);
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index fdaac9d4d46d..a7d6bd2a918f 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -57,7 +57,14 @@ extern int shmem_zero_setup(struct vm_area_struct *);
 extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr,
 		unsigned long len, unsigned long pgoff, unsigned long flags);
 extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
+#ifdef CONFIG_SHMEM
 extern bool shmem_mapping(struct address_space *mapping);
+#else
+static inline bool shmem_mapping(struct address_space *mapping)
+{
+	return false;
+}
+#endif /* CONFIG_SHMEM */
 extern void shmem_unlock_mapping(struct address_space *mapping);
 extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
 					pgoff_t index, gfp_t gfp_mask);
diff --git a/mm/madvise.c b/mm/madvise.c
index 11fc65f81ecd..dc5927c812d3 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -21,6 +21,7 @@
 #include <linux/backing-dev.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/shmem_fs.h>
 #include <linux/mmu_notifier.h>
 
 #include <asm/tlb.h>
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1fd6affcdde7..45867e439d31 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,6 +35,7 @@
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
 #include <linux/mm.h>
+#include <linux/shmem_fs.h>
 #include <linux/hugetlb.h>
 #include <linux/pagemap.h>
 #include <linux/smp.h>
diff --git a/mm/mincore.c b/mm/mincore.c
index ddb872da3f5b..c5687c45c326 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -14,6 +14,7 @@
 #include <linux/syscalls.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/shmem_fs.h>
 #include <linux/hugetlb.h>
 
 #include <linux/uaccess.h>
diff --git a/mm/truncate.c b/mm/truncate.c
index dd7b24e083c5..f2db67465495 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -20,6 +20,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/buffer_head.h>	/* grr. try_to_release_page,
 				   do_invalidatepage */
+#include <linux/shmem_fs.h>
 #include <linux/cleancache.h>
 #include <linux/rmap.h>
 #include "internal.h"
diff --git a/mm/workingset.c b/mm/workingset.c
index a67f5796b995..79ed5364375d 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -6,6 +6,7 @@
 
 #include <linux/memcontrol.h>
 #include <linux/writeback.h>
+#include <linux/shmem_fs.h>
 #include <linux/pagemap.h>
 #include <linux/atomic.h>
 #include <linux/module.h>
-- 
cgit v1.2.3


From dc18d706a4367454ad1fc51e06148d54e8ecfaa0 Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Fri, 24 Feb 2017 15:00:02 -0800
Subject: memory-hotplug: use dev_online for memhp_auto_online

Commit 31bc3858ea3e ("add automatic onlining policy for the newly added
memory") provides the capability to have added memory automatically
onlined during add, but this appears to be slightly broken.

The current implementation uses walk_memory_range() to call
online_memory_block, which uses memory_block_change_state() to online
the memory.  Instead, we should be calling device_online() for the
memory block in online_memory_block().  This would online the memory
(the memory bus online routine memory_subsys_online() called from
device_online calls memory_block_change_state()) and properly update the
device struct offline flag.

As a result of the current implementation, attempting to remove a memory
block after adding it using auto online fails.  This is because doing a
remove, for instance

  echo offline > /sys/devices/system/memory/memoryXXX/state

uses device_offline() which checks the dev->offline flag.

Link: http://lkml.kernel.org/r/20170222220744.8119.19687.stgit@ltcalpine2-lp14.aus.stglabs.ibm.com
Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michael Roth <mdroth@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/memory.c  | 2 +-
 include/linux/memory.h | 3 ---
 mm/memory_hotplug.c    | 2 +-
 3 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index fa26ffd25fa6..cc4f1d0cbffe 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -249,7 +249,7 @@ memory_block_action(unsigned long phys_index, unsigned long action, int online_t
 	return ret;
 }
 
-int memory_block_change_state(struct memory_block *mem,
+static int memory_block_change_state(struct memory_block *mem,
 		unsigned long to_state, unsigned long from_state_req)
 {
 	int ret = 0;
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 093607f90b91..b723a686fc10 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -109,9 +109,6 @@ extern void unregister_memory_notifier(struct notifier_block *nb);
 extern int register_memory_isolate_notifier(struct notifier_block *nb);
 extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
 extern int register_new_memory(int, struct mem_section *);
-extern int memory_block_change_state(struct memory_block *mem,
-				     unsigned long to_state,
-				     unsigned long from_state_req);
 #ifdef CONFIG_MEMORY_HOTREMOVE
 extern int unregister_memory_section(struct mem_section *);
 #endif
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c35dd1976574..1d3ed58f92ab 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1337,7 +1337,7 @@ int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
 
 static int online_memory_block(struct memory_block *mem, void *arg)
 {
-	return memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
+	return device_online(&mem->dev);
 }
 
 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
-- 
cgit v1.2.3


From f9fa1d919c696e90c887d8742198023e7639d139 Mon Sep 17 00:00:00 2001
From: Greg Thelen <gthelen@google.com>
Date: Fri, 24 Feb 2017 15:00:05 -0800
Subject: kasan: drain quarantine of memcg slab objects

Per memcg slab accounting and kasan have a problem with kmem_cache
destruction.
 - kmem_cache_create() allocates a kmem_cache, which is used for
   allocations from processes running in root (top) memcg.
 - Processes running in non root memcg and allocating with either
   __GFP_ACCOUNT or from a SLAB_ACCOUNT cache use a per memcg
   kmem_cache.
 - Kasan catches use-after-free by having kfree() and kmem_cache_free()
   defer freeing of objects. Objects are placed in a quarantine.
 - kmem_cache_destroy() destroys root and non root kmem_caches. It takes
   care to drain the quarantine of objects from the root memcg's
   kmem_cache, but ignores objects associated with non root memcg. This
   causes leaks because quarantined per memcg objects refer to per memcg
   kmem cache being destroyed.

To see the problem:

 1) create a slab cache with kmem_cache_create(,,,SLAB_ACCOUNT,)
 2) from non root memcg, allocate and free a few objects from cache
 3) dispose of the cache with kmem_cache_destroy() kmem_cache_destroy()
    will trigger a "Slab cache still has objects" warning indicating
    that the per memcg kmem_cache structure was leaked.

Fix the leak by draining kasan quarantined objects allocated from non
root memcg.

Racing memcg deletion is tricky, but handled.  kmem_cache_destroy() =>
shutdown_memcg_caches() => __shutdown_memcg_cache() => shutdown_cache()
flushes per memcg quarantined objects, even if that memcg has been
rmdir'd and gone through memcg_deactivate_kmem_caches().

This leak only affects destroyed SLAB_ACCOUNT kmem caches when kasan is
enabled.  So I don't think it's worth patching stable kernels.

Link: http://lkml.kernel.org/r/1482257462-36948-1-git-send-email-gthelen@google.com
Signed-off-by: Greg Thelen <gthelen@google.com>
Reviewed-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Acked-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h | 4 ++--
 mm/kasan/kasan.c      | 2 +-
 mm/kasan/quarantine.c | 1 +
 mm/slab_common.c      | 4 +++-
 4 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 820c0ad54a01..c908b25bf5a5 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -52,7 +52,7 @@ void kasan_free_pages(struct page *page, unsigned int order);
 void kasan_cache_create(struct kmem_cache *cache, size_t *size,
 			unsigned long *flags);
 void kasan_cache_shrink(struct kmem_cache *cache);
-void kasan_cache_destroy(struct kmem_cache *cache);
+void kasan_cache_shutdown(struct kmem_cache *cache);
 
 void kasan_poison_slab(struct page *page);
 void kasan_unpoison_object_data(struct kmem_cache *cache, void *object);
@@ -98,7 +98,7 @@ static inline void kasan_cache_create(struct kmem_cache *cache,
 				      size_t *size,
 				      unsigned long *flags) {}
 static inline void kasan_cache_shrink(struct kmem_cache *cache) {}
-static inline void kasan_cache_destroy(struct kmem_cache *cache) {}
+static inline void kasan_cache_shutdown(struct kmem_cache *cache) {}
 
 static inline void kasan_poison_slab(struct page *page) {}
 static inline void kasan_unpoison_object_data(struct kmem_cache *cache,
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index b2a0cff2bb35..25f0e6521f36 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -435,7 +435,7 @@ void kasan_cache_shrink(struct kmem_cache *cache)
 	quarantine_remove_cache(cache);
 }
 
-void kasan_cache_destroy(struct kmem_cache *cache)
+void kasan_cache_shutdown(struct kmem_cache *cache)
 {
 	quarantine_remove_cache(cache);
 }
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index dae929c02bbb..6f1ed1630873 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -274,6 +274,7 @@ static void per_cpu_remove_cache(void *arg)
 	qlist_free_all(&to_free, cache);
 }
 
+/* Free all quarantined objects belonging to cache. */
 void quarantine_remove_cache(struct kmem_cache *cache)
 {
 	unsigned long flags, i;
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 23ff74e61838..09d0e849b07f 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -528,6 +528,9 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
 
 static int shutdown_cache(struct kmem_cache *s)
 {
+	/* free asan quarantined objects */
+	kasan_cache_shutdown(s);
+
 	if (__kmem_cache_shutdown(s) != 0)
 		return -EBUSY;
 
@@ -816,7 +819,6 @@ void kmem_cache_destroy(struct kmem_cache *s)
 	get_online_cpus();
 	get_online_mems();
 
-	kasan_cache_destroy(s);
 	mutex_lock(&slab_mutex);
 
 	s->refcount--;
-- 
cgit v1.2.3


From 796f571b0c5cf3efd2f652779770fa7bbbc2bb03 Mon Sep 17 00:00:00 2001
From: Lafcadio Wluiki <wluikil@gmail.com>
Date: Fri, 24 Feb 2017 15:00:23 -0800
Subject: procfs: use an enum for possible hidepid values
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, the hidepid parameter was checked by comparing literal
integers 0, 1, 2.  Let's add a proper enum for this, to make the
checking more expressive:

        0 → HIDEPID_OFF
        1 → HIDEPID_NO_ACCESS
        2 → HIDEPID_INVISIBLE

This changes the internal labelling only, the userspace-facing interface
remains unmodified, and still works with literal integers 0, 1, 2.

No functional changes.

Link: http://lkml.kernel.org/r/1484572984-13388-2-git-send-email-djalal@gmail.com
Signed-off-by: Lafcadio Wluiki <wluikil@gmail.com>
Signed-off-by: Djalal Harouni <tixxdz@gmail.com>
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c                | 8 ++++----
 fs/proc/inode.c               | 2 +-
 fs/proc/root.c                | 3 ++-
 include/linux/pid_namespace.h | 6 ++++++
 4 files changed, 13 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 4ecb5edc3c61..b8f06273353e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -697,11 +697,11 @@ static int proc_pid_permission(struct inode *inode, int mask)
 	task = get_proc_task(inode);
 	if (!task)
 		return -ESRCH;
-	has_perms = has_pid_permissions(pid, task, 1);
+	has_perms = has_pid_permissions(pid, task, HIDEPID_NO_ACCESS);
 	put_task_struct(task);
 
 	if (!has_perms) {
-		if (pid->hide_pid == 2) {
+		if (pid->hide_pid == HIDEPID_INVISIBLE) {
 			/*
 			 * Let's make getdents(), stat(), and open()
 			 * consistent with each other.  If a process
@@ -1737,7 +1737,7 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 	stat->gid = GLOBAL_ROOT_GID;
 	task = pid_task(proc_pid(inode), PIDTYPE_PID);
 	if (task) {
-		if (!has_pid_permissions(pid, task, 2)) {
+		if (!has_pid_permissions(pid, task, HIDEPID_INVISIBLE)) {
 			rcu_read_unlock();
 			/*
 			 * This doesn't prevent learning whether PID exists,
@@ -3168,7 +3168,7 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx)
 		int len;
 
 		cond_resched();
-		if (!has_pid_permissions(ns, iter.task, 2))
+		if (!has_pid_permissions(ns, iter.task, HIDEPID_INVISIBLE))
 			continue;
 
 		len = snprintf(name, sizeof(name), "%d", iter.tgid);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 7ad9ed7958af..2cc7a8030275 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -107,7 +107,7 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root)
 
 	if (!gid_eq(pid->pid_gid, GLOBAL_ROOT_GID))
 		seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid));
-	if (pid->hide_pid != 0)
+	if (pid->hide_pid != HIDEPID_OFF)
 		seq_printf(seq, ",hidepid=%u", pid->hide_pid);
 
 	return 0;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 1988440b2049..b90da888b81a 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -58,7 +58,8 @@ int proc_parse_options(char *options, struct pid_namespace *pid)
 		case Opt_hidepid:
 			if (match_int(&args[0], &option))
 				return 0;
-			if (option < 0 || option > 2) {
+			if (option < HIDEPID_OFF ||
+			    option > HIDEPID_INVISIBLE) {
 				pr_err("proc: hidepid value must be between 0 and 2.\n");
 				return 0;
 			}
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 34cce96741bc..c2a989dee876 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -21,6 +21,12 @@ struct pidmap {
 
 struct fs_pin;
 
+enum { /* definitions for pid_namespace's hide_pid field */
+	HIDEPID_OFF	  = 0,
+	HIDEPID_NO_ACCESS = 1,
+	HIDEPID_INVISIBLE = 2,
+};
+
 struct pid_namespace {
 	struct kref kref;
 	struct pidmap pidmap[PIDMAP_ENTRIES];
-- 
cgit v1.2.3


From 16f4e3195156f2f06e90d00a36bc48d7a513f253 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Fri, 24 Feb 2017 15:00:29 -0800
Subject: include/linux/iopoll.h: include <linux/ktime.h> instead of
 <linux/hrtimer.h>

The timer APIs this header needs are ktime_get(), ktime_add_us(), and
ktime_compare().  So, including <linux/ktime.h> seems enough.  This
commit will cut unnecessary header file parsing.

Link: http://lkml.kernel.org/r/1481679225-10885-1-git-send-email-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/iopoll.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iopoll.h b/include/linux/iopoll.h
index 1c30014ed176..d29e1e21bf3f 100644
--- a/include/linux/iopoll.h
+++ b/include/linux/iopoll.h
@@ -17,7 +17,7 @@
 
 #include <linux/kernel.h>
 #include <linux/types.h>
-#include <linux/hrtimer.h>
+#include <linux/ktime.h>
 #include <linux/delay.h>
 #include <linux/errno.h>
 #include <linux/io.h>
-- 
cgit v1.2.3


From a3f0825e7e37d99a02a8a1b1599687667ee50d04 Mon Sep 17 00:00:00 2001
From: Gideon Israel Dsouza <gidisrael@gmail.com>
Date: Fri, 24 Feb 2017 15:00:32 -0800
Subject: compiler-gcc.h: add a new macro to wrap gcc attribute

Add __mode(x) into compiler-gcc.h as part of a cleanup task I've taken
up, to replace gcc specific attributes with macros.

The next patch is a cleanup of the m68k subsystem and it requires a new
macro to wrap __attribute__ ((mode (...)))

Link: http://lkml.kernel.org/r/1485540901-1988-2-git-send-email-gidisrael@gmail.com
Signed-off-by: Gideon Israel Dsouza <gidisrael@gmail.com>
Cc: Greg Ungerer <gerg@linux-m68k.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler-gcc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index fddd1a5eb322..811f7a915658 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -122,6 +122,7 @@
 #define __attribute_const__	__attribute__((__const__))
 #define __maybe_unused		__attribute__((unused))
 #define __always_unused		__attribute__((unused))
+#define __mode(x)               __attribute__((mode(x)))
 
 /* gcc version specific checks */
 
-- 
cgit v1.2.3


From 85caa95b9f19bb3a26d7e025d1134760b69e0c40 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 24 Feb 2017 15:00:38 -0800
Subject: bug: switch data corruption check to __must_check

The CHECK_DATA_CORRUPTION() macro was designed to have callers do
something meaningful/protective on failure.  However, using "return
false" in the macro too strictly limits the design patterns of callers.
Instead, let callers handle the logic test directly, but make sure that
the result IS checked by forcing __must_check (which appears to not be
able to be used directly on macro expressions).

Link: http://lkml.kernel.org/r/20170206204547.GA125312@beast
Signed-off-by: Kees Cook <keescook@chromium.org>
Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bug.h | 12 +++++++-----
 lib/list_debug.c    | 45 ++++++++++++++++++++++++---------------------
 2 files changed, 31 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bug.h b/include/linux/bug.h
index baff2e8fc8a8..5828489309bb 100644
--- a/include/linux/bug.h
+++ b/include/linux/bug.h
@@ -124,18 +124,20 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr,
 
 /*
  * Since detected data corruption should stop operation on the affected
- * structures, this returns false if the corruption condition is found.
+ * structures. Return value must be checked and sanely acted on by caller.
  */
+static inline __must_check bool check_data_corruption(bool v) { return v; }
 #define CHECK_DATA_CORRUPTION(condition, fmt, ...)			 \
-	do {								 \
-		if (unlikely(condition)) {				 \
+	check_data_corruption(({					 \
+		bool corruption = unlikely(condition);			 \
+		if (corruption) {					 \
 			if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \
 				pr_err(fmt, ##__VA_ARGS__);		 \
 				BUG();					 \
 			} else						 \
 				WARN(1, fmt, ##__VA_ARGS__);		 \
-			return false;					 \
 		}							 \
-	} while (0)
+		corruption;						 \
+	}))
 
 #endif	/* _LINUX_BUG_H */
diff --git a/lib/list_debug.c b/lib/list_debug.c
index 7f7bfa55eb6d..a34db8d27667 100644
--- a/lib/list_debug.c
+++ b/lib/list_debug.c
@@ -20,15 +20,16 @@
 bool __list_add_valid(struct list_head *new, struct list_head *prev,
 		      struct list_head *next)
 {
-	CHECK_DATA_CORRUPTION(next->prev != prev,
-		"list_add corruption. next->prev should be prev (%p), but was %p. (next=%p).\n",
-		prev, next->prev, next);
-	CHECK_DATA_CORRUPTION(prev->next != next,
-		"list_add corruption. prev->next should be next (%p), but was %p. (prev=%p).\n",
-		next, prev->next, prev);
-	CHECK_DATA_CORRUPTION(new == prev || new == next,
-		"list_add double add: new=%p, prev=%p, next=%p.\n",
-		new, prev, next);
+	if (CHECK_DATA_CORRUPTION(next->prev != prev,
+			"list_add corruption. next->prev should be prev (%p), but was %p. (next=%p).\n",
+			prev, next->prev, next) ||
+	    CHECK_DATA_CORRUPTION(prev->next != next,
+			"list_add corruption. prev->next should be next (%p), but was %p. (prev=%p).\n",
+			next, prev->next, prev) ||
+	    CHECK_DATA_CORRUPTION(new == prev || new == next,
+			"list_add double add: new=%p, prev=%p, next=%p.\n",
+			new, prev, next))
+		return false;
 
 	return true;
 }
@@ -41,18 +42,20 @@ bool __list_del_entry_valid(struct list_head *entry)
 	prev = entry->prev;
 	next = entry->next;
 
-	CHECK_DATA_CORRUPTION(next == LIST_POISON1,
-		"list_del corruption, %p->next is LIST_POISON1 (%p)\n",
-		entry, LIST_POISON1);
-	CHECK_DATA_CORRUPTION(prev == LIST_POISON2,
-		"list_del corruption, %p->prev is LIST_POISON2 (%p)\n",
-		entry, LIST_POISON2);
-	CHECK_DATA_CORRUPTION(prev->next != entry,
-		"list_del corruption. prev->next should be %p, but was %p\n",
-		entry, prev->next);
-	CHECK_DATA_CORRUPTION(next->prev != entry,
-		"list_del corruption. next->prev should be %p, but was %p\n",
-		entry, next->prev);
+	if (CHECK_DATA_CORRUPTION(next == LIST_POISON1,
+			"list_del corruption, %p->next is LIST_POISON1 (%p)\n",
+			entry, LIST_POISON1) ||
+	    CHECK_DATA_CORRUPTION(prev == LIST_POISON2,
+			"list_del corruption, %p->prev is LIST_POISON2 (%p)\n",
+			entry, LIST_POISON2) ||
+	    CHECK_DATA_CORRUPTION(prev->next != entry,
+			"list_del corruption. prev->next should be %p, but was %p\n",
+			entry, prev->next) ||
+	    CHECK_DATA_CORRUPTION(next->prev != entry,
+			"list_del corruption. next->prev should be %p, but was %p\n",
+			entry, next->prev))
+		return false;
+
 	return true;
 
 }
-- 
cgit v1.2.3


From 4f5901f5a6724f4ed1641e4a94978c5039a1f8c4 Mon Sep 17 00:00:00 2001
From: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Date: Fri, 24 Feb 2017 15:01:01 -0800
Subject: linux/kernel.h: fix DIV_ROUND_CLOSEST to support negative divisors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

While working on a thermal driver I encounter a scenario where the
divisor could be negative, instead of adding local code to handle this I
though I first try to add support for this in DIV_ROUND_CLOSEST.

Add support to DIV_ROUND_CLOSEST for negative divisors if both dividend
and divisor variable types are signed.  This should not alter current
behavior for users of the macro as previously negative divisors where
not supported.

Before:

DIV_ROUND_CLOSEST(  59,  4) =  15
DIV_ROUND_CLOSEST(  59, -4) = -14
DIV_ROUND_CLOSEST( -59,  4) = -15
DIV_ROUND_CLOSEST( -59, -4) =  14

After:

DIV_ROUND_CLOSEST(  59,  4) =  15
DIV_ROUND_CLOSEST(  59, -4) = -15
DIV_ROUND_CLOSEST( -59,  4) = -15
DIV_ROUND_CLOSEST( -59, -4) =  15

[akpm@linux-foundation.org: fix comment, per Guenter]
Link: http://lkml.kernel.org/r/20161222102217.29011-1-niklas.soderlund+renesas@ragnatech.se
Signed-off-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index cb09238f6d32..4c26dc3a8295 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -100,16 +100,18 @@
 )
 
 /*
- * Divide positive or negative dividend by positive divisor and round
- * to closest integer. Result is undefined for negative divisors and
- * for negative dividends if the divisor variable type is unsigned.
+ * Divide positive or negative dividend by positive or negative divisor
+ * and round to closest integer. Result is undefined for negative
+ * divisors if he dividend variable type is unsigned and for negative
+ * dividends if the divisor variable type is unsigned.
  */
 #define DIV_ROUND_CLOSEST(x, divisor)(			\
 {							\
 	typeof(x) __x = x;				\
 	typeof(divisor) __d = divisor;			\
 	(((typeof(x))-1) > 0 ||				\
-	 ((typeof(divisor))-1) > 0 || (__x) > 0) ?	\
+	 ((typeof(divisor))-1) > 0 ||			\
+	 (((__x) > 0) == ((__d) > 0))) ?		\
 		(((__x) + ((__d) / 2)) / (__d)) :	\
 		(((__x) - ((__d) / 2)) / (__d));	\
 }							\
-- 
cgit v1.2.3


From f231aebfc4cae2f6ed27a46a31e2630909513d77 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 24 Feb 2017 15:01:04 -0800
Subject: rbtree: use designated initializers

Prepare to mark sensitive kernel structures for randomization by making
sure they're using designated initializers.  These were identified
during allyesconfig builds of x86, arm, and arm64, with most initializer
fixes extracted from grsecurity.

Link: http://lkml.kernel.org/r/20161217010253.GA140470@beast
Signed-off-by: Kees Cook <keescook@chromium.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Jie Chen <fykcee1@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rbtree_augmented.h | 4 +++-
 lib/rbtree.c                     | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
index d076183e49be..9702b6e183bc 100644
--- a/include/linux/rbtree_augmented.h
+++ b/include/linux/rbtree_augmented.h
@@ -90,7 +90,9 @@ rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new)	\
 	old->rbaugmented = rbcompute(old);				\
 }									\
 rbstatic const struct rb_augment_callbacks rbname = {			\
-	rbname ## _propagate, rbname ## _copy, rbname ## _rotate	\
+	.propagate = rbname ## _propagate,				\
+	.copy = rbname ## _copy,					\
+	.rotate = rbname ## _rotate					\
 };
 
 
diff --git a/lib/rbtree.c b/lib/rbtree.c
index 1f8b112a7c35..4ba2828a67c0 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -427,7 +427,9 @@ static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {}
 static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {}
 
 static const struct rb_augment_callbacks dummy_callbacks = {
-	dummy_propagate, dummy_copy, dummy_rotate
+	.propagate = dummy_propagate,
+	.copy = dummy_copy,
+	.rotate = dummy_rotate
 };
 
 void rb_insert_color(struct rb_node *node, struct rb_root *root)
-- 
cgit v1.2.3


From 4e1a33b105ddf201f66dcc44490c6086a25eca0b Mon Sep 17 00:00:00 2001
From: Sven Schmidt <4sschmid@informatik.uni-hamburg.de>
Date: Fri, 24 Feb 2017 15:01:12 -0800
Subject: lib: update LZ4 compressor module

Patch series "Update LZ4 compressor module", v7.

This patchset updates the LZ4 compression module to a version based on
LZ4 v1.7.3 allowing to use the fast compression algorithm aka LZ4 fast
which provides an "acceleration" parameter as a tradeoff between high
compression ratio and high compression speed.

We want to use LZ4 fast in order to support compression in lustre and
(mostly, based on that) investigate data reduction techniques in behalf
of storage systems.

Also, it will be useful for other users of LZ4 compression, as with LZ4
fast it is possible to enable applications to use fast and/or high
compression depending on the usecase.  For instance, ZRAM is offering a
LZ4 backend and could benefit from an updated LZ4 in the kernel.

LZ4 homepage: http://www.lz4.org/
LZ4 source repository: https://github.com/lz4/lz4 Source version: 1.7.3

Benchmark (taken from [1], Core i5-4300U @1.9GHz):
----------------|--------------|----------------|----------
Compressor      | Compression  | Decompression  | Ratio
----------------|--------------|----------------|----------
memcpy          |  4200 MB/s   |  4200 MB/s     | 1.000
LZ4 fast 50     |  1080 MB/s   |  2650 MB/s     | 1.375
LZ4 fast 17     |   680 MB/s   |  2220 MB/s     | 1.607
LZ4 fast 5      |   475 MB/s   |  1920 MB/s     | 1.886
LZ4 default     |   385 MB/s   |  1850 MB/s     | 2.101

[1] http://fastcompression.blogspot.de/2015/04/sampling-or-faster-lz4.html

[PATCH 1/5] lib: Update LZ4 compressor module
[PATCH 2/5] lib/decompress_unlz4: Change module to work with new LZ4 module version
[PATCH 3/5] crypto: Change LZ4 modules to work with new LZ4 module version
[PATCH 4/5] fs/pstore: fs/squashfs: Change usage of LZ4 to work with new LZ4 version
[PATCH 5/5] lib/lz4: Remove back-compat wrappers

This patch (of 5):

Update the LZ4 kernel module to LZ4 v1.7.3 by Yann Collet.  The kernel
module is inspired by the previous work by Chanho Min.  The updated LZ4
module will not break existing code since the patchset contains
appropriate changes.

API changes:

New method LZ4_compress_fast which differs from the variant available in
kernel by the new acceleration parameter, allowing to trade compression
ratio for more compression speed and vice versa.

LZ4_decompress_fast is the respective decompression method, featuring a
very fast decoder (multiple GB/s per core), able to reach RAM speed in
multi-core systems.  The decompressor allows to decompress data
compressed with LZ4 fast as well as the LZ4 HC (high compression)
algorithm.

Also the useful functions LZ4_decompress_safe_partial and
LZ4_compress_destsize were added.  The latter reverses the logic by
trying to compress as much data as possible from source to dest while
the former aims to decompress partial blocks of data.

A bunch of streaming functions were also added which allow
compressig/decompressing data in multiple steps (so called "streaming
mode").

The methods lz4_compress and lz4_decompress_unknownoutputsize are now
known as LZ4_compress_default respectivley LZ4_decompress_safe.  The old
methods will be removed since there's no callers left in the code.

[arnd@arndb.de: fix KERNEL_LZ4 support]
  Link: http://lkml.kernel.org/r/20170208211946.2839649-1-arnd@arndb.de
[akpm@linux-foundation.org: simplify]
[akpm@linux-foundation.org: fix the simplification]
[4sschmid@informatik.uni-hamburg.de: fix performance regressions]
  Link: http://lkml.kernel.org/r/1486898178-17125-2-git-send-email-4sschmid@informatik.uni-hamburg.de
[4sschmid@informatik.uni-hamburg.de: v8]
  Link: http://lkml.kernel.org/r/1487182598-15351-2-git-send-email-4sschmid@informatik.uni-hamburg.de
Link: http://lkml.kernel.org/r/1486321748-19085-2-git-send-email-4sschmid@informatik.uni-hamburg.de
Signed-off-by: Sven Schmidt <4sschmid@informatik.uni-hamburg.de>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Bongkyu Kim <bongkyu.kim@lge.com>
Cc: Rui Salvaterra <rsalvaterra@gmail.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: David S. Miller <davem@davemloft.net>
Cc: Anton Vorontsov <anton@enomsg.org>
Cc: Colin Cross <ccross@android.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Tony Luck <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/lz4.h      |  762 +++++++++++++++++++++++++++---
 lib/lz4/Makefile         |    2 +
 lib/lz4/lz4_compress.c   | 1161 +++++++++++++++++++++++++++++++++-------------
 lib/lz4/lz4_decompress.c |  705 ++++++++++++++++++----------
 lib/lz4/lz4defs.h        |  338 ++++++++------
 lib/lz4/lz4hc_compress.c |  867 ++++++++++++++++++++++------------
 6 files changed, 2758 insertions(+), 1077 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lz4.h b/include/linux/lz4.h
index 6b784c59f321..1b0f8ca0f9c8 100644
--- a/include/linux/lz4.h
+++ b/include/linux/lz4.h
@@ -1,87 +1,717 @@
-#ifndef __LZ4_H__
-#define __LZ4_H__
-/*
- * LZ4 Kernel Interface
+/* LZ4 Kernel Interface
  *
  * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com>
+ * Copyright (C) 2016, Sven Schmidt <4sschmid@informatik.uni-hamburg.de>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
+ *
+ * This file is based on the original header file
+ * for LZ4 - Fast LZ compression algorithm.
+ *
+ * LZ4 - Fast LZ compression algorithm
+ * Copyright (C) 2011-2016, Yann Collet.
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *	* Redistributions of source code must retain the above copyright
+ *	  notice, this list of conditions and the following disclaimer.
+ *	* Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * You can contact the author at :
+ *	- LZ4 homepage : http://www.lz4.org
+ *	- LZ4 source repository : https://github.com/lz4/lz4
  */
-#define LZ4_MEM_COMPRESS	(16384)
-#define LZ4HC_MEM_COMPRESS	(262144 + (2 * sizeof(unsigned char *)))
 
+#ifndef __LZ4_H__
+#define __LZ4_H__
+
+#include <linux/types.h>
+#include <linux/string.h>	 /* memset, memcpy */
+
+/*-************************************************************************
+ *	CONSTANTS
+ **************************************************************************/
 /*
- * lz4_compressbound()
- * Provides the maximum size that LZ4 may output in a "worst case" scenario
- * (input data not compressible)
+ * LZ4_MEMORY_USAGE :
+ * Memory usage formula : N->2^N Bytes
+ * (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+ * Increasing memory usage improves compression ratio
+ * Reduced memory usage can improve speed, due to cache effect
+ * Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache
  */
-static inline size_t lz4_compressbound(size_t isize)
+#define LZ4_MEMORY_USAGE 14
+
+#define LZ4_MAX_INPUT_SIZE	0x7E000000 /* 2 113 929 216 bytes */
+#define LZ4_COMPRESSBOUND(isize)	(\
+	(unsigned int)(isize) > (unsigned int)LZ4_MAX_INPUT_SIZE \
+	? 0 \
+	: (isize) + ((isize)/255) + 16)
+
+#define LZ4_ACCELERATION_DEFAULT 1
+#define LZ4_HASHLOG	 (LZ4_MEMORY_USAGE-2)
+#define LZ4_HASHTABLESIZE (1 << LZ4_MEMORY_USAGE)
+#define LZ4_HASH_SIZE_U32 (1 << LZ4_HASHLOG)
+
+#define LZ4HC_MIN_CLEVEL			3
+#define LZ4HC_DEFAULT_CLEVEL			9
+#define LZ4HC_MAX_CLEVEL			16
+
+#define LZ4HC_DICTIONARY_LOGSIZE 16
+#define LZ4HC_MAXD (1<<LZ4HC_DICTIONARY_LOGSIZE)
+#define LZ4HC_MAXD_MASK (LZ4HC_MAXD - 1)
+#define LZ4HC_HASH_LOG (LZ4HC_DICTIONARY_LOGSIZE - 1)
+#define LZ4HC_HASHTABLESIZE (1 << LZ4HC_HASH_LOG)
+#define LZ4HC_HASH_MASK (LZ4HC_HASHTABLESIZE - 1)
+
+/*-************************************************************************
+ *	STREAMING CONSTANTS AND STRUCTURES
+ **************************************************************************/
+#define LZ4_STREAMSIZE_U64 ((1 << (LZ4_MEMORY_USAGE - 3)) + 4)
+#define LZ4_STREAMSIZE	(LZ4_STREAMSIZE_U64 * sizeof(unsigned long long))
+
+#define LZ4_STREAMHCSIZE        262192
+#define LZ4_STREAMHCSIZE_SIZET (262192 / sizeof(size_t))
+
+#define LZ4_STREAMDECODESIZE_U64	4
+#define LZ4_STREAMDECODESIZE		 (LZ4_STREAMDECODESIZE_U64 * \
+	sizeof(unsigned long long))
+
+/*
+ * LZ4_stream_t - information structure to track an LZ4 stream.
+ */
+typedef struct {
+	uint32_t hashTable[LZ4_HASH_SIZE_U32];
+	uint32_t currentOffset;
+	uint32_t initCheck;
+	const uint8_t *dictionary;
+	uint8_t *bufferStart;
+	uint32_t dictSize;
+} LZ4_stream_t_internal;
+typedef union {
+	unsigned long long table[LZ4_STREAMSIZE_U64];
+	LZ4_stream_t_internal internal_donotuse;
+} LZ4_stream_t;
+
+/*
+ * LZ4_streamHC_t - information structure to track an LZ4HC stream.
+ */
+typedef struct {
+	unsigned int	 hashTable[LZ4HC_HASHTABLESIZE];
+	unsigned short	 chainTable[LZ4HC_MAXD];
+	/* next block to continue on current prefix */
+	const unsigned char *end;
+	/* All index relative to this position */
+	const unsigned char *base;
+	/* alternate base for extDict */
+	const unsigned char *dictBase;
+	/* below that point, need extDict */
+	unsigned int	 dictLimit;
+	/* below that point, no more dict */
+	unsigned int	 lowLimit;
+	/* index from which to continue dict update */
+	unsigned int	 nextToUpdate;
+	unsigned int	 compressionLevel;
+} LZ4HC_CCtx_internal;
+typedef union {
+	size_t table[LZ4_STREAMHCSIZE_SIZET];
+	LZ4HC_CCtx_internal internal_donotuse;
+} LZ4_streamHC_t;
+
+/*
+ * LZ4_streamDecode_t - information structure to track an
+ *	LZ4 stream during decompression.
+ *
+ * init this structure using LZ4_setStreamDecode (or memset()) before first use
+ */
+typedef struct {
+	const uint8_t *externalDict;
+	size_t extDictSize;
+	const uint8_t *prefixEnd;
+	size_t prefixSize;
+} LZ4_streamDecode_t_internal;
+typedef union {
+	unsigned long long table[LZ4_STREAMDECODESIZE_U64];
+	LZ4_streamDecode_t_internal internal_donotuse;
+} LZ4_streamDecode_t;
+
+/*-************************************************************************
+ *	SIZE OF STATE
+ **************************************************************************/
+#define LZ4_MEM_COMPRESS	LZ4_STREAMSIZE
+#define LZ4HC_MEM_COMPRESS	LZ4_STREAMHCSIZE
+
+/*-************************************************************************
+ *	Compression Functions
+ **************************************************************************/
+
+/**
+ * LZ4_compressBound() - Max. output size in worst case szenarios
+ * @isize: Size of the input data
+ *
+ * Return: Max. size LZ4 may output in a "worst case" szenario
+ * (data not compressible)
+ */
+static inline int LZ4_compressBound(size_t isize)
 {
-	return isize + (isize / 255) + 16;
+	return LZ4_COMPRESSBOUND(isize);
 }
 
-/*
- * lz4_compress()
- *	src     : source address of the original data
- *	src_len : size of the original data
- *	dst	: output buffer address of the compressed data
- *		This requires 'dst' of size LZ4_COMPRESSBOUND.
- *	dst_len : is the output size, which is returned after compress done
- *	workmem : address of the working memory.
- *		This requires 'workmem' of size LZ4_MEM_COMPRESS.
- *	return  : Success if return 0
- *		  Error if return (< 0)
- *	note :  Destination buffer and workmem must be already allocated with
- *		the defined size.
- */
-int lz4_compress(const unsigned char *src, size_t src_len,
-		unsigned char *dst, size_t *dst_len, void *wrkmem);
-
- /*
-  * lz4hc_compress()
-  *	 src	 : source address of the original data
-  *	 src_len : size of the original data
-  *	 dst	 : output buffer address of the compressed data
-  *		This requires 'dst' of size LZ4_COMPRESSBOUND.
-  *	 dst_len : is the output size, which is returned after compress done
-  *	 workmem : address of the working memory.
-  *		This requires 'workmem' of size LZ4HC_MEM_COMPRESS.
-  *	 return  : Success if return 0
-  *		   Error if return (< 0)
-  *	 note :  Destination buffer and workmem must be already allocated with
-  *		 the defined size.
-  */
-int lz4hc_compress(const unsigned char *src, size_t src_len,
-		unsigned char *dst, size_t *dst_len, void *wrkmem);
+/**
+ * lz4_compressbound() - For backwards compatibility; see LZ4_compressBound
+ * @isize: Size of the input data
+ *
+ * Return: Max. size LZ4 may output in a "worst case" szenario
+ *	(data not compressible)
+ */
+static inline int lz4_compressbound(size_t isize)
+{
+	return LZ4_COMPRESSBOUND(isize);
+}
+
+/**
+ * LZ4_compress_default() - Compress data from source to dest
+ * @source: source address of the original data
+ * @dest: output buffer address of the compressed data
+ * @inputSize: size of the input data. Max supported value is LZ4_MAX_INPUT_SIZE
+ * @maxOutputSize: full or partial size of buffer 'dest'
+ *	which must be already allocated
+ * @wrkmem: address of the working memory.
+ *	This requires 'workmem' of LZ4_MEM_COMPRESS.
+ *
+ * Compresses 'sourceSize' bytes from buffer 'source'
+ * into already allocated 'dest' buffer of size 'maxOutputSize'.
+ * Compression is guaranteed to succeed if
+ * 'maxOutputSize' >= LZ4_compressBound(inputSize).
+ * It also runs faster, so it's a recommended setting.
+ * If the function cannot compress 'source' into a more limited 'dest' budget,
+ * compression stops *immediately*, and the function result is zero.
+ * As a consequence, 'dest' content is not valid.
+ *
+ * Return: Number of bytes written into buffer 'dest'
+ *	(necessarily <= maxOutputSize) or 0 if compression fails
+ */
+int LZ4_compress_default(const char *source, char *dest, int inputSize,
+	int maxOutputSize, void *wrkmem);
+
+/**
+ * LZ4_compress_fast() - As LZ4_compress_default providing an acceleration param
+ * @source: source address of the original data
+ * @dest: output buffer address of the compressed data
+ * @inputSize: size of the input data. Max supported value is LZ4_MAX_INPUT_SIZE
+ * @maxOutputSize: full or partial size of buffer 'dest'
+ *	which must be already allocated
+ * @acceleration: acceleration factor
+ * @wrkmem: address of the working memory.
+ *	This requires 'workmem' of LZ4_MEM_COMPRESS.
+ *
+ * Same as LZ4_compress_default(), but allows to select an "acceleration"
+ * factor. The larger the acceleration value, the faster the algorithm,
+ * but also the lesser the compression. It's a trade-off. It can be fine tuned,
+ * with each successive value providing roughly +~3% to speed.
+ * An acceleration value of "1" is the same as regular LZ4_compress_default()
+ * Values <= 0 will be replaced by LZ4_ACCELERATION_DEFAULT, which is 1.
+ *
+ * Return: Number of bytes written into buffer 'dest'
+ *	(necessarily <= maxOutputSize) or 0 if compression fails
+ */
+int LZ4_compress_fast(const char *source, char *dest, int inputSize,
+	int maxOutputSize, int acceleration, void *wrkmem);
+
+/**
+ * LZ4_compress_destSize() - Compress as much data as possible
+ *	from source to dest
+ * @source: source address of the original data
+ * @dest: output buffer address of the compressed data
+ * @sourceSizePtr: will be modified to indicate how many bytes where read
+ *	from 'source' to fill 'dest'. New value is necessarily <= old value.
+ * @targetDestSize: Size of buffer 'dest' which must be already allocated
+ * @wrkmem: address of the working memory.
+ *	This requires 'workmem' of LZ4_MEM_COMPRESS.
+ *
+ * Reverse the logic, by compressing as much data as possible
+ * from 'source' buffer into already allocated buffer 'dest'
+ * of size 'targetDestSize'.
+ * This function either compresses the entire 'source' content into 'dest'
+ * if it's large enough, or fill 'dest' buffer completely with as much data as
+ * possible from 'source'.
+ *
+ * Return: Number of bytes written into 'dest' (necessarily <= targetDestSize)
+ *	or 0 if compression fails
+ */
+int LZ4_compress_destSize(const char *source, char *dest, int *sourceSizePtr,
+	int targetDestSize, void *wrkmem);
 
 /*
- * lz4_decompress()
- *	src     : source address of the compressed data
- *	src_len : is the input size, whcih is returned after decompress done
- *	dest	: output buffer address of the decompressed data
- *	actual_dest_len: is the size of uncompressed data, supposing it's known
- *	return  : Success if return 0
- *		  Error if return (< 0)
- *	note :  Destination buffer must be already allocated.
- *		slightly faster than lz4_decompress_unknownoutputsize()
+ * lz4_compress() - For backward compatibility, see LZ4_compress_default
+ * @src: source address of the original data
+ * @src_len: size of the original data
+ * @dst: output buffer address of the compressed data. This requires 'dst'
+ *	of size LZ4_COMPRESSBOUND
+ * @dst_len: is the output size, which is returned after compress done
+ * @workmem: address of the working memory.
+ *
+ * Return: Success if return 0, Error if return < 0
  */
-int lz4_decompress(const unsigned char *src, size_t *src_len,
-		unsigned char *dest, size_t actual_dest_len);
+int lz4_compress(const unsigned char *src, size_t src_len, unsigned char *dst,
+	size_t *dst_len, void *wrkmem);
+
+/*-************************************************************************
+ *	Decompression Functions
+ **************************************************************************/
+
+/**
+ * LZ4_decompress_fast() - Decompresses data from 'source' into 'dest'
+ * @source: source address of the compressed data
+ * @dest: output buffer address of the uncompressed data
+ *	which must be already allocated with 'originalSize' bytes
+ * @originalSize: is the original and therefore uncompressed size
+ *
+ * Decompresses data from 'source' into 'dest'.
+ * This function fully respect memory boundaries for properly formed
+ * compressed data.
+ * It is a bit faster than LZ4_decompress_safe().
+ * However, it does not provide any protection against intentionally
+ * modified data stream (malicious input).
+ * Use this function in trusted environment only
+ * (data to decode comes from a trusted source).
+ *
+ * Return: number of bytes read from the source buffer
+ *	or a negative result if decompression fails.
+ */
+int LZ4_decompress_fast(const char *source, char *dest, int originalSize);
+
+/**
+ * LZ4_decompress_safe() - Decompression protected against buffer overflow
+ * @source: source address of the compressed data
+ * @dest: output buffer address of the uncompressed data
+ *	which must be already allocated
+ * @compressedSize: is the precise full size of the compressed block
+ * @maxDecompressedSize: is the size of 'dest' buffer
+ *
+ * Decompresses data fom 'source' into 'dest'.
+ * If the source stream is detected malformed, the function will
+ * stop decoding and return a negative result.
+ * This function is protected against buffer overflow exploits,
+ * including malicious data packets. It never writes outside output buffer,
+ * nor reads outside input buffer.
+ *
+ * Return: number of bytes decompressed into destination buffer
+ *	(necessarily <= maxDecompressedSize)
+ *	or a negative result in case of error
+ */
+int LZ4_decompress_safe(const char *source, char *dest, int compressedSize,
+	int maxDecompressedSize);
+
+/**
+ * LZ4_decompress_safe_partial() - Decompress a block of size 'compressedSize'
+ *	at position 'source' into buffer 'dest'
+ * @source: source address of the compressed data
+ * @dest: output buffer address of the decompressed data which must be
+ *	already allocated
+ * @compressedSize: is the precise full size of the compressed block.
+ * @targetOutputSize: the decompression operation will try
+ *	to stop as soon as 'targetOutputSize' has been reached
+ * @maxDecompressedSize: is the size of destination buffer
+ *
+ * This function decompresses a compressed block of size 'compressedSize'
+ * at position 'source' into destination buffer 'dest'
+ * of size 'maxDecompressedSize'.
+ * The function tries to stop decompressing operation as soon as
+ * 'targetOutputSize' has been reached, reducing decompression time.
+ * This function never writes outside of output buffer,
+ * and never reads outside of input buffer.
+ * It is therefore protected against malicious data packets.
+ *
+ * Return: the number of bytes decoded in the destination buffer
+ *	(necessarily <= maxDecompressedSize)
+ *	or a negative result in case of error
+ *
+ */
+int LZ4_decompress_safe_partial(const char *source, char *dest,
+	int compressedSize, int targetOutputSize, int maxDecompressedSize);
 
 /*
- * lz4_decompress_unknownoutputsize()
- *	src     : source address of the compressed data
- *	src_len : is the input size, therefore the compressed size
- *	dest	: output buffer address of the decompressed data
- *	dest_len: is the max size of the destination buffer, which is
- *			returned with actual size of decompressed data after
- *			decompress done
- *	return  : Success if return 0
- *		  Error if return (< 0)
- *	note :  Destination buffer must be already allocated.
+ * lz4_decompress_unknownoutputsize() - For backwards compatibility,
+ *	see LZ4_decompress_safe
+ * @src: source address of the compressed data
+ * @src_len: is the input size, therefore the compressed size
+ * @dest: output buffer address of the decompressed data
+ *	which must be already allocated
+ * @dest_len: is the max size of the destination buffer, which is
+ *	returned with actual size of decompressed data after decompress done
+ *
+ * Return: Success if return 0, Error if return (< 0)
  */
 int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len,
-		unsigned char *dest, size_t *dest_len);
+	unsigned char *dest, size_t *dest_len);
+
+/**
+ * lz4_decompress() - For backwards cocmpatibility, see LZ4_decompress_fast
+ * @src: source address of the compressed data
+ * @src_len: is the input size, which is returned after decompress done
+ * @dest: output buffer address of the decompressed data,
+ *	which must be already allocated
+ * @actual_dest_len: is the size of uncompressed data, supposing it's known
+ *
+ * Return: Success if return 0, Error if return (< 0)
+ */
+int lz4_decompress(const unsigned char *src, size_t *src_len,
+	unsigned char *dest, size_t actual_dest_len);
+
+/*-************************************************************************
+ *	LZ4 HC Compression
+ **************************************************************************/
+
+/**
+ * LZ4_compress_HC() - Compress data from `src` into `dst`, using HC algorithm
+ * @src: source address of the original data
+ * @dst: output buffer address of the compressed data
+ * @srcSize: size of the input data. Max supported value is LZ4_MAX_INPUT_SIZE
+ * @dstCapacity: full or partial size of buffer 'dst',
+ *	which must be already allocated
+ * @compressionLevel: Recommended values are between 4 and 9, although any
+ *	value between 1 and LZ4HC_MAX_CLEVEL will work.
+ *	Values >LZ4HC_MAX_CLEVEL behave the same as 16.
+ * @wrkmem: address of the working memory.
+ *	This requires 'wrkmem' of size LZ4HC_MEM_COMPRESS.
+ *
+ * Compress data from 'src' into 'dst', using the more powerful
+ * but slower "HC" algorithm. Compression is guaranteed to succeed if
+ * `dstCapacity >= LZ4_compressBound(srcSize)
+ *
+ * Return : the number of bytes written into 'dst' or 0 if compression fails.
+ */
+int LZ4_compress_HC(const char *src, char *dst, int srcSize, int dstCapacity,
+	int compressionLevel, void *wrkmem);
+
+/**
+ * lz4hc_compress() - For backwards compatibility, see LZ4_compress_HC
+ * @src: source address of the original data
+ * @src_len: size of the original data
+ * @dst: output buffer address of the compressed data. This requires 'dst'
+ *	of size LZ4_COMPRESSBOUND.
+ * @dst_len: is the output size, which is returned after compress done
+ * @wrkmem: address of the working memory.
+ *	This requires 'workmem' of size LZ4HC_MEM_COMPRESS.
+ *
+ * Return	: Success if return 0, Error if return (< 0)
+ */
+int lz4hc_compress(const unsigned char *src, size_t src_len, unsigned char *dst,
+	size_t *dst_len, void *wrkmem);
+
+/**
+ * LZ4_resetStreamHC() - Init an allocated 'LZ4_streamHC_t' structure
+ * @streamHCPtr: pointer to the 'LZ4_streamHC_t' structure
+ * @compressionLevel: Recommended values are between 4 and 9, although any
+ *	value between 1 and LZ4HC_MAX_CLEVEL will work.
+ *	Values >LZ4HC_MAX_CLEVEL behave the same as 16.
+ *
+ * An LZ4_streamHC_t structure can be allocated once
+ * and re-used multiple times.
+ * Use this function to init an allocated `LZ4_streamHC_t` structure
+ * and start a new compression.
+ */
+void LZ4_resetStreamHC(LZ4_streamHC_t *streamHCPtr, int compressionLevel);
+
+/**
+ * LZ4_loadDictHC() - Load a static dictionary into LZ4_streamHC
+ * @streamHCPtr: pointer to the LZ4HC_stream_t
+ * @dictionary: dictionary to load
+ * @dictSize: size of dictionary
+ *
+ * Use this function to load a static dictionary into LZ4HC_stream.
+ * Any previous data will be forgotten, only 'dictionary'
+ * will remain in memory.
+ * Loading a size of 0 is allowed.
+ *
+ * Return : dictionary size, in bytes (necessarily <= 64 KB)
+ */
+int	LZ4_loadDictHC(LZ4_streamHC_t *streamHCPtr, const char *dictionary,
+	int dictSize);
+
+/**
+ * LZ4_compress_HC_continue() - Compress 'src' using data from previously
+ *	compressed blocks as a dictionary using the HC algorithm
+ * @streamHCPtr: Pointer to the previous 'LZ4_streamHC_t' structure
+ * @src: source address of the original data
+ * @dst: output buffer address of the compressed data,
+ *	which must be already allocated
+ * @srcSize: size of the input data. Max supported value is LZ4_MAX_INPUT_SIZE
+ * @maxDstSize: full or partial size of buffer 'dest'
+ *	which must be already allocated
+ *
+ * These functions compress data in successive blocks of any size, using
+ * previous blocks as dictionary. One key assumption is that previous
+ * blocks (up to 64 KB) remain read-accessible while
+ * compressing next blocks. There is an exception for ring buffers,
+ * which can be smaller than 64 KB.
+ * Ring buffers scenario is automatically detected and handled by
+ * LZ4_compress_HC_continue().
+ * Before starting compression, state must be properly initialized,
+ * using LZ4_resetStreamHC().
+ * A first "fictional block" can then be designated as
+ * initial dictionary, using LZ4_loadDictHC() (Optional).
+ * Then, use LZ4_compress_HC_continue()
+ * to compress each successive block. Previous memory blocks
+ * (including initial dictionary when present) must remain accessible
+ * and unmodified during compression.
+ * 'dst' buffer should be sized to handle worst case scenarios, using
+ *  LZ4_compressBound(), to ensure operation success.
+ *  If, for any reason, previous data blocks can't be preserved unmodified
+ *  in memory during next compression block,
+ *  you must save it to a safer memory space, using LZ4_saveDictHC().
+ * Return value of LZ4_saveDictHC() is the size of dictionary
+ * effectively saved into 'safeBuffer'.
+ *
+ * Return: Number of bytes written into buffer 'dst'  or 0 if compression fails
+ */
+int LZ4_compress_HC_continue(LZ4_streamHC_t *streamHCPtr, const char *src,
+	char *dst, int srcSize, int maxDstSize);
+
+/**
+ * LZ4_saveDictHC() - Save static dictionary from LZ4HC_stream
+ * @streamHCPtr: pointer to the 'LZ4HC_stream_t' structure
+ * @safeBuffer: buffer to save dictionary to, must be already allocated
+ * @maxDictSize: size of 'safeBuffer'
+ *
+ * If previously compressed data block is not guaranteed
+ * to remain available at its memory location,
+ * save it into a safer place (char *safeBuffer).
+ * Note : you don't need to call LZ4_loadDictHC() afterwards,
+ * dictionary is immediately usable, you can therefore call
+ * LZ4_compress_HC_continue().
+ *
+ * Return : saved dictionary size in bytes (necessarily <= maxDictSize),
+ *	or 0 if error.
+ */
+int LZ4_saveDictHC(LZ4_streamHC_t *streamHCPtr, char *safeBuffer,
+	int maxDictSize);
+
+/*-*********************************************
+ *	Streaming Compression Functions
+ ***********************************************/
+
+/**
+ * LZ4_resetStream() - Init an allocated 'LZ4_stream_t' structure
+ * @LZ4_stream: pointer to the 'LZ4_stream_t' structure
+ *
+ * An LZ4_stream_t structure can be allocated once
+ * and re-used multiple times.
+ * Use this function to init an allocated `LZ4_stream_t` structure
+ * and start a new compression.
+ */
+void LZ4_resetStream(LZ4_stream_t *LZ4_stream);
+
+/**
+ * LZ4_loadDict() - Load a static dictionary into LZ4_stream
+ * @streamPtr: pointer to the LZ4_stream_t
+ * @dictionary: dictionary to load
+ * @dictSize: size of dictionary
+ *
+ * Use this function to load a static dictionary into LZ4_stream.
+ * Any previous data will be forgotten, only 'dictionary'
+ * will remain in memory.
+ * Loading a size of 0 is allowed.
+ *
+ * Return : dictionary size, in bytes (necessarily <= 64 KB)
+ */
+int LZ4_loadDict(LZ4_stream_t *streamPtr, const char *dictionary,
+	int dictSize);
+
+/**
+ * LZ4_saveDict() - Save static dictionary from LZ4_stream
+ * @streamPtr: pointer to the 'LZ4_stream_t' structure
+ * @safeBuffer: buffer to save dictionary to, must be already allocated
+ * @dictSize: size of 'safeBuffer'
+ *
+ * If previously compressed data block is not guaranteed
+ * to remain available at its memory location,
+ * save it into a safer place (char *safeBuffer).
+ * Note : you don't need to call LZ4_loadDict() afterwards,
+ * dictionary is immediately usable, you can therefore call
+ * LZ4_compress_fast_continue().
+ *
+ * Return : saved dictionary size in bytes (necessarily <= dictSize),
+ *	or 0 if error.
+ */
+int LZ4_saveDict(LZ4_stream_t *streamPtr, char *safeBuffer, int dictSize);
+
+/**
+ * LZ4_compress_fast_continue() - Compress 'src' using data from previously
+ *	compressed blocks as a dictionary
+ * @streamPtr: Pointer to the previous 'LZ4_stream_t' structure
+ * @src: source address of the original data
+ * @dst: output buffer address of the compressed data,
+ *	which must be already allocated
+ * @srcSize: size of the input data. Max supported value is LZ4_MAX_INPUT_SIZE
+ * @maxDstSize: full or partial size of buffer 'dest'
+ *	which must be already allocated
+ * @acceleration: acceleration factor
+ *
+ * Compress buffer content 'src', using data from previously compressed blocks
+ * as dictionary to improve compression ratio.
+ * Important : Previous data blocks are assumed to still
+ * be present and unmodified !
+ * If maxDstSize >= LZ4_compressBound(srcSize),
+ * compression is guaranteed to succeed, and runs faster.
+ *
+ * Return: Number of bytes written into buffer 'dst'  or 0 if compression fails
+ */
+int LZ4_compress_fast_continue(LZ4_stream_t *streamPtr, const char *src,
+	char *dst, int srcSize, int maxDstSize, int acceleration);
+
+/**
+ * LZ4_setStreamDecode() - Instruct where to find dictionary
+ * @LZ4_streamDecode: the 'LZ4_streamDecode_t' structure
+ * @dictionary: dictionary to use
+ * @dictSize: size of dictionary
+ *
+ * Use this function to instruct where to find the dictionary.
+ *	Setting a size of 0 is allowed (same effect as reset).
+ *
+ * Return: 1 if OK, 0 if error
+ */
+int LZ4_setStreamDecode(LZ4_streamDecode_t *LZ4_streamDecode,
+	const char *dictionary, int dictSize);
+
+/**
+ * LZ4_decompress_fast_continue() - Decompress blocks in streaming mode
+ * @LZ4_streamDecode: the 'LZ4_streamDecode_t' structure
+ * @source: source address of the compressed data
+ * @dest: output buffer address of the uncompressed data
+ *	which must be already allocated
+ * @compressedSize: is the precise full size of the compressed block
+ * @maxDecompressedSize: is the size of 'dest' buffer
+ *
+ * These decoding function allows decompression of multiple blocks
+ * in "streaming" mode.
+ * Previously decoded blocks *must* remain available at the memory position
+ * where they were decoded (up to 64 KB)
+ * In the case of a ring buffers, decoding buffer must be either :
+ *    - Exactly same size as encoding buffer, with same update rule
+ *      (block boundaries at same positions) In which case,
+ *      the decoding & encoding ring buffer can have any size,
+ *      including very small ones ( < 64 KB).
+ *    - Larger than encoding buffer, by a minimum of maxBlockSize more bytes.
+ *      maxBlockSize is implementation dependent.
+ *      It's the maximum size you intend to compress into a single block.
+ *      In which case, encoding and decoding buffers do not need
+ *      to be synchronized, and encoding ring buffer can have any size,
+ *      including small ones ( < 64 KB).
+ *    - _At least_ 64 KB + 8 bytes + maxBlockSize.
+ *      In which case, encoding and decoding buffers do not need to be
+ *      synchronized, and encoding ring buffer can have any size,
+ *      including larger than decoding buffer. W
+ * Whenever these conditions are not possible, save the last 64KB of decoded
+ * data into a safe buffer, and indicate where it is saved
+ * using LZ4_setStreamDecode()
+ *
+ * Return: number of bytes decompressed into destination buffer
+ *	(necessarily <= maxDecompressedSize)
+ *	or a negative result in case of error
+ */
+int LZ4_decompress_safe_continue(LZ4_streamDecode_t *LZ4_streamDecode,
+	const char *source, char *dest, int compressedSize,
+	int maxDecompressedSize);
+
+/**
+ * LZ4_decompress_fast_continue() - Decompress blocks in streaming mode
+ * @LZ4_streamDecode: the 'LZ4_streamDecode_t' structure
+ * @source: source address of the compressed data
+ * @dest: output buffer address of the uncompressed data
+ *	which must be already allocated with 'originalSize' bytes
+ * @originalSize: is the original and therefore uncompressed size
+ *
+ * These decoding function allows decompression of multiple blocks
+ * in "streaming" mode.
+ * Previously decoded blocks *must* remain available at the memory position
+ * where they were decoded (up to 64 KB)
+ * In the case of a ring buffers, decoding buffer must be either :
+ *    - Exactly same size as encoding buffer, with same update rule
+ *      (block boundaries at same positions) In which case,
+ *      the decoding & encoding ring buffer can have any size,
+ *      including very small ones ( < 64 KB).
+ *    - Larger than encoding buffer, by a minimum of maxBlockSize more bytes.
+ *      maxBlockSize is implementation dependent.
+ *      It's the maximum size you intend to compress into a single block.
+ *      In which case, encoding and decoding buffers do not need
+ *      to be synchronized, and encoding ring buffer can have any size,
+ *      including small ones ( < 64 KB).
+ *    - _At least_ 64 KB + 8 bytes + maxBlockSize.
+ *      In which case, encoding and decoding buffers do not need to be
+ *      synchronized, and encoding ring buffer can have any size,
+ *      including larger than decoding buffer. W
+ * Whenever these conditions are not possible, save the last 64KB of decoded
+ * data into a safe buffer, and indicate where it is saved
+ * using LZ4_setStreamDecode()
+ *
+ * Return: number of bytes decompressed into destination buffer
+ *	(necessarily <= maxDecompressedSize)
+ *	or a negative result in case of error
+ */
+int LZ4_decompress_fast_continue(LZ4_streamDecode_t *LZ4_streamDecode,
+	const char *source, char *dest, int originalSize);
+
+/**
+ * LZ4_decompress_safe_usingDict() - Same as LZ4_setStreamDecode()
+ *	followed by LZ4_decompress_safe_continue()
+ * @source: source address of the compressed data
+ * @dest: output buffer address of the uncompressed data
+ *	which must be already allocated
+ * @compressedSize: is the precise full size of the compressed block
+ * @maxDecompressedSize: is the size of 'dest' buffer
+ * @dictStart: pointer to the start of the dictionary in memory
+ * @dictSize: size of dictionary
+ *
+ * These decoding function works the same as
+ * a combination of LZ4_setStreamDecode() followed by
+ * LZ4_decompress_safe_continue()
+ * It is stand-alone, and don'tn eed a LZ4_streamDecode_t structure.
+ *
+ * Return: number of bytes decompressed into destination buffer
+ *	(necessarily <= maxDecompressedSize)
+ *	or a negative result in case of error
+ */
+int LZ4_decompress_safe_usingDict(const char *source, char *dest,
+	int compressedSize, int maxDecompressedSize, const char *dictStart,
+	int dictSize);
+
+/**
+ * LZ4_decompress_fast_usingDict() - Same as LZ4_setStreamDecode()
+ *	followed by LZ4_decompress_fast_continue()
+ * @source: source address of the compressed data
+ * @dest: output buffer address of the uncompressed data
+ *	which must be already allocated with 'originalSize' bytes
+ * @originalSize: is the original and therefore uncompressed size
+ * @dictStart: pointer to the start of the dictionary in memory
+ * @dictSize: size of dictionary
+ *
+ * These decoding function works the same as
+ * a combination of LZ4_setStreamDecode() followed by
+ * LZ4_decompress_safe_continue()
+ * It is stand-alone, and don'tn eed a LZ4_streamDecode_t structure.
+ *
+ * Return: number of bytes decompressed into destination buffer
+ *	(necessarily <= maxDecompressedSize)
+ *	or a negative result in case of error
+ */
+int LZ4_decompress_fast_usingDict(const char *source, char *dest,
+	int originalSize, const char *dictStart, int dictSize);
+
 #endif
diff --git a/lib/lz4/Makefile b/lib/lz4/Makefile
index 8085d04e9309..f7b113271d13 100644
--- a/lib/lz4/Makefile
+++ b/lib/lz4/Makefile
@@ -1,3 +1,5 @@
+ccflags-y += -O3
+
 obj-$(CONFIG_LZ4_COMPRESS) += lz4_compress.o
 obj-$(CONFIG_LZ4HC_COMPRESS) += lz4hc_compress.o
 obj-$(CONFIG_LZ4_DECOMPRESS) += lz4_decompress.o
diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c
index 28321d8f75ef..53f313f05185 100644
--- a/lib/lz4/lz4_compress.c
+++ b/lib/lz4/lz4_compress.c
@@ -1,19 +1,16 @@
 /*
  * LZ4 - Fast LZ compression algorithm
- * Copyright (C) 2011-2012, Yann Collet.
- * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
-
+ * Copyright (C) 2011 - 2016, Yann Collet.
+ * BSD 2 - Clause License (http://www.opensource.org/licenses/bsd - license.php)
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met:
- *
- *     * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above
+ *	* Redistributions of source code must retain the above copyright
+ *	  notice, this list of conditions and the following disclaimer.
+ *	* Redistributions in binary form must reproduce the above
  * copyright notice, this list of conditions and the following disclaimer
  * in the documentation and/or other materials provided with the
  * distribution.
- *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -25,417 +22,939 @@
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
  * You can contact the author at :
- * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
- * - LZ4 source repository : http://code.google.com/p/lz4/
+ *	- LZ4 homepage : http://www.lz4.org
+ *	- LZ4 source repository : https://github.com/lz4/lz4
  *
- *  Changed for kernel use by:
- *  Chanho Min <chanho.min@lge.com>
+ *	Changed for kernel usage by:
+ *	Sven Schmidt <4sschmid@informatik.uni-hamburg.de>
  */
 
+/*-************************************
+ *	Dependencies
+ **************************************/
+#include <linux/lz4.h>
+#include "lz4defs.h"
 #include <linux/module.h>
 #include <linux/kernel.h>
-#include <linux/lz4.h>
 #include <asm/unaligned.h>
-#include "lz4defs.h"
 
-/*
- * LZ4_compressCtx :
- * -----------------
- * Compress 'isize' bytes from 'source' into an output buffer 'dest' of
- * maximum size 'maxOutputSize'.  * If it cannot achieve it, compression
- * will stop, and result of the function will be zero.
- * return : the number of bytes written in buffer 'dest', or 0 if the
- * compression fails
- */
-static inline int lz4_compressctx(void *ctx,
-		const char *source,
-		char *dest,
-		int isize,
-		int maxoutputsize)
+static const int LZ4_minLength = (MFLIMIT + 1);
+static const int LZ4_64Klimit = ((64 * KB) + (MFLIMIT - 1));
+
+/*-******************************
+ *	Compression functions
+ ********************************/
+static FORCE_INLINE U32 LZ4_hash4(
+	U32 sequence,
+	tableType_t const tableType)
 {
-	HTYPE *hashtable = (HTYPE *)ctx;
-	const u8 *ip = (u8 *)source;
-#if LZ4_ARCH64
-	const BYTE * const base = ip;
+	if (tableType == byU16)
+		return ((sequence * 2654435761U)
+			>> ((MINMATCH * 8) - (LZ4_HASHLOG + 1)));
+	else
+		return ((sequence * 2654435761U)
+			>> ((MINMATCH * 8) - LZ4_HASHLOG));
+}
+
+static FORCE_INLINE U32 LZ4_hash5(
+	U64 sequence,
+	tableType_t const tableType)
+{
+	const U32 hashLog = (tableType == byU16)
+		? LZ4_HASHLOG + 1
+		: LZ4_HASHLOG;
+
+#if LZ4_LITTLE_ENDIAN
+	static const U64 prime5bytes = 889523592379ULL;
+
+	return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog));
 #else
-	const int base = 0;
+	static const U64 prime8bytes = 11400714785074694791ULL;
+
+	return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog));
 #endif
-	const u8 *anchor = ip;
-	const u8 *const iend = ip + isize;
-	const u8 *const mflimit = iend - MFLIMIT;
-	#define MATCHLIMIT (iend - LASTLITERALS)
-
-	u8 *op = (u8 *) dest;
-	u8 *const oend = op + maxoutputsize;
-	int length;
-	const int skipstrength = SKIPSTRENGTH;
-	u32 forwardh;
-	int lastrun;
-
-	/* Init */
-	if (isize < MINLENGTH)
-		goto _last_literals;
+}
+
+static FORCE_INLINE U32 LZ4_hashPosition(
+	const void *p,
+	tableType_t const tableType)
+{
+#if LZ4_ARCH64
+	if (tableType == byU32)
+		return LZ4_hash5(LZ4_read_ARCH(p), tableType);
+#endif
+
+	return LZ4_hash4(LZ4_read32(p), tableType);
+}
+
+static void LZ4_putPositionOnHash(
+	const BYTE *p,
+	U32 h,
+	void *tableBase,
+	tableType_t const tableType,
+	const BYTE *srcBase)
+{
+	switch (tableType) {
+	case byPtr:
+	{
+		const BYTE **hashTable = (const BYTE **)tableBase;
+
+		hashTable[h] = p;
+		return;
+	}
+	case byU32:
+	{
+		U32 *hashTable = (U32 *) tableBase;
+
+		hashTable[h] = (U32)(p - srcBase);
+		return;
+	}
+	case byU16:
+	{
+		U16 *hashTable = (U16 *) tableBase;
+
+		hashTable[h] = (U16)(p - srcBase);
+		return;
+	}
+	}
+}
+
+static FORCE_INLINE void LZ4_putPosition(
+	const BYTE *p,
+	void *tableBase,
+	tableType_t tableType,
+	const BYTE *srcBase)
+{
+	U32 const h = LZ4_hashPosition(p, tableType);
+
+	LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase);
+}
+
+static const BYTE *LZ4_getPositionOnHash(
+	U32 h,
+	void *tableBase,
+	tableType_t tableType,
+	const BYTE *srcBase)
+{
+	if (tableType == byPtr) {
+		const BYTE **hashTable = (const BYTE **) tableBase;
+
+		return hashTable[h];
+	}
+
+	if (tableType == byU32) {
+		const U32 * const hashTable = (U32 *) tableBase;
+
+		return hashTable[h] + srcBase;
+	}
+
+	{
+		/* default, to ensure a return */
+		const U16 * const hashTable = (U16 *) tableBase;
+
+		return hashTable[h] + srcBase;
+	}
+}
+
+static FORCE_INLINE const BYTE *LZ4_getPosition(
+	const BYTE *p,
+	void *tableBase,
+	tableType_t tableType,
+	const BYTE *srcBase)
+{
+	U32 const h = LZ4_hashPosition(p, tableType);
+
+	return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase);
+}
+
 
-	memset((void *)hashtable, 0, LZ4_MEM_COMPRESS);
+/*
+ * LZ4_compress_generic() :
+ * inlined, to ensure branches are decided at compilation time
+ */
+static FORCE_INLINE int LZ4_compress_generic(
+	LZ4_stream_t_internal * const dictPtr,
+	const char * const source,
+	char * const dest,
+	const int inputSize,
+	const int maxOutputSize,
+	const limitedOutput_directive outputLimited,
+	const tableType_t tableType,
+	const dict_directive dict,
+	const dictIssue_directive dictIssue,
+	const U32 acceleration)
+{
+	const BYTE *ip = (const BYTE *) source;
+	const BYTE *base;
+	const BYTE *lowLimit;
+	const BYTE * const lowRefLimit = ip - dictPtr->dictSize;
+	const BYTE * const dictionary = dictPtr->dictionary;
+	const BYTE * const dictEnd = dictionary + dictPtr->dictSize;
+	const size_t dictDelta = dictEnd - (const BYTE *)source;
+	const BYTE *anchor = (const BYTE *) source;
+	const BYTE * const iend = ip + inputSize;
+	const BYTE * const mflimit = iend - MFLIMIT;
+	const BYTE * const matchlimit = iend - LASTLITERALS;
+
+	BYTE *op = (BYTE *) dest;
+	BYTE * const olimit = op + maxOutputSize;
+
+	U32 forwardH;
+	size_t refDelta = 0;
+
+	/* Init conditions */
+	if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) {
+		/* Unsupported inputSize, too large (or negative) */
+		return 0;
+	}
+
+	switch (dict) {
+	case noDict:
+	default:
+		base = (const BYTE *)source;
+		lowLimit = (const BYTE *)source;
+		break;
+	case withPrefix64k:
+		base = (const BYTE *)source - dictPtr->currentOffset;
+		lowLimit = (const BYTE *)source - dictPtr->dictSize;
+		break;
+	case usingExtDict:
+		base = (const BYTE *)source - dictPtr->currentOffset;
+		lowLimit = (const BYTE *)source;
+		break;
+	}
+
+	if ((tableType == byU16)
+		&& (inputSize >= LZ4_64Klimit)) {
+		/* Size too large (not within 64K limit) */
+		return 0;
+	}
+
+	if (inputSize < LZ4_minLength) {
+		/* Input too small, no compression (all literals) */
+		goto _last_literals;
+	}
 
 	/* First Byte */
-	hashtable[LZ4_HASH_VALUE(ip)] = ip - base;
+	LZ4_putPosition(ip, dictPtr->hashTable, tableType, base);
 	ip++;
-	forwardh = LZ4_HASH_VALUE(ip);
+	forwardH = LZ4_hashPosition(ip, tableType);
 
 	/* Main Loop */
-	for (;;) {
-		int findmatchattempts = (1U << skipstrength) + 3;
-		const u8 *forwardip = ip;
-		const u8 *ref;
-		u8 *token;
+	for ( ; ; ) {
+		const BYTE *match;
+		BYTE *token;
 
 		/* Find a match */
-		do {
-			u32 h = forwardh;
-			int step = findmatchattempts++ >> skipstrength;
-			ip = forwardip;
-			forwardip = ip + step;
-
-			if (unlikely(forwardip > mflimit))
-				goto _last_literals;
-
-			forwardh = LZ4_HASH_VALUE(forwardip);
-			ref = base + hashtable[h];
-			hashtable[h] = ip - base;
-		} while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip)));
+		{
+			const BYTE *forwardIp = ip;
+			unsigned int step = 1;
+			unsigned int searchMatchNb = acceleration << LZ4_SKIPTRIGGER;
+
+			do {
+				U32 const h = forwardH;
+
+				ip = forwardIp;
+				forwardIp += step;
+				step = (searchMatchNb++ >> LZ4_SKIPTRIGGER);
+
+				if (unlikely(forwardIp > mflimit))
+					goto _last_literals;
+
+				match = LZ4_getPositionOnHash(h,
+					dictPtr->hashTable,
+					tableType, base);
+
+				if (dict == usingExtDict) {
+					if (match < (const BYTE *)source) {
+						refDelta = dictDelta;
+						lowLimit = dictionary;
+					} else {
+						refDelta = 0;
+						lowLimit = (const BYTE *)source;
+				}	 }
+
+				forwardH = LZ4_hashPosition(forwardIp,
+					tableType);
+
+				LZ4_putPositionOnHash(ip, h, dictPtr->hashTable,
+					tableType, base);
+			} while (((dictIssue == dictSmall)
+					? (match < lowRefLimit)
+					: 0)
+				|| ((tableType == byU16)
+					? 0
+					: (match + MAX_DISTANCE < ip))
+				|| (LZ4_read32(match + refDelta)
+					!= LZ4_read32(ip)));
+		}
 
 		/* Catch up */
-		while ((ip > anchor) && (ref > (u8 *)source) &&
-			unlikely(ip[-1] == ref[-1])) {
+		while (((ip > anchor) & (match + refDelta > lowLimit))
+				&& (unlikely(ip[-1] == match[refDelta - 1]))) {
 			ip--;
-			ref--;
+			match--;
 		}
 
-		/* Encode Literal length */
-		length = (int)(ip - anchor);
-		token = op++;
-		/* check output limit */
-		if (unlikely(op + length + (2 + 1 + LASTLITERALS) +
-			(length >> 8) > oend))
-			return 0;
+		/* Encode Literals */
+		{
+			unsigned const int litLength = (unsigned int)(ip - anchor);
 
-		if (length >= (int)RUN_MASK) {
-			int len;
-			*token = (RUN_MASK << ML_BITS);
-			len = length - RUN_MASK;
-			for (; len > 254 ; len -= 255)
-				*op++ = 255;
-			*op++ = (u8)len;
-		} else
-			*token = (length << ML_BITS);
+			token = op++;
+
+			if ((outputLimited) &&
+				/* Check output buffer overflow */
+				(unlikely(op + litLength +
+					(2 + 1 + LASTLITERALS) +
+					(litLength / 255) > olimit)))
+				return 0;
+
+			if (litLength >= RUN_MASK) {
+				int len = (int)litLength - RUN_MASK;
+
+				*token = (RUN_MASK << ML_BITS);
+
+				for (; len >= 255; len -= 255)
+					*op++ = 255;
+				*op++ = (BYTE)len;
+			} else
+				*token = (BYTE)(litLength << ML_BITS);
+
+			/* Copy Literals */
+			LZ4_wildCopy(op, anchor, op + litLength);
+			op += litLength;
+		}
 
-		/* Copy Literals */
-		LZ4_BLINDCOPY(anchor, op, length);
 _next_match:
 		/* Encode Offset */
-		LZ4_WRITE_LITTLEENDIAN_16(op, (u16)(ip - ref));
+		LZ4_writeLE16(op, (U16)(ip - match));
+		op += 2;
 
-		/* Start Counting */
-		ip += MINMATCH;
-		/* MinMatch verified */
-		ref += MINMATCH;
-		anchor = ip;
-		while (likely(ip < MATCHLIMIT - (STEPSIZE - 1))) {
-			#if LZ4_ARCH64
-			u64 diff = A64(ref) ^ A64(ip);
-			#else
-			u32 diff = A32(ref) ^ A32(ip);
-			#endif
-			if (!diff) {
-				ip += STEPSIZE;
-				ref += STEPSIZE;
-				continue;
-			}
-			ip += LZ4_NBCOMMONBYTES(diff);
-			goto _endcount;
-		}
-		#if LZ4_ARCH64
-		if ((ip < (MATCHLIMIT - 3)) && (A32(ref) == A32(ip))) {
-			ip += 4;
-			ref += 4;
-		}
-		#endif
-		if ((ip < (MATCHLIMIT - 1)) && (A16(ref) == A16(ip))) {
-			ip += 2;
-			ref += 2;
-		}
-		if ((ip < MATCHLIMIT) && (*ref == *ip))
-			ip++;
-_endcount:
 		/* Encode MatchLength */
-		length = (int)(ip - anchor);
-		/* Check output limit */
-		if (unlikely(op + (1 + LASTLITERALS) + (length >> 8) > oend))
-			return 0;
-		if (length >= (int)ML_MASK) {
-			*token += ML_MASK;
-			length -= ML_MASK;
-			for (; length > 509 ; length -= 510) {
-				*op++ = 255;
-				*op++ = 255;
-			}
-			if (length > 254) {
-				length -= 255;
-				*op++ = 255;
+		{
+			unsigned int matchCode;
+
+			if ((dict == usingExtDict)
+				&& (lowLimit == dictionary)) {
+				const BYTE *limit;
+
+				match += refDelta;
+				limit = ip + (dictEnd - match);
+
+				if (limit > matchlimit)
+					limit = matchlimit;
+
+				matchCode = LZ4_count(ip + MINMATCH,
+					match + MINMATCH, limit);
+
+				ip += MINMATCH + matchCode;
+
+				if (ip == limit) {
+					unsigned const int more = LZ4_count(ip,
+						(const BYTE *)source,
+						matchlimit);
+
+					matchCode += more;
+					ip += more;
+				}
+			} else {
+				matchCode = LZ4_count(ip + MINMATCH,
+					match + MINMATCH, matchlimit);
+				ip += MINMATCH + matchCode;
 			}
-			*op++ = (u8)length;
-		} else
-			*token += length;
+
+			if (outputLimited &&
+				/* Check output buffer overflow */
+				(unlikely(op +
+					(1 + LASTLITERALS) +
+					(matchCode >> 8) > olimit)))
+				return 0;
+
+			if (matchCode >= ML_MASK) {
+				*token += ML_MASK;
+				matchCode -= ML_MASK;
+				LZ4_write32(op, 0xFFFFFFFF);
+
+				while (matchCode >= 4 * 255) {
+					op += 4;
+					LZ4_write32(op, 0xFFFFFFFF);
+					matchCode -= 4 * 255;
+				}
+
+				op += matchCode / 255;
+				*op++ = (BYTE)(matchCode % 255);
+			} else
+				*token += (BYTE)(matchCode);
+		}
+
+		anchor = ip;
 
 		/* Test end of chunk */
-		if (ip > mflimit) {
-			anchor = ip;
+		if (ip > mflimit)
 			break;
-		}
 
 		/* Fill table */
-		hashtable[LZ4_HASH_VALUE(ip-2)] = ip - 2 - base;
+		LZ4_putPosition(ip - 2, dictPtr->hashTable, tableType, base);
 
 		/* Test next position */
-		ref = base + hashtable[LZ4_HASH_VALUE(ip)];
-		hashtable[LZ4_HASH_VALUE(ip)] = ip - base;
-		if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) {
+		match = LZ4_getPosition(ip, dictPtr->hashTable,
+			tableType, base);
+
+		if (dict == usingExtDict) {
+			if (match < (const BYTE *)source) {
+				refDelta = dictDelta;
+				lowLimit = dictionary;
+			} else {
+				refDelta = 0;
+				lowLimit = (const BYTE *)source;
+			}
+		}
+
+		LZ4_putPosition(ip, dictPtr->hashTable, tableType, base);
+
+		if (((dictIssue == dictSmall) ? (match >= lowRefLimit) : 1)
+			&& (match + MAX_DISTANCE >= ip)
+			&& (LZ4_read32(match + refDelta) == LZ4_read32(ip))) {
 			token = op++;
 			*token = 0;
 			goto _next_match;
 		}
 
 		/* Prepare next loop */
-		anchor = ip++;
-		forwardh = LZ4_HASH_VALUE(ip);
+		forwardH = LZ4_hashPosition(++ip, tableType);
 	}
 
 _last_literals:
 	/* Encode Last Literals */
-	lastrun = (int)(iend - anchor);
-	if (((char *)op - dest) + lastrun + 1
-		+ ((lastrun + 255 - RUN_MASK) / 255) > (u32)maxoutputsize)
-		return 0;
+	{
+		size_t const lastRun = (size_t)(iend - anchor);
 
-	if (lastrun >= (int)RUN_MASK) {
-		*op++ = (RUN_MASK << ML_BITS);
-		lastrun -= RUN_MASK;
-		for (; lastrun > 254 ; lastrun -= 255)
-			*op++ = 255;
-		*op++ = (u8)lastrun;
-	} else
-		*op++ = (lastrun << ML_BITS);
-	memcpy(op, anchor, iend - anchor);
-	op += iend - anchor;
+		if ((outputLimited) &&
+			/* Check output buffer overflow */
+			((op - (BYTE *)dest) + lastRun + 1 +
+			((lastRun + 255 - RUN_MASK) / 255) > (U32)maxOutputSize))
+			return 0;
+
+		if (lastRun >= RUN_MASK) {
+			size_t accumulator = lastRun - RUN_MASK;
+			*op++ = RUN_MASK << ML_BITS;
+			for (; accumulator >= 255; accumulator -= 255)
+				*op++ = 255;
+			*op++ = (BYTE) accumulator;
+		} else {
+			*op++ = (BYTE)(lastRun << ML_BITS);
+		}
+
+		memcpy(op, anchor, lastRun);
+
+		op += lastRun;
+	}
 
 	/* End */
-	return (int)(((char *)op) - dest);
+	return (int) (((char *)op) - dest);
 }
 
-static inline int lz4_compress64kctx(void *ctx,
-		const char *source,
-		char *dest,
-		int isize,
-		int maxoutputsize)
+static int LZ4_compress_fast_extState(
+	void *state,
+	const char *source,
+	char *dest,
+	int inputSize,
+	int maxOutputSize,
+	int acceleration)
 {
-	u16 *hashtable = (u16 *)ctx;
-	const u8 *ip = (u8 *) source;
-	const u8 *anchor = ip;
-	const u8 *const base = ip;
-	const u8 *const iend = ip + isize;
-	const u8 *const mflimit = iend - MFLIMIT;
-	#define MATCHLIMIT (iend - LASTLITERALS)
-
-	u8 *op = (u8 *) dest;
-	u8 *const oend = op + maxoutputsize;
-	int len, length;
-	const int skipstrength = SKIPSTRENGTH;
-	u32 forwardh;
-	int lastrun;
-
-	/* Init */
-	if (isize < MINLENGTH)
-		goto _last_literals;
+	LZ4_stream_t_internal *ctx = &((LZ4_stream_t *)state)->internal_donotuse;
+#if LZ4_ARCH64
+	const tableType_t tableType = byU32;
+#else
+	const tableType_t tableType = byPtr;
+#endif
 
-	memset((void *)hashtable, 0, LZ4_MEM_COMPRESS);
+	LZ4_resetStream((LZ4_stream_t *)state);
+
+	if (acceleration < 1)
+		acceleration = LZ4_ACCELERATION_DEFAULT;
+
+	if (maxOutputSize >= LZ4_COMPRESSBOUND(inputSize)) {
+		if (inputSize < LZ4_64Klimit)
+			return LZ4_compress_generic(ctx, source,
+				dest, inputSize, 0,
+				noLimit, byU16, noDict,
+				noDictIssue, acceleration);
+		else
+			return LZ4_compress_generic(ctx, source,
+				dest, inputSize, 0,
+				noLimit, tableType, noDict,
+				noDictIssue, acceleration);
+	} else {
+		if (inputSize < LZ4_64Klimit)
+			return LZ4_compress_generic(ctx, source,
+				dest, inputSize,
+				maxOutputSize, limitedOutput, byU16, noDict,
+				noDictIssue, acceleration);
+		else
+			return LZ4_compress_generic(ctx, source,
+				dest, inputSize,
+				maxOutputSize, limitedOutput, tableType, noDict,
+				noDictIssue, acceleration);
+	}
+}
+
+int LZ4_compress_fast(const char *source, char *dest, int inputSize,
+	int maxOutputSize, int acceleration, void *wrkmem)
+{
+	return LZ4_compress_fast_extState(wrkmem, source, dest, inputSize,
+		maxOutputSize, acceleration);
+}
+EXPORT_SYMBOL(LZ4_compress_fast);
+
+int LZ4_compress_default(const char *source, char *dest, int inputSize,
+	int maxOutputSize, void *wrkmem)
+{
+	return LZ4_compress_fast(source, dest, inputSize,
+		maxOutputSize, LZ4_ACCELERATION_DEFAULT, wrkmem);
+}
+EXPORT_SYMBOL(LZ4_compress_default);
+
+/*-******************************
+ *	*_destSize() variant
+ ********************************/
+static int LZ4_compress_destSize_generic(
+	LZ4_stream_t_internal * const ctx,
+	const char * const src,
+	char * const dst,
+	int * const srcSizePtr,
+	const int targetDstSize,
+	const tableType_t tableType)
+{
+	const BYTE *ip = (const BYTE *) src;
+	const BYTE *base = (const BYTE *) src;
+	const BYTE *lowLimit = (const BYTE *) src;
+	const BYTE *anchor = ip;
+	const BYTE * const iend = ip + *srcSizePtr;
+	const BYTE * const mflimit = iend - MFLIMIT;
+	const BYTE * const matchlimit = iend - LASTLITERALS;
+
+	BYTE *op = (BYTE *) dst;
+	BYTE * const oend = op + targetDstSize;
+	BYTE * const oMaxLit = op + targetDstSize - 2 /* offset */
+		- 8 /* because 8 + MINMATCH == MFLIMIT */ - 1 /* token */;
+	BYTE * const oMaxMatch = op + targetDstSize
+		- (LASTLITERALS + 1 /* token */);
+	BYTE * const oMaxSeq = oMaxLit - 1 /* token */;
+
+	U32 forwardH;
+
+	/* Init conditions */
+	/* Impossible to store anything */
+	if (targetDstSize < 1)
+		return 0;
+	/* Unsupported input size, too large (or negative) */
+	if ((U32)*srcSizePtr > (U32)LZ4_MAX_INPUT_SIZE)
+		return 0;
+	/* Size too large (not within 64K limit) */
+	if ((tableType == byU16) && (*srcSizePtr >= LZ4_64Klimit))
+		return 0;
+	/* Input too small, no compression (all literals) */
+	if (*srcSizePtr < LZ4_minLength)
+		goto _last_literals;
 
 	/* First Byte */
-	ip++;
-	forwardh = LZ4_HASH64K_VALUE(ip);
+	*srcSizePtr = 0;
+	LZ4_putPosition(ip, ctx->hashTable, tableType, base);
+	ip++; forwardH = LZ4_hashPosition(ip, tableType);
 
 	/* Main Loop */
-	for (;;) {
-		int findmatchattempts = (1U << skipstrength) + 3;
-		const u8 *forwardip = ip;
-		const u8 *ref;
-		u8 *token;
+	for ( ; ; ) {
+		const BYTE *match;
+		BYTE *token;
 
 		/* Find a match */
-		do {
-			u32 h = forwardh;
-			int step = findmatchattempts++ >> skipstrength;
-			ip = forwardip;
-			forwardip = ip + step;
-
-			if (forwardip > mflimit)
-				goto _last_literals;
-
-			forwardh = LZ4_HASH64K_VALUE(forwardip);
-			ref = base + hashtable[h];
-			hashtable[h] = (u16)(ip - base);
-		} while (A32(ref) != A32(ip));
+		{
+			const BYTE *forwardIp = ip;
+			unsigned int step = 1;
+			unsigned int searchMatchNb = 1 << LZ4_SKIPTRIGGER;
+
+			do {
+				U32 h = forwardH;
+
+				ip = forwardIp;
+				forwardIp += step;
+				step = (searchMatchNb++ >> LZ4_SKIPTRIGGER);
+
+				if (unlikely(forwardIp > mflimit))
+					goto _last_literals;
+
+				match = LZ4_getPositionOnHash(h, ctx->hashTable,
+					tableType, base);
+				forwardH = LZ4_hashPosition(forwardIp,
+					tableType);
+				LZ4_putPositionOnHash(ip, h,
+					ctx->hashTable, tableType,
+					base);
+
+			} while (((tableType == byU16)
+				? 0
+				: (match + MAX_DISTANCE < ip))
+				|| (LZ4_read32(match) != LZ4_read32(ip)));
+		}
 
 		/* Catch up */
-		while ((ip > anchor) && (ref > (u8 *)source)
-			&& (ip[-1] == ref[-1])) {
+		while ((ip > anchor)
+			&& (match > lowLimit)
+			&& (unlikely(ip[-1] == match[-1]))) {
 			ip--;
-			ref--;
+			match--;
 		}
 
 		/* Encode Literal length */
-		length = (int)(ip - anchor);
-		token = op++;
-		/* Check output limit */
-		if (unlikely(op + length + (2 + 1 + LASTLITERALS)
-			+ (length >> 8) > oend))
-			return 0;
-		if (length >= (int)RUN_MASK) {
-			*token = (RUN_MASK << ML_BITS);
-			len = length - RUN_MASK;
-			for (; len > 254 ; len -= 255)
-				*op++ = 255;
-			*op++ = (u8)len;
-		} else
-			*token = (length << ML_BITS);
+		{
+			unsigned int litLength = (unsigned int)(ip - anchor);
 
-		/* Copy Literals */
-		LZ4_BLINDCOPY(anchor, op, length);
+			token = op++;
+			if (op + ((litLength + 240) / 255)
+				+ litLength > oMaxLit) {
+				/* Not enough space for a last match */
+				op--;
+				goto _last_literals;
+			}
+			if (litLength >= RUN_MASK) {
+				unsigned int len = litLength - RUN_MASK;
+				*token = (RUN_MASK<<ML_BITS);
+				for (; len >= 255; len -= 255)
+					*op++ = 255;
+				*op++ = (BYTE)len;
+			} else
+				*token = (BYTE)(litLength << ML_BITS);
+
+			/* Copy Literals */
+			LZ4_wildCopy(op, anchor, op + litLength);
+			op += litLength;
+		}
 
 _next_match:
 		/* Encode Offset */
-		LZ4_WRITE_LITTLEENDIAN_16(op, (u16)(ip - ref));
+		LZ4_writeLE16(op, (U16)(ip - match)); op += 2;
 
-		/* Start Counting */
-		ip += MINMATCH;
-		/* MinMatch verified */
-		ref += MINMATCH;
-		anchor = ip;
+		/* Encode MatchLength */
+		{
+			size_t matchLength = LZ4_count(ip + MINMATCH,
+			match + MINMATCH, matchlimit);
 
-		while (ip < MATCHLIMIT - (STEPSIZE - 1)) {
-			#if LZ4_ARCH64
-			u64 diff = A64(ref) ^ A64(ip);
-			#else
-			u32 diff = A32(ref) ^ A32(ip);
-			#endif
-
-			if (!diff) {
-				ip += STEPSIZE;
-				ref += STEPSIZE;
-				continue;
+			if (op + ((matchLength + 240)/255) > oMaxMatch) {
+				/* Match description too long : reduce it */
+				matchLength = (15 - 1) + (oMaxMatch - op) * 255;
 			}
-			ip += LZ4_NBCOMMONBYTES(diff);
-			goto _endcount;
-		}
-		#if LZ4_ARCH64
-		if ((ip < (MATCHLIMIT - 3)) && (A32(ref) == A32(ip))) {
-			ip += 4;
-			ref += 4;
-		}
-		#endif
-		if ((ip < (MATCHLIMIT - 1)) && (A16(ref) == A16(ip))) {
-			ip += 2;
-			ref += 2;
+			ip += MINMATCH + matchLength;
+
+			if (matchLength >= ML_MASK) {
+				*token += ML_MASK;
+				matchLength -= ML_MASK;
+				while (matchLength >= 255) {
+					matchLength -= 255;
+					*op++ = 255;
+				}
+				*op++ = (BYTE)matchLength;
+			} else
+				*token += (BYTE)(matchLength);
 		}
-		if ((ip < MATCHLIMIT) && (*ref == *ip))
-			ip++;
-_endcount:
 
-		/* Encode MatchLength */
-		len = (int)(ip - anchor);
-		/* Check output limit */
-		if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend))
-			return 0;
-		if (len >= (int)ML_MASK) {
-			*token += ML_MASK;
-			len -= ML_MASK;
-			for (; len > 509 ; len -= 510) {
-				*op++ = 255;
-				*op++ = 255;
-			}
-			if (len > 254) {
-				len -= 255;
-				*op++ = 255;
-			}
-			*op++ = (u8)len;
-		} else
-			*token += len;
+		anchor = ip;
 
-		/* Test end of chunk */
-		if (ip > mflimit) {
-			anchor = ip;
+		/* Test end of block */
+		if (ip > mflimit)
+			break;
+		if (op > oMaxSeq)
 			break;
-		}
 
 		/* Fill table */
-		hashtable[LZ4_HASH64K_VALUE(ip-2)] = (u16)(ip - 2 - base);
+		LZ4_putPosition(ip - 2, ctx->hashTable, tableType, base);
 
 		/* Test next position */
-		ref = base + hashtable[LZ4_HASH64K_VALUE(ip)];
-		hashtable[LZ4_HASH64K_VALUE(ip)] = (u16)(ip - base);
-		if (A32(ref) == A32(ip)) {
-			token = op++;
-			*token = 0;
+		match = LZ4_getPosition(ip, ctx->hashTable, tableType, base);
+		LZ4_putPosition(ip, ctx->hashTable, tableType, base);
+
+		if ((match + MAX_DISTANCE >= ip)
+			&& (LZ4_read32(match) == LZ4_read32(ip))) {
+			token = op++; *token = 0;
 			goto _next_match;
 		}
 
 		/* Prepare next loop */
-		anchor = ip++;
-		forwardh = LZ4_HASH64K_VALUE(ip);
+		forwardH = LZ4_hashPosition(++ip, tableType);
 	}
 
 _last_literals:
 	/* Encode Last Literals */
-	lastrun = (int)(iend - anchor);
-	if (op + lastrun + 1 + (lastrun - RUN_MASK + 255) / 255 > oend)
-		return 0;
-	if (lastrun >= (int)RUN_MASK) {
-		*op++ = (RUN_MASK << ML_BITS);
-		lastrun -= RUN_MASK;
-		for (; lastrun > 254 ; lastrun -= 255)
-			*op++ = 255;
-		*op++ = (u8)lastrun;
-	} else
-		*op++ = (lastrun << ML_BITS);
-	memcpy(op, anchor, iend - anchor);
-	op += iend - anchor;
+	{
+		size_t lastRunSize = (size_t)(iend - anchor);
+
+		if (op + 1 /* token */
+			+ ((lastRunSize + 240) / 255) /* litLength */
+			+ lastRunSize /* literals */ > oend) {
+			/* adapt lastRunSize to fill 'dst' */
+			lastRunSize	= (oend - op) - 1;
+			lastRunSize -= (lastRunSize + 240) / 255;
+		}
+		ip = anchor + lastRunSize;
+
+		if (lastRunSize >= RUN_MASK) {
+			size_t accumulator = lastRunSize - RUN_MASK;
+
+			*op++ = RUN_MASK << ML_BITS;
+			for (; accumulator >= 255; accumulator -= 255)
+				*op++ = 255;
+			*op++ = (BYTE) accumulator;
+		} else {
+			*op++ = (BYTE)(lastRunSize<<ML_BITS);
+		}
+		memcpy(op, anchor, lastRunSize);
+		op += lastRunSize;
+	}
+
 	/* End */
-	return (int)(((char *)op) - dest);
+	*srcSizePtr = (int) (((const char *)ip) - src);
+	return (int) (((char *)op) - dst);
 }
 
-int lz4_compress(const unsigned char *src, size_t src_len,
-			unsigned char *dst, size_t *dst_len, void *wrkmem)
+static int LZ4_compress_destSize_extState(
+	LZ4_stream_t *state,
+	const char *src,
+	char *dst,
+	int *srcSizePtr,
+	int targetDstSize)
 {
-	int ret = -1;
-	int out_len = 0;
+#if LZ4_ARCH64
+	const tableType_t tableType = byU32;
+#else
+	const tableType_t tableType = byPtr;
+#endif
 
-	if (src_len < LZ4_64KLIMIT)
-		out_len = lz4_compress64kctx(wrkmem, src, dst, src_len,
-				lz4_compressbound(src_len));
-	else
-		out_len = lz4_compressctx(wrkmem, src, dst, src_len,
-				lz4_compressbound(src_len));
+	LZ4_resetStream(state);
+
+	if (targetDstSize >= LZ4_COMPRESSBOUND(*srcSizePtr)) {
+		/* compression success is guaranteed */
+		return LZ4_compress_fast_extState(
+			state, src, dst, *srcSizePtr,
+			targetDstSize, 1);
+	} else {
+		if (*srcSizePtr < LZ4_64Klimit)
+			return LZ4_compress_destSize_generic(
+				&state->internal_donotuse,
+				src, dst, srcSizePtr,
+				targetDstSize, byU16);
+		else
+			return LZ4_compress_destSize_generic(
+				&state->internal_donotuse,
+				src, dst, srcSizePtr,
+				targetDstSize, tableType);
+	}
+}
+
+
+int LZ4_compress_destSize(
+	const char *src,
+	char *dst,
+	int *srcSizePtr,
+	int targetDstSize,
+	void *wrkmem)
+{
+	return LZ4_compress_destSize_extState(wrkmem, src, dst, srcSizePtr,
+		targetDstSize);
+}
+EXPORT_SYMBOL(LZ4_compress_destSize);
+
+/*-******************************
+ *	Streaming functions
+ ********************************/
+void LZ4_resetStream(LZ4_stream_t *LZ4_stream)
+{
+	memset(LZ4_stream, 0, sizeof(LZ4_stream_t));
+}
+
+int LZ4_loadDict(LZ4_stream_t *LZ4_dict,
+	const char *dictionary, int dictSize)
+{
+	LZ4_stream_t_internal *dict = &LZ4_dict->internal_donotuse;
+	const BYTE *p = (const BYTE *)dictionary;
+	const BYTE * const dictEnd = p + dictSize;
+	const BYTE *base;
+
+	if ((dict->initCheck)
+		|| (dict->currentOffset > 1 * GB)) {
+		/* Uninitialized structure, or reuse overflow */
+		LZ4_resetStream(LZ4_dict);
+	}
+
+	if (dictSize < (int)HASH_UNIT) {
+		dict->dictionary = NULL;
+		dict->dictSize = 0;
+		return 0;
+	}
+
+	if ((dictEnd - p) > 64 * KB)
+		p = dictEnd - 64 * KB;
+	dict->currentOffset += 64 * KB;
+	base = p - dict->currentOffset;
+	dict->dictionary = p;
+	dict->dictSize = (U32)(dictEnd - p);
+	dict->currentOffset += dict->dictSize;
+
+	while (p <= dictEnd - HASH_UNIT) {
+		LZ4_putPosition(p, dict->hashTable, byU32, base);
+		p += 3;
+	}
+
+	return dict->dictSize;
+}
+EXPORT_SYMBOL(LZ4_loadDict);
+
+static void LZ4_renormDictT(LZ4_stream_t_internal *LZ4_dict,
+	const BYTE *src)
+{
+	if ((LZ4_dict->currentOffset > 0x80000000) ||
+		((uptrval)LZ4_dict->currentOffset > (uptrval)src)) {
+		/* address space overflow */
+		/* rescale hash table */
+		U32 const delta = LZ4_dict->currentOffset - 64 * KB;
+		const BYTE *dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize;
+		int i;
+
+		for (i = 0; i < LZ4_HASH_SIZE_U32; i++) {
+			if (LZ4_dict->hashTable[i] < delta)
+				LZ4_dict->hashTable[i] = 0;
+			else
+				LZ4_dict->hashTable[i] -= delta;
+		}
+		LZ4_dict->currentOffset = 64 * KB;
+		if (LZ4_dict->dictSize > 64 * KB)
+			LZ4_dict->dictSize = 64 * KB;
+		LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize;
+	}
+}
+
+int LZ4_saveDict(LZ4_stream_t *LZ4_dict, char *safeBuffer, int dictSize)
+{
+	LZ4_stream_t_internal * const dict = &LZ4_dict->internal_donotuse;
+	const BYTE * const previousDictEnd = dict->dictionary + dict->dictSize;
+
+	if ((U32)dictSize > 64 * KB) {
+		/* useless to define a dictionary > 64 * KB */
+		dictSize = 64 * KB;
+	}
+	if ((U32)dictSize > dict->dictSize)
+		dictSize = dict->dictSize;
+
+	memmove(safeBuffer, previousDictEnd - dictSize, dictSize);
+
+	dict->dictionary = (const BYTE *)safeBuffer;
+	dict->dictSize = (U32)dictSize;
+
+	return dictSize;
+}
+EXPORT_SYMBOL(LZ4_saveDict);
+
+int LZ4_compress_fast_continue(LZ4_stream_t *LZ4_stream, const char *source,
+	char *dest, int inputSize, int maxOutputSize, int acceleration)
+{
+	LZ4_stream_t_internal *streamPtr = &LZ4_stream->internal_donotuse;
+	const BYTE * const dictEnd = streamPtr->dictionary
+		+ streamPtr->dictSize;
+
+	const BYTE *smallest = (const BYTE *) source;
+
+	if (streamPtr->initCheck) {
+		/* Uninitialized structure detected */
+		return 0;
+	}
+
+	if ((streamPtr->dictSize > 0) && (smallest > dictEnd))
+		smallest = dictEnd;
+
+	LZ4_renormDictT(streamPtr, smallest);
+
+	if (acceleration < 1)
+		acceleration = LZ4_ACCELERATION_DEFAULT;
 
-	if (out_len < 0)
-		goto exit;
+	/* Check overlapping input/dictionary space */
+	{
+		const BYTE *sourceEnd = (const BYTE *) source + inputSize;
+
+		if ((sourceEnd > streamPtr->dictionary)
+			&& (sourceEnd < dictEnd)) {
+			streamPtr->dictSize = (U32)(dictEnd - sourceEnd);
+			if (streamPtr->dictSize > 64 * KB)
+				streamPtr->dictSize = 64 * KB;
+			if (streamPtr->dictSize < 4)
+				streamPtr->dictSize = 0;
+			streamPtr->dictionary = dictEnd - streamPtr->dictSize;
+		}
+	}
 
-	*dst_len = out_len;
+	/* prefix mode : source data follows dictionary */
+	if (dictEnd == (const BYTE *)source) {
+		int result;
+
+		if ((streamPtr->dictSize < 64 * KB) &&
+			(streamPtr->dictSize < streamPtr->currentOffset)) {
+			result = LZ4_compress_generic(
+				streamPtr, source, dest, inputSize,
+				maxOutputSize, limitedOutput, byU32,
+				withPrefix64k, dictSmall, acceleration);
+		} else {
+			result = LZ4_compress_generic(
+				streamPtr, source, dest, inputSize,
+				maxOutputSize, limitedOutput, byU32,
+				withPrefix64k, noDictIssue, acceleration);
+		}
+		streamPtr->dictSize += (U32)inputSize;
+		streamPtr->currentOffset += (U32)inputSize;
+		return result;
+	}
 
-	return 0;
-exit:
-	return ret;
+	/* external dictionary mode */
+	{
+		int result;
+
+		if ((streamPtr->dictSize < 64 * KB) &&
+			(streamPtr->dictSize < streamPtr->currentOffset)) {
+			result = LZ4_compress_generic(
+				streamPtr, source, dest, inputSize,
+				maxOutputSize, limitedOutput, byU32,
+				usingExtDict, dictSmall, acceleration);
+		} else {
+			result = LZ4_compress_generic(
+				streamPtr, source, dest, inputSize,
+				maxOutputSize, limitedOutput, byU32,
+				usingExtDict, noDictIssue, acceleration);
+		}
+		streamPtr->dictionary = (const BYTE *)source;
+		streamPtr->dictSize = (U32)inputSize;
+		streamPtr->currentOffset += (U32)inputSize;
+		return result;
+	}
+}
+EXPORT_SYMBOL(LZ4_compress_fast_continue);
+
+/*-******************************
+ *	For backwards compatibility
+ ********************************/
+int lz4_compress(const unsigned char *src, size_t src_len, unsigned char *dst,
+	size_t *dst_len, void *wrkmem) {
+	*dst_len = LZ4_compress_default(src, dst, src_len,
+		*dst_len, wrkmem);
+
+	/*
+	 * Prior lz4_compress will return -1 in case of error
+	 * and 0 on success
+	 * while new LZ4_compress_fast/default
+	 * returns 0 in case of error
+	 * and the output length on success
+	 */
+	if (!*dst_len)
+		return -1;
+	else
+		return 0;
 }
 EXPORT_SYMBOL(lz4_compress);
 
diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
index 6d940c72b5fc..b5e00a19a7b4 100644
--- a/lib/lz4/lz4_decompress.c
+++ b/lib/lz4/lz4_decompress.c
@@ -1,25 +1,16 @@
 /*
- * LZ4 Decompressor for Linux kernel
- *
- * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com>
- *
- * Based on LZ4 implementation by Yann Collet.
- *
  * LZ4 - Fast LZ compression algorithm
- * Copyright (C) 2011-2012, Yann Collet.
- * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
- *
+ * Copyright (C) 2011 - 2016, Yann Collet.
+ * BSD 2 - Clause License (http://www.opensource.org/licenses/bsd - license.php)
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met:
- *
- *     * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above
+ *	* Redistributions of source code must retain the above copyright
+ *	  notice, this list of conditions and the following disclaimer.
+ *	* Redistributions in binary form must reproduce the above
  * copyright notice, this list of conditions and the following disclaimer
  * in the documentation and/or other materials provided with the
  * distribution.
- *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -31,313 +22,529 @@
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * You can contact the author at :
+ *	- LZ4 homepage : http://www.lz4.org
+ *	- LZ4 source repository : https://github.com/lz4/lz4
  *
- *  You can contact the author at :
- *  - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
- *  - LZ4 source repository : http://code.google.com/p/lz4/
+ *	Changed for kernel usage by:
+ *	Sven Schmidt <4sschmid@informatik.uni-hamburg.de>
  */
 
-#ifndef STATIC
+/*-************************************
+ *	Dependencies
+ **************************************/
+#include <linux/lz4.h>
+#include "lz4defs.h"
+#include <linux/init.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
-#endif
-#include <linux/lz4.h>
-
 #include <asm/unaligned.h>
 
-#include "lz4defs.h"
-
-static const int dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0};
-#if LZ4_ARCH64
-static const int dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3};
-#endif
-
-static int lz4_uncompress(const char *source, char *dest, int osize)
+/*-*****************************
+ *	Decompression functions
+ *******************************/
+/* LZ4_decompress_generic() :
+ * This generic decompression function cover all use cases.
+ * It shall be instantiated several times, using different sets of directives
+ * Note that it is important this generic function is really inlined,
+ * in order to remove useless branches during compilation optimization.
+ */
+static FORCE_INLINE int LZ4_decompress_generic(
+	 const char * const source,
+	 char * const dest,
+	 int inputSize,
+		/*
+		 * If endOnInput == endOnInputSize,
+		 * this value is the max size of Output Buffer.
+		 */
+	 int outputSize,
+	 /* endOnOutputSize, endOnInputSize */
+	 int endOnInput,
+	 /* full, partial */
+	 int partialDecoding,
+	 /* only used if partialDecoding == partial */
+	 int targetOutputSize,
+	 /* noDict, withPrefix64k, usingExtDict */
+	 int dict,
+	 /* == dest when no prefix */
+	 const BYTE * const lowPrefix,
+	 /* only if dict == usingExtDict */
+	 const BYTE * const dictStart,
+	 /* note : = 0 if noDict */
+	 const size_t dictSize
+	 )
 {
+	/* Local Variables */
 	const BYTE *ip = (const BYTE *) source;
-	const BYTE *ref;
+	const BYTE * const iend = ip + inputSize;
+
 	BYTE *op = (BYTE *) dest;
-	BYTE * const oend = op + osize;
+	BYTE * const oend = op + outputSize;
 	BYTE *cpy;
-	unsigned token;
-	size_t length;
+	BYTE *oexit = op + targetOutputSize;
+	const BYTE * const lowLimit = lowPrefix - dictSize;
+
+	const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize;
+	const unsigned int dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };
+	const int dec64table[] = { 0, 0, 0, -1, 0, 1, 2, 3 };
 
+	const int safeDecode = (endOnInput == endOnInputSize);
+	const int checkOffset = ((safeDecode) && (dictSize < (int)(64 * KB)));
+
+	/* Special cases */
+	/* targetOutputSize too high => decode everything */
+	if ((partialDecoding) && (oexit > oend - MFLIMIT))
+		oexit = oend - MFLIMIT;
+
+	/* Empty output buffer */
+	if ((endOnInput) && (unlikely(outputSize == 0)))
+		return ((inputSize == 1) && (*ip == 0)) ? 0 : -1;
+
+	if ((!endOnInput) && (unlikely(outputSize == 0)))
+		return (*ip == 0 ? 1 : -1);
+
+	/* Main Loop : decode sequences */
 	while (1) {
+		size_t length;
+		const BYTE *match;
+		size_t offset;
+
+		/* get literal length */
+		unsigned int const token = *ip++;
+
+		length = token>>ML_BITS;
 
-		/* get runlength */
-		token = *ip++;
-		length = (token >> ML_BITS);
 		if (length == RUN_MASK) {
-			size_t len;
+			unsigned int s;
 
-			len = *ip++;
-			for (; len == 255; length += 255)
-				len = *ip++;
-			if (unlikely(length > (size_t)(length + len)))
+			do {
+				s = *ip++;
+				length += s;
+			} while (likely(endOnInput
+				? ip < iend - RUN_MASK
+				: 1) & (s == 255));
+
+			if ((safeDecode)
+				&& unlikely(
+					(size_t)(op + length) < (size_t)(op))) {
+				/* overflow detection */
 				goto _output_error;
-			length += len;
+			}
+			if ((safeDecode)
+				&& unlikely(
+					(size_t)(ip + length) < (size_t)(ip))) {
+				/* overflow detection */
+				goto _output_error;
+			}
 		}
 
 		/* copy literals */
 		cpy = op + length;
-		if (unlikely(cpy > oend - COPYLENGTH)) {
-			/*
-			 * Error: not enough place for another match
-			 * (min 4) + 5 literals
-			 */
-			if (cpy != oend)
-				goto _output_error;
+		if (((endOnInput) && ((cpy > (partialDecoding ? oexit : oend - MFLIMIT))
+			|| (ip + length > iend - (2 + 1 + LASTLITERALS))))
+			|| ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
+			if (partialDecoding) {
+				if (cpy > oend) {
+					/*
+					 * Error :
+					 * write attempt beyond end of output buffer
+					 */
+					goto _output_error;
+				}
+				if ((endOnInput)
+					&& (ip + length > iend)) {
+					/*
+					 * Error :
+					 * read attempt beyond
+					 * end of input buffer
+					 */
+					goto _output_error;
+				}
+			} else {
+				if ((!endOnInput)
+					&& (cpy != oend)) {
+					/*
+					 * Error :
+					 * block decoding must
+					 * stop exactly there
+					 */
+					goto _output_error;
+				}
+				if ((endOnInput)
+					&& ((ip + length != iend)
+					|| (cpy > oend))) {
+					/*
+					 * Error :
+					 * input must be consumed
+					 */
+					goto _output_error;
+				}
+			}
 
 			memcpy(op, ip, length);
 			ip += length;
-			break; /* EOF */
+			op += length;
+			/* Necessarily EOF, due to parsing restrictions */
+			break;
 		}
-		LZ4_WILDCOPY(ip, op, cpy);
-		ip -= (op - cpy);
+
+		LZ4_wildCopy(op, ip, cpy);
+		ip += length;
 		op = cpy;
 
 		/* get offset */
-		LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip);
+		offset = LZ4_readLE16(ip);
 		ip += 2;
+		match = op - offset;
 
-		/* Error: offset create reference outside destination buffer */
-		if (unlikely(ref < (BYTE *const) dest))
+		if ((checkOffset) && (unlikely(match < lowLimit))) {
+			/* Error : offset outside buffers */
 			goto _output_error;
+		}
+
+		/* costs ~1%; silence an msan warning when offset == 0 */
+		LZ4_write32(op, (U32)offset);
 
 		/* get matchlength */
 		length = token & ML_MASK;
 		if (length == ML_MASK) {
-			for (; *ip == 255; length += 255)
-				ip++;
-			if (unlikely(length > (size_t)(length + *ip)))
+			unsigned int s;
+
+			do {
+				s = *ip++;
+
+				if ((endOnInput) && (ip > iend - LASTLITERALS))
+					goto _output_error;
+
+				length += s;
+			} while (s == 255);
+
+			if ((safeDecode)
+				&& unlikely(
+					(size_t)(op + length) < (size_t)op)) {
+				/* overflow detection */
 				goto _output_error;
-			length += *ip++;
+			}
 		}
 
-		/* copy repeated sequence */
-		if (unlikely((op - ref) < STEPSIZE)) {
-#if LZ4_ARCH64
-			int dec64 = dec64table[op - ref];
-#else
-			const int dec64 = 0;
-#endif
-			op[0] = ref[0];
-			op[1] = ref[1];
-			op[2] = ref[2];
-			op[3] = ref[3];
-			op += 4;
-			ref += 4;
-			ref -= dec32table[op-ref];
-			PUT4(ref, op);
-			op += STEPSIZE - 4;
-			ref -= dec64;
+		length += MINMATCH;
+
+		/* check external dictionary */
+		if ((dict == usingExtDict) && (match < lowPrefix)) {
+			if (unlikely(op + length > oend - LASTLITERALS)) {
+				/* doesn't respect parsing restriction */
+				goto _output_error;
+			}
+
+			if (length <= (size_t)(lowPrefix - match)) {
+				/*
+				 * match can be copied as a single segment
+				 * from external dictionary
+				 */
+				memmove(op, dictEnd - (lowPrefix - match),
+					length);
+				op += length;
+			} else {
+				/*
+				 * match encompass external
+				 * dictionary and current block
+				 */
+				size_t const copySize = (size_t)(lowPrefix - match);
+				size_t const restSize = length - copySize;
+
+				memcpy(op, dictEnd - copySize, copySize);
+				op += copySize;
+
+				if (restSize > (size_t)(op - lowPrefix)) {
+					/* overlap copy */
+					BYTE * const endOfMatch = op + restSize;
+					const BYTE *copyFrom = lowPrefix;
+
+					while (op < endOfMatch)
+						*op++ = *copyFrom++;
+				} else {
+					memcpy(op, lowPrefix, restSize);
+					op += restSize;
+				}
+			}
+
+			continue;
+		}
+
+		/* copy match within block */
+		cpy = op + length;
+
+		if (unlikely(offset < 8)) {
+			const int dec64 = dec64table[offset];
+
+			op[0] = match[0];
+			op[1] = match[1];
+			op[2] = match[2];
+			op[3] = match[3];
+			match += dec32table[offset];
+			memcpy(op + 4, match, 4);
+			match -= dec64;
 		} else {
-			LZ4_COPYSTEP(ref, op);
+			LZ4_copy8(op, match);
+			match += 8;
 		}
-		cpy = op + length - (STEPSIZE - 4);
-		if (cpy > (oend - COPYLENGTH)) {
 
-			/* Error: request to write beyond destination buffer */
-			if (cpy > oend)
-				goto _output_error;
-#if LZ4_ARCH64
-			if ((ref + COPYLENGTH) > oend)
-#else
-			if ((ref + COPYLENGTH) > oend ||
-					(op + COPYLENGTH) > oend)
-#endif
+		op += 8;
+
+		if (unlikely(cpy > oend - 12)) {
+			BYTE * const oCopyLimit = oend - (WILDCOPYLENGTH - 1);
+
+			if (cpy > oend - LASTLITERALS) {
+				/*
+				 * Error : last LASTLITERALS bytes
+				 * must be literals (uncompressed)
+				 */
 				goto _output_error;
-			LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH));
+			}
+
+			if (op < oCopyLimit) {
+				LZ4_wildCopy(op, match, oCopyLimit);
+				match += oCopyLimit - op;
+				op = oCopyLimit;
+			}
+
 			while (op < cpy)
-				*op++ = *ref++;
-			op = cpy;
-			/*
-			 * Check EOF (should never happen, since last 5 bytes
-			 * are supposed to be literals)
-			 */
-			if (op == oend)
-				goto _output_error;
-			continue;
+				*op++ = *match++;
+		} else {
+			LZ4_copy8(op, match);
+
+			if (length > 16)
+				LZ4_wildCopy(op + 8, match + 8, cpy);
 		}
-		LZ4_SECURECOPY(ref, op, cpy);
+
 		op = cpy; /* correction */
 	}
+
 	/* end of decoding */
-	return (int) (((char *)ip) - source);
+	if (endOnInput) {
+		/* Nb of output bytes decoded */
+		return (int) (((char *)op) - dest);
+	} else {
+		/* Nb of input bytes read */
+		return (int) (((const char *)ip) - source);
+	}
 
-	/* write overflow error detected */
+	/* Overflow error detected */
 _output_error:
 	return -1;
 }
 
-static int lz4_uncompress_unknownoutputsize(const char *source, char *dest,
-				int isize, size_t maxoutputsize)
+int LZ4_decompress_safe(const char *source, char *dest,
+	int compressedSize, int maxDecompressedSize)
 {
-	const BYTE *ip = (const BYTE *) source;
-	const BYTE *const iend = ip + isize;
-	const BYTE *ref;
-
+	return LZ4_decompress_generic(source, dest, compressedSize,
+		maxDecompressedSize, endOnInputSize, full, 0,
+		noDict, (BYTE *)dest, NULL, 0);
+}
 
-	BYTE *op = (BYTE *) dest;
-	BYTE * const oend = op + maxoutputsize;
-	BYTE *cpy;
+int LZ4_decompress_safe_partial(const char *source, char *dest,
+	int compressedSize, int targetOutputSize, int maxDecompressedSize)
+{
+	return LZ4_decompress_generic(source, dest, compressedSize,
+		maxDecompressedSize, endOnInputSize, partial,
+		targetOutputSize, noDict, (BYTE *)dest, NULL, 0);
+}
 
-	/* Main Loop */
-	while (ip < iend) {
+int LZ4_decompress_fast(const char *source, char *dest, int originalSize)
+{
+	return LZ4_decompress_generic(source, dest, 0, originalSize,
+		endOnOutputSize, full, 0, withPrefix64k,
+		(BYTE *)(dest - 64 * KB), NULL, 64 * KB);
+}
 
-		unsigned token;
-		size_t length;
+int LZ4_setStreamDecode(LZ4_streamDecode_t *LZ4_streamDecode,
+	const char *dictionary, int dictSize)
+{
+	LZ4_streamDecode_t_internal *lz4sd = (LZ4_streamDecode_t_internal *) LZ4_streamDecode;
 
-		/* get runlength */
-		token = *ip++;
-		length = (token >> ML_BITS);
-		if (length == RUN_MASK) {
-			int s = 255;
-			while ((ip < iend) && (s == 255)) {
-				s = *ip++;
-				if (unlikely(length > (size_t)(length + s)))
-					goto _output_error;
-				length += s;
-			}
-		}
-		/* copy literals */
-		cpy = op + length;
-		if ((cpy > oend - COPYLENGTH) ||
-			(ip + length > iend - COPYLENGTH)) {
-
-			if (cpy > oend)
-				goto _output_error;/* writes beyond buffer */
-
-			if (ip + length != iend)
-				goto _output_error;/*
-						    * Error: LZ4 format requires
-						    * to consume all input
-						    * at this stage
-						    */
-			memcpy(op, ip, length);
-			op += length;
-			break;/* Necessarily EOF, due to parsing restrictions */
-		}
-		LZ4_WILDCOPY(ip, op, cpy);
-		ip -= (op - cpy);
-		op = cpy;
+	lz4sd->prefixSize = (size_t) dictSize;
+	lz4sd->prefixEnd = (const BYTE *) dictionary + dictSize;
+	lz4sd->externalDict = NULL;
+	lz4sd->extDictSize	= 0;
+	return 1;
+}
 
-		/* get offset */
-		LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip);
-		ip += 2;
-		if (ref < (BYTE * const) dest)
-			goto _output_error;
-			/*
-			 * Error : offset creates reference
-			 * outside of destination buffer
-			 */
+/*
+ * *_continue() :
+ * These decoding functions allow decompression of multiple blocks
+ * in "streaming" mode.
+ * Previously decoded blocks must still be available at the memory
+ * position where they were decoded.
+ * If it's not possible, save the relevant part of
+ * decoded data into a safe buffer,
+ * and indicate where it stands using LZ4_setStreamDecode()
+ */
+int LZ4_decompress_safe_continue(LZ4_streamDecode_t *LZ4_streamDecode,
+	const char *source, char *dest, int compressedSize, int maxOutputSize)
+{
+	LZ4_streamDecode_t_internal *lz4sd = &LZ4_streamDecode->internal_donotuse;
+	int result;
+
+	if (lz4sd->prefixEnd == (BYTE *)dest) {
+		result = LZ4_decompress_generic(source, dest,
+			compressedSize,
+			maxOutputSize,
+			endOnInputSize, full, 0,
+			usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize,
+			lz4sd->externalDict,
+			lz4sd->extDictSize);
+
+		if (result <= 0)
+			return result;
+
+		lz4sd->prefixSize += result;
+		lz4sd->prefixEnd	+= result;
+	} else {
+		lz4sd->extDictSize = lz4sd->prefixSize;
+		lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize;
+		result = LZ4_decompress_generic(source, dest,
+			compressedSize, maxOutputSize,
+			endOnInputSize, full, 0,
+			usingExtDict, (BYTE *)dest,
+			lz4sd->externalDict, lz4sd->extDictSize);
+		if (result <= 0)
+			return result;
+		lz4sd->prefixSize = result;
+		lz4sd->prefixEnd	= (BYTE *)dest + result;
+	}
 
-		/* get matchlength */
-		length = (token & ML_MASK);
-		if (length == ML_MASK) {
-			while (ip < iend) {
-				int s = *ip++;
-				if (unlikely(length > (size_t)(length + s)))
-					goto _output_error;
-				length += s;
-				if (s == 255)
-					continue;
-				break;
-			}
-		}
+	return result;
+}
 
-		/* copy repeated sequence */
-		if (unlikely((op - ref) < STEPSIZE)) {
-#if LZ4_ARCH64
-			int dec64 = dec64table[op - ref];
-#else
-			const int dec64 = 0;
-#endif
-				op[0] = ref[0];
-				op[1] = ref[1];
-				op[2] = ref[2];
-				op[3] = ref[3];
-				op += 4;
-				ref += 4;
-				ref -= dec32table[op - ref];
-				PUT4(ref, op);
-				op += STEPSIZE - 4;
-				ref -= dec64;
-		} else {
-			LZ4_COPYSTEP(ref, op);
-		}
-		cpy = op + length - (STEPSIZE-4);
-		if (cpy > oend - COPYLENGTH) {
-			if (cpy > oend)
-				goto _output_error; /* write outside of buf */
-#if LZ4_ARCH64
-			if ((ref + COPYLENGTH) > oend)
-#else
-			if ((ref + COPYLENGTH) > oend ||
-					(op + COPYLENGTH) > oend)
-#endif
-				goto _output_error;
-			LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH));
-			while (op < cpy)
-				*op++ = *ref++;
-			op = cpy;
-			/*
-			 * Check EOF (should never happen, since last 5 bytes
-			 * are supposed to be literals)
-			 */
-			if (op == oend)
-				goto _output_error;
-			continue;
-		}
-		LZ4_SECURECOPY(ref, op, cpy);
-		op = cpy; /* correction */
+int LZ4_decompress_fast_continue(LZ4_streamDecode_t *LZ4_streamDecode,
+	const char *source, char *dest, int originalSize)
+{
+	LZ4_streamDecode_t_internal *lz4sd = &LZ4_streamDecode->internal_donotuse;
+	int result;
+
+	if (lz4sd->prefixEnd == (BYTE *)dest) {
+		result = LZ4_decompress_generic(source, dest, 0, originalSize,
+			endOnOutputSize, full, 0,
+			usingExtDict,
+			lz4sd->prefixEnd - lz4sd->prefixSize,
+			lz4sd->externalDict, lz4sd->extDictSize);
+
+		if (result <= 0)
+			return result;
+
+		lz4sd->prefixSize += originalSize;
+		lz4sd->prefixEnd	+= originalSize;
+	} else {
+		lz4sd->extDictSize = lz4sd->prefixSize;
+		lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize;
+		result = LZ4_decompress_generic(source, dest, 0, originalSize,
+			endOnOutputSize, full, 0,
+			usingExtDict, (BYTE *)dest,
+			lz4sd->externalDict, lz4sd->extDictSize);
+		if (result <= 0)
+			return result;
+		lz4sd->prefixSize = originalSize;
+		lz4sd->prefixEnd	= (BYTE *)dest + originalSize;
 	}
-	/* end of decoding */
-	return (int) (((char *) op) - dest);
 
-	/* write overflow error detected */
-_output_error:
-	return -1;
+	return result;
 }
 
-int lz4_decompress(const unsigned char *src, size_t *src_len,
-		unsigned char *dest, size_t actual_dest_len)
+/*
+ * Advanced decoding functions :
+ * *_usingDict() :
+ * These decoding functions work the same as "_continue" ones,
+ * the dictionary must be explicitly provided within parameters
+ */
+static FORCE_INLINE int LZ4_decompress_usingDict_generic(const char *source,
+	char *dest, int compressedSize, int maxOutputSize, int safe,
+	const char *dictStart, int dictSize)
 {
-	int ret = -1;
-	int input_len = 0;
-
-	input_len = lz4_uncompress(src, dest, actual_dest_len);
-	if (input_len < 0)
-		goto exit_0;
-	*src_len = input_len;
+	if (dictSize == 0)
+		return LZ4_decompress_generic(source, dest,
+			compressedSize, maxOutputSize, safe, full, 0,
+			noDict, (BYTE *)dest, NULL, 0);
+	if (dictStart + dictSize == dest) {
+		if (dictSize >= (int)(64 * KB - 1))
+			return LZ4_decompress_generic(source, dest,
+				compressedSize, maxOutputSize, safe, full, 0,
+				withPrefix64k, (BYTE *)dest - 64 * KB, NULL, 0);
+		return LZ4_decompress_generic(source, dest, compressedSize,
+			maxOutputSize, safe, full, 0, noDict,
+			(BYTE *)dest - dictSize, NULL, 0);
+	}
+	return LZ4_decompress_generic(source, dest, compressedSize,
+		maxOutputSize, safe, full, 0, usingExtDict,
+		(BYTE *)dest, (const BYTE *)dictStart, dictSize);
+}
 
-	return 0;
-exit_0:
-	return ret;
+int LZ4_decompress_safe_usingDict(const char *source, char *dest,
+	int compressedSize, int maxOutputSize,
+	const char *dictStart, int dictSize)
+{
+	return LZ4_decompress_usingDict_generic(source, dest,
+		compressedSize, maxOutputSize, 1, dictStart, dictSize);
 }
-#ifndef STATIC
-EXPORT_SYMBOL(lz4_decompress);
-#endif
 
-int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len,
-		unsigned char *dest, size_t *dest_len)
+int LZ4_decompress_fast_usingDict(const char *source, char *dest,
+	int originalSize, const char *dictStart, int dictSize)
 {
-	int ret = -1;
-	int out_len = 0;
-
-	out_len = lz4_uncompress_unknownoutputsize(src, dest, src_len,
-					*dest_len);
-	if (out_len < 0)
-		goto exit_0;
-	*dest_len = out_len;
-
-	return 0;
-exit_0:
-	return ret;
+	return LZ4_decompress_usingDict_generic(source, dest, 0,
+		originalSize, 0, dictStart, dictSize);
 }
+
+/*-******************************
+ *	For backwards compatibility
+ ********************************/
+int lz4_decompress_unknownoutputsize(const unsigned char *src,
+	size_t src_len, unsigned char *dest, size_t *dest_len) {
+	*dest_len = LZ4_decompress_safe(src, dest,
+		src_len, *dest_len);
+
+	/*
+	 * Prior lz4_decompress_unknownoutputsize will return
+	 * 0 for success and a negative result for error
+	 * new LZ4_decompress_safe returns
+	 * - the length of data read on success
+	 * - and also a negative result on error
+	 * meaning when result > 0, we just return 0 here
+	 */
+	if (src_len > 0)
+		return 0;
+	else
+		return -1;
+}
+
+int lz4_decompress(const unsigned char *src, size_t *src_len,
+	unsigned char *dest, size_t actual_dest_len) {
+	*src_len = LZ4_decompress_fast(src, dest, actual_dest_len);
+
+	/*
+	 * Prior lz4_decompress will return
+	 * 0 for success and a negative result for error
+	 * new LZ4_decompress_fast returns
+	 * - the length of data read on success
+	 * - and also a negative result on error
+	 * meaning when result > 0, we just return 0 here
+	 */
+	if (*src_len > 0)
+		return 0;
+	else
+		return -1;
+}
+
 #ifndef STATIC
+EXPORT_SYMBOL(LZ4_decompress_safe);
+EXPORT_SYMBOL(LZ4_decompress_safe_partial);
+EXPORT_SYMBOL(LZ4_decompress_fast);
+EXPORT_SYMBOL(LZ4_setStreamDecode);
+EXPORT_SYMBOL(LZ4_decompress_safe_continue);
+EXPORT_SYMBOL(LZ4_decompress_fast_continue);
+EXPORT_SYMBOL(LZ4_decompress_safe_usingDict);
+EXPORT_SYMBOL(LZ4_decompress_fast_usingDict);
 EXPORT_SYMBOL(lz4_decompress_unknownoutputsize);
+EXPORT_SYMBOL(lz4_decompress);
 
 MODULE_LICENSE("Dual BSD/GPL");
-MODULE_DESCRIPTION("LZ4 Decompressor");
+MODULE_DESCRIPTION("LZ4 decompressor");
 #endif
diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h
index c79d7ea8a38e..00a0b58a0871 100644
--- a/lib/lz4/lz4defs.h
+++ b/lib/lz4/lz4defs.h
@@ -1,157 +1,227 @@
+#ifndef __LZ4DEFS_H__
+#define __LZ4DEFS_H__
+
 /*
- * lz4defs.h -- architecture specific defines
- *
- * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com>
+ * lz4defs.h -- common and architecture specific defines for the kernel usage
+
+ * LZ4 - Fast LZ compression algorithm
+ * Copyright (C) 2011-2016, Yann Collet.
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *	* Redistributions of source code must retain the above copyright
+ *	  notice, this list of conditions and the following disclaimer.
+ *	* Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * You can contact the author at :
+ *	- LZ4 homepage : http://www.lz4.org
+ *	- LZ4 source repository : https://github.com/lz4/lz4
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
+ *	Changed for kernel usage by:
+ *	Sven Schmidt <4sschmid@informatik.uni-hamburg.de>
  */
 
-/*
- * Detects 64 bits mode
- */
+#include <asm/unaligned.h>
+#include <linux/string.h>	 /* memset, memcpy */
+
+#define FORCE_INLINE __always_inline
+
+/*-************************************
+ *	Basic Types
+ **************************************/
+#include <linux/types.h>
+
+typedef	uint8_t BYTE;
+typedef uint16_t U16;
+typedef uint32_t U32;
+typedef	int32_t S32;
+typedef uint64_t U64;
+typedef uintptr_t uptrval;
+
+/*-************************************
+ *	Architecture specifics
+ **************************************/
 #if defined(CONFIG_64BIT)
 #define LZ4_ARCH64 1
 #else
 #define LZ4_ARCH64 0
 #endif
 
-/*
- * Architecture-specific macros
- */
-#define BYTE	u8
-typedef struct _U16_S { u16 v; } U16_S;
-typedef struct _U32_S { u32 v; } U32_S;
-typedef struct _U64_S { u64 v; } U64_S;
-#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
-
-#define A16(x) (((U16_S *)(x))->v)
-#define A32(x) (((U32_S *)(x))->v)
-#define A64(x) (((U64_S *)(x))->v)
-
-#define PUT4(s, d) (A32(d) = A32(s))
-#define PUT8(s, d) (A64(d) = A64(s))
-
-#define LZ4_READ_LITTLEENDIAN_16(d, s, p)	\
-	(d = s - A16(p))
-
-#define LZ4_WRITE_LITTLEENDIAN_16(p, v)	\
-	do {	\
-		A16(p) = v; \
-		p += 2; \
-	} while (0)
-#else /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */
-
-#define A64(x) get_unaligned((u64 *)&(((U16_S *)(x))->v))
-#define A32(x) get_unaligned((u32 *)&(((U16_S *)(x))->v))
-#define A16(x) get_unaligned((u16 *)&(((U16_S *)(x))->v))
-
-#define PUT4(s, d) \
-	put_unaligned(get_unaligned((const u32 *) s), (u32 *) d)
-#define PUT8(s, d) \
-	put_unaligned(get_unaligned((const u64 *) s), (u64 *) d)
-
-#define LZ4_READ_LITTLEENDIAN_16(d, s, p)	\
-	(d = s - get_unaligned_le16(p))
-
-#define LZ4_WRITE_LITTLEENDIAN_16(p, v)			\
-	do {						\
-		put_unaligned_le16(v, (u16 *)(p));	\
-		p += 2;					\
-	} while (0)
+#if defined(__LITTLE_ENDIAN)
+#define LZ4_LITTLE_ENDIAN 1
+#else
+#define LZ4_LITTLE_ENDIAN 0
 #endif
 
-#define COPYLENGTH 8
-#define ML_BITS  4
-#define ML_MASK  ((1U << ML_BITS) - 1)
+/*-************************************
+ *	Constants
+ **************************************/
+#define MINMATCH 4
+
+#define WILDCOPYLENGTH 8
+#define LASTLITERALS 5
+#define MFLIMIT (WILDCOPYLENGTH + MINMATCH)
+
+/* Increase this value ==> compression run slower on incompressible data */
+#define LZ4_SKIPTRIGGER 6
+
+#define HASH_UNIT sizeof(size_t)
+
+#define KB (1 << 10)
+#define MB (1 << 20)
+#define GB (1U << 30)
+
+#define MAXD_LOG 16
+#define MAX_DISTANCE ((1 << MAXD_LOG) - 1)
+#define STEPSIZE sizeof(size_t)
+
+#define ML_BITS	4
+#define ML_MASK	((1U << ML_BITS) - 1)
 #define RUN_BITS (8 - ML_BITS)
 #define RUN_MASK ((1U << RUN_BITS) - 1)
-#define MEMORY_USAGE	14
-#define MINMATCH	4
-#define SKIPSTRENGTH	6
-#define LASTLITERALS	5
-#define MFLIMIT		(COPYLENGTH + MINMATCH)
-#define MINLENGTH	(MFLIMIT + 1)
-#define MAXD_LOG	16
-#define MAXD		(1 << MAXD_LOG)
-#define MAXD_MASK	(u32)(MAXD - 1)
-#define MAX_DISTANCE	(MAXD - 1)
-#define HASH_LOG	(MAXD_LOG - 1)
-#define HASHTABLESIZE	(1 << HASH_LOG)
-#define MAX_NB_ATTEMPTS	256
-#define OPTIMAL_ML	(int)((ML_MASK-1)+MINMATCH)
-#define LZ4_64KLIMIT	((1<<16) + (MFLIMIT - 1))
-#define HASHLOG64K	((MEMORY_USAGE - 2) + 1)
-#define HASH64KTABLESIZE	(1U << HASHLOG64K)
-#define LZ4_HASH_VALUE(p)	(((A32(p)) * 2654435761U) >> \
-				((MINMATCH * 8) - (MEMORY_USAGE-2)))
-#define LZ4_HASH64K_VALUE(p)	(((A32(p)) * 2654435761U) >> \
-				((MINMATCH * 8) - HASHLOG64K))
-#define HASH_VALUE(p)		(((A32(p)) * 2654435761U) >> \
-				((MINMATCH * 8) - HASH_LOG))
-
-#if LZ4_ARCH64/* 64-bit */
-#define STEPSIZE 8
-
-#define LZ4_COPYSTEP(s, d)	\
-	do {			\
-		PUT8(s, d);	\
-		d += 8;		\
-		s += 8;		\
-	} while (0)
-
-#define LZ4_COPYPACKET(s, d)	LZ4_COPYSTEP(s, d)
-
-#define LZ4_SECURECOPY(s, d, e)			\
-	do {					\
-		if (d < e) {			\
-			LZ4_WILDCOPY(s, d, e);	\
-		}				\
-	} while (0)
-#define HTYPE u32
-
-#ifdef __BIG_ENDIAN
-#define LZ4_NBCOMMONBYTES(val) (__builtin_clzll(val) >> 3)
+
+/*-************************************
+ *	Reading and writing into memory
+ **************************************/
+static FORCE_INLINE U16 LZ4_read16(const void *ptr)
+{
+	return get_unaligned((const U16 *)ptr);
+}
+
+static FORCE_INLINE U32 LZ4_read32(const void *ptr)
+{
+	return get_unaligned((const U32 *)ptr);
+}
+
+static FORCE_INLINE size_t LZ4_read_ARCH(const void *ptr)
+{
+	return get_unaligned((const size_t *)ptr);
+}
+
+static FORCE_INLINE void LZ4_write16(void *memPtr, U16 value)
+{
+	put_unaligned(value, (U16 *)memPtr);
+}
+
+static FORCE_INLINE void LZ4_write32(void *memPtr, U32 value)
+{
+	put_unaligned(value, (U32 *)memPtr);
+}
+
+static FORCE_INLINE U16 LZ4_readLE16(const void *memPtr)
+{
+	return get_unaligned_le16(memPtr);
+}
+
+static FORCE_INLINE void LZ4_writeLE16(void *memPtr, U16 value)
+{
+	return put_unaligned_le16(value, memPtr);
+}
+
+static FORCE_INLINE void LZ4_copy8(void *dst, const void *src)
+{
+#if LZ4_ARCH64
+	U64 a = get_unaligned((const U64 *)src);
+
+	put_unaligned(a, (U64 *)dst);
+#else
+	U32 a = get_unaligned((const U32 *)src);
+	U32 b = get_unaligned((const U32 *)src + 1);
+
+	put_unaligned(a, (U32 *)dst);
+	put_unaligned(b, (U32 *)dst + 1);
+#endif
+}
+
+/*
+ * customized variant of memcpy,
+ * which can overwrite up to 7 bytes beyond dstEnd
+ */
+static FORCE_INLINE void LZ4_wildCopy(void *dstPtr,
+	const void *srcPtr, void *dstEnd)
+{
+	BYTE *d = (BYTE *)dstPtr;
+	const BYTE *s = (const BYTE *)srcPtr;
+	BYTE *const e = (BYTE *)dstEnd;
+
+	do {
+		LZ4_copy8(d, s);
+		d += 8;
+		s += 8;
+	} while (d < e);
+}
+
+static FORCE_INLINE unsigned int LZ4_NbCommonBytes(register size_t val)
+{
+#if LZ4_LITTLE_ENDIAN
+	return __ffs(val) >> 3;
 #else
-#define LZ4_NBCOMMONBYTES(val) (__builtin_ctzll(val) >> 3)
+	return (BITS_PER_LONG - 1 - __fls(val)) >> 3;
+#endif
+}
+
+static FORCE_INLINE unsigned int LZ4_count(
+	const BYTE *pIn,
+	const BYTE *pMatch,
+	const BYTE *pInLimit)
+{
+	const BYTE *const pStart = pIn;
+
+	while (likely(pIn < pInLimit - (STEPSIZE - 1))) {
+		size_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn);
+
+		if (!diff) {
+			pIn += STEPSIZE;
+			pMatch += STEPSIZE;
+			continue;
+		}
+
+		pIn += LZ4_NbCommonBytes(diff);
+
+		return (unsigned int)(pIn - pStart);
+	}
+
+#if LZ4_ARCH64
+	if ((pIn < (pInLimit - 3))
+		&& (LZ4_read32(pMatch) == LZ4_read32(pIn))) {
+		pIn += 4;
+		pMatch += 4;
+	}
 #endif
 
-#else	/* 32-bit */
-#define STEPSIZE 4
+	if ((pIn < (pInLimit - 1))
+		&& (LZ4_read16(pMatch) == LZ4_read16(pIn))) {
+		pIn += 2;
+		pMatch += 2;
+	}
 
-#define LZ4_COPYSTEP(s, d)	\
-	do {			\
-		PUT4(s, d);	\
-		d += 4;		\
-		s += 4;		\
-	} while (0)
+	if ((pIn < pInLimit) && (*pMatch == *pIn))
+		pIn++;
 
-#define LZ4_COPYPACKET(s, d)		\
-	do {				\
-		LZ4_COPYSTEP(s, d);	\
-		LZ4_COPYSTEP(s, d);	\
-	} while (0)
+	return (unsigned int)(pIn - pStart);
+}
 
-#define LZ4_SECURECOPY	LZ4_WILDCOPY
-#define HTYPE const u8*
+typedef enum { noLimit = 0, limitedOutput = 1 } limitedOutput_directive;
+typedef enum { byPtr, byU32, byU16 } tableType_t;
 
-#ifdef __BIG_ENDIAN
-#define LZ4_NBCOMMONBYTES(val) (__builtin_clz(val) >> 3)
-#else
-#define LZ4_NBCOMMONBYTES(val) (__builtin_ctz(val) >> 3)
-#endif
+typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive;
+typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive;
 
-#endif
+typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive;
+typedef enum { full = 0, partial = 1 } earlyEnd_directive;
 
-#define LZ4_WILDCOPY(s, d, e)		\
-	do {				\
-		LZ4_COPYPACKET(s, d);	\
-	} while (d < e)
-
-#define LZ4_BLINDCOPY(s, d, l)	\
-	do {	\
-		u8 *e = (d) + l;	\
-		LZ4_WILDCOPY(s, d, e);	\
-		d = e;	\
-	} while (0)
+#endif
diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c
index f344f76b6559..e1fbf1561e55 100644
--- a/lib/lz4/lz4hc_compress.c
+++ b/lib/lz4/lz4hc_compress.c
@@ -1,19 +1,17 @@
 /*
  * LZ4 HC - High Compression Mode of LZ4
- * Copyright (C) 2011-2012, Yann Collet.
- * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ * Copyright (C) 2011-2015, Yann Collet.
  *
+ * BSD 2 - Clause License (http://www.opensource.org/licenses/bsd - license.php)
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met:
- *
- *     * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above
+ *	* Redistributions of source code must retain the above copyright
+ *	  notice, this list of conditions and the following disclaimer.
+ *	* Redistributions in binary form must reproduce the above
  * copyright notice, this list of conditions and the following disclaimer
  * in the documentation and/or other materials provided with the
  * distribution.
- *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -25,323 +23,361 @@
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
  * You can contact the author at :
- * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
- * - LZ4 source repository : http://code.google.com/p/lz4/
+ *	- LZ4 homepage : http://www.lz4.org
+ *	- LZ4 source repository : https://github.com/lz4/lz4
  *
- *  Changed for kernel use by:
- *  Chanho Min <chanho.min@lge.com>
+ *	Changed for kernel usage by:
+ *	Sven Schmidt <4sschmid@informatik.uni-hamburg.de>
  */
 
-#include <linux/module.h>
-#include <linux/kernel.h>
+/*-************************************
+ *	Dependencies
+ **************************************/
 #include <linux/lz4.h>
-#include <asm/unaligned.h>
 #include "lz4defs.h"
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h> /* memset */
+
+/* *************************************
+ *	Local Constants and types
+ ***************************************/
 
-struct lz4hc_data {
-	const u8 *base;
-	HTYPE hashtable[HASHTABLESIZE];
-	u16 chaintable[MAXD];
-	const u8 *nexttoupdate;
-} __attribute__((__packed__));
+#define OPTIMAL_ML (int)((ML_MASK - 1) + MINMATCH)
 
-static inline int lz4hc_init(struct lz4hc_data *hc4, const u8 *base)
+#define HASH_FUNCTION(i)	(((i) * 2654435761U) \
+	>> ((MINMATCH*8) - LZ4HC_HASH_LOG))
+#define DELTANEXTU16(p)	chainTable[(U16)(p)] /* faster */
+
+static U32 LZ4HC_hashPtr(const void *ptr)
+{
+	return HASH_FUNCTION(LZ4_read32(ptr));
+}
+
+/**************************************
+ *	HC Compression
+ **************************************/
+static void LZ4HC_init(LZ4HC_CCtx_internal *hc4, const BYTE *start)
 {
-	memset((void *)hc4->hashtable, 0, sizeof(hc4->hashtable));
-	memset(hc4->chaintable, 0xFF, sizeof(hc4->chaintable));
-
-#if LZ4_ARCH64
-	hc4->nexttoupdate = base + 1;
-#else
-	hc4->nexttoupdate = base;
-#endif
-	hc4->base = base;
-	return 1;
+	memset((void *)hc4->hashTable, 0, sizeof(hc4->hashTable));
+	memset(hc4->chainTable, 0xFF, sizeof(hc4->chainTable));
+	hc4->nextToUpdate = 64 * KB;
+	hc4->base = start - 64 * KB;
+	hc4->end = start;
+	hc4->dictBase = start - 64 * KB;
+	hc4->dictLimit = 64 * KB;
+	hc4->lowLimit = 64 * KB;
 }
 
 /* Update chains up to ip (excluded) */
-static inline void lz4hc_insert(struct lz4hc_data *hc4, const u8 *ip)
+static FORCE_INLINE void LZ4HC_Insert(LZ4HC_CCtx_internal *hc4,
+	const BYTE *ip)
 {
-	u16 *chaintable = hc4->chaintable;
-	HTYPE *hashtable  = hc4->hashtable;
-#if LZ4_ARCH64
+	U16 * const chainTable = hc4->chainTable;
+	U32 * const hashTable	= hc4->hashTable;
 	const BYTE * const base = hc4->base;
-#else
-	const int base = 0;
-#endif
+	U32 const target = (U32)(ip - base);
+	U32 idx = hc4->nextToUpdate;
+
+	while (idx < target) {
+		U32 const h = LZ4HC_hashPtr(base + idx);
+		size_t delta = idx - hashTable[h];
 
-	while (hc4->nexttoupdate < ip) {
-		const u8 *p = hc4->nexttoupdate;
-		size_t delta = p - (hashtable[HASH_VALUE(p)] + base);
 		if (delta > MAX_DISTANCE)
 			delta = MAX_DISTANCE;
-		chaintable[(size_t)(p) & MAXD_MASK] = (u16)delta;
-		hashtable[HASH_VALUE(p)] = (p) - base;
-		hc4->nexttoupdate++;
-	}
-}
 
-static inline size_t lz4hc_commonlength(const u8 *p1, const u8 *p2,
-		const u8 *const matchlimit)
-{
-	const u8 *p1t = p1;
-
-	while (p1t < matchlimit - (STEPSIZE - 1)) {
-#if LZ4_ARCH64
-		u64 diff = A64(p2) ^ A64(p1t);
-#else
-		u32 diff = A32(p2) ^ A32(p1t);
-#endif
-		if (!diff) {
-			p1t += STEPSIZE;
-			p2 += STEPSIZE;
-			continue;
-		}
-		p1t += LZ4_NBCOMMONBYTES(diff);
-		return p1t - p1;
-	}
-#if LZ4_ARCH64
-	if ((p1t < (matchlimit-3)) && (A32(p2) == A32(p1t))) {
-		p1t += 4;
-		p2 += 4;
-	}
-#endif
+		DELTANEXTU16(idx) = (U16)delta;
 
-	if ((p1t < (matchlimit - 1)) && (A16(p2) == A16(p1t))) {
-		p1t += 2;
-		p2 += 2;
+		hashTable[h] = idx;
+		idx++;
 	}
-	if ((p1t < matchlimit) && (*p2 == *p1t))
-		p1t++;
-	return p1t - p1;
+
+	hc4->nextToUpdate = target;
 }
 
-static inline int lz4hc_insertandfindbestmatch(struct lz4hc_data *hc4,
-		const u8 *ip, const u8 *const matchlimit, const u8 **matchpos)
+static FORCE_INLINE int LZ4HC_InsertAndFindBestMatch(
+	LZ4HC_CCtx_internal *hc4, /* Index table will be updated */
+	const BYTE *ip,
+	const BYTE * const iLimit,
+	const BYTE **matchpos,
+	const int maxNbAttempts)
 {
-	u16 *const chaintable = hc4->chaintable;
-	HTYPE *const hashtable = hc4->hashtable;
-	const u8 *ref;
-#if LZ4_ARCH64
+	U16 * const chainTable = hc4->chainTable;
+	U32 * const HashTable = hc4->hashTable;
 	const BYTE * const base = hc4->base;
-#else
-	const int base = 0;
-#endif
-	int nbattempts = MAX_NB_ATTEMPTS;
-	size_t repl = 0, ml = 0;
-	u16 delta;
+	const BYTE * const dictBase = hc4->dictBase;
+	const U32 dictLimit = hc4->dictLimit;
+	const U32 lowLimit = (hc4->lowLimit + 64 * KB > (U32)(ip - base))
+		? hc4->lowLimit
+		: (U32)(ip - base) - (64 * KB - 1);
+	U32 matchIndex;
+	int nbAttempts = maxNbAttempts;
+	size_t ml = 0;
 
 	/* HC4 match finder */
-	lz4hc_insert(hc4, ip);
-	ref = hashtable[HASH_VALUE(ip)] + base;
-
-	/* potential repetition */
-	if (ref >= ip-4) {
-		/* confirmed */
-		if (A32(ref) == A32(ip)) {
-			delta = (u16)(ip-ref);
-			repl = ml  = lz4hc_commonlength(ip + MINMATCH,
-					ref + MINMATCH, matchlimit) + MINMATCH;
-			*matchpos = ref;
-		}
-		ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK];
-	}
+	LZ4HC_Insert(hc4, ip);
+	matchIndex = HashTable[LZ4HC_hashPtr(ip)];
+
+	while ((matchIndex >= lowLimit)
+		&& (nbAttempts)) {
+		nbAttempts--;
+		if (matchIndex >= dictLimit) {
+			const BYTE * const match = base + matchIndex;
+
+			if (*(match + ml) == *(ip + ml)
+				&& (LZ4_read32(match) == LZ4_read32(ip))) {
+				size_t const mlt = LZ4_count(ip + MINMATCH,
+					match + MINMATCH, iLimit) + MINMATCH;
 
-	while ((ref >= ip - MAX_DISTANCE) && nbattempts) {
-		nbattempts--;
-		if (*(ref + ml) == *(ip + ml)) {
-			if (A32(ref) == A32(ip)) {
-				size_t mlt =
-					lz4hc_commonlength(ip + MINMATCH,
-					ref + MINMATCH, matchlimit) + MINMATCH;
 				if (mlt > ml) {
 					ml = mlt;
-					*matchpos = ref;
+					*matchpos = match;
+				}
+			}
+		} else {
+			const BYTE * const match = dictBase + matchIndex;
+
+			if (LZ4_read32(match) == LZ4_read32(ip)) {
+				size_t mlt;
+				const BYTE *vLimit = ip
+					+ (dictLimit - matchIndex);
+
+				if (vLimit > iLimit)
+					vLimit = iLimit;
+				mlt = LZ4_count(ip + MINMATCH,
+					match + MINMATCH, vLimit) + MINMATCH;
+				if ((ip + mlt == vLimit)
+					&& (vLimit < iLimit))
+					mlt += LZ4_count(ip + mlt,
+						base + dictLimit,
+						iLimit);
+				if (mlt > ml) {
+					/* virtual matchpos */
+					ml = mlt;
+					*matchpos = base + matchIndex;
 				}
 			}
 		}
-		ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK];
-	}
-
-	/* Complete table */
-	if (repl) {
-		const BYTE *ptr = ip;
-		const BYTE *end;
-		end = ip + repl - (MINMATCH-1);
-		/* Pre-Load */
-		while (ptr < end - delta) {
-			chaintable[(size_t)(ptr) & MAXD_MASK] = delta;
-			ptr++;
-		}
-		do {
-			chaintable[(size_t)(ptr) & MAXD_MASK] = delta;
-			/* Head of chain */
-			hashtable[HASH_VALUE(ptr)] = (ptr) - base;
-			ptr++;
-		} while (ptr < end);
-		hc4->nexttoupdate = end;
+		matchIndex -= DELTANEXTU16(matchIndex);
 	}
 
 	return (int)ml;
 }
 
-static inline int lz4hc_insertandgetwidermatch(struct lz4hc_data *hc4,
-	const u8 *ip, const u8 *startlimit, const u8 *matchlimit, int longest,
-	const u8 **matchpos, const u8 **startpos)
+static FORCE_INLINE int LZ4HC_InsertAndGetWiderMatch(
+	LZ4HC_CCtx_internal *hc4,
+	const BYTE * const ip,
+	const BYTE * const iLowLimit,
+	const BYTE * const iHighLimit,
+	int longest,
+	const BYTE **matchpos,
+	const BYTE **startpos,
+	const int maxNbAttempts)
 {
-	u16 *const chaintable = hc4->chaintable;
-	HTYPE *const hashtable = hc4->hashtable;
-#if LZ4_ARCH64
+	U16 * const chainTable = hc4->chainTable;
+	U32 * const HashTable = hc4->hashTable;
 	const BYTE * const base = hc4->base;
-#else
-	const int base = 0;
-#endif
-	const u8 *ref;
-	int nbattempts = MAX_NB_ATTEMPTS;
-	int delta = (int)(ip - startlimit);
+	const U32 dictLimit = hc4->dictLimit;
+	const BYTE * const lowPrefixPtr = base + dictLimit;
+	const U32 lowLimit = (hc4->lowLimit + 64 * KB > (U32)(ip - base))
+		? hc4->lowLimit
+		: (U32)(ip - base) - (64 * KB - 1);
+	const BYTE * const dictBase = hc4->dictBase;
+	U32 matchIndex;
+	int nbAttempts = maxNbAttempts;
+	int delta = (int)(ip - iLowLimit);
 
 	/* First Match */
-	lz4hc_insert(hc4, ip);
-	ref = hashtable[HASH_VALUE(ip)] + base;
-
-	while ((ref >= ip - MAX_DISTANCE) && (ref >= hc4->base)
-		&& (nbattempts)) {
-		nbattempts--;
-		if (*(startlimit + longest) == *(ref - delta + longest)) {
-			if (A32(ref) == A32(ip)) {
-				const u8 *reft = ref + MINMATCH;
-				const u8 *ipt = ip + MINMATCH;
-				const u8 *startt = ip;
-
-				while (ipt < matchlimit-(STEPSIZE - 1)) {
-					#if LZ4_ARCH64
-					u64 diff = A64(reft) ^ A64(ipt);
-					#else
-					u32 diff = A32(reft) ^ A32(ipt);
-					#endif
-
-					if (!diff) {
-						ipt += STEPSIZE;
-						reft += STEPSIZE;
-						continue;
+	LZ4HC_Insert(hc4, ip);
+	matchIndex = HashTable[LZ4HC_hashPtr(ip)];
+
+	while ((matchIndex >= lowLimit)
+		&& (nbAttempts)) {
+		nbAttempts--;
+		if (matchIndex >= dictLimit) {
+			const BYTE *matchPtr = base + matchIndex;
+
+			if (*(iLowLimit + longest)
+				== *(matchPtr - delta + longest)) {
+				if (LZ4_read32(matchPtr) == LZ4_read32(ip)) {
+					int mlt = MINMATCH + LZ4_count(
+						ip + MINMATCH,
+						matchPtr + MINMATCH,
+						iHighLimit);
+					int back = 0;
+
+					while ((ip + back > iLowLimit)
+						&& (matchPtr + back > lowPrefixPtr)
+						&& (ip[back - 1] == matchPtr[back - 1]))
+						back--;
+
+					mlt -= back;
+
+					if (mlt > longest) {
+						longest = (int)mlt;
+						*matchpos = matchPtr + back;
+						*startpos = ip + back;
 					}
-					ipt += LZ4_NBCOMMONBYTES(diff);
-					goto _endcount;
-				}
-				#if LZ4_ARCH64
-				if ((ipt < (matchlimit - 3))
-					&& (A32(reft) == A32(ipt))) {
-					ipt += 4;
-					reft += 4;
 				}
-				ipt += 2;
-				#endif
-				if ((ipt < (matchlimit - 1))
-					&& (A16(reft) == A16(ipt))) {
-					reft += 2;
-				}
-				if ((ipt < matchlimit) && (*reft == *ipt))
-					ipt++;
-_endcount:
-				reft = ref;
-
-				while ((startt > startlimit)
-					&& (reft > hc4->base)
-					&& (startt[-1] == reft[-1])) {
-					startt--;
-					reft--;
-				}
-
-				if ((ipt - startt) > longest) {
-					longest = (int)(ipt - startt);
-					*matchpos = reft;
-					*startpos = startt;
+			}
+		} else {
+			const BYTE * const matchPtr = dictBase + matchIndex;
+
+			if (LZ4_read32(matchPtr) == LZ4_read32(ip)) {
+				size_t mlt;
+				int back = 0;
+				const BYTE *vLimit = ip + (dictLimit - matchIndex);
+
+				if (vLimit > iHighLimit)
+					vLimit = iHighLimit;
+
+				mlt = LZ4_count(ip + MINMATCH,
+					matchPtr + MINMATCH, vLimit) + MINMATCH;
+
+				if ((ip + mlt == vLimit) && (vLimit < iHighLimit))
+					mlt += LZ4_count(ip + mlt, base + dictLimit,
+						iHighLimit);
+				while ((ip + back > iLowLimit)
+					&& (matchIndex + back > lowLimit)
+					&& (ip[back - 1] == matchPtr[back - 1]))
+					back--;
+
+				mlt -= back;
+
+				if ((int)mlt > longest) {
+					longest = (int)mlt;
+					*matchpos = base + matchIndex + back;
+					*startpos = ip + back;
 				}
 			}
 		}
-		ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK];
+
+		matchIndex -= DELTANEXTU16(matchIndex);
 	}
+
 	return longest;
 }
 
-static inline int lz4_encodesequence(const u8 **ip, u8 **op, const u8 **anchor,
-		int ml, const u8 *ref)
+static FORCE_INLINE int LZ4HC_encodeSequence(
+	const BYTE **ip,
+	BYTE **op,
+	const BYTE **anchor,
+	int matchLength,
+	const BYTE * const match,
+	limitedOutput_directive limitedOutputBuffer,
+	BYTE *oend)
 {
-	int length, len;
-	u8 *token;
+	int length;
+	BYTE *token;
 
 	/* Encode Literal length */
 	length = (int)(*ip - *anchor);
 	token = (*op)++;
+
+	if ((limitedOutputBuffer)
+		&& ((*op + (length>>8)
+			+ length + (2 + 1 + LASTLITERALS)) > oend)) {
+		/* Check output limit */
+		return 1;
+	}
 	if (length >= (int)RUN_MASK) {
-		*token = (RUN_MASK << ML_BITS);
+		int len;
+
+		*token = (RUN_MASK<<ML_BITS);
 		len = length - RUN_MASK;
 		for (; len > 254 ; len -= 255)
 			*(*op)++ = 255;
-		*(*op)++ = (u8)len;
+		*(*op)++ = (BYTE)len;
 	} else
-		*token = (length << ML_BITS);
+		*token = (BYTE)(length<<ML_BITS);
 
 	/* Copy Literals */
-	LZ4_BLINDCOPY(*anchor, *op, length);
+	LZ4_wildCopy(*op, *anchor, (*op) + length);
+	*op += length;
 
 	/* Encode Offset */
-	LZ4_WRITE_LITTLEENDIAN_16(*op, (u16)(*ip - ref));
+	LZ4_writeLE16(*op, (U16)(*ip - match));
+	*op += 2;
 
 	/* Encode MatchLength */
-	len = (int)(ml - MINMATCH);
-	if (len >= (int)ML_MASK) {
+	length = (int)(matchLength - MINMATCH);
+
+	if ((limitedOutputBuffer)
+		&& (*op + (length>>8)
+			+ (1 + LASTLITERALS) > oend)) {
+		/* Check output limit */
+		return 1;
+	}
+
+	if (length >= (int)ML_MASK) {
 		*token += ML_MASK;
-		len -= ML_MASK;
-		for (; len > 509 ; len -= 510) {
+		length -= ML_MASK;
+
+		for (; length > 509 ; length -= 510) {
 			*(*op)++ = 255;
 			*(*op)++ = 255;
 		}
-		if (len > 254) {
-			len -= 255;
+
+		if (length > 254) {
+			length -= 255;
 			*(*op)++ = 255;
 		}
-		*(*op)++ = (u8)len;
+
+		*(*op)++ = (BYTE)length;
 	} else
-		*token += len;
+		*token += (BYTE)(length);
 
 	/* Prepare next loop */
-	*ip += ml;
+	*ip += matchLength;
 	*anchor = *ip;
 
 	return 0;
 }
 
-static int lz4_compresshcctx(struct lz4hc_data *ctx,
-		const char *source,
-		char *dest,
-		int isize)
+static int LZ4HC_compress_generic(
+	LZ4HC_CCtx_internal *const ctx,
+	const char * const source,
+	char * const dest,
+	int const inputSize,
+	int const maxOutputSize,
+	int compressionLevel,
+	limitedOutput_directive limit
+	)
 {
-	const u8 *ip = (const u8 *)source;
-	const u8 *anchor = ip;
-	const u8 *const iend = ip + isize;
-	const u8 *const mflimit = iend - MFLIMIT;
-	const u8 *const matchlimit = (iend - LASTLITERALS);
+	const BYTE *ip = (const BYTE *) source;
+	const BYTE *anchor = ip;
+	const BYTE * const iend = ip + inputSize;
+	const BYTE * const mflimit = iend - MFLIMIT;
+	const BYTE * const matchlimit = (iend - LASTLITERALS);
 
-	u8 *op = (u8 *)dest;
+	BYTE *op = (BYTE *) dest;
+	BYTE * const oend = op + maxOutputSize;
 
+	unsigned int maxNbAttempts;
 	int ml, ml2, ml3, ml0;
-	const u8 *ref = NULL;
-	const u8 *start2 = NULL;
-	const u8 *ref2 = NULL;
-	const u8 *start3 = NULL;
-	const u8 *ref3 = NULL;
-	const u8 *start0;
-	const u8 *ref0;
-	int lastrun;
+	const BYTE *ref = NULL;
+	const BYTE *start2 = NULL;
+	const BYTE *ref2 = NULL;
+	const BYTE *start3 = NULL;
+	const BYTE *ref3 = NULL;
+	const BYTE *start0;
+	const BYTE *ref0;
+
+	/* init */
+	if (compressionLevel > LZ4HC_MAX_CLEVEL)
+		compressionLevel = LZ4HC_MAX_CLEVEL;
+	if (compressionLevel < 1)
+		compressionLevel = LZ4HC_DEFAULT_CLEVEL;
+	maxNbAttempts = 1 << (compressionLevel - 1);
+	ctx->end += inputSize;
 
 	ip++;
 
 	/* Main Loop */
 	while (ip < mflimit) {
-		ml = lz4hc_insertandfindbestmatch(ctx, ip, matchlimit, (&ref));
+		ml = LZ4HC_InsertAndFindBestMatch(ctx, ip,
+			matchlimit, (&ref), maxNbAttempts);
 		if (!ml) {
 			ip++;
 			continue;
@@ -351,51 +387,59 @@ static int lz4_compresshcctx(struct lz4hc_data *ctx,
 		start0 = ip;
 		ref0 = ref;
 		ml0 = ml;
-_search2:
-		if (ip+ml < mflimit)
-			ml2 = lz4hc_insertandgetwidermatch(ctx, ip + ml - 2,
-				ip + 1, matchlimit, ml, &ref2, &start2);
+
+_Search2:
+		if (ip + ml < mflimit)
+			ml2 = LZ4HC_InsertAndGetWiderMatch(ctx,
+				ip + ml - 2, ip + 0,
+				matchlimit, ml, &ref2,
+				&start2, maxNbAttempts);
 		else
 			ml2 = ml;
-		/* No better match */
+
 		if (ml2 == ml) {
-			lz4_encodesequence(&ip, &op, &anchor, ml, ref);
+			/* No better match */
+			if (LZ4HC_encodeSequence(&ip, &op,
+				&anchor, ml, ref, limit, oend))
+				return 0;
 			continue;
 		}
 
 		if (start0 < ip) {
-			/* empirical */
 			if (start2 < ip + ml0) {
+				/* empirical */
 				ip = start0;
 				ref = ref0;
 				ml = ml0;
 			}
 		}
-		/*
-		 * Here, start0==ip
-		 * First Match too small : removed
-		 */
+
+		/* Here, start0 == ip */
 		if ((start2 - ip) < 3) {
+			/* First Match too small : removed */
 			ml = ml2;
 			ip = start2;
 			ref = ref2;
-			goto _search2;
+			goto _Search2;
 		}
 
-_search3:
+_Search3:
 		/*
-		 * Currently we have :
-		 * ml2 > ml1, and
-		 * ip1+3 <= ip2 (usually < ip1+ml1)
-		 */
+		* Currently we have :
+		* ml2 > ml1, and
+		* ip1 + 3 <= ip2 (usually < ip1 + ml1)
+		*/
 		if ((start2 - ip) < OPTIMAL_ML) {
 			int correction;
 			int new_ml = ml;
+
 			if (new_ml > OPTIMAL_ML)
 				new_ml = OPTIMAL_ML;
 			if (ip + new_ml > start2 + ml2 - MINMATCH)
 				new_ml = (int)(start2 - ip) + ml2 - MINMATCH;
+
 			correction = new_ml - (int)(start2 - ip);
+
 			if (correction > 0) {
 				start2 += correction;
 				ref2 += correction;
@@ -403,39 +447,44 @@ _search3:
 			}
 		}
 		/*
-		 * Now, we have start2 = ip+new_ml,
-		 * with new_ml=min(ml, OPTIMAL_ML=18)
+		 * Now, we have start2 = ip + new_ml,
+		 * with new_ml = min(ml, OPTIMAL_ML = 18)
 		 */
+
 		if (start2 + ml2 < mflimit)
-			ml3 = lz4hc_insertandgetwidermatch(ctx,
-				start2 + ml2 - 3, start2, matchlimit,
-				ml2, &ref3, &start3);
+			ml3 = LZ4HC_InsertAndGetWiderMatch(ctx,
+				start2 + ml2 - 3, start2,
+				matchlimit, ml2, &ref3, &start3,
+				maxNbAttempts);
 		else
 			ml3 = ml2;
 
-		/* No better match : 2 sequences to encode */
 		if (ml3 == ml2) {
+			/* No better match : 2 sequences to encode */
 			/* ip & ref are known; Now for ml */
-			if (start2 < ip+ml)
+			if (start2 < ip + ml)
 				ml = (int)(start2 - ip);
-
 			/* Now, encode 2 sequences */
-			lz4_encodesequence(&ip, &op, &anchor, ml, ref);
+			if (LZ4HC_encodeSequence(&ip, &op, &anchor,
+				ml, ref, limit, oend))
+				return 0;
 			ip = start2;
-			lz4_encodesequence(&ip, &op, &anchor, ml2, ref2);
+			if (LZ4HC_encodeSequence(&ip, &op, &anchor,
+				ml2, ref2, limit, oend))
+				return 0;
 			continue;
 		}
 
-		/* Not enough space for match 2 : remove it */
 		if (start3 < ip + ml + 3) {
-			/*
-			 * can write Seq1 immediately ==> Seq2 is removed,
-			 * so Seq3 becomes Seq1
-			 */
+			/* Not enough space for match 2 : remove it */
 			if (start3 >= (ip + ml)) {
+				/* can write Seq1 immediately
+				 * ==> Seq2 is removed,
+				 * so Seq3 becomes Seq1
+				 */
 				if (start2 < ip + ml) {
-					int correction =
-						(int)(ip + ml - start2);
+					int correction = (int)(ip + ml - start2);
+
 					start2 += correction;
 					ref2 += correction;
 					ml2 -= correction;
@@ -446,35 +495,38 @@ _search3:
 					}
 				}
 
-				lz4_encodesequence(&ip, &op, &anchor, ml, ref);
-				ip  = start3;
+				if (LZ4HC_encodeSequence(&ip, &op, &anchor,
+					ml, ref, limit, oend))
+					return 0;
+				ip = start3;
 				ref = ref3;
-				ml  = ml3;
+				ml = ml3;
 
 				start0 = start2;
 				ref0 = ref2;
 				ml0 = ml2;
-				goto _search2;
+				goto _Search2;
 			}
 
 			start2 = start3;
 			ref2 = ref3;
 			ml2 = ml3;
-			goto _search3;
+			goto _Search3;
 		}
 
 		/*
-		 * OK, now we have 3 ascending matches; let's write at least
-		 * the first one ip & ref are known; Now for ml
-		 */
+		* OK, now we have 3 ascending matches;
+		* let's write at least the first one
+		* ip & ref are known; Now for ml
+		*/
 		if (start2 < ip + ml) {
 			if ((start2 - ip) < (int)ML_MASK) {
 				int correction;
+
 				if (ml > OPTIMAL_ML)
 					ml = OPTIMAL_ML;
 				if (ip + ml > start2 + ml2 - MINMATCH)
-					ml = (int)(start2 - ip) + ml2
-						- MINMATCH;
+					ml = (int)(start2 - ip) + ml2 - MINMATCH;
 				correction = ml - (int)(start2 - ip);
 				if (correction > 0) {
 					start2 += correction;
@@ -484,7 +536,9 @@ _search3:
 			} else
 				ml = (int)(start2 - ip);
 		}
-		lz4_encodesequence(&ip, &op, &anchor, ml, ref);
+		if (LZ4HC_encodeSequence(&ip, &op, &anchor, ml,
+			ref, limit, oend))
+			return 0;
 
 		ip = start2;
 		ref = ref2;
@@ -494,46 +548,245 @@ _search3:
 		ref2 = ref3;
 		ml2 = ml3;
 
-		goto _search3;
+		goto _Search3;
 	}
 
 	/* Encode Last Literals */
-	lastrun = (int)(iend - anchor);
-	if (lastrun >= (int)RUN_MASK) {
-		*op++ = (RUN_MASK << ML_BITS);
-		lastrun -= RUN_MASK;
-		for (; lastrun > 254 ; lastrun -= 255)
-			*op++ = 255;
-		*op++ = (u8) lastrun;
-	} else
-		*op++ = (lastrun << ML_BITS);
-	memcpy(op, anchor, iend - anchor);
-	op += iend - anchor;
+	{
+		int lastRun = (int)(iend - anchor);
+
+		if ((limit)
+			&& (((char *)op - dest) + lastRun + 1
+				+ ((lastRun + 255 - RUN_MASK)/255)
+					> (U32)maxOutputSize)) {
+			/* Check output limit */
+			return 0;
+		}
+		if (lastRun >= (int)RUN_MASK) {
+			*op++ = (RUN_MASK<<ML_BITS);
+			lastRun -= RUN_MASK;
+			for (; lastRun > 254 ; lastRun -= 255)
+				*op++ = 255;
+			*op++ = (BYTE) lastRun;
+		} else
+			*op++ = (BYTE)(lastRun<<ML_BITS);
+		memcpy(op, anchor, iend - anchor);
+		op += iend - anchor;
+	}
+
 	/* End */
 	return (int) (((char *)op) - dest);
 }
 
-int lz4hc_compress(const unsigned char *src, size_t src_len,
-			unsigned char *dst, size_t *dst_len, void *wrkmem)
+static int LZ4_compress_HC_extStateHC(
+	void *state,
+	const char *src,
+	char *dst,
+	int srcSize,
+	int maxDstSize,
+	int compressionLevel)
 {
-	int ret = -1;
-	int out_len = 0;
+	LZ4HC_CCtx_internal *ctx = &((LZ4_streamHC_t *)state)->internal_donotuse;
 
-	struct lz4hc_data *hc4 = (struct lz4hc_data *)wrkmem;
-	lz4hc_init(hc4, (const u8 *)src);
-	out_len = lz4_compresshcctx((struct lz4hc_data *)hc4, (const u8 *)src,
-		(char *)dst, (int)src_len);
+	if (((size_t)(state)&(sizeof(void *) - 1)) != 0) {
+		/* Error : state is not aligned
+		 * for pointers (32 or 64 bits)
+		 */
+		return 0;
+	}
 
-	if (out_len < 0)
-		goto exit;
+	LZ4HC_init(ctx, (const BYTE *)src);
 
-	*dst_len = out_len;
-	return 0;
+	if (maxDstSize < LZ4_compressBound(srcSize))
+		return LZ4HC_compress_generic(ctx, src, dst,
+			srcSize, maxDstSize, compressionLevel, limitedOutput);
+	else
+		return LZ4HC_compress_generic(ctx, src, dst,
+			srcSize, maxDstSize, compressionLevel, noLimit);
+}
+
+int LZ4_compress_HC(const char *src, char *dst, int srcSize,
+	int maxDstSize, int compressionLevel, void *wrkmem)
+{
+	return LZ4_compress_HC_extStateHC(wrkmem, src, dst,
+		srcSize, maxDstSize, compressionLevel);
+}
+EXPORT_SYMBOL(LZ4_compress_HC);
+
+/**************************************
+ *	Streaming Functions
+ **************************************/
+void LZ4_resetStreamHC(LZ4_streamHC_t *LZ4_streamHCPtr, int compressionLevel)
+{
+	LZ4_streamHCPtr->internal_donotuse.base = NULL;
+	LZ4_streamHCPtr->internal_donotuse.compressionLevel = (unsigned int)compressionLevel;
+}
+
+int LZ4_loadDictHC(LZ4_streamHC_t *LZ4_streamHCPtr,
+	const char *dictionary,
+	int dictSize)
+{
+	LZ4HC_CCtx_internal *ctxPtr = &LZ4_streamHCPtr->internal_donotuse;
+
+	if (dictSize > 64 * KB) {
+		dictionary += dictSize - 64 * KB;
+		dictSize = 64 * KB;
+	}
+	LZ4HC_init(ctxPtr, (const BYTE *)dictionary);
+	if (dictSize >= 4)
+		LZ4HC_Insert(ctxPtr, (const BYTE *)dictionary + (dictSize - 3));
+	ctxPtr->end = (const BYTE *)dictionary + dictSize;
+	return dictSize;
+}
+EXPORT_SYMBOL(LZ4_loadDictHC);
+
+/* compression */
+
+static void LZ4HC_setExternalDict(
+	LZ4HC_CCtx_internal *ctxPtr,
+	const BYTE *newBlock)
+{
+	if (ctxPtr->end >= ctxPtr->base + 4) {
+		/* Referencing remaining dictionary content */
+		LZ4HC_Insert(ctxPtr, ctxPtr->end - 3);
+	}
+
+	/*
+	 * Only one memory segment for extDict,
+	 * so any previous extDict is lost at this stage
+	 */
+	ctxPtr->lowLimit	= ctxPtr->dictLimit;
+	ctxPtr->dictLimit = (U32)(ctxPtr->end - ctxPtr->base);
+	ctxPtr->dictBase	= ctxPtr->base;
+	ctxPtr->base = newBlock - ctxPtr->dictLimit;
+	ctxPtr->end	= newBlock;
+	/* match referencing will resume from there */
+	ctxPtr->nextToUpdate = ctxPtr->dictLimit;
+}
+EXPORT_SYMBOL(LZ4HC_setExternalDict);
+
+static int LZ4_compressHC_continue_generic(
+	LZ4_streamHC_t *LZ4_streamHCPtr,
+	const char *source,
+	char *dest,
+	int inputSize,
+	int maxOutputSize,
+	limitedOutput_directive limit)
+{
+	LZ4HC_CCtx_internal *ctxPtr = &LZ4_streamHCPtr->internal_donotuse;
+
+	/* auto - init if forgotten */
+	if (ctxPtr->base == NULL)
+		LZ4HC_init(ctxPtr, (const BYTE *) source);
+
+	/* Check overflow */
+	if ((size_t)(ctxPtr->end - ctxPtr->base) > 2 * GB) {
+		size_t dictSize = (size_t)(ctxPtr->end - ctxPtr->base)
+			- ctxPtr->dictLimit;
+		if (dictSize > 64 * KB)
+			dictSize = 64 * KB;
+		LZ4_loadDictHC(LZ4_streamHCPtr,
+			(const char *)(ctxPtr->end) - dictSize, (int)dictSize);
+	}
+
+	/* Check if blocks follow each other */
+	if ((const BYTE *)source != ctxPtr->end)
+		LZ4HC_setExternalDict(ctxPtr, (const BYTE *)source);
+
+	/* Check overlapping input/dictionary space */
+	{
+		const BYTE *sourceEnd = (const BYTE *) source + inputSize;
+		const BYTE * const dictBegin = ctxPtr->dictBase + ctxPtr->lowLimit;
+		const BYTE * const dictEnd = ctxPtr->dictBase + ctxPtr->dictLimit;
+
+		if ((sourceEnd > dictBegin)
+			&& ((const BYTE *)source < dictEnd)) {
+			if (sourceEnd > dictEnd)
+				sourceEnd = dictEnd;
+			ctxPtr->lowLimit = (U32)(sourceEnd - ctxPtr->dictBase);
+
+			if (ctxPtr->dictLimit - ctxPtr->lowLimit < 4)
+				ctxPtr->lowLimit = ctxPtr->dictLimit;
+		}
+	}
+
+	return LZ4HC_compress_generic(ctxPtr, source, dest,
+		inputSize, maxOutputSize, ctxPtr->compressionLevel, limit);
+}
+
+int LZ4_compress_HC_continue(
+	LZ4_streamHC_t *LZ4_streamHCPtr,
+	const char *source,
+	char *dest,
+	int inputSize,
+	int maxOutputSize)
+{
+	if (maxOutputSize < LZ4_compressBound(inputSize))
+		return LZ4_compressHC_continue_generic(LZ4_streamHCPtr,
+			source, dest, inputSize, maxOutputSize, limitedOutput);
+	else
+		return LZ4_compressHC_continue_generic(LZ4_streamHCPtr,
+			source, dest, inputSize, maxOutputSize, noLimit);
+}
+EXPORT_SYMBOL(LZ4_compress_HC_continue);
+
+/* dictionary saving */
+
+int LZ4_saveDictHC(
+	LZ4_streamHC_t *LZ4_streamHCPtr,
+	char *safeBuffer,
+	int dictSize)
+{
+	LZ4HC_CCtx_internal *const streamPtr = &LZ4_streamHCPtr->internal_donotuse;
+	int const prefixSize = (int)(streamPtr->end
+		- (streamPtr->base + streamPtr->dictLimit));
+
+	if (dictSize > 64 * KB)
+		dictSize = 64 * KB;
+	if (dictSize < 4)
+		dictSize = 0;
+	if (dictSize > prefixSize)
+		dictSize = prefixSize;
+
+	memmove(safeBuffer, streamPtr->end - dictSize, dictSize);
 
-exit:
-	return ret;
+	{
+		U32 const endIndex = (U32)(streamPtr->end - streamPtr->base);
+
+		streamPtr->end = (const BYTE *)safeBuffer + dictSize;
+		streamPtr->base = streamPtr->end - endIndex;
+		streamPtr->dictLimit = endIndex - dictSize;
+		streamPtr->lowLimit = endIndex - dictSize;
+
+		if (streamPtr->nextToUpdate < streamPtr->dictLimit)
+			streamPtr->nextToUpdate = streamPtr->dictLimit;
+	}
+	return dictSize;
+}
+EXPORT_SYMBOL(LZ4_saveDictHC);
+
+/*-******************************
+ *	For backwards compatibility
+ ********************************/
+int lz4hc_compress(const unsigned char *src, size_t src_len,
+	unsigned char *dst, size_t *dst_len, void *wrkmem)
+{
+	*dst_len = LZ4_compress_HC(src, dst, src_len,
+		*dst_len, LZ4HC_DEFAULT_CLEVEL, wrkmem);
+
+	/*
+	 * Prior lz4hc_compress will return -1 in case of error
+	 * and 0 on success
+	 * while new LZ4_compress_HC
+	 * returns 0 in case of error
+	 * and the output length on success
+	 */
+	if (!*dst_len)
+		return -1;
+	else
+		return 0;
 }
 EXPORT_SYMBOL(lz4hc_compress);
 
 MODULE_LICENSE("Dual BSD/GPL");
-MODULE_DESCRIPTION("LZ4HC compressor");
+MODULE_DESCRIPTION("LZ4 HC compressor");
-- 
cgit v1.2.3


From 69c78423b8f439b077929410bdf8f88e7031b891 Mon Sep 17 00:00:00 2001
From: Sven Schmidt <4sschmid@informatik.uni-hamburg.de>
Date: Fri, 24 Feb 2017 15:01:25 -0800
Subject: lib/lz4: remove back-compat wrappers

Remove the functions introduced as wrappers for providing backwards
compatibility to the prior LZ4 version.  They're not needed anymore
since there's no callers left.

Link: http://lkml.kernel.org/r/1486321748-19085-6-git-send-email-4sschmid@informatik.uni-hamburg.de
Signed-off-by: Sven Schmidt <4sschmid@informatik.uni-hamburg.de>
Cc: Bongkyu Kim <bongkyu.kim@lge.com>
Cc: Rui Salvaterra <rsalvaterra@gmail.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: David S. Miller <davem@davemloft.net>
Cc: Anton Vorontsov <anton@enomsg.org>
Cc: Colin Cross <ccross@android.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Tony Luck <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/lz4.h      | 69 ------------------------------------------------
 lib/lz4/lz4_compress.c   | 22 ---------------
 lib/lz4/lz4_decompress.c | 42 -----------------------------
 lib/lz4/lz4hc_compress.c | 23 ----------------
 4 files changed, 156 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lz4.h b/include/linux/lz4.h
index 1b0f8ca0f9c8..394e3d9213b8 100644
--- a/include/linux/lz4.h
+++ b/include/linux/lz4.h
@@ -172,18 +172,6 @@ static inline int LZ4_compressBound(size_t isize)
 	return LZ4_COMPRESSBOUND(isize);
 }
 
-/**
- * lz4_compressbound() - For backwards compatibility; see LZ4_compressBound
- * @isize: Size of the input data
- *
- * Return: Max. size LZ4 may output in a "worst case" szenario
- *	(data not compressible)
- */
-static inline int lz4_compressbound(size_t isize)
-{
-	return LZ4_COMPRESSBOUND(isize);
-}
-
 /**
  * LZ4_compress_default() - Compress data from source to dest
  * @source: source address of the original data
@@ -257,20 +245,6 @@ int LZ4_compress_fast(const char *source, char *dest, int inputSize,
 int LZ4_compress_destSize(const char *source, char *dest, int *sourceSizePtr,
 	int targetDestSize, void *wrkmem);
 
-/*
- * lz4_compress() - For backward compatibility, see LZ4_compress_default
- * @src: source address of the original data
- * @src_len: size of the original data
- * @dst: output buffer address of the compressed data. This requires 'dst'
- *	of size LZ4_COMPRESSBOUND
- * @dst_len: is the output size, which is returned after compress done
- * @workmem: address of the working memory.
- *
- * Return: Success if return 0, Error if return < 0
- */
-int lz4_compress(const unsigned char *src, size_t src_len, unsigned char *dst,
-	size_t *dst_len, void *wrkmem);
-
 /*-************************************************************************
  *	Decompression Functions
  **************************************************************************/
@@ -346,34 +320,6 @@ int LZ4_decompress_safe(const char *source, char *dest, int compressedSize,
 int LZ4_decompress_safe_partial(const char *source, char *dest,
 	int compressedSize, int targetOutputSize, int maxDecompressedSize);
 
-/*
- * lz4_decompress_unknownoutputsize() - For backwards compatibility,
- *	see LZ4_decompress_safe
- * @src: source address of the compressed data
- * @src_len: is the input size, therefore the compressed size
- * @dest: output buffer address of the decompressed data
- *	which must be already allocated
- * @dest_len: is the max size of the destination buffer, which is
- *	returned with actual size of decompressed data after decompress done
- *
- * Return: Success if return 0, Error if return (< 0)
- */
-int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len,
-	unsigned char *dest, size_t *dest_len);
-
-/**
- * lz4_decompress() - For backwards cocmpatibility, see LZ4_decompress_fast
- * @src: source address of the compressed data
- * @src_len: is the input size, which is returned after decompress done
- * @dest: output buffer address of the decompressed data,
- *	which must be already allocated
- * @actual_dest_len: is the size of uncompressed data, supposing it's known
- *
- * Return: Success if return 0, Error if return (< 0)
- */
-int lz4_decompress(const unsigned char *src, size_t *src_len,
-	unsigned char *dest, size_t actual_dest_len);
-
 /*-************************************************************************
  *	LZ4 HC Compression
  **************************************************************************/
@@ -400,21 +346,6 @@ int lz4_decompress(const unsigned char *src, size_t *src_len,
 int LZ4_compress_HC(const char *src, char *dst, int srcSize, int dstCapacity,
 	int compressionLevel, void *wrkmem);
 
-/**
- * lz4hc_compress() - For backwards compatibility, see LZ4_compress_HC
- * @src: source address of the original data
- * @src_len: size of the original data
- * @dst: output buffer address of the compressed data. This requires 'dst'
- *	of size LZ4_COMPRESSBOUND.
- * @dst_len: is the output size, which is returned after compress done
- * @wrkmem: address of the working memory.
- *	This requires 'workmem' of size LZ4HC_MEM_COMPRESS.
- *
- * Return	: Success if return 0, Error if return (< 0)
- */
-int lz4hc_compress(const unsigned char *src, size_t src_len, unsigned char *dst,
-	size_t *dst_len, void *wrkmem);
-
 /**
  * LZ4_resetStreamHC() - Init an allocated 'LZ4_streamHC_t' structure
  * @streamHCPtr: pointer to the 'LZ4_streamHC_t' structure
diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c
index 53f313f05185..cc7b6d4cc7c7 100644
--- a/lib/lz4/lz4_compress.c
+++ b/lib/lz4/lz4_compress.c
@@ -936,27 +936,5 @@ int LZ4_compress_fast_continue(LZ4_stream_t *LZ4_stream, const char *source,
 }
 EXPORT_SYMBOL(LZ4_compress_fast_continue);
 
-/*-******************************
- *	For backwards compatibility
- ********************************/
-int lz4_compress(const unsigned char *src, size_t src_len, unsigned char *dst,
-	size_t *dst_len, void *wrkmem) {
-	*dst_len = LZ4_compress_default(src, dst, src_len,
-		*dst_len, wrkmem);
-
-	/*
-	 * Prior lz4_compress will return -1 in case of error
-	 * and 0 on success
-	 * while new LZ4_compress_fast/default
-	 * returns 0 in case of error
-	 * and the output length on success
-	 */
-	if (!*dst_len)
-		return -1;
-	else
-		return 0;
-}
-EXPORT_SYMBOL(lz4_compress);
-
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_DESCRIPTION("LZ4 compressor");
diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
index b5e00a19a7b4..bd3574312b82 100644
--- a/lib/lz4/lz4_decompress.c
+++ b/lib/lz4/lz4_decompress.c
@@ -493,46 +493,6 @@ int LZ4_decompress_fast_usingDict(const char *source, char *dest,
 		originalSize, 0, dictStart, dictSize);
 }
 
-/*-******************************
- *	For backwards compatibility
- ********************************/
-int lz4_decompress_unknownoutputsize(const unsigned char *src,
-	size_t src_len, unsigned char *dest, size_t *dest_len) {
-	*dest_len = LZ4_decompress_safe(src, dest,
-		src_len, *dest_len);
-
-	/*
-	 * Prior lz4_decompress_unknownoutputsize will return
-	 * 0 for success and a negative result for error
-	 * new LZ4_decompress_safe returns
-	 * - the length of data read on success
-	 * - and also a negative result on error
-	 * meaning when result > 0, we just return 0 here
-	 */
-	if (src_len > 0)
-		return 0;
-	else
-		return -1;
-}
-
-int lz4_decompress(const unsigned char *src, size_t *src_len,
-	unsigned char *dest, size_t actual_dest_len) {
-	*src_len = LZ4_decompress_fast(src, dest, actual_dest_len);
-
-	/*
-	 * Prior lz4_decompress will return
-	 * 0 for success and a negative result for error
-	 * new LZ4_decompress_fast returns
-	 * - the length of data read on success
-	 * - and also a negative result on error
-	 * meaning when result > 0, we just return 0 here
-	 */
-	if (*src_len > 0)
-		return 0;
-	else
-		return -1;
-}
-
 #ifndef STATIC
 EXPORT_SYMBOL(LZ4_decompress_safe);
 EXPORT_SYMBOL(LZ4_decompress_safe_partial);
@@ -542,8 +502,6 @@ EXPORT_SYMBOL(LZ4_decompress_safe_continue);
 EXPORT_SYMBOL(LZ4_decompress_fast_continue);
 EXPORT_SYMBOL(LZ4_decompress_safe_usingDict);
 EXPORT_SYMBOL(LZ4_decompress_fast_usingDict);
-EXPORT_SYMBOL(lz4_decompress_unknownoutputsize);
-EXPORT_SYMBOL(lz4_decompress);
 
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_DESCRIPTION("LZ4 decompressor");
diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c
index e1fbf1561e55..176f03b83e56 100644
--- a/lib/lz4/lz4hc_compress.c
+++ b/lib/lz4/lz4hc_compress.c
@@ -765,28 +765,5 @@ int LZ4_saveDictHC(
 }
 EXPORT_SYMBOL(LZ4_saveDictHC);
 
-/*-******************************
- *	For backwards compatibility
- ********************************/
-int lz4hc_compress(const unsigned char *src, size_t src_len,
-	unsigned char *dst, size_t *dst_len, void *wrkmem)
-{
-	*dst_len = LZ4_compress_HC(src, dst, src_len,
-		*dst_len, LZ4HC_DEFAULT_CLEVEL, wrkmem);
-
-	/*
-	 * Prior lz4hc_compress will return -1 in case of error
-	 * and 0 on success
-	 * while new LZ4_compress_HC
-	 * returns 0 in case of error
-	 * and the output length on success
-	 */
-	if (!*dst_len)
-		return -1;
-	else
-		return 0;
-}
-EXPORT_SYMBOL(lz4hc_compress);
-
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_DESCRIPTION("LZ4 HC compressor");
-- 
cgit v1.2.3


From 3d1e236022cc1426b0834565995ddee2ca231cee Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Fri, 24 Feb 2017 22:31:02 -0600
Subject: objtool: Prevent GCC from merging annotate_unreachable()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

0-day bot reported some new objtool warnings which were caused by the
new annotate_unreachable() macro:

  fs/afs/flock.o: warning: objtool: afs_do_unlk()+0x0: duplicate frame pointer save
  fs/afs/flock.o: warning: objtool: afs_do_unlk()+0x0: frame pointer state mismatch
  fs/btrfs/delayed-inode.o: warning: objtool: btrfs_delete_delayed_dir_index()+0x0: duplicate frame pointer save
  fs/btrfs/delayed-inode.o: warning: objtool: btrfs_delete_delayed_dir_index()+0x0: frame pointer state mismatch
  fs/dlm/lock.o: warning: objtool: _grant_lock()+0x0: duplicate frame pointer save
  fs/dlm/lock.o: warning: objtool: _grant_lock()+0x0: frame pointer state mismatch
  fs/ocfs2/alloc.o: warning: objtool: ocfs2_mv_path()+0x0: duplicate frame pointer save
  fs/ocfs2/alloc.o: warning: objtool: ocfs2_mv_path()+0x0: frame pointer state mismatch

It turns out that, for older versions of GCC, if a function has multiple
BUG() incantations, GCC will sometimes merge the corresponding
annotate_unreachable() inline asm statements into a single block.  That
has the undesirable effect of removing one of the entries in the
__unreachable section, confusing objtool greatly.

A workaround for this issue is to ensure that each instance of the
inline asm statement uses a different label, so that GCC sees the
statements are unique and leaves them alone.  The inline asm ‘%=’ token
could be used for that, but unfortunately older versions of GCC don't
support it.  So I implemented a poor man's version of it with the
__LINE__ macro.

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: d1091c7fa3d5 ("objtool: Improve detection of BUG() and other dead ends")
Link: http://lkml.kernel.org/r/0c14b00baf9f68d1b0221ddb6c88b925181c8be8.1487997036.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/compiler-gcc.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 8ea159fc489d..de471346b365 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -197,10 +197,10 @@
 
 #ifdef CONFIG_STACK_VALIDATION
 #define annotate_unreachable() ({					\
-	asm("1:\t\n"							\
+	asm("%c0:\t\n"							\
 	    ".pushsection __unreachable, \"a\"\t\n"			\
-	    ".long 1b\t\n"						\
-	    ".popsection\t\n");						\
+	    ".long %c0b\t\n"						\
+	    ".popsection\t\n" : : "i" (__LINE__));			\
 })
 #else
 #define annotate_unreachable()
-- 
cgit v1.2.3


From 6dbf5cea05a7098a69f294c96b6d76f08562cae5 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 24 Feb 2017 13:25:14 +0100
Subject: cpuidle: menu: Avoid taking spinlock for accessing QoS values

After commit 9908859acaa9 (cpuidle/menu: add per CPU PM QoS resume
latency consideration) the cpuidle menu governor calls
dev_pm_qos_read_value() on CPU devices to read the current resume
latency QoS constraint values for them.  That function takes a spinlock
to prevent the device's power.qos pointer from becoming NULL during
the access which is a problem for the RT patchset where spinlocks are
converted into mutexes and the idle loop stops working.

However, it is not even necessary for the menu governor to take
that spinlock, because the power.qos pointer accessed under it
cannot be modified during the access anyway.

For this reason, introduce a "raw" routine for accessing device
QoS resume latency constraints without locking and use it in the
menu governor.

Fixes: 9908859acaa9 (cpuidle/menu: add per CPU PM QoS resume latency consideration)
Acked-by: Alex Shi <alex.shi@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/qos.c         | 3 +--
 drivers/cpuidle/governors/menu.c | 2 +-
 include/linux/pm_qos.h           | 7 +++++++
 3 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c
index 58fcc758334e..73142d086240 100644
--- a/drivers/base/power/qos.c
+++ b/drivers/base/power/qos.c
@@ -108,8 +108,7 @@ s32 __dev_pm_qos_read_value(struct device *dev)
 {
 	lockdep_assert_held(&dev->power.lock);
 
-	return IS_ERR_OR_NULL(dev->power.qos) ?
-		0 : pm_qos_read_value(&dev->power.qos->resume_latency);
+	return dev_pm_qos_raw_read_value(dev);
 }
 
 /**
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 8d6d25c38c02..6d6f46e79d94 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -287,7 +287,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 	unsigned int interactivity_req;
 	unsigned int expected_interval;
 	unsigned long nr_iowaiters, cpu_load;
-	int resume_latency = dev_pm_qos_read_value(device);
+	int resume_latency = dev_pm_qos_raw_read_value(device);
 
 	if (data->needs_update) {
 		menu_update(drv, dev);
diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h
index 0f65d36c2a75..5aba9f47899d 100644
--- a/include/linux/pm_qos.h
+++ b/include/linux/pm_qos.h
@@ -173,6 +173,12 @@ static inline s32 dev_pm_qos_requested_flags(struct device *dev)
 {
 	return dev->power.qos->flags_req->data.flr.flags;
 }
+
+static inline s32 dev_pm_qos_raw_read_value(struct device *dev)
+{
+	return IS_ERR_OR_NULL(dev->power.qos) ?
+		0 : pm_qos_read_value(&dev->power.qos->resume_latency);
+}
 #else
 static inline enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev,
 							  s32 mask)
@@ -237,6 +243,7 @@ static inline void dev_pm_qos_hide_latency_tolerance(struct device *dev) {}
 
 static inline s32 dev_pm_qos_requested_resume_latency(struct device *dev) { return 0; }
 static inline s32 dev_pm_qos_requested_flags(struct device *dev) { return 0; }
+static inline s32 dev_pm_qos_raw_read_value(struct device *dev) { return 0; }
 #endif
 
 #endif
-- 
cgit v1.2.3


From 51be7a9a261ce18c520fb3928b168feb77522745 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 12 Jan 2017 23:36:32 +0200
Subject: virtio_mmio: expose header to userspace

It's handy for userspace emulators like QEMU.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_mmio.c     |   2 +-
 include/linux/virtio_mmio.h      | 141 ---------------------------------------
 include/uapi/linux/Kbuild        |   1 +
 include/uapi/linux/virtio_mmio.h | 141 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 143 insertions(+), 142 deletions(-)
 delete mode 100644 include/linux/virtio_mmio.h
 create mode 100644 include/uapi/linux/virtio_mmio.h

(limited to 'include/linux')

diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
index c71fde5fe835..08357d70a891 100644
--- a/drivers/virtio/virtio_mmio.c
+++ b/drivers/virtio/virtio_mmio.c
@@ -70,7 +70,7 @@
 #include <linux/spinlock.h>
 #include <linux/virtio.h>
 #include <linux/virtio_config.h>
-#include <linux/virtio_mmio.h>
+#include <uapi/linux/virtio_mmio.h>
 #include <linux/virtio_ring.h>
 
 
diff --git a/include/linux/virtio_mmio.h b/include/linux/virtio_mmio.h
deleted file mode 100644
index c4b09689ab64..000000000000
--- a/include/linux/virtio_mmio.h
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * Virtio platform device driver
- *
- * Copyright 2011, ARM Ltd.
- *
- * Based on Virtio PCI driver by Anthony Liguori, copyright IBM Corp. 2007
- *
- * This header is BSD licensed so anyone can use the definitions to implement
- * compatible drivers/servers.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. Neither the name of IBM nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#ifndef _LINUX_VIRTIO_MMIO_H
-#define _LINUX_VIRTIO_MMIO_H
-
-/*
- * Control registers
- */
-
-/* Magic value ("virt" string) - Read Only */
-#define VIRTIO_MMIO_MAGIC_VALUE		0x000
-
-/* Virtio device version - Read Only */
-#define VIRTIO_MMIO_VERSION		0x004
-
-/* Virtio device ID - Read Only */
-#define VIRTIO_MMIO_DEVICE_ID		0x008
-
-/* Virtio vendor ID - Read Only */
-#define VIRTIO_MMIO_VENDOR_ID		0x00c
-
-/* Bitmask of the features supported by the device (host)
- * (32 bits per set) - Read Only */
-#define VIRTIO_MMIO_DEVICE_FEATURES	0x010
-
-/* Device (host) features set selector - Write Only */
-#define VIRTIO_MMIO_DEVICE_FEATURES_SEL	0x014
-
-/* Bitmask of features activated by the driver (guest)
- * (32 bits per set) - Write Only */
-#define VIRTIO_MMIO_DRIVER_FEATURES	0x020
-
-/* Activated features set selector - Write Only */
-#define VIRTIO_MMIO_DRIVER_FEATURES_SEL	0x024
-
-
-#ifndef VIRTIO_MMIO_NO_LEGACY /* LEGACY DEVICES ONLY! */
-
-/* Guest's memory page size in bytes - Write Only */
-#define VIRTIO_MMIO_GUEST_PAGE_SIZE	0x028
-
-#endif
-
-
-/* Queue selector - Write Only */
-#define VIRTIO_MMIO_QUEUE_SEL		0x030
-
-/* Maximum size of the currently selected queue - Read Only */
-#define VIRTIO_MMIO_QUEUE_NUM_MAX	0x034
-
-/* Queue size for the currently selected queue - Write Only */
-#define VIRTIO_MMIO_QUEUE_NUM		0x038
-
-
-#ifndef VIRTIO_MMIO_NO_LEGACY /* LEGACY DEVICES ONLY! */
-
-/* Used Ring alignment for the currently selected queue - Write Only */
-#define VIRTIO_MMIO_QUEUE_ALIGN		0x03c
-
-/* Guest's PFN for the currently selected queue - Read Write */
-#define VIRTIO_MMIO_QUEUE_PFN		0x040
-
-#endif
-
-
-/* Ready bit for the currently selected queue - Read Write */
-#define VIRTIO_MMIO_QUEUE_READY		0x044
-
-/* Queue notifier - Write Only */
-#define VIRTIO_MMIO_QUEUE_NOTIFY	0x050
-
-/* Interrupt status - Read Only */
-#define VIRTIO_MMIO_INTERRUPT_STATUS	0x060
-
-/* Interrupt acknowledge - Write Only */
-#define VIRTIO_MMIO_INTERRUPT_ACK	0x064
-
-/* Device status register - Read Write */
-#define VIRTIO_MMIO_STATUS		0x070
-
-/* Selected queue's Descriptor Table address, 64 bits in two halves */
-#define VIRTIO_MMIO_QUEUE_DESC_LOW	0x080
-#define VIRTIO_MMIO_QUEUE_DESC_HIGH	0x084
-
-/* Selected queue's Available Ring address, 64 bits in two halves */
-#define VIRTIO_MMIO_QUEUE_AVAIL_LOW	0x090
-#define VIRTIO_MMIO_QUEUE_AVAIL_HIGH	0x094
-
-/* Selected queue's Used Ring address, 64 bits in two halves */
-#define VIRTIO_MMIO_QUEUE_USED_LOW	0x0a0
-#define VIRTIO_MMIO_QUEUE_USED_HIGH	0x0a4
-
-/* Configuration atomicity value */
-#define VIRTIO_MMIO_CONFIG_GENERATION	0x0fc
-
-/* The config space is defined by each driver as
- * the per-driver configuration space - Read Write */
-#define VIRTIO_MMIO_CONFIG		0x100
-
-
-
-/*
- * Interrupt flags (re: interrupt status & acknowledge registers)
- */
-
-#define VIRTIO_MMIO_INT_VRING		(1 << 0)
-#define VIRTIO_MMIO_INT_CONFIG		(1 << 1)
-
-#endif
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index f330ba4547cf..718fa73310e1 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -458,6 +458,7 @@ header-y += virtio_console.h
 header-y += virtio_gpu.h
 header-y += virtio_ids.h
 header-y += virtio_input.h
+header-y += virtio_mmio.h
 header-y += virtio_net.h
 header-y += virtio_pci.h
 header-y += virtio_ring.h
diff --git a/include/uapi/linux/virtio_mmio.h b/include/uapi/linux/virtio_mmio.h
new file mode 100644
index 000000000000..c4b09689ab64
--- /dev/null
+++ b/include/uapi/linux/virtio_mmio.h
@@ -0,0 +1,141 @@
+/*
+ * Virtio platform device driver
+ *
+ * Copyright 2011, ARM Ltd.
+ *
+ * Based on Virtio PCI driver by Anthony Liguori, copyright IBM Corp. 2007
+ *
+ * This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of IBM nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_VIRTIO_MMIO_H
+#define _LINUX_VIRTIO_MMIO_H
+
+/*
+ * Control registers
+ */
+
+/* Magic value ("virt" string) - Read Only */
+#define VIRTIO_MMIO_MAGIC_VALUE		0x000
+
+/* Virtio device version - Read Only */
+#define VIRTIO_MMIO_VERSION		0x004
+
+/* Virtio device ID - Read Only */
+#define VIRTIO_MMIO_DEVICE_ID		0x008
+
+/* Virtio vendor ID - Read Only */
+#define VIRTIO_MMIO_VENDOR_ID		0x00c
+
+/* Bitmask of the features supported by the device (host)
+ * (32 bits per set) - Read Only */
+#define VIRTIO_MMIO_DEVICE_FEATURES	0x010
+
+/* Device (host) features set selector - Write Only */
+#define VIRTIO_MMIO_DEVICE_FEATURES_SEL	0x014
+
+/* Bitmask of features activated by the driver (guest)
+ * (32 bits per set) - Write Only */
+#define VIRTIO_MMIO_DRIVER_FEATURES	0x020
+
+/* Activated features set selector - Write Only */
+#define VIRTIO_MMIO_DRIVER_FEATURES_SEL	0x024
+
+
+#ifndef VIRTIO_MMIO_NO_LEGACY /* LEGACY DEVICES ONLY! */
+
+/* Guest's memory page size in bytes - Write Only */
+#define VIRTIO_MMIO_GUEST_PAGE_SIZE	0x028
+
+#endif
+
+
+/* Queue selector - Write Only */
+#define VIRTIO_MMIO_QUEUE_SEL		0x030
+
+/* Maximum size of the currently selected queue - Read Only */
+#define VIRTIO_MMIO_QUEUE_NUM_MAX	0x034
+
+/* Queue size for the currently selected queue - Write Only */
+#define VIRTIO_MMIO_QUEUE_NUM		0x038
+
+
+#ifndef VIRTIO_MMIO_NO_LEGACY /* LEGACY DEVICES ONLY! */
+
+/* Used Ring alignment for the currently selected queue - Write Only */
+#define VIRTIO_MMIO_QUEUE_ALIGN		0x03c
+
+/* Guest's PFN for the currently selected queue - Read Write */
+#define VIRTIO_MMIO_QUEUE_PFN		0x040
+
+#endif
+
+
+/* Ready bit for the currently selected queue - Read Write */
+#define VIRTIO_MMIO_QUEUE_READY		0x044
+
+/* Queue notifier - Write Only */
+#define VIRTIO_MMIO_QUEUE_NOTIFY	0x050
+
+/* Interrupt status - Read Only */
+#define VIRTIO_MMIO_INTERRUPT_STATUS	0x060
+
+/* Interrupt acknowledge - Write Only */
+#define VIRTIO_MMIO_INTERRUPT_ACK	0x064
+
+/* Device status register - Read Write */
+#define VIRTIO_MMIO_STATUS		0x070
+
+/* Selected queue's Descriptor Table address, 64 bits in two halves */
+#define VIRTIO_MMIO_QUEUE_DESC_LOW	0x080
+#define VIRTIO_MMIO_QUEUE_DESC_HIGH	0x084
+
+/* Selected queue's Available Ring address, 64 bits in two halves */
+#define VIRTIO_MMIO_QUEUE_AVAIL_LOW	0x090
+#define VIRTIO_MMIO_QUEUE_AVAIL_HIGH	0x094
+
+/* Selected queue's Used Ring address, 64 bits in two halves */
+#define VIRTIO_MMIO_QUEUE_USED_LOW	0x0a0
+#define VIRTIO_MMIO_QUEUE_USED_HIGH	0x0a4
+
+/* Configuration atomicity value */
+#define VIRTIO_MMIO_CONFIG_GENERATION	0x0fc
+
+/* The config space is defined by each driver as
+ * the per-driver configuration space - Read Write */
+#define VIRTIO_MMIO_CONFIG		0x100
+
+
+
+/*
+ * Interrupt flags (re: interrupt status & acknowledge registers)
+ */
+
+#define VIRTIO_MMIO_INT_VRING		(1 << 0)
+#define VIRTIO_MMIO_INT_CONFIG		(1 << 1)
+
+#endif
-- 
cgit v1.2.3


From 22ad0b6ab46683975c6da032f1c2593066c7b3bd Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Thu, 9 Feb 2017 10:38:09 -0800
Subject: f2fs: add bitmaps for empty or full NAT blocks

This patches adds bitmaps to represent empty or full NAT blocks containing
free nid entries.

If we can find valid crc|cp_ver in the last block of checkpoint pack, we'll
use these bitmaps when building free nids. In order to avoid checkpointing
burden, up-to-date bitmaps will be flushed only during umount time. So,
normally we can get this gain, but when power-cut happens, we rely on fsck.f2fs
which recovers this bitmap again.

After this patch, we build free nids from nid #0 at mount time to make more
full NAT blocks, but in runtime, we check empty NAT blocks to load free nids
without loading any NAT pages from disk.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/checkpoint.c    |  28 +++++++-
 fs/f2fs/debug.c         |   1 +
 fs/f2fs/f2fs.h          |  31 +++++++-
 fs/f2fs/node.c          | 188 +++++++++++++++++++++++++++++++++++++++++++-----
 fs/f2fs/segment.c       |   2 +-
 include/linux/f2fs_fs.h |   1 +
 6 files changed, 231 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 042f8d9afe44..cd7132121573 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -1024,6 +1024,10 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 
 	spin_lock(&sbi->cp_lock);
 
+	if (cpc->reason == CP_UMOUNT && ckpt->cp_pack_total_block_count >
+			sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks)
+		disable_nat_bits(sbi, false);
+
 	if (cpc->reason == CP_UMOUNT)
 		__set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
 	else
@@ -1136,6 +1140,28 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 
 	start_blk = __start_cp_next_addr(sbi);
 
+	/* write nat bits */
+	if (enabled_nat_bits(sbi, cpc)) {
+		__u64 cp_ver = cur_cp_version(ckpt);
+		unsigned int i;
+		block_t blk;
+
+		cp_ver |= ((__u64)crc32 << 32);
+		*(__le64 *)nm_i->nat_bits = cpu_to_le64(cp_ver);
+
+		blk = start_blk + sbi->blocks_per_seg - nm_i->nat_bits_blocks;
+		for (i = 0; i < nm_i->nat_bits_blocks; i++)
+			update_meta_page(sbi, nm_i->nat_bits +
+					(i << F2FS_BLKSIZE_BITS), blk + i);
+
+		/* Flush all the NAT BITS pages */
+		while (get_pages(sbi, F2FS_DIRTY_META)) {
+			sync_meta_pages(sbi, META, LONG_MAX);
+			if (unlikely(f2fs_cp_error(sbi)))
+				return -EIO;
+		}
+	}
+
 	/* need to wait for end_io results */
 	wait_on_all_pages_writeback(sbi);
 	if (unlikely(f2fs_cp_error(sbi)))
@@ -1272,7 +1298,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);
 
 	/* write cached NAT/SIT entries to NAT/SIT area */
-	flush_nat_entries(sbi);
+	flush_nat_entries(sbi, cpc);
 	flush_sit_entries(sbi, cpc);
 
 	/* unlock all the fs_lock[] in do_checkpoint() */
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index de8da9fc5c99..015ad2b73a92 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -193,6 +193,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
 	/* build nm */
 	si->base_mem += sizeof(struct f2fs_nm_info);
 	si->base_mem += __bitmap_size(sbi, NAT_BITMAP);
+	si->base_mem += (NM_I(sbi)->nat_bits_blocks << F2FS_BLKSIZE_BITS);
 
 get_cache:
 	si->cache_mem = 0;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index e26cc6909a54..d1156cdd211e 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -554,6 +554,7 @@ struct f2fs_nm_info {
 	struct list_head nat_entries;	/* cached nat entry list (clean) */
 	unsigned int nat_cnt;		/* the # of cached nat entries */
 	unsigned int dirty_nat_cnt;	/* total num of nat entries in set */
+	unsigned int nat_blocks;	/* # of nat blocks */
 
 	/* free node ids management */
 	struct radix_tree_root free_nid_root;/* root of the free_nid cache */
@@ -564,6 +565,11 @@ struct f2fs_nm_info {
 
 	/* for checkpoint */
 	char *nat_bitmap;		/* NAT bitmap pointer */
+
+	unsigned int nat_bits_blocks;	/* # of nat bits blocks */
+	unsigned char *nat_bits;	/* NAT bits blocks */
+	unsigned char *full_nat_bits;	/* full NAT pages */
+	unsigned char *empty_nat_bits;	/* empty NAT pages */
 #ifdef CONFIG_F2FS_CHECK_FS
 	char *nat_bitmap_mir;		/* NAT bitmap mirror */
 #endif
@@ -1171,6 +1177,27 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f)
 	spin_unlock(&sbi->cp_lock);
 }
 
+static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock)
+{
+	set_sbi_flag(sbi, SBI_NEED_FSCK);
+
+	if (lock)
+		spin_lock(&sbi->cp_lock);
+	__clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG);
+	kfree(NM_I(sbi)->nat_bits);
+	NM_I(sbi)->nat_bits = NULL;
+	if (lock)
+		spin_unlock(&sbi->cp_lock);
+}
+
+static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi,
+					struct cp_control *cpc)
+{
+	bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
+
+	return (cpc) ? (cpc->reason == CP_UMOUNT) && set : set;
+}
+
 static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
 {
 	down_read(&sbi->cp_rwsem);
@@ -2131,7 +2158,7 @@ void move_node_page(struct page *node_page, int gc_type);
 int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
 			struct writeback_control *wbc, bool atomic);
 int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc);
-void build_free_nids(struct f2fs_sb_info *sbi, bool sync);
+void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount);
 bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid);
 void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid);
 void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid);
@@ -2142,7 +2169,7 @@ int recover_xattr_data(struct inode *inode, struct page *page,
 int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page);
 int restore_node_summary(struct f2fs_sb_info *sbi,
 			unsigned int segno, struct f2fs_summary_block *sum);
-void flush_nat_entries(struct f2fs_sb_info *sbi);
+void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc);
 int build_node_manager(struct f2fs_sb_info *sbi);
 void destroy_node_manager(struct f2fs_sb_info *sbi);
 int __init create_node_manager_caches(void);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 8ebc4c78e6a4..43d35ec11851 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -338,6 +338,9 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
 		set_nat_flag(e, IS_CHECKPOINTED, false);
 	__set_nat_cache_dirty(nm_i, e);
 
+	if (enabled_nat_bits(sbi, NULL) && new_blkaddr == NEW_ADDR)
+		clear_bit_le(NAT_BLOCK_OFFSET(ni->nid), nm_i->empty_nat_bits);
+
 	/* update fsync_mark if its inode nat entry is still alive */
 	if (ni->nid != ni->ino)
 		e = __lookup_nat_cache(nm_i, ni->ino);
@@ -1841,7 +1844,60 @@ static void scan_nat_page(struct f2fs_sb_info *sbi,
 	}
 }
 
-static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync)
+static int scan_nat_bits(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct page *page;
+	unsigned int i = 0;
+	nid_t target = FREE_NID_PAGES * NAT_ENTRY_PER_BLOCK;
+	nid_t nid;
+
+	if (!enabled_nat_bits(sbi, NULL))
+		return -EAGAIN;
+
+	down_read(&nm_i->nat_tree_lock);
+check_empty:
+	i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i);
+	if (i >= nm_i->nat_blocks) {
+		i = 0;
+		goto check_partial;
+	}
+
+	for (nid = i * NAT_ENTRY_PER_BLOCK; nid < (i + 1) * NAT_ENTRY_PER_BLOCK;
+									nid++) {
+		if (unlikely(nid >= nm_i->max_nid))
+			break;
+		add_free_nid(sbi, nid, true);
+	}
+
+	if (nm_i->nid_cnt[FREE_NID_LIST] >= target)
+		goto out;
+	i++;
+	goto check_empty;
+
+check_partial:
+	i = find_next_zero_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i);
+	if (i >= nm_i->nat_blocks) {
+		disable_nat_bits(sbi, true);
+		up_read(&nm_i->nat_tree_lock);
+		return -EINVAL;
+	}
+
+	nid = i * NAT_ENTRY_PER_BLOCK;
+	page = get_current_nat_page(sbi, nid);
+	scan_nat_page(sbi, page, nid);
+	f2fs_put_page(page, 1);
+
+	if (nm_i->nid_cnt[FREE_NID_LIST] < target) {
+		i++;
+		goto check_partial;
+	}
+out:
+	up_read(&nm_i->nat_tree_lock);
+	return 0;
+}
+
+static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
 {
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
@@ -1856,6 +1912,21 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync)
 	if (!sync && !available_free_memory(sbi, FREE_NIDS))
 		return;
 
+	/* try to find free nids with nat_bits */
+	if (!mount && !scan_nat_bits(sbi) && nm_i->nid_cnt[FREE_NID_LIST])
+		return;
+
+	/* find next valid candidate */
+	if (enabled_nat_bits(sbi, NULL)) {
+		int idx = find_next_zero_bit_le(nm_i->full_nat_bits,
+					nm_i->nat_blocks, 0);
+
+		if (idx >= nm_i->nat_blocks)
+			set_sbi_flag(sbi, SBI_NEED_FSCK);
+		else
+			nid = idx * NAT_ENTRY_PER_BLOCK;
+	}
+
 	/* readahead nat pages to be scanned */
 	ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
 							META_NAT, true);
@@ -1898,10 +1969,10 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync)
 					nm_i->ra_nid_pages, META_NAT, false);
 }
 
-void build_free_nids(struct f2fs_sb_info *sbi, bool sync)
+void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
 {
 	mutex_lock(&NM_I(sbi)->build_lock);
-	__build_free_nids(sbi, sync);
+	__build_free_nids(sbi, sync, mount);
 	mutex_unlock(&NM_I(sbi)->build_lock);
 }
 
@@ -1943,7 +2014,7 @@ retry:
 	spin_unlock(&nm_i->nid_list_lock);
 
 	/* Let's scan nat pages and its caches to get free nids */
-	build_free_nids(sbi, true);
+	build_free_nids(sbi, true, false);
 	goto retry;
 }
 
@@ -2235,8 +2306,39 @@ add_out:
 	list_add_tail(&nes->set_list, head);
 }
 
+void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
+						struct page *page)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK;
+	struct f2fs_nat_block *nat_blk = page_address(page);
+	int valid = 0;
+	int i;
+
+	if (!enabled_nat_bits(sbi, NULL))
+		return;
+
+	for (i = 0; i < NAT_ENTRY_PER_BLOCK; i++) {
+		if (start_nid == 0 && i == 0)
+			valid++;
+		if (nat_blk->entries[i].block_addr)
+			valid++;
+	}
+	if (valid == 0) {
+		set_bit_le(nat_index, nm_i->empty_nat_bits);
+		clear_bit_le(nat_index, nm_i->full_nat_bits);
+		return;
+	}
+
+	clear_bit_le(nat_index, nm_i->empty_nat_bits);
+	if (valid == NAT_ENTRY_PER_BLOCK)
+		set_bit_le(nat_index, nm_i->full_nat_bits);
+	else
+		clear_bit_le(nat_index, nm_i->full_nat_bits);
+}
+
 static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
-					struct nat_entry_set *set)
+		struct nat_entry_set *set, struct cp_control *cpc)
 {
 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
 	struct f2fs_journal *journal = curseg->journal;
@@ -2251,7 +2353,8 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
 	 * #1, flush nat entries to journal in current hot data summary block.
 	 * #2, flush nat entries to nat page.
 	 */
-	if (!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))
+	if (enabled_nat_bits(sbi, cpc) ||
+		!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))
 		to_journal = false;
 
 	if (to_journal) {
@@ -2291,10 +2394,12 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
 		}
 	}
 
-	if (to_journal)
+	if (to_journal) {
 		up_write(&curseg->journal_rwsem);
-	else
+	} else {
+		__update_nat_bits(sbi, start_nid, page);
 		f2fs_put_page(page, 1);
+	}
 
 	f2fs_bug_on(sbi, set->entry_cnt);
 
@@ -2305,7 +2410,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
 /*
  * This function is called during the checkpointing process.
  */
-void flush_nat_entries(struct f2fs_sb_info *sbi)
+void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 {
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
@@ -2326,7 +2431,8 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
 	 * entries, remove all entries from journal and merge them
 	 * into nat entry set.
 	 */
-	if (!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL))
+	if (cpc->reason == CP_UMOUNT ||
+		!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL))
 		remove_nats_in_journal(sbi);
 
 	while ((found = __gang_lookup_nat_set(nm_i,
@@ -2340,27 +2446,72 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
 
 	/* flush dirty nats in nat entry set */
 	list_for_each_entry_safe(set, tmp, &sets, set_list)
-		__flush_nat_entry_set(sbi, set);
+		__flush_nat_entry_set(sbi, set, cpc);
 
 	up_write(&nm_i->nat_tree_lock);
 
 	f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
 }
 
+static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	unsigned int nat_bits_bytes = nm_i->nat_blocks / BITS_PER_BYTE;
+	unsigned int i;
+	__u64 cp_ver = cur_cp_version(ckpt);
+	size_t crc_offset = le32_to_cpu(ckpt->checksum_offset);
+	__u64 crc = le32_to_cpu(*((__le32 *)
+				((unsigned char *)ckpt + crc_offset)));
+	block_t nat_bits_addr;
+
+	if (!enabled_nat_bits(sbi, NULL))
+		return 0;
+
+	nm_i->nat_bits_blocks = F2FS_BYTES_TO_BLK((nat_bits_bytes << 1) + 8 +
+						F2FS_BLKSIZE - 1);
+	nm_i->nat_bits = kzalloc(nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS,
+						GFP_KERNEL);
+	if (!nm_i->nat_bits)
+		return -ENOMEM;
+
+	nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg -
+						nm_i->nat_bits_blocks;
+	for (i = 0; i < nm_i->nat_bits_blocks; i++) {
+		struct page *page = get_meta_page(sbi, nat_bits_addr++);
+
+		memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS),
+					page_address(page), F2FS_BLKSIZE);
+		f2fs_put_page(page, 1);
+	}
+
+	cp_ver |= (crc << 32);
+	if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) {
+		disable_nat_bits(sbi, true);
+		return 0;
+	}
+
+	nm_i->full_nat_bits = nm_i->nat_bits + 8;
+	nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes;
+
+	f2fs_msg(sbi->sb, KERN_NOTICE, "Found nat_bits in checkpoint");
+	return 0;
+}
+
 static int init_node_manager(struct f2fs_sb_info *sbi)
 {
 	struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi);
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 	unsigned char *version_bitmap;
-	unsigned int nat_segs, nat_blocks;
+	unsigned int nat_segs;
+	int err;
 
 	nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
 
 	/* segment_count_nat includes pair segment so divide to 2. */
 	nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1;
-	nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
-
-	nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks;
+	nm_i->nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
+	nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nm_i->nat_blocks;
 
 	/* not used nids: 0, node, meta, (and root counted as valid node) */
 	nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count -
@@ -2394,6 +2545,10 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
 	if (!nm_i->nat_bitmap)
 		return -ENOMEM;
 
+	err = __get_nat_bitmaps(sbi);
+	if (err)
+		return err;
+
 #ifdef CONFIG_F2FS_CHECK_FS
 	nm_i->nat_bitmap_mir = kmemdup(version_bitmap, nm_i->bitmap_size,
 					GFP_KERNEL);
@@ -2416,7 +2571,7 @@ int build_node_manager(struct f2fs_sb_info *sbi)
 	if (err)
 		return err;
 
-	build_free_nids(sbi, true);
+	build_free_nids(sbi, true, true);
 	return 0;
 }
 
@@ -2475,6 +2630,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
 	up_write(&nm_i->nat_tree_lock);
 
 	kfree(nm_i->nat_bitmap);
+	kfree(nm_i->nat_bits);
 #ifdef CONFIG_F2FS_CHECK_FS
 	kfree(nm_i->nat_bitmap_mir);
 #endif
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 3e95db5375ed..9eb6d89bf9e2 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -386,7 +386,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
 	if (!available_free_memory(sbi, FREE_NIDS))
 		try_to_free_nids(sbi, MAX_FREE_NIDS);
 	else
-		build_free_nids(sbi, false);
+		build_free_nids(sbi, false, false);
 
 	if (!is_idle(sbi))
 		return;
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index f0748524ca8c..1c92ace2e8f8 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -114,6 +114,7 @@ struct f2fs_super_block {
 /*
  * For checkpoint
  */
+#define CP_NAT_BITS_FLAG	0x00000080
 #define CP_CRC_RECOVERY_FLAG	0x00000040
 #define CP_FASTBOOT_FLAG	0x00000020
 #define CP_FSCK_FLAG		0x00000010
-- 
cgit v1.2.3


From 4ac912427c4214d8031d9ad6fbc3bc75e71512df Mon Sep 17 00:00:00 2001
From: Chao Yu <yuchao0@huawei.com>
Date: Thu, 23 Feb 2017 10:53:49 +0800
Subject: f2fs: introduce free nid bitmap

In scenario of intensively node allocation, free nids will be ran out
soon, then it needs to stop to load free nids by traversing NAT blocks,
in worse case, if NAT blocks does not be cached in memory, it generates
IOs which slows down our foreground operations.

In order to speed up node allocation, in this patch we introduce a new
free_nid_bitmap array, so there is an bitmap table for each NAT block,
Once the NAT block is loaded, related bitmap cache will be switched on,
and bitmap will be set during traversing nat entries in NAT block, later
we can query and update nid usage status in memory completely.

With such implementation, I expect performance of node allocation can be
improved in the long-term after filesystem image is mounted.

Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/debug.c         |   2 +
 fs/f2fs/f2fs.h          |   2 +
 fs/f2fs/node.c          | 125 ++++++++++++++++++++++++++++++++++++++++++++----
 include/linux/f2fs_fs.h |   1 +
 4 files changed, 120 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 015ad2b73a92..a77df377e2e8 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -194,6 +194,8 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
 	si->base_mem += sizeof(struct f2fs_nm_info);
 	si->base_mem += __bitmap_size(sbi, NAT_BITMAP);
 	si->base_mem += (NM_I(sbi)->nat_bits_blocks << F2FS_BLKSIZE_BITS);
+	si->base_mem += NM_I(sbi)->nat_blocks * NAT_ENTRY_BITMAP_SIZE;
+	si->base_mem += NM_I(sbi)->nat_blocks / 8;
 
 get_cache:
 	si->cache_mem = 0;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 4d332b396384..7e2924981eeb 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -555,6 +555,8 @@ struct f2fs_nm_info {
 	unsigned int nid_cnt[MAX_NID_LIST];	/* the number of free node id */
 	spinlock_t nid_list_lock;	/* protect nid lists ops */
 	struct mutex build_lock;	/* lock for build free nids */
+	unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE];
+	unsigned char *nat_block_bitmap;
 
 	/* for checkpoint */
 	char *nat_bitmap;		/* NAT bitmap pointer */
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 8e53181d5db1..fd12a32f7e87 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1765,7 +1765,8 @@ static void __remove_nid_from_list(struct f2fs_sb_info *sbi,
 		radix_tree_delete(&nm_i->free_nid_root, i->nid);
 }
 
-static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
+/* return if the nid is recognized as free */
+static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
 {
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 	struct free_nid *i;
@@ -1774,14 +1775,14 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
 
 	/* 0 nid should not be used */
 	if (unlikely(nid == 0))
-		return 0;
+		return false;
 
 	if (build) {
 		/* do not add allocated nids */
 		ne = __lookup_nat_cache(nm_i, nid);
 		if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) ||
 				nat_get_blkaddr(ne) != NULL_ADDR))
-			return 0;
+			return false;
 	}
 
 	i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS);
@@ -1790,7 +1791,7 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
 
 	if (radix_tree_preload(GFP_NOFS)) {
 		kmem_cache_free(free_nid_slab, i);
-		return 0;
+		return true;
 	}
 
 	spin_lock(&nm_i->nid_list_lock);
@@ -1799,9 +1800,9 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
 	radix_tree_preload_end();
 	if (err) {
 		kmem_cache_free(free_nid_slab, i);
-		return 0;
+		return true;
 	}
-	return 1;
+	return true;
 }
 
 static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid)
@@ -1822,17 +1823,36 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid)
 		kmem_cache_free(free_nid_slab, i);
 }
 
+void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, bool set)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid);
+	unsigned int nid_ofs = nid - START_NID(nid);
+
+	if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap))
+		return;
+
+	if (set)
+		set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]);
+	else
+		clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]);
+}
+
 static void scan_nat_page(struct f2fs_sb_info *sbi,
 			struct page *nat_page, nid_t start_nid)
 {
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 	struct f2fs_nat_block *nat_blk = page_address(nat_page);
 	block_t blk_addr;
+	unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid);
 	int i;
 
+	set_bit_le(nat_ofs, nm_i->nat_block_bitmap);
+
 	i = start_nid % NAT_ENTRY_PER_BLOCK;
 
 	for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) {
+		bool freed = false;
 
 		if (unlikely(start_nid >= nm_i->max_nid))
 			break;
@@ -1840,8 +1860,52 @@ static void scan_nat_page(struct f2fs_sb_info *sbi,
 		blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
 		f2fs_bug_on(sbi, blk_addr == NEW_ADDR);
 		if (blk_addr == NULL_ADDR)
-			add_free_nid(sbi, start_nid, true);
+			freed = add_free_nid(sbi, start_nid, true);
+		update_free_nid_bitmap(sbi, start_nid, freed);
+	}
+}
+
+static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+	struct f2fs_journal *journal = curseg->journal;
+	unsigned int i, idx;
+	unsigned int target = FREE_NID_PAGES * NAT_ENTRY_PER_BLOCK;
+
+	down_read(&nm_i->nat_tree_lock);
+
+	for (i = 0; i < nm_i->nat_blocks; i++) {
+		if (!test_bit_le(i, nm_i->nat_block_bitmap))
+			continue;
+		for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) {
+			nid_t nid;
+
+			if (!test_bit_le(idx, nm_i->free_nid_bitmap[i]))
+				continue;
+
+			nid = i * NAT_ENTRY_PER_BLOCK + idx;
+			add_free_nid(sbi, nid, true);
+
+			if (nm_i->nid_cnt[FREE_NID_LIST] >= target)
+				goto out;
+		}
+	}
+out:
+	down_read(&curseg->journal_rwsem);
+	for (i = 0; i < nats_in_cursum(journal); i++) {
+		block_t addr;
+		nid_t nid;
+
+		addr = le32_to_cpu(nat_in_journal(journal, i).block_addr);
+		nid = le32_to_cpu(nid_in_journal(journal, i));
+		if (addr == NULL_ADDR)
+			add_free_nid(sbi, nid, true);
+		else
+			remove_free_nid(sbi, nid);
 	}
+	up_read(&curseg->journal_rwsem);
+	up_read(&nm_i->nat_tree_lock);
 }
 
 static int scan_nat_bits(struct f2fs_sb_info *sbi)
@@ -1912,9 +1976,17 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
 	if (!sync && !available_free_memory(sbi, FREE_NIDS))
 		return;
 
-	/* try to find free nids with nat_bits */
-	if (!mount && !scan_nat_bits(sbi) && nm_i->nid_cnt[FREE_NID_LIST])
-		return;
+	if (!mount) {
+		/* try to find free nids in free_nid_bitmap */
+		scan_free_nid_bits(sbi);
+
+		if (nm_i->nid_cnt[FREE_NID_LIST])
+			return;
+
+		/* try to find free nids with nat_bits */
+		if (!scan_nat_bits(sbi) && nm_i->nid_cnt[FREE_NID_LIST])
+			return;
+	}
 
 	/* find next valid candidate */
 	if (enabled_nat_bits(sbi, NULL)) {
@@ -2010,6 +2082,9 @@ retry:
 		i->state = NID_ALLOC;
 		__insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false);
 		nm_i->available_nids--;
+
+		update_free_nid_bitmap(sbi, *nid, false);
+
 		spin_unlock(&nm_i->nid_list_lock);
 		return true;
 	}
@@ -2064,6 +2139,8 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
 
 	nm_i->available_nids++;
 
+	update_free_nid_bitmap(sbi, nid, true);
+
 	spin_unlock(&nm_i->nid_list_lock);
 
 	if (need_free)
@@ -2392,6 +2469,11 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
 			add_free_nid(sbi, nid, false);
 			spin_lock(&NM_I(sbi)->nid_list_lock);
 			NM_I(sbi)->available_nids++;
+			update_free_nid_bitmap(sbi, nid, true);
+			spin_unlock(&NM_I(sbi)->nid_list_lock);
+		} else {
+			spin_lock(&NM_I(sbi)->nid_list_lock);
+			update_free_nid_bitmap(sbi, nid, false);
 			spin_unlock(&NM_I(sbi)->nid_list_lock);
 		}
 	}
@@ -2558,6 +2640,22 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
 	return 0;
 }
 
+int init_free_nid_cache(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+
+	nm_i->free_nid_bitmap = f2fs_kvzalloc(nm_i->nat_blocks *
+					NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL);
+	if (!nm_i->free_nid_bitmap)
+		return -ENOMEM;
+
+	nm_i->nat_block_bitmap = f2fs_kvzalloc(nm_i->nat_blocks / 8,
+								GFP_KERNEL);
+	if (!nm_i->nat_block_bitmap)
+		return -ENOMEM;
+	return 0;
+}
+
 int build_node_manager(struct f2fs_sb_info *sbi)
 {
 	int err;
@@ -2570,6 +2668,10 @@ int build_node_manager(struct f2fs_sb_info *sbi)
 	if (err)
 		return err;
 
+	err = init_free_nid_cache(sbi);
+	if (err)
+		return err;
+
 	build_free_nids(sbi, true, true);
 	return 0;
 }
@@ -2628,6 +2730,9 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
 	}
 	up_write(&nm_i->nat_tree_lock);
 
+	kvfree(nm_i->nat_block_bitmap);
+	kvfree(nm_i->free_nid_bitmap);
+
 	kfree(nm_i->nat_bitmap);
 	kfree(nm_i->nat_bits);
 #ifdef CONFIG_F2FS_CHECK_FS
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 1c92ace2e8f8..e2d239ed4c60 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -279,6 +279,7 @@ struct f2fs_node {
  * For NAT entries
  */
 #define NAT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_nat_entry))
+#define NAT_ENTRY_BITMAP_SIZE	((NAT_ENTRY_PER_BLOCK + 7) / 8)
 
 struct f2fs_nat_entry {
 	__u8 version;		/* latest version of cached nat entry */
-- 
cgit v1.2.3


From fb5e31d970ce8b4941f03ed765d7dbefc39f22d9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 5 Feb 2017 18:15:22 +0100
Subject: virtio: allow drivers to request IRQ affinity when creating VQs

Add a struct irq_affinity pointer to the find_vqs methods, which if set
is used to tell the PCI layer to create the MSI-X vectors for our I/O
virtqueues with the proper affinity from the start.  Compared to after
the fact affinity hints this gives us an instantly working setup and
allows to allocate the irq descritors node-local and avoid interconnect
traffic.  Last but not least this will allow blk-mq queues are created
based on the interrupt affinity for storage drivers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/block/virtio_blk.c                 |  3 ++-
 drivers/char/virtio_console.c              |  2 +-
 drivers/crypto/virtio/virtio_crypto_core.c |  2 +-
 drivers/gpu/drm/virtio/virtgpu_kms.c       |  2 +-
 drivers/misc/mic/vop/vop_main.c            |  2 +-
 drivers/net/caif/caif_virtio.c             |  3 ++-
 drivers/net/virtio_net.c                   |  2 +-
 drivers/remoteproc/remoteproc_virtio.c     |  3 ++-
 drivers/rpmsg/virtio_rpmsg_bus.c           |  2 +-
 drivers/s390/virtio/kvm_virtio.c           |  3 ++-
 drivers/s390/virtio/virtio_ccw.c           |  3 ++-
 drivers/scsi/virtio_scsi.c                 |  3 ++-
 drivers/virtio/virtio_balloon.c            |  3 ++-
 drivers/virtio/virtio_input.c              |  3 ++-
 drivers/virtio/virtio_mmio.c               |  3 ++-
 drivers/virtio/virtio_pci_common.c         | 19 ++++++++++++-------
 drivers/virtio/virtio_pci_common.h         |  5 ++---
 drivers/virtio/virtio_pci_modern.c         |  7 +++----
 include/linux/virtio_config.h              |  9 +++++----
 net/vmw_vsock/virtio_transport.c           |  3 ++-
 20 files changed, 48 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 10332c24f961..c54118bdc67d 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -411,7 +411,8 @@ static int init_vq(struct virtio_blk *vblk)
 	}
 
 	/* Discover virtqueues and write information to configuration.  */
-	err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names);
+	err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names,
+			NULL);
 	if (err)
 		goto out;
 
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index 17857beb4892..6266c0568e1d 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -1939,7 +1939,7 @@ static int init_vqs(struct ports_device *portdev)
 	/* Find the queues. */
 	err = portdev->vdev->config->find_vqs(portdev->vdev, nr_queues, vqs,
 					      io_callbacks,
-					      (const char **)io_names);
+					      (const char **)io_names, NULL);
 	if (err)
 		goto free;
 
diff --git a/drivers/crypto/virtio/virtio_crypto_core.c b/drivers/crypto/virtio/virtio_crypto_core.c
index fe70ec823b27..0aa2f045543b 100644
--- a/drivers/crypto/virtio/virtio_crypto_core.c
+++ b/drivers/crypto/virtio/virtio_crypto_core.c
@@ -119,7 +119,7 @@ static int virtcrypto_find_vqs(struct virtio_crypto *vi)
 	}
 
 	ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
-					 names);
+					 names, NULL);
 	if (ret)
 		goto err_find;
 
diff --git a/drivers/gpu/drm/virtio/virtgpu_kms.c b/drivers/gpu/drm/virtio/virtgpu_kms.c
index 1235519853f4..e975fa5b0a32 100644
--- a/drivers/gpu/drm/virtio/virtgpu_kms.c
+++ b/drivers/gpu/drm/virtio/virtgpu_kms.c
@@ -172,7 +172,7 @@ int virtio_gpu_driver_load(struct drm_device *dev, unsigned long flags)
 		 vgdev->has_virgl_3d ? "enabled" : "not available");
 
 	ret = vgdev->vdev->config->find_vqs(vgdev->vdev, 2, vqs,
-					    callbacks, names);
+					    callbacks, names, NULL);
 	if (ret) {
 		DRM_ERROR("failed to find virt queues\n");
 		goto err_vqs;
diff --git a/drivers/misc/mic/vop/vop_main.c b/drivers/misc/mic/vop/vop_main.c
index 1a2b67f3183d..c2e29d7f0de8 100644
--- a/drivers/misc/mic/vop/vop_main.c
+++ b/drivers/misc/mic/vop/vop_main.c
@@ -374,7 +374,7 @@ unmap:
 static int vop_find_vqs(struct virtio_device *dev, unsigned nvqs,
 			struct virtqueue *vqs[],
 			vq_callback_t *callbacks[],
-			const char * const names[])
+			const char * const names[], struct irq_affinity *desc)
 {
 	struct _vop_vdev *vdev = to_vopvdev(dev);
 	struct vop_device *vpdev = vdev->vpdev;
diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c
index b306210b02b7..bc0eb47eccee 100644
--- a/drivers/net/caif/caif_virtio.c
+++ b/drivers/net/caif/caif_virtio.c
@@ -679,7 +679,8 @@ static int cfv_probe(struct virtio_device *vdev)
 		goto err;
 
 	/* Get the TX virtio ring. This is a "guest side vring". */
-	err = vdev->config->find_vqs(vdev, 1, &cfv->vq_tx, &vq_cbs, &names);
+	err = vdev->config->find_vqs(vdev, 1, &cfv->vq_tx, &vq_cbs, &names,
+			NULL);
 	if (err)
 		goto err;
 
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 765c2d6358da..9be74c2dfb22 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2003,7 +2003,7 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
 	}
 
 	ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
-					 names);
+					 names, NULL);
 	if (ret)
 		goto err_find;
 
diff --git a/drivers/remoteproc/remoteproc_virtio.c b/drivers/remoteproc/remoteproc_virtio.c
index 364411fb7734..0142cc3f0c91 100644
--- a/drivers/remoteproc/remoteproc_virtio.c
+++ b/drivers/remoteproc/remoteproc_virtio.c
@@ -137,7 +137,8 @@ static void rproc_virtio_del_vqs(struct virtio_device *vdev)
 static int rproc_virtio_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 				 struct virtqueue *vqs[],
 				 vq_callback_t *callbacks[],
-				 const char * const names[])
+				 const char * const names[],
+				 struct irq_affinity *desc)
 {
 	int i, ret;
 
diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c
index 3090b0d3072f..5e66e081027e 100644
--- a/drivers/rpmsg/virtio_rpmsg_bus.c
+++ b/drivers/rpmsg/virtio_rpmsg_bus.c
@@ -869,7 +869,7 @@ static int rpmsg_probe(struct virtio_device *vdev)
 	init_waitqueue_head(&vrp->sendq);
 
 	/* We expect two virtqueues, rx and tx (and in this order) */
-	err = vdev->config->find_vqs(vdev, 2, vqs, vq_cbs, names);
+	err = vdev->config->find_vqs(vdev, 2, vqs, vq_cbs, names, NULL);
 	if (err)
 		goto free_vrp;
 
diff --git a/drivers/s390/virtio/kvm_virtio.c b/drivers/s390/virtio/kvm_virtio.c
index 5e5c11f37b24..2ce0b3eb2efe 100644
--- a/drivers/s390/virtio/kvm_virtio.c
+++ b/drivers/s390/virtio/kvm_virtio.c
@@ -255,7 +255,8 @@ static void kvm_del_vqs(struct virtio_device *vdev)
 static int kvm_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 			struct virtqueue *vqs[],
 			vq_callback_t *callbacks[],
-			const char * const names[])
+			const char * const names[],
+			struct irq_affinity *desc)
 {
 	struct kvm_device *kdev = to_kvmdev(vdev);
 	int i;
diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
index 070c4da95f48..304d3b3cbfd3 100644
--- a/drivers/s390/virtio/virtio_ccw.c
+++ b/drivers/s390/virtio/virtio_ccw.c
@@ -628,7 +628,8 @@ out:
 static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 			       struct virtqueue *vqs[],
 			       vq_callback_t *callbacks[],
-			       const char * const names[])
+			       const char * const names[],
+			       struct irq_affinity *desc)
 {
 	struct virtio_ccw_device *vcdev = to_vc_device(vdev);
 	unsigned long *indicatorp = NULL;
diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index c680d7641311..c9c5ea0611e9 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -941,7 +941,8 @@ static int virtscsi_init(struct virtio_device *vdev,
 	}
 
 	/* Discover virtqueues and write information to configuration.  */
-	err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names);
+	err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names,
+			NULL);
 	if (err)
 		goto out;
 
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 181793f07852..36c9c8fcb7f8 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -413,7 +413,8 @@ static int init_vqs(struct virtio_balloon *vb)
 	 * optionally stat.
 	 */
 	nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 3 : 2;
-	err = vb->vdev->config->find_vqs(vb->vdev, nvqs, vqs, callbacks, names);
+	err = vb->vdev->config->find_vqs(vb->vdev, nvqs, vqs, callbacks, names,
+			NULL);
 	if (err)
 		return err;
 
diff --git a/drivers/virtio/virtio_input.c b/drivers/virtio/virtio_input.c
index 350a2a5a49db..79f1293cda93 100644
--- a/drivers/virtio/virtio_input.c
+++ b/drivers/virtio/virtio_input.c
@@ -173,7 +173,8 @@ static int virtinput_init_vqs(struct virtio_input *vi)
 	static const char * const names[] = { "events", "status" };
 	int err;
 
-	err = vi->vdev->config->find_vqs(vi->vdev, 2, vqs, cbs, names);
+	err = vi->vdev->config->find_vqs(vi->vdev, 2, vqs, cbs, names,
+			NULL);
 	if (err)
 		return err;
 	vi->evt = vqs[0];
diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
index 08357d70a891..78343b8f9034 100644
--- a/drivers/virtio/virtio_mmio.c
+++ b/drivers/virtio/virtio_mmio.c
@@ -446,7 +446,8 @@ error_available:
 static int vm_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 		       struct virtqueue *vqs[],
 		       vq_callback_t *callbacks[],
-		       const char * const names[])
+		       const char * const names[],
+		       struct irq_affinity *desc)
 {
 	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
 	unsigned int irq = platform_get_irq(vm_dev->pdev, 0);
diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index 822f8e5dcee4..7902e920fc73 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -143,22 +143,28 @@ void vp_del_vqs(struct virtio_device *vdev)
 
 static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned nvqs,
 		struct virtqueue *vqs[], vq_callback_t *callbacks[],
-		const char * const names[])
+		const char * const names[], struct irq_affinity *desc)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 	const char *name = dev_name(&vp_dev->vdev.dev);
 	int i, err = -ENOMEM, allocated_vectors, nvectors;
+	unsigned flags = PCI_IRQ_MSIX;
 	bool shared = false;
 	u16 msix_vec;
 
+	if (desc) {
+		flags |= PCI_IRQ_AFFINITY;
+		desc->pre_vectors++; /* virtio config vector */
+	}
+
 	nvectors = 1;
 	for (i = 0; i < nvqs; i++)
 		if (callbacks[i])
 			nvectors++;
 
 	/* Try one vector per queue first. */
-	err = pci_alloc_irq_vectors(vp_dev->pci_dev, nvectors, nvectors,
-			PCI_IRQ_MSIX);
+	err = pci_alloc_irq_vectors_affinity(vp_dev->pci_dev, nvectors,
+			nvectors, flags, desc);
 	if (err < 0) {
 		/* Fallback to one vector for config, one shared for queues. */
 		shared = true;
@@ -308,13 +314,12 @@ out_remove_vqs:
 
 /* the config->find_vqs() implementation */
 int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs,
-		struct virtqueue *vqs[],
-		vq_callback_t *callbacks[],
-		const char * const names[])
+		struct virtqueue *vqs[], vq_callback_t *callbacks[],
+		const char * const names[], struct irq_affinity *desc)
 {
 	int err;
 
-	err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names);
+	err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, desc);
 	if (!err)
 		return 0;
 	return vp_find_vqs_intx(vdev, nvqs, vqs, callbacks, names);
diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h
index 217ca876eed7..a6ad9ec6baef 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -97,9 +97,8 @@ bool vp_notify(struct virtqueue *vq);
 void vp_del_vqs(struct virtio_device *vdev);
 /* the config->find_vqs() implementation */
 int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs,
-		       struct virtqueue *vqs[],
-		       vq_callback_t *callbacks[],
-		       const char * const names[]);
+		struct virtqueue *vqs[], vq_callback_t *callbacks[],
+		const char * const names[], struct irq_affinity *desc);
 const char *vp_bus_name(struct virtio_device *vdev);
 
 /* Setup the affinity for a virtqueue:
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index e5ce31091953..a7a0981e441c 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -384,13 +384,12 @@ err_map_notify:
 }
 
 static int vp_modern_find_vqs(struct virtio_device *vdev, unsigned nvqs,
-			      struct virtqueue *vqs[],
-			      vq_callback_t *callbacks[],
-			      const char * const names[])
+		struct virtqueue *vqs[], vq_callback_t *callbacks[],
+		const char * const names[], struct irq_affinity *desc)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 	struct virtqueue *vq;
-	int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names);
+	int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names, desc);
 
 	if (rc)
 		return rc;
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 26c155bb639b..2ebe506fe41a 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -7,6 +7,8 @@
 #include <linux/virtio_byteorder.h>
 #include <uapi/linux/virtio_config.h>
 
+struct irq_affinity;
+
 /**
  * virtio_config_ops - operations for configuring a virtio device
  * @get: read the value of a configuration field
@@ -68,9 +70,8 @@ struct virtio_config_ops {
 	void (*set_status)(struct virtio_device *vdev, u8 status);
 	void (*reset)(struct virtio_device *vdev);
 	int (*find_vqs)(struct virtio_device *, unsigned nvqs,
-			struct virtqueue *vqs[],
-			vq_callback_t *callbacks[],
-			const char * const names[]);
+			struct virtqueue *vqs[], vq_callback_t *callbacks[],
+			const char * const names[], struct irq_affinity *desc);
 	void (*del_vqs)(struct virtio_device *);
 	u64 (*get_features)(struct virtio_device *vdev);
 	int (*finalize_features)(struct virtio_device *vdev);
@@ -169,7 +170,7 @@ struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev,
 	vq_callback_t *callbacks[] = { c };
 	const char *names[] = { n };
 	struct virtqueue *vq;
-	int err = vdev->config->find_vqs(vdev, 1, &vq, callbacks, names);
+	int err = vdev->config->find_vqs(vdev, 1, &vq, callbacks, names, NULL);
 	if (err < 0)
 		return ERR_PTR(err);
 	return vq;
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 6788264acc63..9d24c0e958b1 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -532,7 +532,8 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
 	vsock->vdev = vdev;
 
 	ret = vsock->vdev->config->find_vqs(vsock->vdev, VSOCK_VQ_MAX,
-					    vsock->vqs, callbacks, names);
+					    vsock->vqs, callbacks, names,
+					    NULL);
 	if (ret < 0)
 		goto out;
 
-- 
cgit v1.2.3


From bbaba479563910aaa51e59bb9027a09e396d3a3c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 5 Feb 2017 18:15:23 +0100
Subject: virtio: provide a method to get the IRQ affinity mask for a virtqueue

This basically passed up the pci_irq_get_affinity information through
virtio through an optional get_vq_affinity method.  It is only implemented
by the PCI backend for now, and only when we use per-virtqueue IRQs.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_pci_common.c | 11 +++++++++++
 drivers/virtio/virtio_pci_common.h |  2 ++
 drivers/virtio/virtio_pci_legacy.c |  1 +
 drivers/virtio/virtio_pci_modern.c |  2 ++
 include/linux/virtio_config.h      |  3 +++
 5 files changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index 7902e920fc73..df548a6fb844 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -361,6 +361,17 @@ int vp_set_vq_affinity(struct virtqueue *vq, int cpu)
 	return 0;
 }
 
+const struct cpumask *vp_get_vq_affinity(struct virtio_device *vdev, int index)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	unsigned int *map = vp_dev->msix_vector_map;
+
+	if (!map || map[index] == VIRTIO_MSI_NO_VECTOR)
+		return NULL;
+
+	return pci_irq_get_affinity(vp_dev->pci_dev, map[index]);
+}
+
 #ifdef CONFIG_PM_SLEEP
 static int virtio_pci_freeze(struct device *dev)
 {
diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h
index a6ad9ec6baef..ac8c9d788964 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -108,6 +108,8 @@ const char *vp_bus_name(struct virtio_device *vdev);
  */
 int vp_set_vq_affinity(struct virtqueue *vq, int cpu);
 
+const struct cpumask *vp_get_vq_affinity(struct virtio_device *vdev, int index);
+
 #if IS_ENABLED(CONFIG_VIRTIO_PCI_LEGACY)
 int virtio_pci_legacy_probe(struct virtio_pci_device *);
 void virtio_pci_legacy_remove(struct virtio_pci_device *);
diff --git a/drivers/virtio/virtio_pci_legacy.c b/drivers/virtio/virtio_pci_legacy.c
index 2ab6aee51bf6..f7362c5fe18a 100644
--- a/drivers/virtio/virtio_pci_legacy.c
+++ b/drivers/virtio/virtio_pci_legacy.c
@@ -190,6 +190,7 @@ static const struct virtio_config_ops virtio_pci_config_ops = {
 	.finalize_features = vp_finalize_features,
 	.bus_name	= vp_bus_name,
 	.set_vq_affinity = vp_set_vq_affinity,
+	.get_vq_affinity = vp_get_vq_affinity,
 };
 
 /* the PCI probing function */
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index a7a0981e441c..7bc3004b840e 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -437,6 +437,7 @@ static const struct virtio_config_ops virtio_pci_config_nodev_ops = {
 	.finalize_features = vp_finalize_features,
 	.bus_name	= vp_bus_name,
 	.set_vq_affinity = vp_set_vq_affinity,
+	.get_vq_affinity = vp_get_vq_affinity,
 };
 
 static const struct virtio_config_ops virtio_pci_config_ops = {
@@ -452,6 +453,7 @@ static const struct virtio_config_ops virtio_pci_config_ops = {
 	.finalize_features = vp_finalize_features,
 	.bus_name	= vp_bus_name,
 	.set_vq_affinity = vp_set_vq_affinity,
+	.get_vq_affinity = vp_get_vq_affinity,
 };
 
 /**
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 2ebe506fe41a..8355bab175e1 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -58,6 +58,7 @@ struct irq_affinity;
  *      This returns a pointer to the bus name a la pci_name from which
  *      the caller can then copy.
  * @set_vq_affinity: set the affinity for a virtqueue.
+ * @get_vq_affinity: get the affinity for a virtqueue (optional).
  */
 typedef void vq_callback_t(struct virtqueue *);
 struct virtio_config_ops {
@@ -77,6 +78,8 @@ struct virtio_config_ops {
 	int (*finalize_features)(struct virtio_device *vdev);
 	const char *(*bus_name)(struct virtio_device *vdev);
 	int (*set_vq_affinity)(struct virtqueue *vq, int cpu);
+	const struct cpumask *(*get_vq_affinity)(struct virtio_device *vdev,
+			int index);
 };
 
 /* If driver didn't advertise the feature, it will never appear. */
-- 
cgit v1.2.3


From 73473427bb551686e4b68ecd99bfd27e6635286a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 5 Feb 2017 18:15:24 +0100
Subject: blk-mq: provide a default queue mapping for virtio device

Similar to the PCI version, just calling into virtio instead.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 block/Kconfig                 |  5 ++++
 block/Makefile                |  1 +
 block/blk-mq-virtio.c         | 54 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/blk-mq-virtio.h | 10 ++++++++
 4 files changed, 70 insertions(+)
 create mode 100644 block/blk-mq-virtio.c
 create mode 100644 include/linux/blk-mq-virtio.h

(limited to 'include/linux')

diff --git a/block/Kconfig b/block/Kconfig
index 8bf114a3858a..3523b4f0cd8b 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -165,4 +165,9 @@ config BLK_MQ_PCI
 	depends on BLOCK && PCI
 	default y
 
+config BLK_MQ_VIRTIO
+	bool
+	depends on BLOCK && VIRTIO
+	default y
+
 source block/Kconfig.iosched
diff --git a/block/Makefile b/block/Makefile
index a827f988c4e6..60691949d28d 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -23,5 +23,6 @@ obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
 obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o
 obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o
 obj-$(CONFIG_BLK_MQ_PCI)	+= blk-mq-pci.o
+obj-$(CONFIG_BLK_MQ_VIRTIO)	+= blk-mq-virtio.o
 obj-$(CONFIG_BLK_DEV_ZONED)	+= blk-zoned.o
 obj-$(CONFIG_BLK_WBT)		+= blk-wbt.o
diff --git a/block/blk-mq-virtio.c b/block/blk-mq-virtio.c
new file mode 100644
index 000000000000..c3afbca11299
--- /dev/null
+++ b/block/blk-mq-virtio.c
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2016 Christoph Hellwig.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#include <linux/device.h>
+#include <linux/blk-mq.h>
+#include <linux/blk-mq-virtio.h>
+#include <linux/virtio_config.h>
+#include <linux/module.h>
+#include "blk-mq.h"
+
+/**
+ * blk_mq_virtio_map_queues - provide a default queue mapping for virtio device
+ * @set:	tagset to provide the mapping for
+ * @vdev:	virtio device associated with @set.
+ * @first_vec:	first interrupt vectors to use for queues (usually 0)
+ *
+ * This function assumes the virtio device @vdev has at least as many available
+ * interrupt vetors as @set has queues.  It will then queuery the vector
+ * corresponding to each queue for it's affinity mask and built queue mapping
+ * that maps a queue to the CPUs that have irq affinity for the corresponding
+ * vector.
+ */
+int blk_mq_virtio_map_queues(struct blk_mq_tag_set *set,
+		struct virtio_device *vdev, int first_vec)
+{
+	const struct cpumask *mask;
+	unsigned int queue, cpu;
+
+	if (!vdev->config->get_vq_affinity)
+		goto fallback;
+
+	for (queue = 0; queue < set->nr_hw_queues; queue++) {
+		mask = vdev->config->get_vq_affinity(vdev, first_vec + queue);
+		if (!mask)
+			goto fallback;
+
+		for_each_cpu(cpu, mask)
+			set->mq_map[cpu] = queue;
+	}
+
+	return 0;
+fallback:
+	return blk_mq_map_queues(set);
+}
+EXPORT_SYMBOL_GPL(blk_mq_virtio_map_queues);
diff --git a/include/linux/blk-mq-virtio.h b/include/linux/blk-mq-virtio.h
new file mode 100644
index 000000000000..b1ef6e14744f
--- /dev/null
+++ b/include/linux/blk-mq-virtio.h
@@ -0,0 +1,10 @@
+#ifndef _LINUX_BLK_MQ_VIRTIO_H
+#define _LINUX_BLK_MQ_VIRTIO_H
+
+struct blk_mq_tag_set;
+struct virtio_device;
+
+int blk_mq_virtio_map_queues(struct blk_mq_tag_set *set,
+		struct virtio_device *vdev, int first_vec);
+
+#endif /* _LINUX_BLK_MQ_VIRTIO_H */
-- 
cgit v1.2.3


From 0d9f0a52c8b9f7a003fe1650b7d5fb8518efabe0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 5 Feb 2017 18:15:26 +0100
Subject: virtio_scsi: use virtio IRQ affinity

Use automatic IRQ affinity assignment in the virtio layer if available,
and build the blk-mq queues based on it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/scsi/virtio_scsi.c | 126 +++++----------------------------------------
 include/linux/cpuhotplug.h |   1 -
 2 files changed, 12 insertions(+), 115 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index c9c5ea0611e9..939c47df73fa 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -18,6 +18,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/mempool.h>
+#include <linux/interrupt.h>
 #include <linux/virtio.h>
 #include <linux/virtio_ids.h>
 #include <linux/virtio_config.h>
@@ -29,6 +30,7 @@
 #include <scsi/scsi_cmnd.h>
 #include <scsi/scsi_tcq.h>
 #include <linux/seqlock.h>
+#include <linux/blk-mq-virtio.h>
 
 #define VIRTIO_SCSI_MEMPOOL_SZ 64
 #define VIRTIO_SCSI_EVENT_LEN 8
@@ -108,7 +110,6 @@ struct virtio_scsi {
 	bool affinity_hint_set;
 
 	struct hlist_node node;
-	struct hlist_node node_dead;
 
 	/* Protected by event_vq lock */
 	bool stop_events;
@@ -118,7 +119,6 @@ struct virtio_scsi {
 	struct virtio_scsi_vq req_vqs[];
 };
 
-static enum cpuhp_state virtioscsi_online;
 static struct kmem_cache *virtscsi_cmd_cache;
 static mempool_t *virtscsi_cmd_pool;
 
@@ -766,6 +766,13 @@ static void virtscsi_target_destroy(struct scsi_target *starget)
 	kfree(tgt);
 }
 
+static int virtscsi_map_queues(struct Scsi_Host *shost)
+{
+	struct virtio_scsi *vscsi = shost_priv(shost);
+
+	return blk_mq_virtio_map_queues(&shost->tag_set, vscsi->vdev, 2);
+}
+
 static struct scsi_host_template virtscsi_host_template_single = {
 	.module = THIS_MODULE,
 	.name = "Virtio SCSI HBA",
@@ -801,6 +808,7 @@ static struct scsi_host_template virtscsi_host_template_multi = {
 	.use_clustering = ENABLE_CLUSTERING,
 	.target_alloc = virtscsi_target_alloc,
 	.target_destroy = virtscsi_target_destroy,
+	.map_queues = virtscsi_map_queues,
 	.track_queue_depth = 1,
 };
 
@@ -817,80 +825,6 @@ static struct scsi_host_template virtscsi_host_template_multi = {
 		virtio_cwrite(vdev, struct virtio_scsi_config, fld, &__val); \
 	} while(0)
 
-static void __virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity)
-{
-	int i;
-	int cpu;
-
-	/* In multiqueue mode, when the number of cpu is equal
-	 * to the number of request queues, we let the qeueues
-	 * to be private to one cpu by setting the affinity hint
-	 * to eliminate the contention.
-	 */
-	if ((vscsi->num_queues == 1 ||
-	     vscsi->num_queues != num_online_cpus()) && affinity) {
-		if (vscsi->affinity_hint_set)
-			affinity = false;
-		else
-			return;
-	}
-
-	if (affinity) {
-		i = 0;
-		for_each_online_cpu(cpu) {
-			virtqueue_set_affinity(vscsi->req_vqs[i].vq, cpu);
-			i++;
-		}
-
-		vscsi->affinity_hint_set = true;
-	} else {
-		for (i = 0; i < vscsi->num_queues; i++) {
-			if (!vscsi->req_vqs[i].vq)
-				continue;
-
-			virtqueue_set_affinity(vscsi->req_vqs[i].vq, -1);
-		}
-
-		vscsi->affinity_hint_set = false;
-	}
-}
-
-static void virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity)
-{
-	get_online_cpus();
-	__virtscsi_set_affinity(vscsi, affinity);
-	put_online_cpus();
-}
-
-static int virtscsi_cpu_online(unsigned int cpu, struct hlist_node *node)
-{
-	struct virtio_scsi *vscsi = hlist_entry_safe(node, struct virtio_scsi,
-						     node);
-	__virtscsi_set_affinity(vscsi, true);
-	return 0;
-}
-
-static int virtscsi_cpu_notif_add(struct virtio_scsi *vi)
-{
-	int ret;
-
-	ret = cpuhp_state_add_instance(virtioscsi_online, &vi->node);
-	if (ret)
-		return ret;
-
-	ret = cpuhp_state_add_instance(CPUHP_VIRT_SCSI_DEAD, &vi->node_dead);
-	if (ret)
-		cpuhp_state_remove_instance(virtioscsi_online, &vi->node);
-	return ret;
-}
-
-static void virtscsi_cpu_notif_remove(struct virtio_scsi *vi)
-{
-	cpuhp_state_remove_instance_nocalls(virtioscsi_online, &vi->node);
-	cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_SCSI_DEAD,
-					    &vi->node_dead);
-}
-
 static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
 			     struct virtqueue *vq)
 {
@@ -900,14 +834,8 @@ static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
 
 static void virtscsi_remove_vqs(struct virtio_device *vdev)
 {
-	struct Scsi_Host *sh = virtio_scsi_host(vdev);
-	struct virtio_scsi *vscsi = shost_priv(sh);
-
-	virtscsi_set_affinity(vscsi, false);
-
 	/* Stop all the virtqueues. */
 	vdev->config->reset(vdev);
-
 	vdev->config->del_vqs(vdev);
 }
 
@@ -920,6 +848,7 @@ static int virtscsi_init(struct virtio_device *vdev,
 	vq_callback_t **callbacks;
 	const char **names;
 	struct virtqueue **vqs;
+	struct irq_affinity desc = { .pre_vectors = 2 };
 
 	num_vqs = vscsi->num_queues + VIRTIO_SCSI_VQ_BASE;
 	vqs = kmalloc(num_vqs * sizeof(struct virtqueue *), GFP_KERNEL);
@@ -942,7 +871,7 @@ static int virtscsi_init(struct virtio_device *vdev,
 
 	/* Discover virtqueues and write information to configuration.  */
 	err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names,
-			NULL);
+			&desc);
 	if (err)
 		goto out;
 
@@ -1008,10 +937,6 @@ static int virtscsi_probe(struct virtio_device *vdev)
 	if (err)
 		goto virtscsi_init_failed;
 
-	err = virtscsi_cpu_notif_add(vscsi);
-	if (err)
-		goto scsi_add_host_failed;
-
 	cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1;
 	shost->cmd_per_lun = min_t(u32, cmd_per_lun, shost->can_queue);
 	shost->max_sectors = virtscsi_config_get(vdev, max_sectors) ?: 0xFFFF;
@@ -1066,9 +991,6 @@ static void virtscsi_remove(struct virtio_device *vdev)
 		virtscsi_cancel_event_work(vscsi);
 
 	scsi_remove_host(shost);
-
-	virtscsi_cpu_notif_remove(vscsi);
-
 	virtscsi_remove_vqs(vdev);
 	scsi_host_put(shost);
 }
@@ -1076,10 +998,6 @@ static void virtscsi_remove(struct virtio_device *vdev)
 #ifdef CONFIG_PM_SLEEP
 static int virtscsi_freeze(struct virtio_device *vdev)
 {
-	struct Scsi_Host *sh = virtio_scsi_host(vdev);
-	struct virtio_scsi *vscsi = shost_priv(sh);
-
-	virtscsi_cpu_notif_remove(vscsi);
 	virtscsi_remove_vqs(vdev);
 	return 0;
 }
@@ -1094,11 +1012,6 @@ static int virtscsi_restore(struct virtio_device *vdev)
 	if (err)
 		return err;
 
-	err = virtscsi_cpu_notif_add(vscsi);
-	if (err) {
-		vdev->config->del_vqs(vdev);
-		return err;
-	}
 	virtio_device_ready(vdev);
 
 	if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG))
@@ -1153,16 +1066,6 @@ static int __init init(void)
 		pr_err("mempool_create() for virtscsi_cmd_pool failed\n");
 		goto error;
 	}
-	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
-				      "scsi/virtio:online",
-				      virtscsi_cpu_online, NULL);
-	if (ret < 0)
-		goto error;
-	virtioscsi_online = ret;
-	ret = cpuhp_setup_state_multi(CPUHP_VIRT_SCSI_DEAD, "scsi/virtio:dead",
-				      NULL, virtscsi_cpu_online);
-	if (ret)
-		goto error;
 	ret = register_virtio_driver(&virtio_scsi_driver);
 	if (ret < 0)
 		goto error;
@@ -1178,17 +1081,12 @@ error:
 		kmem_cache_destroy(virtscsi_cmd_cache);
 		virtscsi_cmd_cache = NULL;
 	}
-	if (virtioscsi_online)
-		cpuhp_remove_multi_state(virtioscsi_online);
-	cpuhp_remove_multi_state(CPUHP_VIRT_SCSI_DEAD);
 	return ret;
 }
 
 static void __exit fini(void)
 {
 	unregister_virtio_driver(&virtio_scsi_driver);
-	cpuhp_remove_multi_state(virtioscsi_online);
-	cpuhp_remove_multi_state(CPUHP_VIRT_SCSI_DEAD);
 	mempool_destroy(virtscsi_cmd_pool);
 	kmem_cache_destroy(virtscsi_cmd_cache);
 }
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 921acaaa1601..01aea80a503e 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -26,7 +26,6 @@ enum cpuhp_state {
 	CPUHP_ARM_OMAP_WAKE_DEAD,
 	CPUHP_IRQ_POLL_DEAD,
 	CPUHP_BLOCK_SOFTIRQ_DEAD,
-	CPUHP_VIRT_SCSI_DEAD,
 	CPUHP_ACPI_CPUDRV_DEAD,
 	CPUHP_S390_PFAULT_DEAD,
 	CPUHP_BLK_MQ_DEAD,
-- 
cgit v1.2.3


From 7d134b2ce639448199052fd573a324f7e7cd5ed8 Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <mcgrof@kernel.org>
Date: Mon, 27 Feb 2017 14:26:56 -0800
Subject: kprobes: move kprobe declarations to asm-generic/kprobes.h

Often all is needed is these small helpers, instead of compiler.h or a
full kprobes.h.  This is important for asm helpers, in fact even some
asm/kprobes.h make use of these helpers...  instead just keep a generic
asm file with helpers useful for asm code with the least amount of
clutter as possible.

Likewise we need now to also address what to do about this file for both
when architectures have CONFIG_HAVE_KPROBES, and when they do not.  Then
for when architectures have CONFIG_HAVE_KPROBES but have disabled
CONFIG_KPROBES.

Right now most asm/kprobes.h do not have guards against CONFIG_KPROBES,
this means most architecture code cannot include asm/kprobes.h safely.
Correct this and add guards for architectures missing them.
Additionally provide architectures that not have kprobes support with
the default asm-generic solution.  This lets us force asm/kprobes.h on
the header include/linux/kprobes.h always, but most importantly we can
now safely include just asm/kprobes.h on architecture code without
bringing the full kitchen sink of header files.

Two architectures already provided a guard against CONFIG_KPROBES on its
kprobes.h: sh, arch.  The rest of the architectures needed gaurds added.
We avoid including any not-needed headers on asm/kprobes.h unless
kprobes have been enabled.

In a subsequent atomic change we can try now to remove compiler.h from
include/linux/kprobes.h.

During this sweep I've also identified a few architectures defining a
common macro needed for both kprobes and ftrace, that of the definition
of the breakput instruction up.  Some refer to this as
BREAKPOINT_INSTRUCTION.  This must be kept outside of the #ifdef
CONFIG_KPROBES guard.

[mcgrof@kernel.org: fix arm64 build]
  Link: http://lkml.kernel.org/r/CAB=NE6X1WMByuARS4mZ1g9+W=LuVBnMDnh_5zyN0CLADaVh=Jw@mail.gmail.com
[sfr@canb.auug.org.au: fixup for kprobes declarations moving]
  Link: http://lkml.kernel.org/r/20170214165933.13ebd4f4@canb.auug.org.au
Link: http://lkml.kernel.org/r/20170203233139.32682-1-mcgrof@kernel.org
Signed-off-by: Luis R. Rodriguez <mcgrof@kernel.org>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 MAINTAINERS                            |  1 +
 arch/alpha/include/asm/Kbuild          |  1 +
 arch/arc/include/asm/kprobes.h         |  6 ++++--
 arch/arm/include/asm/kprobes.h         |  4 ++++
 arch/arm/probes/decode.h               |  1 +
 arch/arm64/include/asm/kprobes.h       |  4 ++++
 arch/arm64/kernel/armv8_deprecated.c   |  1 +
 arch/arm64/kernel/insn.c               |  1 +
 arch/arm64/kernel/probes/decode-insn.h |  2 ++
 arch/avr32/include/asm/kprobes.h       |  7 ++++++-
 arch/blackfin/include/asm/Kbuild       |  1 +
 arch/c6x/include/asm/Kbuild            |  1 +
 arch/cris/include/asm/Kbuild           |  1 +
 arch/frv/include/asm/Kbuild            |  1 +
 arch/h8300/include/asm/Kbuild          |  1 +
 arch/hexagon/include/asm/Kbuild        |  1 +
 arch/ia64/include/asm/kprobes.h        | 12 +++++++++---
 arch/m32r/include/asm/Kbuild           |  1 +
 arch/m68k/include/asm/Kbuild           |  1 +
 arch/metag/include/asm/Kbuild          |  1 +
 arch/microblaze/include/asm/Kbuild     |  1 +
 arch/mips/include/asm/kprobes.h        |  6 +++++-
 arch/mn10300/include/asm/kprobes.h     |  7 ++++++-
 arch/nios2/include/asm/Kbuild          |  1 +
 arch/openrisc/include/asm/Kbuild       |  1 +
 arch/parisc/include/asm/Kbuild         |  1 +
 arch/powerpc/include/asm/kprobes.h     |  3 +++
 arch/powerpc/lib/code-patching.c       |  1 +
 arch/s390/include/asm/kprobes.h        |  7 ++++++-
 arch/score/include/asm/Kbuild          |  1 +
 arch/sh/include/asm/kprobes.h          |  5 ++++-
 arch/sparc/include/asm/kprobes.h       | 10 ++++++++--
 arch/tile/include/asm/kprobes.h        |  6 +++++-
 arch/um/include/asm/Kbuild             |  1 +
 arch/unicore32/include/asm/Kbuild      |  1 +
 arch/x86/include/asm/kprobes.h         |  9 ++++++++-
 arch/xtensa/include/asm/Kbuild         |  1 +
 include/asm-generic/kprobes.h          | 25 +++++++++++++++++++++++++
 include/linux/compiler.h               |  8 --------
 include/linux/kprobes.h                | 19 +++----------------
 40 files changed, 125 insertions(+), 38 deletions(-)
 create mode 100644 include/asm-generic/kprobes.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 6cd8945b9094..846f97aa3508 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7286,6 +7286,7 @@ M:	Masami Hiramatsu <mhiramat@kernel.org>
 S:	Maintained
 F:	Documentation/kprobes.txt
 F:	include/linux/kprobes.h
+F:	include/asm-generic/kprobes.h
 F:	kernel/kprobes.c
 
 KS0108 LCD CONTROLLER DRIVER
diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index 46e47c088622..d103db5af5ff 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -10,3 +10,4 @@ generic-y += preempt.h
 generic-y += sections.h
 generic-y += trace_clock.h
 generic-y += current.h
+generic-y += kprobes.h
diff --git a/arch/arc/include/asm/kprobes.h b/arch/arc/include/asm/kprobes.h
index 944dbedb38b5..00bdbe167615 100644
--- a/arch/arc/include/asm/kprobes.h
+++ b/arch/arc/include/asm/kprobes.h
@@ -9,6 +9,8 @@
 #ifndef _ARC_KPROBES_H
 #define _ARC_KPROBES_H
 
+#include <asm-generic/kprobes.h>
+
 #ifdef CONFIG_KPROBES
 
 typedef u16 kprobe_opcode_t;
@@ -55,6 +57,6 @@ void trap_is_kprobe(unsigned long address, struct pt_regs *regs);
 static void trap_is_kprobe(unsigned long address, struct pt_regs *regs)
 {
 }
-#endif
+#endif /* CONFIG_KPROBES */
 
-#endif
+#endif /* _ARC_KPROBES_H */
diff --git a/arch/arm/include/asm/kprobes.h b/arch/arm/include/asm/kprobes.h
index 3ea9be559726..59655459da59 100644
--- a/arch/arm/include/asm/kprobes.h
+++ b/arch/arm/include/asm/kprobes.h
@@ -16,6 +16,9 @@
 #ifndef _ARM_KPROBES_H
 #define _ARM_KPROBES_H
 
+#include <asm-generic/kprobes.h>
+
+#ifdef CONFIG_KPROBES
 #include <linux/types.h>
 #include <linux/ptrace.h>
 #include <linux/notifier.h>
@@ -83,4 +86,5 @@ struct arch_optimized_insn {
 	 */
 };
 
+#endif /* CONFIG_KPROBES */
 #endif /* _ARM_KPROBES_H */
diff --git a/arch/arm/probes/decode.h b/arch/arm/probes/decode.h
index f9b08ba7fe73..548d622a3159 100644
--- a/arch/arm/probes/decode.h
+++ b/arch/arm/probes/decode.h
@@ -22,6 +22,7 @@
 #include <linux/types.h>
 #include <linux/stddef.h>
 #include <asm/probes.h>
+#include <asm/kprobes.h>
 
 void __init arm_probes_decode_init(void);
 
diff --git a/arch/arm64/include/asm/kprobes.h b/arch/arm64/include/asm/kprobes.h
index 1737aecfcc5e..6deb8d726041 100644
--- a/arch/arm64/include/asm/kprobes.h
+++ b/arch/arm64/include/asm/kprobes.h
@@ -16,6 +16,9 @@
 #ifndef _ARM_KPROBES_H
 #define _ARM_KPROBES_H
 
+#include <asm-generic/kprobes.h>
+
+#ifdef CONFIG_KPROBES
 #include <linux/types.h>
 #include <linux/ptrace.h>
 #include <linux/percpu.h>
@@ -57,4 +60,5 @@ int kprobe_single_step_handler(struct pt_regs *regs, unsigned int esr);
 void kretprobe_trampoline(void);
 void __kprobes *trampoline_probe_handler(struct pt_regs *regs);
 
+#endif /* CONFIG_KPROBES */
 #endif /* _ARM_KPROBES_H */
diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c
index 86032a012388..657977e77ec8 100644
--- a/arch/arm64/kernel/armv8_deprecated.c
+++ b/arch/arm64/kernel/armv8_deprecated.c
@@ -19,6 +19,7 @@
 #include <asm/sysreg.h>
 #include <asm/system_misc.h>
 #include <asm/traps.h>
+#include <asm/kprobes.h>
 #include <linux/uaccess.h>
 #include <asm/cpufeature.h>
 
diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c
index b6badff5a151..3a63954a8b14 100644
--- a/arch/arm64/kernel/insn.c
+++ b/arch/arm64/kernel/insn.c
@@ -31,6 +31,7 @@
 #include <asm/debug-monitors.h>
 #include <asm/fixmap.h>
 #include <asm/insn.h>
+#include <asm/kprobes.h>
 
 #define AARCH64_INSN_SF_BIT	BIT(31)
 #define AARCH64_INSN_N_BIT	BIT(22)
diff --git a/arch/arm64/kernel/probes/decode-insn.h b/arch/arm64/kernel/probes/decode-insn.h
index 76d3f315407f..192ab007bacb 100644
--- a/arch/arm64/kernel/probes/decode-insn.h
+++ b/arch/arm64/kernel/probes/decode-insn.h
@@ -16,6 +16,8 @@
 #ifndef _ARM_KERNEL_KPROBES_ARM64_H
 #define _ARM_KERNEL_KPROBES_ARM64_H
 
+#include <asm/kprobes.h>
+
 /*
  * ARM strongly recommends a limit of 128 bytes between LoadExcl and
  * StoreExcl instructions in a single thread of execution. So keep the
diff --git a/arch/avr32/include/asm/kprobes.h b/arch/avr32/include/asm/kprobes.h
index 45f563ed73fd..28dfc61ad384 100644
--- a/arch/avr32/include/asm/kprobes.h
+++ b/arch/avr32/include/asm/kprobes.h
@@ -11,10 +11,14 @@
 #ifndef __ASM_AVR32_KPROBES_H
 #define __ASM_AVR32_KPROBES_H
 
+#include <asm-generic/kprobes.h>
+
+#define BREAKPOINT_INSTRUCTION	0xd673	/* breakpoint */
+
+#ifdef CONFIG_KPROBES
 #include <linux/types.h>
 
 typedef u16	kprobe_opcode_t;
-#define BREAKPOINT_INSTRUCTION	0xd673	/* breakpoint */
 #define MAX_INSN_SIZE		2
 #define MAX_STACK_SIZE		64	/* 32 would probably be OK */
 
@@ -46,4 +50,5 @@ extern int kprobe_exceptions_notify(struct notifier_block *self,
 
 #define flush_insn_slot(p)	do { } while (0)
 
+#endif /* CONFIG_KPROBES */
 #endif /* __ASM_AVR32_KPROBES_H */
diff --git a/arch/blackfin/include/asm/Kbuild b/arch/blackfin/include/asm/Kbuild
index d6fa60b158be..625db8ac815e 100644
--- a/arch/blackfin/include/asm/Kbuild
+++ b/arch/blackfin/include/asm/Kbuild
@@ -46,3 +46,4 @@ generic-y += unaligned.h
 generic-y += user.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += kprobes.h
diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild
index 4e9f57433f3a..82619c32d25b 100644
--- a/arch/c6x/include/asm/Kbuild
+++ b/arch/c6x/include/asm/Kbuild
@@ -61,3 +61,4 @@ generic-y += user.h
 generic-y += vga.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += kprobes.h
diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild
index 8e4ef321001f..0f5132b08896 100644
--- a/arch/cris/include/asm/Kbuild
+++ b/arch/cris/include/asm/Kbuild
@@ -45,3 +45,4 @@ generic-y += types.h
 generic-y += vga.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += kprobes.h
diff --git a/arch/frv/include/asm/Kbuild b/arch/frv/include/asm/Kbuild
index 0f5b0d5d313c..c33b46715f65 100644
--- a/arch/frv/include/asm/Kbuild
+++ b/arch/frv/include/asm/Kbuild
@@ -7,3 +7,4 @@ generic-y += mm-arch-hooks.h
 generic-y += preempt.h
 generic-y += trace_clock.h
 generic-y += word-at-a-time.h
+generic-y += kprobes.h
diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild
index 5efd0c87f3c0..341740c3581c 100644
--- a/arch/h8300/include/asm/Kbuild
+++ b/arch/h8300/include/asm/Kbuild
@@ -74,3 +74,4 @@ generic-y += unaligned.h
 generic-y += vga.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += kprobes.h
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild
index a43a7c90e4af..797b64a4b80b 100644
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@ -59,3 +59,4 @@ generic-y += unaligned.h
 generic-y += vga.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += kprobes.h
diff --git a/arch/ia64/include/asm/kprobes.h b/arch/ia64/include/asm/kprobes.h
index d5505d6f2382..0302b3664789 100644
--- a/arch/ia64/include/asm/kprobes.h
+++ b/arch/ia64/include/asm/kprobes.h
@@ -23,14 +23,19 @@
  * 2005-Apr     Rusty Lynch <rusty.lynch@intel.com> and Anil S Keshavamurthy
  *              <anil.s.keshavamurthy@intel.com> adapted from i386
  */
+#include <asm-generic/kprobes.h>
+#include <asm/break.h>
+
+#define BREAK_INST	(long)(__IA64_BREAK_KPROBE << 6)
+
+#ifdef CONFIG_KPROBES
+
 #include <linux/types.h>
 #include <linux/ptrace.h>
 #include <linux/percpu.h>
-#include <asm/break.h>
 
 #define __ARCH_WANT_KPROBES_INSN_SLOT
 #define MAX_INSN_SIZE   2	/* last half is for kprobe-booster */
-#define BREAK_INST	(long)(__IA64_BREAK_KPROBE << 6)
 #define NOP_M_INST	(long)(1<<27)
 #define BRL_INST(i1, i2) ((long)((0xcL << 37) |	/* brl */ \
 				(0x1L << 12) |	/* many */ \
@@ -124,4 +129,5 @@ extern void invalidate_stacked_regs(void);
 extern void flush_register_stack(void);
 extern void arch_remove_kprobe(struct kprobe *p);
 
-#endif				/* _ASM_KPROBES_H */
+#endif /* CONFIG_KPROBES */
+#endif /* _ASM_KPROBES_H */
diff --git a/arch/m32r/include/asm/Kbuild b/arch/m32r/include/asm/Kbuild
index 8c24c5e1db66..deb298777df2 100644
--- a/arch/m32r/include/asm/Kbuild
+++ b/arch/m32r/include/asm/Kbuild
@@ -11,3 +11,4 @@ generic-y += preempt.h
 generic-y += sections.h
 generic-y += trace_clock.h
 generic-y += word-at-a-time.h
+generic-y += kprobes.h
diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild
index 6c76d6c24b3d..d4f9ccbfa85c 100644
--- a/arch/m68k/include/asm/Kbuild
+++ b/arch/m68k/include/asm/Kbuild
@@ -33,3 +33,4 @@ generic-y += trace_clock.h
 generic-y += types.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += kprobes.h
diff --git a/arch/metag/include/asm/Kbuild b/arch/metag/include/asm/Kbuild
index d3731f0db73b..f9b9df5d6de9 100644
--- a/arch/metag/include/asm/Kbuild
+++ b/arch/metag/include/asm/Kbuild
@@ -54,3 +54,4 @@ generic-y += user.h
 generic-y += vga.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += kprobes.h
diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild
index 6275eb051801..1732ec13b211 100644
--- a/arch/microblaze/include/asm/Kbuild
+++ b/arch/microblaze/include/asm/Kbuild
@@ -10,3 +10,4 @@ generic-y += preempt.h
 generic-y += syscalls.h
 generic-y += trace_clock.h
 generic-y += word-at-a-time.h
+generic-y += kprobes.h
diff --git a/arch/mips/include/asm/kprobes.h b/arch/mips/include/asm/kprobes.h
index daba1f9a4f79..291846d9ba83 100644
--- a/arch/mips/include/asm/kprobes.h
+++ b/arch/mips/include/asm/kprobes.h
@@ -22,6 +22,9 @@
 #ifndef _ASM_KPROBES_H
 #define _ASM_KPROBES_H
 
+#include <asm-generic/kprobes.h>
+
+#ifdef CONFIG_KPROBES
 #include <linux/ptrace.h>
 #include <linux/types.h>
 
@@ -94,4 +97,5 @@ struct kprobe_ctlblk {
 extern int kprobe_exceptions_notify(struct notifier_block *self,
 				    unsigned long val, void *data);
 
-#endif				/* _ASM_KPROBES_H */
+#endif /* CONFIG_KPROBES */
+#endif /* _ASM_KPROBES_H */
diff --git a/arch/mn10300/include/asm/kprobes.h b/arch/mn10300/include/asm/kprobes.h
index c800b590183a..7abea0bdb549 100644
--- a/arch/mn10300/include/asm/kprobes.h
+++ b/arch/mn10300/include/asm/kprobes.h
@@ -21,13 +21,17 @@
 #ifndef _ASM_KPROBES_H
 #define _ASM_KPROBES_H
 
+#include <asm-generic/kprobes.h>
+
+#define BREAKPOINT_INSTRUCTION	0xff
+
+#ifdef CONFIG_KPROBES
 #include <linux/types.h>
 #include <linux/ptrace.h>
 
 struct kprobe;
 
 typedef unsigned char kprobe_opcode_t;
-#define BREAKPOINT_INSTRUCTION	0xff
 #define MAX_INSN_SIZE 8
 #define MAX_STACK_SIZE 128
 
@@ -47,4 +51,5 @@ extern int kprobe_exceptions_notify(struct notifier_block *self,
 
 extern void arch_remove_kprobe(struct kprobe *p);
 
+#endif /* CONFIG_KPROBES */
 #endif /* _ASM_KPROBES_H */
diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild
index 35b0e883761a..aaa3c218b56c 100644
--- a/arch/nios2/include/asm/Kbuild
+++ b/arch/nios2/include/asm/Kbuild
@@ -62,3 +62,4 @@ generic-y += user.h
 generic-y += vga.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += kprobes.h
diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
index fb241757f7f0..fb01873a5aad 100644
--- a/arch/openrisc/include/asm/Kbuild
+++ b/arch/openrisc/include/asm/Kbuild
@@ -67,3 +67,4 @@ generic-y += user.h
 generic-y += vga.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += kprobes.h
diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild
index cc70b4116718..a9909c2d04c5 100644
--- a/arch/parisc/include/asm/Kbuild
+++ b/arch/parisc/include/asm/Kbuild
@@ -28,3 +28,4 @@ generic-y += user.h
 generic-y += vga.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += kprobes.h
diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h
index d821835ade86..0503c98b2117 100644
--- a/arch/powerpc/include/asm/kprobes.h
+++ b/arch/powerpc/include/asm/kprobes.h
@@ -1,5 +1,8 @@
 #ifndef _ASM_POWERPC_KPROBES_H
 #define _ASM_POWERPC_KPROBES_H
+
+#include <asm-generic/kprobes.h>
+
 #ifdef __KERNEL__
 /*
  *  Kernel Probes (KProbes)
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 0899315e1434..0d3002b7e2b4 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -14,6 +14,7 @@
 #include <asm/page.h>
 #include <asm/code-patching.h>
 #include <linux/uaccess.h>
+#include <linux/kprobes.h>
 
 
 int patch_instruction(unsigned int *addr, unsigned int instr)
diff --git a/arch/s390/include/asm/kprobes.h b/arch/s390/include/asm/kprobes.h
index 591e5a5279b0..84c0f9086483 100644
--- a/arch/s390/include/asm/kprobes.h
+++ b/arch/s390/include/asm/kprobes.h
@@ -27,6 +27,11 @@
  * 2005-Dec	Used as a template for s390 by Mike Grundy
  *		<grundym@us.ibm.com>
  */
+#include <asm-generic/kprobes.h>
+
+#define BREAKPOINT_INSTRUCTION	0x0002
+
+#ifdef CONFIG_KPROBES
 #include <linux/types.h>
 #include <linux/ptrace.h>
 #include <linux/percpu.h>
@@ -37,7 +42,6 @@ struct pt_regs;
 struct kprobe;
 
 typedef u16 kprobe_opcode_t;
-#define BREAKPOINT_INSTRUCTION	0x0002
 
 /* Maximum instruction size is 3 (16bit) halfwords: */
 #define MAX_INSN_SIZE		0x0003
@@ -91,4 +95,5 @@ int probe_is_insn_relative_long(u16 *insn);
 
 #define flush_insn_slot(p)	do { } while (0)
 
+#endif /* CONFIG_KPROBES */
 #endif	/* _ASM_S390_KPROBES_H */
diff --git a/arch/score/include/asm/Kbuild b/arch/score/include/asm/Kbuild
index db3e28ca3ae2..926943a49ea5 100644
--- a/arch/score/include/asm/Kbuild
+++ b/arch/score/include/asm/Kbuild
@@ -13,3 +13,4 @@ generic-y += trace_clock.h
 generic-y += xor.h
 generic-y += serial.h
 generic-y += word-at-a-time.h
+generic-y += kprobes.h
diff --git a/arch/sh/include/asm/kprobes.h b/arch/sh/include/asm/kprobes.h
index 134f3980e44a..f0986f9b3844 100644
--- a/arch/sh/include/asm/kprobes.h
+++ b/arch/sh/include/asm/kprobes.h
@@ -1,13 +1,16 @@
 #ifndef __ASM_SH_KPROBES_H
 #define __ASM_SH_KPROBES_H
 
+#include <asm-generic/kprobes.h>
+
+#define BREAKPOINT_INSTRUCTION	0xc33a
+
 #ifdef CONFIG_KPROBES
 
 #include <linux/types.h>
 #include <linux/ptrace.h>
 
 typedef insn_size_t kprobe_opcode_t;
-#define BREAKPOINT_INSTRUCTION	0xc33a
 
 #define MAX_INSN_SIZE 16
 #define MAX_STACK_SIZE 64
diff --git a/arch/sparc/include/asm/kprobes.h b/arch/sparc/include/asm/kprobes.h
index a145d798e112..49f8402035d7 100644
--- a/arch/sparc/include/asm/kprobes.h
+++ b/arch/sparc/include/asm/kprobes.h
@@ -1,13 +1,17 @@
 #ifndef _SPARC64_KPROBES_H
 #define _SPARC64_KPROBES_H
 
+#include <asm-generic/kprobes.h>
+
+#define BREAKPOINT_INSTRUCTION   0x91d02070 /* ta 0x70 */
+#define BREAKPOINT_INSTRUCTION_2 0x91d02071 /* ta 0x71 */
+
+#ifdef CONFIG_KPROBES
 #include <linux/types.h>
 #include <linux/percpu.h>
 
 typedef u32 kprobe_opcode_t;
 
-#define BREAKPOINT_INSTRUCTION   0x91d02070 /* ta 0x70 */
-#define BREAKPOINT_INSTRUCTION_2 0x91d02071 /* ta 0x71 */
 #define MAX_INSN_SIZE 2
 
 #define kretprobe_blacklist_size 0
@@ -48,4 +52,6 @@ int kprobe_exceptions_notify(struct notifier_block *self,
 int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
 asmlinkage void __kprobes kprobe_trap(unsigned long trap_level,
 				      struct pt_regs *regs);
+
+#endif /* CONFIG_KPROBES */
 #endif /* _SPARC64_KPROBES_H */
diff --git a/arch/tile/include/asm/kprobes.h b/arch/tile/include/asm/kprobes.h
index d8f9a83943b1..4a8b1cadca24 100644
--- a/arch/tile/include/asm/kprobes.h
+++ b/arch/tile/include/asm/kprobes.h
@@ -17,10 +17,13 @@
 #ifndef _ASM_TILE_KPROBES_H
 #define _ASM_TILE_KPROBES_H
 
+#include <asm-generic/kprobes.h>
+
+#ifdef CONFIG_KPROBES
+
 #include <linux/types.h>
 #include <linux/ptrace.h>
 #include <linux/percpu.h>
-
 #include <arch/opcode.h>
 
 #define __ARCH_WANT_KPROBES_INSN_SLOT
@@ -76,4 +79,5 @@ void arch_remove_kprobe(struct kprobe *);
 extern int kprobe_exceptions_notify(struct notifier_block *self,
 			     unsigned long val, void *data);
 
+#endif /* CONFIG_KPROBES */
 #endif /* _ASM_TILE_KPROBES_H */
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index 90c281cd7e1d..e9d42aab76dc 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -25,3 +25,4 @@ generic-y += topology.h
 generic-y += trace_clock.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += kprobes.h
diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild
index 5d51ade89f4c..84205fe1cd79 100644
--- a/arch/unicore32/include/asm/Kbuild
+++ b/arch/unicore32/include/asm/Kbuild
@@ -63,3 +63,4 @@ generic-y += user.h
 generic-y += vga.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += kprobes.h
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index d1d1e5094c28..200581691c6e 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -21,6 +21,12 @@
  *
  * See arch/x86/kernel/kprobes.c for x86 kprobes history.
  */
+
+#include <asm-generic/kprobes.h>
+
+#define BREAKPOINT_INSTRUCTION	0xcc
+
+#ifdef CONFIG_KPROBES
 #include <linux/types.h>
 #include <linux/ptrace.h>
 #include <linux/percpu.h>
@@ -32,7 +38,6 @@ struct pt_regs;
 struct kprobe;
 
 typedef u8 kprobe_opcode_t;
-#define BREAKPOINT_INSTRUCTION	0xcc
 #define RELATIVEJUMP_OPCODE 0xe9
 #define RELATIVEJUMP_SIZE 5
 #define RELATIVECALL_OPCODE 0xe8
@@ -116,4 +121,6 @@ extern int kprobe_exceptions_notify(struct notifier_block *self,
 				    unsigned long val, void *data);
 extern int kprobe_int3_handler(struct pt_regs *regs);
 extern int kprobe_debug_handler(struct pt_regs *regs);
+
+#endif /* CONFIG_KPROBES */
 #endif /* _ASM_X86_KPROBES_H */
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index 9e9760b20be5..f41408c53fe1 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -31,3 +31,4 @@ generic-y += topology.h
 generic-y += trace_clock.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += kprobes.h
diff --git a/include/asm-generic/kprobes.h b/include/asm-generic/kprobes.h
new file mode 100644
index 000000000000..57af9f21d148
--- /dev/null
+++ b/include/asm-generic/kprobes.h
@@ -0,0 +1,25 @@
+#ifndef _ASM_GENERIC_KPROBES_H
+#define _ASM_GENERIC_KPROBES_H
+
+#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
+#ifdef CONFIG_KPROBES
+/*
+ * Blacklist ganerating macro. Specify functions which is not probed
+ * by using this macro.
+ */
+# define __NOKPROBE_SYMBOL(fname)				\
+static unsigned long __used					\
+	__attribute__((__section__("_kprobe_blacklist")))	\
+	_kbl_addr_##fname = (unsigned long)fname;
+# define NOKPROBE_SYMBOL(fname)	__NOKPROBE_SYMBOL(fname)
+/* Use this to forbid a kprobes attach on very low level functions */
+# define __kprobes	__attribute__((__section__(".kprobes.text")))
+# define nokprobe_inline	__always_inline
+#else
+# define NOKPROBE_SYMBOL(fname)
+# define __kprobes
+# define nokprobe_inline	inline
+#endif
+#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
+
+#endif /* _ASM_GENERIC_KPROBES_H */
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 91c30cba984e..b2eb9c0a68c4 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -570,12 +570,4 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 	(_________p1); \
 })
 
-/* Ignore/forbid kprobes attach on very low level functions marked by this attribute: */
-#ifdef CONFIG_KPROBES
-# define __kprobes	__attribute__((__section__(".kprobes.text")))
-# define nokprobe_inline	__always_inline
-#else
-# define __kprobes
-# define nokprobe_inline	inline
-#endif
 #endif /* __LINUX_COMPILER_H */
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 16ddfb8b304a..c328e4f7dcad 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -29,7 +29,7 @@
  *		<jkenisto@us.ibm.com>  and Prasanna S Panchamukhi
  *		<prasanna@in.ibm.com> added function-return probes.
  */
-#include <linux/compiler.h>	/* for __kprobes */
+#include <linux/compiler.h>
 #include <linux/linkage.h>
 #include <linux/list.h>
 #include <linux/notifier.h>
@@ -40,9 +40,9 @@
 #include <linux/rcupdate.h>
 #include <linux/mutex.h>
 #include <linux/ftrace.h>
+#include <asm/kprobes.h>
 
 #ifdef CONFIG_KPROBES
-#include <asm/kprobes.h>
 
 /* kprobe_status settings */
 #define KPROBE_HIT_ACTIVE	0x00000001
@@ -51,6 +51,7 @@
 #define KPROBE_HIT_SSDONE	0x00000008
 
 #else /* CONFIG_KPROBES */
+#include <asm-generic/kprobes.h>
 typedef int kprobe_opcode_t;
 struct arch_specific_insn {
 	int dummy;
@@ -509,18 +510,4 @@ static inline bool is_kprobe_optinsn_slot(unsigned long addr)
 }
 #endif
 
-#ifdef CONFIG_KPROBES
-/*
- * Blacklist ganerating macro. Specify functions which is not probed
- * by using this macro.
- */
-#define __NOKPROBE_SYMBOL(fname)			\
-static unsigned long __used				\
-	__attribute__((section("_kprobe_blacklist")))	\
-	_kbl_addr_##fname = (unsigned long)fname;
-#define NOKPROBE_SYMBOL(fname)	__NOKPROBE_SYMBOL(fname)
-#else
-#define NOKPROBE_SYMBOL(fname)
-#endif
-
 #endif /* _LINUX_KPROBES_H */
-- 
cgit v1.2.3


From 441398d378f29a5ad6d0fcda07918e54e4961800 Mon Sep 17 00:00:00 2001
From: Stas Sergeev <stsp@list.ru>
Date: Mon, 27 Feb 2017 14:27:25 -0800
Subject: sigaltstack: support SS_AUTODISARM for CONFIG_COMPAT

Currently SS_AUTODISARM is not supported in compatibility mode, but does
not return -EINVAL either.  This makes dosemu built with -m32 on x86_64
to crash.  Also the kernel's sigaltstack selftest fails if compiled with
-m32.

This patch adds the needed support.

Link: http://lkml.kernel.org/r/20170205101213.8163-2-stsp@list.ru
Signed-off-by: Stas Sergeev <stsp@users.sourceforge.net>
Cc: Milosz Tanski <milosz@adfin.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: Waiman Long <Waiman.Long@hpe.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dmitry Safonov <dsafonov@virtuozzo.com>
Cc: Wang Xiaoqiang <wangxq10@lzu.edu.cn>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compat.h |  4 +++-
 kernel/signal.c        | 11 +++++++++--
 2 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 9e40be522793..aef47be2a5c1 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -711,8 +711,10 @@ int __compat_save_altstack(compat_stack_t __user *, unsigned long);
 	compat_stack_t __user *__uss = uss; \
 	struct task_struct *t = current; \
 	put_user_ex(ptr_to_compat((void __user *)t->sas_ss_sp), &__uss->ss_sp); \
-	put_user_ex(sas_ss_flags(sp), &__uss->ss_flags); \
+	put_user_ex(t->sas_ss_flags, &__uss->ss_flags); \
 	put_user_ex(t->sas_ss_size, &__uss->ss_size); \
+	if (t->sas_ss_flags & SS_AUTODISARM) \
+		sas_ss_reset(t); \
 } while (0);
 
 asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid,
diff --git a/kernel/signal.c b/kernel/signal.c
index 13f9def8b24a..214a8feeb771 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3239,10 +3239,17 @@ int compat_restore_altstack(const compat_stack_t __user *uss)
 
 int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
 {
+	int err;
 	struct task_struct *t = current;
-	return  __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) |
-		__put_user(sas_ss_flags(sp), &uss->ss_flags) |
+	err = __put_user(ptr_to_compat((void __user *)t->sas_ss_sp),
+			 &uss->ss_sp) |
+		__put_user(t->sas_ss_flags, &uss->ss_flags) |
 		__put_user(t->sas_ss_size, &uss->ss_size);
+	if (err)
+		return err;
+	if (t->sas_ss_flags & SS_AUTODISARM)
+		sas_ss_reset(t);
+	return 0;
 }
 #endif
 
-- 
cgit v1.2.3


From e3b5a342ab9643b4079c8d417d90519388469341 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Mon, 27 Feb 2017 14:27:37 -0800
Subject: include/linux/pid.h: use for_each_thread() in do_each_pid_thread()

while_each_pid_thread() is using while_each_thread(), which is unsafe
under RCU lock according to commit 0c740d0afc3b ("introduce
for_each_thread() to replace the buggy while_each_thread()").  Use
for_each_thread() in do_each_pid_thread() which is safe under RCU lock.

Link: http://lkml.kernel.org/r/201702011947.DBD56740.OMVHOLOtSJFFFQ@I-love.SAKURA.ne.jp
Link: http://lkml.kernel.org/r/1486041779-4401-2-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pid.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 23705a53abba..298ead5512e5 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -191,10 +191,10 @@ pid_t pid_vnr(struct pid *pid);
 #define do_each_pid_thread(pid, type, task)				\
 	do_each_pid_task(pid, type, task) {				\
 		struct task_struct *tg___ = task;			\
-		do {
+		for_each_thread(tg___, task) {
 
 #define while_each_pid_thread(pid, type, task)				\
-		} while_each_thread(tg___, task);			\
+		}							\
 		task = tg___;						\
 	} while_each_pid_task(pid, type, task)
 #endif /* _LINUX_PID_H */
-- 
cgit v1.2.3


From 9de5ab8a2eeea9ae4b63b6f6353b415b93e020c0 Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Mon, 27 Feb 2017 14:28:18 -0800
Subject: ipc/sem: add hysteresis

sysv sem has two lock modes: One with per-semaphore locks, one lock mode
with a single global lock for the whole array.  When switching from the
per-semaphore locks to the global lock, all per-semaphore locks must be
scanned for ongoing operations.

The patch adds a hysteresis for switching from the global lock to the
per semaphore locks.  This reduces how often the per-semaphore locks
must be scanned.

Compared to the initial patch, this is a simplified solution: Setting
USE_GLOBAL_LOCK_HYSTERESIS to 1 restores the current behavior.

In theory, a workload with exactly 10 simple sops and then one complex
op now scales a bit worse, but this is pure theory: If there is
concurrency, the it won't be exactly 10:1:10:1:10:1:...  If there is no
concurrency, then there is no need for scalability.

Link: http://lkml.kernel.org/r/1476851896-3590-3-git-send-email-manfred@colorfullife.com
Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: <1vier1@web.de>
Cc: kernel test robot <xiaolong.ye@intel.com>
Cc: <felixh@informatik.uni-bremen.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sem.h |  2 +-
 ipc/sem.c           | 86 +++++++++++++++++++++++++++++++++++++----------------
 2 files changed, 62 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sem.h b/include/linux/sem.h
index d0efd6e6c20a..4fc222f8755d 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -21,7 +21,7 @@ struct sem_array {
 	struct list_head	list_id;	/* undo requests on this array */
 	int			sem_nsems;	/* no. of semaphores in array */
 	int			complex_count;	/* pending complex operations */
-	bool			complex_mode;	/* no parallel simple ops */
+	unsigned int		use_global_lock;/* >0: global lock required */
 };
 
 #ifdef CONFIG_SYSVIPC
diff --git a/ipc/sem.c b/ipc/sem.c
index fe5db1ed081b..e468cd1c12f0 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -158,23 +158,43 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
 #define SEMMSL_FAST	256 /* 512 bytes on stack */
 #define SEMOPM_FAST	64  /* ~ 372 bytes on stack */
 
+/*
+ * Switching from the mode suitable for simple ops
+ * to the mode for complex ops is costly. Therefore:
+ * use some hysteresis
+ */
+#define USE_GLOBAL_LOCK_HYSTERESIS	10
+
 /*
  * Locking:
  * a) global sem_lock() for read/write
  *	sem_undo.id_next,
  *	sem_array.complex_count,
- *	sem_array.complex_mode
  *	sem_array.pending{_alter,_const},
  *	sem_array.sem_undo
  *
  * b) global or semaphore sem_lock() for read/write:
  *	sem_array.sem_base[i].pending_{const,alter}:
- *	sem_array.complex_mode (for read)
  *
  * c) special:
  *	sem_undo_list.list_proc:
  *	* undo_list->lock for write
  *	* rcu for read
+ *	use_global_lock:
+ *	* global sem_lock() for write
+ *	* either local or global sem_lock() for read.
+ *
+ * Memory ordering:
+ * Most ordering is enforced by using spin_lock() and spin_unlock().
+ * The special case is use_global_lock:
+ * Setting it from non-zero to 0 is a RELEASE, this is ensured by
+ * using smp_store_release().
+ * Testing if it is non-zero is an ACQUIRE, this is ensured by using
+ * smp_load_acquire().
+ * Setting it from 0 to non-zero must be ordered with regards to
+ * this smp_load_acquire(), this is guaranteed because the smp_load_acquire()
+ * is inside a spin_lock() and after a write from 0 to non-zero a
+ * spin_lock()+spin_unlock() is done.
  */
 
 #define sc_semmsl	sem_ctls[0]
@@ -273,12 +293,16 @@ static void complexmode_enter(struct sem_array *sma)
 	int i;
 	struct sem *sem;
 
-	if (sma->complex_mode)  {
-		/* We are already in complex_mode. Nothing to do */
+	if (sma->use_global_lock > 0)  {
+		/*
+		 * We are already in global lock mode.
+		 * Nothing to do, just reset the
+		 * counter until we return to simple mode.
+		 */
+		sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
 		return;
 	}
-
-	sma->complex_mode = true;
+	sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
 
 	for (i = 0; i < sma->sem_nsems; i++) {
 		sem = sma->sem_base + i;
@@ -299,13 +323,17 @@ static void complexmode_tryleave(struct sem_array *sma)
 		 */
 		return;
 	}
-	/*
-	 * Immediately after setting complex_mode to false,
-	 * a simple op can start. Thus: all memory writes
-	 * performed by the current operation must be visible
-	 * before we set complex_mode to false.
-	 */
-	smp_store_release(&sma->complex_mode, false);
+	if (sma->use_global_lock == 1) {
+		/*
+		 * Immediately after setting use_global_lock to 0,
+		 * a simple op can start. Thus: all memory writes
+		 * performed by the current operation must be visible
+		 * before we set use_global_lock to 0.
+		 */
+		smp_store_release(&sma->use_global_lock, 0);
+	} else {
+		sma->use_global_lock--;
+	}
 }
 
 #define SEM_GLOBAL_LOCK	(-1)
@@ -335,22 +363,23 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
 	 * Optimized locking is possible if no complex operation
 	 * is either enqueued or processed right now.
 	 *
-	 * Both facts are tracked by complex_mode.
+	 * Both facts are tracked by use_global_mode.
 	 */
 	sem = sma->sem_base + sops->sem_num;
 
 	/*
-	 * Initial check for complex_mode. Just an optimization,
+	 * Initial check for use_global_lock. Just an optimization,
 	 * no locking, no memory barrier.
 	 */
-	if (!sma->complex_mode) {
+	if (!sma->use_global_lock) {
 		/*
 		 * It appears that no complex operation is around.
 		 * Acquire the per-semaphore lock.
 		 */
 		spin_lock(&sem->lock);
 
-		if (!smp_load_acquire(&sma->complex_mode)) {
+		/* pairs with smp_store_release() */
+		if (!smp_load_acquire(&sma->use_global_lock)) {
 			/* fast path successful! */
 			return sops->sem_num;
 		}
@@ -360,19 +389,26 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
 	/* slow path: acquire the full lock */
 	ipc_lock_object(&sma->sem_perm);
 
-	if (sma->complex_count == 0) {
-		/* False alarm:
-		 * There is no complex operation, thus we can switch
-		 * back to the fast path.
+	if (sma->use_global_lock == 0) {
+		/*
+		 * The use_global_lock mode ended while we waited for
+		 * sma->sem_perm.lock. Thus we must switch to locking
+		 * with sem->lock.
+		 * Unlike in the fast path, there is no need to recheck
+		 * sma->use_global_lock after we have acquired sem->lock:
+		 * We own sma->sem_perm.lock, thus use_global_lock cannot
+		 * change.
 		 */
 		spin_lock(&sem->lock);
+
 		ipc_unlock_object(&sma->sem_perm);
 		return sops->sem_num;
 	} else {
-		/* Not a false alarm, thus complete the sequence for a
-		 * full lock.
+		/*
+		 * Not a false alarm, thus continue to use the global lock
+		 * mode. No need for complexmode_enter(), this was done by
+		 * the caller that has set use_global_mode to non-zero.
 		 */
-		complexmode_enter(sma);
 		return SEM_GLOBAL_LOCK;
 	}
 }
@@ -476,7 +512,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
 	}
 
 	sma->complex_count = 0;
-	sma->complex_mode = true; /* dropped by sem_unlock below */
+	sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
 	INIT_LIST_HEAD(&sma->pending_alter);
 	INIT_LIST_HEAD(&sma->pending_const);
 	INIT_LIST_HEAD(&sma->list_id);
-- 
cgit v1.2.3


From 93407472a21b82f39c955ea7787e5bc7da100642 Mon Sep 17 00:00:00 2001
From: Fabian Frederick <fabf@skynet.be>
Date: Mon, 27 Feb 2017 14:28:32 -0800
Subject: fs: add i_blocksize()

Replace all 1 << inode->i_blkbits and (1 << inode->i_blkbits) in fs
branch.

This patch also fixes multiple checkpatch warnings: WARNING: Prefer
'unsigned int' to bare use of 'unsigned'

Thanks to Andrew Morton for suggesting more appropriate function instead
of macro.

[geliangtang@gmail.com: truncate: use i_blocksize()]
  Link: http://lkml.kernel.org/r/9c8b2cd83c8f5653805d43debde9fa8817e02fc4.1484895804.git.geliangtang@gmail.com
Link: http://lkml.kernel.org/r/1481319905-10126-1-git-send-email-fabf@skynet.be
Signed-off-by: Fabian Frederick <fabf@skynet.be>
Signed-off-by: Geliang Tang <geliangtang@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/block_dev.c               |  2 +-
 fs/btrfs/file.c              |  2 +-
 fs/buffer.c                  | 12 ++++++------
 fs/ceph/addr.c               |  2 +-
 fs/direct-io.c               |  2 +-
 fs/ext4/inode.c              |  6 +++---
 fs/ext4/mballoc.c            |  2 +-
 fs/ext4/move_extent.c        |  2 +-
 fs/iomap.c                   | 10 +++++-----
 fs/jfs/super.c               |  4 ++--
 fs/mpage.c                   |  2 +-
 fs/nfsd/blocklayout.c        |  6 +++---
 fs/nilfs2/btnode.c           |  2 +-
 fs/nilfs2/inode.c            |  4 ++--
 fs/nilfs2/mdt.c              |  4 ++--
 fs/nilfs2/segment.c          |  2 +-
 fs/ocfs2/aops.c              |  2 +-
 fs/ocfs2/file.c              |  2 +-
 fs/orangefs/orangefs-utils.c |  4 ++--
 fs/reiserfs/file.c           |  2 +-
 fs/reiserfs/inode.c          |  2 +-
 fs/stat.c                    |  2 +-
 fs/udf/inode.c               |  2 +-
 fs/xfs/xfs_aops.c            | 16 ++++++++--------
 fs/xfs/xfs_file.c            |  4 ++--
 include/linux/fs.h           |  5 +++++
 mm/truncate.c                |  2 +-
 27 files changed, 56 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1c62845a72c7..77c30f15a02c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -989,7 +989,7 @@ struct block_device *bdget(dev_t dev)
 		bdev->bd_super = NULL;
 		bdev->bd_inode = inode;
 		bdev->bd_bdi = &noop_backing_dev_info;
-		bdev->bd_block_size = (1 << inode->i_blkbits);
+		bdev->bd_block_size = i_blocksize(inode);
 		bdev->bd_part_count = 0;
 		bdev->bd_invalidated = 0;
 		inode->i_mode = S_IFBLK;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 18e5146df864..c1d2a07205da 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2875,7 +2875,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 		if (!ret)
 			ret = btrfs_prealloc_file_range(inode, mode,
 					range->start,
-					range->len, 1 << inode->i_blkbits,
+					range->len, i_blocksize(inode),
 					offset + len, &alloc_hint);
 		else
 			btrfs_free_reserved_data_space(inode, range->start,
diff --git a/fs/buffer.c b/fs/buffer.c
index 0e87401cf335..28484b3ebc98 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2395,7 +2395,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
 			    loff_t pos, loff_t *bytes)
 {
 	struct inode *inode = mapping->host;
-	unsigned blocksize = 1 << inode->i_blkbits;
+	unsigned int blocksize = i_blocksize(inode);
 	struct page *page;
 	void *fsdata;
 	pgoff_t index, curidx;
@@ -2475,8 +2475,8 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
 			get_block_t *get_block, loff_t *bytes)
 {
 	struct inode *inode = mapping->host;
-	unsigned blocksize = 1 << inode->i_blkbits;
-	unsigned zerofrom;
+	unsigned int blocksize = i_blocksize(inode);
+	unsigned int zerofrom;
 	int err;
 
 	err = cont_expand_zero(file, mapping, pos, bytes);
@@ -2838,7 +2838,7 @@ int nobh_truncate_page(struct address_space *mapping,
 	struct buffer_head map_bh;
 	int err;
 
-	blocksize = 1 << inode->i_blkbits;
+	blocksize = i_blocksize(inode);
 	length = offset & (blocksize - 1);
 
 	/* Block boundary? Nothing to do */
@@ -2916,7 +2916,7 @@ int block_truncate_page(struct address_space *mapping,
 	struct buffer_head *bh;
 	int err;
 
-	blocksize = 1 << inode->i_blkbits;
+	blocksize = i_blocksize(inode);
 	length = offset & (blocksize - 1);
 
 	/* Block boundary? Nothing to do */
@@ -3028,7 +3028,7 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
 	struct inode *inode = mapping->host;
 	tmp.b_state = 0;
 	tmp.b_blocknr = 0;
-	tmp.b_size = 1 << inode->i_blkbits;
+	tmp.b_size = i_blocksize(inode);
 	get_block(inode, block, &tmp, 0);
 	return tmp.b_blocknr;
 }
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 09860c0ec7c1..7ce35aec8c76 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -751,7 +751,7 @@ static int ceph_writepages_start(struct address_space *mapping,
 	struct pagevec pvec;
 	int done = 0;
 	int rc = 0;
-	unsigned wsize = 1 << inode->i_blkbits;
+	unsigned int wsize = i_blocksize(inode);
 	struct ceph_osd_request *req = NULL;
 	int do_sync = 0;
 	loff_t snap_size, i_size;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index c87bae4376b8..a04ebea77de8 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -587,7 +587,7 @@ static int dio_set_defer_completion(struct dio *dio)
 /*
  * Call into the fs to map some more disk blocks.  We record the current number
  * of available blocks at sdio->blocks_available.  These are in units of the
- * fs blocksize, (1 << inode->i_blkbits).
+ * fs blocksize, i_blocksize(inode).
  *
  * The fs is allowed to map lots of blocks at once.  If it wants to do that,
  * it uses the passed inode-relative block number as the file offset, as usual.
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 41d8e53e5a7f..971f66342080 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2221,7 +2221,7 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd,
 {
 	struct inode *inode = mpd->inode;
 	int err;
-	ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
+	ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1)
 							>> inode->i_blkbits;
 
 	do {
@@ -3577,7 +3577,7 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
 	if (overwrite)
 		get_block_func = ext4_dio_get_block_overwrite;
 	else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
-		   round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) {
+		   round_down(offset, i_blocksize(inode)) >= inode->i_size) {
 		get_block_func = ext4_dio_get_block;
 		dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
 	} else if (is_sync_kiocb(iocb)) {
@@ -5179,7 +5179,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
 	 * do. We do the check mainly to optimize the common PAGE_SIZE ==
 	 * blocksize case
 	 */
-	if (offset > PAGE_SIZE - (1 << inode->i_blkbits))
+	if (offset > PAGE_SIZE - i_blocksize(inode))
 		return;
 	while (1) {
 		page = find_lock_page(inode->i_mapping,
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 10c62de642c6..354dc1a894c2 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -838,7 +838,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
 	inode = page->mapping->host;
 	sb = inode->i_sb;
 	ngroups = ext4_get_groups_count(sb);
-	blocksize = 1 << inode->i_blkbits;
+	blocksize = i_blocksize(inode);
 	blocks_per_page = PAGE_SIZE / blocksize;
 
 	groups_per_page = blocks_per_page >> 1;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 6fc14def0c70..578f8c33fb44 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -187,7 +187,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
 	if (PageUptodate(page))
 		return 0;
 
-	blocksize = 1 << inode->i_blkbits;
+	blocksize = i_blocksize(inode);
 	if (!page_has_buffers(page))
 		create_empty_buffers(page, blocksize, 0);
 
diff --git a/fs/iomap.c b/fs/iomap.c
index d209f42cdcb8..0f85f2410605 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -420,8 +420,8 @@ int
 iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
 		const struct iomap_ops *ops)
 {
-	unsigned blocksize = (1 << inode->i_blkbits);
-	unsigned off = pos & (blocksize - 1);
+	unsigned int blocksize = i_blocksize(inode);
+	unsigned int off = pos & (blocksize - 1);
 
 	/* Block boundary? Nothing to do */
 	if (!off)
@@ -735,9 +735,9 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
 		void *data, struct iomap *iomap)
 {
 	struct iomap_dio *dio = data;
-	unsigned blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
-	unsigned fs_block_size = (1 << inode->i_blkbits), pad;
-	unsigned align = iov_iter_alignment(dio->submit.iter);
+	unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
+	unsigned int fs_block_size = i_blocksize(inode), pad;
+	unsigned int align = iov_iter_alignment(dio->submit.iter);
 	struct iov_iter iter;
 	struct bio *bio;
 	bool need_zeroout = false;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 2be7c9ce6663..c64c2574a0aa 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -758,7 +758,7 @@ static ssize_t jfs_quota_read(struct super_block *sb, int type, char *data,
 				sb->s_blocksize - offset : toread;
 
 		tmp_bh.b_state = 0;
-		tmp_bh.b_size = 1 << inode->i_blkbits;
+		tmp_bh.b_size = i_blocksize(inode);
 		err = jfs_get_block(inode, blk, &tmp_bh, 0);
 		if (err)
 			return err;
@@ -798,7 +798,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type,
 				sb->s_blocksize - offset : towrite;
 
 		tmp_bh.b_state = 0;
-		tmp_bh.b_size = 1 << inode->i_blkbits;
+		tmp_bh.b_size = i_blocksize(inode);
 		err = jfs_get_block(inode, blk, &tmp_bh, 1);
 		if (err)
 			goto out;
diff --git a/fs/mpage.c b/fs/mpage.c
index 28af984a3d96..baff8f820c29 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -115,7 +115,7 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block)
 			SetPageUptodate(page);    
 			return;
 		}
-		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
+		create_empty_buffers(page, i_blocksize(inode), 0);
 	}
 	head = page_buffers(page);
 	page_bh = head;
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index a06115e31612..92b4b41d19d2 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -24,7 +24,7 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
 {
 	struct nfsd4_layout_seg *seg = &args->lg_seg;
 	struct super_block *sb = inode->i_sb;
-	u32 block_size = (1 << inode->i_blkbits);
+	u32 block_size = i_blocksize(inode);
 	struct pnfs_block_extent *bex;
 	struct iomap iomap;
 	u32 device_generation = 0;
@@ -181,7 +181,7 @@ nfsd4_block_proc_layoutcommit(struct inode *inode,
 	int nr_iomaps;
 
 	nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
-			lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
+			lcp->lc_up_len, &iomaps, i_blocksize(inode));
 	if (nr_iomaps < 0)
 		return nfserrno(nr_iomaps);
 
@@ -375,7 +375,7 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode,
 	int nr_iomaps;
 
 	nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout,
-			lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
+			lcp->lc_up_len, &iomaps, i_blocksize(inode));
 	if (nr_iomaps < 0)
 		return nfserrno(nr_iomaps);
 
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index d5c23da43513..c21e0b4454a6 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -50,7 +50,7 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
 		brelse(bh);
 		BUG();
 	}
-	memset(bh->b_data, 0, 1 << inode->i_blkbits);
+	memset(bh->b_data, 0, i_blocksize(inode));
 	bh->b_bdev = inode->i_sb->s_bdev;
 	bh->b_blocknr = blocknr;
 	set_buffer_mapped(bh);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index c7f4fef9ebf5..7ffe71a8dfb9 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -51,7 +51,7 @@ void nilfs_inode_add_blocks(struct inode *inode, int n)
 {
 	struct nilfs_root *root = NILFS_I(inode)->i_root;
 
-	inode_add_bytes(inode, (1 << inode->i_blkbits) * n);
+	inode_add_bytes(inode, i_blocksize(inode) * n);
 	if (root)
 		atomic64_add(n, &root->blocks_count);
 }
@@ -60,7 +60,7 @@ void nilfs_inode_sub_blocks(struct inode *inode, int n)
 {
 	struct nilfs_root *root = NILFS_I(inode)->i_root;
 
-	inode_sub_bytes(inode, (1 << inode->i_blkbits) * n);
+	inode_sub_bytes(inode, i_blocksize(inode) * n);
 	if (root)
 		atomic64_sub(n, &root->blocks_count);
 }
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index d56d3a5bea88..98835ed6bef4 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -57,7 +57,7 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
 	set_buffer_mapped(bh);
 
 	kaddr = kmap_atomic(bh->b_page);
-	memset(kaddr + bh_offset(bh), 0, 1 << inode->i_blkbits);
+	memset(kaddr + bh_offset(bh), 0, i_blocksize(inode));
 	if (init_block)
 		init_block(inode, bh, kaddr);
 	flush_dcache_page(bh->b_page);
@@ -501,7 +501,7 @@ void nilfs_mdt_set_entry_size(struct inode *inode, unsigned int entry_size,
 	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
 
 	mi->mi_entry_size = entry_size;
-	mi->mi_entries_per_block = (1 << inode->i_blkbits) / entry_size;
+	mi->mi_entries_per_block = i_blocksize(inode) / entry_size;
 	mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size);
 }
 
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index bedcae2c28e6..7d18d62e8e07 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -723,7 +723,7 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode,
 
 		lock_page(page);
 		if (!page_has_buffers(page))
-			create_empty_buffers(page, 1 << inode->i_blkbits, 0);
+			create_empty_buffers(page, i_blocksize(inode), 0);
 		unlock_page(page);
 
 		bh = head = page_buffers(page);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 11556b7d93ec..88a31e9340a0 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -608,7 +608,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
 	int ret = 0;
 	struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
 	unsigned int block_end, block_start;
-	unsigned int bsize = 1 << inode->i_blkbits;
+	unsigned int bsize = i_blocksize(inode);
 
 	if (!page_has_buffers(page))
 		create_empty_buffers(page, bsize, 0);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 7b6a146327d7..8836305eb378 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -808,7 +808,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 	/* We know that zero_from is block aligned */
 	for (block_start = zero_from; block_start < zero_to;
 	     block_start = block_end) {
-		block_end = block_start + (1 << inode->i_blkbits);
+		block_end = block_start + i_blocksize(inode);
 
 		/*
 		 * block_start is block-aligned.  Bump it by one to force
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index 06af81f71e10..9b96b99539d6 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -306,7 +306,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
 		break;
 	case S_IFDIR:
 		inode->i_size = PAGE_SIZE;
-		orangefs_inode->blksize = (1 << inode->i_blkbits);
+		orangefs_inode->blksize = i_blocksize(inode);
 		spin_lock(&inode->i_lock);
 		inode_set_bytes(inode, inode->i_size);
 		spin_unlock(&inode->i_lock);
@@ -316,7 +316,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
 		if (new) {
 			inode->i_size = (loff_t)strlen(new_op->
 			    downcall.resp.getattr.link_target);
-			orangefs_inode->blksize = (1 << inode->i_blkbits);
+			orangefs_inode->blksize = i_blocksize(inode);
 			ret = strscpy(orangefs_inode->link_target,
 			    new_op->downcall.resp.getattr.link_target,
 			    ORANGEFS_NAME_MAX);
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 2f8c5c9bdaf6..b396eb09f288 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -189,7 +189,7 @@ int reiserfs_commit_page(struct inode *inode, struct page *page,
 	int ret = 0;
 
 	th.t_trans_id = 0;
-	blocksize = 1 << inode->i_blkbits;
+	blocksize = i_blocksize(inode);
 
 	if (logit) {
 		reiserfs_write_lock(s);
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index cfeae9b0a2b7..a6ab9d64ea1b 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -525,7 +525,7 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode,
 	 * referenced in convert_tail_for_hole() that may be called from
 	 * reiserfs_get_block()
 	 */
-	bh_result->b_size = (1 << inode->i_blkbits);
+	bh_result->b_size = i_blocksize(inode);
 
 	ret = reiserfs_get_block(inode, iblock, bh_result,
 				 create | GET_BLOCK_NO_DANGLE);
diff --git a/fs/stat.c b/fs/stat.c
index a268b7f27adf..3f14d1ef0868 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -31,7 +31,7 @@ void generic_fillattr(struct inode *inode, struct kstat *stat)
 	stat->atime = inode->i_atime;
 	stat->mtime = inode->i_mtime;
 	stat->ctime = inode->i_ctime;
-	stat->blksize = (1 << inode->i_blkbits);
+	stat->blksize = i_blocksize(inode);
 	stat->blocks = inode->i_blocks;
 }
 
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 8ec6b3df0bc7..a8d8f71ef8bd 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1193,7 +1193,7 @@ int udf_setsize(struct inode *inode, loff_t newsize)
 {
 	int err;
 	struct udf_inode_info *iinfo;
-	int bsize = 1 << inode->i_blkbits;
+	int bsize = i_blocksize(inode);
 
 	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
 	      S_ISLNK(inode->i_mode)))
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 1ff9df7a3ce8..bf65a9ea8642 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -103,9 +103,9 @@ xfs_finish_page_writeback(
 	unsigned int		bsize;
 
 	ASSERT(bvec->bv_offset < PAGE_SIZE);
-	ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0);
+	ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0);
 	ASSERT(end < PAGE_SIZE);
-	ASSERT((bvec->bv_len & ((1 << inode->i_blkbits) - 1)) == 0);
+	ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
 
 	bh = head = page_buffers(bvec->bv_page);
 
@@ -349,7 +349,7 @@ xfs_map_blocks(
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
-	ssize_t			count = 1 << inode->i_blkbits;
+	ssize_t			count = i_blocksize(inode);
 	xfs_fileoff_t		offset_fsb, end_fsb;
 	int			error = 0;
 	int			bmapi_flags = XFS_BMAPI_ENTIRE;
@@ -758,7 +758,7 @@ xfs_aops_discard_page(
 			break;
 		}
 next_buffer:
-		offset += 1 << inode->i_blkbits;
+		offset += i_blocksize(inode);
 
 	} while ((bh = bh->b_this_page) != head);
 
@@ -846,7 +846,7 @@ xfs_writepage_map(
 	LIST_HEAD(submit_list);
 	struct xfs_ioend	*ioend, *next;
 	struct buffer_head	*bh, *head;
-	ssize_t			len = 1 << inode->i_blkbits;
+	ssize_t			len = i_blocksize(inode);
 	int			error = 0;
 	int			count = 0;
 	int			uptodate = 1;
@@ -1210,7 +1210,7 @@ xfs_map_trim_size(
 	    offset + mapping_size >= i_size_read(inode)) {
 		/* limit mapping to block that spans EOF */
 		mapping_size = roundup_64(i_size_read(inode) - offset,
-					  1 << inode->i_blkbits);
+					  i_blocksize(inode));
 	}
 	if (mapping_size > LONG_MAX)
 		mapping_size = LONG_MAX;
@@ -1241,7 +1241,7 @@ xfs_get_blocks(
 		return -EIO;
 
 	offset = (xfs_off_t)iblock << inode->i_blkbits;
-	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
+	ASSERT(bh_result->b_size >= i_blocksize(inode));
 	size = bh_result->b_size;
 
 	if (offset >= i_size_read(inode))
@@ -1389,7 +1389,7 @@ xfs_vm_set_page_dirty(
 			if (offset < end_offset)
 				set_buffer_dirty(bh);
 			bh = bh->b_this_page;
-			offset += 1 << inode->i_blkbits;
+			offset += i_blocksize(inode);
 		} while (bh != head);
 	}
 	/*
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index a50eca676670..35703a801372 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -754,7 +754,7 @@ xfs_file_fallocate(
 		if (error)
 			goto out_unlock;
 	} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
-		unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
+		unsigned int blksize_mask = i_blocksize(inode) - 1;
 
 		if (offset & blksize_mask || len & blksize_mask) {
 			error = -EINVAL;
@@ -776,7 +776,7 @@ xfs_file_fallocate(
 		if (error)
 			goto out_unlock;
 	} else if (mode & FALLOC_FL_INSERT_RANGE) {
-		unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
+		unsigned int blksize_mask = i_blocksize(inode) - 1;
 
 		new_size = i_size_read(inode) + len;
 		if (offset & blksize_mask || len & blksize_mask) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c930cbc19342..c64f2cb7d364 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -655,6 +655,11 @@ struct inode {
 	void			*i_private; /* fs or device private pointer */
 };
 
+static inline unsigned int i_blocksize(const struct inode *node)
+{
+	return (1 << node->i_blkbits);
+}
+
 static inline int inode_unhashed(struct inode *inode)
 {
 	return hlist_unhashed(&inode->i_hash);
diff --git a/mm/truncate.c b/mm/truncate.c
index f2db67465495..6263affdef88 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -786,7 +786,7 @@ EXPORT_SYMBOL(truncate_setsize);
  */
 void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
 {
-	int bsize = 1 << inode->i_blkbits;
+	int bsize = i_blocksize(inode);
 	loff_t rounded_from;
 	struct page *page;
 	pgoff_t index;
-- 
cgit v1.2.3


From 03440c4e5e2f167764997a7e0f2dbb279d8078e6 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Mon, 27 Feb 2017 14:28:49 -0800
Subject: scripts/spelling.txt: add "an union" pattern and fix typo instances

Fix typos and add the following to the scripts/spelling.txt:

  an union||a union

Link: http://lkml.kernel.org/r/1481573103-11329-5-git-send-email-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/media/uapi/dvb/dvb-frontend-parameters.rst  | 4 ++--
 arch/x86/include/asm/desc_defs.h                          | 2 +-
 drivers/acpi/acpica/dbconvert.c                           | 2 +-
 drivers/acpi/acpica/nspredef.c                            | 2 +-
 drivers/acpi/acpica/nsxfeval.c                            | 4 ++--
 drivers/staging/lustre/lustre/include/lustre/lustre_idl.h | 2 +-
 include/linux/dcache.h                                    | 4 ++--
 include/media/v4l2-ctrls.h                                | 4 ++--
 include/xen/interface/grant_table.h                       | 2 +-
 scripts/spelling.txt                                      | 1 +
 sound/pci/ice1712/wm8766.c                                | 2 +-
 sound/pci/ice1712/wm8776.c                                | 2 +-
 tools/perf/util/probe-finder.c                            | 4 ++--
 tools/perf/util/sort.h                                    | 2 +-
 14 files changed, 19 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/media/uapi/dvb/dvb-frontend-parameters.rst b/Documentation/media/uapi/dvb/dvb-frontend-parameters.rst
index bf31411fc9df..899fd5c3545e 100644
--- a/Documentation/media/uapi/dvb/dvb-frontend-parameters.rst
+++ b/Documentation/media/uapi/dvb/dvb-frontend-parameters.rst
@@ -9,7 +9,7 @@ frontend parameters
 The kind of parameters passed to the frontend device for tuning depend
 on the kind of hardware you are using.
 
-The struct ``dvb_frontend_parameters`` uses an union with specific
+The struct ``dvb_frontend_parameters`` uses a union with specific
 per-system parameters. However, as newer delivery systems required more
 data, the structure size weren't enough to fit, and just extending its
 size would break the existing applications. So, those parameters were
@@ -23,7 +23,7 @@ So, newer applications should use
 instead, in order to be able to support the newer System Delivery like
 DVB-S2, DVB-T2, DVB-C2, ISDB, etc.
 
-All kinds of parameters are combined as an union in the
+All kinds of parameters are combined as a union in the
 FrontendParameters structure:
 
 
diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h
index eb5deb42484d..49265345d4d2 100644
--- a/arch/x86/include/asm/desc_defs.h
+++ b/arch/x86/include/asm/desc_defs.h
@@ -15,7 +15,7 @@
  * FIXME: Accessing the desc_struct through its fields is more elegant,
  * and should be the one valid thing to do. However, a lot of open code
  * still touches the a and b accessors, and doing this allow us to do it
- * incrementally. We keep the signature as a struct, rather than an union,
+ * incrementally. We keep the signature as a struct, rather than a union,
  * so we can get rid of it transparently in the future -- glommer
  */
 /* 8 byte segment descriptor */
diff --git a/drivers/acpi/acpica/dbconvert.c b/drivers/acpi/acpica/dbconvert.c
index 251f9477a984..857dbc43a9b1 100644
--- a/drivers/acpi/acpica/dbconvert.c
+++ b/drivers/acpi/acpica/dbconvert.c
@@ -242,7 +242,7 @@ acpi_status acpi_db_convert_to_package(char *string, union acpi_object *object)
  *
  * RETURN:      Status
  *
- * DESCRIPTION: Convert a typed and tokenized string to an union acpi_object. Typing:
+ * DESCRIPTION: Convert a typed and tokenized string to a union acpi_object. Typing:
  *              1) String objects were surrounded by quotes.
  *              2) Buffer objects were surrounded by parentheses.
  *              3) Package objects were surrounded by brackets "[]".
diff --git a/drivers/acpi/acpica/nspredef.c b/drivers/acpi/acpica/nspredef.c
index 3dbbecf22087..9d14b509529e 100644
--- a/drivers/acpi/acpica/nspredef.c
+++ b/drivers/acpi/acpica/nspredef.c
@@ -323,7 +323,7 @@ acpi_ns_check_reference(struct acpi_evaluate_info *info,
 
 	/*
 	 * Check the reference object for the correct reference type (opcode).
-	 * The only type of reference that can be converted to an union acpi_object is
+	 * The only type of reference that can be converted to a union acpi_object is
 	 * a reference to a named object (reference class: NAME)
 	 */
 	if (return_object->reference.class == ACPI_REFCLASS_NAME) {
diff --git a/drivers/acpi/acpica/nsxfeval.c b/drivers/acpi/acpica/nsxfeval.c
index 8e365c0e766b..c944ff5c9c3d 100644
--- a/drivers/acpi/acpica/nsxfeval.c
+++ b/drivers/acpi/acpica/nsxfeval.c
@@ -495,9 +495,9 @@ static void acpi_ns_resolve_references(struct acpi_evaluate_info *info)
 	/*
 	 * Two types of references are supported - those created by Index and
 	 * ref_of operators. A name reference (AML_NAMEPATH_OP) can be converted
-	 * to an union acpi_object, so it is not dereferenced here. A ddb_handle
+	 * to a union acpi_object, so it is not dereferenced here. A ddb_handle
 	 * (AML_LOAD_OP) cannot be dereferenced, nor can it be converted to
-	 * an union acpi_object.
+	 * a union acpi_object.
 	 */
 	switch (info->return_object->reference.class) {
 	case ACPI_REFCLASS_INDEX:
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h b/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h
index b0eb80d70c23..60b827eeefe2 100644
--- a/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h
+++ b/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h
@@ -1704,7 +1704,7 @@ struct ost_lvb {
  *   lquota data structures
  */
 
-/* The lquota_id structure is an union of all the possible identifier types that
+/* The lquota_id structure is a union of all the possible identifier types that
  * can be used with quota, this includes:
  * - 64-bit user ID
  * - 64-bit group ID
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index c965e4469499..591b6c16f9c1 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -562,7 +562,7 @@ static inline struct dentry *d_backing_dentry(struct dentry *upper)
  * @inode: inode to select the dentry from multiple layers (can be NULL)
  * @flags: open flags to control copy-up behavior
  *
- * If dentry is on an union/overlay, then return the underlying, real dentry.
+ * If dentry is on a union/overlay, then return the underlying, real dentry.
  * Otherwise return the dentry itself.
  *
  * See also: Documentation/filesystems/vfs.txt
@@ -581,7 +581,7 @@ static inline struct dentry *d_real(struct dentry *dentry,
  * d_real_inode - Return the real inode
  * @dentry: The dentry to query
  *
- * If dentry is on an union/overlay, then return the underlying, real inode.
+ * If dentry is on a union/overlay, then return the underlying, real inode.
  * Otherwise return d_inode().
  */
 static inline struct inode *d_real_inode(const struct dentry *dentry)
diff --git a/include/media/v4l2-ctrls.h b/include/media/v4l2-ctrls.h
index e1006b391cdc..bee1404391dd 100644
--- a/include/media/v4l2-ctrls.h
+++ b/include/media/v4l2-ctrls.h
@@ -174,10 +174,10 @@ typedef void (*v4l2_ctrl_notify_fnc)(struct v4l2_ctrl *ctrl, void *priv);
  *		not freed when the control is deleted. Should this be needed
  *		then a new internal bitfield can be added to tell the framework
  *		to free this pointer.
- * @p_cur:	The control's current value represented via an union with
+ * @p_cur:	The control's current value represented via a union with
  *		provides a standard way of accessing control types
  *		through a pointer.
- * @p_new:	The control's new value represented via an union with provides
+ * @p_new:	The control's new value represented via a union with provides
  *		a standard way of accessing control types
  *		through a pointer.
  */
diff --git a/include/xen/interface/grant_table.h b/include/xen/interface/grant_table.h
index 56806bc90c2f..7fb7112d667c 100644
--- a/include/xen/interface/grant_table.h
+++ b/include/xen/interface/grant_table.h
@@ -181,7 +181,7 @@ struct grant_entry_header {
 };
 
 /*
- * Version 2 of the grant entry structure, here is an union because three
+ * Version 2 of the grant entry structure, here is a union because three
  * different types are suppotted: full_page, sub_page and transitive.
  */
 union grant_entry_v2 {
diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index 13794532c3fa..27991a91de6f 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -86,6 +86,7 @@ alue||value
 ambigious||ambiguous
 amoung||among
 amout||amount
+an union||a union
 an user||a user
 an userspace||a userspace
 analysator||analyzer
diff --git a/sound/pci/ice1712/wm8766.c b/sound/pci/ice1712/wm8766.c
index f7ac8d5e862c..27c03e40c9b1 100644
--- a/sound/pci/ice1712/wm8766.c
+++ b/sound/pci/ice1712/wm8766.c
@@ -254,7 +254,7 @@ static int snd_wm8766_ctl_put(struct snd_kcontrol *kcontrol,
 	int n = kcontrol->private_value;
 	u16 val, regval1, regval2;
 
-	/* this also works for enum because value is an union */
+	/* this also works for enum because value is a union */
 	regval1 = ucontrol->value.integer.value[0];
 	regval2 = ucontrol->value.integer.value[1];
 	if (wm->ctl[n].flags & WM8766_FLAG_INVERT) {
diff --git a/sound/pci/ice1712/wm8776.c b/sound/pci/ice1712/wm8776.c
index ebd2fe4b4a57..553669b103c2 100644
--- a/sound/pci/ice1712/wm8776.c
+++ b/sound/pci/ice1712/wm8776.c
@@ -528,7 +528,7 @@ static int snd_wm8776_ctl_put(struct snd_kcontrol *kcontrol,
 	int n = kcontrol->private_value;
 	u16 val, regval1, regval2;
 
-	/* this also works for enum because value is an union */
+	/* this also works for enum because value is a union */
 	regval1 = ucontrol->value.integer.value[0];
 	regval2 = ucontrol->value.integer.value[1];
 	if (wm->ctl[n].flags & WM8776_FLAG_INVERT) {
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 0d9d6e0803b8..57cd268d4275 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -464,7 +464,7 @@ static int convert_variable_fields(Dwarf_Die *vr_die, const char *varname,
 		/* Verify it is a data structure  */
 		tag = dwarf_tag(&type);
 		if (tag != DW_TAG_structure_type && tag != DW_TAG_union_type) {
-			pr_warning("%s is not a data structure nor an union.\n",
+			pr_warning("%s is not a data structure nor a union.\n",
 				   varname);
 			return -EINVAL;
 		}
@@ -479,7 +479,7 @@ static int convert_variable_fields(Dwarf_Die *vr_die, const char *varname,
 	} else {
 		/* Verify it is a data structure  */
 		if (tag != DW_TAG_structure_type && tag != DW_TAG_union_type) {
-			pr_warning("%s is not a data structure nor an union.\n",
+			pr_warning("%s is not a data structure nor a union.\n",
 				   varname);
 			return -EINVAL;
 		}
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 7aff317fc7c4..796c847e2f00 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -108,7 +108,7 @@ struct hist_entry {
 		/*
 		 * Since perf diff only supports the stdio output, TUI
 		 * fields are only accessed from perf report (or perf
-		 * top).  So make it an union to reduce memory usage.
+		 * top).  So make it a union to reduce memory usage.
 		 */
 		struct hist_entry_diff	diff;
 		struct /* for TUI */ {
-- 
cgit v1.2.3


From 8ab102d60a0c19df602f3758848d55f0703bf9bb Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Mon, 27 Feb 2017 14:28:55 -0800
Subject: scripts/spelling.txt: add "partiton" pattern and fix typo instances

Fix typos and add the following to the scripts/spelling.txt:

  partiton||partition

Link: http://lkml.kernel.org/r/1481573103-11329-7-git-send-email-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/include/asm/mach/flash.h                   | 2 +-
 arch/ia64/sn/kernel/sn2/sn_hwperf.c                 | 2 +-
 arch/powerpc/boot/dts/fsl/mpc8569mds.dts            | 2 +-
 arch/powerpc/include/asm/book3s/64/mmu.h            | 2 +-
 arch/powerpc/include/asm/fsl_hcalls.h               | 2 +-
 arch/powerpc/platforms/pseries/iommu.c              | 2 +-
 arch/sparc/kernel/visemul.c                         | 2 +-
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c | 2 +-
 drivers/watchdog/bcm2835_wdt.c                      | 6 +++---
 include/linux/mtd/qinfo.h                           | 2 +-
 include/linux/spi/flash.h                           | 2 +-
 11 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/mach/flash.h b/arch/arm/include/asm/mach/flash.h
index 4ca69fe2c850..bada3f845a97 100644
--- a/arch/arm/include/asm/mach/flash.h
+++ b/arch/arm/include/asm/mach/flash.h
@@ -22,7 +22,7 @@ struct mtd_info;
  * set_vpp:	method called to enable or disable VPP
  * mmcontrol:	method called to enable or disable Sync. Burst Read in OneNAND
  * parts:	optional array of mtd_partitions for static partitioning
- * nr_parts:	number of mtd_partitions for static partitoning
+ * nr_parts:	number of mtd_partitions for static partitioning
  */
 struct flash_platform_data {
 	const char	*map_name;
diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
index 4c3b84d8406a..52704f199dd6 100644
--- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c
+++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
@@ -525,7 +525,7 @@ static int sn_topology_show(struct seq_file *s, void *d)
 				/* both ends local to this partition */
 				seq_puts(s, " local");
 			else if (SN_HWPERF_FOREIGN(p))
-				/* both ends of the link in foreign partiton */
+				/* both ends of the link in foreign partition */
 				seq_puts(s, " foreign");
 			else
 				/* link straddles a partition */
diff --git a/arch/powerpc/boot/dts/fsl/mpc8569mds.dts b/arch/powerpc/boot/dts/fsl/mpc8569mds.dts
index 8e94448f296c..76b2bd6f7742 100644
--- a/arch/powerpc/boot/dts/fsl/mpc8569mds.dts
+++ b/arch/powerpc/boot/dts/fsl/mpc8569mds.dts
@@ -55,7 +55,7 @@
 				label = "kernel";
 				reg = <0x01c00000 0x002e0000>;
 			};
-			partiton@1ee0000 {
+			partition@1ee0000 {
 				label = "dtb";
 				reg = <0x01ee0000 0x00020000>;
 			};
diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index d73e9dfa5237..1145dc8e726d 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -30,7 +30,7 @@ extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
 
 #ifndef __ASSEMBLY__
 /*
- * ISA 3.0 partiton and process table entry format
+ * ISA 3.0 partition and process table entry format
  */
 struct prtb_entry {
 	__be64 prtb0;
diff --git a/arch/powerpc/include/asm/fsl_hcalls.h b/arch/powerpc/include/asm/fsl_hcalls.h
index 3abb58394da4..b889d13547fd 100644
--- a/arch/powerpc/include/asm/fsl_hcalls.h
+++ b/arch/powerpc/include/asm/fsl_hcalls.h
@@ -109,7 +109,7 @@ static inline unsigned int fh_send_nmi(unsigned int vcpu_mask)
 #define FH_DTPROP_MAX_PROPLEN 32768
 
 /**
- * fh_partiton_get_dtprop - get a property from a guest device tree.
+ * fh_partition_get_dtprop - get a property from a guest device tree.
  * @handle: handle of partition whose device tree is to be accessed
  * @dtpath_addr: physical address of device tree path to access
  * @propname_addr: physical address of name of property
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 0024e451bb36..4d757eaa46bf 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -1020,7 +1020,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	/* check largest block * page size > max memory hotplug addr */
 	max_addr = memory_hotplug_max();
 	if (query.largest_available_block < (max_addr >> page_shift)) {
-		dev_dbg(&dev->dev, "can't map partiton max 0x%llx with %u "
+		dev_dbg(&dev->dev, "can't map partition max 0x%llx with %u "
 			  "%llu-sized pages\n", max_addr,  query.largest_available_block,
 			  1ULL << page_shift);
 		goto out_failed;
diff --git a/arch/sparc/kernel/visemul.c b/arch/sparc/kernel/visemul.c
index c4ac58e483a4..8f35eea2103a 100644
--- a/arch/sparc/kernel/visemul.c
+++ b/arch/sparc/kernel/visemul.c
@@ -30,7 +30,7 @@
 /* 001001011 - two 32-bit merges */
 #define FPMERGE_OPF	0x04b
 
-/* 000110001 - 8-by-16-bit partitoned product  */
+/* 000110001 - 8-by-16-bit partitioned product  */
 #define FMUL8x16_OPF	0x031
 
 /* 000110011 - 8-by-16-bit upper alpha partitioned product  */
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
index 99b187bfdd55..718bf58a7da6 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
@@ -178,7 +178,7 @@ const u32 qlcnic_83xx_reg_tbl[] = {
 	0x3540,		/* Device state, DRV_REG1 */
 	0x3544,		/* Driver state, DRV_REG2 */
 	0x3548,		/* Driver scratch, DRV_REG3 */
-	0x354C,		/* Device partiton info, DRV_REG4 */
+	0x354C,		/* Device partition info, DRV_REG4 */
 	0x3524,		/* Driver IDC ver, DRV_REG5 */
 	0x3550,		/* FW_VER_MAJOR */
 	0x3554,		/* FW_VER_MINOR */
diff --git a/drivers/watchdog/bcm2835_wdt.c b/drivers/watchdog/bcm2835_wdt.c
index 496f6c106bb1..b339e0e67b4c 100644
--- a/drivers/watchdog/bcm2835_wdt.c
+++ b/drivers/watchdog/bcm2835_wdt.c
@@ -36,9 +36,9 @@
 #define PM_RSTC_RESET			0x00000102
 
 /*
- * The Raspberry Pi firmware uses the RSTS register to know which partiton
- * to boot from. The partiton value is spread into bits 0, 2, 4, 6, 8, 10.
- * Partiton 63 is a special partition used by the firmware to indicate halt.
+ * The Raspberry Pi firmware uses the RSTS register to know which partition
+ * to boot from. The partition value is spread into bits 0, 2, 4, 6, 8, 10.
+ * Partition 63 is a special partition used by the firmware to indicate halt.
  */
 #define PM_RSTS_RASPBERRYPI_HALT	0x555
 
diff --git a/include/linux/mtd/qinfo.h b/include/linux/mtd/qinfo.h
index 7b3d487d8b3f..b532ce524dae 100644
--- a/include/linux/mtd/qinfo.h
+++ b/include/linux/mtd/qinfo.h
@@ -14,7 +14,7 @@
  * @DevId - Chip Device ID
  * @qinfo - pointer to qinfo records describing the chip
  * @numchips - number of chips including virual RWW partitions
- * @chipshift - Chip/partiton size 2^chipshift
+ * @chipshift - Chip/partition size 2^chipshift
  * @chips - per-chip data structure
  */
 struct lpddr_private {
diff --git a/include/linux/spi/flash.h b/include/linux/spi/flash.h
index 3f22932e67a4..f4199e758f97 100644
--- a/include/linux/spi/flash.h
+++ b/include/linux/spi/flash.h
@@ -7,7 +7,7 @@ struct mtd_partition;
  * struct flash_platform_data: board-specific flash data
  * @name: optional flash device name (eg, as used with mtdparts=)
  * @parts: optional array of mtd_partitions for static partitioning
- * @nr_parts: number of mtd_partitions for static partitoning
+ * @nr_parts: number of mtd_partitions for static partitioning
  * @type: optional flash device type (e.g. m25p80 vs m25p64), for use
  *	with chips that can't be queried for JEDEC or other IDs
  *
-- 
cgit v1.2.3


From 4091fb95b5f8dea37568d1a94c8227244bade891 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Mon, 27 Feb 2017 14:29:56 -0800
Subject: scripts/spelling.txt: add "followings" pattern and fix typo instances

Fix typos and add the following to the scripts/spelling.txt:

  followings||following

While we are here, add a missing colon in the boilerplate in DT binding
documents.  The "you SoC" in allwinner,sunxi-pinctrl.txt was fixed as
well.

I reworded "as the followings:" to "as follows:" for
drivers/usb/gadget/udc/renesas_usb3.c.

Link: http://lkml.kernel.org/r/1481573103-11329-32-git-send-email-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/DocBook/libata.tmpl                                     | 2 +-
 Documentation/acpi/method-tracing.txt                                 | 2 +-
 Documentation/blockdev/mflash.txt                                     | 2 +-
 Documentation/devicetree/bindings/net/marvell-armada-370-neta.txt     | 2 +-
 Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt | 2 +-
 Documentation/devicetree/bindings/soc/rockchip/grf.txt                | 4 ++--
 Documentation/devicetree/bindings/sound/rockchip-i2s.txt              | 4 ++--
 Documentation/devicetree/bindings/sound/sun4i-codec.txt               | 2 +-
 Documentation/devicetree/bindings/sound/sun4i-i2s.txt                 | 4 ++--
 Documentation/memory-hotplug.txt                                      | 4 ++--
 Documentation/sound/hd-audio/notes.rst                                | 2 +-
 drivers/ata/libata-eh.c                                               | 2 +-
 drivers/atm/iphase.c                                                  | 2 +-
 drivers/atm/iphase.h                                                  | 2 +-
 drivers/devfreq/devfreq.c                                             | 2 +-
 drivers/hwmon/g762.c                                                  | 2 +-
 drivers/isdn/hardware/eicon/debug.c                                   | 2 +-
 drivers/pci/quirks.c                                                  | 2 +-
 drivers/staging/gs_fpgaboot/gs_fpgaboot.h                             | 2 +-
 drivers/usb/gadget/udc/renesas_usb3.c                                 | 4 ++--
 include/linux/kconfig.h                                               | 2 +-
 mm/percpu.c                                                           | 2 +-
 scripts/spelling.txt                                                  | 1 +
 sound/pci/hda/patch_ca0132.c                                          | 2 +-
 sound/ppc/snd_ps3.c                                                   | 2 +-
 sound/soc/fsl/fsl_asrc.c                                              | 2 +-
 26 files changed, 31 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/DocBook/libata.tmpl b/Documentation/DocBook/libata.tmpl
index d7fcdc5a4379..0320910b866d 100644
--- a/Documentation/DocBook/libata.tmpl
+++ b/Documentation/DocBook/libata.tmpl
@@ -1020,7 +1020,7 @@ and other resources, etc.
 	</itemizedlist>
 
 	<para>
-	Of errors detected as above, the followings are not ATA/ATAPI
+	Of errors detected as above, the following are not ATA/ATAPI
 	device errors but ATA bus errors and should be handled
 	according to <xref linkend="excatATAbusErr"/>.
 	</para>
diff --git a/Documentation/acpi/method-tracing.txt b/Documentation/acpi/method-tracing.txt
index c2505eefc878..0aba14c8f459 100644
--- a/Documentation/acpi/method-tracing.txt
+++ b/Documentation/acpi/method-tracing.txt
@@ -152,7 +152,7 @@ tracing facility.
 	Users can enable/disable this debug tracing feature by executing
 	the following command:
 	    # echo string > /sys/module/acpi/parameters/trace_state
-	Where "string" should be one of the followings:
+	Where "string" should be one of the following:
 	"disable"
 	    Disable the method tracing feature.
 	"enable"
diff --git a/Documentation/blockdev/mflash.txt b/Documentation/blockdev/mflash.txt
index 1f610ecf698a..f7e050551487 100644
--- a/Documentation/blockdev/mflash.txt
+++ b/Documentation/blockdev/mflash.txt
@@ -17,7 +17,7 @@ driver and currently works well under standard IDE subsystem. Actually it's
 one chip SSD. IO mode is ATA-like custom mode for the host that doesn't have
 IDE interface.
 
-Followings are brief descriptions about IO mode.
+Following are brief descriptions about IO mode.
 A. IO mode based on ATA protocol and uses some custom command. (read confirm,
 write confirm)
 B. IO mode uses SRAM bus interface.
diff --git a/Documentation/devicetree/bindings/net/marvell-armada-370-neta.txt b/Documentation/devicetree/bindings/net/marvell-armada-370-neta.txt
index 7aa840c8768d..ae4234ca4ee4 100644
--- a/Documentation/devicetree/bindings/net/marvell-armada-370-neta.txt
+++ b/Documentation/devicetree/bindings/net/marvell-armada-370-neta.txt
@@ -1,7 +1,7 @@
 * Marvell Armada 370 / Armada XP / Armada 3700 Ethernet Controller (NETA)
 
 Required properties:
-- compatible: could be one of the followings
+- compatible: could be one of the following:
 	"marvell,armada-370-neta"
 	"marvell,armada-xp-neta"
 	"marvell,armada-3700-neta"
diff --git a/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt
index 7c85dca4221a..2fd688c8dbdb 100644
--- a/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt
@@ -6,7 +6,7 @@ the first two functions being GPIO in and out. The configuration on
 the pins includes drive strength and pull-up.
 
 Required properties:
-- compatible: Should be one of the followings (depending on you SoC):
+- compatible: Should be one of the following (depending on your SoC):
   "allwinner,sun4i-a10-pinctrl"
   "allwinner,sun5i-a10s-pinctrl"
   "allwinner,sun5i-a13-pinctrl"
diff --git a/Documentation/devicetree/bindings/soc/rockchip/grf.txt b/Documentation/devicetree/bindings/soc/rockchip/grf.txt
index c6e62cb30712..a0685c209218 100644
--- a/Documentation/devicetree/bindings/soc/rockchip/grf.txt
+++ b/Documentation/devicetree/bindings/soc/rockchip/grf.txt
@@ -10,7 +10,7 @@ From RK3368 SoCs, the GRF is divided into two sections,
 
 Required Properties:
 
-- compatible: GRF should be one of the followings
+- compatible: GRF should be one of the following:
    - "rockchip,rk3036-grf", "syscon": for rk3036
    - "rockchip,rk3066-grf", "syscon": for rk3066
    - "rockchip,rk3188-grf", "syscon": for rk3188
@@ -18,7 +18,7 @@ Required Properties:
    - "rockchip,rk3288-grf", "syscon": for rk3288
    - "rockchip,rk3368-grf", "syscon": for rk3368
    - "rockchip,rk3399-grf", "syscon": for rk3399
-- compatible: PMUGRF should be one of the followings
+- compatible: PMUGRF should be one of the following:
    - "rockchip,rk3368-pmugrf", "syscon": for rk3368
    - "rockchip,rk3399-pmugrf", "syscon": for rk3399
 - compatible: SGRF should be one of the following
diff --git a/Documentation/devicetree/bindings/sound/rockchip-i2s.txt b/Documentation/devicetree/bindings/sound/rockchip-i2s.txt
index 4ea29aa9af59..a6600f6dea64 100644
--- a/Documentation/devicetree/bindings/sound/rockchip-i2s.txt
+++ b/Documentation/devicetree/bindings/sound/rockchip-i2s.txt
@@ -5,7 +5,7 @@ audio data transfer between devices in the system.
 
 Required properties:
 
-- compatible: should be one of the followings
+- compatible: should be one of the following:
    - "rockchip,rk3066-i2s": for rk3066
    - "rockchip,rk3188-i2s", "rockchip,rk3066-i2s": for rk3188
    - "rockchip,rk3288-i2s", "rockchip,rk3066-i2s": for rk3288
@@ -17,7 +17,7 @@ Required properties:
 	Documentation/devicetree/bindings/dma/dma.txt
 - dma-names: should include "tx" and "rx".
 - clocks: a list of phandle + clock-specifer pairs, one for each entry in clock-names.
-- clock-names: should contain followings:
+- clock-names: should contain the following:
    - "i2s_hclk": clock for I2S BUS
    - "i2s_clk" : clock for I2S controller
 - rockchip,playback-channels: max playback channels, if not set, 8 channels default.
diff --git a/Documentation/devicetree/bindings/sound/sun4i-codec.txt b/Documentation/devicetree/bindings/sound/sun4i-codec.txt
index 3033bd8aab0f..3863531d1e6d 100644
--- a/Documentation/devicetree/bindings/sound/sun4i-codec.txt
+++ b/Documentation/devicetree/bindings/sound/sun4i-codec.txt
@@ -14,7 +14,7 @@ Required properties:
 - dma-names: should include "tx" and "rx".
 - clocks: a list of phandle + clock-specifer pairs, one for each entry
   in clock-names.
-- clock-names: should contain followings:
+- clock-names: should contain the following:
    - "apb": the parent APB clock for this controller
    - "codec": the parent module clock
 
diff --git a/Documentation/devicetree/bindings/sound/sun4i-i2s.txt b/Documentation/devicetree/bindings/sound/sun4i-i2s.txt
index f4adc58f82ba..ee21da865771 100644
--- a/Documentation/devicetree/bindings/sound/sun4i-i2s.txt
+++ b/Documentation/devicetree/bindings/sound/sun4i-i2s.txt
@@ -5,7 +5,7 @@ audio data transfer between devices in the system.
 
 Required properties:
 
-- compatible: should be one of the followings
+- compatible: should be one of the following:
    - "allwinner,sun4i-a10-i2s"
    - "allwinner,sun6i-a31-i2s"
 - reg: physical base address of the controller and length of memory mapped
@@ -15,7 +15,7 @@ Required properties:
 	Documentation/devicetree/bindings/dma/dma.txt
 - dma-names: should include "tx" and "rx".
 - clocks: a list of phandle + clock-specifer pairs, one for each entry in clock-names.
-- clock-names: should contain followings:
+- clock-names: should contain the following:
    - "apb" : clock for the I2S bus interface
    - "mod" : module clock for the I2S controller
 - #sound-dai-cells : Must be equal to 0
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index 5de846d3ecc0..670f3ded0802 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -114,11 +114,11 @@ config options.
     Memory model -> Sparse Memory  (CONFIG_SPARSEMEM)
     Allow for memory hot-add       (CONFIG_MEMORY_HOTPLUG)
 
-- To enable memory removal, the followings are also necessary
+- To enable memory removal, the following are also necessary
     Allow for memory hot remove    (CONFIG_MEMORY_HOTREMOVE)
     Page Migration                 (CONFIG_MIGRATION)
 
-- For ACPI memory hotplug, the followings are also necessary
+- For ACPI memory hotplug, the following are also necessary
     Memory hotplug (under ACPI Support menu) (CONFIG_ACPI_HOTPLUG_MEMORY)
     This option can be kernel module.
 
diff --git a/Documentation/sound/hd-audio/notes.rst b/Documentation/sound/hd-audio/notes.rst
index 168d0cfab1ce..9eeb9b468706 100644
--- a/Documentation/sound/hd-audio/notes.rst
+++ b/Documentation/sound/hd-audio/notes.rst
@@ -697,7 +697,7 @@ If it's a regression, at best, send alsa-info outputs of both working
 and non-working kernels.  This is really helpful because we can
 compare the codec registers directly.
 
-Send a bug report either the followings:
+Send a bug report either the following:
 
 kernel-bugzilla
     https://bugzilla.kernel.org/
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 4e5bf36c5f46..ef68232b5222 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -2034,7 +2034,7 @@ static int speed_down_verdict_cb(struct ata_ering_entry *ent, void *void_arg)
  *	This is to expedite speed down decisions right after device is
  *	initially configured.
  *
- *	The followings are speed down rules.  #1 and #2 deal with
+ *	The following are speed down rules.  #1 and #2 deal with
  *	DUBIOUS errors.
  *
  *	1. If more than one DUBIOUS_ATA_BUS or DUBIOUS_TOUT_HSM errors
diff --git a/drivers/atm/iphase.c b/drivers/atm/iphase.c
index 8640bafeb471..a4fa6c82261e 100644
--- a/drivers/atm/iphase.c
+++ b/drivers/atm/iphase.c
@@ -21,7 +21,7 @@
       supports a variety of varients of Interphase ATM PCI (i)Chip adapter 
       card family (See www.iphase.com/products/ClassSheet.cfm?ClassID=ATM) 
       in terms of PHY type, the size of control memory and the size of 
-      packet memory. The followings are the change log and history:
+      packet memory. The following are the change log and history:
      
           Bugfix the Mona's UBR driver.
           Modify the basic memory allocation and dma logic.
diff --git a/drivers/atm/iphase.h b/drivers/atm/iphase.h
index 53ecac5a2161..2beacf2fc1ec 100644
--- a/drivers/atm/iphase.h
+++ b/drivers/atm/iphase.h
@@ -21,7 +21,7 @@
       supports a variety of varients of Interphase ATM PCI (i)Chip adapter 
       card family (See www.iphase.com/products/ClassSheet.cfm?ClassID=ATM) 
       in terms of PHY type, the size of control memory and the size of 
-      packet memory. The followings are the change log and history:
+      packet memory. The following are the change log and history:
      
           Bugfix the Mona's UBR driver.
           Modify the basic memory allocation and dma logic.
diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c
index 551a271353d2..dea04871b50d 100644
--- a/drivers/devfreq/devfreq.c
+++ b/drivers/devfreq/devfreq.c
@@ -1228,7 +1228,7 @@ static int __init devfreq_init(void)
 subsys_initcall(devfreq_init);
 
 /*
- * The followings are helper functions for devfreq user device drivers with
+ * The following are helper functions for devfreq user device drivers with
  * OPP framework.
  */
 
diff --git a/drivers/hwmon/g762.c b/drivers/hwmon/g762.c
index 6dca2fd3d303..6d1208b2b6d2 100644
--- a/drivers/hwmon/g762.c
+++ b/drivers/hwmon/g762.c
@@ -861,7 +861,7 @@ static ssize_t fan1_pulses_store(struct device *dev,
  * (i.e. closed or open-loop).
  *
  * Following documentation about hwmon's sysfs interface, a pwm1_enable node
- * should accept followings:
+ * should accept the following:
  *
  *  0 : no fan speed control (i.e. fan at full speed)
  *  1 : manual fan speed control enabled (use pwm[1-*]) (open-loop)
diff --git a/drivers/isdn/hardware/eicon/debug.c b/drivers/isdn/hardware/eicon/debug.c
index 576b7b4a3278..8bc2791bc39c 100644
--- a/drivers/isdn/hardware/eicon/debug.c
+++ b/drivers/isdn/hardware/eicon/debug.c
@@ -2049,7 +2049,7 @@ static int diva_dbg_cmp_key(const char *ref, const char *key) {
 /*
   In case trace filter starts with "C" character then
   all following characters are interpreted as command.
-  Followings commands are available:
+  Following commands are available:
   - single, trace single call at time, independent from CPN/CiPN
 */
 static int diva_mnt_cmp_nmbr(const char *nmbr) {
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index ca77d235867f..f754453fe754 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -3630,7 +3630,7 @@ static int __init pci_apply_final_quirks(void)
 fs_initcall_sync(pci_apply_final_quirks);
 
 /*
- * Followings are device-specific reset methods which can be used to
+ * Following are device-specific reset methods which can be used to
  * reset a single function if other methods (e.g. FLR, PM D0->D3) are
  * not available.
  */
diff --git a/drivers/staging/gs_fpgaboot/gs_fpgaboot.h b/drivers/staging/gs_fpgaboot/gs_fpgaboot.h
index 7b8cc3a25214..cd1eb2c4c940 100644
--- a/drivers/staging/gs_fpgaboot/gs_fpgaboot.h
+++ b/drivers/staging/gs_fpgaboot/gs_fpgaboot.h
@@ -39,7 +39,7 @@ struct fpgaimage {
 	const struct	firmware	*fw_entry;
 
 	/*
-	 * the followings can be read from bitstream,
+	 * the following can be read from bitstream,
 	 * but other image format should have as well
 	 */
 	char	filename[MAX_STR];
diff --git a/drivers/usb/gadget/udc/renesas_usb3.c b/drivers/usb/gadget/udc/renesas_usb3.c
index fb8fc34827ab..2218f91e92a6 100644
--- a/drivers/usb/gadget/udc/renesas_usb3.c
+++ b/drivers/usb/gadget/udc/renesas_usb3.c
@@ -1791,7 +1791,7 @@ static int renesas_usb3_init_ep(struct renesas_usb3 *usb3, struct device *dev,
 
 	dev_dbg(dev, "%s: num_usb3_eps = %d\n", __func__, usb3->num_usb3_eps);
 	/*
-	 * This driver prepares pipes as the followings:
+	 * This driver prepares pipes as follows:
 	 *  - odd pipes = IN pipe
 	 *  - even pipes = OUT pipe (except pipe 0)
 	 */
@@ -1841,7 +1841,7 @@ static void renesas_usb3_init_ram(struct renesas_usb3 *usb3, struct device *dev,
 	memset(basead, 0, sizeof(basead));
 
 	/*
-	 * This driver prepares pipes as the followings:
+	 * This driver prepares pipes as follows:
 	 *  - all pipes = the same size as "ramsize_per_pipe"
 	 * Please refer to the "Method of Specifying RAM Mapping"
 	 */
diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h
index 8f2e059e4d45..4d748603e818 100644
--- a/include/linux/kconfig.h
+++ b/include/linux/kconfig.h
@@ -8,7 +8,7 @@
 
 /*
  * The use of "&&" / "||" is limited in certain expressions.
- * The followings enable to calculate "and" / "or" with macro expansion only.
+ * The following enable to calculate "and" / "or" with macro expansion only.
  */
 #define __and(x, y)			___and(x, y)
 #define ___and(x, y)			____and(__ARG_PLACEHOLDER_##x, y)
diff --git a/mm/percpu.c b/mm/percpu.c
index 0686f566d347..5696039b5c07 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -43,7 +43,7 @@
  * Chunks can be determined from the address using the index field
  * in the page struct. The index field contains a pointer to the chunk.
  *
- * To use this allocator, arch code should do the followings.
+ * To use this allocator, arch code should do the following:
  *
  * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
  *   regular address to percpu pointer and back if they need to be
diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index 56a9860080dc..be1d00307927 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -464,6 +464,7 @@ finsih||finish
 flusing||flushing
 folloing||following
 followign||following
+followings||following
 follwing||following
 forseeable||foreseeable
 forse||force
diff --git a/sound/pci/hda/patch_ca0132.c b/sound/pci/hda/patch_ca0132.c
index 9ec4dba8a793..07a9deb17477 100644
--- a/sound/pci/hda/patch_ca0132.c
+++ b/sound/pci/hda/patch_ca0132.c
@@ -2866,7 +2866,7 @@ static unsigned int ca0132_capture_pcm_delay(struct hda_pcm_stream *info,
 #define CA0132_CODEC_MUTE(xname, nid, dir) \
 	CA0132_CODEC_MUTE_MONO(xname, nid, 3, dir)
 
-/* The followings are for tuning of products */
+/* The following are for tuning of products */
 #ifdef ENABLE_TUNING_CONTROLS
 
 static unsigned int voice_focus_vals_lookup[] = {
diff --git a/sound/ppc/snd_ps3.c b/sound/ppc/snd_ps3.c
index b84d7d34f188..cdd44abfc9e0 100644
--- a/sound/ppc/snd_ps3.c
+++ b/sound/ppc/snd_ps3.c
@@ -883,7 +883,7 @@ static void snd_ps3_audio_set_base_addr(uint64_t ioaddr_start)
 static void snd_ps3_audio_fixup(struct snd_ps3_card_info *card)
 {
 	/*
-	 * avsetting driver seems to never change the followings
+	 * avsetting driver seems to never change the following
 	 * so, init them here once
 	 */
 
diff --git a/sound/soc/fsl/fsl_asrc.c b/sound/soc/fsl/fsl_asrc.c
index 1d82f68305c3..8cfffa70c144 100644
--- a/sound/soc/fsl/fsl_asrc.c
+++ b/sound/soc/fsl/fsl_asrc.c
@@ -368,7 +368,7 @@ static int fsl_asrc_config_pair(struct fsl_asrc_pair *pair)
 	fsl_asrc_set_watermarks(pair, ASRC_INPUTFIFO_THRESHOLD,
 				ASRC_INPUTFIFO_THRESHOLD);
 
-	/* Configure the followings only for Ideal Ratio mode */
+	/* Configure the following only for Ideal Ratio mode */
 	if (!ideal)
 		return 0;
 
-- 
cgit v1.2.3


From f1f1007644ffc8051a4c11427d58b1967ae7b75a Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@oracle.com>
Date: Mon, 27 Feb 2017 14:30:07 -0800
Subject: mm: add new mmgrab() helper

Apart from adding the helper function itself, the rest of the kernel is
converted mechanically using:

  git grep -l 'atomic_inc.*mm_count' | xargs sed -i 's/atomic_inc(&\(.*\)->mm_count);/mmgrab\(\1\);/'
  git grep -l 'atomic_inc.*mm_count' | xargs sed -i 's/atomic_inc(&\(.*\)\.mm_count);/mmgrab\(\&\1\);/'

This is needed for a later patch that hooks into the helper, but might
be a worthwhile cleanup on its own.

(Michal Hocko provided most of the kerneldoc comment.)

Link: http://lkml.kernel.org/r/20161218123229.22952-1-vegard.nossum@oracle.com
Signed-off-by: Vegard Nossum <vegard.nossum@oracle.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/kernel/smp.c                  |  2 +-
 arch/arc/kernel/smp.c                    |  2 +-
 arch/arm/kernel/smp.c                    |  2 +-
 arch/arm64/kernel/smp.c                  |  2 +-
 arch/blackfin/mach-common/smp.c          |  2 +-
 arch/hexagon/kernel/smp.c                |  2 +-
 arch/ia64/kernel/setup.c                 |  2 +-
 arch/m32r/kernel/setup.c                 |  2 +-
 arch/metag/kernel/smp.c                  |  2 +-
 arch/mips/kernel/traps.c                 |  2 +-
 arch/mn10300/kernel/smp.c                |  2 +-
 arch/parisc/kernel/smp.c                 |  2 +-
 arch/powerpc/kernel/smp.c                |  2 +-
 arch/s390/kernel/processor.c             |  2 +-
 arch/score/kernel/traps.c                |  2 +-
 arch/sh/kernel/smp.c                     |  2 +-
 arch/sparc/kernel/leon_smp.c             |  2 +-
 arch/sparc/kernel/smp_64.c               |  2 +-
 arch/sparc/kernel/sun4d_smp.c            |  2 +-
 arch/sparc/kernel/sun4m_smp.c            |  2 +-
 arch/sparc/kernel/traps_32.c             |  2 +-
 arch/sparc/kernel/traps_64.c             |  2 +-
 arch/tile/kernel/smpboot.c               |  2 +-
 arch/x86/kernel/cpu/common.c             |  4 ++--
 arch/xtensa/kernel/smp.c                 |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c |  2 +-
 drivers/gpu/drm/i915/i915_gem_userptr.c  |  2 +-
 drivers/infiniband/hw/hfi1/file_ops.c    |  2 +-
 fs/proc/base.c                           |  4 ++--
 fs/userfaultfd.c                         |  2 +-
 include/linux/sched.h                    | 22 ++++++++++++++++++++++
 kernel/exit.c                            |  2 +-
 kernel/futex.c                           |  2 +-
 kernel/sched/core.c                      |  4 ++--
 mm/khugepaged.c                          |  2 +-
 mm/ksm.c                                 |  2 +-
 mm/mmu_context.c                         |  2 +-
 mm/mmu_notifier.c                        |  2 +-
 mm/oom_kill.c                            |  4 ++--
 virt/kvm/kvm_main.c                      |  2 +-
 40 files changed, 65 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index 46bf263c3153..acb4b146a607 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -144,7 +144,7 @@ smp_callin(void)
 		alpha_mv.smp_callin();
 
 	/* All kernel threads share the same mm context.  */
-	atomic_inc(&init_mm.mm_count);
+	mmgrab(&init_mm);
 	current->active_mm = &init_mm;
 
 	/* inform the notifiers about the new cpu */
diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c
index 2afbafadb6ab..695624181682 100644
--- a/arch/arc/kernel/smp.c
+++ b/arch/arc/kernel/smp.c
@@ -140,7 +140,7 @@ void start_kernel_secondary(void)
 	setup_processor();
 
 	atomic_inc(&mm->mm_users);
-	atomic_inc(&mm->mm_count);
+	mmgrab(mm);
 	current->active_mm = mm;
 	cpumask_set_cpu(cpu, mm_cpumask(mm));
 
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 7dd14e8395e6..c6514ce0fcbc 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -371,7 +371,7 @@ asmlinkage void secondary_start_kernel(void)
 	 * reference and switch to it.
 	 */
 	cpu = smp_processor_id();
-	atomic_inc(&mm->mm_count);
+	mmgrab(mm);
 	current->active_mm = mm;
 	cpumask_set_cpu(cpu, mm_cpumask(mm));
 
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index a8ec5da530af..827d52d78b67 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -222,7 +222,7 @@ asmlinkage void secondary_start_kernel(void)
 	 * All kernel threads share the same mm context; grab a
 	 * reference and switch to it.
 	 */
-	atomic_inc(&mm->mm_count);
+	mmgrab(mm);
 	current->active_mm = mm;
 
 	/*
diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c
index 23c4ef5f8bdc..bc5617ef7128 100644
--- a/arch/blackfin/mach-common/smp.c
+++ b/arch/blackfin/mach-common/smp.c
@@ -308,7 +308,7 @@ void secondary_start_kernel(void)
 
 	/* Attach the new idle task to the global mm. */
 	atomic_inc(&mm->mm_users);
-	atomic_inc(&mm->mm_count);
+	mmgrab(mm);
 	current->active_mm = mm;
 
 	preempt_disable();
diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c
index 983bae7d2665..c02a6455839e 100644
--- a/arch/hexagon/kernel/smp.c
+++ b/arch/hexagon/kernel/smp.c
@@ -162,7 +162,7 @@ void start_secondary(void)
 	);
 
 	/*  Set the memory struct  */
-	atomic_inc(&init_mm.mm_count);
+	mmgrab(&init_mm);
 	current->active_mm = &init_mm;
 
 	cpu = smp_processor_id();
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index c483ece3eb84..d68322966f33 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -994,7 +994,7 @@ cpu_init (void)
 	 */
 	ia64_setreg(_IA64_REG_CR_DCR,  (  IA64_DCR_DP | IA64_DCR_DK | IA64_DCR_DX | IA64_DCR_DR
 					| IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC));
-	atomic_inc(&init_mm.mm_count);
+	mmgrab(&init_mm);
 	current->active_mm = &init_mm;
 	BUG_ON(current->mm);
 
diff --git a/arch/m32r/kernel/setup.c b/arch/m32r/kernel/setup.c
index 136c69f1fb8a..b18bc0bd6544 100644
--- a/arch/m32r/kernel/setup.c
+++ b/arch/m32r/kernel/setup.c
@@ -403,7 +403,7 @@ void __init cpu_init (void)
 	printk(KERN_INFO "Initializing CPU#%d\n", cpu_id);
 
 	/* Set up and load the per-CPU TSS and LDT */
-	atomic_inc(&init_mm.mm_count);
+	mmgrab(&init_mm);
 	current->active_mm = &init_mm;
 	if (current->mm)
 		BUG();
diff --git a/arch/metag/kernel/smp.c b/arch/metag/kernel/smp.c
index bad13232de51..af9cff547a19 100644
--- a/arch/metag/kernel/smp.c
+++ b/arch/metag/kernel/smp.c
@@ -345,7 +345,7 @@ asmlinkage void secondary_start_kernel(void)
 	 * reference and switch to it.
 	 */
 	atomic_inc(&mm->mm_users);
-	atomic_inc(&mm->mm_count);
+	mmgrab(mm);
 	current->active_mm = mm;
 	cpumask_set_cpu(cpu, mm_cpumask(mm));
 	enter_lazy_tlb(mm, current);
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index cb479be31a50..49c6df20672a 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -2232,7 +2232,7 @@ void per_cpu_trap_init(bool is_boot_cpu)
 	if (!cpu_data[cpu].asid_cache)
 		cpu_data[cpu].asid_cache = asid_first_version(cpu);
 
-	atomic_inc(&init_mm.mm_count);
+	mmgrab(&init_mm);
 	current->active_mm = &init_mm;
 	BUG_ON(current->mm);
 	enter_lazy_tlb(&init_mm, current);
diff --git a/arch/mn10300/kernel/smp.c b/arch/mn10300/kernel/smp.c
index 426173c4b0b9..e65b5cc2fa67 100644
--- a/arch/mn10300/kernel/smp.c
+++ b/arch/mn10300/kernel/smp.c
@@ -589,7 +589,7 @@ static void __init smp_cpu_init(void)
 	}
 	printk(KERN_INFO "Initializing CPU#%d\n", cpu_id);
 
-	atomic_inc(&init_mm.mm_count);
+	mmgrab(&init_mm);
 	current->active_mm = &init_mm;
 	BUG_ON(current->mm);
 
diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c
index 75dab2871346..67b452b41ff6 100644
--- a/arch/parisc/kernel/smp.c
+++ b/arch/parisc/kernel/smp.c
@@ -279,7 +279,7 @@ smp_cpu_init(int cpunum)
 	set_cpu_online(cpunum, true);
 
 	/* Initialise the idle task for this CPU */
-	atomic_inc(&init_mm.mm_count);
+	mmgrab(&init_mm);
 	current->active_mm = &init_mm;
 	BUG_ON(current->mm);
 	enter_lazy_tlb(&init_mm, current);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 893bd7f79be6..573fb3a461b5 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -707,7 +707,7 @@ void start_secondary(void *unused)
 	unsigned int cpu = smp_processor_id();
 	int i, base;
 
-	atomic_inc(&init_mm.mm_count);
+	mmgrab(&init_mm);
 	current->active_mm = &init_mm;
 
 	smp_store_cpu_info(cpu);
diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c
index 21004aaac69b..bc2b60dcb178 100644
--- a/arch/s390/kernel/processor.c
+++ b/arch/s390/kernel/processor.c
@@ -73,7 +73,7 @@ void cpu_init(void)
 	get_cpu_id(id);
 	if (machine_has_cpu_mhz)
 		update_cpu_mhz(NULL);
-	atomic_inc(&init_mm.mm_count);
+	mmgrab(&init_mm);
 	current->active_mm = &init_mm;
 	BUG_ON(current->mm);
 	enter_lazy_tlb(&init_mm, current);
diff --git a/arch/score/kernel/traps.c b/arch/score/kernel/traps.c
index 2b22bcf02c27..569ac02f68df 100644
--- a/arch/score/kernel/traps.c
+++ b/arch/score/kernel/traps.c
@@ -336,7 +336,7 @@ void __init trap_init(void)
 	set_except_vector(18, handle_dbe);
 	flush_icache_range(DEBUG_VECTOR_BASE_ADDR, IRQ_VECTOR_BASE_ADDR);
 
-	atomic_inc(&init_mm.mm_count);
+	mmgrab(&init_mm);
 	current->active_mm = &init_mm;
 	cpu_cache_init();
 }
diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c
index 38e7860845db..ee379c699c08 100644
--- a/arch/sh/kernel/smp.c
+++ b/arch/sh/kernel/smp.c
@@ -178,7 +178,7 @@ asmlinkage void start_secondary(void)
 	struct mm_struct *mm = &init_mm;
 
 	enable_mmu();
-	atomic_inc(&mm->mm_count);
+	mmgrab(mm);
 	atomic_inc(&mm->mm_users);
 	current->active_mm = mm;
 #ifdef CONFIG_MMU
diff --git a/arch/sparc/kernel/leon_smp.c b/arch/sparc/kernel/leon_smp.c
index 71e16f2241c2..b99d33797e1d 100644
--- a/arch/sparc/kernel/leon_smp.c
+++ b/arch/sparc/kernel/leon_smp.c
@@ -93,7 +93,7 @@ void leon_cpu_pre_online(void *arg)
 			     : "memory" /* paranoid */);
 
 	/* Attach to the address space of init_task. */
-	atomic_inc(&init_mm.mm_count);
+	mmgrab(&init_mm);
 	current->active_mm = &init_mm;
 
 	while (!cpumask_test_cpu(cpuid, &smp_commenced_mask))
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index 90a02cb64e20..8e3e13924594 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -122,7 +122,7 @@ void smp_callin(void)
 	current_thread_info()->new_child = 0;
 
 	/* Attach to the address space of init_task. */
-	atomic_inc(&init_mm.mm_count);
+	mmgrab(&init_mm);
 	current->active_mm = &init_mm;
 
 	/* inform the notifiers about the new cpu */
diff --git a/arch/sparc/kernel/sun4d_smp.c b/arch/sparc/kernel/sun4d_smp.c
index 9d98e5002a09..7b55c50eabe5 100644
--- a/arch/sparc/kernel/sun4d_smp.c
+++ b/arch/sparc/kernel/sun4d_smp.c
@@ -93,7 +93,7 @@ void sun4d_cpu_pre_online(void *arg)
 	show_leds(cpuid);
 
 	/* Attach to the address space of init_task. */
-	atomic_inc(&init_mm.mm_count);
+	mmgrab(&init_mm);
 	current->active_mm = &init_mm;
 
 	local_ops->cache_all();
diff --git a/arch/sparc/kernel/sun4m_smp.c b/arch/sparc/kernel/sun4m_smp.c
index 278c40abce82..633c4cf6fdb0 100644
--- a/arch/sparc/kernel/sun4m_smp.c
+++ b/arch/sparc/kernel/sun4m_smp.c
@@ -59,7 +59,7 @@ void sun4m_cpu_pre_online(void *arg)
 			     : "memory" /* paranoid */);
 
 	/* Attach to the address space of init_task. */
-	atomic_inc(&init_mm.mm_count);
+	mmgrab(&init_mm);
 	current->active_mm = &init_mm;
 
 	while (!cpumask_test_cpu(cpuid, &smp_commenced_mask))
diff --git a/arch/sparc/kernel/traps_32.c b/arch/sparc/kernel/traps_32.c
index 4f21df7d4f13..ecddac5a4c96 100644
--- a/arch/sparc/kernel/traps_32.c
+++ b/arch/sparc/kernel/traps_32.c
@@ -448,7 +448,7 @@ void trap_init(void)
 		thread_info_offsets_are_bolixed_pete();
 
 	/* Attach to the address space of init_task. */
-	atomic_inc(&init_mm.mm_count);
+	mmgrab(&init_mm);
 	current->active_mm = &init_mm;
 
 	/* NOTE: Other cpus have this done as they are started
diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
index dfc97a47c9a0..e022d7b00390 100644
--- a/arch/sparc/kernel/traps_64.c
+++ b/arch/sparc/kernel/traps_64.c
@@ -2837,6 +2837,6 @@ void __init trap_init(void)
 	/* Attach to the address space of init_task.  On SMP we
 	 * do this in smp.c:smp_callin for other cpus.
 	 */
-	atomic_inc(&init_mm.mm_count);
+	mmgrab(&init_mm);
 	current->active_mm = &init_mm;
 }
diff --git a/arch/tile/kernel/smpboot.c b/arch/tile/kernel/smpboot.c
index 6c0abaacec33..53ce940a5016 100644
--- a/arch/tile/kernel/smpboot.c
+++ b/arch/tile/kernel/smpboot.c
@@ -160,7 +160,7 @@ static void start_secondary(void)
 	__this_cpu_write(current_asid, min_asid);
 
 	/* Set up this thread as another owner of the init_mm */
-	atomic_inc(&init_mm.mm_count);
+	mmgrab(&init_mm);
 	current->active_mm = &init_mm;
 	if (current->mm)
 		BUG();
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f07005e6f461..c64ca5929cb5 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1510,7 +1510,7 @@ void cpu_init(void)
 	for (i = 0; i <= IO_BITMAP_LONGS; i++)
 		t->io_bitmap[i] = ~0UL;
 
-	atomic_inc(&init_mm.mm_count);
+	mmgrab(&init_mm);
 	me->active_mm = &init_mm;
 	BUG_ON(me->mm);
 	enter_lazy_tlb(&init_mm, me);
@@ -1561,7 +1561,7 @@ void cpu_init(void)
 	/*
 	 * Set up and load the per-CPU TSS and LDT
 	 */
-	atomic_inc(&init_mm.mm_count);
+	mmgrab(&init_mm);
 	curr->active_mm = &init_mm;
 	BUG_ON(curr->mm);
 	enter_lazy_tlb(&init_mm, curr);
diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c
index fc4ad21a5ed4..9bf5cea3bae4 100644
--- a/arch/xtensa/kernel/smp.c
+++ b/arch/xtensa/kernel/smp.c
@@ -136,7 +136,7 @@ void secondary_start_kernel(void)
 	/* All kernel threads share the same mm context. */
 
 	atomic_inc(&mm->mm_users);
-	atomic_inc(&mm->mm_count);
+	mmgrab(mm);
 	current->active_mm = mm;
 	cpumask_set_cpu(cpu, mm_cpumask(mm));
 	enter_lazy_tlb(mm, current);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index ef7c8de7060e..ca5f2aa7232d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -262,7 +262,7 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
 	 * and because the mmu_notifier_unregister function also drop
 	 * mm_count we need to take an extra count here.
 	 */
-	atomic_inc(&p->mm->mm_count);
+	mmgrab(p->mm);
 	mmu_notifier_unregister_no_release(&p->mmu_notifier, p->mm);
 	mmu_notifier_call_srcu(&p->rcu, &kfd_process_destroy_delayed);
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 6a8fa085b74e..65802d93fdc1 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -334,7 +334,7 @@ i915_gem_userptr_init__mm_struct(struct drm_i915_gem_object *obj)
 		mm->i915 = to_i915(obj->base.dev);
 
 		mm->mm = current->mm;
-		atomic_inc(&current->mm->mm_count);
+		mmgrab(current->mm);
 
 		mm->mn = NULL;
 
diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
index f46033984d07..3b19c16a9e45 100644
--- a/drivers/infiniband/hw/hfi1/file_ops.c
+++ b/drivers/infiniband/hw/hfi1/file_ops.c
@@ -185,7 +185,7 @@ static int hfi1_file_open(struct inode *inode, struct file *fp)
 	if (fd) {
 		fd->rec_cpu_num = -1; /* no cpu affinity by default */
 		fd->mm = current->mm;
-		atomic_inc(&fd->mm->mm_count);
+		mmgrab(fd->mm);
 		fp->private_data = fd;
 	} else {
 		fp->private_data = NULL;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b8f06273353e..5d51a188871b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -766,7 +766,7 @@ struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
 
 		if (!IS_ERR_OR_NULL(mm)) {
 			/* ensure this mm_struct can't be freed */
-			atomic_inc(&mm->mm_count);
+			mmgrab(mm);
 			/* but do not pin its memory */
 			mmput(mm);
 		}
@@ -1064,7 +1064,7 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
 		if (p) {
 			if (atomic_read(&p->mm->mm_users) > 1) {
 				mm = p->mm;
-				atomic_inc(&mm->mm_count);
+				mmgrab(mm);
 			}
 			task_unlock(p);
 		}
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index e6e0a619cb3a..3c421d06a18e 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1847,7 +1847,7 @@ static struct file *userfaultfd_file_create(int flags)
 	ctx->released = false;
 	ctx->mm = current->mm;
 	/* prevent the mm struct to be freed */
-	atomic_inc(&ctx->mm->mm_count);
+	mmgrab(ctx->mm);
 
 	file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
 				  O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 451e241f32c5..7cfa5546c840 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2904,6 +2904,28 @@ static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig)
  */
 extern struct mm_struct * mm_alloc(void);
 
+/**
+ * mmgrab() - Pin a &struct mm_struct.
+ * @mm: The &struct mm_struct to pin.
+ *
+ * Make sure that @mm will not get freed even after the owning task
+ * exits. This doesn't guarantee that the associated address space
+ * will still exist later on and mmget_not_zero() has to be used before
+ * accessing it.
+ *
+ * This is a preferred way to to pin @mm for a longer/unbounded amount
+ * of time.
+ *
+ * Use mmdrop() to release the reference acquired by mmgrab().
+ *
+ * See also <Documentation/vm/active_mm.txt> for an in-depth explanation
+ * of &mm_struct.mm_count vs &mm_struct.mm_users.
+ */
+static inline void mmgrab(struct mm_struct *mm)
+{
+	atomic_inc(&mm->mm_count);
+}
+
 /* mmdrop drops the mm and the page tables */
 extern void __mmdrop(struct mm_struct *);
 static inline void mmdrop(struct mm_struct *mm)
diff --git a/kernel/exit.c b/kernel/exit.c
index 90b09ca35c84..8a768a3672a5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -539,7 +539,7 @@ static void exit_mm(void)
 		__set_current_state(TASK_RUNNING);
 		down_read(&mm->mmap_sem);
 	}
-	atomic_inc(&mm->mm_count);
+	mmgrab(mm);
 	BUG_ON(mm != current->active_mm);
 	/* more a memory barrier than a real lock */
 	task_lock(current);
diff --git a/kernel/futex.c b/kernel/futex.c
index cdf365036141..b687cb22301c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -338,7 +338,7 @@ static inline bool should_fail_futex(bool fshared)
 
 static inline void futex_get_mm(union futex_key *key)
 {
-	atomic_inc(&key->private.mm->mm_count);
+	mmgrab(key->private.mm);
 	/*
 	 * Ensure futex_get_mm() implies a full barrier such that
 	 * get_futex_key() implies a full barrier. This is relied upon
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e1ae6ac15eac..6ea1925ac5c0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2847,7 +2847,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 
 	if (!mm) {
 		next->active_mm = oldmm;
-		atomic_inc(&oldmm->mm_count);
+		mmgrab(oldmm);
 		enter_lazy_tlb(oldmm, next);
 	} else
 		switch_mm_irqs_off(oldmm, mm, next);
@@ -6098,7 +6098,7 @@ void __init sched_init(void)
 	/*
 	 * The boot idle thread does lazy MMU switching as well:
 	 */
-	atomic_inc(&init_mm.mm_count);
+	mmgrab(&init_mm);
 	enter_lazy_tlb(&init_mm, current);
 
 	/*
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 77ae3239c3de..34bce5c308e3 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -420,7 +420,7 @@ int __khugepaged_enter(struct mm_struct *mm)
 	list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
 	spin_unlock(&khugepaged_mm_lock);
 
-	atomic_inc(&mm->mm_count);
+	mmgrab(mm);
 	if (wakeup)
 		wake_up_interruptible(&khugepaged_wait);
 
diff --git a/mm/ksm.c b/mm/ksm.c
index cf211c01ceac..520e4c37fec7 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1854,7 +1854,7 @@ int __ksm_enter(struct mm_struct *mm)
 	spin_unlock(&ksm_mmlist_lock);
 
 	set_bit(MMF_VM_MERGEABLE, &mm->flags);
-	atomic_inc(&mm->mm_count);
+	mmgrab(mm);
 
 	if (needs_wakeup)
 		wake_up_interruptible(&ksm_thread_wait);
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index 6f4d27c5bb32..daf67bb02b4a 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -25,7 +25,7 @@ void use_mm(struct mm_struct *mm)
 	task_lock(tsk);
 	active_mm = tsk->active_mm;
 	if (active_mm != mm) {
-		atomic_inc(&mm->mm_count);
+		mmgrab(mm);
 		tsk->active_mm = mm;
 	}
 	tsk->mm = mm;
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index f4259e496f83..32bc9f2ff7eb 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -275,7 +275,7 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
 		mm->mmu_notifier_mm = mmu_notifier_mm;
 		mmu_notifier_mm = NULL;
 	}
-	atomic_inc(&mm->mm_count);
+	mmgrab(mm);
 
 	/*
 	 * Serialize the update against mmu_notifier_unregister. A
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 578321f1c070..51c091849dcb 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -653,7 +653,7 @@ static void mark_oom_victim(struct task_struct *tsk)
 
 	/* oom_mm is bound to the signal struct life time. */
 	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
-		atomic_inc(&tsk->signal->oom_mm->mm_count);
+		mmgrab(tsk->signal->oom_mm);
 
 	/*
 	 * Make sure that the task is woken up from uninterruptible sleep
@@ -870,7 +870,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 
 	/* Get a reference to safely compare mm after task_unlock(victim) */
 	mm = victim->mm;
-	atomic_inc(&mm->mm_count);
+	mmgrab(mm);
 	/*
 	 * We should send SIGKILL before setting TIF_MEMDIE in order to prevent
 	 * the OOM victim from depleting the memory reserves from the user
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5b0dd4a9b2cb..35f71409d9ee 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -611,7 +611,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
 		return ERR_PTR(-ENOMEM);
 
 	spin_lock_init(&kvm->mmu_lock);
-	atomic_inc(&current->mm->mm_count);
+	mmgrab(current->mm);
 	kvm->mm = current->mm;
 	kvm_eventfd_init(kvm);
 	mutex_init(&kvm->lock);
-- 
cgit v1.2.3


From 3fce371bfac2be0396ffc1e763600e6c6b1bb52a Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@oracle.com>
Date: Mon, 27 Feb 2017 14:30:10 -0800
Subject: mm: add new mmget() helper

Apart from adding the helper function itself, the rest of the kernel is
converted mechanically using:

  git grep -l 'atomic_inc.*mm_users' | xargs sed -i 's/atomic_inc(&\(.*\)->mm_users);/mmget\(\1\);/'
  git grep -l 'atomic_inc.*mm_users' | xargs sed -i 's/atomic_inc(&\(.*\)\.mm_users);/mmget\(\&\1\);/'

This is needed for a later patch that hooks into the helper, but might
be a worthwhile cleanup on its own.

(Michal Hocko provided most of the kerneldoc comment.)

Link: http://lkml.kernel.org/r/20161218123229.22952-2-vegard.nossum@oracle.com
Signed-off-by: Vegard Nossum <vegard.nossum@oracle.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arc/kernel/smp.c           |  2 +-
 arch/blackfin/mach-common/smp.c |  2 +-
 arch/frv/mm/mmu-context.c       |  2 +-
 arch/metag/kernel/smp.c         |  2 +-
 arch/sh/kernel/smp.c            |  2 +-
 arch/xtensa/kernel/smp.c        |  2 +-
 include/linux/sched.h           | 21 +++++++++++++++++++++
 kernel/fork.c                   |  4 ++--
 mm/swapfile.c                   | 10 +++++-----
 virt/kvm/async_pf.c             |  2 +-
 10 files changed, 35 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c
index 695624181682..b8e8d3944481 100644
--- a/arch/arc/kernel/smp.c
+++ b/arch/arc/kernel/smp.c
@@ -139,7 +139,7 @@ void start_kernel_secondary(void)
 	/* MMU, Caches, Vector Table, Interrupts etc */
 	setup_processor();
 
-	atomic_inc(&mm->mm_users);
+	mmget(mm);
 	mmgrab(mm);
 	current->active_mm = mm;
 	cpumask_set_cpu(cpu, mm_cpumask(mm));
diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c
index bc5617ef7128..a2e6db2ce811 100644
--- a/arch/blackfin/mach-common/smp.c
+++ b/arch/blackfin/mach-common/smp.c
@@ -307,7 +307,7 @@ void secondary_start_kernel(void)
 	local_irq_disable();
 
 	/* Attach the new idle task to the global mm. */
-	atomic_inc(&mm->mm_users);
+	mmget(mm);
 	mmgrab(mm);
 	current->active_mm = mm;
 
diff --git a/arch/frv/mm/mmu-context.c b/arch/frv/mm/mmu-context.c
index 81757d55a5b5..3473bde77f56 100644
--- a/arch/frv/mm/mmu-context.c
+++ b/arch/frv/mm/mmu-context.c
@@ -188,7 +188,7 @@ int cxn_pin_by_pid(pid_t pid)
 		task_lock(tsk);
 		if (tsk->mm) {
 			mm = tsk->mm;
-			atomic_inc(&mm->mm_users);
+			mmget(mm);
 			ret = 0;
 		}
 		task_unlock(tsk);
diff --git a/arch/metag/kernel/smp.c b/arch/metag/kernel/smp.c
index af9cff547a19..c622293254e4 100644
--- a/arch/metag/kernel/smp.c
+++ b/arch/metag/kernel/smp.c
@@ -344,7 +344,7 @@ asmlinkage void secondary_start_kernel(void)
 	 * All kernel threads share the same mm context; grab a
 	 * reference and switch to it.
 	 */
-	atomic_inc(&mm->mm_users);
+	mmget(mm);
 	mmgrab(mm);
 	current->active_mm = mm;
 	cpumask_set_cpu(cpu, mm_cpumask(mm));
diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c
index ee379c699c08..edc4769b047e 100644
--- a/arch/sh/kernel/smp.c
+++ b/arch/sh/kernel/smp.c
@@ -179,7 +179,7 @@ asmlinkage void start_secondary(void)
 
 	enable_mmu();
 	mmgrab(mm);
-	atomic_inc(&mm->mm_users);
+	mmget(mm);
 	current->active_mm = mm;
 #ifdef CONFIG_MMU
 	enter_lazy_tlb(mm, current);
diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c
index 9bf5cea3bae4..fcea72019df7 100644
--- a/arch/xtensa/kernel/smp.c
+++ b/arch/xtensa/kernel/smp.c
@@ -135,7 +135,7 @@ void secondary_start_kernel(void)
 
 	/* All kernel threads share the same mm context. */
 
-	atomic_inc(&mm->mm_users);
+	mmget(mm);
 	mmgrab(mm);
 	current->active_mm = mm;
 	cpumask_set_cpu(cpu, mm_cpumask(mm));
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7cfa5546c840..4a28deb5f210 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2948,6 +2948,27 @@ static inline void mmdrop_async(struct mm_struct *mm)
 	}
 }
 
+/**
+ * mmget() - Pin the address space associated with a &struct mm_struct.
+ * @mm: The address space to pin.
+ *
+ * Make sure that the address space of the given &struct mm_struct doesn't
+ * go away. This does not protect against parts of the address space being
+ * modified or freed, however.
+ *
+ * Never use this function to pin this address space for an
+ * unbounded/indefinite amount of time.
+ *
+ * Use mmput() to release the reference acquired by mmget().
+ *
+ * See also <Documentation/vm/active_mm.txt> for an in-depth explanation
+ * of &mm_struct.mm_count vs &mm_struct.mm_users.
+ */
+static inline void mmget(struct mm_struct *mm)
+{
+	atomic_inc(&mm->mm_users);
+}
+
 static inline bool mmget_not_zero(struct mm_struct *mm)
 {
 	return atomic_inc_not_zero(&mm->mm_users);
diff --git a/kernel/fork.c b/kernel/fork.c
index 348fe73155bc..246bf9aaf9df 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1000,7 +1000,7 @@ struct mm_struct *get_task_mm(struct task_struct *task)
 		if (task->flags & PF_KTHREAD)
 			mm = NULL;
 		else
-			atomic_inc(&mm->mm_users);
+			mmget(mm);
 	}
 	task_unlock(task);
 	return mm;
@@ -1188,7 +1188,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
 	vmacache_flush(tsk);
 
 	if (clone_flags & CLONE_VM) {
-		atomic_inc(&oldmm->mm_users);
+		mmget(oldmm);
 		mm = oldmm;
 		goto good_mm;
 	}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2cac12cc9abe..7a0713b76668 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1671,7 +1671,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
 	 * that.
 	 */
 	start_mm = &init_mm;
-	atomic_inc(&init_mm.mm_users);
+	mmget(&init_mm);
 
 	/*
 	 * Keep on scanning until all entries have gone.  Usually,
@@ -1720,7 +1720,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
 		if (atomic_read(&start_mm->mm_users) == 1) {
 			mmput(start_mm);
 			start_mm = &init_mm;
-			atomic_inc(&init_mm.mm_users);
+			mmget(&init_mm);
 		}
 
 		/*
@@ -1757,8 +1757,8 @@ int try_to_unuse(unsigned int type, bool frontswap,
 			struct mm_struct *prev_mm = start_mm;
 			struct mm_struct *mm;
 
-			atomic_inc(&new_start_mm->mm_users);
-			atomic_inc(&prev_mm->mm_users);
+			mmget(new_start_mm);
+			mmget(prev_mm);
 			spin_lock(&mmlist_lock);
 			while (swap_count(*swap_map) && !retval &&
 					(p = p->next) != &start_mm->mmlist) {
@@ -1781,7 +1781,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
 
 				if (set_start_mm && *swap_map < swcount) {
 					mmput(new_start_mm);
-					atomic_inc(&mm->mm_users);
+					mmget(mm);
 					new_start_mm = mm;
 					set_start_mm = 0;
 				}
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 3815e940fbea..2366177172f6 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -204,7 +204,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
 	work->addr = hva;
 	work->arch = *arch;
 	work->mm = current->mm;
-	atomic_inc(&work->mm->mm_users);
+	mmget(work->mm);
 	kvm_get_kvm(work->vcpu->kvm);
 
 	/* this can't really happen otherwise gfn_to_pfn_async
-- 
cgit v1.2.3


From b279ddc3382426f8e05068de3488a2993f68dc26 Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@oracle.com>
Date: Mon, 27 Feb 2017 14:30:16 -0800
Subject: mm: clarify mm_struct.mm_{users,count} documentation

Clarify documentation relating to mm_users and mm_count, and switch to
kernel-doc syntax.

Link: http://lkml.kernel.org/r/20161218123229.22952-4-vegard.nossum@oracle.com
Signed-off-by: Vegard Nossum <vegard.nossum@oracle.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_types.h | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 808751d7b737..4f6d440ad785 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -407,8 +407,27 @@ struct mm_struct {
 	unsigned long task_size;		/* size of task vm space */
 	unsigned long highest_vm_end;		/* highest vma end address */
 	pgd_t * pgd;
-	atomic_t mm_users;			/* How many users with user space? */
-	atomic_t mm_count;			/* How many references to "struct mm_struct" (users count as 1) */
+
+	/**
+	 * @mm_users: The number of users including userspace.
+	 *
+	 * Use mmget()/mmget_not_zero()/mmput() to modify. When this drops
+	 * to 0 (i.e. when the task exits and there are no other temporary
+	 * reference holders), we also release a reference on @mm_count
+	 * (which may then free the &struct mm_struct if @mm_count also
+	 * drops to 0).
+	 */
+	atomic_t mm_users;
+
+	/**
+	 * @mm_count: The number of references to &struct mm_struct
+	 * (@mm_users count as 1).
+	 *
+	 * Use mmgrab()/mmdrop() to modify. When this drops to 0, the
+	 * &struct mm_struct is freed.
+	 */
+	atomic_t mm_count;
+
 	atomic_long_t nr_ptes;			/* PTE page table pages */
 #if CONFIG_PGTABLE_LEVELS > 2
 	atomic_long_t nr_pmds;			/* PMD page table pages */
-- 
cgit v1.2.3


From 2959a5f726f6510d6dd7c958f8877e08d0cf589c Mon Sep 17 00:00:00 2001
From: Jinbum Park <jinb.park7@gmail.com>
Date: Mon, 27 Feb 2017 14:30:22 -0800
Subject: mm: add arch-independent testcases for RODATA

This patch makes arch-independent testcases for RODATA.  Both x86 and
x86_64 already have testcases for RODATA, But they are arch-specific
because using inline assembly directly.

And cacheflush.h is not a suitable location for rodata-test related
things.  Since they were in cacheflush.h, If someone change the state of
CONFIG_DEBUG_RODATA_TEST, It cause overhead of kernel build.

To solve the above issues, write arch-independent testcases and move it
to shared location.

[jinb.park7@gmail.com: fix config dependency]
  Link: http://lkml.kernel.org/r/20170209131625.GA16954@pjb1027-Latitude-E5410
Link: http://lkml.kernel.org/r/20170129105436.GA9303@pjb1027-Latitude-E5410
Signed-off-by: Jinbum Park <jinb.park7@gmail.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Laura Abbott <labbott@redhat.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Valentin Rothberg <valentinrothberg@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig.debug            |  8 -----
 arch/x86/include/asm/cacheflush.h | 10 ------
 arch/x86/kernel/Makefile          |  1 -
 arch/x86/kernel/test_rodata.c     | 75 ---------------------------------------
 arch/x86/mm/init_32.c             |  4 ---
 arch/x86/mm/init_64.c             |  5 ---
 include/linux/rodata_test.h       | 23 ++++++++++++
 init/main.c                       |  6 ++--
 mm/Kconfig.debug                  |  6 ++++
 mm/Makefile                       |  1 +
 mm/rodata_test.c                  | 56 +++++++++++++++++++++++++++++
 11 files changed, 90 insertions(+), 105 deletions(-)
 delete mode 100644 arch/x86/kernel/test_rodata.c
 create mode 100644 include/linux/rodata_test.h
 create mode 100644 mm/rodata_test.c

(limited to 'include/linux')

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index c4cba00dbdee..63c1d13aaf9f 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -74,14 +74,6 @@ config EFI_PGT_DUMP
 	  issues with the mapping of the EFI runtime regions into that
 	  table.
 
-config DEBUG_RODATA_TEST
-	bool "Testcase for the marking rodata read-only"
-	default y
-	---help---
-	  This option enables a testcase for the setting rodata read-only
-	  as well as for the change_page_attr() infrastructure.
-	  If in doubt, say "N"
-
 config DEBUG_WX
 	bool "Warn on W+X mappings at boot"
 	select X86_PTDUMP_CORE
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 872877d930de..e7e1942edff7 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -90,18 +90,8 @@ void clflush_cache_range(void *addr, unsigned int size);
 
 #define mmio_flush_range(addr, size) clflush_cache_range(addr, size)
 
-extern const int rodata_test_data;
 extern int kernel_set_to_readonly;
 void set_kernel_text_rw(void);
 void set_kernel_text_ro(void);
 
-#ifdef CONFIG_DEBUG_RODATA_TEST
-int rodata_test(void);
-#else
-static inline int rodata_test(void)
-{
-	return 0;
-}
-#endif
-
 #endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index bdcdb3b3a219..84c00592d359 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -100,7 +100,6 @@ obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 obj-$(CONFIG_APB_TIMER)		+= apb_timer.o
 
 obj-$(CONFIG_AMD_NB)		+= amd_nb.o
-obj-$(CONFIG_DEBUG_RODATA_TEST)	+= test_rodata.o
 obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
 
 obj-$(CONFIG_KVM_GUEST)		+= kvm.o kvmclock.o
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
deleted file mode 100644
index 222e84e2432e..000000000000
--- a/arch/x86/kernel/test_rodata.c
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * test_rodata.c: functional test for mark_rodata_ro function
- *
- * (C) Copyright 2008 Intel Corporation
- * Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-#include <asm/cacheflush.h>
-#include <asm/sections.h>
-#include <asm/asm.h>
-
-int rodata_test(void)
-{
-	unsigned long result;
-	unsigned long start, end;
-
-	/* test 1: read the value */
-	/* If this test fails, some previous testrun has clobbered the state */
-	if (!rodata_test_data) {
-		printk(KERN_ERR "rodata_test: test 1 fails (start data)\n");
-		return -ENODEV;
-	}
-
-	/* test 2: write to the variable; this should fault */
-	/*
-	 * If this test fails, we managed to overwrite the data
-	 *
-	 * This is written in assembly to be able to catch the
-	 * exception that is supposed to happen in the correct
-	 * case
-	 */
-
-	result = 1;
-	asm volatile(
-		"0:	mov %[zero],(%[rodata_test])\n"
-		"	mov %[zero], %[rslt]\n"
-		"1:\n"
-		".section .fixup,\"ax\"\n"
-		"2:	jmp 1b\n"
-		".previous\n"
-		_ASM_EXTABLE(0b,2b)
-		: [rslt] "=r" (result)
-		: [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL)
-	);
-
-
-	if (!result) {
-		printk(KERN_ERR "rodata_test: test data was not read only\n");
-		return -ENODEV;
-	}
-
-	/* test 3: check the value hasn't changed */
-	/* If this test fails, we managed to overwrite the data */
-	if (!rodata_test_data) {
-		printk(KERN_ERR "rodata_test: Test 3 fails (end data)\n");
-		return -ENODEV;
-	}
-	/* test 4: check if the rodata section is 4Kb aligned */
-	start = (unsigned long)__start_rodata;
-	end = (unsigned long)__end_rodata;
-	if (start & (PAGE_SIZE - 1)) {
-		printk(KERN_ERR "rodata_test: .rodata is not 4k aligned\n");
-		return -ENODEV;
-	}
-	if (end & (PAGE_SIZE - 1)) {
-		printk(KERN_ERR "rodata_test: .rodata end is not 4k aligned\n");
-		return -ENODEV;
-	}
-
-	return 0;
-}
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 928d657de829..2b4b53e6793f 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -864,9 +864,6 @@ static noinline int do_test_wp_bit(void)
 	return flag;
 }
 
-const int rodata_test_data = 0xC3;
-EXPORT_SYMBOL_GPL(rodata_test_data);
-
 int kernel_set_to_readonly __read_mostly;
 
 void set_kernel_text_rw(void)
@@ -939,7 +936,6 @@ void mark_rodata_ro(void)
 	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
 	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
 		size >> 10);
-	rodata_test();
 
 #ifdef CONFIG_CPA_DEBUG
 	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 97346f987ef2..15173d37f399 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1000,9 +1000,6 @@ void __init mem_init(void)
 	mem_init_print_info(NULL);
 }
 
-const int rodata_test_data = 0xC3;
-EXPORT_SYMBOL_GPL(rodata_test_data);
-
 int kernel_set_to_readonly;
 
 void set_kernel_text_rw(void)
@@ -1071,8 +1068,6 @@ void mark_rodata_ro(void)
 	all_end = roundup((unsigned long)_brk_end, PMD_SIZE);
 	set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT);
 
-	rodata_test();
-
 #ifdef CONFIG_CPA_DEBUG
 	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
 	set_memory_rw(start, (end-start) >> PAGE_SHIFT);
diff --git a/include/linux/rodata_test.h b/include/linux/rodata_test.h
new file mode 100644
index 000000000000..ea05f6c51413
--- /dev/null
+++ b/include/linux/rodata_test.h
@@ -0,0 +1,23 @@
+/*
+ * rodata_test.h: functional test for mark_rodata_ro function
+ *
+ * (C) Copyright 2008 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#ifndef _RODATA_TEST_H
+#define _RODATA_TEST_H
+
+#ifdef CONFIG_DEBUG_RODATA_TEST
+extern const int rodata_test_data;
+void rodata_test(void);
+#else
+static inline void rodata_test(void) {}
+#endif
+
+#endif /* _RODATA_TEST_H */
diff --git a/init/main.c b/init/main.c
index 6eb10ce472b9..47ea22d181ef 100644
--- a/init/main.c
+++ b/init/main.c
@@ -82,6 +82,7 @@
 #include <linux/proc_ns.h>
 #include <linux/io.h>
 #include <linux/cache.h>
+#include <linux/rodata_test.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -935,9 +936,10 @@ __setup("rodata=", set_debug_rodata);
 #ifdef CONFIG_STRICT_KERNEL_RWX
 static void mark_readonly(void)
 {
-	if (rodata_enabled)
+	if (rodata_enabled) {
 		mark_rodata_ro();
-	else
+		rodata_test();
+	} else
 		pr_info("Kernel memory protection disabled.\n");
 }
 #else
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index afcc550877ff..79d0fd13b5b3 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -90,3 +90,9 @@ config DEBUG_PAGE_REF
 	  careful when enabling this feature because it adds about 30 KB to the
 	  kernel code.  However the runtime performance overhead is virtually
 	  nil until the tracepoints are actually enabled.
+
+config DEBUG_RODATA_TEST
+    bool "Testcase for the marking rodata read-only"
+    depends on STRICT_KERNEL_RWX
+    ---help---
+      This option enables a testcase for the setting rodata read-only.
diff --git a/mm/Makefile b/mm/Makefile
index aa0aa17cb413..026f6a828a50 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -85,6 +85,7 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
 obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
+obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
 obj-$(CONFIG_PAGE_OWNER) += page_owner.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
 obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
diff --git a/mm/rodata_test.c b/mm/rodata_test.c
new file mode 100644
index 000000000000..0fd21670b513
--- /dev/null
+++ b/mm/rodata_test.c
@@ -0,0 +1,56 @@
+/*
+ * rodata_test.c: functional test for mark_rodata_ro function
+ *
+ * (C) Copyright 2008 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/uaccess.h>
+#include <asm/sections.h>
+
+const int rodata_test_data = 0xC3;
+EXPORT_SYMBOL_GPL(rodata_test_data);
+
+void rodata_test(void)
+{
+	unsigned long start, end;
+	int zero = 0;
+
+	/* test 1: read the value */
+	/* If this test fails, some previous testrun has clobbered the state */
+	if (!rodata_test_data) {
+		pr_err("rodata_test: test 1 fails (start data)\n");
+		return;
+	}
+
+	/* test 2: write to the variable; this should fault */
+	if (!probe_kernel_write((void *)&rodata_test_data,
+						(void *)&zero, sizeof(zero))) {
+		pr_err("rodata_test: test data was not read only\n");
+		return;
+	}
+
+	/* test 3: check the value hasn't changed */
+	if (rodata_test_data == zero) {
+		pr_err("rodata_test: test data was changed\n");
+		return;
+	}
+
+	/* test 4: check if the rodata section is PAGE_SIZE aligned */
+	start = (unsigned long)__start_rodata;
+	end = (unsigned long)__end_rodata;
+	if (start & (PAGE_SIZE - 1)) {
+		pr_err("rodata_test: start of .rodata is not page size aligned\n");
+		return;
+	}
+	if (end & (PAGE_SIZE - 1)) {
+		pr_err("rodata_test: end of .rodata is not page size aligned\n");
+		return;
+	}
+
+	pr_info("rodata_test: all tests were successful\n");
+}
-- 
cgit v1.2.3


From 4e4636cf981b5b629fbfb78aa9f232e015f7d521 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 27 Feb 2017 22:21:16 -0600
Subject: objtool: Enclose contents of unreachable() macro in a block

Guenter Roeck reported a boot failure in mips64.  It was bisected to the
following commit:

  d1091c7fa3d5 ("objtool: Improve detection of BUG() and other dead ends")

The unreachable() macro was formerly only composed of a single
statement.  The above commit added a second statement, but neglected to
enclose the statements in a block.

Suggested-by: Guenter Roeck <linux@roeck-us.net>
Reported-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: d1091c7fa3d5 ("objtool: Improve detection of BUG() and other dead ends")
Link: http://lkml.kernel.org/r/20170228042116.glmwmwiohcix7o4a@treble
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/compiler-gcc.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index de471346b365..f457b520ead6 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -215,7 +215,8 @@
  * this in the preprocessor, but we can live with this because they're
  * unreleased.  Really, we need to have autoconf for the kernel.
  */
-#define unreachable() annotate_unreachable(); __builtin_unreachable()
+#define unreachable() \
+	do { annotate_unreachable(); __builtin_unreachable(); } while (0)
 
 /* Mark a function definition as prohibited from being cloned. */
 #define __noclone	__attribute__((__noclone__, __optimize__("no-tracer")))
-- 
cgit v1.2.3


From 86ef58a4e35e8fa66afb5898cf6dec6a3bb29f67 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 28 Feb 2017 18:32:48 -0800
Subject: nfit, libnvdimm: fix interleave set cookie calculation

The interleave-set cookie is a sum that sanity checks the composition of
an interleave set has not changed from when the namespace was initially
created.  The checksum is calculated by sorting the DIMMs by their
location in the interleave-set. The comparison for the sort must be
64-bit wide, not byte-by-byte as performed by memcmp() in the broken
case.

Fix the implementation to accept correct cookie values in addition to
the Linux "memcmp" order cookies, but only allow correct cookies to be
generated going forward. It does mean that namespaces created by
third-party-tooling, or created by newer kernels with this fix, will not
validate on older kernels. However, there are a couple mitigating
conditions:

    1/ platforms with namespace-label capable NVDIMMs are not widely
       available.

    2/ interleave-sets with a single-dimm are by definition not affected
       (nothing to sort). This covers the QEMU-KVM NVDIMM emulation case.

The cookie stored in the namespace label will be fixed by any write the
namespace label, the most straightforward way to achieve this is to
write to the "alt_name" attribute of a namespace in sysfs.

Cc: <stable@vger.kernel.org>
Fixes: eaf961536e16 ("libnvdimm, nfit: add interleave-set state-tracking infrastructure")
Reported-by: Nicholas Moulin <nicholas.w.moulin@linux.intel.com>
Tested-by: Nicholas Moulin <nicholas.w.moulin@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c        | 16 +++++++++++++++-
 drivers/nvdimm/namespace_devs.c | 18 ++++++++++++++----
 drivers/nvdimm/nd.h             |  1 +
 drivers/nvdimm/region_devs.c    |  9 +++++++++
 include/linux/libnvdimm.h       |  2 ++
 5 files changed, 41 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 7361d00818e2..662036bdc65e 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1603,7 +1603,7 @@ static size_t sizeof_nfit_set_info(int num_mappings)
 		+ num_mappings * sizeof(struct nfit_set_info_map);
 }
 
-static int cmp_map(const void *m0, const void *m1)
+static int cmp_map_compat(const void *m0, const void *m1)
 {
 	const struct nfit_set_info_map *map0 = m0;
 	const struct nfit_set_info_map *map1 = m1;
@@ -1612,6 +1612,14 @@ static int cmp_map(const void *m0, const void *m1)
 			sizeof(u64));
 }
 
+static int cmp_map(const void *m0, const void *m1)
+{
+	const struct nfit_set_info_map *map0 = m0;
+	const struct nfit_set_info_map *map1 = m1;
+
+	return map0->region_offset - map1->region_offset;
+}
+
 /* Retrieve the nth entry referencing this spa */
 static struct acpi_nfit_memory_map *memdev_from_spa(
 		struct acpi_nfit_desc *acpi_desc, u16 range_index, int n)
@@ -1667,6 +1675,12 @@ static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc,
 	sort(&info->mapping[0], nr, sizeof(struct nfit_set_info_map),
 			cmp_map, NULL);
 	nd_set->cookie = nd_fletcher64(info, sizeof_nfit_set_info(nr), 0);
+
+	/* support namespaces created with the wrong sort order */
+	sort(&info->mapping[0], nr, sizeof(struct nfit_set_info_map),
+			cmp_map_compat, NULL);
+	nd_set->altcookie = nd_fletcher64(info, sizeof_nfit_set_info(nr), 0);
+
 	ndr_desc->nd_set = nd_set;
 	devm_kfree(dev, info);
 
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index ce3e8dfa10ad..1b481a5fb966 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -1700,6 +1700,7 @@ static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id)
 struct device *create_namespace_pmem(struct nd_region *nd_region,
 		struct nd_namespace_label *nd_label)
 {
+	u64 altcookie = nd_region_interleave_set_altcookie(nd_region);
 	u64 cookie = nd_region_interleave_set_cookie(nd_region);
 	struct nd_label_ent *label_ent;
 	struct nd_namespace_pmem *nspm;
@@ -1718,7 +1719,11 @@ struct device *create_namespace_pmem(struct nd_region *nd_region,
 	if (__le64_to_cpu(nd_label->isetcookie) != cookie) {
 		dev_dbg(&nd_region->dev, "invalid cookie in label: %pUb\n",
 				nd_label->uuid);
-		return ERR_PTR(-EAGAIN);
+		if (__le64_to_cpu(nd_label->isetcookie) != altcookie)
+			return ERR_PTR(-EAGAIN);
+
+		dev_dbg(&nd_region->dev, "valid altcookie in label: %pUb\n",
+				nd_label->uuid);
 	}
 
 	nspm = kzalloc(sizeof(*nspm), GFP_KERNEL);
@@ -1733,9 +1738,14 @@ struct device *create_namespace_pmem(struct nd_region *nd_region,
 	res->name = dev_name(&nd_region->dev);
 	res->flags = IORESOURCE_MEM;
 
-	for (i = 0; i < nd_region->ndr_mappings; i++)
-		if (!has_uuid_at_pos(nd_region, nd_label->uuid, cookie, i))
-			break;
+	for (i = 0; i < nd_region->ndr_mappings; i++) {
+		if (has_uuid_at_pos(nd_region, nd_label->uuid, cookie, i))
+			continue;
+		if (has_uuid_at_pos(nd_region, nd_label->uuid, altcookie, i))
+			continue;
+		break;
+	}
+
 	if (i < nd_region->ndr_mappings) {
 		struct nvdimm_drvdata *ndd = to_ndd(&nd_region->mapping[i]);
 
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 35dd75057e16..2a99c83aa19f 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -328,6 +328,7 @@ struct nd_region *to_nd_region(struct device *dev);
 int nd_region_to_nstype(struct nd_region *nd_region);
 int nd_region_register_namespaces(struct nd_region *nd_region, int *err);
 u64 nd_region_interleave_set_cookie(struct nd_region *nd_region);
+u64 nd_region_interleave_set_altcookie(struct nd_region *nd_region);
 void nvdimm_bus_lock(struct device *dev);
 void nvdimm_bus_unlock(struct device *dev);
 bool is_nvdimm_bus_locked(struct device *dev);
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 7cd705f3247c..b7cb5066d961 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -505,6 +505,15 @@ u64 nd_region_interleave_set_cookie(struct nd_region *nd_region)
 	return 0;
 }
 
+u64 nd_region_interleave_set_altcookie(struct nd_region *nd_region)
+{
+	struct nd_interleave_set *nd_set = nd_region->nd_set;
+
+	if (nd_set)
+		return nd_set->altcookie;
+	return 0;
+}
+
 void nd_mapping_free_labels(struct nd_mapping *nd_mapping)
 {
 	struct nd_label_ent *label_ent, *e;
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 8458c5351e56..77e7af32543f 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -70,6 +70,8 @@ struct nd_cmd_desc {
 
 struct nd_interleave_set {
 	u64 cookie;
+	/* compatibility with initial buggy Linux implementation */
+	u64 altcookie;
 };
 
 struct nd_mapping_desc {
-- 
cgit v1.2.3


From 55149d06534ae2a7ba5f7a078353deb89b3fe891 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Wed, 1 Mar 2017 00:05:04 -0600
Subject: objtool, compiler.h: Fix __unreachable section relocation size

Linus reported the following commit broke module loading on his laptop:

  d1091c7fa3d5 ("objtool: Improve detection of BUG() and other dead ends")

It showed errors like the following:

  module: overflow in relocation type 10 val ffffffffc02afc81
  module: 'nvme' likely not compiled with -mcmodel=kernel

The problem is that the __unreachable section addresses are stored using
the '.long' asm directive, which isn't big enough for .text section
kernel addresses.  Use relative addresses instead:

  ".long %c0b - .\t\n"

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: d1091c7fa3d5 ("objtool: Improve detection of BUG() and other dead ends")
Link: http://lkml.kernel.org/r/20170301060504.oltm3iws6fmubnom@treble
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/compiler-gcc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 76e28c229805..b6bb9019d87f 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -201,7 +201,7 @@
 #define annotate_unreachable() ({					\
 	asm("%c0:\t\n"							\
 	    ".pushsection __unreachable, \"a\"\t\n"			\
-	    ".long %c0b\t\n"						\
+	    ".long %c0b - .\t\n"					\
 	    ".popsection\t\n" : : "i" (__LINE__));			\
 })
 #else
-- 
cgit v1.2.3


From e3736c3eb3a6f7c0966923b629c9f92b558aa9c7 Mon Sep 17 00:00:00 2001
From: Elena Reshetova <elena.reshetova@intel.com>
Date: Mon, 20 Feb 2017 13:06:21 +0200
Subject: kvm: convert kvm.users_count from atomic_t to refcount_t
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
Signed-off-by: Hans Liljestrand <ishkamiel@gmail.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: David Windsor <dwindsor@gmail.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 include/linux/kvm_host.h | 3 ++-
 virt/kvm/kvm_main.c      | 8 ++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8d69d5150748..2c14ad9809da 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -26,6 +26,7 @@
 #include <linux/context_tracking.h>
 #include <linux/irqbypass.h>
 #include <linux/swait.h>
+#include <linux/refcount.h>
 #include <asm/signal.h>
 
 #include <linux/kvm.h>
@@ -401,7 +402,7 @@ struct kvm {
 #endif
 	struct kvm_vm_stat stat;
 	struct kvm_arch arch;
-	atomic_t users_count;
+	refcount_t users_count;
 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
 	struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
 	spinlock_t ring_lock;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index cc4d6e0dd2a2..c6b7aff634be 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -617,7 +617,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	mutex_init(&kvm->lock);
 	mutex_init(&kvm->irq_lock);
 	mutex_init(&kvm->slots_lock);
-	atomic_set(&kvm->users_count, 1);
+	refcount_set(&kvm->users_count, 1);
 	INIT_LIST_HEAD(&kvm->devices);
 
 	r = kvm_arch_init_vm(kvm, type);
@@ -747,13 +747,13 @@ static void kvm_destroy_vm(struct kvm *kvm)
 
 void kvm_get_kvm(struct kvm *kvm)
 {
-	atomic_inc(&kvm->users_count);
+	refcount_inc(&kvm->users_count);
 }
 EXPORT_SYMBOL_GPL(kvm_get_kvm);
 
 void kvm_put_kvm(struct kvm *kvm)
 {
-	if (atomic_dec_and_test(&kvm->users_count))
+	if (refcount_dec_and_test(&kvm->users_count))
 		kvm_destroy_vm(kvm);
 }
 EXPORT_SYMBOL_GPL(kvm_put_kvm);
@@ -3639,7 +3639,7 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file,
 	 * To avoid the race between open and the removal of the debugfs
 	 * directory we test against the users count.
 	 */
-	if (!atomic_add_unless(&stat_data->kvm->users_count, 1, 0))
+	if (!refcount_inc_not_zero(&stat_data->kvm->users_count))
 		return -ENOENT;
 
 	if (simple_attr_open(inode, file, get, set, fmt)) {
-- 
cgit v1.2.3


From b2d0fe35471d1a71471f99147ffb5986bd60e744 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Tue, 28 Feb 2017 15:02:15 +0300
Subject: net/mlx4: && vs & typo

Bitwise & was obviously intended here.

Fixes: 745d8ae4622c ("net/mlx4: Spoofcheck and zero MAC can't coexist")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx4/driver.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h
index e965e5090d96..a858bcb6220b 100644
--- a/include/linux/mlx4/driver.h
+++ b/include/linux/mlx4/driver.h
@@ -109,7 +109,7 @@ static inline void mlx4_u64_to_mac(u8 *addr, u64 mac)
 	int i;
 
 	for (i = ETH_ALEN; i > 0; i--) {
-		addr[i - 1] = mac && 0xFF;
+		addr[i - 1] = mac & 0xFF;
 		mac >>= 8;
 	}
 }
-- 
cgit v1.2.3


From 39e6c8208d7b6fb9d2047850fb3327db567b564b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 28 Feb 2017 10:34:50 -0800
Subject: net: solve a NAPI race

While playing with mlx4 hardware timestamping of RX packets, I found
that some packets were received by TCP stack with a ~200 ms delay...

Since the timestamp was provided by the NIC, and my probe was added
in tcp_v4_rcv() while in BH handler, I was confident it was not
a sender issue, or a drop in the network.

This would happen with a very low probability, but hurting RPC
workloads.

A NAPI driver normally arms the IRQ after the napi_complete_done(),
after NAPI_STATE_SCHED is cleared, so that the hard irq handler can grab
it.

Problem is that if another point in the stack grabs NAPI_STATE_SCHED bit
while IRQ are not disabled, we might have later an IRQ firing and
finding this bit set, right before napi_complete_done() clears it.

This can happen with busy polling users, or if gro_flush_timeout is
used. But some other uses of napi_schedule() in drivers can cause this
as well.

thread 1                                 thread 2 (could be on same cpu, or not)

// busy polling or napi_watchdog()
napi_schedule();
...
napi->poll()

device polling:
read 2 packets from ring buffer
                                          Additional 3rd packet is
available.
                                          device hard irq

                                          // does nothing because
NAPI_STATE_SCHED bit is owned by thread 1
                                          napi_schedule();

napi_complete_done(napi, 2);
rearm_irq();

Note that rearm_irq() will not force the device to send an additional
IRQ for the packet it already signaled (3rd packet in my example)

This patch adds a new NAPI_STATE_MISSED bit, that napi_schedule_prep()
can set if it could not grab NAPI_STATE_SCHED

Then napi_complete_done() properly reschedules the napi to make sure
we do not miss something.

Since we manipulate multiple bits at once, use cmpxchg() like in
sk_busy_loop() to provide proper transactions.

In v2, I changed napi_watchdog() to use a relaxed variant of
napi_schedule_prep() : No need to set NAPI_STATE_MISSED from this point.

In v3, I added more details in the changelog and clears
NAPI_STATE_MISSED in busy_poll_stop()

In v4, I added the ideas given by Alexander Duyck in v3 review

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 29 ++++++------------
 net/core/dev.c            | 76 ++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 81 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f40f0ab3847a..97456b2539e4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -330,6 +330,7 @@ struct napi_struct {
 
 enum {
 	NAPI_STATE_SCHED,	/* Poll is scheduled */
+	NAPI_STATE_MISSED,	/* reschedule a napi */
 	NAPI_STATE_DISABLE,	/* Disable pending */
 	NAPI_STATE_NPSVC,	/* Netpoll - don't dequeue from poll_list */
 	NAPI_STATE_HASHED,	/* In NAPI hash (busy polling possible) */
@@ -338,12 +339,13 @@ enum {
 };
 
 enum {
-	NAPIF_STATE_SCHED	 = (1UL << NAPI_STATE_SCHED),
-	NAPIF_STATE_DISABLE	 = (1UL << NAPI_STATE_DISABLE),
-	NAPIF_STATE_NPSVC	 = (1UL << NAPI_STATE_NPSVC),
-	NAPIF_STATE_HASHED	 = (1UL << NAPI_STATE_HASHED),
-	NAPIF_STATE_NO_BUSY_POLL = (1UL << NAPI_STATE_NO_BUSY_POLL),
-	NAPIF_STATE_IN_BUSY_POLL = (1UL << NAPI_STATE_IN_BUSY_POLL),
+	NAPIF_STATE_SCHED	 = BIT(NAPI_STATE_SCHED),
+	NAPIF_STATE_MISSED	 = BIT(NAPI_STATE_MISSED),
+	NAPIF_STATE_DISABLE	 = BIT(NAPI_STATE_DISABLE),
+	NAPIF_STATE_NPSVC	 = BIT(NAPI_STATE_NPSVC),
+	NAPIF_STATE_HASHED	 = BIT(NAPI_STATE_HASHED),
+	NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
+	NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
 };
 
 enum gro_result {
@@ -414,20 +416,7 @@ static inline bool napi_disable_pending(struct napi_struct *n)
 	return test_bit(NAPI_STATE_DISABLE, &n->state);
 }
 
-/**
- *	napi_schedule_prep - check if NAPI can be scheduled
- *	@n: NAPI context
- *
- * Test if NAPI routine is already running, and if not mark
- * it as running.  This is used as a condition variable to
- * insure only one NAPI poll instance runs.  We also make
- * sure there is no pending NAPI disable.
- */
-static inline bool napi_schedule_prep(struct napi_struct *n)
-{
-	return !napi_disable_pending(n) &&
-		!test_and_set_bit(NAPI_STATE_SCHED, &n->state);
-}
+bool napi_schedule_prep(struct napi_struct *n);
 
 /**
  *	napi_schedule - schedule NAPI poll
diff --git a/net/core/dev.c b/net/core/dev.c
index 304f2deae5f9..e63bf61b19be 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4883,6 +4883,39 @@ void __napi_schedule(struct napi_struct *n)
 }
 EXPORT_SYMBOL(__napi_schedule);
 
+/**
+ *	napi_schedule_prep - check if napi can be scheduled
+ *	@n: napi context
+ *
+ * Test if NAPI routine is already running, and if not mark
+ * it as running.  This is used as a condition variable
+ * insure only one NAPI poll instance runs.  We also make
+ * sure there is no pending NAPI disable.
+ */
+bool napi_schedule_prep(struct napi_struct *n)
+{
+	unsigned long val, new;
+
+	do {
+		val = READ_ONCE(n->state);
+		if (unlikely(val & NAPIF_STATE_DISABLE))
+			return false;
+		new = val | NAPIF_STATE_SCHED;
+
+		/* Sets STATE_MISSED bit if STATE_SCHED was already set
+		 * This was suggested by Alexander Duyck, as compiler
+		 * emits better code than :
+		 * if (val & NAPIF_STATE_SCHED)
+		 *     new |= NAPIF_STATE_MISSED;
+		 */
+		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
+						   NAPIF_STATE_MISSED;
+	} while (cmpxchg(&n->state, val, new) != val);
+
+	return !(val & NAPIF_STATE_SCHED);
+}
+EXPORT_SYMBOL(napi_schedule_prep);
+
 /**
  * __napi_schedule_irqoff - schedule for receive
  * @n: entry to schedule
@@ -4897,7 +4930,7 @@ EXPORT_SYMBOL(__napi_schedule_irqoff);
 
 bool napi_complete_done(struct napi_struct *n, int work_done)
 {
-	unsigned long flags;
+	unsigned long flags, val, new;
 
 	/*
 	 * 1) Don't let napi dequeue from the cpu poll list
@@ -4927,7 +4960,27 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
 		list_del_init(&n->poll_list);
 		local_irq_restore(flags);
 	}
-	WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
+
+	do {
+		val = READ_ONCE(n->state);
+
+		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
+
+		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
+
+		/* If STATE_MISSED was set, leave STATE_SCHED set,
+		 * because we will call napi->poll() one more time.
+		 * This C code was suggested by Alexander Duyck to help gcc.
+		 */
+		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
+						    NAPIF_STATE_SCHED;
+	} while (cmpxchg(&n->state, val, new) != val);
+
+	if (unlikely(val & NAPIF_STATE_MISSED)) {
+		__napi_schedule(n);
+		return false;
+	}
+
 	return true;
 }
 EXPORT_SYMBOL(napi_complete_done);
@@ -4953,6 +5006,16 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
 {
 	int rc;
 
+	/* Busy polling means there is a high chance device driver hard irq
+	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
+	 * set in napi_schedule_prep().
+	 * Since we are about to call napi->poll() once more, we can safely
+	 * clear NAPI_STATE_MISSED.
+	 *
+	 * Note: x86 could use a single "lock and ..." instruction
+	 * to perform these two clear_bit()
+	 */
+	clear_bit(NAPI_STATE_MISSED, &napi->state);
 	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
 
 	local_bh_disable();
@@ -5088,8 +5151,13 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 	struct napi_struct *napi;
 
 	napi = container_of(timer, struct napi_struct, timer);
-	if (napi->gro_list)
-		napi_schedule_irqoff(napi);
+
+	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
+	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
+	 */
+	if (napi->gro_list && !napi_disable_pending(napi) &&
+	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
+		__napi_schedule_irqoff(napi);
 
 	return HRTIMER_NORESTART;
 }
-- 
cgit v1.2.3


From e390f9a9689a42f477a6073e2e7df530a4c1b740 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Wed, 1 Mar 2017 12:04:44 -0600
Subject: objtool, modules: Discard objtool annotation sections for modules

The '__unreachable' and '__func_stack_frame_non_standard' sections are
only used at compile time.  They're discarded for vmlinux but they
should also be discarded for modules.

Since this is a recurring pattern, prefix the section names with
".discard.".  It's a nice convention and vmlinux.lds.h already discards
such sections.

Also remove the 'a' (allocatable) flag from the __unreachable section
since it doesn't make sense for a discarded section.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Jessica Yu <jeyu@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: d1091c7fa3d5 ("objtool: Improve detection of BUG() and other dead ends")
Link: http://lkml.kernel.org/r/20170301180444.lhd53c5tibc4ns77@treble
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/vmlinux.lds.S | 2 --
 include/linux/compiler-gcc.h  | 2 +-
 include/linux/frame.h         | 2 +-
 scripts/mod/modpost.c         | 1 +
 scripts/module-common.lds     | 5 ++++-
 tools/objtool/builtin-check.c | 6 +++---
 6 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index ad0118fbce90..c74ae9ce8dc4 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -345,8 +345,6 @@ SECTIONS
 	DISCARDS
 	/DISCARD/ : {
 		*(.eh_frame)
-		*(__func_stack_frame_non_standard)
-		*(__unreachable)
 	}
 }
 
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index b6bb9019d87f..0efef9cf014f 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -200,7 +200,7 @@
 #ifdef CONFIG_STACK_VALIDATION
 #define annotate_unreachable() ({					\
 	asm("%c0:\t\n"							\
-	    ".pushsection __unreachable, \"a\"\t\n"			\
+	    ".pushsection .discard.unreachable\t\n"			\
 	    ".long %c0b - .\t\n"					\
 	    ".popsection\t\n" : : "i" (__LINE__));			\
 })
diff --git a/include/linux/frame.h b/include/linux/frame.h
index e6baaba3f1ae..d772c61c31da 100644
--- a/include/linux/frame.h
+++ b/include/linux/frame.h
@@ -11,7 +11,7 @@
  * For more information, see tools/objtool/Documentation/stack-validation.txt.
  */
 #define STACK_FRAME_NON_STANDARD(func) \
-	static void __used __section(__func_stack_frame_non_standard) \
+	static void __used __section(.discard.func_stack_frame_non_standard) \
 		*__func_stack_frame_non_standard_##func = func
 
 #else /* !CONFIG_STACK_VALIDATION */
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 4dedd0d3d3a7..30d752a4a6a6 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -854,6 +854,7 @@ static const char *const section_white_list[] =
 	".cmem*",			/* EZchip */
 	".fmt_slot*",			/* EZchip */
 	".gnu.lto*",
+	".discard.*",
 	NULL
 };
 
diff --git a/scripts/module-common.lds b/scripts/module-common.lds
index 73a2c7da0e55..cf7e52e4781b 100644
--- a/scripts/module-common.lds
+++ b/scripts/module-common.lds
@@ -4,7 +4,10 @@
  * combine them automatically.
  */
 SECTIONS {
-	/DISCARD/ : { *(.discard) }
+	/DISCARD/ : {
+		*(.discard)
+		*(.discard.*)
+	}
 
 	__ksymtab		0 : { *(SORT(___ksymtab+*)) }
 	__ksymtab_gpl		0 : { *(SORT(___ksymtab_gpl+*)) }
diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c
index 5fc52ee3264c..4cfdbb5b6967 100644
--- a/tools/objtool/builtin-check.c
+++ b/tools/objtool/builtin-check.c
@@ -339,13 +339,13 @@ static int add_dead_ends(struct objtool_file *file)
 	struct instruction *insn;
 	bool found;
 
-	sec = find_section_by_name(file->elf, ".rela__unreachable");
+	sec = find_section_by_name(file->elf, ".rela.discard.unreachable");
 	if (!sec)
 		return 0;
 
 	list_for_each_entry(rela, &sec->rela_list, list) {
 		if (rela->sym->type != STT_SECTION) {
-			WARN("unexpected relocation symbol type in .rela__unreachable");
+			WARN("unexpected relocation symbol type in %s", sec->name);
 			return -1;
 		}
 		insn = find_insn(file, rela->sym->sec, rela->addend);
@@ -1272,7 +1272,7 @@ int cmd_check(int argc, const char **argv)
 
 	INIT_LIST_HEAD(&file.insn_list);
 	hash_init(file.insn_hash);
-	file.whitelist = find_section_by_name(file.elf, "__func_stack_frame_non_standard");
+	file.whitelist = find_section_by_name(file.elf, ".discard.func_stack_frame_non_standard");
 	file.rodata = find_section_by_name(file.elf, ".rodata");
 	file.ignore_unreachables = false;
 	file.c_file = find_section_by_name(file.elf, ".comment");
-- 
cgit v1.2.3


From 0837e49ab3fa8d903a499984575d71efee8097ce Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 1 Mar 2017 15:11:23 +0000
Subject: KEYS: Differentiate uses of rcu_dereference_key() and
 user_key_payload()

rcu_dereference_key() and user_key_payload() are currently being used in
two different, incompatible ways:

 (1) As a wrapper to rcu_dereference() - when only the RCU read lock used
     to protect the key.

 (2) As a wrapper to rcu_dereference_protected() - when the key semaphor is
     used to protect the key and the may be being modified.

Fix this by splitting both of the key wrappers to produce:

 (1) RCU accessors for keys when caller has the key semaphore locked:

	dereference_key_locked()
	user_key_payload_locked()

 (2) RCU accessors for keys when caller holds the RCU read lock:

	dereference_key_rcu()
	user_key_payload_rcu()

This should fix following warning in the NFS idmapper

  ===============================
  [ INFO: suspicious RCU usage. ]
  4.10.0 #1 Tainted: G        W
  -------------------------------
  ./include/keys/user-type.h:53 suspicious rcu_dereference_protected() usage!
  other info that might help us debug this:
  rcu_scheduler_active = 2, debug_locks = 0
  1 lock held by mount.nfs/5987:
    #0:  (rcu_read_lock){......}, at: [<d000000002527abc>] nfs_idmap_get_key+0x15c/0x420 [nfsv4]
  stack backtrace:
  CPU: 1 PID: 5987 Comm: mount.nfs Tainted: G        W       4.10.0 #1
  Call Trace:
    dump_stack+0xe8/0x154 (unreliable)
    lockdep_rcu_suspicious+0x140/0x190
    nfs_idmap_get_key+0x380/0x420 [nfsv4]
    nfs_map_name_to_uid+0x2a0/0x3b0 [nfsv4]
    decode_getfattr_attrs+0xfac/0x16b0 [nfsv4]
    decode_getfattr_generic.constprop.106+0xbc/0x150 [nfsv4]
    nfs4_xdr_dec_lookup_root+0xac/0xb0 [nfsv4]
    rpcauth_unwrap_resp+0xe8/0x140 [sunrpc]
    call_decode+0x29c/0x910 [sunrpc]
    __rpc_execute+0x140/0x8f0 [sunrpc]
    rpc_run_task+0x170/0x200 [sunrpc]
    nfs4_call_sync_sequence+0x68/0xa0 [nfsv4]
    _nfs4_lookup_root.isra.44+0xd0/0xf0 [nfsv4]
    nfs4_lookup_root+0xe0/0x350 [nfsv4]
    nfs4_lookup_root_sec+0x70/0xa0 [nfsv4]
    nfs4_find_root_sec+0xc4/0x100 [nfsv4]
    nfs4_proc_get_rootfh+0x5c/0xf0 [nfsv4]
    nfs4_get_rootfh+0x6c/0x190 [nfsv4]
    nfs4_server_common_setup+0xc4/0x260 [nfsv4]
    nfs4_create_server+0x278/0x3c0 [nfsv4]
    nfs4_remote_mount+0x50/0xb0 [nfsv4]
    mount_fs+0x74/0x210
    vfs_kern_mount+0x78/0x220
    nfs_do_root_mount+0xb0/0x140 [nfsv4]
    nfs4_try_mount+0x60/0x100 [nfsv4]
    nfs_fs_mount+0x5ec/0xda0 [nfs]
    mount_fs+0x74/0x210
    vfs_kern_mount+0x78/0x220
    do_mount+0x254/0xf70
    SyS_mount+0x94/0x100
    system_call+0x38/0xe0

Reported-by: Jan Stancek <jstancek@redhat.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Jan Stancek <jstancek@redhat.com>
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 Documentation/security/keys.txt          | 17 +++++++++++++++--
 drivers/md/dm-crypt.c                    |  2 +-
 fs/cifs/connect.c                        |  2 +-
 fs/crypto/keyinfo.c                      |  2 +-
 fs/ecryptfs/ecryptfs_kernel.h            |  2 +-
 fs/fscache/object-list.c                 |  2 +-
 fs/nfs/nfs4idmap.c                       |  2 +-
 include/keys/user-type.h                 |  9 +++++++--
 include/linux/key.h                      |  5 ++++-
 lib/digsig.c                             |  2 +-
 net/dns_resolver/dns_query.c             |  4 ++--
 security/keys/dh.c                       |  2 +-
 security/keys/encrypted-keys/encrypted.c |  4 ++--
 security/keys/trusted.c                  |  4 ++--
 security/keys/user_defined.c             |  6 +++---
 15 files changed, 43 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/security/keys.txt b/Documentation/security/keys.txt
index 3849814bfe6d..0e03baf271bd 100644
--- a/Documentation/security/keys.txt
+++ b/Documentation/security/keys.txt
@@ -1151,8 +1151,21 @@ access the data:
      usage.  This is called key->payload.rcu_data0.  The following accessors
      wrap the RCU calls to this element:
 
-	rcu_assign_keypointer(struct key *key, void *data);
-	void *rcu_dereference_key(struct key *key);
+     (a) Set or change the first payload pointer:
+
+		rcu_assign_keypointer(struct key *key, void *data);
+
+     (b) Read the first payload pointer with the key semaphore held:
+
+		[const] void *dereference_key_locked([const] struct key *key);
+
+	 Note that the return value will inherit its constness from the key
+	 parameter.  Static analysis will give an error if it things the lock
+	 isn't held.
+
+     (c) Read the first payload pointer with the RCU read lock held:
+
+		const void *dereference_key_rcu(const struct key *key);
 
 
 ===================
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 1cb2ca9dfae3..389a3637ffcc 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1536,7 +1536,7 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string
 
 	down_read(&key->sem);
 
-	ukp = user_key_payload(key);
+	ukp = user_key_payload_locked(key);
 	if (!ukp) {
 		up_read(&key->sem);
 		key_put(key);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 777ad9f4fc3c..8a3ecef30d3c 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2455,7 +2455,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
 	}
 
 	down_read(&key->sem);
-	upayload = user_key_payload(key);
+	upayload = user_key_payload_locked(key);
 	if (IS_ERR_OR_NULL(upayload)) {
 		rc = upayload ? PTR_ERR(upayload) : -EINVAL;
 		goto out_key_put;
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 02eb6b9e4438..d5d896fa5a71 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -103,7 +103,7 @@ static int validate_user_key(struct fscrypt_info *crypt_info,
 		goto out;
 	}
 	down_read(&keyring_key->sem);
-	ukp = user_key_payload(keyring_key);
+	ukp = user_key_payload_locked(keyring_key);
 	if (ukp->datalen != sizeof(struct fscrypt_key)) {
 		res = -EINVAL;
 		up_read(&keyring_key->sem);
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 599a29237cfe..95c1c8d34539 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -117,7 +117,7 @@ ecryptfs_get_key_payload_data(struct key *key)
 
 	auth_tok = ecryptfs_get_encrypted_key_payload_data(key);
 	if (!auth_tok)
-		return (struct ecryptfs_auth_tok *)user_key_payload(key)->data;
+		return (struct ecryptfs_auth_tok *)user_key_payload_locked(key)->data;
 	else
 		return auth_tok;
 }
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 5d5ddaa84b21..67f940892ef8 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -329,7 +329,7 @@ static void fscache_objlist_config(struct fscache_objlist_data *data)
 	config = 0;
 	rcu_read_lock();
 
-	confkey = user_key_payload(key);
+	confkey = user_key_payload_rcu(key);
 	buf = confkey->data;
 
 	for (len = confkey->datalen - 1; len >= 0; len--) {
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index c444285bb1b1..835c163f61af 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -316,7 +316,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
 	if (ret < 0)
 		goto out_up;
 
-	payload = user_key_payload(rkey);
+	payload = user_key_payload_rcu(rkey);
 	if (IS_ERR_OR_NULL(payload)) {
 		ret = PTR_ERR(payload);
 		goto out_up;
diff --git a/include/keys/user-type.h b/include/keys/user-type.h
index c56fef40f53e..e098cbe27db5 100644
--- a/include/keys/user-type.h
+++ b/include/keys/user-type.h
@@ -48,9 +48,14 @@ extern void user_describe(const struct key *user, struct seq_file *m);
 extern long user_read(const struct key *key,
 		      char __user *buffer, size_t buflen);
 
-static inline const struct user_key_payload *user_key_payload(const struct key *key)
+static inline const struct user_key_payload *user_key_payload_rcu(const struct key *key)
 {
-	return (struct user_key_payload *)rcu_dereference_key(key);
+	return (struct user_key_payload *)dereference_key_rcu(key);
+}
+
+static inline struct user_key_payload *user_key_payload_locked(const struct key *key)
+{
+	return (struct user_key_payload *)dereference_key_locked((struct key *)key);
 }
 
 #endif /* CONFIG_KEYS */
diff --git a/include/linux/key.h b/include/linux/key.h
index 722914798f37..e45212f2777e 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -354,7 +354,10 @@ static inline bool key_is_instantiated(const struct key *key)
 		!test_bit(KEY_FLAG_NEGATIVE, &key->flags);
 }
 
-#define rcu_dereference_key(KEY)					\
+#define dereference_key_rcu(KEY)					\
+	(rcu_dereference((KEY)->payload.rcu_data0))
+
+#define dereference_key_locked(KEY)					\
 	(rcu_dereference_protected((KEY)->payload.rcu_data0,		\
 				   rwsem_is_locked(&((struct key *)(KEY))->sem)))
 
diff --git a/lib/digsig.c b/lib/digsig.c
index 55b8b2f41a9e..03d7c63837ae 100644
--- a/lib/digsig.c
+++ b/lib/digsig.c
@@ -85,7 +85,7 @@ static int digsig_verify_rsa(struct key *key,
 	struct pubkey_hdr *pkh;
 
 	down_read(&key->sem);
-	ukp = user_key_payload(key);
+	ukp = user_key_payload_locked(key);
 
 	if (ukp->datalen < sizeof(*pkh))
 		goto err1;
diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c
index ecc28cff08ab..d502c94b1a82 100644
--- a/net/dns_resolver/dns_query.c
+++ b/net/dns_resolver/dns_query.c
@@ -70,7 +70,7 @@ int dns_query(const char *type, const char *name, size_t namelen,
 	      const char *options, char **_result, time64_t *_expiry)
 {
 	struct key *rkey;
-	const struct user_key_payload *upayload;
+	struct user_key_payload *upayload;
 	const struct cred *saved_cred;
 	size_t typelen, desclen;
 	char *desc, *cp;
@@ -141,7 +141,7 @@ int dns_query(const char *type, const char *name, size_t namelen,
 	if (ret)
 		goto put;
 
-	upayload = user_key_payload(rkey);
+	upayload = user_key_payload_locked(rkey);
 	len = upayload->datalen;
 
 	ret = -ENOMEM;
diff --git a/security/keys/dh.c b/security/keys/dh.c
index 531ed2ec132f..893af4c45038 100644
--- a/security/keys/dh.c
+++ b/security/keys/dh.c
@@ -55,7 +55,7 @@ static ssize_t mpi_from_key(key_serial_t keyid, size_t maxlen, MPI *mpi)
 		if (status == 0) {
 			const struct user_key_payload *payload;
 
-			payload = user_key_payload(key);
+			payload = user_key_payload_locked(key);
 
 			if (maxlen == 0) {
 				*mpi = NULL;
diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c
index 4fb315cddf5b..0010955d7876 100644
--- a/security/keys/encrypted-keys/encrypted.c
+++ b/security/keys/encrypted-keys/encrypted.c
@@ -314,7 +314,7 @@ static struct key *request_user_key(const char *master_desc, const u8 **master_k
 		goto error;
 
 	down_read(&ukey->sem);
-	upayload = user_key_payload(ukey);
+	upayload = user_key_payload_locked(ukey);
 	*master_key = upayload->data;
 	*master_keylen = upayload->datalen;
 error:
@@ -926,7 +926,7 @@ static long encrypted_read(const struct key *key, char __user *buffer,
 	size_t asciiblob_len;
 	int ret;
 
-	epayload = rcu_dereference_key(key);
+	epayload = dereference_key_locked(key);
 
 	/* returns the hex encoded iv, encrypted-data, and hmac as ascii */
 	asciiblob_len = epayload->datablob_len + ivsize + 1
diff --git a/security/keys/trusted.c b/security/keys/trusted.c
index 90d61751ff12..2ae31c5a87de 100644
--- a/security/keys/trusted.c
+++ b/security/keys/trusted.c
@@ -1140,12 +1140,12 @@ out:
 static long trusted_read(const struct key *key, char __user *buffer,
 			 size_t buflen)
 {
-	struct trusted_key_payload *p;
+	const struct trusted_key_payload *p;
 	char *ascii_buf;
 	char *bufp;
 	int i;
 
-	p = rcu_dereference_key(key);
+	p = dereference_key_locked(key);
 	if (!p)
 		return -EINVAL;
 	if (!buffer || buflen <= 0)
diff --git a/security/keys/user_defined.c b/security/keys/user_defined.c
index e187c8909d9d..26605134f17a 100644
--- a/security/keys/user_defined.c
+++ b/security/keys/user_defined.c
@@ -107,7 +107,7 @@ int user_update(struct key *key, struct key_preparsed_payload *prep)
 	/* attach the new data, displacing the old */
 	key->expiry = prep->expiry;
 	if (!test_bit(KEY_FLAG_NEGATIVE, &key->flags))
-		zap = rcu_dereference_key(key);
+		zap = dereference_key_locked(key);
 	rcu_assign_keypointer(key, prep->payload.data[0]);
 	prep->payload.data[0] = NULL;
 
@@ -123,7 +123,7 @@ EXPORT_SYMBOL_GPL(user_update);
  */
 void user_revoke(struct key *key)
 {
-	struct user_key_payload *upayload = key->payload.data[0];
+	struct user_key_payload *upayload = user_key_payload_locked(key);
 
 	/* clear the quota */
 	key_payload_reserve(key, 0);
@@ -169,7 +169,7 @@ long user_read(const struct key *key, char __user *buffer, size_t buflen)
 	const struct user_key_payload *upayload;
 	long ret;
 
-	upayload = user_key_payload(key);
+	upayload = user_key_payload_locked(key);
 	ret = upayload->datalen;
 
 	/* we can return the data as is */
-- 
cgit v1.2.3


From eb1e011a14748a1d9df9a7d7df9a5711721a1bdb Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 15 Feb 2017 09:49:26 +0100
Subject: average: change to declare precision, not factor

Declaring the factor is counter-intuitive, and people are prone
to using small(-ish) values even when that makes no sense.

Change the DECLARE_EWMA() macro to take the fractional precision,
in bits, rather than a factor, and update all users.

While at it, add some more documentation.

Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/virtio_net.c                    |  2 +-
 drivers/net/wireless/ath/ath5k/ath5k.h      |  2 +-
 drivers/net/wireless/ralink/rt2x00/rt2x00.h |  2 +-
 include/linux/average.h                     | 61 +++++++++++++++++++----------
 net/batman-adv/types.h                      |  2 +-
 net/mac80211/ieee80211_i.h                  |  2 +-
 net/mac80211/sta_info.h                     |  2 +-
 7 files changed, 47 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index bf95016f442a..e9d7e2b70085 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -51,7 +51,7 @@ module_param(gso, bool, 0444);
  * at once, the weight is chosen so that the EWMA will be insensitive to short-
  * term, transient changes in packet size.
  */
-DECLARE_EWMA(pkt_len, 1, 64)
+DECLARE_EWMA(pkt_len, 0, 64)
 
 /* With mergeable buffers we align buffer address and use the low bits to
  * encode its true size. Buffer size is up to 1 page so we need to align to
diff --git a/drivers/net/wireless/ath/ath5k/ath5k.h b/drivers/net/wireless/ath/ath5k/ath5k.h
index 67fedb61fcc0..979800c6f57f 100644
--- a/drivers/net/wireless/ath/ath5k/ath5k.h
+++ b/drivers/net/wireless/ath/ath5k/ath5k.h
@@ -1252,7 +1252,7 @@ struct ath5k_statistics {
 #define ATH5K_TXQ_LEN_MAX	(ATH_TXBUF / 4)		/* bufs per queue */
 #define ATH5K_TXQ_LEN_LOW	(ATH5K_TXQ_LEN_MAX / 2)	/* low mark */
 
-DECLARE_EWMA(beacon_rssi, 1024, 8)
+DECLARE_EWMA(beacon_rssi, 10, 8)
 
 /* Driver state associated with an instance of a device */
 struct ath5k_hw {
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00.h b/drivers/net/wireless/ralink/rt2x00/rt2x00.h
index 26869b3bef45..340787894c69 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2x00.h
+++ b/drivers/net/wireless/ralink/rt2x00/rt2x00.h
@@ -257,7 +257,7 @@ struct link_qual {
 	int tx_failed;
 };
 
-DECLARE_EWMA(rssi, 1024, 8)
+DECLARE_EWMA(rssi, 10, 8)
 
 /*
  * Antenna settings about the currently active link.
diff --git a/include/linux/average.h b/include/linux/average.h
index d04aa58280de..7ddaf340d2ac 100644
--- a/include/linux/average.h
+++ b/include/linux/average.h
@@ -1,45 +1,66 @@
 #ifndef _LINUX_AVERAGE_H
 #define _LINUX_AVERAGE_H
 
-/* Exponentially weighted moving average (EWMA) */
+/*
+ * Exponentially weighted moving average (EWMA)
+ *
+ * This implements a fixed-precision EWMA algorithm, with both the
+ * precision and fall-off coefficient determined at compile-time
+ * and built into the generated helper funtions.
+ *
+ * The first argument to the macro is the name that will be used
+ * for the struct and helper functions.
+ *
+ * The second argument, the precision, expresses how many bits are
+ * used for the fractional part of the fixed-precision values.
+ *
+ * The third argument, the weight reciprocal, determines how the
+ * new values will be weighed vs. the old state, new values will
+ * get weight 1/weight_rcp and old values 1-1/weight_rcp. Note
+ * that this parameter must be a power of two for efficiency.
+ */
 
-#define DECLARE_EWMA(name, _factor, _weight)				\
+#define DECLARE_EWMA(name, _precision, _weight_rcp)			\
 	struct ewma_##name {						\
 		unsigned long internal;					\
 	};								\
 	static inline void ewma_##name##_init(struct ewma_##name *e)	\
 	{								\
-		BUILD_BUG_ON(!__builtin_constant_p(_factor));		\
-		BUILD_BUG_ON(!__builtin_constant_p(_weight));		\
-		BUILD_BUG_ON_NOT_POWER_OF_2(_factor);			\
-		BUILD_BUG_ON_NOT_POWER_OF_2(_weight);			\
+		BUILD_BUG_ON(!__builtin_constant_p(_precision));	\
+		BUILD_BUG_ON(!__builtin_constant_p(_weight_rcp));	\
+		/*							\
+		 * Even if you want to feed it just 0/1 you should have	\
+		 * some bits for the non-fractional part...		\
+		 */							\
+		BUILD_BUG_ON((_precision) > 30);			\
+		BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp);		\
 		e->internal = 0;					\
 	}								\
 	static inline unsigned long					\
 	ewma_##name##_read(struct ewma_##name *e)			\
 	{								\
-		BUILD_BUG_ON(!__builtin_constant_p(_factor));		\
-		BUILD_BUG_ON(!__builtin_constant_p(_weight));		\
-		BUILD_BUG_ON_NOT_POWER_OF_2(_factor);			\
-		BUILD_BUG_ON_NOT_POWER_OF_2(_weight);			\
-		return e->internal >> ilog2(_factor);			\
+		BUILD_BUG_ON(!__builtin_constant_p(_precision));	\
+		BUILD_BUG_ON(!__builtin_constant_p(_weight_rcp));	\
+		BUILD_BUG_ON((_precision) > 30);			\
+		BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp);		\
+		return e->internal >> (_precision);			\
 	}								\
 	static inline void ewma_##name##_add(struct ewma_##name *e,	\
 					     unsigned long val)		\
 	{								\
 		unsigned long internal = ACCESS_ONCE(e->internal);	\
-		unsigned long weight = ilog2(_weight);			\
-		unsigned long factor = ilog2(_factor);			\
+		unsigned long weight_rcp = ilog2(_weight_rcp);		\
+		unsigned long precision = _precision;			\
 									\
-		BUILD_BUG_ON(!__builtin_constant_p(_factor));		\
-		BUILD_BUG_ON(!__builtin_constant_p(_weight));		\
-		BUILD_BUG_ON_NOT_POWER_OF_2(_factor);			\
-		BUILD_BUG_ON_NOT_POWER_OF_2(_weight);			\
+		BUILD_BUG_ON(!__builtin_constant_p(_precision));	\
+		BUILD_BUG_ON(!__builtin_constant_p(_weight_rcp));	\
+		BUILD_BUG_ON((_precision) > 30);			\
+		BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp);		\
 									\
 		ACCESS_ONCE(e->internal) = internal ?			\
-			(((internal << weight) - internal) +		\
-				(val << factor)) >> weight :		\
-			(val << factor);				\
+			(((internal << weight_rcp) - internal) +	\
+				(val << precision)) >> weight_rcp :	\
+			(val << precision);				\
 	}
 
 #endif /* _LINUX_AVERAGE_H */
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 8f64a5c01345..66b25e410a41 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -402,7 +402,7 @@ struct batadv_gw_node {
 	struct rcu_head rcu;
 };
 
-DECLARE_EWMA(throughput, 1024, 8)
+DECLARE_EWMA(throughput, 10, 8)
 
 /**
  * struct batadv_hardif_neigh_node_bat_v - B.A.T.M.A.N. V private neighbor
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 159a1a733725..0e718437d080 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -428,7 +428,7 @@ struct ieee80211_sta_tx_tspec {
 	bool downgraded;
 };
 
-DECLARE_EWMA(beacon_signal, 16, 4)
+DECLARE_EWMA(beacon_signal, 4, 4)
 
 struct ieee80211_if_managed {
 	struct timer_list timer;
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 15599c70a38f..e65cda34d2bc 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -372,7 +372,7 @@ struct mesh_sta {
 	unsigned int fail_avg;
 };
 
-DECLARE_EWMA(signal, 1024, 8)
+DECLARE_EWMA(signal, 10, 8)
 
 struct ieee80211_sta_rx_stats {
 	unsigned long packets;
-- 
cgit v1.2.3


From 9ccd27cc2e8bfbec502d7c64a353dc4489536b81 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 21:11:09 +0100
Subject: sched/headers: Make all include/linux/sched/*.h headers build
 standalone

Make each header self-sufficient, so that it can be built successfully
both in an allnoconfig and allyesconfig kernel.

Also standardize the naming of their header guards.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/deadline.h |  8 +++++---
 include/linux/sched/prio.h     |  6 +++---
 include/linux/sched/rt.h       | 10 ++++++----
 include/linux/sched/sysctl.h   | 10 +++++++---
 4 files changed, 21 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index 9089a2ae913d..975be862e083 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -1,5 +1,7 @@
-#ifndef _SCHED_DEADLINE_H
-#define _SCHED_DEADLINE_H
+#ifndef _LINUX_SCHED_DEADLINE_H
+#define _LINUX_SCHED_DEADLINE_H
+
+#include <linux/sched.h>
 
 /*
  * SCHED_DEADLINE tasks has negative priorities, reflecting
@@ -26,4 +28,4 @@ static inline bool dl_time_before(u64 a, u64 b)
 	return (s64)(a - b) < 0;
 }
 
-#endif /* _SCHED_DEADLINE_H */
+#endif /* _LINUX_SCHED_DEADLINE_H */
diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
index d9cf5a5762d9..2cc450f6ec54 100644
--- a/include/linux/sched/prio.h
+++ b/include/linux/sched/prio.h
@@ -1,5 +1,5 @@
-#ifndef _SCHED_PRIO_H
-#define _SCHED_PRIO_H
+#ifndef _LINUX_SCHED_PRIO_H
+#define _LINUX_SCHED_PRIO_H
 
 #define MAX_NICE	19
 #define MIN_NICE	-20
@@ -57,4 +57,4 @@ static inline long rlimit_to_nice(long prio)
 	return (MAX_NICE - prio + 1);
 }
 
-#endif /* _SCHED_PRIO_H */
+#endif /* _LINUX_SCHED_PRIO_H */
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index a30b172df6e1..3bd668414f61 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -1,7 +1,9 @@
-#ifndef _SCHED_RT_H
-#define _SCHED_RT_H
+#ifndef _LINUX_SCHED_RT_H
+#define _LINUX_SCHED_RT_H
 
-#include <linux/sched/prio.h>
+#include <linux/sched.h>
+
+struct task_struct;
 
 static inline int rt_prio(int prio)
 {
@@ -57,4 +59,4 @@ extern void normalize_rt_tasks(void);
  */
 #define RR_TIMESLICE		(100 * HZ / 1000)
 
-#endif /* _SCHED_RT_H */
+#endif /* _LINUX_SCHED_RT_H */
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 49308e142aae..0f5ecd4d298e 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -1,5 +1,9 @@
-#ifndef _SCHED_SYSCTL_H
-#define _SCHED_SYSCTL_H
+#ifndef _LINUX_SCHED_SYSCTL_H
+#define _LINUX_SCHED_SYSCTL_H
+
+#include <linux/types.h>
+
+struct ctl_table;
 
 #ifdef CONFIG_DETECT_HUNG_TASK
 extern int	     sysctl_hung_task_check_count;
@@ -78,4 +82,4 @@ extern int sysctl_schedstats(struct ctl_table *table, int write,
 				 void __user *buffer, size_t *lenp,
 				 loff_t *ppos);
 
-#endif /* _SCHED_SYSCTL_H */
+#endif /* _LINUX_SCHED_SYSCTL_H */
-- 
cgit v1.2.3


From c930b2c0de32f45ce8f67affe936ce7a05b07b00 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 12:22:54 +0100
Subject: sched/core: Convert ___assert_task_state() link time assert to
 BUILD_BUG_ON()

The length of TASK_STATE_TO_CHAR_STR was still checked using the old
link-time manual error method - convert it to BUILD_BUG_ON(). This
has a couple of advantages:

 - it's more obvious what's going on

 - it reduces the size and complexity of <linux/sched.h>

 - BUILD_BUG_ON() will fail during compilation, with a clearer
   error message than the link time assert.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 3 ---
 kernel/sched/core.c   | 3 +++
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4a28deb5f210..c204613396cd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -223,9 +223,6 @@ extern void proc_sched_set_task(struct task_struct *p);
 
 #define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn"
 
-extern char ___assert_task_state[1 - 2*!!(
-		sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
-
 /* Convenience macros for the sake of set_current_state */
 #define TASK_KILLABLE		(TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
 #define TASK_STOPPED		(TASK_WAKEKILL | __TASK_STOPPED)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bbfb917a9b49..7d76ccb79e91 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5233,6 +5233,9 @@ void sched_show_task(struct task_struct *p)
 	int ppid;
 	unsigned long state = p->state;
 
+	/* Make sure the string lines up properly with the number of task states: */
+	BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1);
+
 	if (!try_get_task_stack(p))
 		return;
 	if (state)
-- 
cgit v1.2.3


From 59ddbcb2f45b958cf1f11f122b666cbcf50cd57b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 23:37:48 +0100
Subject: sched/core: Move the get_preempt_disable_ip() inline to sched/core.c

It's defined in <linux/sched.h>, but nothing outside the scheduler
uses it - so move it to the sched/core.c usage site.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 9 ---------
 kernel/sched/core.c   | 9 +++++++++
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c204613396cd..df42cac04243 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3419,15 +3419,6 @@ static inline void cond_resched_rcu(void)
 #endif
 }
 
-static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
-{
-#ifdef CONFIG_DEBUG_PREEMPT
-	return p->preempt_disable_ip;
-#else
-	return 0;
-#endif
-}
-
 /*
  * Does a critical section need to be broken due to another
  * task waiting?: (technically does not depend on CONFIG_PREEMPT,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7d76ccb79e91..2acdf19c5f7c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3211,6 +3211,15 @@ static inline void preempt_latency_start(int val) { }
 static inline void preempt_latency_stop(int val) { }
 #endif
 
+static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+	return p->preempt_disable_ip;
+#else
+	return 0;
+#endif
+}
+
 /*
  * Print scheduling while atomic bug:
  */
-- 
cgit v1.2.3


From 0c98d344fe5c27f6e4bce42ac503e9e9a51c7d1d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 5 Feb 2017 15:38:10 +0100
Subject: sched/core: Remove the tsk_cpus_allowed() wrapper

So the original intention of tsk_cpus_allowed() was to 'future-proof'
the field - but it's pretty ineffectual at that, because half of
the code uses ->cpus_allowed directly ...

Also, the wrapper makes the code longer than the original expression!

So just get rid of it. This also shrinks <linux/sched.h> a bit.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/powerpc/kernel/smp.c                  |  2 +-
 arch/powerpc/platforms/cell/spufs/sched.c  |  2 +-
 arch/sparc/kernel/sysfs.c                  |  2 +-
 drivers/cpufreq/sparc-us2e-cpufreq.c       |  4 ++--
 drivers/cpufreq/sparc-us3-cpufreq.c        |  4 ++--
 drivers/infiniband/hw/hfi1/affinity.c      |  2 +-
 drivers/infiniband/hw/hfi1/sdma.c          |  2 +-
 include/linux/sched.h                      |  3 ---
 kernel/sched/core.c                        | 20 ++++++++++----------
 kernel/sched/cpudeadline.c                 |  4 ++--
 kernel/sched/cpupri.c                      |  4 ++--
 kernel/sched/deadline.c                    |  7 +++----
 kernel/sched/fair.c                        | 25 ++++++++++++-------------
 kernel/sched/rt.c                          |  5 ++---
 lib/smp_processor_id.c                     |  2 +-
 samples/trace_events/trace-events-sample.c |  2 +-
 16 files changed, 42 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 573fb3a461b5..21fdf02583fe 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -795,7 +795,7 @@ void __init smp_cpus_done(unsigned int max_cpus)
 	 * se we pin us down to CPU 0 for a short while
 	 */
 	alloc_cpumask_var(&old_mask, GFP_NOWAIT);
-	cpumask_copy(old_mask, tsk_cpus_allowed(current));
+	cpumask_copy(old_mask, &current->cpus_allowed);
 	set_cpus_allowed_ptr(current, cpumask_of(boot_cpuid));
 	
 	if (smp_ops && smp_ops->setup_cpu)
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 460f5f31d5cb..9b543df210fb 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -140,7 +140,7 @@ void __spu_update_sched_info(struct spu_context *ctx)
 	 * runqueue. The context will be rescheduled on the proper node
 	 * if it is timesliced or preempted.
 	 */
-	cpumask_copy(&ctx->cpus_allowed, tsk_cpus_allowed(current));
+	cpumask_copy(&ctx->cpus_allowed, &current->cpus_allowed);
 
 	/* Save the current cpu id for spu interrupt routing. */
 	ctx->last_ran = raw_smp_processor_id();
diff --git a/arch/sparc/kernel/sysfs.c b/arch/sparc/kernel/sysfs.c
index 4808b6d23455..d63fc613e7a9 100644
--- a/arch/sparc/kernel/sysfs.c
+++ b/arch/sparc/kernel/sysfs.c
@@ -106,7 +106,7 @@ static unsigned long run_on_cpu(unsigned long cpu,
 	cpumask_t old_affinity;
 	unsigned long ret;
 
-	cpumask_copy(&old_affinity, tsk_cpus_allowed(current));
+	cpumask_copy(&old_affinity, &current->cpus_allowed);
 	/* should return -EINVAL to userspace */
 	if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
 		return 0;
diff --git a/drivers/cpufreq/sparc-us2e-cpufreq.c b/drivers/cpufreq/sparc-us2e-cpufreq.c
index b73feeb666f9..35ddb6da93aa 100644
--- a/drivers/cpufreq/sparc-us2e-cpufreq.c
+++ b/drivers/cpufreq/sparc-us2e-cpufreq.c
@@ -234,7 +234,7 @@ static unsigned int us2e_freq_get(unsigned int cpu)
 	cpumask_t cpus_allowed;
 	unsigned long clock_tick, estar;
 
-	cpumask_copy(&cpus_allowed, tsk_cpus_allowed(current));
+	cpumask_copy(&cpus_allowed, &current->cpus_allowed);
 	set_cpus_allowed_ptr(current, cpumask_of(cpu));
 
 	clock_tick = sparc64_get_clock_tick(cpu) / 1000;
@@ -252,7 +252,7 @@ static int us2e_freq_target(struct cpufreq_policy *policy, unsigned int index)
 	unsigned long clock_tick, divisor, old_divisor, estar;
 	cpumask_t cpus_allowed;
 
-	cpumask_copy(&cpus_allowed, tsk_cpus_allowed(current));
+	cpumask_copy(&cpus_allowed, &current->cpus_allowed);
 	set_cpus_allowed_ptr(current, cpumask_of(cpu));
 
 	new_freq = clock_tick = sparc64_get_clock_tick(cpu) / 1000;
diff --git a/drivers/cpufreq/sparc-us3-cpufreq.c b/drivers/cpufreq/sparc-us3-cpufreq.c
index 9bb42ba50efa..a8d86a449ca1 100644
--- a/drivers/cpufreq/sparc-us3-cpufreq.c
+++ b/drivers/cpufreq/sparc-us3-cpufreq.c
@@ -82,7 +82,7 @@ static unsigned int us3_freq_get(unsigned int cpu)
 	unsigned long reg;
 	unsigned int ret;
 
-	cpumask_copy(&cpus_allowed, tsk_cpus_allowed(current));
+	cpumask_copy(&cpus_allowed, &current->cpus_allowed);
 	set_cpus_allowed_ptr(current, cpumask_of(cpu));
 
 	reg = read_safari_cfg();
@@ -99,7 +99,7 @@ static int us3_freq_target(struct cpufreq_policy *policy, unsigned int index)
 	unsigned long new_bits, new_freq, reg;
 	cpumask_t cpus_allowed;
 
-	cpumask_copy(&cpus_allowed, tsk_cpus_allowed(current));
+	cpumask_copy(&cpus_allowed, &current->cpus_allowed);
 	set_cpus_allowed_ptr(current, cpumask_of(cpu));
 
 	new_freq = sparc64_get_clock_tick(cpu) / 1000;
diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c
index 7a3d906b3671..e2cd2cd3b28a 100644
--- a/drivers/infiniband/hw/hfi1/affinity.c
+++ b/drivers/infiniband/hw/hfi1/affinity.c
@@ -576,7 +576,7 @@ int hfi1_get_proc_affinity(int node)
 	struct hfi1_affinity_node *entry;
 	cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
 	const struct cpumask *node_mask,
-		*proc_mask = tsk_cpus_allowed(current);
+		*proc_mask = &current->cpus_allowed;
 	struct hfi1_affinity_node_list *affinity = &node_affinity;
 	struct cpu_mask_set *set = &affinity->proc;
 
diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c
index 1d81cac1fa6c..5cde1ecda0fe 100644
--- a/drivers/infiniband/hw/hfi1/sdma.c
+++ b/drivers/infiniband/hw/hfi1/sdma.c
@@ -856,7 +856,7 @@ struct sdma_engine *sdma_select_user_engine(struct hfi1_devdata *dd,
 {
 	struct sdma_rht_node *rht_node;
 	struct sdma_engine *sde = NULL;
-	const struct cpumask *current_mask = tsk_cpus_allowed(current);
+	const struct cpumask *current_mask = &current->cpus_allowed;
 	unsigned long cpu_id;
 
 	/*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index df42cac04243..6d1cc20cc477 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1995,9 +1995,6 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
 }
 #endif
 
-/* Future-safe accessor for struct task_struct's cpus_allowed. */
-#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
-
 static inline int tsk_nr_cpus_allowed(struct task_struct *p)
 {
 	return p->nr_cpus_allowed;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2acdf19c5f7c..ef5bbf760a08 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -981,7 +981,7 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_
 		return rq;
 
 	/* Affinity changed (again). */
-	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
 		return rq;
 
 	rq = move_queued_task(rq, p, dest_cpu);
@@ -1259,10 +1259,10 @@ static int migrate_swap_stop(void *data)
 	if (task_cpu(arg->src_task) != arg->src_cpu)
 		goto unlock;
 
-	if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
+	if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
 		goto unlock;
 
-	if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
+	if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
 		goto unlock;
 
 	__migrate_swap_task(arg->src_task, arg->dst_cpu);
@@ -1303,10 +1303,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
 	if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
 		goto out;
 
-	if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
+	if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
 		goto out;
 
-	if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
+	if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
 		goto out;
 
 	trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
@@ -1490,14 +1490,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 		for_each_cpu(dest_cpu, nodemask) {
 			if (!cpu_active(dest_cpu))
 				continue;
-			if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+			if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
 				return dest_cpu;
 		}
 	}
 
 	for (;;) {
 		/* Any allowed, online CPU? */
-		for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
+		for_each_cpu(dest_cpu, &p->cpus_allowed) {
 			if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu))
 				continue;
 			if (!cpu_online(dest_cpu))
@@ -1552,7 +1552,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 	if (tsk_nr_cpus_allowed(p) > 1)
 		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
 	else
-		cpu = cpumask_any(tsk_cpus_allowed(p));
+		cpu = cpumask_any(&p->cpus_allowed);
 
 	/*
 	 * In order not to call set_task_cpu() on a blocking task we need
@@ -1564,7 +1564,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 	 * [ this allows ->select_task() to simply return task_cpu(p) and
 	 *   not worry about this generic constraint ]
 	 */
-	if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
+	if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
 		     !cpu_online(cpu)))
 		cpu = select_fallback_rq(task_cpu(p), p);
 
@@ -5473,7 +5473,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
 	if (curr_cpu == target_cpu)
 		return 0;
 
-	if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
+	if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
 		return -EINVAL;
 
 	/* TODO: This is not properly updating schedstats */
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index e73119013c53..fba235c7d026 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -128,10 +128,10 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
 	const struct sched_dl_entity *dl_se = &p->dl;
 
 	if (later_mask &&
-	    cpumask_and(later_mask, cp->free_cpus, tsk_cpus_allowed(p))) {
+	    cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
 		best_cpu = cpumask_any(later_mask);
 		goto out;
-	} else if (cpumask_test_cpu(cpudl_maximum(cp), tsk_cpus_allowed(p)) &&
+	} else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
 			dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
 		best_cpu = cpudl_maximum(cp);
 		if (later_mask)
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 11e9705bf937..981fcd7dc394 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -103,11 +103,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
 		if (skip)
 			continue;
 
-		if (cpumask_any_and(tsk_cpus_allowed(p), vec->mask) >= nr_cpu_ids)
+		if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
 			continue;
 
 		if (lowest_mask) {
-			cpumask_and(lowest_mask, tsk_cpus_allowed(p), vec->mask);
+			cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
 
 			/*
 			 * We have to ensure that we have at least one bit
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 27737f34757d..8e4d6e4e3ccc 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -252,7 +252,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
 		 * If we cannot preempt any rq, fall back to pick any
 		 * online cpu.
 		 */
-		cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p));
+		cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
 		if (cpu >= nr_cpu_ids) {
 			/*
 			 * Fail to find any suitable cpu.
@@ -1235,7 +1235,7 @@ static void set_curr_task_dl(struct rq *rq)
 static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
 {
 	if (!task_running(rq, p) &&
-	    cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+	    cpumask_test_cpu(cpu, &p->cpus_allowed))
 		return 1;
 	return 0;
 }
@@ -1384,8 +1384,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
 		/* Retry if something changed. */
 		if (double_lock_balance(rq, later_rq)) {
 			if (unlikely(task_rq(task) != rq ||
-				     !cpumask_test_cpu(later_rq->cpu,
-						       tsk_cpus_allowed(task)) ||
+				     !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) ||
 				     task_running(rq, task) ||
 				     !dl_task(task) ||
 				     !task_on_rq_queued(task))) {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 274c747a01ce..3b60d73ab290 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1551,7 +1551,7 @@ static void task_numa_compare(struct task_numa_env *env,
 	 */
 	if (cur) {
 		/* Skip this swap candidate if cannot move to the source cpu */
-		if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
+		if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
 			goto unlock;
 
 		/*
@@ -1661,7 +1661,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
 
 	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
 		/* Skip this CPU if the source task cannot migrate */
-		if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
+		if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
 			continue;
 
 		env->dst_cpu = cpu;
@@ -5458,7 +5458,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 
 		/* Skip over this group if it has no CPUs allowed */
 		if (!cpumask_intersects(sched_group_cpus(group),
-					tsk_cpus_allowed(p)))
+					&p->cpus_allowed))
 			continue;
 
 		local_group = cpumask_test_cpu(this_cpu,
@@ -5578,7 +5578,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 		return cpumask_first(sched_group_cpus(group));
 
 	/* Traverse only the allowed CPUs */
-	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
+	for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
 		if (idle_cpu(i)) {
 			struct rq *rq = cpu_rq(i);
 			struct cpuidle_state *idle = idle_get_state(rq);
@@ -5717,7 +5717,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
 	if (!test_idle_cores(target, false))
 		return -1;
 
-	cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
+	cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
 
 	for_each_cpu_wrap(core, cpus, target, wrap) {
 		bool idle = true;
@@ -5751,7 +5751,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
 		return -1;
 
 	for_each_cpu(cpu, cpu_smt_mask(target)) {
-		if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
 			continue;
 		if (idle_cpu(cpu))
 			return cpu;
@@ -5803,7 +5803,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 	time = local_clock();
 
 	for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
-		if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
 			continue;
 		if (idle_cpu(cpu))
 			break;
@@ -5958,7 +5958,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 	if (sd_flag & SD_BALANCE_WAKE) {
 		record_wakee(p);
 		want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
-			      && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+			      && cpumask_test_cpu(cpu, &p->cpus_allowed);
 	}
 
 	rcu_read_lock();
@@ -6698,7 +6698,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
 		return 0;
 
-	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+	if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
 		int cpu;
 
 		schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
@@ -6718,7 +6718,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 
 		/* Prevent to re-select dst_cpu via env's cpus */
 		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
-			if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+			if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
 				env->flags |= LBF_DST_PINNED;
 				env->new_dst_cpu = cpu;
 				break;
@@ -7252,7 +7252,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
 
 /*
  * Group imbalance indicates (and tries to solve) the problem where balancing
- * groups is inadequate due to tsk_cpus_allowed() constraints.
+ * groups is inadequate due to ->cpus_allowed constraints.
  *
  * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
  * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
@@ -8211,8 +8211,7 @@ more_balance:
 			 * if the curr task on busiest cpu can't be
 			 * moved to this_cpu
 			 */
-			if (!cpumask_test_cpu(this_cpu,
-					tsk_cpus_allowed(busiest->curr))) {
+			if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
 				raw_spin_unlock_irqrestore(&busiest->lock,
 							    flags);
 				env.flags |= LBF_ALL_PINNED;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e8836cfc4cdb..cbd356f63883 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1591,7 +1591,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 {
 	if (!task_running(rq, p) &&
-	    cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+	    cpumask_test_cpu(cpu, &p->cpus_allowed))
 		return 1;
 	return 0;
 }
@@ -1726,8 +1726,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 			 * Also make sure that it wasn't scheduled on its rq.
 			 */
 			if (unlikely(task_rq(task) != rq ||
-				     !cpumask_test_cpu(lowest_rq->cpu,
-						       tsk_cpus_allowed(task)) ||
+				     !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
 				     task_running(rq, task) ||
 				     !rt_task(task) ||
 				     !task_on_rq_queued(task))) {
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index 1afec32de6f2..690d75b132fa 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -22,7 +22,7 @@ notrace static unsigned int check_preemption_disabled(const char *what1,
 	 * Kernel threads bound to a single CPU can safely use
 	 * smp_processor_id():
 	 */
-	if (cpumask_equal(tsk_cpus_allowed(current), cpumask_of(this_cpu)))
+	if (cpumask_equal(&current->cpus_allowed, cpumask_of(this_cpu)))
 		goto out;
 
 	/*
diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c
index 30e282d33d4d..bc7fcf010a5b 100644
--- a/samples/trace_events/trace-events-sample.c
+++ b/samples/trace_events/trace-events-sample.c
@@ -33,7 +33,7 @@ static void simple_thread_func(int cnt)
 
 	/* Silly tracepoints */
 	trace_foo_bar("hello", cnt, array, random_strings[len],
-		      tsk_cpus_allowed(current));
+		      &current->cpus_allowed);
 
 	trace_foo_with_template_simple("HELLO", cnt);
 
-- 
cgit v1.2.3


From 4b53a3412d6663214ce9c754eff9373a9cff9dee Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 5 Feb 2017 15:41:03 +0100
Subject: sched/core: Remove the tsk_nr_cpus_allowed() wrapper

tsk_nr_cpus_allowed() too is a pretty pointless wrapper that
is not used consistently and which makes the code both harder
to read and longer as well.

So remove it - this also shrinks <linux/sched.h> a bit.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h   |  5 -----
 kernel/sched/core.c     |  2 +-
 kernel/sched/deadline.c | 28 ++++++++++++++--------------
 kernel/sched/rt.c       | 24 ++++++++++++------------
 4 files changed, 27 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6d1cc20cc477..e732881517f2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1995,11 +1995,6 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
 }
 #endif
 
-static inline int tsk_nr_cpus_allowed(struct task_struct *p)
-{
-	return p->nr_cpus_allowed;
-}
-
 #define TNF_MIGRATED	0x01
 #define TNF_NO_GROUP	0x02
 #define TNF_SHARED	0x04
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ef5bbf760a08..2d51ec65dc73 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1549,7 +1549,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 {
 	lockdep_assert_held(&p->pi_lock);
 
-	if (tsk_nr_cpus_allowed(p) > 1)
+	if (p->nr_cpus_allowed > 1)
 		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
 	else
 		cpu = cpumask_any(&p->cpus_allowed);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 8e4d6e4e3ccc..99b2c33a9fbc 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -134,7 +134,7 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 {
 	struct task_struct *p = dl_task_of(dl_se);
 
-	if (tsk_nr_cpus_allowed(p) > 1)
+	if (p->nr_cpus_allowed > 1)
 		dl_rq->dl_nr_migratory++;
 
 	update_dl_migration(dl_rq);
@@ -144,7 +144,7 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 {
 	struct task_struct *p = dl_task_of(dl_se);
 
-	if (tsk_nr_cpus_allowed(p) > 1)
+	if (p->nr_cpus_allowed > 1)
 		dl_rq->dl_nr_migratory--;
 
 	update_dl_migration(dl_rq);
@@ -958,7 +958,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 
 	enqueue_dl_entity(&p->dl, pi_se, flags);
 
-	if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
+	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
 		enqueue_pushable_dl_task(rq, p);
 }
 
@@ -1032,9 +1032,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
 	 * try to make it stay here, it might be important.
 	 */
 	if (unlikely(dl_task(curr)) &&
-	    (tsk_nr_cpus_allowed(curr) < 2 ||
+	    (curr->nr_cpus_allowed < 2 ||
 	     !dl_entity_preempt(&p->dl, &curr->dl)) &&
-	    (tsk_nr_cpus_allowed(p) > 1)) {
+	    (p->nr_cpus_allowed > 1)) {
 		int target = find_later_rq(p);
 
 		if (target != -1 &&
@@ -1055,7 +1055,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
 	 * Current can't be migrated, useless to reschedule,
 	 * let's hope p can move out.
 	 */
-	if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
+	if (rq->curr->nr_cpus_allowed == 1 ||
 	    cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
 		return;
 
@@ -1063,7 +1063,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
 	 * p is migratable, so let's not schedule it and
 	 * see if it is pushed or pulled somewhere else.
 	 */
-	if (tsk_nr_cpus_allowed(p) != 1 &&
+	if (p->nr_cpus_allowed != 1 &&
 	    cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
 		return;
 
@@ -1178,7 +1178,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
 {
 	update_curr_dl(rq);
 
-	if (on_dl_rq(&p->dl) && tsk_nr_cpus_allowed(p) > 1)
+	if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
 		enqueue_pushable_dl_task(rq, p);
 }
 
@@ -1279,7 +1279,7 @@ static int find_later_rq(struct task_struct *task)
 	if (unlikely(!later_mask))
 		return -1;
 
-	if (tsk_nr_cpus_allowed(task) == 1)
+	if (task->nr_cpus_allowed == 1)
 		return -1;
 
 	/*
@@ -1424,7 +1424,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
 
 	BUG_ON(rq->cpu != task_cpu(p));
 	BUG_ON(task_current(rq, p));
-	BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
+	BUG_ON(p->nr_cpus_allowed <= 1);
 
 	BUG_ON(!task_on_rq_queued(p));
 	BUG_ON(!dl_task(p));
@@ -1463,7 +1463,7 @@ retry:
 	 */
 	if (dl_task(rq->curr) &&
 	    dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
-	    tsk_nr_cpus_allowed(rq->curr) > 1) {
+	    rq->curr->nr_cpus_allowed > 1) {
 		resched_curr(rq);
 		return 0;
 	}
@@ -1610,9 +1610,9 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
 {
 	if (!task_running(rq, p) &&
 	    !test_tsk_need_resched(rq->curr) &&
-	    tsk_nr_cpus_allowed(p) > 1 &&
+	    p->nr_cpus_allowed > 1 &&
 	    dl_task(rq->curr) &&
-	    (tsk_nr_cpus_allowed(rq->curr) < 2 ||
+	    (rq->curr->nr_cpus_allowed < 2 ||
 	     !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
 		push_dl_tasks(rq);
 	}
@@ -1726,7 +1726,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
 
 	if (rq->curr != p) {
 #ifdef CONFIG_SMP
-		if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
+		if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
 			queue_push_tasks(rq);
 #endif
 		if (dl_task(rq->curr))
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index cbd356f63883..9f3e40226dec 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -335,7 +335,7 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 
 	rt_rq->rt_nr_total++;
-	if (tsk_nr_cpus_allowed(p) > 1)
+	if (p->nr_cpus_allowed > 1)
 		rt_rq->rt_nr_migratory++;
 
 	update_rt_migration(rt_rq);
@@ -352,7 +352,7 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 
 	rt_rq->rt_nr_total--;
-	if (tsk_nr_cpus_allowed(p) > 1)
+	if (p->nr_cpus_allowed > 1)
 		rt_rq->rt_nr_migratory--;
 
 	update_rt_migration(rt_rq);
@@ -1324,7 +1324,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 
 	enqueue_rt_entity(rt_se, flags);
 
-	if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
+	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
 		enqueue_pushable_task(rq, p);
 }
 
@@ -1413,7 +1413,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
 	 * will have to sort it out.
 	 */
 	if (curr && unlikely(rt_task(curr)) &&
-	    (tsk_nr_cpus_allowed(curr) < 2 ||
+	    (curr->nr_cpus_allowed < 2 ||
 	     curr->prio <= p->prio)) {
 		int target = find_lowest_rq(p);
 
@@ -1437,7 +1437,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 	 * Current can't be migrated, useless to reschedule,
 	 * let's hope p can move out.
 	 */
-	if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
+	if (rq->curr->nr_cpus_allowed == 1 ||
 	    !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
 		return;
 
@@ -1445,7 +1445,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 	 * p is migratable, so let's not schedule it and
 	 * see if it is pushed or pulled somewhere else.
 	 */
-	if (tsk_nr_cpus_allowed(p) != 1
+	if (p->nr_cpus_allowed != 1
 	    && cpupri_find(&rq->rd->cpupri, p, NULL))
 		return;
 
@@ -1579,7 +1579,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 	 * The previous task needs to be made eligible for pushing
 	 * if it is still active
 	 */
-	if (on_rt_rq(&p->rt) && tsk_nr_cpus_allowed(p) > 1)
+	if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
 		enqueue_pushable_task(rq, p);
 }
 
@@ -1629,7 +1629,7 @@ static int find_lowest_rq(struct task_struct *task)
 	if (unlikely(!lowest_mask))
 		return -1;
 
-	if (tsk_nr_cpus_allowed(task) == 1)
+	if (task->nr_cpus_allowed == 1)
 		return -1; /* No other targets possible */
 
 	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
@@ -1761,7 +1761,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
 
 	BUG_ON(rq->cpu != task_cpu(p));
 	BUG_ON(task_current(rq, p));
-	BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
+	BUG_ON(p->nr_cpus_allowed <= 1);
 
 	BUG_ON(!task_on_rq_queued(p));
 	BUG_ON(!rt_task(p));
@@ -2121,9 +2121,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
 {
 	if (!task_running(rq, p) &&
 	    !test_tsk_need_resched(rq->curr) &&
-	    tsk_nr_cpus_allowed(p) > 1 &&
+	    p->nr_cpus_allowed > 1 &&
 	    (dl_task(rq->curr) || rt_task(rq->curr)) &&
-	    (tsk_nr_cpus_allowed(rq->curr) < 2 ||
+	    (rq->curr->nr_cpus_allowed < 2 ||
 	     rq->curr->prio <= p->prio))
 		push_rt_tasks(rq);
 }
@@ -2196,7 +2196,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
 	 */
 	if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
-		if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded)
+		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
 			queue_push_tasks(rq);
 #endif /* CONFIG_SMP */
 		if (p->prio < rq->curr->prio)
-- 
cgit v1.2.3


From f9411ebe3d85cbbea06298241e6053d031d281fc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 6 Feb 2017 09:50:49 +0100
Subject: rcu: Separate the RCU synchronization types and APIs into
 <linux/rcupdate_wait.h>

So rcupdate.h is a pretty complex header, in particular it includes
<linux/completion.h> which includes <linux/wait.h> - creating a
dependency that includes <linux/wait.h> in <linux/sched.h>,
which prevents the isolation of <linux/sched.h> from the derived
<linux/wait.h> header.

Solve part of the problem by decoupling rcupdate.h from completions:
this can be done by separating out the rcu_synchronize types and APIs,
and updating their usage sites.

Since this is a mostly RCU-internal types this will not just simplify
<linux/sched.h>'s dependencies, but will make all the hundreds of
.c files that include rcupdate.h but not completions or wait.h build
faster.

( For rcutiny this means that two dependent APIs have to be uninlined,
  but that shouldn't be much of a problem as they are rare variants. )

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 fs/autofs4/autofs_i.h         |  1 +
 include/linux/dcache.h        |  1 +
 include/linux/rcupdate.h      | 40 ----------------------------------
 include/linux/rcupdate_wait.h | 50 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/rcutiny.h       | 11 ++--------
 kernel/rcu/srcu.c             |  2 +-
 kernel/rcu/tiny.c             | 14 +++++++++++-
 kernel/rcu/tree.c             |  2 +-
 kernel/rcu/update.c           |  1 +
 kernel/sched/core.c           |  1 +
 10 files changed, 71 insertions(+), 52 deletions(-)
 create mode 100644 include/linux/rcupdate_wait.h

(limited to 'include/linux')

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index c885daae68c8..beef981aa54f 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -14,6 +14,7 @@
 #include <linux/mutex.h>
 #include <linux/spinlock.h>
 #include <linux/list.h>
+#include <linux/completion.h>
 
 /* This is the range of ioctl() numbers we claim as ours */
 #define AUTOFS_IOC_FIRST     AUTOFS_IOC_READY
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 591b6c16f9c1..d2e38dc6172c 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -11,6 +11,7 @@
 #include <linux/rcupdate.h>
 #include <linux/lockref.h>
 #include <linux/stringhash.h>
+#include <linux/wait.h>
 
 struct path;
 struct vfsmount;
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 6ade6a52d9d4..de88b33c0974 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -40,7 +40,6 @@
 #include <linux/cpumask.h>
 #include <linux/seqlock.h>
 #include <linux/lockdep.h>
-#include <linux/completion.h>
 #include <linux/debugobjects.h>
 #include <linux/bug.h>
 #include <linux/compiler.h>
@@ -226,45 +225,6 @@ void call_rcu_sched(struct rcu_head *head,
 
 void synchronize_sched(void);
 
-/*
- * Structure allowing asynchronous waiting on RCU.
- */
-struct rcu_synchronize {
-	struct rcu_head head;
-	struct completion completion;
-};
-void wakeme_after_rcu(struct rcu_head *head);
-
-void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
-		   struct rcu_synchronize *rs_array);
-
-#define _wait_rcu_gp(checktiny, ...) \
-do {									\
-	call_rcu_func_t __crcu_array[] = { __VA_ARGS__ };		\
-	struct rcu_synchronize __rs_array[ARRAY_SIZE(__crcu_array)];	\
-	__wait_rcu_gp(checktiny, ARRAY_SIZE(__crcu_array),		\
-			__crcu_array, __rs_array);			\
-} while (0)
-
-#define wait_rcu_gp(...) _wait_rcu_gp(false, __VA_ARGS__)
-
-/**
- * synchronize_rcu_mult - Wait concurrently for multiple grace periods
- * @...: List of call_rcu() functions for the flavors to wait on.
- *
- * This macro waits concurrently for multiple flavors of RCU grace periods.
- * For example, synchronize_rcu_mult(call_rcu, call_rcu_bh) would wait
- * on concurrent RCU and RCU-bh grace periods.  Waiting on a give SRCU
- * domain requires you to write a wrapper function for that SRCU domain's
- * call_srcu() function, supplying the corresponding srcu_struct.
- *
- * If Tiny RCU, tell _wait_rcu_gp() not to bother waiting for RCU
- * or RCU-bh, given that anywhere synchronize_rcu_mult() can be called
- * is automatically a grace period.
- */
-#define synchronize_rcu_mult(...) \
-	_wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), __VA_ARGS__)
-
 /**
  * call_rcu_tasks() - Queue an RCU for invocation task-based grace period
  * @head: structure to be used for queueing the RCU updates.
diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h
new file mode 100644
index 000000000000..e774b4f5f220
--- /dev/null
+++ b/include/linux/rcupdate_wait.h
@@ -0,0 +1,50 @@
+#ifndef _LINUX_SCHED_RCUPDATE_WAIT_H
+#define _LINUX_SCHED_RCUPDATE_WAIT_H
+
+/*
+ * RCU synchronization types and methods:
+ */
+
+#include <linux/rcupdate.h>
+#include <linux/completion.h>
+
+/*
+ * Structure allowing asynchronous waiting on RCU.
+ */
+struct rcu_synchronize {
+	struct rcu_head head;
+	struct completion completion;
+};
+void wakeme_after_rcu(struct rcu_head *head);
+
+void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
+		   struct rcu_synchronize *rs_array);
+
+#define _wait_rcu_gp(checktiny, ...) \
+do {									\
+	call_rcu_func_t __crcu_array[] = { __VA_ARGS__ };		\
+	struct rcu_synchronize __rs_array[ARRAY_SIZE(__crcu_array)];	\
+	__wait_rcu_gp(checktiny, ARRAY_SIZE(__crcu_array),		\
+			__crcu_array, __rs_array);			\
+} while (0)
+
+#define wait_rcu_gp(...) _wait_rcu_gp(false, __VA_ARGS__)
+
+/**
+ * synchronize_rcu_mult - Wait concurrently for multiple grace periods
+ * @...: List of call_rcu() functions for the flavors to wait on.
+ *
+ * This macro waits concurrently for multiple flavors of RCU grace periods.
+ * For example, synchronize_rcu_mult(call_rcu, call_rcu_bh) would wait
+ * on concurrent RCU and RCU-bh grace periods.  Waiting on a give SRCU
+ * domain requires you to write a wrapper function for that SRCU domain's
+ * call_srcu() function, supplying the corresponding srcu_struct.
+ *
+ * If Tiny RCU, tell _wait_rcu_gp() not to bother waiting for RCU
+ * or RCU-bh, given that anywhere synchronize_rcu_mult() can be called
+ * is automatically a grace period.
+ */
+#define synchronize_rcu_mult(...) \
+	_wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), __VA_ARGS__)
+
+#endif /* _LINUX_SCHED_RCUPDATE_WAIT_H */
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 4f9b2fa2173d..b452953e21c8 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -53,15 +53,8 @@ static inline void cond_synchronize_sched(unsigned long oldstate)
 	might_sleep();
 }
 
-static inline void rcu_barrier_bh(void)
-{
-	wait_rcu_gp(call_rcu_bh);
-}
-
-static inline void rcu_barrier_sched(void)
-{
-	wait_rcu_gp(call_rcu_sched);
-}
+extern void rcu_barrier_bh(void);
+extern void rcu_barrier_sched(void);
 
 static inline void synchronize_rcu_expedited(void)
 {
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index e773129c8b08..ef3bcfb15b39 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -30,7 +30,7 @@
 #include <linux/mutex.h>
 #include <linux/percpu.h>
 #include <linux/preempt.h>
-#include <linux/rcupdate.h>
+#include <linux/rcupdate_wait.h>
 #include <linux/sched.h>
 #include <linux/smp.h>
 #include <linux/delay.h>
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index fa6a48d3917b..6ad330dbbae2 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -25,7 +25,7 @@
 #include <linux/completion.h>
 #include <linux/interrupt.h>
 #include <linux/notifier.h>
-#include <linux/rcupdate.h>
+#include <linux/rcupdate_wait.h>
 #include <linux/kernel.h>
 #include <linux/export.h>
 #include <linux/mutex.h>
@@ -47,6 +47,18 @@ static void __call_rcu(struct rcu_head *head,
 
 #include "tiny_plugin.h"
 
+void rcu_barrier_bh(void)
+{
+	wait_rcu_gp(call_rcu_bh);
+}
+EXPORT_SYMBOL(rcu_barrier_bh);
+
+void rcu_barrier_sched(void)
+{
+	wait_rcu_gp(call_rcu_sched);
+}
+EXPORT_SYMBOL(rcu_barrier_sched);
+
 #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
 
 /*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d80e0d2f68c6..cb62ce23ffc7 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -32,7 +32,7 @@
 #include <linux/init.h>
 #include <linux/spinlock.h>
 #include <linux/smp.h>
-#include <linux/rcupdate.h>
+#include <linux/rcupdate_wait.h>
 #include <linux/interrupt.h>
 #include <linux/sched.h>
 #include <linux/nmi.h>
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 9e03db9ea9c0..a0e90e0afc75 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -49,6 +49,7 @@
 #include <linux/moduleparam.h>
 #include <linux/kthread.h>
 #include <linux/tick.h>
+#include <linux/rcupdate_wait.h>
 
 #define CREATE_TRACE_POINTS
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d51ec65dc73..1bd317db9810 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10,6 +10,7 @@
 #include <linux/delayacct.h>
 #include <linux/init_task.h>
 #include <linux/context_tracking.h>
+#include <linux/rcupdate_wait.h>
 
 #include <linux/blkdev.h>
 #include <linux/kprobes.h>
-- 
cgit v1.2.3


From 780de9dd2720debc14c501dab4dc80d1f75ad50e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 2 Feb 2017 11:50:56 +0100
Subject: sched/headers, cgroups: Remove the threadgroup_change_*() wrappery

threadgroup_change_begin()/end() is a pointless wrapper around
cgroup_threadgroup_change_begin()/end(), minus a might_sleep()
in the !CONFIG_CGROUPS=y case.

Remove the wrappery, move the might_sleep() (the down_read()
already has a might_sleep() check).

This debloats <linux/sched.h> a bit and simplifies this API.

Update all call sites.

No change in functionality.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 fs/exec.c                   |  6 +++---
 include/linux/cgroup-defs.h | 13 ++++++++-----
 include/linux/sched.h       | 28 ----------------------------
 kernel/cgroup/pids.c        |  2 +-
 kernel/fork.c               |  6 +++---
 kernel/signal.c             |  6 +++---
 6 files changed, 18 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index 698a86094f76..e595e1529581 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1088,7 +1088,7 @@ static int de_thread(struct task_struct *tsk)
 		struct task_struct *leader = tsk->group_leader;
 
 		for (;;) {
-			threadgroup_change_begin(tsk);
+			cgroup_threadgroup_change_begin(tsk);
 			write_lock_irq(&tasklist_lock);
 			/*
 			 * Do this under tasklist_lock to ensure that
@@ -1099,7 +1099,7 @@ static int de_thread(struct task_struct *tsk)
 				break;
 			__set_current_state(TASK_KILLABLE);
 			write_unlock_irq(&tasklist_lock);
-			threadgroup_change_end(tsk);
+			cgroup_threadgroup_change_end(tsk);
 			schedule();
 			if (unlikely(__fatal_signal_pending(tsk)))
 				goto killed;
@@ -1157,7 +1157,7 @@ static int de_thread(struct task_struct *tsk)
 		if (unlikely(leader->ptrace))
 			__wake_up_parent(leader, leader->parent);
 		write_unlock_irq(&tasklist_lock);
-		threadgroup_change_end(tsk);
+		cgroup_threadgroup_change_end(tsk);
 
 		release_task(leader);
 	}
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 3c02404cfce9..6a3f850cabab 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -531,8 +531,8 @@ extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
  * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups
  * @tsk: target task
  *
- * Called from threadgroup_change_begin() and allows cgroup operations to
- * synchronize against threadgroup changes using a percpu_rw_semaphore.
+ * Allows cgroup operations to synchronize against threadgroup changes
+ * using a percpu_rw_semaphore.
  */
 static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
 {
@@ -543,8 +543,7 @@ static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
  * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups
  * @tsk: target task
  *
- * Called from threadgroup_change_end().  Counterpart of
- * cgroup_threadcgroup_change_begin().
+ * Counterpart of cgroup_threadcgroup_change_begin().
  */
 static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
 {
@@ -555,7 +554,11 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
 
 #define CGROUP_SUBSYS_COUNT 0
 
-static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {}
+static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
+{
+	might_sleep();
+}
+
 static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {}
 
 #endif	/* CONFIG_CGROUPS */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e732881517f2..3f61baac928b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3162,34 +3162,6 @@ static inline void unlock_task_sighand(struct task_struct *tsk,
 	spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
 }
 
-/**
- * threadgroup_change_begin - mark the beginning of changes to a threadgroup
- * @tsk: task causing the changes
- *
- * All operations which modify a threadgroup - a new thread joining the
- * group, death of a member thread (the assertion of PF_EXITING) and
- * exec(2) dethreading the process and replacing the leader - are wrapped
- * by threadgroup_change_{begin|end}().  This is to provide a place which
- * subsystems needing threadgroup stability can hook into for
- * synchronization.
- */
-static inline void threadgroup_change_begin(struct task_struct *tsk)
-{
-	might_sleep();
-	cgroup_threadgroup_change_begin(tsk);
-}
-
-/**
- * threadgroup_change_end - mark the end of changes to a threadgroup
- * @tsk: task causing the changes
- *
- * See threadgroup_change_begin().
- */
-static inline void threadgroup_change_end(struct task_struct *tsk)
-{
-	cgroup_threadgroup_change_end(tsk);
-}
-
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 
 static inline struct thread_info *task_thread_info(struct task_struct *task)
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 2bd673783f1a..e756dae49300 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -214,7 +214,7 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
 
 /*
  * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
- * on threadgroup_change_begin() held by the copy_process().
+ * on cgroup_threadgroup_change_begin() held by the copy_process().
  */
 static int pids_can_fork(struct task_struct *task)
 {
diff --git a/kernel/fork.c b/kernel/fork.c
index 246bf9aaf9df..d043fedc03c8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1746,7 +1746,7 @@ static __latent_entropy struct task_struct *copy_process(
 	INIT_LIST_HEAD(&p->thread_group);
 	p->task_works = NULL;
 
-	threadgroup_change_begin(current);
+	cgroup_threadgroup_change_begin(current);
 	/*
 	 * Ensure that the cgroup subsystem policies allow the new process to be
 	 * forked. It should be noted the the new process's css_set can be changed
@@ -1843,7 +1843,7 @@ static __latent_entropy struct task_struct *copy_process(
 
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
-	threadgroup_change_end(current);
+	cgroup_threadgroup_change_end(current);
 	perf_event_fork(p);
 
 	trace_task_newtask(p, clone_flags);
@@ -1854,7 +1854,7 @@ static __latent_entropy struct task_struct *copy_process(
 bad_fork_cancel_cgroup:
 	cgroup_cancel_fork(p);
 bad_fork_free_pid:
-	threadgroup_change_end(current);
+	cgroup_threadgroup_change_end(current);
 	if (pid != &init_struct_pid)
 		free_pid(pid);
 bad_fork_cleanup_thread:
diff --git a/kernel/signal.c b/kernel/signal.c
index 214a8feeb771..bae358532d0a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2395,11 +2395,11 @@ void exit_signals(struct task_struct *tsk)
 	 * @tsk is about to have PF_EXITING set - lock out users which
 	 * expect stable threadgroup.
 	 */
-	threadgroup_change_begin(tsk);
+	cgroup_threadgroup_change_begin(tsk);
 
 	if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
 		tsk->flags |= PF_EXITING;
-		threadgroup_change_end(tsk);
+		cgroup_threadgroup_change_end(tsk);
 		return;
 	}
 
@@ -2410,7 +2410,7 @@ void exit_signals(struct task_struct *tsk)
 	 */
 	tsk->flags |= PF_EXITING;
 
-	threadgroup_change_end(tsk);
+	cgroup_threadgroup_change_end(tsk);
 
 	if (!signal_pending(tsk))
 		goto out;
-- 
cgit v1.2.3


From 314ff7851fc8ea66cbf48eaa93d8ebfb5ca084a9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 11:03:31 +0100
Subject: mm/vmacache, sched/headers: Introduce 'struct vmacache' and move it
 from <linux/sched.h> to <linux/mm_types>

The <linux/sched.h> header includes various vmacache related defines,
which are arguably misplaced.

Move them to mm_types.h and minimize the sched.h impact by putting
all task vmacache state into a new 'struct vmacache' structure.

No change in functionality.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/mm_types.h  | 12 ++++++++++++
 include/linux/sched.h     | 11 ++++-------
 include/linux/vmacache.h  |  2 +-
 kernel/debug/debug_core.c |  4 ++--
 mm/nommu.c                |  2 +-
 mm/vmacache.c             | 10 +++++-----
 6 files changed, 25 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 4f6d440ad785..137797cd7b50 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -360,6 +360,18 @@ struct vm_area_struct {
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
 };
 
+/*
+ * The per task VMA cache array:
+ */
+#define VMACACHE_BITS 2
+#define VMACACHE_SIZE (1U << VMACACHE_BITS)
+#define VMACACHE_MASK (VMACACHE_SIZE - 1)
+
+struct vmacache {
+	u32 seqnum;
+	struct vm_area_struct *vmas[VMACACHE_SIZE];
+};
+
 struct core_thread {
 	struct task_struct *task;
 	struct core_thread *next;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3f61baac928b..e87c97e1a947 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -134,10 +134,6 @@ struct blk_plug;
 struct filename;
 struct nameidata;
 
-#define VMACACHE_BITS 2
-#define VMACACHE_SIZE (1U << VMACACHE_BITS)
-#define VMACACHE_MASK (VMACACHE_SIZE - 1)
-
 /*
  * These are the constant used to fake the fixed-point load-average
  * counting. Some notes:
@@ -1550,9 +1546,10 @@ struct task_struct {
 #endif
 
 	struct mm_struct *mm, *active_mm;
-	/* per-thread vma caching */
-	u32 vmacache_seqnum;
-	struct vm_area_struct *vmacache[VMACACHE_SIZE];
+
+	/* Per-thread vma caching: */
+	struct vmacache vmacache;
+
 #if defined(SPLIT_RSS_COUNTING)
 	struct task_rss_stat	rss_stat;
 #endif
diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h
index c3fa0fd43949..1081db987391 100644
--- a/include/linux/vmacache.h
+++ b/include/linux/vmacache.h
@@ -12,7 +12,7 @@
 
 static inline void vmacache_flush(struct task_struct *tsk)
 {
-	memset(tsk->vmacache, 0, sizeof(tsk->vmacache));
+	memset(tsk->vmacache.vmas, 0, sizeof(tsk->vmacache.vmas));
 }
 
 extern void vmacache_flush_all(struct mm_struct *mm);
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 79517e5549f1..a603ef28f70c 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -232,9 +232,9 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
 		int i;
 
 		for (i = 0; i < VMACACHE_SIZE; i++) {
-			if (!current->vmacache[i])
+			if (!current->vmacache.vmas[i])
 				continue;
-			flush_cache_range(current->vmacache[i],
+			flush_cache_range(current->vmacache.vmas[i],
 					  addr, addr + BREAK_INSTR_SIZE);
 		}
 	}
diff --git a/mm/nommu.c b/mm/nommu.c
index fe9f4fa4a7a7..aae06e854552 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -757,7 +757,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
 	mm->map_count--;
 	for (i = 0; i < VMACACHE_SIZE; i++) {
 		/* if the vma is cached, invalidate the entire cache */
-		if (curr->vmacache[i] == vma) {
+		if (curr->vmacache.vmas[i] == vma) {
 			vmacache_invalidate(mm);
 			break;
 		}
diff --git a/mm/vmacache.c b/mm/vmacache.c
index 035fdeb35b43..7c233f8e20ee 100644
--- a/mm/vmacache.c
+++ b/mm/vmacache.c
@@ -60,7 +60,7 @@ static inline bool vmacache_valid_mm(struct mm_struct *mm)
 void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)
 {
 	if (vmacache_valid_mm(newvma->vm_mm))
-		current->vmacache[VMACACHE_HASH(addr)] = newvma;
+		current->vmacache.vmas[VMACACHE_HASH(addr)] = newvma;
 }
 
 static bool vmacache_valid(struct mm_struct *mm)
@@ -71,12 +71,12 @@ static bool vmacache_valid(struct mm_struct *mm)
 		return false;
 
 	curr = current;
-	if (mm->vmacache_seqnum != curr->vmacache_seqnum) {
+	if (mm->vmacache_seqnum != curr->vmacache.seqnum) {
 		/*
 		 * First attempt will always be invalid, initialize
 		 * the new cache for this task here.
 		 */
-		curr->vmacache_seqnum = mm->vmacache_seqnum;
+		curr->vmacache.seqnum = mm->vmacache_seqnum;
 		vmacache_flush(curr);
 		return false;
 	}
@@ -93,7 +93,7 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
 		return NULL;
 
 	for (i = 0; i < VMACACHE_SIZE; i++) {
-		struct vm_area_struct *vma = current->vmacache[i];
+		struct vm_area_struct *vma = current->vmacache.vmas[i];
 
 		if (!vma)
 			continue;
@@ -121,7 +121,7 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
 		return NULL;
 
 	for (i = 0; i < VMACACHE_SIZE; i++) {
-		struct vm_area_struct *vma = current->vmacache[i];
+		struct vm_area_struct *vma = current->vmacache.vmas[i];
 
 		if (vma && vma->vm_start == start && vma->vm_end == end) {
 			count_vm_vmacache_event(VMACACHE_FIND_HITS);
-- 
cgit v1.2.3


From af8601ad420f6afa6445c927ad9f36d9700d96d6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 09:57:00 +0100
Subject: kasan, sched/headers: Uninline kasan_enable/disable_current()

<linux/kasan.h> is a low level header that is included early
in affected kernel headers. But it includes <linux/sched.h>
which complicates the cleanup of sched.h dependencies.

But kasan.h has almost no need for sched.h: its only use of
scheduler functionality is in two inline functions which are
not used very frequently - so uninline kasan_enable_current()
and kasan_disable_current().

Also add a <linux/sched.h> dependency to a .c file that depended
on kasan.h including it.

This paves the way to remove the <linux/sched.h> include from kasan.h.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/kasan.h | 10 ++--------
 lib/sbitmap.c         |  1 +
 mm/kasan/kasan.c      | 10 ++++++++++
 3 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index c908b25bf5a5..7793036eac80 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -30,16 +30,10 @@ static inline void *kasan_mem_to_shadow(const void *addr)
 }
 
 /* Enable reporting bugs after kasan_disable_current() */
-static inline void kasan_enable_current(void)
-{
-	current->kasan_depth++;
-}
+extern void kasan_enable_current(void);
 
 /* Disable reporting bugs for current task */
-static inline void kasan_disable_current(void)
-{
-	current->kasan_depth--;
-}
+extern void kasan_disable_current(void);
 
 void kasan_unpoison_shadow(const void *address, size_t size);
 
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 55e11c4b2f3b..60e800e0b5a0 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -15,6 +15,7 @@
  * along with this program.  If not, see <https://www.gnu.org/licenses/>.
  */
 
+#include <linux/sched.h>
 #include <linux/random.h>
 #include <linux/sbitmap.h>
 #include <linux/seq_file.h>
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 25f0e6521f36..d99312ed7c22 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -39,6 +39,16 @@
 #include "kasan.h"
 #include "../slab.h"
 
+void kasan_enable_current(void)
+{
+	current->kasan_depth++;
+}
+
+void kasan_disable_current(void)
+{
+	current->kasan_depth--;
+}
+
 /*
  * Poisons the shadow memory for 'size' bytes starting from 'addr'.
  * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE.
-- 
cgit v1.2.3


From 105ab3d8ce7269887d24d224054677125e18037c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 1 Feb 2017 16:36:40 +0100
Subject: sched/headers: Prepare for new header dependencies before moving code
 to <linux/sched/topology.h>

We are going to split <linux/sched/topology.h> out of <linux/sched.h>, which
will have to be picked up from other headers and a couple of .c files.

Create a trivial placeholder <linux/sched/topology.h> file that just
maps to <linux/sched.h> to make this patch obviously correct and
bisectable.

Include the new header in the files that are going to need it.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arm/kernel/topology.c     | 1 +
 arch/arm64/kernel/topology.c   | 1 +
 arch/powerpc/kernel/smp.c      | 1 +
 arch/s390/kernel/topology.c    | 1 +
 arch/x86/kernel/smpboot.c      | 1 +
 block/blk-mq.c                 | 1 +
 block/blk-softirq.c            | 1 +
 include/linux/cpuset.h         | 1 +
 include/linux/sched/topology.h | 6 ++++++
 kernel/sched/fair.c            | 2 ++
 kernel/sched/sched.h           | 1 +
 11 files changed, 17 insertions(+)
 create mode 100644 include/linux/sched/topology.h

(limited to 'include/linux')

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index ebf47d91b804..f8a3ab82e77f 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -21,6 +21,7 @@
 #include <linux/nodemask.h>
 #include <linux/of.h>
 #include <linux/sched.h>
+#include <linux/sched/topology.h>
 #include <linux/slab.h>
 #include <linux/string.h>
 
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 565dd69888cc..08243533e5ee 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -20,6 +20,7 @@
 #include <linux/nodemask.h>
 #include <linux/of.h>
 #include <linux/sched.h>
+#include <linux/sched/topology.h>
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/cpufreq.h>
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 21fdf02583fe..fce17789c675 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -20,6 +20,7 @@
 #include <linux/kernel.h>
 #include <linux/export.h>
 #include <linux/sched.h>
+#include <linux/sched/topology.h>
 #include <linux/smp.h>
 #include <linux/interrupt.h>
 #include <linux/delay.h>
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 2cd5f4f1013c..17660e800e74 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -13,6 +13,7 @@
 #include <linux/export.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/topology.h>
 #include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/slab.h>
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index a0d38685f7df..fe7a66e6b5a0 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -45,6 +45,7 @@
 #include <linux/smp.h>
 #include <linux/export.h>
 #include <linux/sched.h>
+#include <linux/sched/topology.h>
 #include <linux/percpu.h>
 #include <linux/bootmem.h>
 #include <linux/err.h>
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9e6b064e5339..746c14e0d157 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -20,6 +20,7 @@
 #include <linux/cpu.h>
 #include <linux/cache.h>
 #include <linux/sched/sysctl.h>
+#include <linux/sched/topology.h>
 #include <linux/delay.h>
 #include <linux/crash_dump.h>
 #include <linux/prefetch.h>
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 06cf9807f49a..87b7df4851bf 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -9,6 +9,7 @@
 #include <linux/interrupt.h>
 #include <linux/cpu.h>
 #include <linux/sched.h>
+#include <linux/sched/topology.h>
 
 #include "blk.h"
 
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index bfc204e70338..c608c39cb161 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -9,6 +9,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/topology.h>
 #include <linux/cpumask.h>
 #include <linux/nodemask.h>
 #include <linux/mm.h>
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
new file mode 100644
index 000000000000..4f1159f76f92
--- /dev/null
+++ b/include/linux/sched/topology.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SCHED_TOPOLOGY_H
+#define _LINUX_SCHED_TOPOLOGY_H
+
+#include <linux/sched.h>
+
+#endif /* _LINUX_SCHED_TOPOLOGY_H */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3b60d73ab290..11e0ab57748a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -21,6 +21,8 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/topology.h>
+
 #include <linux/latencytop.h>
 #include <linux/cpumask.h>
 #include <linux/cpuidle.h>
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 71b10a9b73cf..9b9cb260d9cf 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,6 +1,7 @@
 
 #include <linux/sched.h>
 #include <linux/sched/sysctl.h>
+#include <linux/sched/topology.h>
 #include <linux/sched/rt.h>
 #include <linux/u64_stats_sync.h>
 #include <linux/sched/deadline.h>
-- 
cgit v1.2.3


From 4c822698cba8bdd93724117eded12bf34eb80252 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 1 Feb 2017 16:36:40 +0100
Subject: sched/headers: Prepare for new header dependencies before moving code
 to <linux/sched/idle.h>

We are going to split  <linux/sched/idle.h> out of <linux/sched.h>, which
will have to be picked up from other headers and a couple of .c files.

Create a trivial placeholder <linux/sched/idle.h> file that just
maps to <linux/sched.h> to make this patch obviously correct and
bisectable.

Include the new header in the files that are going to need it.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/mwait.h   | 1 +
 arch/x86/kernel/process.c      | 1 +
 drivers/cpuidle/driver.c       | 1 +
 include/linux/sched/idle.h     | 6 ++++++
 include/linux/sched/topology.h | 2 ++
 kernel/sched/idle.c            | 1 +
 kernel/smp.c                   | 1 +
 7 files changed, 13 insertions(+)
 create mode 100644 include/linux/sched/idle.h

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index f37f2d8a2989..bda3c27f0da0 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -2,6 +2,7 @@
 #define _ASM_X86_MWAIT_H
 
 #include <linux/sched.h>
+#include <linux/sched/idle.h>
 
 #include <asm/cpufeature.h>
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 7780efa635b9..6395e0cd7dd6 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -7,6 +7,7 @@
 #include <linux/prctl.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/sched/idle.h>
 #include <linux/init.h>
 #include <linux/export.h>
 #include <linux/pm.h>
diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c
index ab264d393233..e53fb861beb0 100644
--- a/drivers/cpuidle/driver.c
+++ b/drivers/cpuidle/driver.c
@@ -11,6 +11,7 @@
 #include <linux/mutex.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/sched/idle.h>
 #include <linux/cpuidle.h>
 #include <linux/cpumask.h>
 #include <linux/tick.h>
diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h
new file mode 100644
index 000000000000..a3cf79b86986
--- /dev/null
+++ b/include/linux/sched/idle.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SCHED_IDLE_H
+#define _LINUX_SCHED_IDLE_H
+
+#include <linux/sched.h>
+
+#endif /* _LINUX_SCHED_IDLE_H */
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 4f1159f76f92..184b56b8aa64 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -3,4 +3,6 @@
 
 #include <linux/sched.h>
 
+#include <linux/sched/idle.h>
+
 #endif /* _LINUX_SCHED_TOPOLOGY_H */
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 6a4bae0a649d..ac6d5176463d 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -2,6 +2,7 @@
  * Generic entry point for the idle threads
  */
 #include <linux/sched.h>
+#include <linux/sched/idle.h>
 #include <linux/cpu.h>
 #include <linux/cpuidle.h>
 #include <linux/cpuhotplug.h>
diff --git a/kernel/smp.c b/kernel/smp.c
index 77fcdb9f2775..a817769b53c0 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -17,6 +17,7 @@
 #include <linux/smp.h>
 #include <linux/cpu.h>
 #include <linux/sched.h>
+#include <linux/sched/idle.h>
 #include <linux/hypervisor.h>
 
 #include "smpboot.h"
-- 
cgit v1.2.3


From 84f001e15737f8214b0f5f0f7dfec0fb1027938f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 1 Feb 2017 16:36:40 +0100
Subject: sched/headers: Prepare for new header dependencies before moving code
 to <linux/sched/wake_q.h>

We are going to split <linux/sched/wake_q.h> out of <linux/sched.h>, which
will have to be picked up from other headers and a couple of .c files.

Create a trivial placeholder <linux/sched/wake_q.h> file that just
maps to <linux/sched.h> to make this patch obviously correct and
bisectable.

Include the new header in the files that are going to need it.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/wake_q.h    | 6 ++++++
 ipc/mqueue.c                    | 1 +
 ipc/msg.c                       | 1 +
 ipc/sem.c                       | 1 +
 kernel/futex.c                  | 1 +
 kernel/locking/mutex.c          | 1 +
 kernel/locking/rtmutex.c        | 1 +
 kernel/locking/rtmutex_common.h | 1 +
 kernel/locking/rwsem-xadd.c     | 3 ++-
 kernel/sched/sched.h            | 1 +
 10 files changed, 16 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/sched/wake_q.h

(limited to 'include/linux')

diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h
new file mode 100644
index 000000000000..6383b35e4eba
--- /dev/null
+++ b/include/linux/sched/wake_q.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SCHED_WAKE_Q_H
+#define _LINUX_SCHED_WAKE_Q_H
+
+#include <linux/sched.h>
+
+#endif /* _LINUX_SCHED_WAKE_Q_H */
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 4fdd97031431..40e448de8a7e 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -35,6 +35,7 @@
 #include <linux/ipc_namespace.h>
 #include <linux/user_namespace.h>
 #include <linux/slab.h>
+#include <linux/sched/wake_q.h>
 
 #include <net/sock.h>
 #include "util.h"
diff --git a/ipc/msg.c b/ipc/msg.c
index e3e52ce01123..ecc387e573f6 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -31,6 +31,7 @@
 #include <linux/list.h>
 #include <linux/security.h>
 #include <linux/sched.h>
+#include <linux/sched/wake_q.h>
 #include <linux/syscalls.h>
 #include <linux/audit.h>
 #include <linux/seq_file.h>
diff --git a/ipc/sem.c b/ipc/sem.c
index e468cd1c12f0..947dc2348271 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -82,6 +82,7 @@
 #include <linux/rwsem.h>
 #include <linux/nsproxy.h>
 #include <linux/ipc_namespace.h>
+#include <linux/sched/wake_q.h>
 
 #include <linux/uaccess.h>
 #include "util.h"
diff --git a/kernel/futex.c b/kernel/futex.c
index b687cb22301c..8ddcf9ea953c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -61,6 +61,7 @@
 #include <linux/nsproxy.h>
 #include <linux/ptrace.h>
 #include <linux/sched/rt.h>
+#include <linux/sched/wake_q.h>
 #include <linux/hugetlb.h>
 #include <linux/freezer.h>
 #include <linux/bootmem.h>
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index ad2d9e22697b..57f6311e2405 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -21,6 +21,7 @@
 #include <linux/ww_mutex.h>
 #include <linux/sched.h>
 #include <linux/sched/rt.h>
+#include <linux/sched/wake_q.h>
 #include <linux/export.h>
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index d340be3a488f..d4f798491361 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -15,6 +15,7 @@
 #include <linux/sched.h>
 #include <linux/sched/rt.h>
 #include <linux/sched/deadline.h>
+#include <linux/sched/wake_q.h>
 #include <linux/timer.h>
 
 #include "rtmutex_common.h"
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 990134617b4c..856dfff5c33a 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -13,6 +13,7 @@
 #define __KERNEL_RTMUTEX_COMMON_H
 
 #include <linux/rtmutex.h>
+#include <linux/sched/wake_q.h>
 
 /*
  * This is the control structure for tasks blocked on a rt_mutex,
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 2ad8d8dc3bb1..4fe8d8ad4396 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -10,10 +10,11 @@
  * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
  */
 #include <linux/rwsem.h>
-#include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/export.h>
+#include <linux/sched.h>
 #include <linux/sched/rt.h>
+#include <linux/sched/wake_q.h>
 #include <linux/osq_lock.h>
 
 #include "rwsem.h"
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9b9cb260d9cf..683570739f46 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3,6 +3,7 @@
 #include <linux/sched/sysctl.h>
 #include <linux/sched/topology.h>
 #include <linux/sched/rt.h>
+#include <linux/sched/wake_q.h>
 #include <linux/u64_stats_sync.h>
 #include <linux/sched/deadline.h>
 #include <linux/kernel_stat.h>
-- 
cgit v1.2.3


From e601757102cfd3eeae068f53b3bc1234f3a2b2e9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 1 Feb 2017 16:36:40 +0100
Subject: sched/headers: Prepare for new header dependencies before moving code
 to <linux/sched/clock.h>

We are going to split <linux/sched/clock.h> out of <linux/sched.h>, which
will have to be picked up from other headers and .c files.

Create a trivial placeholder <linux/sched/clock.h> file that just
maps to <linux/sched.h> to make this patch obviously correct and
bisectable.

Include the new header in the files that are going to need it.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arm/mach-bcm/platsmp.c             | 1 +
 arch/arm/mach-omap2/pm-debug.c          | 1 +
 arch/arm/probes/kprobes/test-core.c     | 1 +
 arch/cris/kernel/time.c                 | 2 +-
 arch/ia64/kernel/setup.c                | 1 +
 arch/microblaze/kernel/timer.c          | 1 +
 arch/mn10300/kernel/time.c              | 1 +
 arch/parisc/kernel/setup.c              | 1 +
 arch/parisc/kernel/time.c               | 1 +
 arch/powerpc/kernel/time.c              | 1 +
 arch/s390/kernel/time.c                 | 1 +
 arch/sparc/kernel/ds.c                  | 1 +
 arch/sparc/kernel/viohs.c               | 1 +
 arch/tile/kernel/time.c                 | 1 +
 arch/x86/events/amd/ibs.c               | 1 +
 arch/x86/events/core.c                  | 1 +
 arch/x86/kernel/cpu/amd.c               | 1 +
 arch/x86/kernel/cpu/centaur.c           | 1 +
 arch/x86/kernel/cpu/common.c            | 1 +
 arch/x86/kernel/cpu/cyrix.c             | 1 +
 arch/x86/kernel/cpu/intel.c             | 1 +
 arch/x86/kernel/cpu/transmeta.c         | 1 +
 arch/x86/kernel/kvmclock.c              | 1 +
 arch/x86/kernel/nmi.c                   | 1 +
 arch/x86/kernel/tsc.c                   | 1 +
 block/cfq-iosched.c                     | 1 +
 drivers/acpi/apei/ghes.c                | 1 +
 drivers/clocksource/arm_arch_timer.c    | 1 +
 drivers/clocksource/pxa_timer.c         | 1 +
 drivers/clocksource/timer-digicolor.c   | 1 +
 drivers/cpuidle/cpuidle.c               | 1 +
 drivers/firmware/tegra/bpmp.c           | 1 +
 drivers/gpu/drm/i915/i915_gem_request.c | 2 ++
 drivers/gpu/drm/i915/intel_drv.h        | 1 +
 drivers/md/bcache/bset.c                | 1 +
 drivers/md/bcache/btree.c               | 1 +
 drivers/md/bcache/sysfs.c               | 1 +
 drivers/md/bcache/util.c                | 1 +
 drivers/md/bcache/util.h                | 1 +
 drivers/md/bcache/writeback.c           | 1 +
 drivers/misc/cxl/native.c               | 1 +
 drivers/net/irda/pxaficp_ir.c           | 1 +
 drivers/perf/arm_pmu.c                  | 1 +
 drivers/vhost/net.c                     | 1 +
 include/linux/blkdev.h                  | 1 +
 include/linux/sched/clock.h             | 6 ++++++
 include/linux/skbuff.h                  | 1 +
 include/net/busy_poll.h                 | 1 +
 kernel/events/core.c                    | 1 +
 kernel/locking/lockdep.c                | 1 +
 kernel/locking/qspinlock_stat.h         | 1 +
 kernel/printk/printk.c                  | 1 +
 kernel/sched/clock.c                    | 1 +
 kernel/sched/core.c                     | 1 +
 kernel/sched/sched.h                    | 1 +
 kernel/time/sched_clock.c               | 1 +
 kernel/time/tick-sched.c                | 1 +
 kernel/torture.c                        | 1 +
 kernel/trace/ring_buffer.c              | 1 +
 kernel/trace/trace_clock.c              | 1 +
 kernel/trace/trace_hwlat.c              | 1 +
 kernel/trace/trace_output.c             | 1 +
 kernel/watchdog.c                       | 1 +
 lib/plist.c                             | 1 +
 net/ipv4/tcp_cdg.c                      | 2 ++
 65 files changed, 72 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/sched/clock.h

(limited to 'include/linux')

diff --git a/arch/arm/mach-bcm/platsmp.c b/arch/arm/mach-bcm/platsmp.c
index 582886d0d02f..9e3f275934eb 100644
--- a/arch/arm/mach-bcm/platsmp.c
+++ b/arch/arm/mach-bcm/platsmp.c
@@ -21,6 +21,7 @@
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/smp.h>
 
 #include <asm/cacheflush.h>
diff --git a/arch/arm/mach-omap2/pm-debug.c b/arch/arm/mach-omap2/pm-debug.c
index 003a6cb248be..5c46ea6756d7 100644
--- a/arch/arm/mach-omap2/pm-debug.c
+++ b/arch/arm/mach-omap2/pm-debug.c
@@ -21,6 +21,7 @@
 
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/clk.h>
 #include <linux/err.h>
 #include <linux/io.h>
diff --git a/arch/arm/probes/kprobes/test-core.c b/arch/arm/probes/kprobes/test-core.c
index 9775de22e2ff..c893726aa52d 100644
--- a/arch/arm/probes/kprobes/test-core.c
+++ b/arch/arm/probes/kprobes/test-core.c
@@ -203,6 +203,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/sched/clock.h>
 #include <linux/kprobes.h>
 #include <linux/errno.h>
 #include <linux/stddef.h>
diff --git a/arch/cris/kernel/time.c b/arch/cris/kernel/time.c
index 2dda6da71521..bc562cf511a6 100644
--- a/arch/cris/kernel/time.c
+++ b/arch/cris/kernel/time.c
@@ -29,7 +29,7 @@
 #include <linux/timex.h>
 #include <linux/init.h>
 #include <linux/profile.h>
-#include <linux/sched.h>	/* just for sched_clock() - funny that */
+#include <linux/sched/clock.h>
 
 
 #define D(x)
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index d68322966f33..32a6cc36296f 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -32,6 +32,7 @@
 #include <linux/kernel.h>
 #include <linux/reboot.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/seq_file.h>
 #include <linux/string.h>
 #include <linux/threads.h>
diff --git a/arch/microblaze/kernel/timer.c b/arch/microblaze/kernel/timer.c
index 1d6fad50fa76..999066192715 100644
--- a/arch/microblaze/kernel/timer.c
+++ b/arch/microblaze/kernel/timer.c
@@ -12,6 +12,7 @@
 #include <linux/interrupt.h>
 #include <linux/delay.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/sched_clock.h>
 #include <linux/clk.h>
 #include <linux/clockchips.h>
diff --git a/arch/mn10300/kernel/time.c b/arch/mn10300/kernel/time.c
index 67c6416a58f8..06b83b17c5f1 100644
--- a/arch/mn10300/kernel/time.c
+++ b/arch/mn10300/kernel/time.c
@@ -10,6 +10,7 @@
  * 2 of the Licence, or (at your option) any later version.
  */
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/kernel.h>
 #include <linux/interrupt.h>
 #include <linux/time.h>
diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c
index 068ed3607bac..dee6f9d6a153 100644
--- a/arch/parisc/kernel/setup.c
+++ b/arch/parisc/kernel/setup.c
@@ -37,6 +37,7 @@
 #include <linux/proc_fs.h>
 #include <linux/export.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 
 #include <asm/processor.h>
 #include <asm/sections.h>
diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c
index 1e22f981cd81..89421df70160 100644
--- a/arch/parisc/kernel/time.c
+++ b/arch/parisc/kernel/time.c
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 #include <linux/rtc.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/sched_clock.h>
 #include <linux/kernel.h>
 #include <linux/param.h>
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index bc84a8d47b9e..37833c5dc274 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -34,6 +34,7 @@
 #include <linux/errno.h>
 #include <linux/export.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/kernel.h>
 #include <linux/param.h>
 #include <linux/string.h>
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index de66abb479c9..c31da46bc037 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -18,6 +18,7 @@
 #include <linux/errno.h>
 #include <linux/export.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/kernel.h>
 #include <linux/param.h>
 #include <linux/string.h>
diff --git a/arch/sparc/kernel/ds.c b/arch/sparc/kernel/ds.c
index f87a55d77094..b542cc7c8d94 100644
--- a/arch/sparc/kernel/ds.c
+++ b/arch/sparc/kernel/ds.c
@@ -9,6 +9,7 @@
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/delay.h>
 #include <linux/mutex.h>
 #include <linux/kthread.h>
diff --git a/arch/sparc/kernel/viohs.c b/arch/sparc/kernel/viohs.c
index 526fcb5d8ce9..b30b30ab3ddd 100644
--- a/arch/sparc/kernel/viohs.c
+++ b/arch/sparc/kernel/viohs.c
@@ -8,6 +8,7 @@
 #include <linux/string.h>
 #include <linux/delay.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/slab.h>
 
 #include <asm/ldc.h>
diff --git a/arch/tile/kernel/time.c b/arch/tile/kernel/time.c
index c9357012b1c8..5bd4e88c7c60 100644
--- a/arch/tile/kernel/time.c
+++ b/arch/tile/kernel/time.c
@@ -20,6 +20,7 @@
 #include <linux/clockchips.h>
 #include <linux/hardirq.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/smp.h>
 #include <linux/delay.h>
 #include <linux/module.h>
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index 496e60391fac..786fd875de92 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -12,6 +12,7 @@
 #include <linux/pci.h>
 #include <linux/ptrace.h>
 #include <linux/syscore_ops.h>
+#include <linux/sched/clock.h>
 
 #include <asm/apic.h>
 
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 1635c0c8df23..2ecc0e97772b 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -21,6 +21,7 @@
 #include <linux/init.h>
 #include <linux/kdebug.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/uaccess.h>
 #include <linux/slab.h>
 #include <linux/cpu.h>
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 4e95b2e0d95f..35a5d5dca2fa 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
 
 #include <linux/io.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/random.h>
 #include <asm/processor.h>
 #include <asm/apic.h>
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 2c234a6d94c4..adc0ebd8bed0 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -1,5 +1,6 @@
 
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 
 #include <asm/cpufeature.h>
 #include <asm/e820.h>
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c64ca5929cb5..c7e2ca966386 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -8,6 +8,7 @@
 #include <linux/ctype.h>
 #include <linux/delay.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/init.h>
 #include <linux/kprobes.h>
 #include <linux/kgdb.h>
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 47416f959a48..0a3bc19de017 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -10,6 +10,7 @@
 #include <asm/tsc.h>
 #include <asm/cpufeature.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 
 #include "cpu.h"
 
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 017ecd3bb553..fe0a615a051b 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -4,6 +4,7 @@
 #include <linux/bitops.h>
 #include <linux/smp.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/thread_info.h>
 #include <linux/init.h>
 #include <linux/uaccess.h>
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index c1ea5b999839..8457b4978668 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -1,5 +1,6 @@
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/mm.h>
 #include <asm/cpufeature.h>
 #include <asm/msr.h>
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index bae6ea6cfb94..d88967659098 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -25,6 +25,7 @@
 #include <linux/hardirq.h>
 #include <linux/memblock.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 
 #include <asm/x86_init.h>
 #include <asm/reboot.h>
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index bfe4d6c96fbd..d17b1a940add 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -20,6 +20,7 @@
 #include <linux/ratelimit.h>
 #include <linux/slab.h>
 #include <linux/export.h>
+#include <linux/sched/clock.h>
 
 #if defined(CONFIG_EDAC)
 #include <linux/edac.h>
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 2724dc82f992..46bcda4cb1c2 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -2,6 +2,7 @@
 
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/init.h>
 #include <linux/export.h>
 #include <linux/timer.h>
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 137944777859..440b95ee593c 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -8,6 +8,7 @@
  */
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/sched/clock.h>
 #include <linux/blkdev.h>
 #include <linux/elevator.h>
 #include <linux/ktime.h>
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index e53bef6cf53c..b192b42a8351 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -44,6 +44,7 @@
 #include <linux/pci.h>
 #include <linux/aer.h>
 #include <linux/nmi.h>
+#include <linux/sched/clock.h>
 
 #include <acpi/ghes.h>
 #include <acpi/apei.h>
diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c
index 93aa1364376a..7a8a4117f123 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -24,6 +24,7 @@
 #include <linux/of_address.h>
 #include <linux/io.h>
 #include <linux/slab.h>
+#include <linux/sched/clock.h>
 #include <linux/sched_clock.h>
 #include <linux/acpi.h>
 
diff --git a/drivers/clocksource/pxa_timer.c b/drivers/clocksource/pxa_timer.c
index 9cae38eebec2..1c24de215c14 100644
--- a/drivers/clocksource/pxa_timer.c
+++ b/drivers/clocksource/pxa_timer.c
@@ -19,6 +19,7 @@
 #include <linux/clockchips.h>
 #include <linux/of_address.h>
 #include <linux/of_irq.h>
+#include <linux/sched/clock.h>
 #include <linux/sched_clock.h>
 
 #include <clocksource/pxa.h>
diff --git a/drivers/clocksource/timer-digicolor.c b/drivers/clocksource/timer-digicolor.c
index 10318cc99c0e..e9f50d289362 100644
--- a/drivers/clocksource/timer-digicolor.c
+++ b/drivers/clocksource/timer-digicolor.c
@@ -31,6 +31,7 @@
 #include <linux/interrupt.h>
 #include <linux/irq.h>
 #include <linux/irqreturn.h>
+#include <linux/sched/clock.h>
 #include <linux/sched_clock.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 62810ff3b00f..548b90be7685 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/mutex.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/notifier.h>
 #include <linux/pm_qos.h>
 #include <linux/cpu.h>
diff --git a/drivers/firmware/tegra/bpmp.c b/drivers/firmware/tegra/bpmp.c
index 4ff02d310868..84e4c9a58a0c 100644
--- a/drivers/firmware/tegra/bpmp.c
+++ b/drivers/firmware/tegra/bpmp.c
@@ -19,6 +19,7 @@
 #include <linux/of_device.h>
 #include <linux/platform_device.h>
 #include <linux/semaphore.h>
+#include <linux/sched/clock.h>
 
 #include <soc/tegra/bpmp.h>
 #include <soc/tegra/bpmp-abi.h>
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index f31deeb72703..df3fef393dbe 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -24,6 +24,8 @@
 
 #include <linux/prefetch.h>
 #include <linux/dma-fence-array.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
 
 #include "i915_drv.h"
 
diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
index b9cde116dab3..344f238b283f 100644
--- a/drivers/gpu/drm/i915/intel_drv.h
+++ b/drivers/gpu/drm/i915/intel_drv.h
@@ -28,6 +28,7 @@
 #include <linux/async.h>
 #include <linux/i2c.h>
 #include <linux/hdmi.h>
+#include <linux/sched/clock.h>
 #include <drm/i915_drm.h>
 #include "i915_drv.h"
 #include <drm/drm_crtc.h>
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 646fe85261c1..18526d44688d 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -11,6 +11,7 @@
 #include "bset.h"
 
 #include <linux/console.h>
+#include <linux/sched/clock.h>
 #include <linux/random.h>
 #include <linux/prefetch.h>
 
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index a43eedd5804d..384559af310e 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -32,6 +32,7 @@
 #include <linux/prefetch.h>
 #include <linux/random.h>
 #include <linux/rcupdate.h>
+#include <linux/sched/clock.h>
 #include <trace/events/bcache.h>
 
 /*
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index b3ff57d61dde..f90f13616980 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -13,6 +13,7 @@
 
 #include <linux/blkdev.h>
 #include <linux/sort.h>
+#include <linux/sched/clock.h>
 
 static const char * const cache_replacement_policies[] = {
 	"lru",
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index dde6172f3f10..8c3a938f4bf0 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -12,6 +12,7 @@
 #include <linux/module.h>
 #include <linux/seq_file.h>
 #include <linux/types.h>
+#include <linux/sched/clock.h>
 
 #include "util.h"
 
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index cf2cbc211d83..a126919ed102 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -6,6 +6,7 @@
 #include <linux/errno.h>
 #include <linux/blkdev.h>
 #include <linux/kernel.h>
+#include <linux/sched/clock.h>
 #include <linux/llist.h>
 #include <linux/ratelimit.h>
 #include <linux/vmalloc.h>
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 69e1ae59cab8..6ac2e48b9235 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -13,6 +13,7 @@
 
 #include <linux/delay.h>
 #include <linux/kthread.h>
+#include <linux/sched/clock.h>
 #include <trace/events/bcache.h>
 
 /* Rate limiting */
diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
index 09505f432eda..7ae710585267 100644
--- a/drivers/misc/cxl/native.c
+++ b/drivers/misc/cxl/native.c
@@ -9,6 +9,7 @@
 
 #include <linux/spinlock.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/slab.h>
 #include <linux/mutex.h>
 #include <linux/mm.h>
diff --git a/drivers/net/irda/pxaficp_ir.c b/drivers/net/irda/pxaficp_ir.c
index 6e8f616be48e..1dba16bc7f8d 100644
--- a/drivers/net/irda/pxaficp_ir.c
+++ b/drivers/net/irda/pxaficp_ir.c
@@ -24,6 +24,7 @@
 #include <linux/dma/pxa-dma.h>
 #include <linux/gpio.h>
 #include <linux/slab.h>
+#include <linux/sched/clock.h>
 
 #include <net/irda/irda.h>
 #include <net/irda/irmod.h>
diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 6d9335865880..9612b84bc3e0 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -20,6 +20,7 @@
 #include <linux/perf/arm_pmu.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
+#include <linux/sched/clock.h>
 #include <linux/spinlock.h>
 #include <linux/irq.h>
 #include <linux/irqdesc.h>
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 2fe35354f20e..5c98ad4d2f4c 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -17,6 +17,7 @@
 #include <linux/workqueue.h>
 #include <linux/file.h>
 #include <linux/slab.h>
+#include <linux/sched/clock.h>
 #include <linux/vmalloc.h>
 
 #include <linux/net.h>
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index aecca0e7d9ca..796016e63c1d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -2,6 +2,7 @@
 #define _LINUX_BLKDEV_H
 
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 
 #ifdef CONFIG_BLOCK
 
diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h
new file mode 100644
index 000000000000..7cb7d79b2cf9
--- /dev/null
+++ b/include/linux/sched/clock.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SCHED_CLOCK_H
+#define _LINUX_SCHED_CLOCK_H
+
+#include <linux/sched.h>
+
+#endif /* _LINUX_SCHED_CLOCK_H */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 69ccd2636911..c776abd86937 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -34,6 +34,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/netdev_features.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <net/flow_dissector.h>
 #include <linux/splice.h>
 #include <linux/in6.h>
diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index b8d637225a07..b96832df239e 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -25,6 +25,7 @@
 #define _LINUX_NET_BUSY_POLL_H
 
 #include <linux/netdevice.h>
+#include <linux/sched/clock.h>
 #include <net/ip.h>
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1031bdf9f012..42fe16a84f97 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -46,6 +46,7 @@
 #include <linux/filter.h>
 #include <linux/namei.h>
 #include <linux/parser.h>
+#include <linux/sched/clock.h>
 
 #include "internal.h"
 
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 9812e5dd409e..cdafaff926ca 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -28,6 +28,7 @@
 #define DISABLE_BRANCH_PROFILING
 #include <linux/mutex.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/delay.h>
 #include <linux/module.h>
 #include <linux/proc_fs.h>
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index e852be4851fc..4a30ef63c607 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -63,6 +63,7 @@ enum qlock_stats {
  */
 #include <linux/debugfs.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/fs.h>
 
 static const char * const qstat_names[qstat_num + 1] = {
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 34da86e73d00..47050eedc206 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -45,6 +45,7 @@
 #include <linux/utsname.h>
 #include <linux/ctype.h>
 #include <linux/uio.h>
+#include <linux/sched/clock.h>
 
 #include <linux/uaccess.h>
 #include <asm/sections.h>
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index ad64efe41722..dd7817cdbf58 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -58,6 +58,7 @@
 #include <linux/percpu.h>
 #include <linux/ktime.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/static_key.h>
 #include <linux/workqueue.h>
 #include <linux/compiler.h>
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1bd317db9810..1a7fd3d21e5a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6,6 +6,7 @@
  *  Copyright (C) 1991-2002  Linus Torvalds
  */
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/cpuset.h>
 #include <linux/delayacct.h>
 #include <linux/init_task.h>
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 683570739f46..4d386461f13a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3,6 +3,7 @@
 #include <linux/sched/sysctl.h>
 #include <linux/sched/topology.h>
 #include <linux/sched/rt.h>
+#include <linux/sched/clock.h>
 #include <linux/sched/wake_q.h>
 #include <linux/u64_stats_sync.h>
 #include <linux/sched/deadline.h>
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index a26036d37a38..ea6b610c4c57 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -13,6 +13,7 @@
 #include <linux/kernel.h>
 #include <linux/moduleparam.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/syscore_ops.h>
 #include <linux/hrtimer.h>
 #include <linux/sched_clock.h>
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 2c115fdab397..4fee1c3abd0b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -19,6 +19,7 @@
 #include <linux/percpu.h>
 #include <linux/profile.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/module.h>
 #include <linux/irq_work.h>
 #include <linux/posix-timers.h>
diff --git a/kernel/torture.c b/kernel/torture.c
index 01a99976f072..55de96529287 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -30,6 +30,7 @@
 #include <linux/smp.h>
 #include <linux/interrupt.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/atomic.h>
 #include <linux/bitops.h>
 #include <linux/completion.h>
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a85739efcc30..96fc3c043ad6 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -6,6 +6,7 @@
 #include <linux/trace_events.h>
 #include <linux/ring_buffer.h>
 #include <linux/trace_clock.h>
+#include <linux/sched/clock.h>
 #include <linux/trace_seq.h>
 #include <linux/spinlock.h>
 #include <linux/irq_work.h>
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 0f06532a755b..5fdc779f411d 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -18,6 +18,7 @@
 #include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/ktime.h>
 #include <linux/trace_clock.h>
 
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index edfacd954e1b..21ea6ae77d93 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -44,6 +44,7 @@
 #include <linux/uaccess.h>
 #include <linux/cpumask.h>
 #include <linux/delay.h>
+#include <linux/sched/clock.h>
 #include "trace.h"
 
 static struct trace_array	*hwlat_trace;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 070866c32eb9..107afca97649 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/ftrace.h>
+#include <linux/sched/clock.h>
 
 #include "trace_output.h"
 
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 63177be0159e..144d7b1b0364 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -21,6 +21,7 @@
 #include <linux/sched/rt.h>
 #include <linux/tick.h>
 #include <linux/workqueue.h>
+#include <linux/sched/clock.h>
 
 #include <asm/irq_regs.h>
 #include <linux/kvm_para.h>
diff --git a/lib/plist.c b/lib/plist.c
index 3a30c53db061..199408f91057 100644
--- a/lib/plist.c
+++ b/lib/plist.c
@@ -175,6 +175,7 @@ void plist_requeue(struct plist_node *node, struct plist_head *head)
 
 #ifdef CONFIG_DEBUG_PI_LIST
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/module.h>
 #include <linux/init.h>
 
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 35b280361cb2..50a0f3e51d5b 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -27,6 +27,8 @@
 #include <linux/kernel.h>
 #include <linux/random.h>
 #include <linux/module.h>
+#include <linux/sched/clock.h>
+
 #include <net/tcp.h>
 
 #define HYSTART_ACK_TRAIN	1
-- 
cgit v1.2.3


From 4f17722c7256af8e17c2c4f29f170247264bdf48 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 08:45:17 +0100
Subject: sched/headers: Prepare for new header dependencies before moving code
 to <linux/sched/loadavg.h>

We are going to split <linux/sched/loadavg.h> out of <linux/sched.h>, which
will have to be picked up from a couple of .c files.

Create a trivial placeholder <linux/sched/topology.h> file that just
maps to <linux/sched.h> to make this patch obviously correct and
bisectable.

Include the new header in the files that are going to need it.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/m68k/kernel/time.c                         | 1 +
 arch/microblaze/kernel/heartbeat.c              | 1 +
 arch/powerpc/platforms/cell/cpufreq_spudemand.c | 1 +
 arch/powerpc/platforms/cell/spufs/sched.c       | 1 +
 arch/s390/appldata/appldata_os.c                | 1 +
 arch/sh/drivers/heartbeat.c                     | 1 +
 arch/sparc/kernel/led.c                         | 1 +
 drivers/cpuidle/governors/menu.c                | 1 +
 drivers/leds/trigger/ledtrig-heartbeat.c        | 1 +
 drivers/platform/x86/intel_ips.c                | 1 +
 fs/proc/loadavg.c                               | 1 +
 include/linux/sched/loadavg.h                   | 6 ++++++
 kernel/debug/kdb/kdb_main.c                     | 1 +
 kernel/sched/core.c                             | 1 +
 kernel/sched/loadavg.c                          | 1 +
 kernel/sys.c                                    | 1 +
 kernel/time/timekeeping.c                       | 1 +
 net/sched/em_meta.c                             | 1 +
 18 files changed, 23 insertions(+)
 create mode 100644 include/linux/sched/loadavg.h

(limited to 'include/linux')

diff --git a/arch/m68k/kernel/time.c b/arch/m68k/kernel/time.c
index 4e5aa2f4f522..87160b4415fb 100644
--- a/arch/m68k/kernel/time.c
+++ b/arch/m68k/kernel/time.c
@@ -14,6 +14,7 @@
 #include <linux/export.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/sched/loadavg.h>
 #include <linux/kernel.h>
 #include <linux/param.h>
 #include <linux/string.h>
diff --git a/arch/microblaze/kernel/heartbeat.c b/arch/microblaze/kernel/heartbeat.c
index 4643e3ab9414..2022130139d2 100644
--- a/arch/microblaze/kernel/heartbeat.c
+++ b/arch/microblaze/kernel/heartbeat.c
@@ -9,6 +9,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/loadavg.h>
 #include <linux/io.h>
 
 #include <asm/setup.h>
diff --git a/arch/powerpc/platforms/cell/cpufreq_spudemand.c b/arch/powerpc/platforms/cell/cpufreq_spudemand.c
index 88301e53f085..882944c36ef5 100644
--- a/arch/powerpc/platforms/cell/cpufreq_spudemand.c
+++ b/arch/powerpc/platforms/cell/cpufreq_spudemand.c
@@ -22,6 +22,7 @@
 
 #include <linux/cpufreq.h>
 #include <linux/sched.h>
+#include <linux/sched/loadavg.h>
 #include <linux/module.h>
 #include <linux/timer.h>
 #include <linux/workqueue.h>
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 9b543df210fb..d317c84dc794 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -24,6 +24,7 @@
 
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/sched/loadavg.h>
 #include <linux/sched/rt.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c
index 08b9e942a262..079446619f89 100644
--- a/arch/s390/appldata/appldata_os.c
+++ b/arch/s390/appldata/appldata_os.c
@@ -17,6 +17,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/netdevice.h>
 #include <linux/sched.h>
+#include <linux/sched/loadavg.h>
 #include <asm/appldata.h>
 #include <asm/smp.h>
 
diff --git a/arch/sh/drivers/heartbeat.c b/arch/sh/drivers/heartbeat.c
index 49bace446a1a..c6d96049a0bb 100644
--- a/arch/sh/drivers/heartbeat.c
+++ b/arch/sh/drivers/heartbeat.c
@@ -21,6 +21,7 @@
 #include <linux/init.h>
 #include <linux/platform_device.h>
 #include <linux/sched.h>
+#include <linux/sched/loadavg.h>
 #include <linux/timer.h>
 #include <linux/io.h>
 #include <linux/slab.h>
diff --git a/arch/sparc/kernel/led.c b/arch/sparc/kernel/led.c
index 3ae36f36e758..44a3ed93c214 100644
--- a/arch/sparc/kernel/led.c
+++ b/arch/sparc/kernel/led.c
@@ -8,6 +8,7 @@
 #include <linux/jiffies.h>
 #include <linux/timer.h>
 #include <linux/uaccess.h>
+#include <linux/sched/loadavg.h>
 
 #include <asm/auxio.h>
 
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 8d6d25c38c02..fe86060e48f7 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -18,6 +18,7 @@
 #include <linux/hrtimer.h>
 #include <linux/tick.h>
 #include <linux/sched.h>
+#include <linux/sched/loadavg.h>
 #include <linux/math64.h>
 #include <linux/cpu.h>
 
diff --git a/drivers/leds/trigger/ledtrig-heartbeat.c b/drivers/leds/trigger/ledtrig-heartbeat.c
index e6f2f8b9f09a..afa3b4099214 100644
--- a/drivers/leds/trigger/ledtrig-heartbeat.c
+++ b/drivers/leds/trigger/ledtrig-heartbeat.c
@@ -17,6 +17,7 @@
 #include <linux/slab.h>
 #include <linux/timer.h>
 #include <linux/sched.h>
+#include <linux/sched/loadavg.h>
 #include <linux/leds.h>
 #include <linux/reboot.h>
 #include <linux/suspend.h>
diff --git a/drivers/platform/x86/intel_ips.c b/drivers/platform/x86/intel_ips.c
index 55663b3d7282..58dcee562d64 100644
--- a/drivers/platform/x86/intel_ips.c
+++ b/drivers/platform/x86/intel_ips.c
@@ -68,6 +68,7 @@
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/sched.h>
+#include <linux/sched/loadavg.h>
 #include <linux/seq_file.h>
 #include <linux/string.h>
 #include <linux/tick.h>
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index aec66e6c2060..add5255ce7e0 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -3,6 +3,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/proc_fs.h>
 #include <linux/sched.h>
+#include <linux/sched/loadavg.h>
 #include <linux/seq_file.h>
 #include <linux/seqlock.h>
 #include <linux/time.h>
diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h
new file mode 100644
index 000000000000..3ae74be327e8
--- /dev/null
+++ b/include/linux/sched/loadavg.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SCHED_LOADAVG_H
+#define _LINUX_SCHED_LOADAVG_H
+
+#include <linux/sched.h>
+
+#endif /* _LINUX_SCHED_LOADAVG_H */
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index ca183919d302..308937c7a687 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -18,6 +18,7 @@
 #include <linux/kmsg_dump.h>
 #include <linux/reboot.h>
 #include <linux/sched.h>
+#include <linux/sched/loadavg.h>
 #include <linux/sysrq.h>
 #include <linux/smp.h>
 #include <linux/utsname.h>
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ed39d1d0b64a..11a0684a29a7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8,6 +8,7 @@
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
 #include <uapi/linux/sched/types.h>
+#include <linux/sched/loadavg.h>
 #include <linux/cpuset.h>
 #include <linux/delayacct.h>
 #include <linux/init_task.h>
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index a2d6eb71f06b..7296b7308eca 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/export.h>
+#include <linux/sched/loadavg.h>
 
 #include "sched.h"
 
diff --git a/kernel/sys.c b/kernel/sys.c
index b07adca97ea3..28b8a4c1bc7e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -49,6 +49,7 @@
 #include <linux/binfmts.h>
 
 #include <linux/sched.h>
+#include <linux/sched/loadavg.h>
 #include <linux/rcupdate.h>
 #include <linux/uidgid.h>
 #include <linux/cred.h>
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 95b258dd75db..fb564acee0f3 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -15,6 +15,7 @@
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/sched.h>
+#include <linux/sched/loadavg.h>
 #include <linux/syscore_ops.h>
 #include <linux/clocksource.h>
 #include <linux/jiffies.h>
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 41c80b6c3906..ae7e4f5b348b 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -63,6 +63,7 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/loadavg.h>
 #include <linux/string.h>
 #include <linux/skbuff.h>
 #include <linux/random.h>
-- 
cgit v1.2.3


From 4eb5aaa3af8a54e5e9bac90e2b42bbab1f1ee5a3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 18:51:29 +0100
Subject: sched/headers: Prepare for new header dependencies before moving code
 to <linux/sched/autogroup.h>

We are going to split <linux/sched/autogroup.h> out of <linux/sched.h>, which
will have to be picked up from other headers and a couple of .c files.

Create a trivial placeholder <linux/sched/autogroup.h> file that just
maps to <linux/sched.h> to make this patch obviously correct and
bisectable.

Include the new header in the files that are going to need it.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 fs/proc/base.c                  | 1 +
 include/linux/sched/autogroup.h | 6 ++++++
 kernel/exit.c                   | 1 +
 kernel/fork.c                   | 1 +
 kernel/sched/autogroup.h        | 1 +
 kernel/sys.c                    | 1 +
 6 files changed, 11 insertions(+)
 create mode 100644 include/linux/sched/autogroup.h

(limited to 'include/linux')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 1e1e182d571b..27af4ea315a3 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -85,6 +85,7 @@
 #include <linux/user_namespace.h>
 #include <linux/fs_struct.h>
 #include <linux/slab.h>
+#include <linux/sched/autogroup.h>
 #include <linux/flex_array.h>
 #include <linux/posix-timers.h>
 #ifdef CONFIG_HARDWALL
diff --git a/include/linux/sched/autogroup.h b/include/linux/sched/autogroup.h
new file mode 100644
index 000000000000..d745f22e57dc
--- /dev/null
+++ b/include/linux/sched/autogroup.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SCHED_AUTOGROUP_H
+#define _LINUX_SCHED_AUTOGROUP_H
+
+#include <linux/sched.h>
+
+#endif /* _LINUX_SCHED_AUTOGROUP_H */
diff --git a/kernel/exit.c b/kernel/exit.c
index 8a768a3672a5..9c0b92833fbb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -6,6 +6,7 @@
 
 #include <linux/mm.h>
 #include <linux/slab.h>
+#include <linux/sched/autogroup.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/capability.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index d043fedc03c8..234f4bfb8efd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -12,6 +12,7 @@
  */
 
 #include <linux/slab.h>
+#include <linux/sched/autogroup.h>
 #include <linux/init.h>
 #include <linux/unistd.h>
 #include <linux/module.h>
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h
index 890c95f2587a..ce40c810cd5c 100644
--- a/kernel/sched/autogroup.h
+++ b/kernel/sched/autogroup.h
@@ -2,6 +2,7 @@
 
 #include <linux/kref.h>
 #include <linux/rwsem.h>
+#include <linux/sched/autogroup.h>
 
 struct autogroup {
 	/*
diff --git a/kernel/sys.c b/kernel/sys.c
index 28b8a4c1bc7e..cfe82ae62423 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -49,6 +49,7 @@
 #include <linux/binfmts.h>
 
 #include <linux/sched.h>
+#include <linux/sched/autogroup.h>
 #include <linux/sched/loadavg.h>
 #include <linux/rcupdate.h>
 #include <linux/uidgid.h>
-- 
cgit v1.2.3


From 6e84f31522f931027bf695752087ece278c10d3f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 18:51:29 +0100
Subject: sched/headers: Prepare for new header dependencies before moving code
 to <linux/sched/mm.h>

We are going to split <linux/sched/mm.h> out of <linux/sched.h>, which
will have to be picked up from other headers and a couple of .c files.

Create a trivial placeholder <linux/sched/mm.h> file that just
maps to <linux/sched.h> to make this patch obviously correct and
bisectable.

The APIs that are going to be moved first are:

   mm_alloc()
   __mmdrop()
   mmdrop()
   mmdrop_async_fn()
   mmdrop_async()
   mmget_not_zero()
   mmput()
   mmput_async()
   get_task_mm()
   mm_access()
   mm_release()

Include the new header in the files that are going to need it.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arc/include/asm/mmu_context.h          | 1 +
 arch/arc/kernel/troubleshoot.c              | 2 ++
 arch/arm/mach-rpc/ecard.c                   | 1 +
 arch/frv/mm/mmu-context.c                   | 1 +
 arch/m68k/sun3/mmu_emu.c                    | 1 +
 arch/powerpc/platforms/cell/spufs/context.c | 2 ++
 drivers/android/binder.c                    | 1 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c    | 1 +
 drivers/gpu/drm/etnaviv/etnaviv_gem.c       | 1 +
 drivers/gpu/drm/i915/i915_gem_userptr.c     | 1 +
 drivers/infiniband/core/umem.c              | 1 +
 drivers/infiniband/core/umem_odp.c          | 1 +
 drivers/infiniband/hw/hfi1/file_ops.c       | 1 +
 drivers/infiniband/hw/mlx4/main.c           | 2 ++
 drivers/infiniband/hw/mlx5/main.c           | 1 +
 drivers/infiniband/hw/usnic/usnic_uiom.c    | 1 +
 drivers/iommu/amd_iommu_v2.c                | 1 +
 drivers/iommu/intel-svm.c                   | 1 +
 drivers/lguest/lguest_user.c                | 1 +
 drivers/misc/cxl/fault.c                    | 1 +
 drivers/misc/mic/scif/scif_rma.c            | 2 ++
 drivers/oprofile/buffer_sync.c              | 1 +
 drivers/vfio/vfio_iommu_spapr_tce.c         | 2 ++
 drivers/vfio/vfio_iommu_type1.c             | 1 +
 drivers/vhost/vhost.c                       | 1 +
 drivers/xen/gntdev.c                        | 1 +
 fs/exec.c                                   | 1 +
 fs/proc/array.c                             | 1 +
 fs/proc/base.c                              | 1 +
 fs/proc/task_mmu.c                          | 1 +
 fs/proc/task_nommu.c                        | 2 ++
 fs/userfaultfd.c                            | 1 +
 include/linux/sched/mm.h                    | 6 ++++++
 kernel/cgroup/cpuset.c                      | 1 +
 kernel/events/core.c                        | 1 +
 kernel/events/uprobes.c                     | 1 +
 kernel/exit.c                               | 1 +
 kernel/fork.c                               | 1 +
 kernel/futex.c                              | 1 +
 kernel/ptrace.c                             | 1 +
 kernel/sched/sched.h                        | 1 +
 kernel/sys.c                                | 1 +
 kernel/trace/trace_output.c                 | 1 +
 kernel/tsacct.c                             | 1 +
 mm/khugepaged.c                             | 1 +
 mm/ksm.c                                    | 1 +
 mm/memcontrol.c                             | 1 +
 mm/memory.c                                 | 1 +
 mm/mempolicy.c                              | 1 +
 mm/migrate.c                                | 1 +
 mm/mmu_context.c                            | 1 +
 mm/mmu_notifier.c                           | 1 +
 mm/nommu.c                                  | 1 +
 mm/oom_kill.c                               | 1 +
 mm/process_vm_access.c                      | 1 +
 mm/rmap.c                                   | 1 +
 mm/swapfile.c                               | 1 +
 mm/util.c                                   | 1 +
 virt/kvm/async_pf.c                         | 1 +
 virt/kvm/kvm_main.c                         | 1 +
 60 files changed, 71 insertions(+)
 create mode 100644 include/linux/sched/mm.h

(limited to 'include/linux')

diff --git a/arch/arc/include/asm/mmu_context.h b/arch/arc/include/asm/mmu_context.h
index b0b87f2447f5..64b5ebae1ae8 100644
--- a/arch/arc/include/asm/mmu_context.h
+++ b/arch/arc/include/asm/mmu_context.h
@@ -20,6 +20,7 @@
 
 #include <asm/arcregs.h>
 #include <asm/tlb.h>
+#include <linux/sched/mm.h>
 
 #include <asm-generic/mm_hooks.h>
 
diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c
index 82f9bc819f4a..0c48907f56a9 100644
--- a/arch/arc/kernel/troubleshoot.c
+++ b/arch/arc/kernel/troubleshoot.c
@@ -13,6 +13,8 @@
 #include <linux/fs_struct.h>
 #include <linux/proc_fs.h>
 #include <linux/file.h>
+#include <linux/sched/mm.h>
+
 #include <asm/arcregs.h>
 #include <asm/irqflags.h>
 
diff --git a/arch/arm/mach-rpc/ecard.c b/arch/arm/mach-rpc/ecard.c
index dc67a7fb3831..6b279d037774 100644
--- a/arch/arm/mach-rpc/ecard.c
+++ b/arch/arm/mach-rpc/ecard.c
@@ -31,6 +31,7 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/interrupt.h>
 #include <linux/completion.h>
 #include <linux/reboot.h>
diff --git a/arch/frv/mm/mmu-context.c b/arch/frv/mm/mmu-context.c
index 3473bde77f56..4031289bf2ec 100644
--- a/arch/frv/mm/mmu-context.c
+++ b/arch/frv/mm/mmu-context.c
@@ -10,6 +10,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/mm.h>
 #include <asm/tlbflush.h>
 
diff --git a/arch/m68k/sun3/mmu_emu.c b/arch/m68k/sun3/mmu_emu.c
index e9d7fbe4d5ae..7fdc61525e0b 100644
--- a/arch/m68k/sun3/mmu_emu.c
+++ b/arch/m68k/sun3/mmu_emu.c
@@ -15,6 +15,7 @@
 #include <linux/bootmem.h>
 #include <linux/bitops.h>
 #include <linux/module.h>
+#include <linux/sched/mm.h>
 
 #include <asm/setup.h>
 #include <asm/traps.h>
diff --git a/arch/powerpc/platforms/cell/spufs/context.c b/arch/powerpc/platforms/cell/spufs/context.c
index 3b4152faeb1f..b500b17254a0 100644
--- a/arch/powerpc/platforms/cell/spufs/context.c
+++ b/arch/powerpc/platforms/cell/spufs/context.c
@@ -25,6 +25,8 @@
 #include <linux/slab.h>
 #include <linux/atomic.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
+
 #include <asm/spu.h>
 #include <asm/spu_csa.h>
 #include "spufs.h"
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 2bbcdc6fdfee..e33e7fbe870b 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -32,6 +32,7 @@
 #include <linux/debugfs.h>
 #include <linux/rbtree.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/seq_file.h>
 #include <linux/uaccess.h>
 #include <linux/vmalloc.h>
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index ca5f2aa7232d..84d1ffd1eef9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -23,6 +23,7 @@
 #include <linux/mutex.h>
 #include <linux/log2.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/slab.h>
 #include <linux/amd-iommu.h>
 #include <linux/notifier.h>
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
index e78f1406885d..ae2882b64b82 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
@@ -16,6 +16,7 @@
 
 #include <linux/spinlock.h>
 #include <linux/shmem_fs.h>
+#include <linux/sched/mm.h>
 
 #include "etnaviv_drv.h"
 #include "etnaviv_gem.h"
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 0115989e324a..22b46398831e 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -31,6 +31,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/mempolicy.h>
 #include <linux/swap.h>
+#include <linux/sched/mm.h>
 
 struct i915_mm_struct {
 	struct mm_struct *mm;
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 446b56a5260b..d525b1a2986a 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -35,6 +35,7 @@
 #include <linux/mm.h>
 #include <linux/dma-mapping.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/export.h>
 #include <linux/hugetlb.h>
 #include <linux/slab.h>
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index f2fc0431512d..7279f259494f 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -32,6 +32,7 @@
 
 #include <linux/types.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/pid.h>
 #include <linux/slab.h>
 #include <linux/export.h>
diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
index 3b19c16a9e45..f78c739b330a 100644
--- a/drivers/infiniband/hw/hfi1/file_ops.c
+++ b/drivers/infiniband/hw/hfi1/file_ops.c
@@ -48,6 +48,7 @@
 #include <linux/cdev.h>
 #include <linux/vmalloc.h>
 #include <linux/io.h>
+#include <linux/sched/mm.h>
 
 #include <rdma/ib.h>
 
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 88608906ce25..56da8f0d71bb 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -39,6 +39,8 @@
 #include <linux/inetdevice.h>
 #include <linux/rtnetlink.h>
 #include <linux/if_vlan.h>
+#include <linux/sched/mm.h>
+
 #include <net/ipv6.h>
 #include <net/addrconf.h>
 #include <net/devlink.h>
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 5b3355268725..4b1ec3ff152a 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -41,6 +41,7 @@
 #include <asm/pat.h>
 #endif
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/delay.h>
 #include <rdma/ib_user_verbs.h>
 #include <rdma/ib_addr.h>
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
index 1ccee6ea5bc3..ba868be5af2e 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
@@ -35,6 +35,7 @@
 #include <linux/mm.h>
 #include <linux/dma-mapping.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/hugetlb.h>
 #include <linux/iommu.h>
 #include <linux/workqueue.h>
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
index f8ed8c95b685..063343909b0d 100644
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -22,6 +22,7 @@
 #include <linux/profile.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/iommu.h>
 #include <linux/wait.h>
 #include <linux/pci.h>
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index 51f2b228723f..23c427602c55 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -16,6 +16,7 @@
 #include <linux/intel-iommu.h>
 #include <linux/mmu_notifier.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/slab.h>
 #include <linux/intel-svm.h>
 #include <linux/rculist.h>
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 30c60687d277..1a6787bc9386 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -8,6 +8,7 @@
 #include <linux/miscdevice.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/file.h>
 #include <linux/slab.h>
 #include <linux/export.h>
diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c
index 377e650a2a1d..e33011e3e7e2 100644
--- a/drivers/misc/cxl/fault.c
+++ b/drivers/misc/cxl/fault.c
@@ -9,6 +9,7 @@
 
 #include <linux/workqueue.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/pid.h>
 #include <linux/mm.h>
 #include <linux/moduleparam.h>
diff --git a/drivers/misc/mic/scif/scif_rma.c b/drivers/misc/mic/scif/scif_rma.c
index f806a4471eb9..d0e9c60f944e 100644
--- a/drivers/misc/mic/scif/scif_rma.c
+++ b/drivers/misc/mic/scif/scif_rma.c
@@ -17,6 +17,8 @@
  */
 #include <linux/dma_remapping.h>
 #include <linux/pagemap.h>
+#include <linux/sched/mm.h>
+
 #include "scif_main.h"
 #include "scif_map.h"
 
diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c
index 642478d35e99..eb0c35994b54 100644
--- a/drivers/oprofile/buffer_sync.c
+++ b/drivers/oprofile/buffer_sync.c
@@ -31,6 +31,7 @@
 #include <linux/fs.h>
 #include <linux/oprofile.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/gfp.h>
 
 #include "oprofile_stats.h"
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 59b3f62a2d64..185d50ee1b12 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -20,6 +20,8 @@
 #include <linux/err.h>
 #include <linux/vfio.h>
 #include <linux/vmalloc.h>
+#include <linux/sched/mm.h>
+
 #include <asm/iommu.h>
 #include <asm/tce.h>
 #include <asm/mmu_context.h>
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index bd6f293c4ebd..7e94c7fc90ae 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -32,6 +32,7 @@
 #include <linux/mm.h>
 #include <linux/rbtree.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 #include <linux/vfio.h>
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 4269e621e254..4a00140a7624 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -27,6 +27,7 @@
 #include <linux/cgroup.h>
 #include <linux/module.h>
 #include <linux/sort.h>
+#include <linux/sched/mm.h>
 #include <linux/interval_tree_generic.h>
 
 #include "vhost.h"
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 2ef2b61b69df..c77a0751a311 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -32,6 +32,7 @@
 #include <linux/types.h>
 #include <linux/uaccess.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
diff --git a/fs/exec.c b/fs/exec.c
index e595e1529581..8929da96cca1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -32,6 +32,7 @@
 #include <linux/swap.h>
 #include <linux/string.h>
 #include <linux/init.h>
+#include <linux/sched/mm.h>
 #include <linux/pagemap.h>
 #include <linux/perf_event.h>
 #include <linux/highmem.h>
diff --git a/fs/proc/array.c b/fs/proc/array.c
index fe12b519d09b..61bea3a4ecc5 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -60,6 +60,7 @@
 #include <linux/tty.h>
 #include <linux/string.h>
 #include <linux/mman.h>
+#include <linux/sched/mm.h>
 #include <linux/proc_fs.h>
 #include <linux/ioport.h>
 #include <linux/uaccess.h>
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 27af4ea315a3..9efe12376af1 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -86,6 +86,7 @@
 #include <linux/fs_struct.h>
 #include <linux/slab.h>
 #include <linux/sched/autogroup.h>
+#include <linux/sched/mm.h>
 #include <linux/flex_array.h>
 #include <linux/posix-timers.h>
 #ifdef CONFIG_HARDWALL
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ee3efb229ef6..f08bd31c1081 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -11,6 +11,7 @@
 #include <linux/mempolicy.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
+#include <linux/sched/mm.h>
 #include <linux/swapops.h>
 #include <linux/mmu_notifier.h>
 #include <linux/page_idle.h>
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 1ef97cfcf422..23266694db11 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -7,6 +7,8 @@
 #include <linux/ptrace.h>
 #include <linux/slab.h>
 #include <linux/seq_file.h>
+#include <linux/sched/mm.h>
+
 #include "internal.h"
 
 /*
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 3c421d06a18e..18d12bfff770 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -15,6 +15,7 @@
 #include <linux/list.h>
 #include <linux/hashtable.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/mm.h>
 #include <linux/poll.h>
 #include <linux/slab.h>
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
new file mode 100644
index 000000000000..a7adba1cd0a9
--- /dev/null
+++ b/include/linux/sched/mm.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SCHED_MM_H
+#define _LINUX_SCHED_MM_H
+
+#include <linux/sched.h>
+
+#endif /* _LINUX_SCHED_MM_H */
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index b3088886cd37..a5c46db2855c 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -44,6 +44,7 @@
 #include <linux/proc_fs.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/seq_file.h>
 #include <linux/security.h>
 #include <linux/slab.h>
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 42fe16a84f97..6f41548f2e32 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -47,6 +47,7 @@
 #include <linux/namei.h>
 #include <linux/parser.h>
 #include <linux/sched/clock.h>
+#include <linux/sched/mm.h>
 
 #include "internal.h"
 
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index d630f8ac4d2f..62f2dbd13efc 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -27,6 +27,7 @@
 #include <linux/pagemap.h>	/* read_mapping_page */
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/export.h>
 #include <linux/rmap.h>		/* anon_vma_prepare */
 #include <linux/mmu_notifier.h>	/* set_pte_at_notify */
diff --git a/kernel/exit.c b/kernel/exit.c
index 9c0b92833fbb..48649ccbb649 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -7,6 +7,7 @@
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/sched/autogroup.h>
+#include <linux/sched/mm.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/capability.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 234f4bfb8efd..1875468e2a81 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -13,6 +13,7 @@
 
 #include <linux/slab.h>
 #include <linux/sched/autogroup.h>
+#include <linux/sched/mm.h>
 #include <linux/init.h>
 #include <linux/unistd.h>
 #include <linux/module.h>
diff --git a/kernel/futex.c b/kernel/futex.c
index 8ddcf9ea953c..229a744b1781 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -62,6 +62,7 @@
 #include <linux/ptrace.h>
 #include <linux/sched/rt.h>
 #include <linux/sched/wake_q.h>
+#include <linux/sched/mm.h>
 #include <linux/hugetlb.h>
 #include <linux/freezer.h>
 #include <linux/bootmem.h>
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 49ba7c1ade9d..6fe5a2897912 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -10,6 +10,7 @@
 #include <linux/capability.h>
 #include <linux/export.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/errno.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4d386461f13a..e2307a6c29f1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -5,6 +5,7 @@
 #include <linux/sched/rt.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/wake_q.h>
+#include <linux/sched/mm.h>
 #include <linux/u64_stats_sync.h>
 #include <linux/sched/deadline.h>
 #include <linux/kernel_stat.h>
diff --git a/kernel/sys.c b/kernel/sys.c
index cfe82ae62423..dc71ec1e1967 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -51,6 +51,7 @@
 #include <linux/sched.h>
 #include <linux/sched/autogroup.h>
 #include <linux/sched/loadavg.h>
+#include <linux/sched/mm.h>
 #include <linux/rcupdate.h>
 #include <linux/uidgid.h>
 #include <linux/cred.h>
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 107afca97649..02a4aeb22c47 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -9,6 +9,7 @@
 #include <linux/mutex.h>
 #include <linux/ftrace.h>
 #include <linux/sched/clock.h>
+#include <linux/sched/mm.h>
 
 #include "trace_output.h"
 
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 5c21f0535056..571a2d3821d8 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -18,6 +18,7 @@
 
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/tsacct_kern.h>
 #include <linux/acct.h>
 #include <linux/jiffies.h>
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 34bce5c308e3..212a2afe8830 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2,6 +2,7 @@
 
 #include <linux/mm.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/mmu_notifier.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
diff --git a/mm/ksm.c b/mm/ksm.c
index 520e4c37fec7..5632c9d02457 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -19,6 +19,7 @@
 #include <linux/fs.h>
 #include <linux/mman.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/rwsem.h>
 #include <linux/pagemap.h>
 #include <linux/rmap.h>
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 45867e439d31..c52ec893e241 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,6 +35,7 @@
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
 #include <linux/mm.h>
+#include <linux/sched/mm.h>
 #include <linux/shmem_fs.h>
 #include <linux/hugetlb.h>
 #include <linux/pagemap.h>
diff --git a/mm/memory.c b/mm/memory.c
index 14fc0b40f0bb..b0495ec74d29 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -40,6 +40,7 @@
 
 #include <linux/kernel_stat.h>
 #include <linux/mm.h>
+#include <linux/sched/mm.h>
 #include <linux/hugetlb.h>
 #include <linux/mman.h>
 #include <linux/swap.h>
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 1e7873e40c9a..90892416ed75 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -73,6 +73,7 @@
 #include <linux/hugetlb.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/nodemask.h>
 #include <linux/cpuset.h>
 #include <linux/slab.h>
diff --git a/mm/migrate.c b/mm/migrate.c
index 2c63ac06791b..9a0897a14d37 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -40,6 +40,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/page_idle.h>
 #include <linux/page_owner.h>
+#include <linux/sched/mm.h>
 
 #include <asm/tlbflush.h>
 
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index daf67bb02b4a..8888b124a386 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -5,6 +5,7 @@
 
 #include <linux/mm.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/mmu_context.h>
 #include <linux/export.h>
 
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 32bc9f2ff7eb..a7652acd2ab9 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -17,6 +17,7 @@
 #include <linux/srcu.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/slab.h>
 
 /* global SRCU for all MMs */
diff --git a/mm/nommu.c b/mm/nommu.c
index aae06e854552..79abed514a4b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -17,6 +17,7 @@
 
 #include <linux/export.h>
 #include <linux/mm.h>
+#include <linux/sched/mm.h>
 #include <linux/vmacache.h>
 #include <linux/mman.h>
 #include <linux/swap.h>
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 51c091849dcb..d28936618698 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -22,6 +22,7 @@
 #include <linux/err.h>
 #include <linux/gfp.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/swap.h>
 #include <linux/timex.h>
 #include <linux/jiffies.h>
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 84d0c7eada2b..8973cd231ece 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -12,6 +12,7 @@
 #include <linux/mm.h>
 #include <linux/uio.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/highmem.h>
 #include <linux/ptrace.h>
 #include <linux/slab.h>
diff --git a/mm/rmap.c b/mm/rmap.c
index 8774791e2809..0367a0506667 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -46,6 +46,7 @@
  */
 
 #include <linux/mm.h>
+#include <linux/sched/mm.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
diff --git a/mm/swapfile.c b/mm/swapfile.c
index fadc6a1c0da0..ff2bf3f61b14 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -6,6 +6,7 @@
  */
 
 #include <linux/mm.h>
+#include <linux/sched/mm.h>
 #include <linux/hugetlb.h>
 #include <linux/mman.h>
 #include <linux/slab.h>
diff --git a/mm/util.c b/mm/util.c
index b8f538863b5a..14f4e13323e2 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -5,6 +5,7 @@
 #include <linux/export.h>
 #include <linux/err.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/security.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 2366177172f6..bb298a200cd3 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -24,6 +24,7 @@
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/mmu_context.h>
+#include <linux/sched/mm.h>
 
 #include "async_pf.h"
 #include <trace/events/kvm.h>
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 35f71409d9ee..dd5de09bf362 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -33,6 +33,7 @@
 #include <linux/syscore_ops.h>
 #include <linux/cpu.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/cpumask.h>
 #include <linux/smp.h>
 #include <linux/anon_inodes.h>
-- 
cgit v1.2.3


From f7ccbae45c5e2c1077654b0e857e7efb1aa31c92 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 18:51:30 +0100
Subject: sched/headers: Prepare for new header dependencies before moving code
 to <linux/sched/coredump.h>

We are going to split <linux/sched/coredump.h> out of <linux/sched.h>, which
will have to be picked up from other headers and a couple of .c files.

Create a trivial placeholder <linux/sched/coredump.h> file that just
maps to <linux/sched.h> to make this patch obviously correct and
bisectable.

Include the new header in the files that are going to need it.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 fs/binfmt_elf.c                | 1 +
 fs/binfmt_elf_fdpic.c          | 1 +
 fs/coredump.c                  | 1 +
 fs/exec.c                      | 1 +
 fs/proc/base.c                 | 1 +
 fs/proc/internal.h             | 1 +
 include/linux/khugepaged.h     | 3 ++-
 include/linux/ksm.h            | 1 +
 include/linux/sched/coredump.h | 6 ++++++
 kernel/cred.c                  | 1 +
 kernel/events/uprobes.c        | 1 +
 kernel/fork.c                  | 1 +
 kernel/ptrace.c                | 1 +
 kernel/sys.c                   | 1 +
 kernel/sysctl.c                | 1 +
 mm/huge_memory.c               | 1 +
 mm/khugepaged.c                | 1 +
 mm/ksm.c                       | 1 +
 mm/memory.c                    | 1 +
 mm/oom_kill.c                  | 1 +
 20 files changed, 26 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/sched/coredump.h

(limited to 'include/linux')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 443a6f537d56..fbbe52e1250e 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -35,6 +35,7 @@
 #include <linux/utsname.h>
 #include <linux/coredump.h>
 #include <linux/sched.h>
+#include <linux/sched/coredump.h>
 #include <linux/dax.h>
 #include <linux/uaccess.h>
 #include <asm/param.h>
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index ffca4bbc3d63..222873bb689c 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -35,6 +35,7 @@
 #include <linux/elf-fdpic.h>
 #include <linux/elfcore.h>
 #include <linux/coredump.h>
+#include <linux/sched/coredump.h>
 #include <linux/dax.h>
 
 #include <linux/uaccess.h>
diff --git a/fs/coredump.c b/fs/coredump.c
index ae6b05629ca1..23a539b29225 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -16,6 +16,7 @@
 #include <linux/personality.h>
 #include <linux/binfmts.h>
 #include <linux/coredump.h>
+#include <linux/sched/coredump.h>
 #include <linux/utsname.h>
 #include <linux/pid_namespace.h>
 #include <linux/module.h>
diff --git a/fs/exec.c b/fs/exec.c
index 8929da96cca1..9c80d011594e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -33,6 +33,7 @@
 #include <linux/string.h>
 #include <linux/init.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
 #include <linux/pagemap.h>
 #include <linux/perf_event.h>
 #include <linux/highmem.h>
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9efe12376af1..4c5fa704e38c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -87,6 +87,7 @@
 #include <linux/slab.h>
 #include <linux/sched/autogroup.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
 #include <linux/flex_array.h>
 #include <linux/posix-timers.h>
 #ifdef CONFIG_HARDWALL
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 5d6960f5f1c0..550e8b183abe 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -14,6 +14,7 @@
 #include <linux/spinlock.h>
 #include <linux/atomic.h>
 #include <linux/binfmts.h>
+#include <linux/sched/coredump.h>
 
 struct ctl_table_header;
 struct mempolicy;
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index 1e032a1ddb3e..5d9a400af509 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -1,7 +1,8 @@
 #ifndef _LINUX_KHUGEPAGED_H
 #define _LINUX_KHUGEPAGED_H
 
-#include <linux/sched.h> /* MMF_VM_HUGEPAGE */
+#include <linux/sched/coredump.h> /* MMF_VM_HUGEPAGE */
+
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern struct attribute_group khugepaged_attr_group;
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 481c8c4627ca..e1cfda4bee58 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -12,6 +12,7 @@
 #include <linux/pagemap.h>
 #include <linux/rmap.h>
 #include <linux/sched.h>
+#include <linux/sched/coredump.h>
 
 struct stable_node;
 struct mem_cgroup;
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
new file mode 100644
index 000000000000..ab81e564e076
--- /dev/null
+++ b/include/linux/sched/coredump.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SCHED_COREDUMP_H
+#define _LINUX_SCHED_COREDUMP_H
+
+#include <linux/sched.h>
+
+#endif /* _LINUX_SCHED_COREDUMP_H */
diff --git a/kernel/cred.c b/kernel/cred.c
index 5f264fb5737d..2bc66075740f 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -12,6 +12,7 @@
 #include <linux/cred.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/sched/coredump.h>
 #include <linux/key.h>
 #include <linux/keyctl.h>
 #include <linux/init_task.h>
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 62f2dbd13efc..0e137f98a50c 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -28,6 +28,7 @@
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
 #include <linux/export.h>
 #include <linux/rmap.h>		/* anon_vma_prepare */
 #include <linux/mmu_notifier.h>	/* set_pte_at_notify */
diff --git a/kernel/fork.c b/kernel/fork.c
index 1875468e2a81..7332448b668a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -14,6 +14,7 @@
 #include <linux/slab.h>
 #include <linux/sched/autogroup.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
 #include <linux/init.h>
 #include <linux/unistd.h>
 #include <linux/module.h>
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 6fe5a2897912..bcdbdc19eb35 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -11,6 +11,7 @@
 #include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
 #include <linux/errno.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
diff --git a/kernel/sys.c b/kernel/sys.c
index dc71ec1e1967..e2d743fb7f55 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -52,6 +52,7 @@
 #include <linux/sched/autogroup.h>
 #include <linux/sched/loadavg.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
 #include <linux/rcupdate.h>
 #include <linux/uidgid.h>
 #include <linux/cred.h>
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index bb260ceb3718..acf0a5a06da7 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -63,6 +63,7 @@
 #include <linux/capability.h>
 #include <linux/binfmts.h>
 #include <linux/sched/sysctl.h>
+#include <linux/sched/coredump.h>
 #include <linux/kexec.h>
 #include <linux/bpf.h>
 #include <linux/mount.h>
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 71e3dede95b4..dee92fa26bbb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -9,6 +9,7 @@
 
 #include <linux/mm.h>
 #include <linux/sched.h>
+#include <linux/sched/coredump.h>
 #include <linux/highmem.h>
 #include <linux/hugetlb.h>
 #include <linux/mmu_notifier.h>
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 212a2afe8830..ba40b7f673f4 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -3,6 +3,7 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
 #include <linux/mmu_notifier.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
diff --git a/mm/ksm.c b/mm/ksm.c
index 5632c9d02457..19b4f2dea7a5 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -20,6 +20,7 @@
 #include <linux/mman.h>
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
 #include <linux/rwsem.h>
 #include <linux/pagemap.h>
 #include <linux/rmap.h>
diff --git a/mm/memory.c b/mm/memory.c
index b0495ec74d29..6fc918b2c459 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -41,6 +41,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/mm.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
 #include <linux/hugetlb.h>
 #include <linux/mman.h>
 #include <linux/swap.h>
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d28936618698..69f75b014607 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -23,6 +23,7 @@
 #include <linux/gfp.h>
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
 #include <linux/swap.h>
 #include <linux/timex.h>
 #include <linux/jiffies.h>
-- 
cgit v1.2.3


From 3f07c0144132e4f59d88055ac8ff3e691a5fa2b8 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 18:51:30 +0100
Subject: sched/headers: Prepare for new header dependencies before moving code
 to <linux/sched/signal.h>

We are going to split <linux/sched/signal.h> out of <linux/sched.h>, which
will have to be picked up from other headers and a couple of .c files.

Create a trivial placeholder <linux/sched/signal.h> file that just
maps to <linux/sched.h> to make this patch obviously correct and
bisectable.

Include the new header in the files that are going to need it.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/alpha/kernel/osf_sys.c                                    | 2 +-
 arch/alpha/kernel/signal.c                                     | 2 +-
 arch/alpha/kernel/traps.c                                      | 2 +-
 arch/alpha/mm/fault.c                                          | 2 +-
 arch/arc/kernel/traps.c                                        | 2 +-
 arch/arc/mm/fault.c                                            | 2 +-
 arch/arm/kernel/ptrace.c                                       | 2 +-
 arch/arm/kernel/traps.c                                        | 2 +-
 arch/arm/mm/alignment.c                                        | 2 +-
 arch/arm/mm/fault.c                                            | 2 +-
 arch/arm/mm/init.c                                             | 1 +
 arch/arm/mm/mmap.c                                             | 2 +-
 arch/arm/vfp/vfpmodule.c                                       | 2 +-
 arch/arm64/kernel/fpsimd.c                                     | 2 +-
 arch/arm64/kernel/ptrace.c                                     | 2 +-
 arch/arm64/kernel/traps.c                                      | 2 +-
 arch/arm64/mm/fault.c                                          | 2 +-
 arch/arm64/mm/mmap.c                                           | 2 +-
 arch/avr32/kernel/traps.c                                      | 2 +-
 arch/blackfin/kernel/trace.c                                   | 2 +-
 arch/blackfin/kernel/traps.c                                   | 1 +
 arch/cris/mm/fault.c                                           | 1 +
 arch/frv/kernel/traps.c                                        | 2 +-
 arch/h8300/kernel/ptrace_s.c                                   | 2 +-
 arch/hexagon/kernel/traps.c                                    | 2 +-
 arch/hexagon/mm/vm_fault.c                                     | 1 +
 arch/ia64/kernel/asm-offsets.c                                 | 2 +-
 arch/ia64/kernel/brl_emu.c                                     | 2 +-
 arch/ia64/kernel/mca.c                                         | 2 +-
 arch/ia64/kernel/traps.c                                       | 2 +-
 arch/ia64/kernel/unaligned.c                                   | 2 +-
 arch/ia64/mm/fault.c                                           | 2 +-
 arch/ia64/mm/init.c                                            | 1 +
 arch/mips/kernel/branch.c                                      | 2 +-
 arch/mips/kernel/signal_o32.c                                  | 1 +
 arch/mips/mm/mmap.c                                            | 2 +-
 arch/mips/sgi-ip22/ip22-berr.c                                 | 2 +-
 arch/mips/sgi-ip22/ip22-reset.c                                | 2 +-
 arch/mn10300/kernel/fpu.c                                      | 2 ++
 arch/openrisc/mm/fault.c                                       | 2 +-
 arch/parisc/kernel/sys_parisc.c                                | 1 +
 arch/parisc/kernel/unaligned.c                                 | 2 +-
 arch/parisc/math-emu/driver.c                                  | 3 ++-
 arch/powerpc/kvm/book3s_64_vio.c                               | 1 +
 arch/powerpc/mm/mmap.c                                         | 2 +-
 arch/powerpc/mm/mmu_context_iommu.c                            | 2 +-
 arch/powerpc/platforms/cell/spufs/fault.c                      | 2 +-
 arch/powerpc/xmon/xmon.c                                       | 2 +-
 arch/s390/kernel/nmi.c                                         | 3 +++
 arch/s390/mm/mmap.c                                            | 1 +
 arch/score/kernel/traps.c                                      | 2 +-
 arch/sh/kernel/cpu/sh2a/fpu.c                                  | 2 +-
 arch/sh/kernel/hw_breakpoint.c                                 | 1 +
 arch/sh/kernel/traps.c                                         | 2 ++
 arch/sh/math-emu/math.c                                        | 2 +-
 arch/sh/mm/asids-debugfs.c                                     | 2 ++
 arch/sh/mm/fault.c                                             | 1 +
 arch/sparc/kernel/sys_sparc_32.c                               | 2 +-
 arch/sparc/kernel/sys_sparc_64.c                               | 2 +-
 arch/sparc/kernel/unaligned_32.c                               | 2 +-
 arch/tile/mm/mmap.c                                            | 2 +-
 arch/um/drivers/line.c                                         | 3 ++-
 arch/um/kernel/reboot.c                                        | 2 +-
 arch/um/kernel/skas/mmu.c                                      | 3 ++-
 arch/um/kernel/tlb.c                                           | 3 ++-
 arch/um/kernel/trap.c                                          | 2 +-
 arch/unicore32/kernel/fpu-ucf64.c                              | 2 +-
 arch/unicore32/kernel/traps.c                                  | 1 +
 arch/unicore32/mm/fault.c                                      | 2 +-
 arch/x86/entry/vsyscall/vsyscall_64.c                          | 1 +
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c                       | 2 +-
 arch/x86/kvm/mmu.c                                             | 1 +
 arch/x86/mm/mmap.c                                             | 2 +-
 arch/xtensa/kernel/traps.c                                     | 2 +-
 drivers/android/binder.c                                       | 2 +-
 drivers/block/drbd/drbd_int.h                                  | 2 +-
 drivers/char/snsc_event.c                                      | 2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_events.c                        | 2 +-
 drivers/gpu/drm/drm_lock.c                                     | 2 ++
 drivers/gpu/drm/ttm/ttm_lock.c                                 | 2 +-
 drivers/infiniband/core/umem.c                                 | 2 +-
 drivers/infiniband/hw/hfi1/user_pages.c                        | 2 +-
 drivers/infiniband/hw/qib/qib_user_pages.c                     | 1 +
 drivers/infiniband/hw/usnic/usnic_uiom.c                       | 2 +-
 drivers/isdn/i4l/isdn_tty.c                                    | 1 +
 drivers/isdn/mISDN/l1oip_core.c                                | 2 ++
 drivers/md/md.c                                                | 1 +
 drivers/md/raid1.c                                             | 3 +++
 drivers/md/raid5.c                                             | 2 ++
 drivers/misc/genwqe/card_dev.c                                 | 2 +-
 drivers/misc/mic/cosm/cosm_scif_server.c                       | 2 ++
 drivers/misc/mic/cosm_client/cosm_scif_client.c                | 2 ++
 drivers/misc/mic/scif/scif_rma.c                               | 1 +
 drivers/net/slip/slip.c                                        | 2 +-
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c        | 2 +-
 drivers/parisc/power.c                                         | 2 +-
 drivers/ps3/ps3-sys-manager.c                                  | 1 +
 drivers/s390/char/fs3270.c                                     | 1 +
 drivers/s390/char/keyboard.c                                   | 2 +-
 drivers/scsi/bnx2fc/bnx2fc.h                                   | 2 +-
 drivers/scsi/bnx2i/bnx2i.h                                     | 2 +-
 drivers/scsi/libiscsi.c                                        | 1 +
 drivers/staging/android/lowmemorykiller.c                      | 2 +-
 drivers/staging/lustre/lnet/libcfs/linux/linux-prim.c          | 2 +-
 drivers/staging/rtl8188eu/include/osdep_service.h              | 2 +-
 drivers/staging/rtl8712/osdep_service.h                        | 2 +-
 drivers/staging/rtl8712/rtl8712_cmd.c                          | 1 +
 drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c  | 1 +
 drivers/staging/vc04_services/interface/vchiq_arm/vchiq_util.h | 2 +-
 drivers/target/iscsi/iscsi_target.c                            | 1 +
 drivers/target/iscsi/iscsi_target_erl0.c                       | 2 ++
 drivers/target/iscsi/iscsi_target_login.c                      | 1 +
 drivers/target/iscsi/iscsi_target_nego.c                       | 1 +
 drivers/tty/pty.c                                              | 2 +-
 drivers/tty/sysrq.c                                            | 2 +-
 drivers/tty/tty_io.c                                           | 2 +-
 drivers/tty/vt/keyboard.c                                      | 2 +-
 drivers/tty/vt/vt.c                                            | 2 +-
 drivers/tty/vt/vt_ioctl.c                                      | 2 +-
 drivers/usb/atm/usbatm.c                                       | 2 +-
 drivers/usb/core/devio.c                                       | 1 +
 drivers/usb/gadget/function/f_mass_storage.c                   | 1 +
 drivers/vfio/vfio_iommu_spapr_tce.c                            | 1 +
 drivers/vfio/vfio_iommu_type1.c                                | 2 +-
 drivers/w1/w1_family.c                                         | 2 +-
 drivers/w1/w1_int.c                                            | 1 +
 fs/attr.c                                                      | 1 +
 fs/autofs4/waitq.c                                             | 1 +
 fs/cifs/connect.c                                              | 1 +
 fs/coda/upcall.c                                               | 2 +-
 fs/coredump.c                                                  | 2 +-
 fs/exec.c                                                      | 1 +
 fs/file.c                                                      | 2 +-
 fs/fs_struct.c                                                 | 2 +-
 fs/jffs2/background.c                                          | 2 +-
 fs/lockd/svc.c                                                 | 2 +-
 fs/ncpfs/inode.c                                               | 1 +
 fs/ncpfs/sock.c                                                | 1 +
 fs/nfs/callback.c                                              | 1 +
 fs/nfsd/nfssvc.c                                               | 2 +-
 fs/proc/fd.c                                                   | 2 +-
 fs/select.c                                                    | 4 ++--
 include/linux/oom.h                                            | 2 +-
 include/linux/ptrace.h                                         | 1 +
 include/linux/sched/signal.h                                   | 6 ++++++
 include/linux/signalfd.h                                       | 2 +-
 include/linux/taskstats_kern.h                                 | 2 +-
 ipc/mqueue.c                                                   | 1 +
 kernel/bpf/syscall.c                                           | 1 +
 kernel/cpu.c                                                   | 2 +-
 kernel/debug/gdbstub.c                                         | 1 +
 kernel/debug/kdb/kdb_bt.c                                      | 2 +-
 kernel/hung_task.c                                             | 2 ++
 kernel/rcu/update.c                                            | 2 +-
 kernel/sched/sched.h                                           | 1 +
 kernel/time/itimer.c                                           | 1 +
 kernel/time/posix-cpu-timers.c                                 | 2 +-
 kernel/time/tick-sched.c                                       | 2 +-
 kernel/tracepoint.c                                            | 2 +-
 kernel/tsacct.c                                                | 2 +-
 kernel/user_namespace.c                                        | 1 +
 lib/is_single_threaded.c                                       | 3 +--
 mm/filemap.c                                                   | 1 +
 mm/kmemleak.c                                                  | 2 +-
 mm/memory-failure.c                                            | 2 +-
 mm/vmacache.c                                                  | 2 +-
 net/9p/client.c                                                | 2 +-
 net/atm/common.c                                               | 2 +-
 net/ax25/af_ax25.c                                             | 2 +-
 net/caif/caif_socket.c                                         | 2 +-
 net/core/stream.c                                              | 1 +
 net/decnet/af_decnet.c                                         | 2 +-
 net/irda/af_irda.c                                             | 1 +
 net/netrom/af_netrom.c                                         | 2 +-
 net/rose/af_rose.c                                             | 2 +-
 net/sctp/socket.c                                              | 1 +
 net/sunrpc/svc.c                                               | 2 +-
 net/unix/af_unix.c                                             | 2 +-
 net/x25/af_x25.c                                               | 2 +-
 security/selinux/hooks.c                                       | 2 +-
 180 files changed, 204 insertions(+), 121 deletions(-)
 create mode 100644 include/linux/sched/signal.h

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 9d27a7d333dc..568ca29f2ad9 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -11,7 +11,7 @@
  */
 
 #include <linux/errno.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c
index 17308f925306..b8221f112eee 100644
--- a/arch/alpha/kernel/signal.c
+++ b/arch/alpha/kernel/signal.c
@@ -6,7 +6,7 @@
  *  1997-11-02  Modified for POSIX.1b signals by Richard Henderson
  */
 
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
 #include <linux/errno.h>
diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c
index af2994206b4b..6448ab00043d 100644
--- a/arch/alpha/kernel/traps.c
+++ b/arch/alpha/kernel/traps.c
@@ -10,7 +10,7 @@
 
 #include <linux/jiffies.h>
 #include <linux/mm.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/tty.h>
 #include <linux/delay.h>
 #include <linux/extable.h>
diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c
index 47948b4dd157..c25e8827e7cd 100644
--- a/arch/alpha/mm/fault.c
+++ b/arch/alpha/mm/fault.c
@@ -4,7 +4,7 @@
  *  Copyright (C) 1995  Linus Torvalds
  */
 
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <asm/io.h>
diff --git a/arch/arc/kernel/traps.c b/arch/arc/kernel/traps.c
index c927aa84e652..ff83e78d0cfb 100644
--- a/arch/arc/kernel/traps.c
+++ b/arch/arc/kernel/traps.c
@@ -13,7 +13,7 @@
  * Rahul Trivedi: Codito Technologies 2004
  */
 
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/kdebug.h>
 #include <linux/uaccess.h>
 #include <linux/ptrace.h>
diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c
index e94e5aa33985..162c97528872 100644
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@@ -9,7 +9,7 @@
 
 #include <linux/signal.h>
 #include <linux/interrupt.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/errno.h>
 #include <linux/ptrace.h>
 #include <linux/uaccess.h>
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index ae738a6319f6..46f7bab81c40 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -10,7 +10,7 @@
  * published by the Free Software Foundation.
  */
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/mm.h>
 #include <linux/elf.h>
 #include <linux/smp.h>
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index 9688ec0c6ef4..dc5f1a22b3c9 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -24,7 +24,7 @@
 #include <linux/bug.h>
 #include <linux/delay.h>
 #include <linux/init.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/irq.h>
 
 #include <linux/atomic.h>
diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c
index 7d5f4c736a16..d7fe2de9cc9e 100644
--- a/arch/arm/mm/alignment.c
+++ b/arch/arm/mm/alignment.c
@@ -19,7 +19,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/init.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/uaccess.h>
 
 #include <asm/cp15.h>
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index c2b5b9892fd1..520c7778d330 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -16,7 +16,7 @@
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
 #include <linux/page-flags.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/highmem.h>
 #include <linux/perf_event.h>
 
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index bf4d3bc41a7a..2a9040dcf47e 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -13,6 +13,7 @@
 #include <linux/init.h>
 #include <linux/bootmem.h>
 #include <linux/mman.h>
+#include <linux/sched/signal.h>
 #include <linux/export.h>
 #include <linux/nodemask.h>
 #include <linux/initrd.h>
diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index 66353caa35b9..d448f9cd7715 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -5,7 +5,7 @@
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/shm.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/io.h>
 #include <linux/personality.h>
 #include <linux/random.h>
diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c
index 569d5a650a4a..a71a48e71fff 100644
--- a/arch/arm/vfp/vfpmodule.c
+++ b/arch/arm/vfp/vfpmodule.c
@@ -15,7 +15,7 @@
 #include <linux/kernel.h>
 #include <linux/notifier.h>
 #include <linux/signal.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/smp.h>
 #include <linux/init.h>
 #include <linux/uaccess.h>
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index b883f1f75216..06da8ea16bbe 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -21,7 +21,7 @@
 #include <linux/cpu_pm.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/signal.h>
 #include <linux/hardirq.h>
 
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index a22161ccf447..64fc32ea3422 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -22,7 +22,7 @@
 #include <linux/audit.h>
 #include <linux/compat.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/ptrace.h>
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 7d47c2cdfd93..5b8779a849a2 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -29,7 +29,7 @@
 #include <linux/kexec.h>
 #include <linux/delay.h>
 #include <linux/init.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/syscalls.h>
 
 #include <asm/atomic.h>
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 81283851c9af..30dd60ab8a65 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -26,7 +26,7 @@
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
 #include <linux/page-flags.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/highmem.h>
 #include <linux/perf_event.h>
 #include <linux/preempt.h>
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index 01c171723bb3..1e0a2650c88b 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -22,7 +22,7 @@
 #include <linux/mman.h>
 #include <linux/export.h>
 #include <linux/shm.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/io.h>
 #include <linux/personality.h>
 #include <linux/random.h>
diff --git a/arch/avr32/kernel/traps.c b/arch/avr32/kernel/traps.c
index eb4a3fcfbaff..50b541325025 100644
--- a/arch/avr32/kernel/traps.c
+++ b/arch/avr32/kernel/traps.c
@@ -14,7 +14,7 @@
 #include <linux/extable.h>
 #include <linux/module.h>	/* print_modules */
 #include <linux/notifier.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/uaccess.h>
 
 #include <asm/addrspace.h>
diff --git a/arch/blackfin/kernel/trace.c b/arch/blackfin/kernel/trace.c
index 719dd796c12c..01e546ad2f7a 100644
--- a/arch/blackfin/kernel/trace.c
+++ b/arch/blackfin/kernel/trace.c
@@ -11,7 +11,7 @@
 #include <linux/thread_info.h>
 #include <linux/mm.h>
 #include <linux/oom.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/uaccess.h>
 #include <linux/module.h>
 #include <linux/kallsyms.h>
diff --git a/arch/blackfin/kernel/traps.c b/arch/blackfin/kernel/traps.c
index 1ed85ddadc0d..32676d7721b1 100644
--- a/arch/blackfin/kernel/traps.c
+++ b/arch/blackfin/kernel/traps.c
@@ -9,6 +9,7 @@
 #include <linux/bug.h>
 #include <linux/uaccess.h>
 #include <linux/module.h>
+#include <linux/sched/signal.h>
 #include <asm/traps.h>
 #include <asm/cplb.h>
 #include <asm/blackfin.h>
diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c
index 94183d3639ef..1fca464f1b9e 100644
--- a/arch/cris/mm/fault.c
+++ b/arch/cris/mm/fault.c
@@ -8,6 +8,7 @@
 #include <linux/interrupt.h>
 #include <linux/extable.h>
 #include <linux/wait.h>
+#include <linux/sched/signal.h>
 #include <linux/uaccess.h>
 #include <arch/system.h>
 
diff --git a/arch/frv/kernel/traps.c b/arch/frv/kernel/traps.c
index 31221fb4348e..43c134d6081c 100644
--- a/arch/frv/kernel/traps.c
+++ b/arch/frv/kernel/traps.c
@@ -9,7 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/signal.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
diff --git a/arch/h8300/kernel/ptrace_s.c b/arch/h8300/kernel/ptrace_s.c
index ef5a9c13e76d..c0af930052c0 100644
--- a/arch/h8300/kernel/ptrace_s.c
+++ b/arch/h8300/kernel/ptrace_s.c
@@ -10,7 +10,7 @@
  */
 
 #include <linux/linkage.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/errno.h>
 #include <asm/ptrace.h>
 
diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c
index 110dab152f82..4496bcf605ef 100644
--- a/arch/hexagon/kernel/traps.c
+++ b/arch/hexagon/kernel/traps.c
@@ -19,7 +19,7 @@
  */
 
 #include <linux/init.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/kdebug.h>
diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c
index 489875fd2be4..3eec33c5cfd7 100644
--- a/arch/hexagon/mm/vm_fault.c
+++ b/arch/hexagon/mm/vm_fault.c
@@ -28,6 +28,7 @@
 #include <asm/traps.h>
 #include <linux/uaccess.h>
 #include <linux/mm.h>
+#include <linux/sched/signal.h>
 #include <linux/signal.h>
 #include <linux/extable.h>
 #include <linux/hardirq.h>
diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c
index 60ef83e6db71..8786c8b4f187 100644
--- a/arch/ia64/kernel/asm-offsets.c
+++ b/arch/ia64/kernel/asm-offsets.c
@@ -6,7 +6,7 @@
 
 #define ASM_OFFSETS_C 1
 
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/pid.h>
 #include <linux/clocksource.h>
 #include <linux/kbuild.h>
diff --git a/arch/ia64/kernel/brl_emu.c b/arch/ia64/kernel/brl_emu.c
index 8682df6263d6..987b11be0021 100644
--- a/arch/ia64/kernel/brl_emu.c
+++ b/arch/ia64/kernel/brl_emu.c
@@ -8,7 +8,7 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/uaccess.h>
 #include <asm/processor.h>
 
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index 9509cc73b9c6..5ac51069e453 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -72,7 +72,7 @@
 #include <linux/jiffies.h>
 #include <linux/types.h>
 #include <linux/init.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
 #include <linux/bootmem.h>
diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c
index 8981ce98afb3..48ba46b025e1 100644
--- a/arch/ia64/kernel/traps.c
+++ b/arch/ia64/kernel/traps.c
@@ -9,7 +9,7 @@
 
 #include <linux/kernel.h>
 #include <linux/init.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/tty.h>
 #include <linux/vt_kern.h>		/* For unblank_screen() */
 #include <linux/export.h>
diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c
index 99348d7f2255..a13680ca1e61 100644
--- a/arch/ia64/kernel/unaligned.c
+++ b/arch/ia64/kernel/unaligned.c
@@ -15,7 +15,7 @@
  */
 #include <linux/jiffies.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/tty.h>
 #include <linux/extable.h>
 #include <linux/ratelimit.h>
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index 7f2feb21753c..15f09cfff335 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -4,7 +4,7 @@
  * Copyright (C) 1998-2002 Hewlett-Packard Co
  *	David Mosberger-Tang <davidm@hpl.hp.com>
  */
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/extable.h>
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 06cdaef54b2e..8f3efa682ee8 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -12,6 +12,7 @@
 #include <linux/elf.h>
 #include <linux/memblock.h>
 #include <linux/mm.h>
+#include <linux/sched/signal.h>
 #include <linux/mmzone.h>
 #include <linux/module.h>
 #include <linux/personality.h>
diff --git a/arch/mips/kernel/branch.c b/arch/mips/kernel/branch.c
index ae037a304ee4..b11facd11c9d 100644
--- a/arch/mips/kernel/branch.c
+++ b/arch/mips/kernel/branch.c
@@ -7,7 +7,7 @@
  * Copyright (C) 2001 MIPS Technologies, Inc.
  */
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/signal.h>
 #include <linux/export.h>
 #include <asm/branch.h>
diff --git a/arch/mips/kernel/signal_o32.c b/arch/mips/kernel/signal_o32.c
index 5e169fc5ca5c..2b3572fb5f1b 100644
--- a/arch/mips/kernel/signal_o32.c
+++ b/arch/mips/kernel/signal_o32.c
@@ -11,6 +11,7 @@
 #include <linux/compiler.h>
 #include <linux/errno.h>
 #include <linux/signal.h>
+#include <linux/sched/signal.h>
 #include <linux/uaccess.h>
 
 #include <asm/abi.h>
diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c
index d6d92c02308d..374d71e61ef6 100644
--- a/arch/mips/mm/mmap.c
+++ b/arch/mips/mm/mmap.c
@@ -13,7 +13,7 @@
 #include <linux/export.h>
 #include <linux/personality.h>
 #include <linux/random.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 
 unsigned long shm_align_mask = PAGE_SIZE - 1;	/* Sane caches */
 EXPORT_SYMBOL(shm_align_mask);
diff --git a/arch/mips/sgi-ip22/ip22-berr.c b/arch/mips/sgi-ip22/ip22-berr.c
index 3f6ccd53c15d..ff8e1935c873 100644
--- a/arch/mips/sgi-ip22/ip22-berr.c
+++ b/arch/mips/sgi-ip22/ip22-berr.c
@@ -6,7 +6,7 @@
 
 #include <linux/init.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 
 #include <asm/addrspace.h>
 #include <asm/traps.h>
diff --git a/arch/mips/sgi-ip22/ip22-reset.c b/arch/mips/sgi-ip22/ip22-reset.c
index a36f6b87548a..03a39ac5ead9 100644
--- a/arch/mips/sgi-ip22/ip22-reset.c
+++ b/arch/mips/sgi-ip22/ip22-reset.c
@@ -10,7 +10,7 @@
 #include <linux/rtc/ds1286.h>
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/notifier.h>
 #include <linux/pm.h>
 #include <linux/timer.h>
diff --git a/arch/mn10300/kernel/fpu.c b/arch/mn10300/kernel/fpu.c
index 2578b7ae7dd5..50ce7b447fed 100644
--- a/arch/mn10300/kernel/fpu.c
+++ b/arch/mn10300/kernel/fpu.c
@@ -9,6 +9,8 @@
  * 2 of the Licence, or (at your option) any later version.
  */
 #include <linux/uaccess.h>
+#include <linux/sched/signal.h>
+
 #include <asm/fpu.h>
 #include <asm/elf.h>
 #include <asm/exceptions.h>
diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c
index 53592a639744..e310ab499385 100644
--- a/arch/openrisc/mm/fault.c
+++ b/arch/openrisc/mm/fault.c
@@ -18,7 +18,7 @@
 #include <linux/mm.h>
 #include <linux/interrupt.h>
 #include <linux/extable.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 
 #include <linux/uaccess.h>
 #include <asm/siginfo.h>
diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c
index bf3294171230..ce07cd3f2507 100644
--- a/arch/parisc/kernel/sys_parisc.c
+++ b/arch/parisc/kernel/sys_parisc.c
@@ -30,6 +30,7 @@
 #include <linux/linkage.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
+#include <linux/sched/signal.h>
 #include <linux/shm.h>
 #include <linux/syscalls.h>
 #include <linux/utsname.h>
diff --git a/arch/parisc/kernel/unaligned.c b/arch/parisc/kernel/unaligned.c
index 0a21067ac0a3..a08ab481e556 100644
--- a/arch/parisc/kernel/unaligned.c
+++ b/arch/parisc/kernel/unaligned.c
@@ -23,7 +23,7 @@
 #include <linux/jiffies.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/signal.h>
 #include <linux/ratelimit.h>
 #include <linux/uaccess.h>
diff --git a/arch/parisc/math-emu/driver.c b/arch/parisc/math-emu/driver.c
index 09ef4136c693..2fb59d2e2b29 100644
--- a/arch/parisc/math-emu/driver.c
+++ b/arch/parisc/math-emu/driver.c
@@ -27,7 +27,8 @@
  *  Copyright (C) 2001	      Hewlett-Packard <bame@debian.org>
  */
 
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+
 #include "float.h"
 #include "math-emu.h"
 
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index ab9d14c0e460..3e26cd4979f9 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -24,6 +24,7 @@
 #include <linux/highmem.h>
 #include <linux/gfp.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
 #include <linux/hugetlb.h>
 #include <linux/list.h>
 #include <linux/anon_inodes.h>
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index 2f1e44362198..8013861aeaa7 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -25,7 +25,7 @@
 #include <linux/personality.h>
 #include <linux/mm.h>
 #include <linux/random.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/elf-randomize.h>
 #include <linux/security.h>
 #include <linux/mman.h>
diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index 7de7124ac91b..497130c5c742 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -10,7 +10,7 @@
  *
  */
 
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/rculist.h>
 #include <linux/vmalloc.h>
diff --git a/arch/powerpc/platforms/cell/spufs/fault.c b/arch/powerpc/platforms/cell/spufs/fault.c
index e29e4d5afa2d..870c0a82d560 100644
--- a/arch/powerpc/platforms/cell/spufs/fault.c
+++ b/arch/powerpc/platforms/cell/spufs/fault.c
@@ -19,7 +19,7 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/mm.h>
 
 #include <asm/spu.h>
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 26fa03fc9f3c..16321ad9e70c 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -13,7 +13,7 @@
 
 #include <linux/kernel.h>
 #include <linux/errno.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/smp.h>
 #include <linux/mm.h>
 #include <linux/reboot.h>
diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c
index 80c093e0c6f1..9bf8327154ee 100644
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -13,6 +13,9 @@
 #include <linux/errno.h>
 #include <linux/hardirq.h>
 #include <linux/time.h>
+#include <linux/module.h>
+#include <linux/sched/signal.h>
+
 #include <linux/export.h>
 #include <asm/lowcore.h>
 #include <asm/smp.h>
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index 7ae1282d5be9..5ea09403bb87 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -26,6 +26,7 @@
 #include <linux/personality.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
+#include <linux/sched/signal.h>
 #include <linux/random.h>
 #include <linux/compat.h>
 #include <linux/security.h>
diff --git a/arch/score/kernel/traps.c b/arch/score/kernel/traps.c
index 569ac02f68df..fa624f30f783 100644
--- a/arch/score/kernel/traps.c
+++ b/arch/score/kernel/traps.c
@@ -24,7 +24,7 @@
  */
 
 #include <linux/extable.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 
 #include <asm/cacheflush.h>
 #include <asm/irq.h>
diff --git a/arch/sh/kernel/cpu/sh2a/fpu.c b/arch/sh/kernel/cpu/sh2a/fpu.c
index 98bbaa447c93..352f894bece1 100644
--- a/arch/sh/kernel/cpu/sh2a/fpu.c
+++ b/arch/sh/kernel/cpu/sh2a/fpu.c
@@ -9,7 +9,7 @@
  *
  * FIXME! These routines can be optimized in big endian case.
  */
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/signal.h>
 #include <asm/processor.h>
 #include <asm/io.h>
diff --git a/arch/sh/kernel/hw_breakpoint.c b/arch/sh/kernel/hw_breakpoint.c
index 2197fc584186..afe965712a69 100644
--- a/arch/sh/kernel/hw_breakpoint.c
+++ b/arch/sh/kernel/hw_breakpoint.c
@@ -11,6 +11,7 @@
  */
 #include <linux/init.h>
 #include <linux/perf_event.h>
+#include <linux/sched/signal.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/percpu.h>
 #include <linux/kallsyms.h>
diff --git a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c
index 9513fa7840aa..3036dee854d1 100644
--- a/arch/sh/kernel/traps.c
+++ b/arch/sh/kernel/traps.c
@@ -8,6 +8,8 @@
 #include <linux/hardirq.h>
 #include <linux/kernel.h>
 #include <linux/kexec.h>
+#include <linux/sched/signal.h>
+
 #include <linux/extable.h>
 #include <linux/module.h>	/* print_modules */
 #include <asm/unwinder.h>
diff --git a/arch/sh/math-emu/math.c b/arch/sh/math-emu/math.c
index 5078cb809750..c86f4360c6ce 100644
--- a/arch/sh/math-emu/math.c
+++ b/arch/sh/math-emu/math.c
@@ -10,7 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/types.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/signal.h>
 #include <linux/perf_event.h>
 
diff --git a/arch/sh/mm/asids-debugfs.c b/arch/sh/mm/asids-debugfs.c
index bf95fdaedd0c..110bd35165bf 100644
--- a/arch/sh/mm/asids-debugfs.c
+++ b/arch/sh/mm/asids-debugfs.c
@@ -20,6 +20,8 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/spinlock.h>
+#include <linux/sched/signal.h>
+
 #include <asm/processor.h>
 #include <asm/mmu_context.h>
 
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index 9bf876780cef..6fd1bf7481c7 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -13,6 +13,7 @@
  */
 #include <linux/kernel.h>
 #include <linux/mm.h>
+#include <linux/sched/signal.h>
 #include <linux/hardirq.h>
 #include <linux/kprobes.h>
 #include <linux/perf_event.h>
diff --git a/arch/sparc/kernel/sys_sparc_32.c b/arch/sparc/kernel/sys_sparc_32.c
index fb7b185ee941..ae49639a484e 100644
--- a/arch/sparc/kernel/sys_sparc_32.c
+++ b/arch/sparc/kernel/sys_sparc_32.c
@@ -7,7 +7,7 @@
 
 #include <linux/errno.h>
 #include <linux/types.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
 #include <linux/file.h>
diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c
index 884c70331345..54d3999d8119 100644
--- a/arch/sparc/kernel/sys_sparc_64.c
+++ b/arch/sparc/kernel/sys_sparc_64.c
@@ -7,7 +7,7 @@
 
 #include <linux/errno.h>
 #include <linux/types.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/mm.h>
diff --git a/arch/sparc/kernel/unaligned_32.c b/arch/sparc/kernel/unaligned_32.c
index d20d4e3fd129..8367dce5f41b 100644
--- a/arch/sparc/kernel/unaligned_32.c
+++ b/arch/sparc/kernel/unaligned_32.c
@@ -8,7 +8,7 @@
 
 
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/mm.h>
 #include <asm/ptrace.h>
 #include <asm/processor.h>
diff --git a/arch/tile/mm/mmap.c b/arch/tile/mm/mmap.c
index ef61c597898b..377e312dc27e 100644
--- a/arch/tile/mm/mmap.c
+++ b/arch/tile/mm/mmap.c
@@ -17,7 +17,7 @@
 #include <linux/mm.h>
 #include <linux/random.h>
 #include <linux/limits.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/mman.h>
 #include <linux/compat.h>
 
diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index 62087028a9ce..366e57f5e8d6 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -5,8 +5,9 @@
 
 #include <linux/irqreturn.h>
 #include <linux/kd.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
+
 #include "chan.h"
 #include <irq_kern.h>
 #include <irq_user.h>
diff --git a/arch/um/kernel/reboot.c b/arch/um/kernel/reboot.c
index b60a9f8cda75..79218106a033 100644
--- a/arch/um/kernel/reboot.c
+++ b/arch/um/kernel/reboot.c
@@ -3,7 +3,7 @@
  * Licensed under the GPL
  */
 
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/oom.h>
diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
index 3943e9d7d13d..7a1f2a936fd1 100644
--- a/arch/um/kernel/skas/mmu.c
+++ b/arch/um/kernel/skas/mmu.c
@@ -5,8 +5,9 @@
  */
 
 #include <linux/mm.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
+
 #include <asm/pgalloc.h>
 #include <asm/pgtable.h>
 #include <asm/sections.h>
diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c
index 3777b82759bd..37508b190106 100644
--- a/arch/um/kernel/tlb.c
+++ b/arch/um/kernel/tlb.c
@@ -5,7 +5,8 @@
 
 #include <linux/mm.h>
 #include <linux/module.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <as-layout.h>
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index ad8f206ab5e8..9711ae4aaa6a 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -4,7 +4,7 @@
  */
 
 #include <linux/mm.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/hardirq.h>
 #include <linux/module.h>
 #include <linux/uaccess.h>
diff --git a/arch/unicore32/kernel/fpu-ucf64.c b/arch/unicore32/kernel/fpu-ucf64.c
index a53343a90ca2..12c8c9527b8e 100644
--- a/arch/unicore32/kernel/fpu-ucf64.c
+++ b/arch/unicore32/kernel/fpu-ucf64.c
@@ -13,7 +13,7 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/init.h>
 
 #include <asm/fpu-ucf64.h>
diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c
index c54e32410ead..7f5e06f9a202 100644
--- a/arch/unicore32/kernel/traps.c
+++ b/arch/unicore32/kernel/traps.c
@@ -14,6 +14,7 @@
  */
 #include <linux/module.h>
 #include <linux/signal.h>
+#include <linux/sched/signal.h>
 #include <linux/spinlock.h>
 #include <linux/personality.h>
 #include <linux/kallsyms.h>
diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c
index b656d216a8a8..bbefcc46a45e 100644
--- a/arch/unicore32/mm/fault.c
+++ b/arch/unicore32/mm/fault.c
@@ -17,7 +17,7 @@
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
 #include <linux/page-flags.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/io.h>
 
 #include <asm/pgtable.h>
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 636c4b341f36..df91fb393a01 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -27,6 +27,7 @@
 
 #include <linux/kernel.h>
 #include <linux/timer.h>
+#include <linux/sched/signal.h>
 #include <linux/syscalls.h>
 #include <linux/ratelimit.h>
 
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index 8af04afdfcb9..d48af18e7baf 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -25,7 +25,7 @@
 #include <linux/sysfs.h>
 #include <linux/kernfs.h>
 #include <linux/seq_file.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/cpu.h>
 #include <linux/task_work.h>
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1cda35277278..ac7810513d0e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -36,6 +36,7 @@
 #include <linux/compiler.h>
 #include <linux/srcu.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
 #include <linux/uaccess.h>
 #include <linux/hash.h>
 #include <linux/kern_levels.h>
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index d2dc0438d654..5eabf34008f1 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -28,7 +28,7 @@
 #include <linux/mm.h>
 #include <linux/random.h>
 #include <linux/limits.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <asm/elf.h>
 
 struct va_alignment __read_mostly va_align = {
diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c
index 282bf721a4d6..84abd66e680d 100644
--- a/arch/xtensa/kernel/traps.c
+++ b/arch/xtensa/kernel/traps.c
@@ -24,7 +24,7 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/stringify.h>
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index e33e7fbe870b..aae4d8d4be36 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -31,7 +31,7 @@
 #include <linux/poll.h>
 #include <linux/debugfs.h>
 #include <linux/rbtree.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
 #include <linux/seq_file.h>
 #include <linux/uaccess.h>
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 4cb8f21ff4ef..724d1c50fc52 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -30,7 +30,7 @@
 #include <linux/compiler.h>
 #include <linux/types.h>
 #include <linux/list.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/bitops.h>
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
diff --git a/drivers/char/snsc_event.c b/drivers/char/snsc_event.c
index 59bcefd6ec7c..e452673dff66 100644
--- a/drivers/char/snsc_event.c
+++ b/drivers/char/snsc_event.c
@@ -16,7 +16,7 @@
  */
 
 #include <linux/interrupt.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <asm/byteorder.h>
 #include <asm/sn/sn_sal.h>
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 6a3470f84998..d1ce83d73a87 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -23,7 +23,7 @@
 #include <linux/mm_types.h>
 #include <linux/slab.h>
 #include <linux/types.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/uaccess.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
diff --git a/drivers/gpu/drm/drm_lock.c b/drivers/gpu/drm/drm_lock.c
index 32d43f86a8f2..96bb6badb818 100644
--- a/drivers/gpu/drm/drm_lock.c
+++ b/drivers/gpu/drm/drm_lock.c
@@ -34,6 +34,8 @@
  */
 
 #include <linux/export.h>
+#include <linux/sched/signal.h>
+
 #include <drm/drmP.h>
 #include "drm_legacy.h"
 #include "drm_internal.h"
diff --git a/drivers/gpu/drm/ttm/ttm_lock.c b/drivers/gpu/drm/ttm/ttm_lock.c
index f154fb1929bd..913f4318cdc0 100644
--- a/drivers/gpu/drm/ttm/ttm_lock.c
+++ b/drivers/gpu/drm/ttm/ttm_lock.c
@@ -33,7 +33,7 @@
 #include <linux/atomic.h>
 #include <linux/errno.h>
 #include <linux/wait.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/module.h>
 
 #define TTM_WRITE_LOCK_PENDING    (1 << 0)
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index d525b1a2986a..27f155d2df8d 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -34,7 +34,7 @@
 
 #include <linux/mm.h>
 #include <linux/dma-mapping.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
 #include <linux/export.h>
 #include <linux/hugetlb.h>
diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c
index 20f4ddcac3b0..68295a12b771 100644
--- a/drivers/infiniband/hw/hfi1/user_pages.c
+++ b/drivers/infiniband/hw/hfi1/user_pages.c
@@ -46,7 +46,7 @@
  */
 
 #include <linux/mm.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/device.h>
 #include <linux/module.h>
 
diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c
index 75f08624ac05..ce83ba9a12ef 100644
--- a/drivers/infiniband/hw/qib/qib_user_pages.c
+++ b/drivers/infiniband/hw/qib/qib_user_pages.c
@@ -32,6 +32,7 @@
  */
 
 #include <linux/mm.h>
+#include <linux/sched/signal.h>
 #include <linux/device.h>
 
 #include "qib.h"
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
index ba868be5af2e..c49db7c33979 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
@@ -34,7 +34,7 @@
 
 #include <linux/mm.h>
 #include <linux/dma-mapping.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
 #include <linux/hugetlb.h>
 #include <linux/iommu.h>
diff --git a/drivers/isdn/i4l/isdn_tty.c b/drivers/isdn/i4l/isdn_tty.c
index 63eaa0a9f8a1..1b169559a240 100644
--- a/drivers/isdn/i4l/isdn_tty.c
+++ b/drivers/isdn/i4l/isdn_tty.c
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/delay.h>
 #include <linux/mutex.h>
+#include <linux/sched/signal.h>
 #include "isdn_common.h"
 #include "isdn_tty.h"
 #ifdef CONFIG_ISDN_AUDIO
diff --git a/drivers/isdn/mISDN/l1oip_core.c b/drivers/isdn/mISDN/l1oip_core.c
index 67c21876c35f..6ceca7db62ad 100644
--- a/drivers/isdn/mISDN/l1oip_core.c
+++ b/drivers/isdn/mISDN/l1oip_core.c
@@ -234,6 +234,8 @@
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
+
 #include <net/sock.h>
 #include "core.h"
 #include "l1oip.h"
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 985374f20e2e..548d1b8014f8 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -44,6 +44,7 @@
 
 */
 
+#include <linux/sched/signal.h>
 #include <linux/kthread.h>
 #include <linux/blkdev.h>
 #include <linux/badblocks.h>
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7453d94eeed7..fbc2d7851b49 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -37,7 +37,10 @@
 #include <linux/module.h>
 #include <linux/seq_file.h>
 #include <linux/ratelimit.h>
+#include <linux/sched/signal.h>
+
 #include <trace/events/block.h>
+
 #include "md.h"
 #include "raid1.h"
 #include "bitmap.h"
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2ce23b01dbb2..4fb09b3fcb41 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -55,6 +55,8 @@
 #include <linux/ratelimit.h>
 #include <linux/nodemask.h>
 #include <linux/flex_array.h>
+#include <linux/sched/signal.h>
+
 #include <trace/events/block.h>
 
 #include "md.h"
diff --git a/drivers/misc/genwqe/card_dev.c b/drivers/misc/genwqe/card_dev.c
index cb290b8ca0c8..dd4617764f14 100644
--- a/drivers/misc/genwqe/card_dev.c
+++ b/drivers/misc/genwqe/card_dev.c
@@ -29,7 +29,7 @@
 #include <linux/pci.h>
 #include <linux/string.h>
 #include <linux/fs.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/wait.h>
 #include <linux/delay.h>
 #include <linux/atomic.h>
diff --git a/drivers/misc/mic/cosm/cosm_scif_server.c b/drivers/misc/mic/cosm/cosm_scif_server.c
index 5696df4326b5..85f7d09cc65f 100644
--- a/drivers/misc/mic/cosm/cosm_scif_server.c
+++ b/drivers/misc/mic/cosm/cosm_scif_server.c
@@ -19,6 +19,8 @@
  *
  */
 #include <linux/kthread.h>
+#include <linux/sched/signal.h>
+
 #include "cosm_main.h"
 
 /*
diff --git a/drivers/misc/mic/cosm_client/cosm_scif_client.c b/drivers/misc/mic/cosm_client/cosm_scif_client.c
index 03e98bf1ac15..aa530fcceaa9 100644
--- a/drivers/misc/mic/cosm_client/cosm_scif_client.c
+++ b/drivers/misc/mic/cosm_client/cosm_scif_client.c
@@ -22,6 +22,8 @@
 #include <linux/delay.h>
 #include <linux/reboot.h>
 #include <linux/kthread.h>
+#include <linux/sched/signal.h>
+
 #include "../cosm/cosm_main.h"
 
 #define COSM_SCIF_MAX_RETRIES 10
diff --git a/drivers/misc/mic/scif/scif_rma.c b/drivers/misc/mic/scif/scif_rma.c
index d0e9c60f944e..329727e00e97 100644
--- a/drivers/misc/mic/scif/scif_rma.c
+++ b/drivers/misc/mic/scif/scif_rma.c
@@ -18,6 +18,7 @@
 #include <linux/dma_remapping.h>
 #include <linux/pagemap.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
 
 #include "scif_main.h"
 #include "scif_map.h"
diff --git a/drivers/net/slip/slip.c b/drivers/net/slip/slip.c
index 08db4d687533..1da31dc47f86 100644
--- a/drivers/net/slip/slip.c
+++ b/drivers/net/slip/slip.c
@@ -66,7 +66,7 @@
 
 #include <linux/uaccess.h>
 #include <linux/bitops.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/interrupt.h>
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
index c5744b45ec8f..65689469c5a1 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
@@ -22,7 +22,7 @@
 #include <linux/pci_ids.h>
 #include <linux/netdevice.h>
 #include <linux/interrupt.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/mmc/sdio.h>
 #include <linux/mmc/sdio_ids.h>
 #include <linux/mmc/sdio_func.h>
diff --git a/drivers/parisc/power.c b/drivers/parisc/power.c
index ef31b77404ef..e2a3112f1c98 100644
--- a/drivers/parisc/power.c
+++ b/drivers/parisc/power.c
@@ -39,7 +39,7 @@
 #include <linux/kernel.h>
 #include <linux/notifier.h>
 #include <linux/reboot.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/kthread.h>
 #include <linux/pm.h>
 
diff --git a/drivers/ps3/ps3-sys-manager.c b/drivers/ps3/ps3-sys-manager.c
index f2ab435954f6..73e496a72113 100644
--- a/drivers/ps3/ps3-sys-manager.c
+++ b/drivers/ps3/ps3-sys-manager.c
@@ -22,6 +22,7 @@
 #include <linux/module.h>
 #include <linux/workqueue.h>
 #include <linux/reboot.h>
+#include <linux/sched/signal.h>
 
 #include <asm/firmware.h>
 #include <asm/lv1call.h>
diff --git a/drivers/s390/char/fs3270.c b/drivers/s390/char/fs3270.c
index 85eca1cef063..c4518168fd02 100644
--- a/drivers/s390/char/fs3270.c
+++ b/drivers/s390/char/fs3270.c
@@ -12,6 +12,7 @@
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/compat.h>
+#include <linux/sched/signal.h>
 #include <linux/module.h>
 #include <linux/list.h>
 #include <linux/slab.h>
diff --git a/drivers/s390/char/keyboard.c b/drivers/s390/char/keyboard.c
index 82c913318b73..ba0e4f93503d 100644
--- a/drivers/s390/char/keyboard.c
+++ b/drivers/s390/char/keyboard.c
@@ -7,7 +7,7 @@
  */
 
 #include <linux/module.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/sysrq.h>
 
diff --git a/drivers/scsi/bnx2fc/bnx2fc.h b/drivers/scsi/bnx2fc/bnx2fc.h
index fdd4eb4e41b2..4fc8ed5fe067 100644
--- a/drivers/scsi/bnx2fc/bnx2fc.h
+++ b/drivers/scsi/bnx2fc/bnx2fc.h
@@ -39,7 +39,7 @@
 #include <linux/bitops.h>
 #include <linux/log2.h>
 #include <linux/interrupt.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/io.h>
 
 #include <scsi/scsi.h>
diff --git a/drivers/scsi/bnx2i/bnx2i.h b/drivers/scsi/bnx2i/bnx2i.h
index ed7f3228e234..89ef1a1678d1 100644
--- a/drivers/scsi/bnx2i/bnx2i.h
+++ b/drivers/scsi/bnx2i/bnx2i.h
@@ -25,7 +25,7 @@
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
 #include <linux/delay.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/in.h>
 #include <linux/kfifo.h>
 #include <linux/netdevice.h>
diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
index 834d1212b6d5..07c08ce68d70 100644
--- a/drivers/scsi/libiscsi.c
+++ b/drivers/scsi/libiscsi.c
@@ -26,6 +26,7 @@
 #include <linux/delay.h>
 #include <linux/log2.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
 #include <linux/module.h>
 #include <asm/unaligned.h>
 #include <net/tcp.h>
diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c
index ec3b66561412..054660049395 100644
--- a/drivers/staging/android/lowmemorykiller.c
+++ b/drivers/staging/android/lowmemorykiller.c
@@ -37,7 +37,7 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/oom.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/swap.h>
 #include <linux/rcupdate.h>
 #include <linux/profile.h>
diff --git a/drivers/staging/lustre/lnet/libcfs/linux/linux-prim.c b/drivers/staging/lustre/lnet/libcfs/linux/linux-prim.c
index cf902154f0aa..bcf9f3dd0310 100644
--- a/drivers/staging/lustre/lnet/libcfs/linux/linux-prim.c
+++ b/drivers/staging/lustre/lnet/libcfs/linux/linux-prim.c
@@ -34,7 +34,7 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/fs_struct.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 
 #include "../../../include/linux/libcfs/libcfs.h"
 
diff --git a/drivers/staging/rtl8188eu/include/osdep_service.h b/drivers/staging/rtl8188eu/include/osdep_service.h
index ee3f5ee06529..9e390648d93e 100644
--- a/drivers/staging/rtl8188eu/include/osdep_service.h
+++ b/drivers/staging/rtl8188eu/include/osdep_service.h
@@ -37,7 +37,7 @@
 #include <linux/io.h>
 #include <linux/mutex.h>
 #include <linux/sem.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/etherdevice.h>
 #include <linux/wireless.h>
 #include <net/iw_handler.h>
diff --git a/drivers/staging/rtl8712/osdep_service.h b/drivers/staging/rtl8712/osdep_service.h
index b8a170978434..5d33020554cd 100644
--- a/drivers/staging/rtl8712/osdep_service.h
+++ b/drivers/staging/rtl8712/osdep_service.h
@@ -33,7 +33,7 @@
 
 #include <linux/interrupt.h>
 #include <linux/semaphore.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/sem.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
diff --git a/drivers/staging/rtl8712/rtl8712_cmd.c b/drivers/staging/rtl8712/rtl8712_cmd.c
index f19b6b27aa71..5346c657485d 100644
--- a/drivers/staging/rtl8712/rtl8712_cmd.c
+++ b/drivers/staging/rtl8712/rtl8712_cmd.c
@@ -32,6 +32,7 @@
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
 #include <linux/module.h>
 #include <linux/kref.h>
 #include <linux/netdevice.h>
diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c
index cb0b7ca36b1e..8a0d214f6e9b 100644
--- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c
+++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c
@@ -34,6 +34,7 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/sched/signal.h>
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/cdev.h>
diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_util.h b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_util.h
index 4055d4bf9f74..e63964f5a18a 100644
--- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_util.h
+++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_util.h
@@ -47,7 +47,7 @@
 #include <linux/types.h>
 #include <linux/interrupt.h>
 #include <linux/random.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/ctype.h>
 #include <linux/uaccess.h>
 #include <linux/time.h>  /* for time_t */
diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c
index da2c73a255de..fa1d578d56bd 100644
--- a/drivers/target/iscsi/iscsi_target.c
+++ b/drivers/target/iscsi/iscsi_target.c
@@ -24,6 +24,7 @@
 #include <linux/vmalloc.h>
 #include <linux/idr.h>
 #include <linux/delay.h>
+#include <linux/sched/signal.h>
 #include <asm/unaligned.h>
 #include <net/ipv6.h>
 #include <scsi/scsi_proto.h>
diff --git a/drivers/target/iscsi/iscsi_target_erl0.c b/drivers/target/iscsi/iscsi_target_erl0.c
index b54e72c7ab0f..a4d5e6749932 100644
--- a/drivers/target/iscsi/iscsi_target_erl0.c
+++ b/drivers/target/iscsi/iscsi_target_erl0.c
@@ -17,6 +17,8 @@
  * GNU General Public License for more details.
  ******************************************************************************/
 
+#include <linux/sched/signal.h>
+
 #include <scsi/iscsi_proto.h>
 #include <target/target_core_base.h>
 #include <target/target_core_fabric.h>
diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c
index eab274d17b5c..b03cc03423c1 100644
--- a/drivers/target/iscsi/iscsi_target_login.c
+++ b/drivers/target/iscsi/iscsi_target_login.c
@@ -20,6 +20,7 @@
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/kthread.h>
+#include <linux/sched/signal.h>
 #include <linux/idr.h>
 #include <linux/tcp.h>        /* TCP_NODELAY */
 #include <net/ipv6.h>         /* ipv6_addr_v4mapped() */
diff --git a/drivers/target/iscsi/iscsi_target_nego.c b/drivers/target/iscsi/iscsi_target_nego.c
index 46388c9e08da..29eb09e0cd15 100644
--- a/drivers/target/iscsi/iscsi_target_nego.c
+++ b/drivers/target/iscsi/iscsi_target_nego.c
@@ -19,6 +19,7 @@
 #include <linux/ctype.h>
 #include <linux/kthread.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
 #include <net/sock.h>
 #include <scsi/iscsi_proto.h>
 #include <target/target_core_base.h>
diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c
index a23fa5ed1d67..66b59a15780d 100644
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -12,7 +12,7 @@
 #include <linux/tty.h>
 #include <linux/tty_flip.h>
 #include <linux/fcntl.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/string.h>
 #include <linux/major.h>
 #include <linux/mm.h>
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index 71136742e606..65db0aeb3d80 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -14,7 +14,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/rt.h>
 #include <linux/interrupt.h>
 #include <linux/mm.h>
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index a1fd3f7d487a..985d33f0f315 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -69,7 +69,7 @@
 #include <linux/errno.h>
 #include <linux/signal.h>
 #include <linux/fcntl.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/interrupt.h>
 #include <linux/tty.h>
 #include <linux/tty_driver.h>
diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c
index 397e1509fe51..6b3a2c00974d 100644
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -26,7 +26,7 @@
 
 #include <linux/consolemap.h>
 #include <linux/module.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/tty.h>
 #include <linux/tty_flip.h>
 #include <linux/mm.h>
diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index 9d3ce505e7ab..5c4933bb4b53 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -72,7 +72,7 @@
 
 #include <linux/module.h>
 #include <linux/types.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/tty.h>
 #include <linux/tty_flip.h>
 #include <linux/kernel.h>
diff --git a/drivers/tty/vt/vt_ioctl.c b/drivers/tty/vt/vt_ioctl.c
index a56edf2d58eb..0cbfe1ff6f6c 100644
--- a/drivers/tty/vt/vt_ioctl.c
+++ b/drivers/tty/vt/vt_ioctl.c
@@ -10,7 +10,7 @@
 
 #include <linux/types.h>
 #include <linux/errno.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/tty.h>
 #include <linux/timer.h>
 #include <linux/kernel.h>
diff --git a/drivers/usb/atm/usbatm.c b/drivers/usb/atm/usbatm.c
index 5a59da0dc98a..3e80aa3b917a 100644
--- a/drivers/usb/atm/usbatm.c
+++ b/drivers/usb/atm/usbatm.c
@@ -74,7 +74,7 @@
 #include <linux/moduleparam.h>
 #include <linux/netdevice.h>
 #include <linux/proc_fs.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/signal.h>
 #include <linux/slab.h>
 #include <linux/stat.h>
diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index ca425e8099ea..cfc3cff6e8d5 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -36,6 +36,7 @@
 
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/signal.h>
 #include <linux/poll.h>
diff --git a/drivers/usb/gadget/function/f_mass_storage.c b/drivers/usb/gadget/function/f_mass_storage.c
index 8f3659b65f53..4c8aacc232c0 100644
--- a/drivers/usb/gadget/function/f_mass_storage.c
+++ b/drivers/usb/gadget/function/f_mass_storage.c
@@ -207,6 +207,7 @@
 #include <linux/fs.h>
 #include <linux/kref.h>
 #include <linux/kthread.h>
+#include <linux/sched/signal.h>
 #include <linux/limits.h>
 #include <linux/rwsem.h>
 #include <linux/slab.h>
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 185d50ee1b12..cf3de91fbfe7 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -21,6 +21,7 @@
 #include <linux/vfio.h>
 #include <linux/vmalloc.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
 
 #include <asm/iommu.h>
 #include <asm/tce.h>
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 7e94c7fc90ae..c26fa1f3ed86 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -31,7 +31,7 @@
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/rbtree.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
diff --git a/drivers/w1/w1_family.c b/drivers/w1/w1_family.c
index df1c9bb90eb5..2096f460498f 100644
--- a/drivers/w1/w1_family.c
+++ b/drivers/w1/w1_family.c
@@ -14,7 +14,7 @@
 
 #include <linux/spinlock.h>
 #include <linux/list.h>
-#include <linux/sched.h>	/* schedule_timeout() */
+#include <linux/sched/signal.h>
 #include <linux/delay.h>
 #include <linux/export.h>
 
diff --git a/drivers/w1/w1_int.c b/drivers/w1/w1_int.c
index 4ce1b66d5092..2cae7b29bb5f 100644
--- a/drivers/w1/w1_int.c
+++ b/drivers/w1/w1_int.c
@@ -17,6 +17,7 @@
 #include <linux/delay.h>
 #include <linux/kthread.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
 #include <linux/export.h>
 #include <linux/moduleparam.h>
 
diff --git a/fs/attr.c b/fs/attr.c
index c902b3d53508..135304146120 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -9,6 +9,7 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/string.h>
+#include <linux/sched/signal.h>
 #include <linux/capability.h>
 #include <linux/fsnotify.h>
 #include <linux/fcntl.h>
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 79fbd85db4ba..24a58bf9ca72 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -10,6 +10,7 @@
 #include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/signal.h>
+#include <linux/sched/signal.h>
 #include <linux/file.h>
 #include "autofs_i.h"
 
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 777ad9f4fc3c..9bf25be05636 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -21,6 +21,7 @@
 #include <linux/fs.h>
 #include <linux/net.h>
 #include <linux/string.h>
+#include <linux/sched/signal.h>
 #include <linux/list.h>
 #include <linux/wait.h>
 #include <linux/slab.h>
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index f6c6c8adbc01..e82357c89979 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -15,7 +15,7 @@
  */
 
 #include <linux/signal.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
diff --git a/fs/coredump.c b/fs/coredump.c
index 23a539b29225..c74ab43b8383 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -17,6 +17,7 @@
 #include <linux/binfmts.h>
 #include <linux/coredump.h>
 #include <linux/sched/coredump.h>
+#include <linux/sched/signal.h>
 #include <linux/utsname.h>
 #include <linux/pid_namespace.h>
 #include <linux/module.h>
@@ -34,7 +35,6 @@
 #include <linux/pipe_fs_i.h>
 #include <linux/oom.h>
 #include <linux/compat.h>
-#include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/path.h>
 #include <linux/timekeeping.h>
diff --git a/fs/exec.c b/fs/exec.c
index 9c80d011594e..aa228b98ad5b 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -34,6 +34,7 @@
 #include <linux/init.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/coredump.h>
+#include <linux/sched/signal.h>
 #include <linux/pagemap.h>
 #include <linux/perf_event.h>
 #include <linux/highmem.h>
diff --git a/fs/file.c b/fs/file.c
index 69d6990e3021..ad6f094f2eff 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -12,7 +12,7 @@
 #include <linux/mm.h>
 #include <linux/mmzone.h>
 #include <linux/time.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/file.h>
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 7dca743b2ce1..543ed50f0387 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -1,5 +1,5 @@
 #include <linux/export.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/fs.h>
 #include <linux/path.h>
 #include <linux/slab.h>
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index e5c1783ab64a..453a6a1fff34 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -16,7 +16,7 @@
 #include <linux/jffs2.h>
 #include <linux/mtd/mtd.h>
 #include <linux/completion.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 #include "nodelist.h"
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 7e4ea3b9f472..e7c8b9c76e48 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -17,7 +17,7 @@
 #include <linux/sysctl.h>
 #include <linux/moduleparam.h>
 
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/errno.h>
 #include <linux/in.h>
 #include <linux/uio.h>
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 7eb89c23c847..d5606099712a 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -30,6 +30,7 @@
 #include <linux/vfs.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
+#include <linux/sched/signal.h>
 #include <linux/namei.h>
 
 #include <net/sock.h>
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index 97b111d79489..bdea177aa405 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -16,6 +16,7 @@
 #include <linux/fcntl.h>
 #include <linux/stat.h>
 #include <linux/string.h>
+#include <linux/sched/signal.h>
 #include <linux/uaccess.h>
 #include <linux/in.h>
 #include <linux/net.h>
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 484bebc20bca..bb79972dc638 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -9,6 +9,7 @@
 #include <linux/completion.h>
 #include <linux/ip.h>
 #include <linux/module.h>
+#include <linux/sched/signal.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/svcsock.h>
 #include <linux/nfs_fs.h>
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index efd66da99201..786a4a2cb2d7 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -6,7 +6,7 @@
  * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
  */
 
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/freezer.h>
 #include <linux/module.h>
 #include <linux/fs_struct.h>
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 00ce1531b2f5..c330495c3115 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -1,4 +1,4 @@
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/errno.h>
 #include <linux/dcache.h>
 #include <linux/path.h>
diff --git a/fs/select.c b/fs/select.c
index 305c0daf5d67..e2112270d75a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -15,7 +15,8 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/rt.h>
 #include <linux/syscalls.h>
 #include <linux/export.h>
 #include <linux/slab.h>
@@ -26,7 +27,6 @@
 #include <linux/fs.h>
 #include <linux/rcupdate.h>
 #include <linux/hrtimer.h>
-#include <linux/sched/rt.h>
 #include <linux/freezer.h>
 #include <net/busy_poll.h>
 #include <linux/vmalloc.h>
diff --git a/include/linux/oom.h b/include/linux/oom.h
index b4e36e92bc87..8a266e2be5a6 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -2,7 +2,7 @@
 #define __INCLUDE_LINUX_OOM_H
 
 
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/types.h>
 #include <linux/nodemask.h>
 #include <uapi/linux/oom.h>
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index e0e539321ab9..422bc2e4cb6a 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -3,6 +3,7 @@
 
 #include <linux/compiler.h>		/* For unlikely.  */
 #include <linux/sched.h>		/* For struct task_struct.  */
+#include <linux/sched/signal.h>		/* For send_sig(), same_thread_group(), etc. */
 #include <linux/err.h>			/* for IS_ERR_VALUE */
 #include <linux/bug.h>			/* For BUG_ON.  */
 #include <linux/pid_namespace.h>	/* For task_active_pid_ns.  */
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
new file mode 100644
index 000000000000..da69cd05cd68
--- /dev/null
+++ b/include/linux/sched/signal.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SCHED_SIGNAL_H
+#define _LINUX_SCHED_SIGNAL_H
+
+#include <linux/sched.h>
+
+#endif /* _LINUX_SCHED_SIGNAL_H */
diff --git a/include/linux/signalfd.h b/include/linux/signalfd.h
index eadbe227c256..4985048640a7 100644
--- a/include/linux/signalfd.h
+++ b/include/linux/signalfd.h
@@ -8,7 +8,7 @@
 #define _LINUX_SIGNALFD_H
 
 #include <uapi/linux/signalfd.h>
-
+#include <linux/sched/signal.h>
 
 #ifdef CONFIG_SIGNALFD
 
diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h
index 58de6edf751f..e2a5daf8d14f 100644
--- a/include/linux/taskstats_kern.h
+++ b/include/linux/taskstats_kern.h
@@ -8,7 +8,7 @@
 #define _LINUX_TASKSTATS_KERN_H
 
 #include <linux/taskstats.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 
 #ifdef CONFIG_TASKSTATS
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 40e448de8a7e..4f7241fbeff3 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -36,6 +36,7 @@
 #include <linux/user_namespace.h>
 #include <linux/slab.h>
 #include <linux/sched/wake_q.h>
+#include <linux/sched/signal.h>
 
 #include <net/sock.h>
 #include "util.h"
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 461eb1e66a0f..7af0dcc5d755 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -13,6 +13,7 @@
 #include <linux/bpf_trace.h>
 #include <linux/syscalls.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
 #include <linux/vmalloc.h>
 #include <linux/mmzone.h>
 #include <linux/anon_inodes.h>
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 0a5f630f5c54..0aae3183b029 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -7,7 +7,7 @@
 #include <linux/smp.h>
 #include <linux/init.h>
 #include <linux/notifier.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/unistd.h>
 #include <linux/cpu.h>
 #include <linux/oom.h>
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 19d9a578c753..7510dc687c0d 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -29,6 +29,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/sched/signal.h>
 #include <linux/kgdb.h>
 #include <linux/kdb.h>
 #include <linux/serial_core.h>
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index fe15fff5df53..9b976a42376d 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -12,7 +12,7 @@
 #include <linux/ctype.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/kdb.h>
 #include <linux/nmi.h>
 #include "kdb_private.h"
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 40c07e4fa116..129247e56902 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -16,6 +16,8 @@
 #include <linux/export.h>
 #include <linux/sysctl.h>
 #include <linux/utsname.h>
+#include <linux/sched/signal.h>
+
 #include <trace/events/sched.h>
 
 /*
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index a0e90e0afc75..da128deb10ec 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -36,7 +36,7 @@
 #include <linux/spinlock.h>
 #include <linux/smp.h>
 #include <linux/interrupt.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/atomic.h>
 #include <linux/bitops.h>
 #include <linux/percpu.h>
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e2307a6c29f1..641249471952 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -5,6 +5,7 @@
 #include <linux/sched/rt.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/wake_q.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
 #include <linux/u64_stats_sync.h>
 #include <linux/sched/deadline.h>
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index a95f13c31464..f6b961c5e58c 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -10,6 +10,7 @@
 #include <linux/interrupt.h>
 #include <linux/syscalls.h>
 #include <linux/time.h>
+#include <linux/sched/signal.h>
 #include <linux/posix-timers.h>
 #include <linux/hrtimer.h>
 #include <trace/events/timer.h>
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index b4377a5e4269..a2475a9f57d8 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -2,7 +2,7 @@
  * Implement CPU time clocks for the POSIX clock interface.
  */
 
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/posix-timers.h>
 #include <linux/errno.h>
 #include <linux/math64.h>
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 4fee1c3abd0b..0f411a4690a0 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -18,7 +18,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/percpu.h>
 #include <linux/profile.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/clock.h>
 #include <linux/module.h>
 #include <linux/irq_work.h>
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 1f9a31f934a4..9f6984d52fec 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -24,7 +24,7 @@
 #include <linux/tracepoint.h>
 #include <linux/err.h>
 #include <linux/slab.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/static_key.h>
 
 extern struct tracepoint * const __start___tracepoints_ptrs[];
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 571a2d3821d8..d9a03b80b75d 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -17,7 +17,7 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
 #include <linux/tsacct_kern.h>
 #include <linux/acct.h>
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 86b7854fec8e..2f735cbe05e8 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -8,6 +8,7 @@
 #include <linux/export.h>
 #include <linux/nsproxy.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
 #include <linux/user_namespace.h>
 #include <linux/proc_ns.h>
 #include <linux/highuid.h>
diff --git a/lib/is_single_threaded.c b/lib/is_single_threaded.c
index 391fd23976a2..9745cfffcb69 100644
--- a/lib/is_single_threaded.c
+++ b/lib/is_single_threaded.c
@@ -9,8 +9,7 @@
  * as published by the Free Software Foundation; either version
  * 2 of the Licence, or (at your option) any later version.
  */
-
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 
 /*
  * Returns true if the task does not share ->mm with another thread/process.
diff --git a/mm/filemap.c b/mm/filemap.c
index 1944c631e3e6..1694623a6289 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -13,6 +13,7 @@
 #include <linux/compiler.h>
 #include <linux/dax.h>
 #include <linux/fs.h>
+#include <linux/sched/signal.h>
 #include <linux/uaccess.h>
 #include <linux/capability.h>
 #include <linux/kernel_stat.h>
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index da3436953022..2df6d3687b2a 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -73,7 +73,7 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/jiffies.h>
 #include <linux/delay.h>
 #include <linux/export.h>
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 3d0f2fd4bf73..b74984db386e 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -40,7 +40,7 @@
 #include <linux/mm.h>
 #include <linux/page-flags.h>
 #include <linux/kernel-page-flags.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/export.h>
diff --git a/mm/vmacache.c b/mm/vmacache.c
index 7c233f8e20ee..4355d34c68a6 100644
--- a/mm/vmacache.c
+++ b/mm/vmacache.c
@@ -1,7 +1,7 @@
 /*
  * Copyright (C) 2014 Davidlohr Bueso.
  */
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/mm.h>
 #include <linux/vmacache.h>
 
diff --git a/net/9p/client.c b/net/9p/client.c
index 3fc94a49ccd5..25cfd8a4bc36 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -32,7 +32,7 @@
 #include <linux/idr.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/uaccess.h>
 #include <linux/uio.h>
 #include <net/9p/9p.h>
diff --git a/net/atm/common.c b/net/atm/common.c
index a3ca922d307b..9613381f5db0 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -13,7 +13,7 @@
 #include <linux/errno.h>	/* error codes */
 #include <linux/capability.h>
 #include <linux/mm.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/time.h>		/* struct timeval */
 #include <linux/skbuff.h>
 #include <linux/bitops.h>
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 90fcf5fc2e0a..a8e42cedf1db 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -20,7 +20,7 @@
 #include <linux/socket.h>
 #include <linux/in.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/timer.h>
 #include <linux/string.h>
 #include <linux/sockios.h>
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 92cbbd2afddb..adcad344c843 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -9,7 +9,7 @@
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/module.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/spinlock.h>
 #include <linux/mutex.h>
 #include <linux/list.h>
diff --git a/net/core/stream.c b/net/core/stream.c
index f575bcf64af2..20231dbb1da0 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -13,6 +13,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/sched/signal.h>
 #include <linux/net.h>
 #include <linux/signal.h>
 #include <linux/tcp.h>
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index a90ed67027b0..e6e79eda9763 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -106,7 +106,7 @@ Version 0.0.6    2.1.110   07-aug-98   Eduardo Marcelo Serrat
 #include <linux/socket.h>
 #include <linux/in.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/timer.h>
 #include <linux/string.h>
 #include <linux/sockios.h>
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index ab254041dab7..81adc29a448d 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -46,6 +46,7 @@
 #include <linux/socket.h>
 #include <linux/sockios.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
 #include <linux/init.h>
 #include <linux/net.h>
 #include <linux/irda.h>
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index ed212ffc1d9d..4bbf4526b885 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -17,7 +17,7 @@
 #include <linux/in.h>
 #include <linux/slab.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/timer.h>
 #include <linux/string.h>
 #include <linux/sockios.h>
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 9ad301c46b88..b8a1df2c9785 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -20,7 +20,7 @@
 #include <linux/in.h>
 #include <linux/slab.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/spinlock.h>
 #include <linux/timer.h>
 #include <linux/string.h>
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 465a9c8464f9..6f0a9be50f50 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -57,6 +57,7 @@
 #include <linux/kernel.h>
 #include <linux/wait.h>
 #include <linux/time.h>
+#include <linux/sched/signal.h>
 #include <linux/ip.h>
 #include <linux/capability.h>
 #include <linux/fcntl.h>
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index b94efd93d3e4..a08aeb56b8e4 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -11,7 +11,7 @@
  */
 
 #include <linux/linkage.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/errno.h>
 #include <linux/net.h>
 #include <linux/in.h>
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e2d18b9f910f..ee37b390260a 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -85,7 +85,7 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/stat.h>
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 079c883aa96e..fd28a49dbe8f 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -41,7 +41,7 @@
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/timer.h>
 #include <linux/string.h>
 #include <linux/net.h>
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 9a8f12f8d5b7..b12f873f92ba 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -28,7 +28,7 @@
 #include <linux/kernel.h>
 #include <linux/tracehook.h>
 #include <linux/errno.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/lsm_hooks.h>
 #include <linux/xattr.h>
 #include <linux/capability.h>
-- 
cgit v1.2.3


From 8703e8a465b1e9cadc3680b4b1248f5987e54518 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 18:51:30 +0100
Subject: sched/headers: Prepare for new header dependencies before moving code
 to <linux/sched/user.h>

We are going to split <linux/sched/user.h> out of <linux/sched.h>, which
will have to be picked up from other headers and a couple of .c files.

Create a trivial placeholder <linux/sched/user.h> file that just
maps to <linux/sched.h> to make this patch obviously correct and
bisectable.

Include the new header in the files that are going to need it.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 block/ioprio.c               | 1 +
 include/linux/cred.h         | 2 +-
 include/linux/sched/user.h   | 6 ++++++
 ipc/mqueue.c                 | 1 +
 kernel/fork.c                | 1 +
 kernel/signal.c              | 1 +
 kernel/user.c                | 1 +
 mm/mlock.c                   | 1 +
 net/core/scm.c               | 1 +
 security/keys/process_keys.c | 1 +
 10 files changed, 15 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/sched/user.h

(limited to 'include/linux')

diff --git a/block/ioprio.c b/block/ioprio.c
index 3790669232ff..89c43e07787d 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -25,6 +25,7 @@
 #include <linux/ioprio.h>
 #include <linux/blkdev.h>
 #include <linux/capability.h>
+#include <linux/sched/user.h>
 #include <linux/syscalls.h>
 #include <linux/security.h>
 #include <linux/pid_namespace.h>
diff --git a/include/linux/cred.h b/include/linux/cred.h
index f0e70a1bb3ac..045d33e48069 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -18,8 +18,8 @@
 #include <linux/selinux.h>
 #include <linux/atomic.h>
 #include <linux/uidgid.h>
+#include <linux/sched/user.h>
 
-struct user_struct;
 struct cred;
 struct inode;
 
diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
new file mode 100644
index 000000000000..83238e5ddadd
--- /dev/null
+++ b/include/linux/sched/user.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SCHED_USER_H
+#define _LINUX_SCHED_USER_H
+
+#include <linux/sched.h>
+
+#endif /* _LINUX_SCHED_USER_H */
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 4f7241fbeff3..e8d41ff57241 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -37,6 +37,7 @@
 #include <linux/slab.h>
 #include <linux/sched/wake_q.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/user.h>
 
 #include <net/sock.h>
 #include "util.h"
diff --git a/kernel/fork.c b/kernel/fork.c
index 7332448b668a..a5ad1e4ab604 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -15,6 +15,7 @@
 #include <linux/sched/autogroup.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/coredump.h>
+#include <linux/sched/user.h>
 #include <linux/init.h>
 #include <linux/unistd.h>
 #include <linux/module.h>
diff --git a/kernel/signal.c b/kernel/signal.c
index bae358532d0a..762fcf64f0c3 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -14,6 +14,7 @@
 #include <linux/export.h>
 #include <linux/init.h>
 #include <linux/sched.h>
+#include <linux/sched/user.h>
 #include <linux/fs.h>
 #include <linux/tty.h>
 #include <linux/binfmts.h>
diff --git a/kernel/user.c b/kernel/user.c
index b069ccbfb0b0..00281add65b2 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/bitops.h>
 #include <linux/key.h>
+#include <linux/sched/user.h>
 #include <linux/interrupt.h>
 #include <linux/export.h>
 #include <linux/user_namespace.h>
diff --git a/mm/mlock.c b/mm/mlock.c
index cdbed8aaa426..1050511f8b2b 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -8,6 +8,7 @@
 #include <linux/capability.h>
 #include <linux/mman.h>
 #include <linux/mm.h>
+#include <linux/sched/user.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/pagemap.h>
diff --git a/net/core/scm.c b/net/core/scm.c
index b6d83686e149..b1ff8a441748 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -14,6 +14,7 @@
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/sched/user.h>
 #include <linux/mm.h>
 #include <linux/kernel.h>
 #include <linux/stat.h>
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index 918cddcd4516..b6fdd22205b1 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -12,6 +12,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/sched.h>
+#include <linux/sched/user.h>
 #include <linux/keyctl.h>
 #include <linux/fs.h>
 #include <linux/err.h>
-- 
cgit v1.2.3


From 55687da166bf51129ed6b110d7711f4c7560abe2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 18:51:31 +0100
Subject: sched/headers: Prepare for new header dependencies before moving code
 to <linux/sched/cpufreq.h>

We are going to split <linux/sched/cpufreq.h> out of <linux/sched.h>, which
will have to be picked up from other headers and a couple of .c files.

Create a trivial placeholder <linux/sched/cpufreq.h> file that just
maps to <linux/sched.h> to make this patch obviously correct and
bisectable.

Include the new header in the files that are going to need it.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/cpufreq/cpufreq_governor.c | 1 -
 drivers/cpufreq/cpufreq_governor.h | 1 +
 drivers/cpufreq/cpufreq_ondemand.c | 1 +
 drivers/cpufreq/intel_pstate.c     | 2 +-
 include/linux/sched/cpufreq.h      | 6 ++++++
 kernel/sched/sched.h               | 1 +
 6 files changed, 10 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/sched/cpufreq.h

(limited to 'include/linux')

diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index 631bd2c86c5e..47e24b5384b3 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -18,7 +18,6 @@
 
 #include <linux/export.h>
 #include <linux/kernel_stat.h>
-#include <linux/sched.h>
 #include <linux/slab.h>
 
 #include "cpufreq_governor.h"
diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h
index f5717ca070cc..0236ec2cd654 100644
--- a/drivers/cpufreq/cpufreq_governor.h
+++ b/drivers/cpufreq/cpufreq_governor.h
@@ -20,6 +20,7 @@
 #include <linux/atomic.h>
 #include <linux/irq_work.h>
 #include <linux/cpufreq.h>
+#include <linux/sched/cpufreq.h>
 #include <linux/kernel_stat.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 4a017e895296..3937acf7e026 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -16,6 +16,7 @@
 #include <linux/percpu-defs.h>
 #include <linux/slab.h>
 #include <linux/tick.h>
+#include <linux/sched/cpufreq.h>
 
 #include "cpufreq_ondemand.h"
 
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index eb0f7fb71685..8f6d29302aac 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -19,7 +19,7 @@
 #include <linux/hrtimer.h>
 #include <linux/tick.h>
 #include <linux/slab.h>
-#include <linux/sched.h>
+#include <linux/sched/cpufreq.h>
 #include <linux/list.h>
 #include <linux/cpu.h>
 #include <linux/cpufreq.h>
diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h
new file mode 100644
index 000000000000..07ed9664fa20
--- /dev/null
+++ b/include/linux/sched/cpufreq.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SCHED_CPUFREQ_H
+#define _LINUX_SCHED_CPUFREQ_H
+
+#include <linux/sched.h>
+
+#endif /* _LINUX_SCHED_CPUFREQ_H */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 641249471952..7dfeb14fa43c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -7,6 +7,7 @@
 #include <linux/sched/wake_q.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/cpufreq.h>
 #include <linux/u64_stats_sync.h>
 #include <linux/sched/deadline.h>
 #include <linux/kernel_stat.h>
-- 
cgit v1.2.3


From 6a3827d7509cbf96b7e961f8957c1f01d1bcf894 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 18:51:31 +0100
Subject: sched/headers: Prepare for new header dependencies before moving code
 to <linux/sched/numa_balancing.h>

We are going to split <linux/sched/numa_balancing.h> out of <linux/sched.h>, which
will have to be picked up from other headers and a couple of .c files.

Create a trivial placeholder <linux/sched/numa_balancing.h> file that just
maps to <linux/sched.h> to make this patch obviously correct and
bisectable.

Include the new header in the files that are going to need it.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 fs/exec.c                            | 1 +
 fs/proc/array.c                      | 1 +
 include/linux/sched/numa_balancing.h | 6 ++++++
 include/trace/events/sched.h         | 2 +-
 kernel/fork.c                        | 1 +
 kernel/sched/sched.h                 | 1 +
 mm/huge_memory.c                     | 1 +
 mm/memory.c                          | 1 +
 mm/mempolicy.c                       | 1 +
 9 files changed, 14 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/sched/numa_balancing.h

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index aa228b98ad5b..e857c7260b1f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -35,6 +35,7 @@
 #include <linux/sched/mm.h>
 #include <linux/sched/coredump.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/numa_balancing.h>
 #include <linux/pagemap.h>
 #include <linux/perf_event.h>
 #include <linux/highmem.h>
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 61bea3a4ecc5..cfe7f934a300 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -61,6 +61,7 @@
 #include <linux/string.h>
 #include <linux/mman.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/numa_balancing.h>
 #include <linux/proc_fs.h>
 #include <linux/ioport.h>
 #include <linux/uaccess.h>
diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
new file mode 100644
index 000000000000..999182279f78
--- /dev/null
+++ b/include/linux/sched/numa_balancing.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SCHED_NUMA_BALANCING_H
+#define _LINUX_SCHED_NUMA_BALANCING_H
+
+#include <linux/sched.h>
+
+#endif /* _LINUX_SCHED_NUMA_BALANCING_H */
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 9b90c57517a9..9e3ef6c99e4b 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -4,7 +4,7 @@
 #if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_SCHED_H
 
-#include <linux/sched.h>
+#include <linux/sched/numa_balancing.h>
 #include <linux/tracepoint.h>
 #include <linux/binfmts.h>
 
diff --git a/kernel/fork.c b/kernel/fork.c
index a5ad1e4ab604..08d6c091ac7c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -16,6 +16,7 @@
 #include <linux/sched/mm.h>
 #include <linux/sched/coredump.h>
 #include <linux/sched/user.h>
+#include <linux/sched/numa_balancing.h>
 #include <linux/init.h>
 #include <linux/unistd.h>
 #include <linux/module.h>
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7dfeb14fa43c..7e8ce0347fbf 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -6,6 +6,7 @@
 #include <linux/sched/clock.h>
 #include <linux/sched/wake_q.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/numa_balancing.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/cpufreq.h>
 #include <linux/u64_stats_sync.h>
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index dee92fa26bbb..d36b2af4d1bf 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -10,6 +10,7 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/sched/coredump.h>
+#include <linux/sched/numa_balancing.h>
 #include <linux/highmem.h>
 #include <linux/hugetlb.h>
 #include <linux/mmu_notifier.h>
diff --git a/mm/memory.c b/mm/memory.c
index 6fc918b2c459..d4994a28dc85 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -42,6 +42,7 @@
 #include <linux/mm.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/coredump.h>
+#include <linux/sched/numa_balancing.h>
 #include <linux/hugetlb.h>
 #include <linux/mman.h>
 #include <linux/swap.h>
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 90892416ed75..18e0105810df 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -74,6 +74,7 @@
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/numa_balancing.h>
 #include <linux/nodemask.h>
 #include <linux/cpuset.h>
 #include <linux/slab.h>
-- 
cgit v1.2.3


From 1e4bae6468375f81a3d4bdf436deaa9ea294e12b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 18:51:32 +0100
Subject: sched/headers: Prepare for new header dependencies before moving code
 to <linux/sched/jobctl.h>

We are going to split <linux/sched/jobctl.h> out of <linux/sched.h>, which
will have to be picked up from other headers and a couple of .c files.

Create a trivial placeholder <linux/sched/jobctl.h> file that just
maps to <linux/sched.h> to make this patch obviously correct and
bisectable.

Include the new header in the file that is going to need it.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/jobctl.h | 4 ++++
 include/linux/sched/signal.h | 1 +
 2 files changed, 5 insertions(+)
 create mode 100644 include/linux/sched/jobctl.h

(limited to 'include/linux')

diff --git a/include/linux/sched/jobctl.h b/include/linux/sched/jobctl.h
new file mode 100644
index 000000000000..228e4644937b
--- /dev/null
+++ b/include/linux/sched/jobctl.h
@@ -0,0 +1,4 @@
+#ifndef _LINUX_SCHED_JOBCTL_H
+#define _LINUX_SCHED_JOBCTL_H
+
+#endif /* _LINUX_SCHED_JOBCTL_H */
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index da69cd05cd68..ce93c4a02b79 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -2,5 +2,6 @@
 #define _LINUX_SCHED_SIGNAL_H
 
 #include <linux/sched.h>
+#include <linux/sched/jobctl.h>
 
 #endif /* _LINUX_SCHED_SIGNAL_H */
-- 
cgit v1.2.3


From 5b825c3af1d8a0af4deb4a5eb349d0d0050c62e5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 2 Feb 2017 17:54:15 +0100
Subject: sched/headers: Prepare to remove <linux/cred.h> inclusion from
 <linux/sched.h>

Add #include <linux/cred.h> dependencies to all .c files rely on sched.h
doing that for them.

Note that even if the count where we need to add extra headers seems high,
it's still a net win, because <linux/sched.h> is included in over
2,200 files ...

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arm/kernel/sys_oabi-compat.c                     | 1 +
 arch/mips/kernel/mips-mt-fpaff.c                      | 1 +
 arch/x86/include/asm/intel_rdt.h                      | 1 +
 arch/x86/kernel/cpu/intel_cacheinfo.c                 | 1 +
 block/ioprio.c                                        | 1 +
 drivers/misc/eeprom/eeprom.c                          | 1 +
 drivers/misc/lkdtm_heap.c                             | 1 +
 drivers/misc/vmw_vmci/vmci_context.c                  | 1 +
 drivers/misc/vmw_vmci/vmci_host.c                     | 1 +
 drivers/staging/lustre/lustre/include/lustre_compat.h | 1 +
 drivers/staging/lustre/lustre/ptlrpc/sec.c            | 1 +
 drivers/xen/balloon.c                                 | 1 +
 fs/9p/v9fs.c                                          | 1 +
 fs/affs/inode.c                                       | 1 +
 fs/affs/super.c                                       | 1 +
 fs/autofs4/dev-ioctl.c                                | 1 +
 fs/befs/linuxvfs.c                                    | 1 +
 fs/binfmt_elf.c                                       | 1 +
 fs/cachefiles/internal.h                              | 1 +
 fs/compat.c                                           | 1 +
 fs/exportfs/expfs.c                                   | 1 +
 fs/ext2/balloc.c                                      | 1 +
 fs/ext4/ialloc.c                                      | 2 ++
 fs/file_table.c                                       | 1 +
 fs/gfs2/inode.c                                       | 1 +
 fs/gfs2/sys.c                                         | 1 +
 fs/hfs/inode.c                                        | 1 +
 fs/hfsplus/inode.c                                    | 1 +
 fs/isofs/inode.c                                      | 1 +
 fs/jffs2/fs.c                                         | 1 +
 fs/libfs.c                                            | 1 +
 fs/namespace.c                                        | 1 +
 fs/ncpfs/ioctl.c                                      | 1 +
 fs/notify/fanotify/fanotify.c                         | 1 +
 fs/notify/inotify/inotify_fsnotify.c                  | 1 +
 fs/omfs/inode.c                                       | 1 +
 fs/overlayfs/copy_up.c                                | 1 +
 fs/overlayfs/inode.c                                  | 1 +
 fs/overlayfs/namei.c                                  | 1 +
 fs/overlayfs/super.c                                  | 1 +
 fs/overlayfs/util.c                                   | 1 +
 fs/posix_acl.c                                        | 1 +
 fs/proc/proc_sysctl.c                                 | 1 +
 fs/proc/root.c                                        | 1 +
 fs/quota/dquot.c                                      | 1 +
 fs/stat.c                                             | 1 +
 fs/xfs/xfs_ioctl.c                                    | 1 +
 include/linux/cred.h                                  | 1 +
 include/linux/sched/signal.h                          | 1 +
 include/linux/wait.h                                  | 1 +
 include/net/scm.h                                     | 1 +
 include/rdma/ib.h                                     | 1 +
 ipc/namespace.c                                       | 1 +
 kernel/pid_namespace.c                                | 1 +
 kernel/ucount.c                                       | 1 +
 kernel/uid16.c                                        | 1 +
 kernel/utsname.c                                      | 1 +
 mm/usercopy.c                                         | 1 +
 net/dns_resolver/dns_query.c                          | 2 ++
 net/netfilter/xt_owner.c                              | 2 ++
 net/sunrpc/auth.c                                     | 1 +
 security/apparmor/policy.c                            | 1 +
 security/keys/internal.h                              | 1 +
 security/keys/keyctl.c                                | 1 +
 security/keys/persistent.c                            | 2 ++
 65 files changed, 69 insertions(+)

(limited to 'include/linux')

diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c
index 5f221acd21ae..b9786f491873 100644
--- a/arch/arm/kernel/sys_oabi-compat.c
+++ b/arch/arm/kernel/sys_oabi-compat.c
@@ -76,6 +76,7 @@
 #include <linux/syscalls.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
+#include <linux/cred.h>
 #include <linux/fcntl.h>
 #include <linux/eventpoll.h>
 #include <linux/sem.h>
diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c
index 1a0a3b4ecc3e..ffc6bd3464d9 100644
--- a/arch/mips/kernel/mips-mt-fpaff.c
+++ b/arch/mips/kernel/mips-mt-fpaff.c
@@ -9,6 +9,7 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/sched.h>
+#include <linux/cred.h>
 #include <linux/security.h>
 #include <linux/types.h>
 #include <linux/uaccess.h>
diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
index 95ce5c85b009..0d64397cee58 100644
--- a/arch/x86/include/asm/intel_rdt.h
+++ b/arch/x86/include/asm/intel_rdt.h
@@ -3,6 +3,7 @@
 
 #ifdef CONFIG_INTEL_RDT_A
 
+#include <linux/sched.h>
 #include <linux/kernfs.h>
 #include <linux/jump_label.h>
 
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 0282b0df004a..c55fb2cb2acc 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -11,6 +11,7 @@
 #include <linux/cacheinfo.h>
 #include <linux/cpu.h>
 #include <linux/sched.h>
+#include <linux/capability.h>
 #include <linux/sysfs.h>
 #include <linux/pci.h>
 
diff --git a/block/ioprio.c b/block/ioprio.c
index 89c43e07787d..4b9ab8367dda 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -23,6 +23,7 @@
 #include <linux/kernel.h>
 #include <linux/export.h>
 #include <linux/ioprio.h>
+#include <linux/cred.h>
 #include <linux/blkdev.h>
 #include <linux/capability.h>
 #include <linux/sched/user.h>
diff --git a/drivers/misc/eeprom/eeprom.c b/drivers/misc/eeprom/eeprom.c
index 3d1d55157e5f..2fad790db3bf 100644
--- a/drivers/misc/eeprom/eeprom.c
+++ b/drivers/misc/eeprom/eeprom.c
@@ -19,6 +19,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/device.h>
+#include <linux/capability.h>
 #include <linux/jiffies.h>
 #include <linux/i2c.h>
 #include <linux/mutex.h>
diff --git a/drivers/misc/lkdtm_heap.c b/drivers/misc/lkdtm_heap.c
index 0f1581664c1c..ffb6aeac07b3 100644
--- a/drivers/misc/lkdtm_heap.c
+++ b/drivers/misc/lkdtm_heap.c
@@ -4,6 +4,7 @@
  */
 #include "lkdtm.h"
 #include <linux/slab.h>
+#include <linux/sched.h>
 
 /*
  * This tries to stay within the next largest power-of-2 kmalloc cache
diff --git a/drivers/misc/vmw_vmci/vmci_context.c b/drivers/misc/vmw_vmci/vmci_context.c
index f35f0c8606b9..21d0fa592145 100644
--- a/drivers/misc/vmw_vmci/vmci_context.c
+++ b/drivers/misc/vmw_vmci/vmci_context.c
@@ -19,6 +19,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/cred.h>
 #include <linux/slab.h>
 
 #include "vmci_queue_pair.h"
diff --git a/drivers/misc/vmw_vmci/vmci_host.c b/drivers/misc/vmw_vmci/vmci_host.c
index ec090105eb4b..8a16a26e9658 100644
--- a/drivers/misc/vmw_vmci/vmci_host.c
+++ b/drivers/misc/vmw_vmci/vmci_host.c
@@ -24,6 +24,7 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/sched.h>
+#include <linux/cred.h>
 #include <linux/slab.h>
 #include <linux/file.h>
 #include <linux/init.h>
diff --git a/drivers/staging/lustre/lustre/include/lustre_compat.h b/drivers/staging/lustre/lustre/include/lustre_compat.h
index 300e96fb032a..da9ce195c52e 100644
--- a/drivers/staging/lustre/lustre/include/lustre_compat.h
+++ b/drivers/staging/lustre/lustre/include/lustre_compat.h
@@ -35,6 +35,7 @@
 
 #include <linux/fs_struct.h>
 #include <linux/namei.h>
+#include <linux/cred.h>
 
 #include "lustre_patchless_compat.h"
 
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec.c b/drivers/staging/lustre/lustre/ptlrpc/sec.c
index e860df7c45a2..49f34fd655c3 100644
--- a/drivers/staging/lustre/lustre/ptlrpc/sec.c
+++ b/drivers/staging/lustre/lustre/ptlrpc/sec.c
@@ -38,6 +38,7 @@
 
 #include "../../include/linux/libcfs/libcfs.h"
 #include <linux/crypto.h>
+#include <linux/cred.h>
 #include <linux/key.h>
 
 #include "../include/obd.h"
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index db107fa50ca1..a6d4378eb8d9 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -41,6 +41,7 @@
 #include <linux/cpu.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/cred.h>
 #include <linux/errno.h>
 #include <linux/mm.h>
 #include <linux/bootmem.h>
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 072e7599583a..a89f3cfe3c7d 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -29,6 +29,7 @@
 #include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
+#include <linux/cred.h>
 #include <linux/parser.h>
 #include <linux/idr.h>
 #include <linux/slab.h>
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index a5e6097eb5a9..abcc59899229 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -10,6 +10,7 @@
  *  (C) 1991  Linus Torvalds - minix filesystem
  */
 #include <linux/sched.h>
+#include <linux/cred.h>
 #include <linux/gfp.h>
 #include "affs.h"
 
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 37532538e8ab..c2c27a8f128e 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -16,6 +16,7 @@
 #include <linux/parser.h>
 #include <linux/magic.h>
 #include <linux/sched.h>
+#include <linux/cred.h>
 #include <linux/slab.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 806df746f1a9..734cbf8d9676 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -17,6 +17,7 @@
 #include <linux/file.h>
 #include <linux/fdtable.h>
 #include <linux/sched.h>
+#include <linux/cred.h>
 #include <linux/compat.h>
 #include <linux/syscalls.h>
 #include <linux/magic.h>
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 19407165f4aa..c500e954debb 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -18,6 +18,7 @@
 #include <linux/parser.h>
 #include <linux/namei.h>
 #include <linux/sched.h>
+#include <linux/cred.h>
 #include <linux/exportfs.h>
 
 #include "befs.h"
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index fbbe52e1250e..2c95257fa4da 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -36,6 +36,7 @@
 #include <linux/coredump.h>
 #include <linux/sched.h>
 #include <linux/sched/coredump.h>
+#include <linux/cred.h>
 #include <linux/dax.h>
 #include <linux/uaccess.h>
 #include <asm/param.h>
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index cd1effee8a49..9bf90bcc56ac 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -19,6 +19,7 @@
 #include <linux/fscache-cache.h>
 #include <linux/timer.h>
 #include <linux/wait.h>
+#include <linux/cred.h>
 #include <linux/workqueue.h>
 #include <linux/security.h>
 
diff --git a/fs/compat.c b/fs/compat.c
index e50a2114f474..c61b506f5bc9 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -21,6 +21,7 @@
 #include <linux/compat.h>
 #include <linux/errno.h>
 #include <linux/time.h>
+#include <linux/cred.h>
 #include <linux/fs.h>
 #include <linux/fcntl.h>
 #include <linux/namei.h>
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index a4b531be9168..9ec1038f937e 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -15,6 +15,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/sched.h>
+#include <linux/cred.h>
 
 #define dprintk(fmt, args...) do{}while(0)
 
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 4c40c0786e16..d0bdb74f0e15 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -15,6 +15,7 @@
 #include <linux/quotaops.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/cred.h>
 #include <linux/buffer_head.h>
 #include <linux/capability.h>
 
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index b14bae2598bc..17bc043308f3 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -21,6 +21,8 @@
 #include <linux/random.h>
 #include <linux/bitops.h>
 #include <linux/blkdev.h>
+#include <linux/cred.h>
+
 #include <asm/byteorder.h>
 
 #include "ext4.h"
diff --git a/fs/file_table.c b/fs/file_table.c
index 6d982b57de92..954d510b765a 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/security.h>
+#include <linux/cred.h>
 #include <linux/eventpoll.h>
 #include <linux/rcupdate.h>
 #include <linux/mount.h>
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index eb7724b8578a..9d28f55fbd1d 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -13,6 +13,7 @@
 #include <linux/buffer_head.h>
 #include <linux/namei.h>
 #include <linux/mm.h>
+#include <linux/cred.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/gfs2_ondisk.h>
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index f8d30e41d1d3..7a515345610c 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -10,6 +10,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/sched.h>
+#include <linux/cred.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index f776acf2378a..bfbba799430f 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -14,6 +14,7 @@
 #include <linux/pagemap.h>
 #include <linux/mpage.h>
 #include <linux/sched.h>
+#include <linux/cred.h>
 #include <linux/uio.h>
 #include <linux/xattr.h>
 
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 2e796f8302ff..e8638d528195 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -14,6 +14,7 @@
 #include <linux/pagemap.h>
 #include <linux/mpage.h>
 #include <linux/sched.h>
+#include <linux/cred.h>
 #include <linux/uio.h>
 
 #include "hfsplus_fs.h"
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 871c8b392099..020ba0936146 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 
 #include <linux/slab.h>
+#include <linux/cred.h>
 #include <linux/nls.h>
 #include <linux/ctype.h>
 #include <linux/statfs.h>
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 567653f7c0ce..76fa814df3d1 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -15,6 +15,7 @@
 #include <linux/capability.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/cred.h>
 #include <linux/fs.h>
 #include <linux/list.h>
 #include <linux/mtd/mtd.h>
diff --git a/fs/libfs.c b/fs/libfs.c
index 28d6f35feed6..217896ca4fae 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -7,6 +7,7 @@
 #include <linux/export.h>
 #include <linux/pagemap.h>
 #include <linux/slab.h>
+#include <linux/cred.h>
 #include <linux/mount.h>
 #include <linux/vfs.h>
 #include <linux/quotaops.h>
diff --git a/fs/namespace.c b/fs/namespace.c
index 8bfad42c1ccf..131cd7b94f47 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -15,6 +15,7 @@
 #include <linux/user_namespace.h>
 #include <linux/namei.h>
 #include <linux/security.h>
+#include <linux/cred.h>
 #include <linux/idr.h>
 #include <linux/init.h>		/* init_rootfs */
 #include <linux/fs_struct.h>	/* get_fs_root et.al. */
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 4434e4977cf3..12550c2320cc 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -19,6 +19,7 @@
 #include <linux/highuid.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
+#include <linux/cred.h>
 
 #include <linux/uaccess.h>
 
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index a4c46221755e..e5f7e47de68e 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -6,6 +6,7 @@
 #include <linux/kernel.h> /* UINT_MAX */
 #include <linux/mount.h>
 #include <linux/sched.h>
+#include <linux/sched/user.h>
 #include <linux/types.h>
 #include <linux/wait.h>
 
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index f36c29398de3..1aeb837ae414 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -30,6 +30,7 @@
 #include <linux/slab.h> /* kmem_* */
 #include <linux/types.h>
 #include <linux/sched.h>
+#include <linux/sched/user.h>
 
 #include "inotify.h"
 
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index df7ea8543a2e..8c9034ee7383 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -8,6 +8,7 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/vfs.h>
+#include <linux/cred.h>
 #include <linux/parser.h>
 #include <linux/buffer_head.h>
 #include <linux/vmalloc.h>
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index f57043dace62..53d3f830358f 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -16,6 +16,7 @@
 #include <linux/security.h>
 #include <linux/uaccess.h>
 #include <linux/sched.h>
+#include <linux/cred.h>
 #include <linux/namei.h>
 #include <linux/fdtable.h>
 #include <linux/ratelimit.h>
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 08643ac44a02..6639f487f835 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -9,6 +9,7 @@
 
 #include <linux/fs.h>
 #include <linux/slab.h>
+#include <linux/cred.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
 #include "overlayfs.h"
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 023bb0b03352..b8b077821fb0 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/cred.h>
 #include <linux/namei.h>
 #include <linux/xattr.h>
 #include <linux/ratelimit.h>
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 20f48abbb82f..9aa37c2f7f7d 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -7,6 +7,7 @@
  * the Free Software Foundation.
  */
 
+#include <uapi/linux/magic.h>
 #include <linux/fs.h>
 #include <linux/namei.h>
 #include <linux/xattr.h>
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 952286f4826c..9dc1c0af586b 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -10,6 +10,7 @@
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/slab.h>
+#include <linux/cred.h>
 #include <linux/xattr.h>
 #include "overlayfs.h"
 #include "ovl_entry.h"
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index c9d48dc78495..eebf5f6cf6d5 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -15,6 +15,7 @@
 #include <linux/atomic.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
+#include <linux/cred.h>
 #include <linux/posix_acl.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/xattr.h>
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 3e64c6502dc8..3d203b1f5a02 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -8,6 +8,7 @@
 #include <linux/printk.h>
 #include <linux/security.h>
 #include <linux/sched.h>
+#include <linux/cred.h>
 #include <linux/namei.h>
 #include <linux/mm.h>
 #include <linux/module.h>
diff --git a/fs/proc/root.c b/fs/proc/root.c
index b90da888b81a..5d5fed20bfff 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -20,6 +20,7 @@
 #include <linux/mount.h>
 #include <linux/pid_namespace.h>
 #include <linux/parser.h>
+#include <linux/cred.h>
 
 #include "internal.h"
 
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 406fed92362a..74b489e3714d 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -72,6 +72,7 @@
 #include <linux/proc_fs.h>
 #include <linux/security.h>
 #include <linux/sched.h>
+#include <linux/cred.h>
 #include <linux/kmod.h>
 #include <linux/namei.h>
 #include <linux/capability.h>
diff --git a/fs/stat.c b/fs/stat.c
index 3f14d1ef0868..95bd41762770 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -12,6 +12,7 @@
 #include <linux/fs.h>
 #include <linux/namei.h>
 #include <linux/security.h>
+#include <linux/cred.h>
 #include <linux/syscalls.h>
 #include <linux/pagemap.h>
 
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index cf1363dbf32b..2fd7fdf5438f 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -43,6 +43,7 @@
 #include "xfs_acl.h"
 
 #include <linux/capability.h>
+#include <linux/cred.h>
 #include <linux/dcache.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 045d33e48069..b03e7d049a64 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -18,6 +18,7 @@
 #include <linux/selinux.h>
 #include <linux/atomic.h>
 #include <linux/uidgid.h>
+#include <linux/sched.h>
 #include <linux/sched/user.h>
 
 struct cred;
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index ce93c4a02b79..0f4e9f4a43fd 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -1,6 +1,7 @@
 #ifndef _LINUX_SCHED_SIGNAL_H
 #define _LINUX_SCHED_SIGNAL_H
 
+#include <linux/cred.h>
 #include <linux/sched.h>
 #include <linux/sched/jobctl.h>
 
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 1421132e9086..aacb1282d19a 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -6,6 +6,7 @@
 #include <linux/list.h>
 #include <linux/stddef.h>
 #include <linux/spinlock.h>
+
 #include <asm/current.h>
 #include <uapi/linux/wait.h>
 
diff --git a/include/net/scm.h b/include/net/scm.h
index 59fa93c01d2a..142ea9e7a6d0 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -3,6 +3,7 @@
 
 #include <linux/limits.h>
 #include <linux/net.h>
+#include <linux/cred.h>
 #include <linux/security.h>
 #include <linux/pid.h>
 #include <linux/nsproxy.h>
diff --git a/include/rdma/ib.h b/include/rdma/ib.h
index a6b93706b0fc..9b4c22a36931 100644
--- a/include/rdma/ib.h
+++ b/include/rdma/ib.h
@@ -35,6 +35,7 @@
 
 #include <linux/types.h>
 #include <linux/sched.h>
+#include <linux/cred.h>
 
 struct ib_addr {
 	union {
diff --git a/ipc/namespace.c b/ipc/namespace.c
index 0abdea496493..1f1d713ac19c 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -9,6 +9,7 @@
 #include <linux/rcupdate.h>
 #include <linux/nsproxy.h>
 #include <linux/slab.h>
+#include <linux/cred.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/user_namespace.h>
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index eef2ce968636..25314c96fbe6 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -12,6 +12,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/user_namespace.h>
 #include <linux/syscalls.h>
+#include <linux/cred.h>
 #include <linux/err.h>
 #include <linux/acct.h>
 #include <linux/slab.h>
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 8a11fc0cb459..62630a40ab3a 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -8,6 +8,7 @@
 #include <linux/stat.h>
 #include <linux/sysctl.h>
 #include <linux/slab.h>
+#include <linux/cred.h>
 #include <linux/hash.h>
 #include <linux/user_namespace.h>
 
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 71645ae9303a..5c2dc5b2bf4f 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -12,6 +12,7 @@
 #include <linux/init.h>
 #include <linux/highuid.h>
 #include <linux/security.h>
+#include <linux/cred.h>
 #include <linux/syscalls.h>
 
 #include <linux/uaccess.h>
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 6976cd47dcf6..06585ad296ff 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,6 +14,7 @@
 #include <linux/utsname.h>
 #include <linux/err.h>
 #include <linux/slab.h>
+#include <linux/cred.h>
 #include <linux/user_namespace.h>
 #include <linux/proc_ns.h>
 
diff --git a/mm/usercopy.c b/mm/usercopy.c
index 8345299e3e3b..7ccad05c0d5c 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -16,6 +16,7 @@
 
 #include <linux/mm.h>
 #include <linux/slab.h>
+#include <linux/sched.h>
 #include <asm/sections.h>
 
 enum {
diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c
index ecc28cff08ab..ab9fec00a788 100644
--- a/net/dns_resolver/dns_query.c
+++ b/net/dns_resolver/dns_query.c
@@ -37,8 +37,10 @@
 
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/cred.h>
 #include <linux/dns_resolver.h>
 #include <linux/err.h>
+
 #include <keys/dns_resolver-type.h>
 #include <keys/user-type.h>
 
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index 16477df45b3b..3d705c688a27 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -13,6 +13,8 @@
 #include <linux/module.h>
 #include <linux/skbuff.h>
 #include <linux/file.h>
+#include <linux/cred.h>
+
 #include <net/sock.h>
 #include <net/inet_sock.h>
 #include <linux/netfilter/x_tables.h>
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index a1ee933e3029..d2623b9f23d6 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -8,6 +8,7 @@
 
 #include <linux/types.h>
 #include <linux/sched.h>
+#include <linux/cred.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/errno.h>
diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c
index f44312a19522..462c5d36b871 100644
--- a/security/apparmor/policy.c
+++ b/security/apparmor/policy.c
@@ -76,6 +76,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/string.h>
+#include <linux/cred.h>
 #include <linux/user_namespace.h>
 
 #include "include/apparmor.h"
diff --git a/security/keys/internal.h b/security/keys/internal.h
index a705a7d92ad7..a2f4c0abb8d8 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -13,6 +13,7 @@
 #define _INTERNAL_H
 
 #include <linux/sched.h>
+#include <linux/cred.h>
 #include <linux/key-type.h>
 #include <linux/task_work.h>
 #include <linux/keyctl.h>
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 04a764f71ec8..bcb0b597c391 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -18,6 +18,7 @@
 #include <linux/keyctl.h>
 #include <linux/fs.h>
 #include <linux/capability.h>
+#include <linux/cred.h>
 #include <linux/string.h>
 #include <linux/err.h>
 #include <linux/vmalloc.h>
diff --git a/security/keys/persistent.c b/security/keys/persistent.c
index 1edc1f0a0ce2..d0cb5b32eff7 100644
--- a/security/keys/persistent.c
+++ b/security/keys/persistent.c
@@ -10,6 +10,8 @@
  */
 
 #include <linux/user_namespace.h>
+#include <linux/cred.h>
+
 #include "internal.h"
 
 unsigned persistent_keyring_expiry = 3 * 24 * 3600; /* Expire after 3 days of non-use */
-- 
cgit v1.2.3


From 037741a6d4ab2e4b0580dae97aca963d8a8dc68f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 10:08:30 +0100
Subject: sched/headers: Prepare for the removal of <linux/rtmutex.h> from
 <linux/sched.h>

Fix up missing #includes in other places that rely on sched.h doing that for them.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/i2c.h          | 1 +
 kernel/fork.c                | 1 +
 kernel/locking/locktorture.c | 1 +
 kernel/rcu/tree.h            | 1 +
 4 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index bed8fbb45f31..6b183521c616 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -30,6 +30,7 @@
 #include <linux/device.h>	/* for struct device */
 #include <linux/sched.h>	/* for completion */
 #include <linux/mutex.h>
+#include <linux/rtmutex.h>
 #include <linux/irqdomain.h>		/* for Host Notify IRQ */
 #include <linux/of.h>		/* for struct device_node */
 #include <linux/swab.h>		/* for swab16 */
diff --git a/kernel/fork.c b/kernel/fork.c
index 08d6c091ac7c..8fbe8bcd1e20 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -17,6 +17,7 @@
 #include <linux/sched/coredump.h>
 #include <linux/sched/user.h>
 #include <linux/sched/numa_balancing.h>
+#include <linux/rtmutex.h>
 #include <linux/init.h>
 #include <linux/unistd.h>
 #include <linux/module.h>
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 5ea0a8969ee2..f24582d4dad3 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -33,6 +33,7 @@
 #include <linux/interrupt.h>
 #include <linux/sched.h>
 #include <uapi/linux/sched/types.h>
+#include <linux/rtmutex.h>
 #include <linux/atomic.h>
 #include <linux/moduleparam.h>
 #include <linux/delay.h>
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index b60f2b6caa14..ec62a05bfdb3 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -24,6 +24,7 @@
 
 #include <linux/cache.h>
 #include <linux/spinlock.h>
+#include <linux/rtmutex.h>
 #include <linux/threads.h>
 #include <linux/cpumask.h>
 #include <linux/seqlock.h>
-- 
cgit v1.2.3


From cc5efc2323a89dcf1a02c17b9b9f255c5a6e0492 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 10:06:45 +0100
Subject: sched/headers: Prepare for the removal of various unrelated headers
 from <linux/sched.h>

We are going to remove the following header inclusions from <linux/sched.h>:

	#include <asm/param.h>
	#include <linux/threads.h>
	#include <linux/kernel.h>
	#include <linux/types.h>
	#include <linux/timex.h>
	#include <linux/jiffies.h>
	#include <linux/rbtree.h>
	#include <linux/thread_info.h>
	#include <linux/cpumask.h>
	#include <linux/errno.h>
	#include <linux/nodemask.h>
	#include <linux/preempt.h>
	#include <asm/page.h>
	#include <linux/smp.h>
	#include <linux/compiler.h>
	#include <linux/completion.h>
	#include <linux/percpu.h>
	#include <linux/topology.h>
	#include <linux/rcupdate.h>
	#include <linux/time.h>
	#include <linux/timer.h>
	#include <linux/llist.h>
	#include <linux/uidgid.h>
	#include <asm/processor.h>

Fix up a single .h file that got hold of <linux/sysctl.h> via one of these headers.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/user_namespace.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 363e0e8082a9..08264641b502 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -5,6 +5,7 @@
 #include <linux/nsproxy.h>
 #include <linux/ns_common.h>
 #include <linux/sched.h>
+#include <linux/sysctl.h>
 #include <linux/err.h>
 
 #define UID_GID_MAP_MAX_EXTENTS 5
-- 
cgit v1.2.3


From b12fb7f46af7d548503d75be59f493f958c6d1b3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 18:51:33 +0100
Subject: sched/headers: Prepare for new header dependencies before moving code
 to <linux/sched/xacct.h>

We are going to split <linux/sched/xacct.h> out of <linux/sched.h>, which
will have to be picked up from other headers and a couple of .c files.

Create a trivial placeholder <linux/sched/xacct.h> file that just
maps to <linux/sched.h> to make this patch obviously correct and
bisectable.

Include the new header in the .c file that is going to need it.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 fs/read_write.c             | 3 ++-
 include/linux/sched/xacct.h | 6 ++++++
 2 files changed, 8 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/sched/xacct.h

(limited to 'include/linux')

diff --git a/fs/read_write.c b/fs/read_write.c
index 5816d4c4cab0..dc60075653a8 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -4,8 +4,9 @@
  *  Copyright (C) 1991, 1992  Linus Torvalds
  */
 
-#include <linux/slab.h> 
+#include <linux/slab.h>
 #include <linux/stat.h>
+#include <linux/sched/xacct.h>
 #include <linux/fcntl.h>
 #include <linux/file.h>
 #include <linux/uio.h>
diff --git a/include/linux/sched/xacct.h b/include/linux/sched/xacct.h
new file mode 100644
index 000000000000..890f7ce5cd27
--- /dev/null
+++ b/include/linux/sched/xacct.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SCHED_XACCT_H
+#define _LINUX_SCHED_XACCT_H
+
+#include <linux/sched.h>
+
+#endif /* _LINUX_SCHED_XACCT_H */
-- 
cgit v1.2.3


From 174cd4b1e5fbd0d74c68cf3a74f5bd4923485512 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 2 Feb 2017 19:15:33 +0100
Subject: sched/headers: Prepare to move signal wakeup & sigpending methods
 from <linux/sched.h> into <linux/sched/signal.h>

Fix up affected files that include this signal functionality via sched.h.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arm/common/bL_switcher.c                      |  2 +-
 arch/cris/arch-v10/drivers/sync_serial.c           |  2 +-
 arch/cris/arch-v32/drivers/sync_serial.c           |  2 +-
 arch/mips/kernel/rtlx.c                            |  2 ++
 arch/mips/kvm/mips.c                               |  2 ++
 arch/powerpc/kvm/book3s_hv.c                       |  2 +-
 arch/powerpc/kvm/powerpc.c                         |  1 +
 arch/powerpc/platforms/83xx/suspend.c              |  1 +
 arch/powerpc/platforms/cell/spufs/sched.c          |  2 +-
 arch/powerpc/platforms/cell/spufs/spufs.h          |  1 +
 arch/s390/crypto/prng.c                            |  2 ++
 arch/s390/kvm/kvm-s390.c                           |  2 ++
 arch/s390/kvm/vsie.c                               |  2 ++
 arch/sh/kernel/cpu/fpu.c                           |  2 +-
 arch/sh/kernel/disassemble.c                       |  2 ++
 arch/sh/kernel/process.c                           |  2 +-
 arch/um/drivers/random.c                           |  2 +-
 arch/x86/kernel/apm_32.c                           |  2 +-
 block/blk-cgroup.c                                 |  1 +
 block/blk-mq.c                                     |  1 +
 crypto/algboss.c                                   |  2 +-
 crypto/algif_aead.c                                |  1 +
 crypto/algif_skcipher.c                            |  1 +
 crypto/api.c                                       |  2 +-
 drivers/atm/horizon.c                              |  1 +
 drivers/base/core.c                                |  1 +
 drivers/base/power/wakeup.c                        |  2 +-
 drivers/block/drbd/drbd_main.c                     |  1 +
 drivers/block/drbd/drbd_receiver.c                 |  1 +
 drivers/block/drbd/drbd_worker.c                   |  2 +-
 drivers/block/swim3.c                              |  2 +-
 drivers/char/applicom.c                            |  2 +-
 drivers/char/hpet.c                                |  1 +
 drivers/char/hw_random/core.c                      |  1 +
 drivers/char/ipmi/ipmi_watchdog.c                  |  1 +
 drivers/char/lp.c                                  |  2 +-
 drivers/char/ppdev.c                               |  2 +-
 drivers/char/rtc.c                                 |  2 +-
 drivers/char/snsc.c                                |  2 +-
 drivers/dma-buf/dma-fence.c                        |  1 +
 drivers/gpu/drm/omapdrm/displays/panel-dsi-cm.c    |  2 +-
 drivers/gpu/drm/vc4/vc4_gem.c                      |  1 +
 drivers/gpu/vga/vgaarb.c                           |  2 +-
 drivers/hid/hid-debug.c                            |  2 +-
 drivers/hid/hid-roccat.c                           |  2 +-
 drivers/hid/hidraw.c                               |  2 +-
 drivers/hid/usbhid/hiddev.c                        |  1 +
 drivers/hsi/clients/cmt_speech.c                   |  2 +-
 drivers/i2c/busses/i2c-ibm_iic.c                   |  2 ++
 drivers/i2c/busses/i2c-mpc.c                       |  2 +-
 drivers/iio/industrialio-buffer.c                  |  2 +-
 drivers/infiniband/ulp/ipoib/ipoib_cm.c            |  1 +
 drivers/infiniband/ulp/ipoib/ipoib_vlan.c          |  1 +
 drivers/isdn/capi/kcapi.c                          |  2 +-
 drivers/isdn/mISDN/timerdev.c                      |  2 ++
 drivers/lguest/core.c                              |  1 +
 drivers/macintosh/adb.c                            |  2 +-
 drivers/macintosh/smu.c                            |  1 +
 drivers/macintosh/via-pmu.c                        |  2 +-
 drivers/mailbox/mailbox-test.c                     |  1 +
 drivers/md/dm.c                                    |  1 +
 drivers/media/dvb-core/dvb_ca_en50221.c            |  2 +-
 drivers/media/dvb-core/dvb_demux.c                 |  2 +-
 drivers/media/dvb-core/dvb_frontend.c              |  2 +-
 drivers/media/pci/cx18/cx18-driver.h               |  2 +-
 drivers/media/pci/ivtv/ivtv-driver.h               | 35 +++++++++++-----------
 drivers/media/pci/pt1/pt1.c                        |  1 +
 drivers/media/pci/pt3/pt3.c                        |  1 +
 drivers/media/pci/solo6x10/solo6x10-i2c.c          |  1 +
 drivers/media/pci/zoran/zoran_device.c             |  1 +
 drivers/media/platform/vivid/vivid-radio-rx.c      |  2 ++
 drivers/media/platform/vivid/vivid-radio-tx.c      |  1 +
 drivers/media/rc/lirc_dev.c                        |  2 +-
 drivers/media/usb/cpia2/cpia2_core.c               |  1 +
 drivers/media/usb/gspca/cpia1.c                    |  2 ++
 drivers/misc/cxl/fault.c                           |  2 +-
 drivers/misc/cxl/file.c                            |  2 +-
 drivers/misc/ibmasm/r_heartbeat.c                  |  2 +-
 drivers/misc/lis3lv02d/lis3lv02d.c                 |  1 +
 drivers/misc/mei/bus.c                             |  2 +-
 drivers/misc/mei/client.c                          |  2 +-
 drivers/misc/mei/main.c                            |  2 +-
 drivers/misc/mic/scif/scif_main.h                  |  2 +-
 drivers/misc/vexpress-syscfg.c                     |  2 +-
 drivers/mtd/tests/mtd_test.h                       |  2 +-
 drivers/net/bonding/bond_options.c                 |  2 ++
 drivers/net/bonding/bond_sysfs.c                   |  2 +-
 drivers/net/can/softing/softing_fw.c               |  2 +-
 drivers/net/ethernet/broadcom/tg3.c                |  1 +
 drivers/net/ethernet/cavium/liquidio/octeon_main.h |  2 ++
 drivers/net/ethernet/sfc/falcon/falcon.c           |  2 ++
 drivers/net/irda/stir4200.c                        |  1 +
 drivers/net/macvtap.c                              |  2 +-
 drivers/net/ppp/ppp_generic.c                      |  1 +
 drivers/net/tun.c                                  |  1 +
 drivers/net/usb/hso.c                              |  2 +-
 drivers/net/usb/qmi_wwan.c                         |  1 +
 drivers/net/wan/cosa.c                             |  2 +-
 drivers/net/wireless/ath/ath6kl/cfg80211.c         |  1 +
 drivers/net/wireless/broadcom/b43legacy/main.c     |  2 +-
 drivers/net/wireless/intersil/hostap/hostap_hw.c   |  2 +-
 .../net/wireless/intersil/hostap/hostap_ioctl.c    |  2 +-
 drivers/oprofile/event_buffer.c                    |  2 +-
 drivers/parport/daisy.c                            |  2 +-
 drivers/parport/ieee1284.c                         |  2 +-
 drivers/parport/ieee1284_ops.c                     |  2 +-
 drivers/parport/parport_ip32.c                     |  2 +-
 drivers/parport/parport_pc.c                       |  2 +-
 drivers/parport/share.c                            |  2 +-
 drivers/pci/access.c                               |  2 +-
 drivers/pci/hotplug/cpci_hotplug_core.c            |  1 +
 drivers/pci/hotplug/cpqphp.h                       |  2 +-
 drivers/pci/hotplug/pciehp.h                       |  2 +-
 drivers/pci/hotplug/shpchp.h                       |  2 +-
 drivers/rtc/rtc-dev.c                              |  2 +-
 drivers/s390/cio/device.c                          |  1 +
 drivers/scsi/lpfc/lpfc_vport.c                     |  1 +
 drivers/scsi/osst.c                                |  2 +-
 drivers/scsi/st.c                                  |  2 +-
 drivers/soc/fsl/qbman/dpaa_sys.h                   |  1 +
 drivers/staging/comedi/comedi_fops.c               |  2 +-
 drivers/staging/dgnc/dgnc_tty.c                    |  2 +-
 drivers/staging/dgnc/dgnc_utils.c                  |  2 +-
 drivers/staging/greybus/uart.c                     |  2 +-
 .../lustre/lustre/include/lustre/lustre_user.h     |  1 +
 drivers/staging/lustre/lustre/include/lustre_lib.h |  2 +-
 .../staging/lustre/lustre/include/obd_support.h    |  2 ++
 drivers/staging/media/lirc/lirc_sir.c              |  2 +-
 drivers/staging/media/lirc/lirc_zilog.c            |  2 +-
 drivers/staging/speakup/speakup_soft.c             |  2 +-
 drivers/target/iscsi/cxgbit/cxgbit_target.c        |  2 ++
 drivers/tty/n_gsm.c                                |  2 +-
 drivers/tty/serial/crisv10.c                       |  2 +-
 drivers/tty/serial/serial_core.c                   |  1 +
 drivers/tty/tty_ioctl.c                            |  2 +-
 drivers/tty/tty_port.c                             |  2 +-
 drivers/uio/uio.c                                  |  2 +-
 drivers/usb/class/cdc-acm.c                        |  1 +
 drivers/usb/class/usblp.c                          |  2 +-
 drivers/usb/gadget/function/f_fs.c                 |  1 +
 drivers/usb/image/mdc800.c                         |  2 +-
 drivers/usb/misc/adutux.c                          |  1 +
 drivers/usb/misc/idmouse.c                         |  1 +
 drivers/usb/misc/rio500.c                          |  2 +-
 drivers/usb/misc/uss720.c                          |  1 +
 drivers/usb/mon/mon_bin.c                          |  1 +
 drivers/usb/mon/mon_text.c                         |  1 +
 drivers/usb/serial/digi_acceleport.c               |  1 +
 drivers/usb/serial/generic.c                       |  1 +
 drivers/vhost/net.c                                |  1 +
 drivers/vhost/vhost.c                              |  1 +
 drivers/video/fbdev/cobalt_lcdfb.c                 |  1 +
 .../fbdev/omap2/omapfb/displays/panel-dsi-cm.c     |  2 +-
 fs/afs/rxrpc.c                                     |  2 ++
 fs/aio.c                                           |  2 +-
 fs/btrfs/ctree.h                                   |  1 +
 fs/ceph/caps.c                                     |  2 +-
 fs/cifs/inode.c                                    |  2 ++
 fs/coda/psdev.c                                    |  2 +-
 fs/dlm/user.c                                      |  1 +
 fs/ecryptfs/read_write.c                           |  2 ++
 fs/eventfd.c                                       |  2 +-
 fs/eventpoll.c                                     |  2 +-
 fs/ext4/ext4.h                                     |  1 +
 fs/f2fs/data.c                                     |  1 +
 fs/fuse/dev.c                                      |  1 +
 fs/gfs2/lock_dlm.c                                 |  1 +
 fs/gfs2/super.c                                    |  2 +-
 fs/hpfs/hpfs_fn.h                                  |  2 +-
 fs/hugetlbfs/inode.c                               |  2 +-
 fs/jffs2/nodemgmt.c                                |  2 +-
 fs/nfs/inode.c                                     |  2 +-
 fs/nilfs2/segment.c                                |  2 ++
 fs/notify/fanotify/fanotify_user.c                 |  1 +
 fs/notify/inotify/inotify_user.c                   |  2 +-
 fs/ntfs/file.c                                     |  2 +-
 fs/ocfs2/alloc.c                                   |  1 +
 fs/ocfs2/dlm/dlmdomain.c                           |  1 +
 fs/ocfs2/dlmfs/userdlm.c                           |  1 +
 fs/ocfs2/dlmglue.c                                 |  1 +
 fs/orangefs/orangefs-kernel.h                      |  2 +-
 fs/overlayfs/copy_up.c                             |  2 +-
 fs/splice.c                                        |  2 ++
 fs/userfaultfd.c                                   |  2 +-
 fs/xfs/xfs_linux.h                                 |  2 +-
 include/drm/drm_os_linux.h                         |  1 +
 include/linux/sunrpc/types.h                       |  1 +
 include/media/v4l2-ioctl.h                         |  1 +
 include/net/busy_poll.h                            |  1 +
 kernel/locking/mutex.c                             |  2 +-
 kernel/locking/rtmutex.c                           |  2 +-
 kernel/locking/rwsem-spinlock.c                    |  2 +-
 kernel/locking/rwsem-xadd.c                        |  2 +-
 kernel/rcu/rcutorture.c                            |  2 +-
 kernel/sched/completion.c                          |  2 +-
 kernel/sched/swait.c                               |  2 +-
 kernel/sched/wait.c                                |  2 +-
 kernel/time/alarmtimer.c                           |  1 +
 kernel/time/hrtimer.c                              |  2 +-
 kernel/time/timer.c                                |  2 +-
 lib/percpu_ida.c                                   |  2 +-
 mm/compaction.c                                    |  1 +
 mm/gup.c                                           |  2 +-
 mm/hugetlb.c                                       |  1 +
 mm/memory_hotplug.c                                |  1 +
 mm/shmem.c                                         |  1 +
 mm/userfaultfd.c                                   |  1 +
 net/atm/svc.c                                      |  2 +-
 net/bluetooth/af_bluetooth.c                       |  2 ++
 net/bluetooth/cmtp/capi.c                          |  2 +-
 net/bluetooth/hci_request.c                        |  2 ++
 net/bluetooth/l2cap_sock.c                         |  1 +
 net/bluetooth/rfcomm/sock.c                        |  1 +
 net/bluetooth/sco.c                                |  1 +
 net/bridge/br_sysfs_br.c                           |  1 +
 net/bridge/br_sysfs_if.c                           |  1 +
 net/core/ethtool.c                                 |  2 +-
 net/core/net-sysfs.c                               |  1 +
 net/dccp/output.c                                  |  1 +
 net/ipv4/devinet.c                                 |  1 +
 net/ipv6/addrconf.c                                |  1 +
 net/irda/ircomm/ircomm_tty.c                       |  2 +-
 net/irda/irnet/irnet_ppp.c                         |  3 +-
 net/iucv/af_iucv.c                                 |  2 +-
 net/kcm/kcmsock.c                                  |  2 ++
 net/llc/af_llc.c                                   |  2 ++
 net/nfc/llcp_sock.c                                |  1 +
 net/phonet/pep.c                                   |  1 +
 net/phonet/socket.c                                |  2 ++
 net/rxrpc/conn_client.c                            |  2 ++
 net/rxrpc/recvmsg.c                                |  2 ++
 net/rxrpc/sendmsg.c                                |  2 ++
 net/tipc/socket.c                                  |  2 ++
 net/vmw_vsock/af_vsock.c                           |  1 +
 net/vmw_vsock/virtio_transport_common.c            |  1 +
 sound/core/control.c                               |  1 +
 sound/core/hwdep.c                                 |  1 +
 sound/core/oss/pcm_oss.c                           |  1 +
 sound/core/pcm_lib.c                               |  1 +
 sound/core/pcm_native.c                            |  1 +
 sound/core/rawmidi.c                               |  2 +-
 sound/core/seq/oss/seq_oss_device.h                |  2 +-
 sound/core/seq/oss/seq_oss_writeq.c                |  1 +
 sound/core/seq/seq_fifo.c                          |  2 ++
 sound/core/seq/seq_memory.c                        |  1 +
 sound/core/timer.c                                 |  1 +
 sound/firewire/bebob/bebob.h                       |  1 +
 sound/firewire/dice/dice.h                         |  1 +
 sound/firewire/digi00x/digi00x.h                   |  1 +
 sound/firewire/fireworks/fireworks.h               |  1 +
 sound/firewire/oxfw/oxfw.h                         |  1 +
 sound/firewire/tascam/tascam.h                     |  1 +
 sound/isa/gus/gus_pcm.c                            |  2 ++
 sound/isa/msnd/msnd.c                              |  1 +
 sound/isa/sb/emu8000.c                             |  2 +-
 sound/isa/sb/emu8000_patch.c                       |  2 ++
 sound/isa/sb/emu8000_pcm.c                         |  2 ++
 sound/isa/wavefront/wavefront_synth.c              |  1 +
 sound/oss/dmabuf.c                                 |  2 ++
 sound/oss/dmasound/dmasound_core.c                 |  1 +
 sound/oss/midibuf.c                                |  2 ++
 sound/oss/msnd_pinnacle.c                          |  2 ++
 sound/oss/sound_config.h                           |  1 +
 sound/oss/swarm_cs4297a.c                          |  2 +-
 virt/kvm/kvm_main.c                                |  2 +-
 265 files changed, 319 insertions(+), 139 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c
index 083c9e517d22..57f3b7512636 100644
--- a/arch/arm/common/bL_switcher.c
+++ b/arch/arm/common/bL_switcher.c
@@ -13,7 +13,7 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <uapi/linux/sched/types.h>
 #include <linux/interrupt.h>
 #include <linux/cpu_pm.h>
diff --git a/arch/cris/arch-v10/drivers/sync_serial.c b/arch/cris/arch-v10/drivers/sync_serial.c
index 9ac75d68f184..cc62572c1b94 100644
--- a/arch/cris/arch-v10/drivers/sync_serial.c
+++ b/arch/cris/arch-v10/drivers/sync_serial.c
@@ -16,7 +16,7 @@
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/major.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/interrupt.h>
 #include <linux/poll.h>
 #include <linux/init.h>
diff --git a/arch/cris/arch-v32/drivers/sync_serial.c b/arch/cris/arch-v32/drivers/sync_serial.c
index ef515af1a377..8efcc1a899a8 100644
--- a/arch/cris/arch-v32/drivers/sync_serial.c
+++ b/arch/cris/arch-v32/drivers/sync_serial.c
@@ -11,7 +11,7 @@
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/major.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/mutex.h>
 #include <linux/interrupt.h>
 #include <linux/poll.h>
diff --git a/arch/mips/kernel/rtlx.c b/arch/mips/kernel/rtlx.c
index c5c4fd54d797..b80dd8b17a76 100644
--- a/arch/mips/kernel/rtlx.c
+++ b/arch/mips/kernel/rtlx.c
@@ -12,6 +12,8 @@
 #include <linux/syscalls.h>
 #include <linux/moduleloader.h>
 #include <linux/atomic.h>
+#include <linux/sched/signal.h>
+
 #include <asm/mipsmtregs.h>
 #include <asm/mips_mt.h>
 #include <asm/processor.h>
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index ed81e5ac1426..15a1b1716c2e 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -16,8 +16,10 @@
 #include <linux/module.h>
 #include <linux/uaccess.h>
 #include <linux/vmalloc.h>
+#include <linux/sched/signal.h>
 #include <linux/fs.h>
 #include <linux/bootmem.h>
+
 #include <asm/fpu.h>
 #include <asm/page.h>
 #include <asm/cacheflush.h>
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 1e107ece4e37..fc56a9ce785d 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -22,7 +22,7 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/preempt.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/delay.h>
 #include <linux/export.h>
 #include <linux/fs.h>
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 2b38d824e9e5..95c91a9de351 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -23,6 +23,7 @@
 #include <linux/kvm_host.h>
 #include <linux/vmalloc.h>
 #include <linux/hrtimer.h>
+#include <linux/sched/signal.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/file.h>
diff --git a/arch/powerpc/platforms/83xx/suspend.c b/arch/powerpc/platforms/83xx/suspend.c
index 08f92f6ed228..978b85bb3233 100644
--- a/arch/powerpc/platforms/83xx/suspend.c
+++ b/arch/powerpc/platforms/83xx/suspend.c
@@ -15,6 +15,7 @@
 #include <linux/ioport.h>
 #include <linux/interrupt.h>
 #include <linux/wait.h>
+#include <linux/sched/signal.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/suspend.h>
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index d317c84dc794..1fbb5da17dd2 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -23,7 +23,7 @@
 #undef DEBUG
 
 #include <linux/errno.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/loadavg.h>
 #include <linux/sched/rt.h>
 #include <linux/kernel.h>
diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h
index aac733966092..5e59f80e95db 100644
--- a/arch/powerpc/platforms/cell/spufs/spufs.h
+++ b/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -27,6 +27,7 @@
 #include <linux/spinlock.h>
 #include <linux/fs.h>
 #include <linux/cpumask.h>
+#include <linux/sched/signal.h>
 
 #include <asm/spu.h>
 #include <asm/spu_csa.h>
diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c
index 85b7f5efe06a..5a3ec04a7082 100644
--- a/arch/s390/crypto/prng.c
+++ b/arch/s390/crypto/prng.c
@@ -20,6 +20,8 @@
 #include <linux/cpufeature.h>
 #include <linux/random.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
+
 #include <asm/debug.h>
 #include <linux/uaccess.h>
 #include <asm/timex.h>
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index f5694838234d..fd6cd05bb6a7 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -29,6 +29,8 @@
 #include <linux/timer.h>
 #include <linux/vmalloc.h>
 #include <linux/bitmap.h>
+#include <linux/sched/signal.h>
+
 #include <asm/asm-offsets.h>
 #include <asm/lowcore.h>
 #include <asm/stp.h>
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 38556e395915..5491be39776b 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -14,6 +14,8 @@
 #include <linux/bug.h>
 #include <linux/list.h>
 #include <linux/bitmap.h>
+#include <linux/sched/signal.h>
+
 #include <asm/gmap.h>
 #include <asm/mmu_context.h>
 #include <asm/sclp.h>
diff --git a/arch/sh/kernel/cpu/fpu.c b/arch/sh/kernel/cpu/fpu.c
index 4d4737d099c6..37e35330aae6 100644
--- a/arch/sh/kernel/cpu/fpu.c
+++ b/arch/sh/kernel/cpu/fpu.c
@@ -1,4 +1,4 @@
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <asm/processor.h>
 #include <asm/fpu.h>
diff --git a/arch/sh/kernel/disassemble.c b/arch/sh/kernel/disassemble.c
index 64d5d8dded7c..015fee58014b 100644
--- a/arch/sh/kernel/disassemble.c
+++ b/arch/sh/kernel/disassemble.c
@@ -12,6 +12,8 @@
 #include <linux/string.h>
 #include <linux/uaccess.h>
 
+#include <asm/ptrace.h>
+
 /*
  * Format of an instruction in memory.
  */
diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c
index 30bff33e810e..80a61c5e3015 100644
--- a/arch/sh/kernel/process.c
+++ b/arch/sh/kernel/process.c
@@ -1,7 +1,7 @@
 #include <linux/mm.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/export.h>
 #include <linux/stackprotector.h>
 #include <asm/fpu.h>
diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c
index 57f03050c850..37c51a6be690 100644
--- a/arch/um/drivers/random.c
+++ b/arch/um/drivers/random.c
@@ -6,7 +6,7 @@
  * This software may be used and distributed according to the terms
  * of the GNU General Public License, incorporated herein by reference.
  */
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/interrupt.h>
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 4a7080c84a5a..dc04b30cbd60 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -218,7 +218,7 @@
 #include <linux/apm_bios.h>
 #include <linux/init.h>
 #include <linux/time.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/pm.h>
 #include <linux/capability.h>
 #include <linux/device.h>
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 295e98c2c8cc..bbe7ee00bd3d 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -17,6 +17,7 @@
 #include <linux/ioprio.h>
 #include <linux/kdev_t.h>
 #include <linux/module.h>
+#include <linux/sched/signal.h>
 #include <linux/err.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 746c14e0d157..6f35b6fd4799 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -21,6 +21,7 @@
 #include <linux/cache.h>
 #include <linux/sched/sysctl.h>
 #include <linux/sched/topology.h>
+#include <linux/sched/signal.h>
 #include <linux/delay.h>
 #include <linux/crash_dump.h>
 #include <linux/prefetch.h>
diff --git a/crypto/algboss.c b/crypto/algboss.c
index ccb85e1798f2..960d8548171b 100644
--- a/crypto/algboss.c
+++ b/crypto/algboss.c
@@ -19,7 +19,7 @@
 #include <linux/module.h>
 #include <linux/notifier.h>
 #include <linux/rtnetlink.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/string.h>
 
diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c
index 533265f110e0..5a8053758657 100644
--- a/crypto/algif_aead.c
+++ b/crypto/algif_aead.c
@@ -19,6 +19,7 @@
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/kernel.h>
+#include <linux/sched/signal.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/net.h>
diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c
index a9e79d8eff87..43839b00fe6c 100644
--- a/crypto/algif_skcipher.c
+++ b/crypto/algif_skcipher.c
@@ -18,6 +18,7 @@
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/kernel.h>
+#include <linux/sched/signal.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/net.h>
diff --git a/crypto/api.c b/crypto/api.c
index b16ce1653284..941cd4c6c7ec 100644
--- a/crypto/api.c
+++ b/crypto/api.c
@@ -21,7 +21,7 @@
 #include <linux/kmod.h>
 #include <linux/module.h>
 #include <linux/param.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/string.h>
 #include "internal.h"
diff --git a/drivers/atm/horizon.c b/drivers/atm/horizon.c
index 2bf1ef1c3c78..0f18480b33b5 100644
--- a/drivers/atm/horizon.c
+++ b/drivers/atm/horizon.c
@@ -27,6 +27,7 @@
 
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/sched/signal.h>
 #include <linux/mm.h>
 #include <linux/pci.h>
 #include <linux/errno.h>
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 3050e6f99403..684bda4d14a1 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -26,6 +26,7 @@
 #include <linux/mutex.h>
 #include <linux/pm_runtime.h>
 #include <linux/netdevice.h>
+#include <linux/sched/signal.h>
 #include <linux/sysfs.h>
 
 #include "base.h"
diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index f546f8f107b0..136854970489 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -8,7 +8,7 @@
 
 #include <linux/device.h>
 #include <linux/slab.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/capability.h>
 #include <linux/export.h>
 #include <linux/suspend.h>
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 116509852a34..c7d530a95e53 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -52,6 +52,7 @@
 #define __KERNEL_SYSCALLS__
 #include <linux/unistd.h>
 #include <linux/vmalloc.h>
+#include <linux/sched/signal.h>
 
 #include <linux/drbd_limits.h>
 #include "drbd_int.h"
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 8b40a5b2f8e6..aa6bf9692eff 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -37,6 +37,7 @@
 #include <linux/mm_inline.h>
 #include <linux/slab.h>
 #include <uapi/linux/sched/types.h>
+#include <linux/sched/signal.h>
 #include <linux/pkt_sched.h>
 #define __KERNEL_SYSCALLS__
 #include <linux/unistd.h>
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index c6755c9a0aea..3bff33f21435 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -25,7 +25,7 @@
 
 #include <linux/module.h>
 #include <linux/drbd.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/wait.h>
 #include <linux/mm.h>
 #include <linux/memcontrol.h>
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index aabd8e9d3035..61b3ffa4f458 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -20,7 +20,7 @@
 
 #include <linux/stddef.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/timer.h>
 #include <linux/delay.h>
 #include <linux/fd.h>
diff --git a/drivers/char/applicom.c b/drivers/char/applicom.c
index e5c62dcf2c11..e770ad977472 100644
--- a/drivers/char/applicom.c
+++ b/drivers/char/applicom.c
@@ -23,7 +23,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/interrupt.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/errno.h>
 #include <linux/mutex.h>
diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c
index 20b32bb8c2af..8bdc38d81adf 100644
--- a/drivers/char/hpet.c
+++ b/drivers/char/hpet.c
@@ -25,6 +25,7 @@
 #include <linux/spinlock.h>
 #include <linux/sysctl.h>
 #include <linux/wait.h>
+#include <linux/sched/signal.h>
 #include <linux/bcd.h>
 #include <linux/seq_file.h>
 #include <linux/bitops.h>
diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index 5c654b5d4adf..503a41dfa193 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -17,6 +17,7 @@
 #include <linux/hw_random.h>
 #include <linux/kernel.h>
 #include <linux/kthread.h>
+#include <linux/sched/signal.h>
 #include <linux/miscdevice.h>
 #include <linux/module.h>
 #include <linux/random.h>
diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c
index 30b9e83bf1bf..5ca24d9b101b 100644
--- a/drivers/char/ipmi/ipmi_watchdog.c
+++ b/drivers/char/ipmi/ipmi_watchdog.c
@@ -53,6 +53,7 @@
 #include <linux/ctype.h>
 #include <linux/delay.h>
 #include <linux/atomic.h>
+#include <linux/sched/signal.h>
 
 #ifdef CONFIG_X86
 /*
diff --git a/drivers/char/lp.c b/drivers/char/lp.c
index 5b6742770656..565e4cf04a02 100644
--- a/drivers/char/lp.c
+++ b/drivers/char/lp.c
@@ -117,7 +117,7 @@
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/major.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/fcntl.h>
 #include <linux/delay.h>
diff --git a/drivers/char/ppdev.c b/drivers/char/ppdev.c
index 87885d146dbb..2a558c706581 100644
--- a/drivers/char/ppdev.c
+++ b/drivers/char/ppdev.c
@@ -58,7 +58,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/device.h>
 #include <linux/ioctl.h>
 #include <linux/parport.h>
diff --git a/drivers/char/rtc.c b/drivers/char/rtc.c
index 35259961cc38..974d48927b07 100644
--- a/drivers/char/rtc.c
+++ b/drivers/char/rtc.c
@@ -74,7 +74,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/spinlock.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/sysctl.h>
 #include <linux/wait.h>
 #include <linux/bcd.h>
diff --git a/drivers/char/snsc.c b/drivers/char/snsc.c
index ec07f0e99732..6aa32679fd58 100644
--- a/drivers/char/snsc.c
+++ b/drivers/char/snsc.c
@@ -16,7 +16,7 @@
  */
 
 #include <linux/interrupt.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/device.h>
 #include <linux/poll.h>
 #include <linux/init.h>
diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c
index d1f1f456f5c4..d195d617076d 100644
--- a/drivers/dma-buf/dma-fence.c
+++ b/drivers/dma-buf/dma-fence.c
@@ -22,6 +22,7 @@
 #include <linux/export.h>
 #include <linux/atomic.h>
 #include <linux/dma-fence.h>
+#include <linux/sched/signal.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/dma_fence.h>
diff --git a/drivers/gpu/drm/omapdrm/displays/panel-dsi-cm.c b/drivers/gpu/drm/omapdrm/displays/panel-dsi-cm.c
index a2bb855a2851..ac5800c72cb4 100644
--- a/drivers/gpu/drm/omapdrm/displays/panel-dsi-cm.c
+++ b/drivers/gpu/drm/omapdrm/displays/panel-dsi-cm.c
@@ -18,7 +18,7 @@
 #include <linux/jiffies.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/workqueue.h>
 #include <linux/of_device.h>
diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
index ab3016982466..1eef98c3331d 100644
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -26,6 +26,7 @@
 #include <linux/pm_runtime.h>
 #include <linux/device.h>
 #include <linux/io.h>
+#include <linux/sched/signal.h>
 
 #include "uapi/drm/vc4_drm.h"
 #include "vc4_drv.h"
diff --git a/drivers/gpu/vga/vgaarb.c b/drivers/gpu/vga/vgaarb.c
index 0f5b2dd24507..92f1452dad57 100644
--- a/drivers/gpu/vga/vgaarb.c
+++ b/drivers/gpu/vga/vgaarb.c
@@ -41,7 +41,7 @@
 #include <linux/errno.h>
 #include <linux/init.h>
 #include <linux/list.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/wait.h>
 #include <linux/spinlock.h>
 #include <linux/poll.h>
diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c
index acfb522a432a..c6c9c51c806f 100644
--- a/drivers/hid/hid-debug.c
+++ b/drivers/hid/hid-debug.c
@@ -30,7 +30,7 @@
 
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/export.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
diff --git a/drivers/hid/hid-roccat.c b/drivers/hid/hid-roccat.c
index 76d06cf87b2a..fb77dec720a4 100644
--- a/drivers/hid/hid-roccat.c
+++ b/drivers/hid/hid-roccat.c
@@ -25,7 +25,7 @@
 
 #include <linux/cdev.h>
 #include <linux/poll.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/hid-roccat.h>
 #include <linux/module.h>
 
diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c
index f0e2757cb909..ec530454e6f6 100644
--- a/drivers/hid/hidraw.c
+++ b/drivers/hid/hidraw.c
@@ -33,7 +33,7 @@
 #include <linux/slab.h>
 #include <linux/hid.h>
 #include <linux/mutex.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/string.h>
 
 #include <linux/hidraw.h>
diff --git a/drivers/hid/usbhid/hiddev.c b/drivers/hid/usbhid/hiddev.c
index 700145b15088..774bd701dae0 100644
--- a/drivers/hid/usbhid/hiddev.c
+++ b/drivers/hid/usbhid/hiddev.c
@@ -27,6 +27,7 @@
 
 #include <linux/poll.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/input.h>
diff --git a/drivers/hsi/clients/cmt_speech.c b/drivers/hsi/clients/cmt_speech.c
index 7175e6bedf21..727f968ac1cb 100644
--- a/drivers/hsi/clients/cmt_speech.c
+++ b/drivers/hsi/clients/cmt_speech.c
@@ -31,7 +31,7 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/poll.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/ioctl.h>
 #include <linux/uaccess.h>
 #include <linux/pm_qos.h>
diff --git a/drivers/i2c/busses/i2c-ibm_iic.c b/drivers/i2c/busses/i2c-ibm_iic.c
index 412b91d255ad..961c5f42d956 100644
--- a/drivers/i2c/busses/i2c-ibm_iic.c
+++ b/drivers/i2c/busses/i2c-ibm_iic.c
@@ -37,6 +37,8 @@
 #include <linux/delay.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
+#include <linux/sched/signal.h>
+
 #include <asm/irq.h>
 #include <linux/io.h>
 #include <linux/i2c.h>
diff --git a/drivers/i2c/busses/i2c-mpc.c b/drivers/i2c/busses/i2c-mpc.c
index 565a49a0c564..96caf378b1dc 100644
--- a/drivers/i2c/busses/i2c-mpc.c
+++ b/drivers/i2c/busses/i2c-mpc.c
@@ -15,7 +15,7 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/of_address.h>
 #include <linux/of_irq.h>
 #include <linux/of_platform.h>
diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
index 4972986f6455..d2b465140a6b 100644
--- a/drivers/iio/industrialio-buffer.c
+++ b/drivers/iio/industrialio-buffer.c
@@ -20,7 +20,7 @@
 #include <linux/cdev.h>
 #include <linux/slab.h>
 #include <linux/poll.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 
 #include <linux/iio/iio.h>
 #include "iio_core.h"
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index a6d6c617b597..0cdf2b7f272f 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -38,6 +38,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/moduleparam.h>
+#include <linux/sched/signal.h>
 
 #include "ipoib.h"
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
index deedb6fc1b05..3e10e3dac2e7 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
@@ -31,6 +31,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/sched/signal.h>
 
 #include <linux/init.h>
 #include <linux/seq_file.h>
diff --git a/drivers/isdn/capi/kcapi.c b/drivers/isdn/capi/kcapi.c
index 49d0f70c2bae..1dfd1085a04f 100644
--- a/drivers/isdn/capi/kcapi.c
+++ b/drivers/isdn/capi/kcapi.c
@@ -18,7 +18,7 @@
 #include <linux/interrupt.h>
 #include <linux/ioport.h>
 #include <linux/proc_fs.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/seq_file.h>
 #include <linux/skbuff.h>
 #include <linux/workqueue.h>
diff --git a/drivers/isdn/mISDN/timerdev.c b/drivers/isdn/mISDN/timerdev.c
index 9438d7ec3308..b1e135fc1fb5 100644
--- a/drivers/isdn/mISDN/timerdev.c
+++ b/drivers/isdn/mISDN/timerdev.c
@@ -25,6 +25,8 @@
 #include <linux/module.h>
 #include <linux/mISDNif.h>
 #include <linux/mutex.h>
+#include <linux/sched/signal.h>
+
 #include "core.h"
 
 static DEFINE_MUTEX(mISDN_mutex);
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index ac219045daf7..395ed1961dbf 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -8,6 +8,7 @@
 #include <linux/stddef.h>
 #include <linux/io.h>
 #include <linux/mm.h>
+#include <linux/sched/signal.h>
 #include <linux/vmalloc.h>
 #include <linux/cpu.h>
 #include <linux/freezer.h>
diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c
index 152414e6378a..fee939efc4fc 100644
--- a/drivers/macintosh/adb.c
+++ b/drivers/macintosh/adb.c
@@ -23,7 +23,7 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/adb.h>
 #include <linux/cuda.h>
 #include <linux/pmu.h>
diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c
index 227869159ac0..1ac66421877a 100644
--- a/drivers/macintosh/smu.c
+++ b/drivers/macintosh/smu.c
@@ -39,6 +39,7 @@
 #include <linux/of_platform.h>
 #include <linux/slab.h>
 #include <linux/memblock.h>
+#include <linux/sched/signal.h>
 
 #include <asm/byteorder.h>
 #include <asm/io.h>
diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index 43b8db2b5445..cce99f72e4ae 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -23,7 +23,7 @@
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/delay.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/miscdevice.h>
 #include <linux/blkdev.h>
 #include <linux/pci.h>
diff --git a/drivers/mailbox/mailbox-test.c b/drivers/mailbox/mailbox-test.c
index 9c79f8019d2a..97fb956bb6e0 100644
--- a/drivers/mailbox/mailbox-test.c
+++ b/drivers/mailbox/mailbox-test.c
@@ -21,6 +21,7 @@
 #include <linux/poll.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
+#include <linux/sched/signal.h>
 
 #define MBOX_MAX_SIG_LEN	8
 #define MBOX_MAX_MSG_LEN	128
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 9f37d7fc2786..f4ffd1eb8f44 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -12,6 +12,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
+#include <linux/sched/signal.h>
 #include <linux/blkpg.h>
 #include <linux/bio.h>
 #include <linux/mempool.h>
diff --git a/drivers/media/dvb-core/dvb_ca_en50221.c b/drivers/media/dvb-core/dvb_ca_en50221.c
index 000d737ad827..8d65028c7a74 100644
--- a/drivers/media/dvb-core/dvb_ca_en50221.c
+++ b/drivers/media/dvb-core/dvb_ca_en50221.c
@@ -34,7 +34,7 @@
 #include <linux/vmalloc.h>
 #include <linux/delay.h>
 #include <linux/spinlock.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/kthread.h>
 
 #include "dvb_ca_en50221.h"
diff --git a/drivers/media/dvb-core/dvb_demux.c b/drivers/media/dvb-core/dvb_demux.c
index 4eac71e50c5f..6628f80d184f 100644
--- a/drivers/media/dvb-core/dvb_demux.c
+++ b/drivers/media/dvb-core/dvb_demux.c
@@ -19,7 +19,7 @@
 
 #define pr_fmt(fmt) "dvb_demux: " fmt
 
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
diff --git a/drivers/media/dvb-core/dvb_frontend.c b/drivers/media/dvb-core/dvb_frontend.c
index 85ae3669aa66..e3fff8f64d37 100644
--- a/drivers/media/dvb-core/dvb_frontend.c
+++ b/drivers/media/dvb-core/dvb_frontend.c
@@ -29,7 +29,7 @@
 
 #include <linux/string.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/wait.h>
 #include <linux/slab.h>
 #include <linux/poll.h>
diff --git a/drivers/media/pci/cx18/cx18-driver.h b/drivers/media/pci/cx18/cx18-driver.h
index fef3c736fcba..7be2088c45fe 100644
--- a/drivers/media/pci/cx18/cx18-driver.h
+++ b/drivers/media/pci/cx18/cx18-driver.h
@@ -24,7 +24,7 @@
 #include <linux/moduleparam.h>
 #include <linux/init.h>
 #include <linux/delay.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/fs.h>
 #include <linux/pci.h>
 #include <linux/interrupt.h>
diff --git a/drivers/media/pci/ivtv/ivtv-driver.h b/drivers/media/pci/ivtv/ivtv-driver.h
index cde452e30746..d27c5c2c07ea 100644
--- a/drivers/media/pci/ivtv/ivtv-driver.h
+++ b/drivers/media/pci/ivtv/ivtv-driver.h
@@ -38,37 +38,38 @@
  *                using information provided by Jiun-Kuei Jung @ AVerMedia.
  */
 
-#include <asm/byteorder.h>
+#include <linux/module.h>
+#include <linux/init.h>
 #include <linux/delay.h>
-#include <linux/device.h>
+#include <linux/sched/signal.h>
 #include <linux/fs.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
 #include <linux/i2c.h>
 #include <linux/i2c-algo-bit.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/ivtv.h>
-#include <linux/kernel.h>
-#include <linux/kthread.h>
 #include <linux/list.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
+#include <linux/unistd.h>
 #include <linux/pagemap.h>
-#include <linux/pci.h>
 #include <linux/scatterlist.h>
-#include <linux/sched.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
 #include <linux/slab.h>
-#include <linux/spinlock.h>
 #include <linux/uaccess.h>
-#include <linux/unistd.h>
+#include <asm/byteorder.h>
 
-#include <media/drv-intf/cx2341x.h>
-#include <media/i2c/ir-kbd-i2c.h>
-#include <media/tuner.h>
+#include <linux/dvb/video.h>
+#include <linux/dvb/audio.h>
 #include <media/v4l2-common.h>
+#include <media/v4l2-ioctl.h>
 #include <media/v4l2-ctrls.h>
 #include <media/v4l2-device.h>
 #include <media/v4l2-fh.h>
-#include <media/v4l2-ioctl.h>
+#include <media/tuner.h>
+#include <media/drv-intf/cx2341x.h>
+#include <media/i2c/ir-kbd-i2c.h>
+
+#include <linux/ivtv.h>
 
 /* Memory layout */
 #define IVTV_ENCODER_OFFSET	0x00000000
diff --git a/drivers/media/pci/pt1/pt1.c b/drivers/media/pci/pt1/pt1.c
index da1eebd2016f..3219d2f3271e 100644
--- a/drivers/media/pci/pt1/pt1.c
+++ b/drivers/media/pci/pt1/pt1.c
@@ -18,6 +18,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/sched/signal.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
diff --git a/drivers/media/pci/pt3/pt3.c b/drivers/media/pci/pt3/pt3.c
index 77f4d15f322b..e8b5d0992157 100644
--- a/drivers/media/pci/pt3/pt3.c
+++ b/drivers/media/pci/pt3/pt3.c
@@ -21,6 +21,7 @@
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/string.h>
+#include <linux/sched/signal.h>
 
 #include "dmxdev.h"
 #include "dvbdev.h"
diff --git a/drivers/media/pci/solo6x10/solo6x10-i2c.c b/drivers/media/pci/solo6x10/solo6x10-i2c.c
index c908672b2c40..e83bb79f9349 100644
--- a/drivers/media/pci/solo6x10/solo6x10-i2c.c
+++ b/drivers/media/pci/solo6x10/solo6x10-i2c.c
@@ -27,6 +27,7 @@
  * thread context, ACK the interrupt, and move on. -- BenC */
 
 #include <linux/kernel.h>
+#include <linux/sched/signal.h>
 
 #include "solo6x10.h"
 
diff --git a/drivers/media/pci/zoran/zoran_device.c b/drivers/media/pci/zoran/zoran_device.c
index 671907a6e6b6..40adceebca7e 100644
--- a/drivers/media/pci/zoran/zoran_device.c
+++ b/drivers/media/pci/zoran/zoran_device.c
@@ -28,6 +28,7 @@
 #include <linux/module.h>
 #include <linux/vmalloc.h>
 #include <linux/ktime.h>
+#include <linux/sched/signal.h>
 
 #include <linux/interrupt.h>
 #include <linux/proc_fs.h>
diff --git a/drivers/media/platform/vivid/vivid-radio-rx.c b/drivers/media/platform/vivid/vivid-radio-rx.c
index f99092ca8f5c..47c36c26096b 100644
--- a/drivers/media/platform/vivid/vivid-radio-rx.c
+++ b/drivers/media/platform/vivid/vivid-radio-rx.c
@@ -22,6 +22,8 @@
 #include <linux/delay.h>
 #include <linux/videodev2.h>
 #include <linux/v4l2-dv-timings.h>
+#include <linux/sched/signal.h>
+
 #include <media/v4l2-common.h>
 #include <media/v4l2-event.h>
 #include <media/v4l2-dv-timings.h>
diff --git a/drivers/media/platform/vivid/vivid-radio-tx.c b/drivers/media/platform/vivid/vivid-radio-tx.c
index 8c59d4f53200..0e8025b7b4dd 100644
--- a/drivers/media/platform/vivid/vivid-radio-tx.c
+++ b/drivers/media/platform/vivid/vivid-radio-tx.c
@@ -19,6 +19,7 @@
 
 #include <linux/errno.h>
 #include <linux/kernel.h>
+#include <linux/sched/signal.h>
 #include <linux/delay.h>
 #include <linux/videodev2.h>
 #include <linux/v4l2-dv-timings.h>
diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c
index a54ca531d8ef..393dccaabdd0 100644
--- a/drivers/media/rc/lirc_dev.c
+++ b/drivers/media/rc/lirc_dev.c
@@ -19,7 +19,7 @@
 
 #include <linux/module.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/errno.h>
 #include <linux/ioctl.h>
 #include <linux/fs.h>
diff --git a/drivers/media/usb/cpia2/cpia2_core.c b/drivers/media/usb/cpia2/cpia2_core.c
index 431dd0b4b332..b1d13444ff30 100644
--- a/drivers/media/usb/cpia2/cpia2_core.c
+++ b/drivers/media/usb/cpia2/cpia2_core.c
@@ -32,6 +32,7 @@
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
 #include <linux/firmware.h>
+#include <linux/sched/signal.h>
 
 #define FIRMWARE "cpia2/stv0672_vp4.bin"
 MODULE_FIRMWARE(FIRMWARE);
diff --git a/drivers/media/usb/gspca/cpia1.c b/drivers/media/usb/gspca/cpia1.c
index 23d3285f182a..e91d00762e94 100644
--- a/drivers/media/usb/gspca/cpia1.c
+++ b/drivers/media/usb/gspca/cpia1.c
@@ -27,6 +27,8 @@
 #define MODULE_NAME "cpia1"
 
 #include <linux/input.h>
+#include <linux/sched/signal.h>
+
 #include "gspca.h"
 
 MODULE_AUTHOR("Hans de Goede <hdegoede@redhat.com>");
diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c
index e33011e3e7e2..2fa015c05561 100644
--- a/drivers/misc/cxl/fault.c
+++ b/drivers/misc/cxl/fault.c
@@ -8,7 +8,7 @@
  */
 
 #include <linux/workqueue.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
 #include <linux/pid.h>
 #include <linux/mm.h>
diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c
index 859959f19f10..e7139c76f961 100644
--- a/drivers/misc/cxl/file.c
+++ b/drivers/misc/cxl/file.c
@@ -12,7 +12,7 @@
 #include <linux/export.h>
 #include <linux/kernel.h>
 #include <linux/bitmap.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/poll.h>
 #include <linux/pid.h>
 #include <linux/fs.h>
diff --git a/drivers/misc/ibmasm/r_heartbeat.c b/drivers/misc/ibmasm/r_heartbeat.c
index 232034f5da48..5c7dd26db716 100644
--- a/drivers/misc/ibmasm/r_heartbeat.c
+++ b/drivers/misc/ibmasm/r_heartbeat.c
@@ -20,7 +20,7 @@
  *
  */
 
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include "ibmasm.h"
 #include "dot_command.h"
 
diff --git a/drivers/misc/lis3lv02d/lis3lv02d.c b/drivers/misc/lis3lv02d/lis3lv02d.c
index fb8705fc3aca..e389b0b5278d 100644
--- a/drivers/misc/lis3lv02d/lis3lv02d.c
+++ b/drivers/misc/lis3lv02d/lis3lv02d.c
@@ -23,6 +23,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/kernel.h>
+#include <linux/sched/signal.h>
 #include <linux/dmi.h>
 #include <linux/module.h>
 #include <linux/types.h>
diff --git a/drivers/misc/mei/bus.c b/drivers/misc/mei/bus.c
index cb3e9e0ca049..df5f78ae3d25 100644
--- a/drivers/misc/mei/bus.c
+++ b/drivers/misc/mei/bus.c
@@ -16,7 +16,7 @@
 #include <linux/module.h>
 #include <linux/device.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
diff --git a/drivers/misc/mei/client.c b/drivers/misc/mei/client.c
index 68fe37b5bc52..d3e3372424d6 100644
--- a/drivers/misc/mei/client.c
+++ b/drivers/misc/mei/client.c
@@ -14,7 +14,7 @@
  *
  */
 
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/wait.h>
 #include <linux/delay.h>
 #include <linux/slab.h>
diff --git a/drivers/misc/mei/main.c b/drivers/misc/mei/main.c
index 9d0b7050c79a..bf816449cd40 100644
--- a/drivers/misc/mei/main.c
+++ b/drivers/misc/mei/main.c
@@ -26,7 +26,7 @@
 #include <linux/init.h>
 #include <linux/ioctl.h>
 #include <linux/cdev.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/uuid.h>
 #include <linux/compat.h>
 #include <linux/jiffies.h>
diff --git a/drivers/misc/mic/scif/scif_main.h b/drivers/misc/mic/scif/scif_main.h
index a08f0b600a9e..0e5eff9ad080 100644
--- a/drivers/misc/mic/scif/scif_main.h
+++ b/drivers/misc/mic/scif/scif_main.h
@@ -18,7 +18,7 @@
 #ifndef SCIF_MAIN_H
 #define SCIF_MAIN_H
 
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/pci.h>
 #include <linux/miscdevice.h>
 #include <linux/dmaengine.h>
diff --git a/drivers/misc/vexpress-syscfg.c b/drivers/misc/vexpress-syscfg.c
index c344483fa7d6..2cde80c7bb93 100644
--- a/drivers/misc/vexpress-syscfg.c
+++ b/drivers/misc/vexpress-syscfg.c
@@ -16,7 +16,7 @@
 #include <linux/io.h>
 #include <linux/of.h>
 #include <linux/platform_device.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/syscore_ops.h>
 #include <linux/vexpress.h>
diff --git a/drivers/mtd/tests/mtd_test.h b/drivers/mtd/tests/mtd_test.h
index 4b7bee17c924..04afd0e7074f 100644
--- a/drivers/mtd/tests/mtd_test.h
+++ b/drivers/mtd/tests/mtd_test.h
@@ -1,5 +1,5 @@
 #include <linux/mtd/mtd.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 
 static inline int mtdtest_relax(void)
 {
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index 577e57cad1dc..1bcbb8913e17 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -16,6 +16,8 @@
 #include <linux/rcupdate.h>
 #include <linux/ctype.h>
 #include <linux/inet.h>
+#include <linux/sched/signal.h>
+
 #include <net/bonding.h>
 
 static int bond_option_active_slave_set(struct bonding *bond,
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index e23c3ed737de..770623a0cc01 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -24,7 +24,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/device.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/fs.h>
 #include <linux/types.h>
 #include <linux/string.h>
diff --git a/drivers/net/can/softing/softing_fw.c b/drivers/net/can/softing/softing_fw.c
index 4063215c9b54..aac58ce6e371 100644
--- a/drivers/net/can/softing/softing_fw.c
+++ b/drivers/net/can/softing/softing_fw.c
@@ -17,7 +17,7 @@
  */
 
 #include <linux/firmware.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <asm/div64.h>
 #include <asm/io.h>
 
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index a448177990fe..30d1eb9ebec9 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -20,6 +20,7 @@
 #include <linux/moduleparam.h>
 #include <linux/stringify.h>
 #include <linux/kernel.h>
+#include <linux/sched/signal.h>
 #include <linux/types.h>
 #include <linux/compiler.h>
 #include <linux/slab.h>
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_main.h b/drivers/net/ethernet/cavium/liquidio/octeon_main.h
index 8cd389148166..aa36e9ae7676 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_main.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_main.h
@@ -23,6 +23,8 @@
 #ifndef _OCTEON_MAIN_H_
 #define  _OCTEON_MAIN_H_
 
+#include <linux/sched/signal.h>
+
 #if BITS_PER_LONG == 32
 #define CVM_CAST64(v) ((long long)(v))
 #elif BITS_PER_LONG == 64
diff --git a/drivers/net/ethernet/sfc/falcon/falcon.c b/drivers/net/ethernet/sfc/falcon/falcon.c
index c6ff0cc5ef18..93c713c1f627 100644
--- a/drivers/net/ethernet/sfc/falcon/falcon.c
+++ b/drivers/net/ethernet/sfc/falcon/falcon.c
@@ -16,6 +16,8 @@
 #include <linux/i2c.h>
 #include <linux/mii.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
+
 #include "net_driver.h"
 #include "bitfield.h"
 #include "efx.h"
diff --git a/drivers/net/irda/stir4200.c b/drivers/net/irda/stir4200.c
index 42da094b68dd..7ee514879531 100644
--- a/drivers/net/irda/stir4200.c
+++ b/drivers/net/irda/stir4200.c
@@ -40,6 +40,7 @@
 #include <linux/moduleparam.h>
 
 #include <linux/kernel.h>
+#include <linux/sched/signal.h>
 #include <linux/ktime.h>
 #include <linux/types.h>
 #include <linux/time.h>
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index a4bfc10b61dd..da85057680d6 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -9,7 +9,7 @@
 #include <linux/module.h>
 #include <linux/skbuff.h>
 #include <linux/cache.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/wait.h>
diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
index a411b43a69eb..f9c0e62716ea 100644
--- a/drivers/net/ppp/ppp_generic.c
+++ b/drivers/net/ppp/ppp_generic.c
@@ -24,6 +24,7 @@
 
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/sched/signal.h>
 #include <linux/kmod.h>
 #include <linux/init.h>
 #include <linux/list.h>
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 30863e378925..dc1b1dd9157c 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -44,6 +44,7 @@
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
+#include <linux/sched/signal.h>
 #include <linux/major.h>
 #include <linux/slab.h>
 #include <linux/poll.h>
diff --git a/drivers/net/usb/hso.c b/drivers/net/usb/hso.c
index e7b516342678..4f2e8141dbe2 100644
--- a/drivers/net/usb/hso.c
+++ b/drivers/net/usb/hso.c
@@ -52,7 +52,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/delay.h>
diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index 24d5272cdce5..805674550683 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -11,6 +11,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/sched/signal.h>
 #include <linux/netdevice.h>
 #include <linux/ethtool.h>
 #include <linux/etherdevice.h>
diff --git a/drivers/net/wan/cosa.c b/drivers/net/wan/cosa.c
index 087eb266601f..4ca71bca39ac 100644
--- a/drivers/net/wan/cosa.c
+++ b/drivers/net/wan/cosa.c
@@ -78,7 +78,7 @@
 
 #include <linux/module.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/poll.h>
 #include <linux/fs.h>
diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c
index b7fe0af4cb24..363b30a549c2 100644
--- a/drivers/net/wireless/ath/ath6kl/cfg80211.c
+++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c
@@ -20,6 +20,7 @@
 #include <linux/moduleparam.h>
 #include <linux/inetdevice.h>
 #include <linux/export.h>
+#include <linux/sched/signal.h>
 
 #include "core.h"
 #include "cfg80211.h"
diff --git a/drivers/net/wireless/broadcom/b43legacy/main.c b/drivers/net/wireless/broadcom/b43legacy/main.c
index e97ab2b91663..cdafebb9c936 100644
--- a/drivers/net/wireless/broadcom/b43legacy/main.c
+++ b/drivers/net/wireless/broadcom/b43legacy/main.c
@@ -36,7 +36,7 @@
 #include <linux/etherdevice.h>
 #include <linux/firmware.h>
 #include <linux/workqueue.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/skbuff.h>
 #include <linux/dma-mapping.h>
 #include <linux/slab.h>
diff --git a/drivers/net/wireless/intersil/hostap/hostap_hw.c b/drivers/net/wireless/intersil/hostap/hostap_hw.c
index 544ef7adde7d..04dfd040a650 100644
--- a/drivers/net/wireless/intersil/hostap/hostap_hw.c
+++ b/drivers/net/wireless/intersil/hostap/hostap_hw.c
@@ -43,7 +43,7 @@
 #include <linux/delay.h>
 #include <linux/random.h>
 #include <linux/wait.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/rtnetlink.h>
 #include <linux/wireless.h>
 #include <net/iw_handler.h>
diff --git a/drivers/net/wireless/intersil/hostap/hostap_ioctl.c b/drivers/net/wireless/intersil/hostap/hostap_ioctl.c
index a5656bc0e6aa..b2c6b065b542 100644
--- a/drivers/net/wireless/intersil/hostap/hostap_ioctl.c
+++ b/drivers/net/wireless/intersil/hostap/hostap_ioctl.c
@@ -2,7 +2,7 @@
 
 #include <linux/slab.h>
 #include <linux/types.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/ethtool.h>
 #include <linux/if_arp.h>
 #include <linux/module.h>
diff --git a/drivers/oprofile/event_buffer.c b/drivers/oprofile/event_buffer.c
index 67935fbbbcab..32888f2bd1a9 100644
--- a/drivers/oprofile/event_buffer.c
+++ b/drivers/oprofile/event_buffer.c
@@ -14,7 +14,7 @@
 
 #include <linux/vmalloc.h>
 #include <linux/oprofile.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/capability.h>
 #include <linux/dcookies.h>
 #include <linux/fs.h>
diff --git a/drivers/parport/daisy.c b/drivers/parport/daisy.c
index d998d0ed2bec..46eb15fb57ff 100644
--- a/drivers/parport/daisy.c
+++ b/drivers/parport/daisy.c
@@ -23,7 +23,7 @@
 #include <linux/parport.h>
 #include <linux/delay.h>
 #include <linux/slab.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 
 #include <asm/current.h>
 #include <linux/uaccess.h>
diff --git a/drivers/parport/ieee1284.c b/drivers/parport/ieee1284.c
index f9fd4b33a546..74cc6dd982d2 100644
--- a/drivers/parport/ieee1284.c
+++ b/drivers/parport/ieee1284.c
@@ -23,7 +23,7 @@
 #include <linux/kernel.h>
 #include <linux/interrupt.h>
 #include <linux/timer.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 
 #undef DEBUG /* undef me for production */
 
diff --git a/drivers/parport/ieee1284_ops.c b/drivers/parport/ieee1284_ops.c
index 75071605d22f..a959224d011b 100644
--- a/drivers/parport/ieee1284_ops.c
+++ b/drivers/parport/ieee1284_ops.c
@@ -17,7 +17,7 @@
 #include <linux/module.h>
 #include <linux/parport.h>
 #include <linux/delay.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/uaccess.h>
 
 #undef DEBUG /* undef me for production */
diff --git a/drivers/parport/parport_ip32.c b/drivers/parport/parport_ip32.c
index 30e981be14c2..dcbeeb220dda 100644
--- a/drivers/parport/parport_ip32.c
+++ b/drivers/parport/parport_ip32.c
@@ -102,7 +102,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/parport.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/stddef.h>
diff --git a/drivers/parport/parport_pc.c b/drivers/parport/parport_pc.c
index 3e56e7deab8e..9d42dfe65d44 100644
--- a/drivers/parport/parport_pc.c
+++ b/drivers/parport/parport_pc.c
@@ -44,7 +44,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/delay.h>
 #include <linux/errno.h>
 #include <linux/interrupt.h>
diff --git a/drivers/parport/share.c b/drivers/parport/share.c
index 3308427ed9f7..bc090daa850a 100644
--- a/drivers/parport/share.c
+++ b/drivers/parport/share.c
@@ -27,7 +27,7 @@
 #include <linux/ioport.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/kmod.h>
 #include <linux/device.h>
 
diff --git a/drivers/pci/access.c b/drivers/pci/access.c
index b9dd37c8c9ce..8b7382705bf2 100644
--- a/drivers/pci/access.c
+++ b/drivers/pci/access.c
@@ -1,7 +1,7 @@
 #include <linux/delay.h>
 #include <linux/pci.h>
 #include <linux/module.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/ioport.h>
 #include <linux/wait.h>
diff --git a/drivers/pci/hotplug/cpci_hotplug_core.c b/drivers/pci/hotplug/cpci_hotplug_core.c
index 7ec8a8f72c69..95f689f53920 100644
--- a/drivers/pci/hotplug/cpci_hotplug_core.c
+++ b/drivers/pci/hotplug/cpci_hotplug_core.c
@@ -27,6 +27,7 @@
 
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/pci.h>
 #include <linux/pci_hotplug.h>
diff --git a/drivers/pci/hotplug/cpqphp.h b/drivers/pci/hotplug/cpqphp.h
index 9103a7b9f3b9..48c8a066a6b7 100644
--- a/drivers/pci/hotplug/cpqphp.h
+++ b/drivers/pci/hotplug/cpqphp.h
@@ -32,7 +32,7 @@
 #include <asm/io.h>		/* for read? and write? functions */
 #include <linux/delay.h>	/* for delays */
 #include <linux/mutex.h>
-#include <linux/sched.h>	/* for signal_pending() */
+#include <linux/sched/signal.h>	/* for signal_pending() */
 
 #define MY_NAME	"cpqphp"
 
diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h
index 37d70b5ad22f..06109d40c4ac 100644
--- a/drivers/pci/hotplug/pciehp.h
+++ b/drivers/pci/hotplug/pciehp.h
@@ -33,7 +33,7 @@
 #include <linux/pci.h>
 #include <linux/pci_hotplug.h>
 #include <linux/delay.h>
-#include <linux/sched.h>		/* signal_pending() */
+#include <linux/sched/signal.h>		/* signal_pending() */
 #include <linux/pcieport_if.h>
 #include <linux/mutex.h>
 #include <linux/workqueue.h>
diff --git a/drivers/pci/hotplug/shpchp.h b/drivers/pci/hotplug/shpchp.h
index 4da8fc601467..70c7ea6af034 100644
--- a/drivers/pci/hotplug/shpchp.h
+++ b/drivers/pci/hotplug/shpchp.h
@@ -33,7 +33,7 @@
 #include <linux/pci.h>
 #include <linux/pci_hotplug.h>
 #include <linux/delay.h>
-#include <linux/sched.h>	/* signal_pending(), struct timer_list */
+#include <linux/sched/signal.h>	/* signal_pending(), struct timer_list */
 #include <linux/mutex.h>
 #include <linux/workqueue.h>
 
diff --git a/drivers/rtc/rtc-dev.c b/drivers/rtc/rtc-dev.c
index a6d9434addf6..6dc8f29697ab 100644
--- a/drivers/rtc/rtc-dev.c
+++ b/drivers/rtc/rtc-dev.c
@@ -15,7 +15,7 @@
 
 #include <linux/module.h>
 #include <linux/rtc.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include "rtc-core.h"
 
 static dev_t rtc_devt;
diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c
index 79823ee9c100..b8006ea9099c 100644
--- a/drivers/s390/cio/device.c
+++ b/drivers/s390/cio/device.c
@@ -24,6 +24,7 @@
 #include <linux/delay.h>
 #include <linux/timer.h>
 #include <linux/kernel_stat.h>
+#include <linux/sched/signal.h>
 
 #include <asm/ccwdev.h>
 #include <asm/cio.h>
diff --git a/drivers/scsi/lpfc/lpfc_vport.c b/drivers/scsi/lpfc/lpfc_vport.c
index e18bbc66e83b..4e36998a266c 100644
--- a/drivers/scsi/lpfc/lpfc_vport.c
+++ b/drivers/scsi/lpfc/lpfc_vport.c
@@ -28,6 +28,7 @@
 #include <linux/pci.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
+#include <linux/sched/signal.h>
 
 #include <scsi/scsi.h>
 #include <scsi/scsi_device.h>
diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c
index 75ac662793a3..c47f4b349bac 100644
--- a/drivers/scsi/osst.c
+++ b/drivers/scsi/osst.c
@@ -35,7 +35,7 @@ static const char * osst_version = "0.99.4";
 
 #include <linux/fs.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/proc_fs.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 81212d4bd9bf..e5ef78a6848e 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -23,7 +23,7 @@ static const char *verstr = "20160209";
 
 #include <linux/fs.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/mm.h>
 #include <linux/init.h>
 #include <linux/string.h>
diff --git a/drivers/soc/fsl/qbman/dpaa_sys.h b/drivers/soc/fsl/qbman/dpaa_sys.h
index 2eaf3184f61d..2ce394aa4c95 100644
--- a/drivers/soc/fsl/qbman/dpaa_sys.h
+++ b/drivers/soc/fsl/qbman/dpaa_sys.h
@@ -36,6 +36,7 @@
 #include <linux/module.h>
 #include <linux/interrupt.h>
 #include <linux/kthread.h>
+#include <linux/sched/signal.h>
 #include <linux/vmalloc.h>
 #include <linux/platform_device.h>
 #include <linux/of.h>
diff --git a/drivers/staging/comedi/comedi_fops.c b/drivers/staging/comedi/comedi_fops.c
index 57e8599b54e6..8deac8d9225d 100644
--- a/drivers/staging/comedi/comedi_fops.c
+++ b/drivers/staging/comedi/comedi_fops.c
@@ -23,7 +23,7 @@
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/fcntl.h>
 #include <linux/delay.h>
 #include <linux/mm.h>
diff --git a/drivers/staging/dgnc/dgnc_tty.c b/drivers/staging/dgnc/dgnc_tty.c
index c63e591631f6..c3b8fc54883d 100644
--- a/drivers/staging/dgnc/dgnc_tty.c
+++ b/drivers/staging/dgnc/dgnc_tty.c
@@ -19,7 +19,7 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/sched.h>	/* For jiffies, task states */
+#include <linux/sched/signal.h>	/* For jiffies, task states, etc. */
 #include <linux/interrupt.h>	/* For tasklet and interrupt structs/defines */
 #include <linux/module.h>
 #include <linux/ctype.h>
diff --git a/drivers/staging/dgnc/dgnc_utils.c b/drivers/staging/dgnc/dgnc_utils.c
index 95272f4765fc..6f59240024d1 100644
--- a/drivers/staging/dgnc/dgnc_utils.c
+++ b/drivers/staging/dgnc/dgnc_utils.c
@@ -1,5 +1,5 @@
 #include <linux/tty.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include "dgnc_utils.h"
 
 /*
diff --git a/drivers/staging/greybus/uart.c b/drivers/staging/greybus/uart.c
index ab0dbf5cab5a..43255e2e9276 100644
--- a/drivers/staging/greybus/uart.c
+++ b/drivers/staging/greybus/uart.c
@@ -14,7 +14,7 @@
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/module.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/wait.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_user.h b/drivers/staging/lustre/lustre/include/lustre/lustre_user.h
index 21aec0ca9ad3..7d8628ce0d3b 100644
--- a/drivers/staging/lustre/lustre/include/lustre/lustre_user.h
+++ b/drivers/staging/lustre/lustre/include/lustre/lustre_user.h
@@ -44,6 +44,7 @@
 
 #ifdef __KERNEL__
 # include <linux/quota.h>
+# include <linux/sched/signal.h>
 # include <linux/string.h> /* snprintf() */
 # include <linux/version.h>
 #else /* !__KERNEL__ */
diff --git a/drivers/staging/lustre/lustre/include/lustre_lib.h b/drivers/staging/lustre/lustre/include/lustre_lib.h
index 27f3148c4344..b04d613846ee 100644
--- a/drivers/staging/lustre/lustre/include/lustre_lib.h
+++ b/drivers/staging/lustre/lustre/include/lustre_lib.h
@@ -42,7 +42,7 @@
  * @{
  */
 
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/signal.h>
 #include <linux/types.h>
 #include "../../include/linux/libcfs/libcfs.h"
diff --git a/drivers/staging/lustre/lustre/include/obd_support.h b/drivers/staging/lustre/lustre/include/obd_support.h
index aaedec7d793c..dace6591a0a4 100644
--- a/drivers/staging/lustre/lustre/include/obd_support.h
+++ b/drivers/staging/lustre/lustre/include/obd_support.h
@@ -34,6 +34,8 @@
 #define _OBD_SUPPORT
 
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
+
 #include "../../include/linux/libcfs/libcfs.h"
 #include "lustre_compat.h"
 #include "lprocfs_status.h"
diff --git a/drivers/staging/media/lirc/lirc_sir.c b/drivers/staging/media/lirc/lirc_sir.c
index c75ae43095ba..c6c3de94adaa 100644
--- a/drivers/staging/media/lirc/lirc_sir.c
+++ b/drivers/staging/media/lirc/lirc_sir.c
@@ -36,7 +36,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/module.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/errno.h>
 #include <linux/signal.h>
 #include <linux/fs.h>
diff --git a/drivers/staging/media/lirc/lirc_zilog.c b/drivers/staging/media/lirc/lirc_zilog.c
index 34aac3e2eb87..e4a533b6beb3 100644
--- a/drivers/staging/media/lirc/lirc_zilog.c
+++ b/drivers/staging/media/lirc/lirc_zilog.c
@@ -42,7 +42,7 @@
 #include <linux/module.h>
 #include <linux/kmod.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/fs.h>
 #include <linux/poll.h>
 #include <linux/string.h>
diff --git a/drivers/staging/speakup/speakup_soft.c b/drivers/staging/speakup/speakup_soft.c
index ff68a384f9c2..d2ff0afd685a 100644
--- a/drivers/staging/speakup/speakup_soft.c
+++ b/drivers/staging/speakup/speakup_soft.c
@@ -22,7 +22,7 @@
 #include <linux/unistd.h>
 #include <linux/miscdevice.h>	/* for misc_register, and SYNTH_MINOR */
 #include <linux/poll.h>		/* for poll_wait() */
-#include <linux/sched.h> /* schedule(), signal_pending(), TASK_INTERRUPTIBLE */
+#include <linux/sched/signal.h> /* schedule(), signal_pending(), TASK_INTERRUPTIBLE */
 
 #include "spk_priv.h"
 #include "speakup.h"
diff --git a/drivers/target/iscsi/cxgbit/cxgbit_target.c b/drivers/target/iscsi/cxgbit/cxgbit_target.c
index 8bcb9b71f764..0c3e8fce3695 100644
--- a/drivers/target/iscsi/cxgbit/cxgbit_target.c
+++ b/drivers/target/iscsi/cxgbit/cxgbit_target.c
@@ -8,6 +8,8 @@
 
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
+#include <linux/sched/signal.h>
+
 #include <asm/unaligned.h>
 #include <net/tcp.h>
 #include <target/target_core_base.h>
diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c
index f3932baed07d..55577cf9b6a4 100644
--- a/drivers/tty/n_gsm.c
+++ b/drivers/tty/n_gsm.c
@@ -39,7 +39,7 @@
 #include <linux/errno.h>
 #include <linux/signal.h>
 #include <linux/fcntl.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/interrupt.h>
 #include <linux/tty.h>
 #include <linux/ctype.h>
diff --git a/drivers/tty/serial/crisv10.c b/drivers/tty/serial/crisv10.c
index e92c23470e51..59a2a7e18b5a 100644
--- a/drivers/tty/serial/crisv10.c
+++ b/drivers/tty/serial/crisv10.c
@@ -12,7 +12,7 @@ static char *serial_version = "$Revision: 1.25 $";
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/signal.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/timer.h>
 #include <linux/interrupt.h>
 #include <linux/tty.h>
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 9939c3d9912b..3fe56894974a 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -24,6 +24,7 @@
 #include <linux/tty.h>
 #include <linux/tty_flip.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
 #include <linux/init.h>
 #include <linux/console.h>
 #include <linux/of.h>
diff --git a/drivers/tty/tty_ioctl.c b/drivers/tty/tty_ioctl.c
index f27fc0f14c11..a9a978731c5b 100644
--- a/drivers/tty/tty_ioctl.c
+++ b/drivers/tty/tty_ioctl.c
@@ -9,7 +9,7 @@
 #include <linux/types.h>
 #include <linux/termios.h>
 #include <linux/errno.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/kernel.h>
 #include <linux/major.h>
 #include <linux/tty.h>
diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c
index 5cd3cd932293..1d21a9c1d33e 100644
--- a/drivers/tty/tty_port.c
+++ b/drivers/tty/tty_port.c
@@ -11,7 +11,7 @@
 #include <linux/timer.h>
 #include <linux/string.h>
 #include <linux/slab.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/wait.h>
 #include <linux/bitops.h>
 #include <linux/delay.h>
diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c
index 31d95dc9c202..60ce7fd54e89 100644
--- a/drivers/uio/uio.c
+++ b/drivers/uio/uio.c
@@ -20,7 +20,7 @@
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/idr.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/string.h>
 #include <linux/kobject.h>
 #include <linux/cdev.h>
diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c
index 235e305f8473..d5388938bc7a 100644
--- a/drivers/usb/class/cdc-acm.c
+++ b/drivers/usb/class/cdc-acm.c
@@ -32,6 +32,7 @@
 #undef VERBOSE_DEBUG
 
 #include <linux/kernel.h>
+#include <linux/sched/signal.h>
 #include <linux/errno.h>
 #include <linux/init.h>
 #include <linux/slab.h>
diff --git a/drivers/usb/class/usblp.c b/drivers/usb/class/usblp.c
index 071964c7847f..cc61055fb9be 100644
--- a/drivers/usb/class/usblp.c
+++ b/drivers/usb/class/usblp.c
@@ -49,7 +49,7 @@
 
 #include <linux/module.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/signal.h>
 #include <linux/poll.h>
 #include <linux/slab.h>
diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
index 87fccf611b69..a5b7cd615698 100644
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -23,6 +23,7 @@
 #include <linux/export.h>
 #include <linux/hid.h>
 #include <linux/module.h>
+#include <linux/sched/signal.h>
 #include <linux/uio.h>
 #include <asm/unaligned.h>
 
diff --git a/drivers/usb/image/mdc800.c b/drivers/usb/image/mdc800.c
index 5cf2633cdb04..e92540a21b6b 100644
--- a/drivers/usb/image/mdc800.c
+++ b/drivers/usb/image/mdc800.c
@@ -85,7 +85,7 @@
  * (20/10/1999)
  */
 
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/signal.h>
 #include <linux/spinlock.h>
 #include <linux/errno.h>
diff --git a/drivers/usb/misc/adutux.c b/drivers/usb/misc/adutux.c
index c5fa584d8f0a..db9a9e6ff6be 100644
--- a/drivers/usb/misc/adutux.c
+++ b/drivers/usb/misc/adutux.c
@@ -21,6 +21,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/kernel.h>
+#include <linux/sched/signal.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/module.h>
diff --git a/drivers/usb/misc/idmouse.c b/drivers/usb/misc/idmouse.c
index debc1fd74b0d..8b9fd7534f69 100644
--- a/drivers/usb/misc/idmouse.c
+++ b/drivers/usb/misc/idmouse.c
@@ -17,6 +17,7 @@
 */
 
 #include <linux/kernel.h>
+#include <linux/sched/signal.h>
 #include <linux/errno.h>
 #include <linux/delay.h>
 #include <linux/slab.h>
diff --git a/drivers/usb/misc/rio500.c b/drivers/usb/misc/rio500.c
index fc329c98a6e8..b106ce76997b 100644
--- a/drivers/usb/misc/rio500.c
+++ b/drivers/usb/misc/rio500.c
@@ -31,7 +31,7 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/mutex.h>
 #include <linux/errno.h>
 #include <linux/random.h>
diff --git a/drivers/usb/misc/uss720.c b/drivers/usb/misc/uss720.c
index 0a643fa74cab..e45a3a680db8 100644
--- a/drivers/usb/misc/uss720.c
+++ b/drivers/usb/misc/uss720.c
@@ -50,6 +50,7 @@
 #include <linux/completion.h>
 #include <linux/kref.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
 
 /*
  * Version Information
diff --git a/drivers/usb/mon/mon_bin.c b/drivers/usb/mon/mon_bin.c
index 9fb8b1e6ecc2..b6d8bf475c92 100644
--- a/drivers/usb/mon/mon_bin.c
+++ b/drivers/usb/mon/mon_bin.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/sched/signal.h>
 #include <linux/types.h>
 #include <linux/fs.h>
 #include <linux/cdev.h>
diff --git a/drivers/usb/mon/mon_text.c b/drivers/usb/mon/mon_text.c
index db1a4abf2806..19c416d69eb9 100644
--- a/drivers/usb/mon/mon_text.c
+++ b/drivers/usb/mon/mon_text.c
@@ -8,6 +8,7 @@
 #include <linux/list.h>
 #include <linux/usb.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
 #include <linux/time.h>
 #include <linux/ktime.h>
 #include <linux/export.h>
diff --git a/drivers/usb/serial/digi_acceleport.c b/drivers/usb/serial/digi_acceleport.c
index eb433922598c..ab78111e0968 100644
--- a/drivers/usb/serial/digi_acceleport.c
+++ b/drivers/usb/serial/digi_acceleport.c
@@ -27,6 +27,7 @@
 #include <linux/uaccess.h>
 #include <linux/usb.h>
 #include <linux/wait.h>
+#include <linux/sched/signal.h>
 #include <linux/usb/serial.h>
 
 /* Defines */
diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c
index 944de657a07a..49ce2be90fa0 100644
--- a/drivers/usb/serial/generic.c
+++ b/drivers/usb/serial/generic.c
@@ -10,6 +10,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/sched/signal.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/sysrq.h>
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 5c98ad4d2f4c..9b519897cc17 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -18,6 +18,7 @@
 #include <linux/file.h>
 #include <linux/slab.h>
 #include <linux/sched/clock.h>
+#include <linux/sched/signal.h>
 #include <linux/vmalloc.h>
 
 #include <linux/net.h>
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 4a00140a7624..dcbe2e29bf17 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -28,6 +28,7 @@
 #include <linux/module.h>
 #include <linux/sort.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
 #include <linux/interval_tree_generic.h>
 
 #include "vhost.h"
diff --git a/drivers/video/fbdev/cobalt_lcdfb.c b/drivers/video/fbdev/cobalt_lcdfb.c
index 038ac6934fe9..9da90bd242f4 100644
--- a/drivers/video/fbdev/cobalt_lcdfb.c
+++ b/drivers/video/fbdev/cobalt_lcdfb.c
@@ -26,6 +26,7 @@
 #include <linux/uaccess.h>
 #include <linux/platform_device.h>
 #include <linux/module.h>
+#include <linux/sched/signal.h>
 
 /*
  * Cursor position address
diff --git a/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c b/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c
index 8b810696a42b..fd2b372d0264 100644
--- a/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c
+++ b/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c
@@ -19,7 +19,7 @@
 #include <linux/jiffies.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/workqueue.h>
 #include <linux/of_device.h>
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 95f42872b787..4f6b00efc27c 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -10,6 +10,8 @@
  */
 
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
+
 #include <net/sock.h>
 #include <net/af_rxrpc.h>
 #include <rxrpc/packet.h>
diff --git a/fs/aio.c b/fs/aio.c
index 7e2ab9c8e39c..0bb108476de2 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -20,7 +20,7 @@
 #include <linux/backing-dev.h>
 #include <linux/uio.h>
 
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/mm.h>
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 105d4d43993e..a8812d95359d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -20,6 +20,7 @@
 #define __BTRFS_CTREE__
 
 #include <linux/mm.h>
+#include <linux/sched/signal.h>
 #include <linux/highmem.h>
 #include <linux/fs.h>
 #include <linux/rwsem.h>
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index cd966f276a8d..68c78be19d5b 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2,7 +2,7 @@
 
 #include <linux/fs.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/wait.h>
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 7ab5be7944aa..1858fc20eb7d 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -23,6 +23,8 @@
 #include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/freezer.h>
+#include <linux/sched/signal.h>
+
 #include <asm/div64.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 822629126e89..f40e3953e7fe 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -22,7 +22,7 @@
 #include <linux/kernel.h>
 #include <linux/major.h>
 #include <linux/time.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/ioport.h>
 #include <linux/fcntl.h>
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 1ce908c2232c..23488f559cf9 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -17,6 +17,7 @@
 #include <linux/dlm.h>
 #include <linux/dlm_device.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
 
 #include "dlm_internal.h"
 #include "lockspace.h"
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 158a3a39f82d..039e627194a9 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -22,6 +22,8 @@
 
 #include <linux/fs.h>
 #include <linux/pagemap.h>
+#include <linux/sched/signal.h>
+
 #include "ecryptfs_kernel.h"
 
 /**
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 1231cd1999d8..68b9fffcb2c8 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -9,7 +9,7 @@
 #include <linux/poll.h>
 #include <linux/init.h>
 #include <linux/fs.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/list.h>
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 5ec16313da1a..341251421ced 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -13,7 +13,7 @@
 
 #include <linux/init.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/signal.h>
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2fd17e8e4984..77798a46b0c6 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -28,6 +28,7 @@
 #include <linux/timer.h>
 #include <linux/version.h>
 #include <linux/wait.h>
+#include <linux/sched/signal.h>
 #include <linux/blockgroup_lock.h>
 #include <linux/percpu_counter.h>
 #include <linux/ratelimit.h>
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 1375fef11146..1602b4bccae6 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -22,6 +22,7 @@
 #include <linux/mm.h>
 #include <linux/memcontrol.h>
 #include <linux/cleancache.h>
+#include <linux/sched/signal.h>
 
 #include "f2fs.h"
 #include "node.h"
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index f11792672977..b681b43c766e 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -11,6 +11,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/poll.h>
+#include <linux/sched/signal.h>
 #include <linux/uio.h>
 #include <linux/miscdevice.h>
 #include <linux/pagemap.h>
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 8b907c5cc913..0515f0a68637 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -15,6 +15,7 @@
 #include <linux/types.h>
 #include <linux/delay.h>
 #include <linux/gfs2_ondisk.h>
+#include <linux/sched/signal.h>
 
 #include "incore.h"
 #include "glock.h"
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index e3ee387a6dfe..361796a84fce 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -10,7 +10,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/bio.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index aebb78f9e47f..d352f3a6af7f 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -18,7 +18,7 @@
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/slab.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/blkdev.h>
 #include <asm/unaligned.h>
 
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 54de77e78775..8f96461236f6 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -11,7 +11,7 @@
 
 #include <linux/thread_info.h>
 #include <asm/current.h>
-#include <linux/sched.h>		/* remove ASAP */
+#include <linux/sched/signal.h>		/* remove ASAP */
 #include <linux/falloc.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index cda0774c2c9c..a7bbe879cfc3 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -14,7 +14,7 @@
 #include <linux/kernel.h>
 #include <linux/mtd/mtd.h>
 #include <linux/compiler.h>
-#include <linux/sched.h> /* For cond_resched() */
+#include <linux/sched/signal.h>
 #include "nodelist.h"
 #include "debug.h"
 
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 5ca4d96b1942..685565b229c3 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -15,7 +15,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/time.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 7d18d62e8e07..febed1217b3f 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -30,6 +30,8 @@
 #include <linux/crc32.h>
 #include <linux/pagevec.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
+
 #include "nilfs.h"
 #include "btnode.h"
 #include "page.h"
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 7ebfca6a1427..2b37f2785834 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -14,6 +14,7 @@
 #include <linux/types.h>
 #include <linux/uaccess.h>
 #include <linux/compat.h>
+#include <linux/sched/signal.h>
 
 #include <asm/ioctls.h>
 
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 1cf41c623be1..498d609b26c7 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -30,7 +30,7 @@
 #include <linux/inotify.h>
 #include <linux/kernel.h> /* roundup() */
 #include <linux/namei.h> /* LOOKUP_FOLLOW */
-#include <linux/sched.h> /* struct user */
+#include <linux/sched/signal.h>
 #include <linux/slab.h> /* struct kmem_cache */
 #include <linux/syscalls.h>
 #include <linux/types.h>
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 358ed7e1195a..c4f68c338735 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -24,7 +24,7 @@
 #include <linux/gfp.h>
 #include <linux/pagemap.h>
 #include <linux/pagevec.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/swap.h>
 #include <linux/uio.h>
 #include <linux/writeback.h>
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d4ec0d8961a6..fb15a96df0b6 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -30,6 +30,7 @@
 #include <linux/swap.h>
 #include <linux/quotaops.h>
 #include <linux/blkdev.h>
+#include <linux/sched/signal.h>
 
 #include <cluster/masklog.h>
 
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 32fd261ae13d..a2b19fbdcf46 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -33,6 +33,7 @@
 #include <linux/delay.h>
 #include <linux/err.h>
 #include <linux/debugfs.h>
+#include <linux/sched/signal.h>
 
 #include "cluster/heartbeat.h"
 #include "cluster/nodemanager.h"
diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c
index f70cda2f090d..9cecf4857195 100644
--- a/fs/ocfs2/dlmfs/userdlm.c
+++ b/fs/ocfs2/dlmfs/userdlm.c
@@ -28,6 +28,7 @@
  */
 
 #include <linux/signal.h>
+#include <linux/sched/signal.h>
 
 #include <linux/module.h>
 #include <linux/fs.h>
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 8dce4099a6ca..3b7c937a36b5 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -33,6 +33,7 @@
 #include <linux/seq_file.h>
 #include <linux/time.h>
 #include <linux/quotaops.h>
+#include <linux/sched/signal.h>
 
 #define MLOG_MASK_PREFIX ML_DLM_GLUE
 #include <cluster/masklog.h>
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 70355a9a2596..8948683b367f 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -41,7 +41,7 @@
 #include <linux/uaccess.h>
 #include <linux/atomic.h>
 #include <linux/uio.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/mm.h>
 #include <linux/wait.h>
 #include <linux/dcache.h>
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 53d3f830358f..a34aa7aa2563 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -15,7 +15,7 @@
 #include <linux/xattr.h>
 #include <linux/security.h>
 #include <linux/uaccess.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/cred.h>
 #include <linux/namei.h>
 #include <linux/fdtable.h>
diff --git a/fs/splice.c b/fs/splice.c
index 4ef78aa8ef61..e49336555739 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -33,6 +33,8 @@
 #include <linux/gfp.h>
 #include <linux/socket.h>
 #include <linux/compat.h>
+#include <linux/sched/signal.h>
+
 #include "internal.h"
 
 /*
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 18d12bfff770..973607df579d 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -14,7 +14,7 @@
 
 #include <linux/list.h>
 #include <linux/hashtable.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
 #include <linux/mm.h>
 #include <linux/poll.h>
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 7a989de224f4..592fdf7111cb 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -55,7 +55,7 @@ typedef __u32			xfs_nlink_t;
 #include <linux/file.h>
 #include <linux/swap.h>
 #include <linux/errno.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/bitops.h>
 #include <linux/major.h>
 #include <linux/pagemap.h>
diff --git a/include/drm/drm_os_linux.h b/include/drm/drm_os_linux.h
index 86ab99bc0ac5..35e1482ba8a1 100644
--- a/include/drm/drm_os_linux.h
+++ b/include/drm/drm_os_linux.h
@@ -4,6 +4,7 @@
  */
 
 #include <linux/interrupt.h>	/* For task queue support */
+#include <linux/sched/signal.h>
 #include <linux/delay.h>
 
 #ifndef readq
diff --git a/include/linux/sunrpc/types.h b/include/linux/sunrpc/types.h
index d222f47550af..11a7536c0fd2 100644
--- a/include/linux/sunrpc/types.h
+++ b/include/linux/sunrpc/types.h
@@ -10,6 +10,7 @@
 #define _LINUX_SUNRPC_TYPES_H_
 
 #include <linux/timer.h>
+#include <linux/sched/signal.h>
 #include <linux/workqueue.h>
 #include <linux/sunrpc/debug.h>
 #include <linux/list.h>
diff --git a/include/media/v4l2-ioctl.h b/include/media/v4l2-ioctl.h
index 574ff2ae94be..6cd94e5ee113 100644
--- a/include/media/v4l2-ioctl.h
+++ b/include/media/v4l2-ioctl.h
@@ -12,6 +12,7 @@
 #include <linux/poll.h>
 #include <linux/fs.h>
 #include <linux/mutex.h>
+#include <linux/sched/signal.h>
 #include <linux/compiler.h> /* need __user */
 #include <linux/videodev2.h>
 
diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index b96832df239e..c0452de83086 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -26,6 +26,7 @@
 
 #include <linux/netdevice.h>
 #include <linux/sched/clock.h>
+#include <linux/sched/signal.h>
 #include <net/ip.h>
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 57f6311e2405..82ff5726bc1b 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -19,7 +19,7 @@
  */
 #include <linux/mutex.h>
 #include <linux/ww_mutex.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/rt.h>
 #include <linux/sched/wake_q.h>
 #include <linux/export.h>
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index d4f798491361..1c8c8707853a 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -12,7 +12,7 @@
  */
 #include <linux/spinlock.h>
 #include <linux/export.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/rt.h>
 #include <linux/sched/deadline.h>
 #include <linux/sched/wake_q.h>
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 5eacab880f67..91e7bc8e9d5a 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -6,7 +6,7 @@
  * - Derived also from comments by Linus
  */
 #include <linux/rwsem.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/export.h>
 
 enum rwsem_waiter_type {
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 4fe8d8ad4396..9bd6e768164e 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -12,7 +12,7 @@
 #include <linux/rwsem.h>
 #include <linux/init.h>
 #include <linux/export.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/rt.h>
 #include <linux/sched/wake_q.h>
 #include <linux/osq_lock.h>
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 6a28b79710f0..cccc417a8135 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -32,7 +32,7 @@
 #include <linux/smp.h>
 #include <linux/rcupdate.h>
 #include <linux/interrupt.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <uapi/linux/sched/types.h>
 #include <linux/atomic.h>
 #include <linux/bitops.h>
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index f063a25d4449..b294a8ee2842 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -11,7 +11,7 @@
  * Waiting for completion is a typically sync point, but not an exclusion point.
  */
 
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/completion.h>
 
 /**
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 82f0dff90030..3d5610dcce11 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -1,4 +1,4 @@
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/swait.h>
 
 void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 9453efe9b25a..1fedfcf6fc9b 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -5,7 +5,7 @@
  */
 #include <linux/init.h>
 #include <linux/export.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/mm.h>
 #include <linux/wait.h>
 #include <linux/hash.h>
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index e6dc9a538efa..04939053c823 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -19,6 +19,7 @@
 #include <linux/hrtimer.h>
 #include <linux/timerqueue.h>
 #include <linux/rtc.h>
+#include <linux/sched/signal.h>
 #include <linux/alarmtimer.h>
 #include <linux/mutex.h>
 #include <linux/platform_device.h>
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 8e11d8d9f419..df512e8c04d4 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -43,7 +43,7 @@
 #include <linux/seq_file.h>
 #include <linux/err.h>
 #include <linux/debugobjects.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/sysctl.h>
 #include <linux/sched/rt.h>
 #include <linux/sched/deadline.h>
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 82a6bfa0c307..f4465d2a25d1 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -38,7 +38,7 @@
 #include <linux/tick.h>
 #include <linux/kallsyms.h>
 #include <linux/irq_work.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/sysctl.h>
 #include <linux/slab.h>
 #include <linux/compat.h>
diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c
index 6d40944960de..010410990bc6 100644
--- a/lib/percpu_ida.c
+++ b/lib/percpu_ida.c
@@ -22,7 +22,7 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/percpu.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/string.h>
 #include <linux/spinlock.h>
 #include <linux/percpu_ida.h>
diff --git a/mm/compaction.c b/mm/compaction.c
index 0fdfde016ee2..81e1eaa2a2cf 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -12,6 +12,7 @@
 #include <linux/migrate.h>
 #include <linux/compaction.h>
 #include <linux/mm_inline.h>
+#include <linux/sched/signal.h>
 #include <linux/backing-dev.h>
 #include <linux/sysctl.h>
 #include <linux/sysfs.h>
diff --git a/mm/gup.c b/mm/gup.c
index 94fab8fa432b..9c047e951aa3 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -10,7 +10,7 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/rwsem.h>
 #include <linux/hugetlb.h>
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2e0e8159ce8e..a7aa811b7d14 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -18,6 +18,7 @@
 #include <linux/bootmem.h>
 #include <linux/sysfs.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 1d3ed58f92ab..295479b792ec 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -6,6 +6,7 @@
 
 #include <linux/stddef.h>
 #include <linux/mm.h>
+#include <linux/sched/signal.h>
 #include <linux/swap.h>
 #include <linux/interrupt.h>
 #include <linux/pagemap.h>
diff --git a/mm/shmem.c b/mm/shmem.c
index a26649a6633f..de8cdef4ef9b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -29,6 +29,7 @@
 #include <linux/pagemap.h>
 #include <linux/file.h>
 #include <linux/mm.h>
+#include <linux/sched/signal.h>
 #include <linux/export.h>
 #include <linux/swap.h>
 #include <linux/uio.h>
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 9f0ad2a4f102..479e631d43c2 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/mm.h>
+#include <linux/sched/signal.h>
 #include <linux/pagemap.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 878563a8354d..db9794ec61d8 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -10,7 +10,7 @@
 #include <linux/kernel.h>	/* printk */
 #include <linux/skbuff.h>
 #include <linux/wait.h>
-#include <linux/sched.h>	/* jiffies and HZ */
+#include <linux/sched/signal.h>
 #include <linux/fcntl.h>	/* O_NONBLOCK */
 #include <linux/init.h>
 #include <linux/atm.h>		/* ATM stuff */
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index cfb2faba46de..69e1f7d362a8 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -27,6 +27,8 @@
 #include <linux/module.h>
 #include <linux/debugfs.h>
 #include <linux/stringify.h>
+#include <linux/sched/signal.h>
+
 #include <asm/ioctls.h>
 
 #include <net/bluetooth/bluetooth.h>
diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c
index 46ac686c8911..bb308224099c 100644
--- a/net/bluetooth/cmtp/capi.c
+++ b/net/bluetooth/cmtp/capi.c
@@ -26,7 +26,7 @@
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/poll.h>
 #include <linux/fcntl.h>
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index 1015d9c8d97d..b5faff458d8b 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -21,6 +21,8 @@
    SOFTWARE IS DISCLAIMED.
 */
 
+#include <linux/sched/signal.h>
+
 #include <net/bluetooth/bluetooth.h>
 #include <net/bluetooth/hci_core.h>
 #include <net/bluetooth/mgmt.h>
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index a8ba752732c9..f307b145ea54 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -29,6 +29,7 @@
 
 #include <linux/module.h>
 #include <linux/export.h>
+#include <linux/sched/signal.h>
 
 #include <net/bluetooth/bluetooth.h>
 #include <net/bluetooth/hci_core.h>
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 7511df72347f..aa1a814ceddc 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -27,6 +27,7 @@
 
 #include <linux/export.h>
 #include <linux/debugfs.h>
+#include <linux/sched/signal.h>
 
 #include <net/bluetooth/bluetooth.h>
 #include <net/bluetooth/hci_core.h>
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 3125ce670c2f..e4e9a2da1e7e 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -27,6 +27,7 @@
 #include <linux/module.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
+#include <linux/sched/signal.h>
 
 #include <net/bluetooth/bluetooth.h>
 #include <net/bluetooth/hci_core.h>
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index 0f4034934d56..0b5dd607444c 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -19,6 +19,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/spinlock.h>
 #include <linux/times.h>
+#include <linux/sched/signal.h>
 
 #include "br_private.h"
 
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index 05e8946ccc03..79aee759aba5 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -17,6 +17,7 @@
 #include <linux/if_bridge.h>
 #include <linux/rtnetlink.h>
 #include <linux/spinlock.h>
+#include <linux/sched/signal.h>
 
 #include "br_private.h"
 
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index be7bab1adcde..aecb2c7241b6 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -24,7 +24,7 @@
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
 #include <linux/rtnetlink.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/net.h>
 
 /*
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index b0c04cf4851d..3945821e9c1f 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -15,6 +15,7 @@
 #include <net/switchdev.h>
 #include <linux/if_arp.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
 #include <linux/nsproxy.h>
 #include <net/sock.h>
 #include <net/net_namespace.h>
diff --git a/net/dccp/output.c b/net/dccp/output.c
index b66c84db0766..91a15b3c4915 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -14,6 +14,7 @@
 #include <linux/kernel.h>
 #include <linux/skbuff.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
 
 #include <net/inet_sock.h>
 #include <net/sock.h>
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 5d367b7ff542..cebedd545e5e 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -32,6 +32,7 @@
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
+#include <linux/sched/signal.h>
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/socket.h>
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 3a2025f5bf2c..77362b88a661 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -43,6 +43,7 @@
 #include <linux/errno.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
+#include <linux/sched/signal.h>
 #include <linux/socket.h>
 #include <linux/sockios.h>
 #include <linux/net.h>
diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c
index 817b1b186aff..f6061c4bb0a8 100644
--- a/net/irda/ircomm/ircomm_tty.c
+++ b/net/irda/ircomm/ircomm_tty.c
@@ -32,7 +32,7 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/seq_file.h>
 #include <linux/termios.h>
 #include <linux/tty.h>
diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c
index 35dbf3dc3d28..7025dcb853d0 100644
--- a/net/irda/irnet/irnet_ppp.c
+++ b/net/irda/irnet/irnet_ppp.c
@@ -13,8 +13,9 @@
  *	2) as a control channel (write commands, read events)
  */
 
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
+
 #include "irnet_ppp.h"		/* Private header */
 /* Please put other headers in irnet.h - Thanks */
 
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 13190b38f22e..89bbde1081ce 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -17,7 +17,7 @@
 #include <linux/list.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/skbuff.h>
 #include <linux/init.h>
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index a646f3481240..309062f3debe 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -24,6 +24,8 @@
 #include <linux/uaccess.h>
 #include <linux/workqueue.h>
 #include <linux/syscalls.h>
+#include <linux/sched/signal.h>
+
 #include <net/kcm.h>
 #include <net/netns/generic.h>
 #include <net/sock.h>
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 5e9296382420..06186d608a27 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -26,6 +26,8 @@
 #include <linux/rtnetlink.h>
 #include <linux/init.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
+
 #include <net/llc.h>
 #include <net/llc_sap.h>
 #include <net/llc_pdu.h>
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index b9edf5fae6ae..879885b31cce 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -21,6 +21,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/nfc.h>
+#include <linux/sched/signal.h>
 
 #include "nfc.h"
 #include "llcp.h"
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index 8bad5624a27a..222bedcd9575 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -23,6 +23,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/socket.h>
 #include <net/sock.h>
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index ffd5f2297584..a6c8da3ee893 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -27,6 +27,8 @@
 #include <linux/kernel.h>
 #include <linux/net.h>
 #include <linux/poll.h>
+#include <linux/sched/signal.h>
+
 #include <net/sock.h>
 #include <net/tcp_states.h>
 
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 40a1ef2adeb4..c3be03e8d098 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -76,6 +76,8 @@
 #include <linux/slab.h>
 #include <linux/idr.h>
 #include <linux/timer.h>
+#include <linux/sched/signal.h>
+
 #include "ar-internal.h"
 
 __read_mostly unsigned int rxrpc_max_client_connections = 1000;
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index f3a688e10843..28274a3c9831 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -14,6 +14,8 @@
 #include <linux/net.h>
 #include <linux/skbuff.h>
 #include <linux/export.h>
+#include <linux/sched/signal.h>
+
 #include <net/sock.h>
 #include <net/af_rxrpc.h>
 #include "ar-internal.h"
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index 0a6ef217aa8a..19b36c60fb4c 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -15,6 +15,8 @@
 #include <linux/gfp.h>
 #include <linux/skbuff.h>
 #include <linux/export.h>
+#include <linux/sched/signal.h>
+
 #include <net/sock.h>
 #include <net/af_rxrpc.h>
 #include "ar-internal.h"
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 6b09a778cc71..43e4045e72bc 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -35,6 +35,8 @@
  */
 
 #include <linux/rhashtable.h>
+#include <linux/sched/signal.h>
+
 #include "core.h"
 #include "name_table.h"
 #include "node.h"
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 8a398b3fb532..9192ead66751 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -90,6 +90,7 @@
 #include <linux/init.h>
 #include <linux/io.h>
 #include <linux/kernel.h>
+#include <linux/sched/signal.h>
 #include <linux/kmod.h>
 #include <linux/list.h>
 #include <linux/miscdevice.h>
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 849c4ad0411e..8d592a45b597 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -9,6 +9,7 @@
  */
 #include <linux/spinlock.h>
 #include <linux/module.h>
+#include <linux/sched/signal.h>
 #include <linux/ctype.h>
 #include <linux/list.h>
 #include <linux/virtio.h>
diff --git a/sound/core/control.c b/sound/core/control.c
index fb096cb20a80..c109b82eef4b 100644
--- a/sound/core/control.c
+++ b/sound/core/control.c
@@ -25,6 +25,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/time.h>
+#include <linux/sched/signal.h>
 #include <sound/core.h>
 #include <sound/minors.h>
 #include <sound/info.h>
diff --git a/sound/core/hwdep.c b/sound/core/hwdep.c
index 36d2416f90d9..9602a7e38d8a 100644
--- a/sound/core/hwdep.c
+++ b/sound/core/hwdep.c
@@ -25,6 +25,7 @@
 #include <linux/time.h>
 #include <linux/mutex.h>
 #include <linux/module.h>
+#include <linux/sched/signal.h>
 #include <sound/core.h>
 #include <sound/control.h>
 #include <sound/minors.h>
diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c
index 698a01419515..36baf962f9b0 100644
--- a/sound/core/oss/pcm_oss.c
+++ b/sound/core/oss/pcm_oss.c
@@ -28,6 +28,7 @@
 
 #include <linux/init.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
 #include <linux/time.h>
 #include <linux/vmalloc.h>
 #include <linux/module.h>
diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c
index bb1261591a1f..5088d4b8db22 100644
--- a/sound/core/pcm_lib.c
+++ b/sound/core/pcm_lib.c
@@ -21,6 +21,7 @@
  */
 
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
 #include <linux/time.h>
 #include <linux/math64.h>
 #include <linux/export.h>
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index aec9c92250fd..13dec5ec93f2 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -23,6 +23,7 @@
 #include <linux/module.h>
 #include <linux/file.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
 #include <linux/time.h>
 #include <linux/pm_qos.h>
 #include <linux/io.h>
diff --git a/sound/core/rawmidi.c b/sound/core/rawmidi.c
index 8da9cb245d01..ab890336175f 100644
--- a/sound/core/rawmidi.c
+++ b/sound/core/rawmidi.c
@@ -22,7 +22,7 @@
 #include <sound/core.h>
 #include <linux/major.h>
 #include <linux/init.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/wait.h>
diff --git a/sound/core/seq/oss/seq_oss_device.h b/sound/core/seq/oss/seq_oss_device.h
index d7b4d016b547..afa007c0cc2d 100644
--- a/sound/core/seq/oss/seq_oss_device.h
+++ b/sound/core/seq/oss/seq_oss_device.h
@@ -24,7 +24,7 @@
 #include <linux/time.h>
 #include <linux/wait.h>
 #include <linux/slab.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <sound/core.h>
 #include <sound/seq_oss.h>
 #include <sound/rawmidi.h>
diff --git a/sound/core/seq/oss/seq_oss_writeq.c b/sound/core/seq/oss/seq_oss_writeq.c
index 1f6788a18444..5e04f4df10e4 100644
--- a/sound/core/seq/oss/seq_oss_writeq.c
+++ b/sound/core/seq/oss/seq_oss_writeq.c
@@ -28,6 +28,7 @@
 #include "../seq_clientmgr.h"
 #include <linux/wait.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
 
 
 /*
diff --git a/sound/core/seq/seq_fifo.c b/sound/core/seq/seq_fifo.c
index 86240d02b530..448efd4e980e 100644
--- a/sound/core/seq/seq_fifo.c
+++ b/sound/core/seq/seq_fifo.c
@@ -21,6 +21,8 @@
 
 #include <sound/core.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
+
 #include "seq_fifo.h"
 #include "seq_lock.h"
 
diff --git a/sound/core/seq/seq_memory.c b/sound/core/seq/seq_memory.c
index dfa5156f3585..1a1acf3ddda4 100644
--- a/sound/core/seq/seq_memory.c
+++ b/sound/core/seq/seq_memory.c
@@ -23,6 +23,7 @@
 #include <linux/init.h>
 #include <linux/export.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
 #include <linux/vmalloc.h>
 #include <sound/core.h>
 
diff --git a/sound/core/timer.c b/sound/core/timer.c
index ad153149b231..6d4fbc439246 100644
--- a/sound/core/timer.c
+++ b/sound/core/timer.c
@@ -27,6 +27,7 @@
 #include <linux/device.h>
 #include <linux/module.h>
 #include <linux/string.h>
+#include <linux/sched/signal.h>
 #include <sound/core.h>
 #include <sound/timer.h>
 #include <sound/control.h>
diff --git a/sound/firewire/bebob/bebob.h b/sound/firewire/bebob/bebob.h
index 175da875162d..17678d6ab5a2 100644
--- a/sound/firewire/bebob/bebob.h
+++ b/sound/firewire/bebob/bebob.h
@@ -17,6 +17,7 @@
 #include <linux/mod_devicetable.h>
 #include <linux/delay.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
 
 #include <sound/core.h>
 #include <sound/initval.h>
diff --git a/sound/firewire/dice/dice.h b/sound/firewire/dice/dice.h
index e6c07857f475..da00e75e09d4 100644
--- a/sound/firewire/dice/dice.h
+++ b/sound/firewire/dice/dice.h
@@ -23,6 +23,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/wait.h>
+#include <linux/sched/signal.h>
 
 #include <sound/control.h>
 #include <sound/core.h>
diff --git a/sound/firewire/digi00x/digi00x.h b/sound/firewire/digi00x/digi00x.h
index 2cd465c0caae..9dc761bdacca 100644
--- a/sound/firewire/digi00x/digi00x.h
+++ b/sound/firewire/digi00x/digi00x.h
@@ -16,6 +16,7 @@
 #include <linux/mod_devicetable.h>
 #include <linux/delay.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
 
 #include <sound/core.h>
 #include <sound/initval.h>
diff --git a/sound/firewire/fireworks/fireworks.h b/sound/firewire/fireworks/fireworks.h
index d73c12b8753d..9b19c7f05d57 100644
--- a/sound/firewire/fireworks/fireworks.h
+++ b/sound/firewire/fireworks/fireworks.h
@@ -17,6 +17,7 @@
 #include <linux/mod_devicetable.h>
 #include <linux/delay.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
 
 #include <sound/core.h>
 #include <sound/initval.h>
diff --git a/sound/firewire/oxfw/oxfw.h b/sound/firewire/oxfw/oxfw.h
index 2047dcb27625..d54d4a9ac4a1 100644
--- a/sound/firewire/oxfw/oxfw.h
+++ b/sound/firewire/oxfw/oxfw.h
@@ -13,6 +13,7 @@
 #include <linux/mutex.h>
 #include <linux/slab.h>
 #include <linux/compat.h>
+#include <linux/sched/signal.h>
 
 #include <sound/control.h>
 #include <sound/core.h>
diff --git a/sound/firewire/tascam/tascam.h b/sound/firewire/tascam/tascam.h
index 1f61011579a7..d3cd4065722b 100644
--- a/sound/firewire/tascam/tascam.h
+++ b/sound/firewire/tascam/tascam.h
@@ -17,6 +17,7 @@
 #include <linux/mutex.h>
 #include <linux/slab.h>
 #include <linux/compat.h>
+#include <linux/sched/signal.h>
 
 #include <sound/core.h>
 #include <sound/initval.h>
diff --git a/sound/isa/gus/gus_pcm.c b/sound/isa/gus/gus_pcm.c
index 25f6788ccef3..06505999155f 100644
--- a/sound/isa/gus/gus_pcm.c
+++ b/sound/isa/gus/gus_pcm.c
@@ -27,6 +27,8 @@
 
 #include <asm/dma.h>
 #include <linux/slab.h>
+#include <linux/sched/signal.h>
+
 #include <sound/core.h>
 #include <sound/control.h>
 #include <sound/gus.h>
diff --git a/sound/isa/msnd/msnd.c b/sound/isa/msnd/msnd.c
index 835d4aa26761..8109ab3d29d1 100644
--- a/sound/isa/msnd/msnd.c
+++ b/sound/isa/msnd/msnd.c
@@ -36,6 +36,7 @@
  ********************************************************************/
 
 #include <linux/kernel.h>
+#include <linux/sched/signal.h>
 #include <linux/types.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
diff --git a/sound/isa/sb/emu8000.c b/sound/isa/sb/emu8000.c
index 94c411299e5a..ec180708f160 100644
--- a/sound/isa/sb/emu8000.c
+++ b/sound/isa/sb/emu8000.c
@@ -21,7 +21,7 @@
  */
 
 #include <linux/wait.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/ioport.h>
 #include <linux/export.h>
diff --git a/sound/isa/sb/emu8000_patch.c b/sound/isa/sb/emu8000_patch.c
index 71d13c0bb746..c2e41d2762f7 100644
--- a/sound/isa/sb/emu8000_patch.c
+++ b/sound/isa/sb/emu8000_patch.c
@@ -20,6 +20,8 @@
  */
 
 #include "emu8000_local.h"
+
+#include <linux/sched/signal.h>
 #include <linux/uaccess.h>
 #include <linux/moduleparam.h>
 
diff --git a/sound/isa/sb/emu8000_pcm.c b/sound/isa/sb/emu8000_pcm.c
index 250fd0006b53..32f234f494e5 100644
--- a/sound/isa/sb/emu8000_pcm.c
+++ b/sound/isa/sb/emu8000_pcm.c
@@ -19,6 +19,8 @@
  */
 
 #include "emu8000_local.h"
+
+#include <linux/sched/signal.h>
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <sound/initval.h>
diff --git a/sound/isa/wavefront/wavefront_synth.c b/sound/isa/wavefront/wavefront_synth.c
index 718d5e3b7806..4dae9ff9ef5a 100644
--- a/sound/isa/wavefront/wavefront_synth.c
+++ b/sound/isa/wavefront/wavefront_synth.c
@@ -26,6 +26,7 @@
 #include <linux/delay.h>
 #include <linux/time.h>
 #include <linux/wait.h>
+#include <linux/sched/signal.h>
 #include <linux/firmware.h>
 #include <linux/moduleparam.h>
 #include <linux/slab.h>
diff --git a/sound/oss/dmabuf.c b/sound/oss/dmabuf.c
index e3f29132d3ac..c5dd396c66a2 100644
--- a/sound/oss/dmabuf.c
+++ b/sound/oss/dmabuf.c
@@ -27,6 +27,8 @@
 
 #include <linux/mm.h>
 #include <linux/gfp.h>
+#include <linux/sched/signal.h>
+
 #include "sound_config.h"
 #include "sleep.h"
 
diff --git a/sound/oss/dmasound/dmasound_core.c b/sound/oss/dmasound/dmasound_core.c
index 5f248fb41bea..fb3bbceb1fef 100644
--- a/sound/oss/dmasound/dmasound_core.c
+++ b/sound/oss/dmasound/dmasound_core.c
@@ -182,6 +182,7 @@
 #include <linux/soundcard.h>
 #include <linux/poll.h>
 #include <linux/mutex.h>
+#include <linux/sched/signal.h>
 
 #include <linux/uaccess.h>
 
diff --git a/sound/oss/midibuf.c b/sound/oss/midibuf.c
index 8f45cd999965..701c7625c971 100644
--- a/sound/oss/midibuf.c
+++ b/sound/oss/midibuf.c
@@ -16,6 +16,8 @@
 #include <linux/stddef.h>
 #include <linux/kmod.h>
 #include <linux/spinlock.h>
+#include <linux/sched/signal.h>
+
 #define MIDIBUF_C
 
 #include "sound_config.h"
diff --git a/sound/oss/msnd_pinnacle.c b/sound/oss/msnd_pinnacle.c
index a8bb4a06ba6f..f34ec01d2239 100644
--- a/sound/oss/msnd_pinnacle.c
+++ b/sound/oss/msnd_pinnacle.c
@@ -41,6 +41,8 @@
 #include <linux/interrupt.h>
 #include <linux/mutex.h>
 #include <linux/gfp.h>
+#include <linux/sched/signal.h>
+
 #include <asm/irq.h>
 #include <asm/io.h>
 #include "sound_config.h"
diff --git a/sound/oss/sound_config.h b/sound/oss/sound_config.h
index f2554ab78f5e..5253b0a70437 100644
--- a/sound/oss/sound_config.h
+++ b/sound/oss/sound_config.h
@@ -16,6 +16,7 @@
 
 #include <linux/fs.h>
 #include <linux/sound.h>
+#include <linux/sched/signal.h>
 
 #include "os.h"
 #include "soundvers.h"
diff --git a/sound/oss/swarm_cs4297a.c b/sound/oss/swarm_cs4297a.c
index f3af63e58b36..97899352b15f 100644
--- a/sound/oss/swarm_cs4297a.c
+++ b/sound/oss/swarm_cs4297a.c
@@ -64,7 +64,7 @@
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/ioport.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/delay.h>
 #include <linux/sound.h>
 #include <linux/slab.h>
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index dd5de09bf362..ae79f7679cc2 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -32,7 +32,7 @@
 #include <linux/file.h>
 #include <linux/syscore_ops.h>
 #include <linux/cpu.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
 #include <linux/cpumask.h>
 #include <linux/smp.h>
-- 
cgit v1.2.3


From fd7712337ff09a248df424c5843c149586a3f017 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 2 Feb 2017 20:56:33 +0100
Subject: sched/headers: Prepare to remove the <linux/gfp.h> include from
 <linux/sched.h>

<linux/topology.h> is still needed - also update other headers
and .c files that depend on sched.h including gfp.h (and its
sub-headers) for them.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h    | 1 +
 include/linux/sched/mm.h | 1 +
 lib/percpu_ida.c         | 1 +
 3 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e87c97e1a947..8ae7b3d85658 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -56,6 +56,7 @@ struct sched_param {
 #include <linux/llist.h>
 #include <linux/uidgid.h>
 #include <linux/gfp.h>
+#include <linux/topology.h>
 #include <linux/magic.h>
 #include <linux/cgroup-defs.h>
 
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index a7adba1cd0a9..1cf7941bb946 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -2,5 +2,6 @@
 #define _LINUX_SCHED_MM_H
 
 #include <linux/sched.h>
+#include <linux/gfp.h>
 
 #endif /* _LINUX_SCHED_MM_H */
diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c
index 010410990bc6..6016f1deb1f5 100644
--- a/lib/percpu_ida.c
+++ b/lib/percpu_ida.c
@@ -14,6 +14,7 @@
  * General Public License for more details.
  */
 
+#include <linux/mm.h>
 #include <linux/bitmap.h>
 #include <linux/bitops.h>
 #include <linux/bug.h>
-- 
cgit v1.2.3


From 03441a3482a31462c93509939a388877e3cd9261 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 18:51:35 +0100
Subject: sched/headers: Prepare for new header dependencies before moving code
 to <linux/sched/stat.h>

We are going to split <linux/sched/stat.h> out of <linux/sched.h>, which
will have to be picked up from other headers and a couple of .c files.

Create a trivial placeholder <linux/sched/stat.h> file that just
maps to <linux/sched.h> to make this patch obviously correct and
bisectable.

Include the new header in the files that are going to need it.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/powerpc/kvm/book3s_hv.c     | 1 +
 arch/s390/appldata/appldata_os.c | 1 +
 crypto/mcryptd.c                 | 1 +
 drivers/cpuidle/governors/menu.c | 1 +
 fs/proc/loadavg.c                | 1 +
 fs/proc/root.c                   | 1 +
 fs/proc/stat.c                   | 1 +
 include/linux/sched/stat.h       | 6 ++++++
 kernel/debug/kdb/kdb_main.c      | 1 +
 kernel/exit.c                    | 1 +
 kernel/fork.c                    | 1 +
 kernel/sched/sched.h             | 1 +
 kernel/sys.c                     | 1 +
 kernel/time/tick-sched.c         | 1 +
 virt/kvm/kvm_main.c              | 1 +
 15 files changed, 20 insertions(+)
 create mode 100644 include/linux/sched/stat.h

(limited to 'include/linux')

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index fc56a9ce785d..1ec86d9e2a82 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -23,6 +23,7 @@
 #include <linux/slab.h>
 #include <linux/preempt.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/stat.h>
 #include <linux/delay.h>
 #include <linux/export.h>
 #include <linux/fs.h>
diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c
index 079446619f89..45b3178200ab 100644
--- a/arch/s390/appldata/appldata_os.c
+++ b/arch/s390/appldata/appldata_os.c
@@ -18,6 +18,7 @@
 #include <linux/netdevice.h>
 #include <linux/sched.h>
 #include <linux/sched/loadavg.h>
+#include <linux/sched/stat.h>
 #include <asm/appldata.h>
 #include <asm/smp.h>
 
diff --git a/crypto/mcryptd.c b/crypto/mcryptd.c
index c207458d6299..4e6472658852 100644
--- a/crypto/mcryptd.c
+++ b/crypto/mcryptd.c
@@ -24,6 +24,7 @@
 #include <linux/module.h>
 #include <linux/scatterlist.h>
 #include <linux/sched.h>
+#include <linux/sched/stat.h>
 #include <linux/slab.h>
 #include <linux/hardirq.h>
 
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index fe86060e48f7..6d8a4026a903 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -19,6 +19,7 @@
 #include <linux/tick.h>
 #include <linux/sched.h>
 #include <linux/sched/loadavg.h>
+#include <linux/sched/stat.h>
 #include <linux/math64.h>
 #include <linux/cpu.h>
 
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index add5255ce7e0..983fce5c2418 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -4,6 +4,7 @@
 #include <linux/proc_fs.h>
 #include <linux/sched.h>
 #include <linux/sched/loadavg.h>
+#include <linux/sched/stat.h>
 #include <linux/seq_file.h>
 #include <linux/seqlock.h>
 #include <linux/time.h>
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 5d5fed20bfff..a50ba388255f 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -14,6 +14,7 @@
 #include <linux/stat.h>
 #include <linux/init.h>
 #include <linux/sched.h>
+#include <linux/sched/stat.h>
 #include <linux/module.h>
 #include <linux/bitops.h>
 #include <linux/user_namespace.h>
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index e47c3e8c4dfe..b95556e036bb 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -5,6 +5,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/proc_fs.h>
 #include <linux/sched.h>
+#include <linux/sched/stat.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/time.h>
diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h
new file mode 100644
index 000000000000..30fe012aecb0
--- /dev/null
+++ b/include/linux/sched/stat.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SCHED_STAT_H
+#define _LINUX_SCHED_STAT_H
+
+#include <linux/sched.h>
+
+#endif /* _LINUX_SCHED_STAT_H */
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 308937c7a687..bf48a492b4be 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -19,6 +19,7 @@
 #include <linux/reboot.h>
 #include <linux/sched.h>
 #include <linux/sched/loadavg.h>
+#include <linux/sched/stat.h>
 #include <linux/sysrq.h>
 #include <linux/smp.h>
 #include <linux/utsname.h>
diff --git a/kernel/exit.c b/kernel/exit.c
index 48649ccbb649..5bad7dce1b7b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -8,6 +8,7 @@
 #include <linux/slab.h>
 #include <linux/sched/autogroup.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/stat.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/capability.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 8fbe8bcd1e20..8632905f64c5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -17,6 +17,7 @@
 #include <linux/sched/coredump.h>
 #include <linux/sched/user.h>
 #include <linux/sched/numa_balancing.h>
+#include <linux/sched/stat.h>
 #include <linux/rtmutex.h>
 #include <linux/init.h>
 #include <linux/unistd.h>
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 04f376cf7ba9..6c8a1db44dde 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -9,6 +9,7 @@
 #include <linux/sched/numa_balancing.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/cpufreq.h>
+#include <linux/sched/stat.h>
 #include <linux/u64_stats_sync.h>
 #include <linux/sched/deadline.h>
 #include <linux/kernel_stat.h>
diff --git a/kernel/sys.c b/kernel/sys.c
index e2d743fb7f55..8c0392c8be51 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -51,6 +51,7 @@
 #include <linux/sched.h>
 #include <linux/sched/autogroup.h>
 #include <linux/sched/loadavg.h>
+#include <linux/sched/stat.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/coredump.h>
 #include <linux/rcupdate.h>
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 5e9e123b4321..29d79b175141 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -21,6 +21,7 @@
 #include <linux/profile.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/clock.h>
+#include <linux/sched/stat.h>
 #include <linux/module.h>
 #include <linux/irq_work.h>
 #include <linux/posix-timers.h>
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ae79f7679cc2..799499417f5b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -34,6 +34,7 @@
 #include <linux/cpu.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/stat.h>
 #include <linux/cpumask.h>
 #include <linux/smp.h>
 #include <linux/anon_inodes.h>
-- 
cgit v1.2.3


From 370c91355c76cbcaad8ff79b4bb15a7f2ea59433 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 18:51:35 +0100
Subject: sched/headers: Prepare for new header dependencies before moving code
 to <linux/sched/nohz.h>

We are going to split <linux/sched/nohz.h> out of <linux/sched.h>, which
will have to be picked up from other headers and a couple of .c files.

Create a trivial placeholder <linux/sched/nohz.h> file that just
maps to <linux/sched.h> to make this patch obviously correct and
bisectable.

Include the new header in the files that are going to need it.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/nohz.h | 6 ++++++
 kernel/sched/sched.h       | 1 +
 kernel/time/hrtimer.c      | 1 +
 kernel/time/tick-sched.c   | 1 +
 kernel/time/timer.c        | 1 +
 5 files changed, 10 insertions(+)
 create mode 100644 include/linux/sched/nohz.h

(limited to 'include/linux')

diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h
new file mode 100644
index 000000000000..fe6caf0dab10
--- /dev/null
+++ b/include/linux/sched/nohz.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SCHED_NOHZ_H
+#define _LINUX_SCHED_NOHZ_H
+
+#include <linux/sched.h>
+
+#endif /* _LINUX_SCHED_NOHZ_H */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6c8a1db44dde..5979f47c422c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -10,6 +10,7 @@
 #include <linux/sched/mm.h>
 #include <linux/sched/cpufreq.h>
 #include <linux/sched/stat.h>
+#include <linux/sched/nohz.h>
 #include <linux/u64_stats_sync.h>
 #include <linux/sched/deadline.h>
 #include <linux/kernel_stat.h>
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index df512e8c04d4..aed2207f5b2d 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -47,6 +47,7 @@
 #include <linux/sched/sysctl.h>
 #include <linux/sched/rt.h>
 #include <linux/sched/deadline.h>
+#include <linux/sched/nohz.h>
 #include <linux/timer.h>
 #include <linux/freezer.h>
 
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 29d79b175141..7fe53be86077 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -22,6 +22,7 @@
 #include <linux/sched/signal.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/stat.h>
+#include <linux/sched/nohz.h>
 #include <linux/module.h>
 #include <linux/irq_work.h>
 #include <linux/posix-timers.h>
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index f4465d2a25d1..d7999cb06229 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -40,6 +40,7 @@
 #include <linux/irq_work.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/sysctl.h>
+#include <linux/sched/nohz.h>
 #include <linux/slab.h>
 #include <linux/compat.h>
 
-- 
cgit v1.2.3


From b17b01533b719e9949e437abf66436a875739b40 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 18:51:35 +0100
Subject: sched/headers: Prepare for new header dependencies before moving code
 to <linux/sched/debug.h>

We are going to split <linux/sched/debug.h> out of <linux/sched.h>, which
will have to be picked up from other headers and a couple of .c files.

Create a trivial placeholder <linux/sched/debug.h> file that just
maps to <linux/sched.h> to make this patch obviously correct and
bisectable.

Include the new header in the files that are going to need it.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/alpha/kernel/process.c               | 1 +
 arch/alpha/kernel/traps.c                 | 1 +
 arch/arc/kernel/ctx_sw.c                  | 1 +
 arch/arc/kernel/stacktrace.c              | 2 ++
 arch/arc/kernel/troubleshoot.c            | 1 +
 arch/arm/kernel/process.c                 | 1 +
 arch/arm/kernel/stacktrace.c              | 1 +
 arch/arm/kernel/traps.c                   | 1 +
 arch/arm/mm/alignment.c                   | 1 +
 arch/arm/mm/fault.c                       | 1 +
 arch/arm/probes/kprobes/core.c            | 1 +
 arch/arm64/kernel/probes/kprobes.c        | 1 +
 arch/arm64/kernel/process.c               | 1 +
 arch/arm64/kernel/stacktrace.c            | 1 +
 arch/arm64/kernel/traps.c                 | 1 +
 arch/arm64/mm/fault.c                     | 1 +
 arch/avr32/kernel/nmi_debug.c             | 1 +
 arch/avr32/kernel/process.c               | 1 +
 arch/blackfin/kernel/dumpstack.c          | 2 ++
 arch/blackfin/kernel/early_printk.c       | 1 +
 arch/blackfin/kernel/nmi.c                | 1 +
 arch/blackfin/kernel/process.c            | 1 +
 arch/blackfin/kernel/trace.c              | 1 +
 arch/blackfin/kernel/traps.c              | 1 +
 arch/blackfin/mach-common/ints-priority.c | 1 +
 arch/blackfin/mm/isram-driver.c           | 1 +
 arch/c6x/kernel/traps.c                   | 1 +
 arch/cris/arch-v10/kernel/process.c       | 1 +
 arch/cris/arch-v10/kernel/traps.c         | 2 ++
 arch/cris/arch-v32/kernel/process.c       | 1 +
 arch/cris/arch-v32/kernel/traps.c         | 2 ++
 arch/cris/kernel/irq.c                    | 1 +
 arch/cris/kernel/stacktrace.c             | 2 +-
 arch/cris/kernel/traps.c                  | 1 +
 arch/frv/kernel/process.c                 | 1 +
 arch/frv/kernel/traps.c                   | 1 +
 arch/h8300/kernel/process.c               | 1 +
 arch/h8300/kernel/traps.c                 | 1 +
 arch/hexagon/kernel/process.c             | 1 +
 arch/hexagon/kernel/traps.c               | 1 +
 arch/hexagon/kernel/vm_events.c           | 1 +
 arch/ia64/hp/sim/simserial.c              | 1 +
 arch/ia64/kernel/mca.c                    | 1 +
 arch/ia64/kernel/process.c                | 1 +
 arch/ia64/kernel/traps.c                  | 1 +
 arch/m32r/kernel/process.c                | 1 +
 arch/m32r/kernel/traps.c                  | 1 +
 arch/m68k/kernel/process.c                | 1 +
 arch/m68k/kernel/traps.c                  | 1 +
 arch/m68k/mac/macints.c                   | 1 +
 arch/metag/kernel/process.c               | 1 +
 arch/metag/kernel/stacktrace.c            | 1 +
 arch/metag/kernel/traps.c                 | 1 +
 arch/metag/mm/fault.c                     | 1 +
 arch/microblaze/kernel/exceptions.c       | 1 +
 arch/microblaze/kernel/process.c          | 1 +
 arch/microblaze/kernel/traps.c            | 1 +
 arch/mips/kernel/process.c                | 1 +
 arch/mips/kernel/stacktrace.c             | 1 +
 arch/mips/kernel/traps.c                  | 1 +
 arch/mips/sgi-ip22/ip28-berr.c            | 1 +
 arch/mips/sgi-ip27/ip27-berr.c            | 1 +
 arch/mips/sgi-ip32/ip32-berr.c            | 1 +
 arch/mips/sgi-ip32/ip32-irq.c             | 1 +
 arch/mn10300/kernel/process.c             | 1 +
 arch/mn10300/kernel/traps.c               | 1 +
 arch/nios2/kernel/process.c               | 1 +
 arch/nios2/kernel/traps.c                 | 1 +
 arch/nios2/mm/fault.c                     | 1 +
 arch/openrisc/kernel/process.c            | 1 +
 arch/openrisc/kernel/traps.c              | 1 +
 arch/parisc/kernel/pa7300lc.c             | 1 +
 arch/parisc/kernel/process.c              | 1 +
 arch/parisc/kernel/signal.c               | 1 +
 arch/parisc/kernel/traps.c                | 1 +
 arch/parisc/kernel/unaligned.c            | 1 +
 arch/parisc/mm/fault.c                    | 1 +
 arch/powerpc/kernel/process.c             | 1 +
 arch/powerpc/kernel/stacktrace.c          | 1 +
 arch/powerpc/kernel/traps.c               | 1 +
 arch/s390/kernel/dumpstack.c              | 1 +
 arch/s390/kernel/process.c                | 1 +
 arch/s390/kernel/stacktrace.c             | 1 +
 arch/s390/kernel/traps.c                  | 1 +
 arch/s390/mm/fault.c                      | 1 +
 arch/score/kernel/traps.c                 | 1 +
 arch/sh/kernel/dumpstack.c                | 1 +
 arch/sh/kernel/nmi_debug.c                | 1 +
 arch/sh/kernel/process_32.c               | 1 +
 arch/sh/kernel/process_64.c               | 1 +
 arch/sh/kernel/stacktrace.c               | 1 +
 arch/sh/kernel/traps.c                    | 1 +
 arch/sh/kernel/traps_64.c                 | 1 +
 arch/sparc/kernel/process_32.c            | 1 +
 arch/sparc/kernel/process_64.c            | 1 +
 arch/sparc/kernel/stacktrace.c            | 1 +
 arch/sparc/kernel/sun4m_irq.c             | 1 +
 arch/sparc/kernel/sys_sparc_32.c          | 1 +
 arch/sparc/kernel/sys_sparc_64.c          | 1 +
 arch/sparc/kernel/traps_32.c              | 1 +
 arch/sparc/kernel/traps_64.c              | 1 +
 arch/sparc/mm/fault_64.c                  | 1 +
 arch/tile/include/asm/stack.h             | 2 ++
 arch/tile/kernel/process.c                | 1 +
 arch/tile/kernel/signal.c                 | 1 +
 arch/tile/kernel/stack.c                  | 1 +
 arch/tile/kernel/traps.c                  | 1 +
 arch/tile/kernel/unaligned.c              | 1 +
 arch/tile/mm/fault.c                      | 1 +
 arch/um/drivers/mconsole_kern.c           | 1 +
 arch/um/kernel/process.c                  | 1 +
 arch/um/kernel/sysrq.c                    | 2 ++
 arch/um/kernel/trap.c                     | 1 +
 arch/unicore32/kernel/process.c           | 1 +
 arch/unicore32/kernel/stacktrace.c        | 1 +
 arch/unicore32/kernel/traps.c             | 1 +
 arch/unicore32/mm/alignment.c             | 1 +
 arch/x86/kernel/amd_gart_64.c             | 1 +
 arch/x86/kernel/doublefault.c             | 1 +
 arch/x86/kernel/dumpstack.c               | 1 +
 arch/x86/kernel/dumpstack_32.c            | 1 +
 arch/x86/kernel/dumpstack_64.c            | 1 +
 arch/x86/kernel/kprobes/core.c            | 1 +
 arch/x86/kernel/nmi.c                     | 1 +
 arch/x86/kernel/process.c                 | 1 +
 arch/x86/kernel/stacktrace.c              | 1 +
 arch/x86/mm/extable.c                     | 2 ++
 arch/x86/platform/uv/uv_nmi.c             | 1 +
 arch/x86/um/sysrq_32.c                    | 1 +
 arch/x86/um/sysrq_64.c                    | 1 +
 arch/xtensa/kernel/process.c              | 1 +
 arch/xtensa/kernel/traps.c                | 1 +
 drivers/base/power/main.c                 | 1 +
 drivers/tty/sysrq.c                       | 2 ++
 drivers/tty/tty_ldsem.c                   | 1 +
 drivers/tty/vt/keyboard.c                 | 2 ++
 fs/proc/base.c                            | 1 +
 include/linux/sched/debug.h               | 6 ++++++
 kernel/debug/kdb/kdb_bt.c                 | 1 +
 kernel/debug/kdb/kdb_main.c               | 1 +
 kernel/hung_task.c                        | 1 +
 kernel/latencytop.c                       | 1 +
 kernel/locking/mutex.c                    | 1 +
 kernel/locking/rtmutex-debug.c            | 1 +
 kernel/locking/rtmutex.c                  | 1 +
 kernel/locking/rwsem-spinlock.c           | 1 +
 kernel/locking/rwsem-xadd.c               | 1 +
 kernel/locking/rwsem.c                    | 1 +
 kernel/locking/semaphore.c                | 1 +
 kernel/panic.c                            | 1 +
 kernel/power/process.c                    | 1 +
 kernel/printk/printk.c                    | 1 +
 kernel/rcu/tree.c                         | 1 +
 kernel/rcu/tree_plugin.h                  | 1 +
 kernel/rcu/update.c                       | 1 +
 kernel/sched/completion.c                 | 1 +
 kernel/sched/sched.h                      | 2 +-
 kernel/sched/wait.c                       | 1 +
 kernel/signal.c                           | 1 +
 kernel/time/alarmtimer.c                  | 1 +
 kernel/time/hrtimer.c                     | 1 +
 kernel/time/timer.c                       | 1 +
 kernel/watchdog.c                         | 1 +
 kernel/watchdog_hld.c                     | 2 ++
 lib/dump_stack.c                          | 1 +
 lib/nmi_backtrace.c                       | 1 +
 166 files changed, 181 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/sched/debug.h

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index bca963a4aa48..88f7a974d311 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -11,6 +11,7 @@
 #include <linux/errno.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c
index 6448ab00043d..b137390e87e7 100644
--- a/arch/alpha/kernel/traps.c
+++ b/arch/alpha/kernel/traps.c
@@ -11,6 +11,7 @@
 #include <linux/jiffies.h>
 #include <linux/mm.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
 #include <linux/tty.h>
 #include <linux/delay.h>
 #include <linux/extable.h>
diff --git a/arch/arc/kernel/ctx_sw.c b/arch/arc/kernel/ctx_sw.c
index 6f4cb0dab1b9..9e1ae9d41925 100644
--- a/arch/arc/kernel/ctx_sw.c
+++ b/arch/arc/kernel/ctx_sw.c
@@ -16,6 +16,7 @@
 
 #include <asm/asm-offsets.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #ifdef CONFIG_ARC_PLAT_EZNPS
 #include <plat/ctop.h>
 #endif
diff --git a/arch/arc/kernel/stacktrace.c b/arch/arc/kernel/stacktrace.c
index b9192a653b7e..74315f302971 100644
--- a/arch/arc/kernel/stacktrace.c
+++ b/arch/arc/kernel/stacktrace.c
@@ -28,6 +28,8 @@
 #include <linux/export.h>
 #include <linux/stacktrace.h>
 #include <linux/kallsyms.h>
+#include <linux/sched/debug.h>
+
 #include <asm/arcregs.h>
 #include <asm/unwind.h>
 #include <asm/switch_to.h>
diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c
index 0c48907f56a9..f9caf79186d4 100644
--- a/arch/arc/kernel/troubleshoot.c
+++ b/arch/arc/kernel/troubleshoot.c
@@ -14,6 +14,7 @@
 #include <linux/proc_fs.h>
 #include <linux/file.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/debug.h>
 
 #include <asm/arcregs.h>
 #include <asm/irqflags.h>
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 91d2d5b01414..03d46395d1ff 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -12,6 +12,7 @@
 
 #include <linux/export.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/stddef.h>
diff --git a/arch/arm/kernel/stacktrace.c b/arch/arm/kernel/stacktrace.c
index 92b72375c4c7..3a2fa203637a 100644
--- a/arch/arm/kernel/stacktrace.c
+++ b/arch/arm/kernel/stacktrace.c
@@ -1,5 +1,6 @@
 #include <linux/export.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/stacktrace.h>
 
 #include <asm/stacktrace.h>
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index dc5f1a22b3c9..a9dad001b97d 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -25,6 +25,7 @@
 #include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
 #include <linux/irq.h>
 
 #include <linux/atomic.h>
diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c
index d7fe2de9cc9e..2c96190e018b 100644
--- a/arch/arm/mm/alignment.c
+++ b/arch/arm/mm/alignment.c
@@ -14,6 +14,7 @@
 #include <linux/moduleparam.h>
 #include <linux/compiler.h>
 #include <linux/kernel.h>
+#include <linux/sched/debug.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/proc_fs.h>
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 520c7778d330..ff8b0aa2dfde 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -17,6 +17,7 @@
 #include <linux/uaccess.h>
 #include <linux/page-flags.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
 #include <linux/highmem.h>
 #include <linux/perf_event.h>
 
diff --git a/arch/arm/probes/kprobes/core.c b/arch/arm/probes/kprobes/core.c
index a4ec240ee7ba..b6dc9d838a9a 100644
--- a/arch/arm/probes/kprobes/core.c
+++ b/arch/arm/probes/kprobes/core.c
@@ -24,6 +24,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/stop_machine.h>
+#include <linux/sched/debug.h>
 #include <linux/stringify.h>
 #include <asm/traps.h>
 #include <asm/opcodes.h>
diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c
index f0593c92279b..2a07aae5b8a2 100644
--- a/arch/arm64/kernel/probes/kprobes.c
+++ b/arch/arm64/kernel/probes/kprobes.c
@@ -22,6 +22,7 @@
 #include <linux/extable.h>
 #include <linux/slab.h>
 #include <linux/stop_machine.h>
+#include <linux/sched/debug.h>
 #include <linux/stringify.h>
 #include <asm/traps.h>
 #include <asm/ptrace.h>
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 1ad48f93abdd..40a16cdd9873 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -24,6 +24,7 @@
 #include <linux/efi.h>
 #include <linux/export.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/stddef.h>
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 8a552a33c6ef..7597e42feeea 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -19,6 +19,7 @@
 #include <linux/export.h>
 #include <linux/ftrace.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/stacktrace.h>
 
 #include <asm/irq.h>
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 5b8779a849a2..5a5d83791837 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -30,6 +30,7 @@
 #include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
 #include <linux/syscalls.h>
 
 #include <asm/atomic.h>
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 30dd60ab8a65..4bf899fb451b 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -27,6 +27,7 @@
 #include <linux/uaccess.h>
 #include <linux/page-flags.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
 #include <linux/highmem.h>
 #include <linux/perf_event.h>
 #include <linux/preempt.h>
diff --git a/arch/avr32/kernel/nmi_debug.c b/arch/avr32/kernel/nmi_debug.c
index 3414b8566c29..25823049bb99 100644
--- a/arch/avr32/kernel/nmi_debug.c
+++ b/arch/avr32/kernel/nmi_debug.c
@@ -9,6 +9,7 @@
 #include <linux/kdebug.h>
 #include <linux/notifier.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 
 #include <asm/irq.h>
 
diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c
index 68e5b9dac059..dac022d88d9d 100644
--- a/arch/avr32/kernel/process.c
+++ b/arch/avr32/kernel/process.c
@@ -6,6 +6,7 @@
  * published by the Free Software Foundation.
  */
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/fs.h>
diff --git a/arch/blackfin/kernel/dumpstack.c b/arch/blackfin/kernel/dumpstack.c
index 95ba6d9e9a3d..3c992c1f8ef2 100644
--- a/arch/blackfin/kernel/dumpstack.c
+++ b/arch/blackfin/kernel/dumpstack.c
@@ -10,6 +10,8 @@
 #include <linux/mm.h>
 #include <linux/uaccess.h>
 #include <linux/module.h>
+#include <linux/sched/debug.h>
+
 #include <asm/trace.h>
 
 /*
diff --git a/arch/blackfin/kernel/early_printk.c b/arch/blackfin/kernel/early_printk.c
index 61fbd2de993d..4b89af9243d3 100644
--- a/arch/blackfin/kernel/early_printk.c
+++ b/arch/blackfin/kernel/early_printk.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/sched/debug.h>
 #include <linux/init.h>
 #include <linux/serial_core.h>
 #include <linux/console.h>
diff --git a/arch/blackfin/kernel/nmi.c b/arch/blackfin/kernel/nmi.c
index 9919d29287dc..633c37083e87 100644
--- a/arch/blackfin/kernel/nmi.c
+++ b/arch/blackfin/kernel/nmi.c
@@ -17,6 +17,7 @@
 #include <linux/nmi.h>
 #include <linux/smp.h>
 #include <linux/timer.h>
+#include <linux/sched/debug.h>
 #include <asm/blackfin.h>
 #include <linux/atomic.h>
 #include <asm/cacheflush.h>
diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c
index 4aa5545c4fde..e95d094c20a6 100644
--- a/arch/blackfin/kernel/process.c
+++ b/arch/blackfin/kernel/process.c
@@ -12,6 +12,7 @@
 #include <linux/uaccess.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/tick.h>
 #include <linux/fs.h>
 #include <linux/err.h>
diff --git a/arch/blackfin/kernel/trace.c b/arch/blackfin/kernel/trace.c
index 01e546ad2f7a..23c05b8effb3 100644
--- a/arch/blackfin/kernel/trace.c
+++ b/arch/blackfin/kernel/trace.c
@@ -12,6 +12,7 @@
 #include <linux/mm.h>
 #include <linux/oom.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
 #include <linux/uaccess.h>
 #include <linux/module.h>
 #include <linux/kallsyms.h>
diff --git a/arch/blackfin/kernel/traps.c b/arch/blackfin/kernel/traps.c
index 32676d7721b1..a323a40a46e9 100644
--- a/arch/blackfin/kernel/traps.c
+++ b/arch/blackfin/kernel/traps.c
@@ -10,6 +10,7 @@
 #include <linux/uaccess.h>
 #include <linux/module.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
 #include <asm/traps.h>
 #include <asm/cplb.h>
 #include <asm/blackfin.h>
diff --git a/arch/blackfin/mach-common/ints-priority.c b/arch/blackfin/mach-common/ints-priority.c
index 4986b4fbcee9..13e94bf9d8ba 100644
--- a/arch/blackfin/mach-common/ints-priority.c
+++ b/arch/blackfin/mach-common/ints-priority.c
@@ -16,6 +16,7 @@
 #include <linux/seq_file.h>
 #include <linux/irq.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/syscore_ops.h>
 #include <linux/gpio.h>
 #include <asm/delay.h>
diff --git a/arch/blackfin/mm/isram-driver.c b/arch/blackfin/mm/isram-driver.c
index 7e2e674ed444..aaa1e64b753b 100644
--- a/arch/blackfin/mm/isram-driver.c
+++ b/arch/blackfin/mm/isram-driver.c
@@ -14,6 +14,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 
 #include <asm/blackfin.h>
 #include <asm/dma.h>
diff --git a/arch/c6x/kernel/traps.c b/arch/c6x/kernel/traps.c
index dcc2c2f6d67c..09b8a40d5680 100644
--- a/arch/c6x/kernel/traps.c
+++ b/arch/c6x/kernel/traps.c
@@ -10,6 +10,7 @@
  */
 #include <linux/module.h>
 #include <linux/ptrace.h>
+#include <linux/sched/debug.h>
 #include <linux/kallsyms.h>
 #include <linux/bug.h>
 
diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c
index 96e5afef6b47..068a40374200 100644
--- a/arch/cris/arch-v10/kernel/process.c
+++ b/arch/cris/arch-v10/kernel/process.c
@@ -11,6 +11,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/slab.h>
 #include <linux/err.h>
 #include <linux/fs.h>
diff --git a/arch/cris/arch-v10/kernel/traps.c b/arch/cris/arch-v10/kernel/traps.c
index 96d004fe9740..c0a501f29bd8 100644
--- a/arch/cris/arch-v10/kernel/traps.c
+++ b/arch/cris/arch-v10/kernel/traps.c
@@ -10,6 +10,8 @@
 
 #include <linux/ptrace.h>
 #include <linux/uaccess.h>
+#include <linux/sched/debug.h>
+
 #include <arch/sv_addr_ag.h>
 #include <arch/system.h>
 
diff --git a/arch/cris/arch-v32/kernel/process.c b/arch/cris/arch-v32/kernel/process.c
index 4d1afa9f9fd3..613c2d71bf10 100644
--- a/arch/cris/arch-v32/kernel/process.c
+++ b/arch/cris/arch-v32/kernel/process.c
@@ -9,6 +9,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/slab.h>
 #include <linux/err.h>
 #include <linux/fs.h>
diff --git a/arch/cris/arch-v32/kernel/traps.c b/arch/cris/arch-v32/kernel/traps.c
index ad6174e217c9..a34256515036 100644
--- a/arch/cris/arch-v32/kernel/traps.c
+++ b/arch/cris/arch-v32/kernel/traps.c
@@ -5,6 +5,8 @@
 #include <linux/ptrace.h>
 #include <linux/extable.h>
 #include <linux/uaccess.h>
+#include <linux/sched/debug.h>
+
 #include <hwregs/supp_reg.h>
 #include <hwregs/intr_vect_defs.h>
 #include <asm/irq.h>
diff --git a/arch/cris/kernel/irq.c b/arch/cris/kernel/irq.c
index 694850e8f077..09b864f46f8a 100644
--- a/arch/cris/kernel/irq.c
+++ b/arch/cris/kernel/irq.c
@@ -22,6 +22,7 @@
 #include <linux/module.h>
 #include <linux/ptrace.h>
 #include <linux/irq.h>
+#include <linux/sched/debug.h>
 
 #include <linux/kernel_stat.h>
 #include <linux/signal.h>
diff --git a/arch/cris/kernel/stacktrace.c b/arch/cris/kernel/stacktrace.c
index 99838c74456d..f1cc3aaacd8d 100644
--- a/arch/cris/kernel/stacktrace.c
+++ b/arch/cris/kernel/stacktrace.c
@@ -1,5 +1,5 @@
 #include <linux/sched.h>
-#include <linux/stacktrace.h>
+#include <linux/sched/debug.h>
 #include <linux/stacktrace.h>
 #include <asm/stacktrace.h>
 
diff --git a/arch/cris/kernel/traps.c b/arch/cris/kernel/traps.c
index b2a312a7afc6..a01636a12a6e 100644
--- a/arch/cris/kernel/traps.c
+++ b/arch/cris/kernel/traps.c
@@ -15,6 +15,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/utsname.h>
+#include <linux/sched/debug.h>
 #ifdef CONFIG_KALLSYMS
 #include <linux/kallsyms.h>
 #endif
diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c
index b306241c4ef2..cf4c34c09ab3 100644
--- a/arch/frv/kernel/process.c
+++ b/arch/frv/kernel/process.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
diff --git a/arch/frv/kernel/traps.c b/arch/frv/kernel/traps.c
index 43c134d6081c..ce29991e4219 100644
--- a/arch/frv/kernel/traps.c
+++ b/arch/frv/kernel/traps.c
@@ -10,6 +10,7 @@
  */
 
 #include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
 #include <linux/signal.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c
index 891974a11704..d7cabf373fe3 100644
--- a/arch/h8300/kernel/process.c
+++ b/arch/h8300/kernel/process.c
@@ -25,6 +25,7 @@
 #include <linux/errno.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
diff --git a/arch/h8300/kernel/traps.c b/arch/h8300/kernel/traps.c
index 044a36125846..ee7e3ce40d78 100644
--- a/arch/h8300/kernel/traps.c
+++ b/arch/h8300/kernel/traps.c
@@ -16,6 +16,7 @@
 
 #include <linux/types.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/init.h>
diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c
index d9edfd3fc52a..3bdd9592d025 100644
--- a/arch/hexagon/kernel/process.c
+++ b/arch/hexagon/kernel/process.c
@@ -19,6 +19,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/types.h>
 #include <linux/module.h>
 #include <linux/tick.h>
diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c
index 4496bcf605ef..b55f13c70b34 100644
--- a/arch/hexagon/kernel/traps.c
+++ b/arch/hexagon/kernel/traps.c
@@ -20,6 +20,7 @@
 
 #include <linux/init.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/kdebug.h>
diff --git a/arch/hexagon/kernel/vm_events.c b/arch/hexagon/kernel/vm_events.c
index 741aaa917cda..04f57ef22009 100644
--- a/arch/hexagon/kernel/vm_events.c
+++ b/arch/hexagon/kernel/vm_events.c
@@ -19,6 +19,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/sched/debug.h>
 #include <asm/registers.h>
 #include <linux/irq.h>
 #include <linux/hardirq.h>
diff --git a/arch/ia64/hp/sim/simserial.c b/arch/ia64/hp/sim/simserial.c
index 21fd50def270..de8cba121013 100644
--- a/arch/ia64/hp/sim/simserial.c
+++ b/arch/ia64/hp/sim/simserial.c
@@ -14,6 +14,7 @@
 #include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/tty.h>
 #include <linux/tty_flip.h>
 #include <linux/major.h>
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index 5ac51069e453..f9fe3ac64242 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -73,6 +73,7 @@
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
 #include <linux/bootmem.h>
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index 52deab683ba1..804b251ee5d1 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -20,6 +20,7 @@
 #include <linux/notifier.h>
 #include <linux/personality.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/stddef.h>
 #include <linux/thread_info.h>
 #include <linux/unistd.h>
diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c
index 48ba46b025e1..7b1fe9462158 100644
--- a/arch/ia64/kernel/traps.c
+++ b/arch/ia64/kernel/traps.c
@@ -10,6 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
 #include <linux/tty.h>
 #include <linux/vt_kern.h>		/* For unblank_screen() */
 #include <linux/export.h>
diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c
index e0568bee60c0..1a227c1fdb1a 100644
--- a/arch/m32r/kernel/process.c
+++ b/arch/m32r/kernel/process.c
@@ -22,6 +22,7 @@
 
 #include <linux/fs.h>
 #include <linux/slab.h>
+#include <linux/sched/debug.h>
 #include <linux/module.h>
 #include <linux/ptrace.h>
 #include <linux/unistd.h>
diff --git a/arch/m32r/kernel/traps.c b/arch/m32r/kernel/traps.c
index c3c5fdfae920..7c05bb393597 100644
--- a/arch/m32r/kernel/traps.c
+++ b/arch/m32r/kernel/traps.c
@@ -14,6 +14,7 @@
 #include <linux/kallsyms.h>
 #include <linux/stddef.h>
 #include <linux/ptrace.h>
+#include <linux/sched/debug.h>
 #include <linux/mm.h>
 #include <asm/page.h>
 #include <asm/processor.h>
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index f0a8e9b332cd..b4a4675f4119 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -13,6 +13,7 @@
 #include <linux/errno.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c
index 558f38402737..a926d2c88898 100644
--- a/arch/m68k/kernel/traps.c
+++ b/arch/m68k/kernel/traps.c
@@ -19,6 +19,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/signal.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
diff --git a/arch/m68k/mac/macints.c b/arch/m68k/mac/macints.c
index b5cd06df71fd..9637dee90dac 100644
--- a/arch/m68k/mac/macints.c
+++ b/arch/m68k/mac/macints.c
@@ -110,6 +110,7 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
 #include <linux/delay.h>
diff --git a/arch/metag/kernel/process.c b/arch/metag/kernel/process.c
index 35062796edf2..2e611bd37515 100644
--- a/arch/metag/kernel/process.c
+++ b/arch/metag/kernel/process.c
@@ -8,6 +8,7 @@
 #include <linux/errno.h>
 #include <linux/export.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/unistd.h>
diff --git a/arch/metag/kernel/stacktrace.c b/arch/metag/kernel/stacktrace.c
index 5510361d5bea..4f806d66a58c 100644
--- a/arch/metag/kernel/stacktrace.c
+++ b/arch/metag/kernel/stacktrace.c
@@ -1,5 +1,6 @@
 #include <linux/export.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/stacktrace.h>
 
 #include <asm/stacktrace.h>
diff --git a/arch/metag/kernel/traps.c b/arch/metag/kernel/traps.c
index 17b2e2e38d5a..5ce67f9124aa 100644
--- a/arch/metag/kernel/traps.c
+++ b/arch/metag/kernel/traps.c
@@ -10,6 +10,7 @@
 
 #include <linux/export.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/signal.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c
index c765b3621b9b..5055477486b6 100644
--- a/arch/metag/mm/fault.c
+++ b/arch/metag/mm/fault.c
@@ -8,6 +8,7 @@
 #include <linux/mm.h>
 #include <linux/kernel.h>
 #include <linux/ptrace.h>
+#include <linux/sched/debug.h>
 #include <linux/interrupt.h>
 #include <linux/uaccess.h>
 
diff --git a/arch/microblaze/kernel/exceptions.c b/arch/microblaze/kernel/exceptions.c
index 42dd12a62ff5..e6f338d0496b 100644
--- a/arch/microblaze/kernel/exceptions.c
+++ b/arch/microblaze/kernel/exceptions.c
@@ -17,6 +17,7 @@
 #include <linux/kernel.h>
 #include <linux/signal.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kallsyms.h>
 
 #include <asm/exceptions.h>
diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c
index b2dd37196b3b..8c5fdb2e2850 100644
--- a/arch/microblaze/kernel/process.c
+++ b/arch/microblaze/kernel/process.c
@@ -11,6 +11,7 @@
 #include <linux/cpu.h>
 #include <linux/export.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/pm.h>
 #include <linux/tick.h>
 #include <linux/bitops.h>
diff --git a/arch/microblaze/kernel/traps.c b/arch/microblaze/kernel/traps.c
index cb619533a192..45bbba9d919f 100644
--- a/arch/microblaze/kernel/traps.c
+++ b/arch/microblaze/kernel/traps.c
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/kallsyms.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/debug_locks.h>
 
 #include <asm/exceptions.h>
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index 803e255b6fc3..97574cdb532d 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -11,6 +11,7 @@
  */
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/tick.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
diff --git a/arch/mips/kernel/stacktrace.c b/arch/mips/kernel/stacktrace.c
index 506021f62549..986f910961d9 100644
--- a/arch/mips/kernel/stacktrace.c
+++ b/arch/mips/kernel/stacktrace.c
@@ -4,6 +4,7 @@
  *  Copyright (C) 2006 Atsushi Nemoto <anemo@mba.ocn.ne.jp>
  */
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/stacktrace.h>
 #include <linux/export.h>
 #include <asm/stacktrace.h>
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index 49c6df20672a..1a9366a157cb 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -24,6 +24,7 @@
 #include <linux/extable.h>
 #include <linux/mm.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/smp.h>
 #include <linux/spinlock.h>
 #include <linux/kallsyms.h>
diff --git a/arch/mips/sgi-ip22/ip28-berr.c b/arch/mips/sgi-ip22/ip28-berr.c
index 9960a8302eac..1f2a5bc4779e 100644
--- a/arch/mips/sgi-ip22/ip28-berr.c
+++ b/arch/mips/sgi-ip22/ip28-berr.c
@@ -8,6 +8,7 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/seq_file.h>
 
 #include <asm/addrspace.h>
diff --git a/arch/mips/sgi-ip27/ip27-berr.c b/arch/mips/sgi-ip27/ip27-berr.c
index f8919b6a24c8..d12879eb2b1f 100644
--- a/arch/mips/sgi-ip27/ip27-berr.c
+++ b/arch/mips/sgi-ip27/ip27-berr.c
@@ -11,6 +11,7 @@
 #include <linux/kernel.h>
 #include <linux/signal.h>	/* for SIGBUS */
 #include <linux/sched.h>	/* schow_regs(), force_sig() */
+#include <linux/sched/debug.h>
 
 #include <asm/sn/addrs.h>
 #include <asm/sn/arch.h>
diff --git a/arch/mips/sgi-ip32/ip32-berr.c b/arch/mips/sgi-ip32/ip32-berr.c
index ba8f46d80ab8..57d8c7486fe6 100644
--- a/arch/mips/sgi-ip32/ip32-berr.c
+++ b/arch/mips/sgi-ip32/ip32-berr.c
@@ -10,6 +10,7 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <asm/traps.h>
 #include <linux/uaccess.h>
 #include <asm/addrspace.h>
diff --git a/arch/mips/sgi-ip32/ip32-irq.c b/arch/mips/sgi-ip32/ip32-irq.c
index 838d8589a1c0..a6a0ff7f5aed 100644
--- a/arch/mips/sgi-ip32/ip32-irq.c
+++ b/arch/mips/sgi-ip32/ip32-irq.c
@@ -18,6 +18,7 @@
 #include <linux/mm.h>
 #include <linux/random.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 
 #include <asm/irq_cpu.h>
 #include <asm/mipsregs.h>
diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c
index e5def2217f72..a1e2f8301d94 100644
--- a/arch/mn10300/kernel/process.c
+++ b/arch/mn10300/kernel/process.c
@@ -11,6 +11,7 @@
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
diff --git a/arch/mn10300/kernel/traps.c b/arch/mn10300/kernel/traps.c
index a7a987c7954f..800fd0801969 100644
--- a/arch/mn10300/kernel/traps.c
+++ b/arch/mn10300/kernel/traps.c
@@ -10,6 +10,7 @@
  * 2 of the Licence, or (at your option) any later version.
  */
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/errno.h>
diff --git a/arch/nios2/kernel/process.c b/arch/nios2/kernel/process.c
index 2f8c74f93e70..3ad87a6b119b 100644
--- a/arch/nios2/kernel/process.c
+++ b/arch/nios2/kernel/process.c
@@ -14,6 +14,7 @@
 
 #include <linux/export.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/tick.h>
 #include <linux/uaccess.h>
 
diff --git a/arch/nios2/kernel/traps.c b/arch/nios2/kernel/traps.c
index 72ed30a93c85..8184e7d6b385 100644
--- a/arch/nios2/kernel/traps.c
+++ b/arch/nios2/kernel/traps.c
@@ -11,6 +11,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
 #include <linux/export.h>
diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c
index e7a14e1e0d6b..b804dd06ea1c 100644
--- a/arch/nios2/mm/fault.c
+++ b/arch/nios2/mm/fault.c
@@ -13,6 +13,7 @@
 
 #include <linux/signal.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c
index 6e9d1cb519f2..899339dac938 100644
--- a/arch/openrisc/kernel/process.c
+++ b/arch/openrisc/kernel/process.c
@@ -22,6 +22,7 @@
 
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/mm.h>
diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c
index 7e81ad258bca..2354ab88d948 100644
--- a/arch/openrisc/kernel/traps.c
+++ b/arch/openrisc/kernel/traps.c
@@ -22,6 +22,7 @@
 
 #include <linux/init.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kernel.h>
 #include <linux/extable.h>
 #include <linux/kmod.h>
diff --git a/arch/parisc/kernel/pa7300lc.c b/arch/parisc/kernel/pa7300lc.c
index 8a89780223aa..9b245fc67560 100644
--- a/arch/parisc/kernel/pa7300lc.c
+++ b/arch/parisc/kernel/pa7300lc.c
@@ -5,6 +5,7 @@
  *   Copyright (C) 2000 Philipp Rumpf */
 
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/smp.h>
 #include <linux/kernel.h>
 #include <asm/io.h>
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index ea6603ee8d24..8c283caf2b4d 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -43,6 +43,7 @@
 #include <linux/personality.h>
 #include <linux/ptrace.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/slab.h>
 #include <linux/stddef.h>
 #include <linux/unistd.h>
diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c
index e58925ac64d1..9e03296641d7 100644
--- a/arch/parisc/kernel/signal.c
+++ b/arch/parisc/kernel/signal.c
@@ -13,6 +13,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/kernel.h>
diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c
index 378df9207406..991654c88eec 100644
--- a/arch/parisc/kernel/traps.c
+++ b/arch/parisc/kernel/traps.c
@@ -11,6 +11,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/errno.h>
diff --git a/arch/parisc/kernel/unaligned.c b/arch/parisc/kernel/unaligned.c
index a08ab481e556..e36f7b75ab07 100644
--- a/arch/parisc/kernel/unaligned.c
+++ b/arch/parisc/kernel/unaligned.c
@@ -24,6 +24,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
 #include <linux/signal.h>
 #include <linux/ratelimit.h>
 #include <linux/uaccess.h>
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c
index 1a0b4f63f0e9..c3cac4ddfe9c 100644
--- a/arch/parisc/mm/fault.c
+++ b/arch/parisc/mm/fault.c
@@ -13,6 +13,7 @@
 #include <linux/mm.h>
 #include <linux/ptrace.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/interrupt.h>
 #include <linux/extable.h>
 #include <linux/uaccess.h>
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 4379a079b3c2..76f58b87dcb2 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -16,6 +16,7 @@
 
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c
index 4f24606afc3f..66711958493c 100644
--- a/arch/powerpc/kernel/stacktrace.c
+++ b/arch/powerpc/kernel/stacktrace.c
@@ -12,6 +12,7 @@
 
 #include <linux/export.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/stacktrace.h>
 #include <asm/ptrace.h>
 #include <asm/processor.h>
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index e6cc56b61d01..ff365f9de27a 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -17,6 +17,7 @@
 
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/stddef.h>
diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c
index 55d4fe174fd9..72c584d62f59 100644
--- a/arch/s390/kernel/dumpstack.c
+++ b/arch/s390/kernel/dumpstack.c
@@ -14,6 +14,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <asm/processor.h>
 #include <asm/debug.h>
 #include <asm/dis.h>
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 54281660582c..ed99ff911b67 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -11,6 +11,7 @@
 #include <linux/compiler.h>
 #include <linux/cpu.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/elfcore.h>
diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c
index 0085b2d8ed7d..e66687dc6144 100644
--- a/arch/s390/kernel/stacktrace.c
+++ b/arch/s390/kernel/stacktrace.c
@@ -6,6 +6,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/stacktrace.h>
 #include <linux/kallsyms.h>
 #include <linux/export.h>
diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c
index 283ad7840335..f787b9d8f54c 100644
--- a/arch/s390/kernel/traps.c
+++ b/arch/s390/kernel/traps.c
@@ -17,6 +17,7 @@
 #include <linux/extable.h>
 #include <linux/ptrace.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index bb5560eb2435..5845d3028ffc 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -12,6 +12,7 @@
 #include <linux/perf_event.h>
 #include <linux/signal.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/string.h>
diff --git a/arch/score/kernel/traps.c b/arch/score/kernel/traps.c
index fa624f30f783..8a54d320fb41 100644
--- a/arch/score/kernel/traps.c
+++ b/arch/score/kernel/traps.c
@@ -25,6 +25,7 @@
 
 #include <linux/extable.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
 
 #include <asm/cacheflush.h>
 #include <asm/irq.h>
diff --git a/arch/sh/kernel/dumpstack.c b/arch/sh/kernel/dumpstack.c
index 8dfe645bcc4b..d00cab2f50f9 100644
--- a/arch/sh/kernel/dumpstack.c
+++ b/arch/sh/kernel/dumpstack.c
@@ -11,6 +11,7 @@
 #include <linux/kallsyms.h>
 #include <linux/ftrace.h>
 #include <linux/debug_locks.h>
+#include <linux/sched/debug.h>
 #include <linux/kdebug.h>
 #include <linux/export.h>
 #include <linux/uaccess.h>
diff --git a/arch/sh/kernel/nmi_debug.c b/arch/sh/kernel/nmi_debug.c
index ff0abbd1e652..730d928f0d12 100644
--- a/arch/sh/kernel/nmi_debug.c
+++ b/arch/sh/kernel/nmi_debug.c
@@ -9,6 +9,7 @@
 #include <linux/kdebug.h>
 #include <linux/notifier.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/hardirq.h>
 
 enum nmi_action {
diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c
index 51741850a715..5098274e0f23 100644
--- a/arch/sh/kernel/process_32.c
+++ b/arch/sh/kernel/process_32.c
@@ -15,6 +15,7 @@
  */
 #include <linux/module.h>
 #include <linux/mm.h>
+#include <linux/sched/debug.h>
 #include <linux/slab.h>
 #include <linux/elfcore.h>
 #include <linux/kallsyms.h>
diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c
index e0b271bffd6a..ced21a417d9d 100644
--- a/arch/sh/kernel/process_64.c
+++ b/arch/sh/kernel/process_64.c
@@ -25,6 +25,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/io.h>
+#include <linux/sched/debug.h>
 #include <asm/syscalls.h>
 #include <linux/uaccess.h>
 #include <asm/pgtable.h>
diff --git a/arch/sh/kernel/stacktrace.c b/arch/sh/kernel/stacktrace.c
index bf989e063a0c..7a73d2763e1b 100644
--- a/arch/sh/kernel/stacktrace.c
+++ b/arch/sh/kernel/stacktrace.c
@@ -10,6 +10,7 @@
  * for more details.
  */
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/stacktrace.h>
 #include <linux/thread_info.h>
 #include <linux/module.h>
diff --git a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c
index 3036dee854d1..bc5c9f347b9e 100644
--- a/arch/sh/kernel/traps.c
+++ b/arch/sh/kernel/traps.c
@@ -4,6 +4,7 @@
 #include <linux/kdebug.h>
 #include <linux/signal.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/uaccess.h>
 #include <linux/hardirq.h>
 #include <linux/kernel.h>
diff --git a/arch/sh/kernel/traps_64.c b/arch/sh/kernel/traps_64.c
index 00835edb6e20..014fb08cf133 100644
--- a/arch/sh/kernel/traps_64.c
+++ b/arch/sh/kernel/traps_64.c
@@ -10,6 +10,7 @@
  * for more details.
  */
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/errno.h>
diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c
index 48ffc3e7d1dd..237a471c8ed5 100644
--- a/arch/sparc/kernel/process_32.c
+++ b/arch/sparc/kernel/process_32.c
@@ -14,6 +14,7 @@
 #include <linux/errno.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/stddef.h>
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index d249ca10b203..98f6ff6eaa55 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -14,6 +14,7 @@
 #include <linux/errno.h>
 #include <linux/export.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
diff --git a/arch/sparc/kernel/stacktrace.c b/arch/sparc/kernel/stacktrace.c
index e78386a0029f..be4c14cccc05 100644
--- a/arch/sparc/kernel/stacktrace.c
+++ b/arch/sparc/kernel/stacktrace.c
@@ -1,4 +1,5 @@
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/stacktrace.h>
 #include <linux/thread_info.h>
 #include <linux/ftrace.h>
diff --git a/arch/sparc/kernel/sun4m_irq.c b/arch/sparc/kernel/sun4m_irq.c
index da737c712fa8..aa84da0b2d30 100644
--- a/arch/sparc/kernel/sun4m_irq.c
+++ b/arch/sparc/kernel/sun4m_irq.c
@@ -10,6 +10,7 @@
  */
 
 #include <linux/slab.h>
+#include <linux/sched/debug.h>
 
 #include <asm/timer.h>
 #include <asm/traps.h>
diff --git a/arch/sparc/kernel/sys_sparc_32.c b/arch/sparc/kernel/sys_sparc_32.c
index ff3573059936..7aecb239626d 100644
--- a/arch/sparc/kernel/sys_sparc_32.c
+++ b/arch/sparc/kernel/sys_sparc_32.c
@@ -9,6 +9,7 @@
 #include <linux/types.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/debug.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
 #include <linux/file.h>
diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c
index c521ee277083..ef4520efc813 100644
--- a/arch/sparc/kernel/sys_sparc_64.c
+++ b/arch/sparc/kernel/sys_sparc_64.c
@@ -9,6 +9,7 @@
 #include <linux/types.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/debug.h>
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/mm.h>
diff --git a/arch/sparc/kernel/traps_32.c b/arch/sparc/kernel/traps_32.c
index ecddac5a4c96..26740a2f285d 100644
--- a/arch/sparc/kernel/traps_32.c
+++ b/arch/sparc/kernel/traps_32.c
@@ -10,6 +10,7 @@
  */
 
 #include <linux/sched.h>  /* for jiffies */
+#include <linux/sched/debug.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
 #include <linux/smp.h>
diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
index e022d7b00390..4ff4c35f76b2 100644
--- a/arch/sparc/kernel/traps_64.c
+++ b/arch/sparc/kernel/traps_64.c
@@ -10,6 +10,7 @@
 
 #include <linux/extable.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/linkage.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index 643c149a3151..b84c4dd14954 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -10,6 +10,7 @@
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/ptrace.h>
 #include <linux/mman.h>
 #include <linux/signal.h>
diff --git a/arch/tile/include/asm/stack.h b/arch/tile/include/asm/stack.h
index c3cb42615a9f..3573325e340b 100644
--- a/arch/tile/include/asm/stack.h
+++ b/arch/tile/include/asm/stack.h
@@ -17,6 +17,8 @@
 
 #include <linux/types.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
+
 #include <asm/backtrace.h>
 #include <asm/page.h>
 #include <hv/hypervisor.h>
diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index c84c54a1ac55..557b4c8435d0 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -13,6 +13,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/preempt.h>
 #include <linux/module.h>
 #include <linux/fs.h>
diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c
index 87299a6cfec8..8cc92ae6eb27 100644
--- a/arch/tile/kernel/signal.c
+++ b/arch/tile/kernel/signal.c
@@ -14,6 +14,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/kernel.h>
diff --git a/arch/tile/kernel/stack.c b/arch/tile/kernel/stack.c
index 22bbbd3ff4a3..a1c0787fe9d8 100644
--- a/arch/tile/kernel/stack.c
+++ b/arch/tile/kernel/stack.c
@@ -13,6 +13,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kernel.h>
 #include <linux/kprobes.h>
 #include <linux/module.h>
diff --git a/arch/tile/kernel/traps.c b/arch/tile/kernel/traps.c
index 39f427bb0de2..54804866f238 100644
--- a/arch/tile/kernel/traps.c
+++ b/arch/tile/kernel/traps.c
@@ -13,6 +13,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kernel.h>
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
diff --git a/arch/tile/kernel/unaligned.c b/arch/tile/kernel/unaligned.c
index f229e979584e..142acae78316 100644
--- a/arch/tile/kernel/unaligned.c
+++ b/arch/tile/kernel/unaligned.c
@@ -17,6 +17,7 @@
 #include <linux/smp.h>
 #include <linux/ptrace.h>
 #include <linux/slab.h>
+#include <linux/sched/debug.h>
 #include <linux/thread_info.h>
 #include <linux/uaccess.h>
 #include <linux/mman.h>
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index 709f8e9ba3e9..005c91c2adca 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -16,6 +16,7 @@
 
 #include <linux/signal.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/string.h>
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 8a4c72af3bc0..af326fb6510d 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/notifier.h>
 #include <linux/reboot.h>
+#include <linux/sched/debug.h>
 #include <linux/proc_fs.h>
 #include <linux/slab.h>
 #include <linux/syscalls.h>
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index 078630d6448c..c6c45ea4021f 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -17,6 +17,7 @@
 #include <linux/random.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/seq_file.h>
 #include <linux/tick.h>
 #include <linux/threads.h>
diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c
index aa1b56f5ac68..34ae555c3e70 100644
--- a/arch/um/kernel/sysrq.c
+++ b/arch/um/kernel/sysrq.c
@@ -11,6 +11,8 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
+
 #include <asm/sysrq.h>
 #include <asm/stacktrace.h>
 #include <os.h>
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index 9711ae4aaa6a..59158871b9fc 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -8,6 +8,7 @@
 #include <linux/hardirq.h>
 #include <linux/module.h>
 #include <linux/uaccess.h>
+#include <linux/sched/debug.h>
 #include <asm/current.h>
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c
index d7c6b676b3a5..c1a0eec88f1f 100644
--- a/arch/unicore32/kernel/process.c
+++ b/arch/unicore32/kernel/process.c
@@ -13,6 +13,7 @@
 
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/stddef.h>
diff --git a/arch/unicore32/kernel/stacktrace.c b/arch/unicore32/kernel/stacktrace.c
index b34030bdabe3..9976e767d51c 100644
--- a/arch/unicore32/kernel/stacktrace.c
+++ b/arch/unicore32/kernel/stacktrace.c
@@ -11,6 +11,7 @@
  */
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/stacktrace.h>
 
 #include <asm/stacktrace.h>
diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c
index 7f5e06f9a202..506e1a2127c6 100644
--- a/arch/unicore32/kernel/traps.c
+++ b/arch/unicore32/kernel/traps.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/signal.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
 #include <linux/spinlock.h>
 #include <linux/personality.h>
 #include <linux/kallsyms.h>
diff --git a/arch/unicore32/mm/alignment.c b/arch/unicore32/mm/alignment.c
index 24e836023e6c..3a7f6faa8794 100644
--- a/arch/unicore32/mm/alignment.c
+++ b/arch/unicore32/mm/alignment.c
@@ -15,6 +15,7 @@
  */
 #include <linux/compiler.h>
 #include <linux/kernel.h>
+#include <linux/sched/debug.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/init.h>
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 82dfe32faaf4..df083efe6ee0 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -17,6 +17,7 @@
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/string.h>
 #include <linux/spinlock.h>
 #include <linux/pci.h>
diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c
index b2f7207ba86c..f9c324e08d85 100644
--- a/arch/x86/kernel/doublefault.c
+++ b/arch/x86/kernel/doublefault.c
@@ -1,5 +1,6 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/init_task.h>
 #include <linux/fs.h>
 
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 0cfd01d2754c..7b9e7e68f316 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -10,6 +10,7 @@
 #include <linux/kdebug.h>
 #include <linux/module.h>
 #include <linux/ptrace.h>
+#include <linux/sched/debug.h>
 #include <linux/ftrace.h>
 #include <linux/kexec.h>
 #include <linux/bug.h>
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index bb3b5b9a6899..b0b3a3df7c20 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -2,6 +2,7 @@
  *  Copyright (C) 1991, 1992  Linus Torvalds
  *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
  */
+#include <linux/sched/debug.h>
 #include <linux/kallsyms.h>
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index fac189efcc34..a8b117e93b46 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -2,6 +2,7 @@
  *  Copyright (C) 1991, 1992  Linus Torvalds
  *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
  */
+#include <linux/sched/debug.h>
 #include <linux/kallsyms.h>
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 520b8dfe1640..6384eb754a58 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -45,6 +45,7 @@
 #include <linux/slab.h>
 #include <linux/hardirq.h>
 #include <linux/preempt.h>
+#include <linux/sched/debug.h>
 #include <linux/extable.h>
 #include <linux/kdebug.h>
 #include <linux/kallsyms.h>
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index d17b1a940add..f088ea4c66e7 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -13,6 +13,7 @@
 #include <linux/spinlock.h>
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
+#include <linux/sched/debug.h>
 #include <linux/nmi.h>
 #include <linux/debugfs.h>
 #include <linux/delay.h>
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 6395e0cd7dd6..505be5589e52 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -8,6 +8,7 @@
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/sched/idle.h>
+#include <linux/sched/debug.h>
 #include <linux/init.h>
 #include <linux/export.h>
 #include <linux/pm.h>
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 0653788026e2..330cae0025d0 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -4,6 +4,7 @@
  *  Copyright (C) 2006-2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  */
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/stacktrace.h>
 #include <linux/export.h>
 #include <linux/uaccess.h>
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 61a7e9ea9aa1..35ea061010a1 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -1,5 +1,7 @@
 #include <linux/extable.h>
 #include <linux/uaccess.h>
+#include <linux/sched/debug.h>
+
 #include <asm/traps.h>
 #include <asm/kdebug.h>
 
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index 9743d0ccfec6..c34bd8233f7c 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -27,6 +27,7 @@
 #include <linux/moduleparam.h>
 #include <linux/nmi.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/slab.h>
 #include <linux/clocksource.h>
 
diff --git a/arch/x86/um/sysrq_32.c b/arch/x86/um/sysrq_32.c
index 16ee0e450e3e..f2383484840d 100644
--- a/arch/x86/um/sysrq_32.c
+++ b/arch/x86/um/sysrq_32.c
@@ -6,6 +6,7 @@
 #include <linux/kernel.h>
 #include <linux/smp.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kallsyms.h>
 #include <asm/ptrace.h>
 #include <asm/sysrq.h>
diff --git a/arch/x86/um/sysrq_64.c b/arch/x86/um/sysrq_64.c
index 38b4e4abd0f8..903ad91b624f 100644
--- a/arch/x86/um/sysrq_64.c
+++ b/arch/x86/um/sysrq_64.c
@@ -7,6 +7,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/utsname.h>
 #include <asm/current.h>
 #include <asm/ptrace.h>
diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c
index 826d25104846..afaa3588d869 100644
--- a/arch/xtensa/kernel/process.c
+++ b/arch/xtensa/kernel/process.c
@@ -17,6 +17,7 @@
 
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c
index 84abd66e680d..d8bb2e62393e 100644
--- a/arch/xtensa/kernel/traps.c
+++ b/arch/xtensa/kernel/traps.c
@@ -25,6 +25,7 @@
 
 #include <linux/kernel.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/stringify.h>
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 249e0304597f..9faee1c893e5 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -27,6 +27,7 @@
 #include <linux/pm_wakeirq.h>
 #include <linux/interrupt.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/async.h>
 #include <linux/suspend.h>
 #include <trace/events/power.h>
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index 65db0aeb3d80..e5f0c7d0fe8d 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -16,6 +16,8 @@
 
 #include <linux/sched/signal.h>
 #include <linux/sched/rt.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/debug.h>
 #include <linux/interrupt.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
diff --git a/drivers/tty/tty_ldsem.c b/drivers/tty/tty_ldsem.c
index 9229de43e19d..4f6b1b10b537 100644
--- a/drivers/tty/tty_ldsem.c
+++ b/drivers/tty/tty_ldsem.c
@@ -32,6 +32,7 @@
 #include <linux/atomic.h>
 #include <linux/tty.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c
index 6b3a2c00974d..c5f0fc906136 100644
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -27,6 +27,8 @@
 #include <linux/consolemap.h>
 #include <linux/module.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/debug.h>
 #include <linux/tty.h>
 #include <linux/tty_flip.h>
 #include <linux/mm.h>
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 4c5fa704e38c..5914fed3712d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -88,6 +88,7 @@
 #include <linux/sched/autogroup.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/coredump.h>
+#include <linux/sched/debug.h>
 #include <linux/flex_array.h>
 #include <linux/posix-timers.h>
 #ifdef CONFIG_HARDWALL
diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h
new file mode 100644
index 000000000000..55bc0cd35fd5
--- /dev/null
+++ b/include/linux/sched/debug.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SCHED_DEBUG_H
+#define _LINUX_SCHED_DEBUG_H
+
+#include <linux/sched.h>
+
+#endif /* _LINUX_SCHED_DEBUG_H */
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 9b976a42376d..6ad4a9fcbd6f 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -13,6 +13,7 @@
 #include <linux/string.h>
 #include <linux/kernel.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
 #include <linux/kdb.h>
 #include <linux/nmi.h>
 #include "kdb_private.h"
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index bf48a492b4be..c8146d53ca67 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -20,6 +20,7 @@
 #include <linux/sched.h>
 #include <linux/sched/loadavg.h>
 #include <linux/sched/stat.h>
+#include <linux/sched/debug.h>
 #include <linux/sysrq.h>
 #include <linux/smp.h>
 #include <linux/utsname.h>
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 129247e56902..f0f8e2a8496f 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -17,6 +17,7 @@
 #include <linux/sysctl.h>
 #include <linux/utsname.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
 
 #include <trace/events/sched.h>
 
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index b5c30d9f46c5..545839b838d6 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -55,6 +55,7 @@
 #include <linux/latencytop.h>
 #include <linux/export.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/list.h>
 #include <linux/stacktrace.h>
 
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 82ff5726bc1b..198527a62149 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -22,6 +22,7 @@
 #include <linux/sched/signal.h>
 #include <linux/sched/rt.h>
 #include <linux/sched/wake_q.h>
+#include <linux/sched/debug.h>
 #include <linux/export.h>
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 62b6cee8ea7f..97ee9df32e0f 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -18,6 +18,7 @@
  */
 #include <linux/sched.h>
 #include <linux/sched/rt.h>
+#include <linux/sched/debug.h>
 #include <linux/delay.h>
 #include <linux/export.h>
 #include <linux/spinlock.h>
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 1c8c8707853a..6edc32ecd9c5 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -16,6 +16,7 @@
 #include <linux/sched/rt.h>
 #include <linux/sched/deadline.h>
 #include <linux/sched/wake_q.h>
+#include <linux/sched/debug.h>
 #include <linux/timer.h>
 
 #include "rtmutex_common.h"
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 91e7bc8e9d5a..7bc24d477805 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -7,6 +7,7 @@
  */
 #include <linux/rwsem.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
 #include <linux/export.h>
 
 enum rwsem_waiter_type {
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 9bd6e768164e..34e727f18e49 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -15,6 +15,7 @@
 #include <linux/sched/signal.h>
 #include <linux/sched/rt.h>
 #include <linux/sched/wake_q.h>
+#include <linux/sched/debug.h>
 #include <linux/osq_lock.h>
 
 #include "rwsem.h"
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 45ba475d4be3..90a74ccd85a4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -7,6 +7,7 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/export.h>
 #include <linux/rwsem.h>
 #include <linux/atomic.h>
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 9512e37637dc..561acdd39960 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -29,6 +29,7 @@
 #include <linux/kernel.h>
 #include <linux/export.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/semaphore.h>
 #include <linux/spinlock.h>
 #include <linux/ftrace.h>
diff --git a/kernel/panic.c b/kernel/panic.c
index 3ec16e603e88..a58932b41700 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -9,6 +9,7 @@
  * to indicate a major problem.
  */
 #include <linux/debug_locks.h>
+#include <linux/sched/debug.h>
 #include <linux/interrupt.h>
 #include <linux/kmsg_dump.h>
 #include <linux/kallsyms.h>
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 2fba066e125f..85e2915d8961 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -12,6 +12,7 @@
 #include <linux/oom.h>
 #include <linux/suspend.h>
 #include <linux/module.h>
+#include <linux/sched/debug.h>
 #include <linux/syscalls.h>
 #include <linux/freezer.h>
 #include <linux/delay.h>
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 47050eedc206..3caaaa04b92e 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -46,6 +46,7 @@
 #include <linux/ctype.h>
 #include <linux/uio.h>
 #include <linux/sched/clock.h>
+#include <linux/sched/debug.h>
 
 #include <linux/uaccess.h>
 #include <asm/sections.h>
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e456327a63d6..50fee7689e71 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -35,6 +35,7 @@
 #include <linux/rcupdate_wait.h>
 #include <linux/interrupt.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/nmi.h>
 #include <linux/atomic.h>
 #include <linux/bitops.h>
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 9dabb04003be..0a62a8f1caac 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -27,6 +27,7 @@
 #include <linux/delay.h>
 #include <linux/gfp.h>
 #include <linux/oom.h>
+#include <linux/sched/debug.h>
 #include <linux/smpboot.h>
 #include <uapi/linux/sched/types.h>
 #include "../time/tick-internal.h"
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index da128deb10ec..55c8530316c7 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -37,6 +37,7 @@
 #include <linux/smp.h>
 #include <linux/interrupt.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
 #include <linux/atomic.h>
 #include <linux/bitops.h>
 #include <linux/percpu.h>
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index b294a8ee2842..53f9558fa925 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -12,6 +12,7 @@
  */
 
 #include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
 #include <linux/completion.h>
 
 /**
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5979f47c422c..e1e819f731b2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -11,6 +11,7 @@
 #include <linux/sched/cpufreq.h>
 #include <linux/sched/stat.h>
 #include <linux/sched/nohz.h>
+#include <linux/sched/debug.h>
 #include <linux/u64_stats_sync.h>
 #include <linux/sched/deadline.h>
 #include <linux/kernel_stat.h>
@@ -1830,7 +1831,6 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
 extern void print_dl_stats(struct seq_file *m, int cpu);
 extern void
 print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
-
 #ifdef CONFIG_NUMA_BALANCING
 extern void
 show_numa_stats(struct task_struct *p, struct seq_file *m);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 1fedfcf6fc9b..4d2ea6f25568 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -6,6 +6,7 @@
 #include <linux/init.h>
 #include <linux/export.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
 #include <linux/mm.h>
 #include <linux/wait.h>
 #include <linux/hash.h>
diff --git a/kernel/signal.c b/kernel/signal.c
index 762fcf64f0c3..15092fd8c7b1 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -15,6 +15,7 @@
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/sched/user.h>
+#include <linux/sched/debug.h>
 #include <linux/fs.h>
 #include <linux/tty.h>
 #include <linux/binfmts.h>
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 04939053c823..ce3a31e8eb36 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -20,6 +20,7 @@
 #include <linux/timerqueue.h>
 #include <linux/rtc.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
 #include <linux/alarmtimer.h>
 #include <linux/mutex.h>
 #include <linux/platform_device.h>
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index aed2207f5b2d..ec08f527d7ee 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -48,6 +48,7 @@
 #include <linux/sched/rt.h>
 #include <linux/sched/deadline.h>
 #include <linux/sched/nohz.h>
+#include <linux/sched/debug.h>
 #include <linux/timer.h>
 #include <linux/freezer.h>
 
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index d7999cb06229..1dc0256bfb6e 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -41,6 +41,7 @@
 #include <linux/sched/signal.h>
 #include <linux/sched/sysctl.h>
 #include <linux/sched/nohz.h>
+#include <linux/sched/debug.h>
 #include <linux/slab.h>
 #include <linux/compat.h>
 
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 52718f4512e9..03e0b69bb5bf 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -23,6 +23,7 @@
 #include <linux/tick.h>
 #include <linux/workqueue.h>
 #include <linux/sched/clock.h>
+#include <linux/sched/debug.h>
 
 #include <asm/irq_regs.h>
 #include <linux/kvm_para.h>
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index b5de262a9eb9..54a427d1f344 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -13,6 +13,8 @@
 
 #include <linux/nmi.h>
 #include <linux/module.h>
+#include <linux/sched/debug.h>
+
 #include <asm/irq_regs.h>
 #include <linux/perf_event.h>
 
diff --git a/lib/dump_stack.c b/lib/dump_stack.c
index c30d07e99dba..625375e7f11f 100644
--- a/lib/dump_stack.c
+++ b/lib/dump_stack.c
@@ -6,6 +6,7 @@
 #include <linux/kernel.h>
 #include <linux/export.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/smp.h>
 #include <linux/atomic.h>
 
diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c
index 5f7999eacad5..4e8a30d1c22f 100644
--- a/lib/nmi_backtrace.c
+++ b/lib/nmi_backtrace.c
@@ -17,6 +17,7 @@
 #include <linux/kprobes.h>
 #include <linux/nmi.h>
 #include <linux/cpu.h>
+#include <linux/sched/debug.h>
 
 #ifdef arch_trigger_cpumask_backtrace
 /* For reliability, we're prepared to waste bits here. */
-- 
cgit v1.2.3


From ef8bd77f332bb0a4e467d7171bbfc6c57aa08a88 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 18:51:36 +0100
Subject: sched/headers: Prepare for new header dependencies before moving code
 to <linux/sched/hotplug.h>

We are going to split <linux/sched/hotplug.h> out of <linux/sched.h>, which
will have to be picked up from other headers and a couple of .c files.

Create a trivial placeholder <linux/sched/hotplug.h> file that just
maps to <linux/sched.h> to make this patch obviously correct and
bisectable.

Include the new header in the files that are going to need it.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arm/kernel/smp.c                        | 1 +
 arch/arm64/include/asm/mmu_context.h         | 1 +
 arch/arm64/kernel/smp.c                      | 1 +
 arch/ia64/kernel/process.c                   | 1 +
 arch/metag/kernel/smp.c                      | 1 +
 arch/mips/cavium-octeon/smp.c                | 1 +
 arch/mips/kernel/smp-bmips.c                 | 1 +
 arch/mips/kernel/smp-cps.c                   | 1 +
 arch/mips/loongson64/loongson-3/smp.c        | 1 +
 arch/powerpc/platforms/85xx/smp.c            | 1 +
 arch/powerpc/platforms/powermac/smp.c        | 1 +
 arch/powerpc/platforms/powernv/smp.c         | 1 +
 arch/powerpc/platforms/pseries/hotplug-cpu.c | 1 +
 arch/s390/kernel/smp.c                       | 1 +
 arch/sh/kernel/smp.c                         | 1 +
 arch/sparc/kernel/smp_64.c                   | 1 +
 arch/x86/kernel/smpboot.c                    | 1 +
 arch/xtensa/kernel/smp.c                     | 1 +
 include/linux/sched/hotplug.h                | 6 ++++++
 kernel/cpu.c                                 | 1 +
 kernel/sched/core.c                          | 1 +
 kernel/sched/sched.h                         | 4 +++-
 22 files changed, 29 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/sched/hotplug.h

(limited to 'include/linux')

diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 5a07c5a4b894..2083a5370308 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -12,6 +12,7 @@
 #include <linux/init.h>
 #include <linux/spinlock.h>
 #include <linux/sched.h>
+#include <linux/sched/hotplug.h>
 #include <linux/interrupt.h>
 #include <linux/cache.h>
 #include <linux/profile.h>
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index 1ef40d82cfd3..3c9f7d18a7e6 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -25,6 +25,7 @@
 
 #include <linux/compiler.h>
 #include <linux/sched.h>
+#include <linux/sched/hotplug.h>
 
 #include <asm/cacheflush.h>
 #include <asm/cpufeature.h>
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 827d52d78b67..f66e58523b96 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -22,6 +22,7 @@
 #include <linux/init.h>
 #include <linux/spinlock.h>
 #include <linux/sched.h>
+#include <linux/sched/hotplug.h>
 #include <linux/interrupt.h>
 #include <linux/cache.h>
 #include <linux/profile.h>
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index 804b251ee5d1..2204ae450d65 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -21,6 +21,7 @@
 #include <linux/personality.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/hotplug.h>
 #include <linux/stddef.h>
 #include <linux/thread_info.h>
 #include <linux/unistd.h>
diff --git a/arch/metag/kernel/smp.c b/arch/metag/kernel/smp.c
index c622293254e4..198eda75846b 100644
--- a/arch/metag/kernel/smp.c
+++ b/arch/metag/kernel/smp.c
@@ -13,6 +13,7 @@
 #include <linux/init.h>
 #include <linux/spinlock.h>
 #include <linux/sched.h>
+#include <linux/sched/hotplug.h>
 #include <linux/interrupt.h>
 #include <linux/cache.h>
 #include <linux/profile.h>
diff --git a/arch/mips/cavium-octeon/smp.c b/arch/mips/cavium-octeon/smp.c
index 4355a4cf4d74..4b94b7fbafa3 100644
--- a/arch/mips/cavium-octeon/smp.c
+++ b/arch/mips/cavium-octeon/smp.c
@@ -11,6 +11,7 @@
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
 #include <linux/sched.h>
+#include <linux/sched/hotplug.h>
 #include <linux/init.h>
 #include <linux/export.h>
 
diff --git a/arch/mips/kernel/smp-bmips.c b/arch/mips/kernel/smp-bmips.c
index 16e37a28f876..3daa2cae50b0 100644
--- a/arch/mips/kernel/smp-bmips.c
+++ b/arch/mips/kernel/smp-bmips.c
@@ -10,6 +10,7 @@
 
 #include <linux/init.h>
 #include <linux/sched.h>
+#include <linux/sched/hotplug.h>
 #include <linux/mm.h>
 #include <linux/delay.h>
 #include <linux/smp.h>
diff --git a/arch/mips/kernel/smp-cps.c b/arch/mips/kernel/smp-cps.c
index a2544c2394e4..1d3188c23bb8 100644
--- a/arch/mips/kernel/smp-cps.c
+++ b/arch/mips/kernel/smp-cps.c
@@ -12,6 +12,7 @@
 #include <linux/io.h>
 #include <linux/irqchip/mips-gic.h>
 #include <linux/sched.h>
+#include <linux/sched/hotplug.h>
 #include <linux/slab.h>
 #include <linux/smp.h>
 #include <linux/types.h>
diff --git a/arch/mips/loongson64/loongson-3/smp.c b/arch/mips/loongson64/loongson-3/smp.c
index cfcf240cedbe..66ede5b11355 100644
--- a/arch/mips/loongson64/loongson-3/smp.c
+++ b/arch/mips/loongson64/loongson-3/smp.c
@@ -17,6 +17,7 @@
 #include <linux/init.h>
 #include <linux/cpu.h>
 #include <linux/sched.h>
+#include <linux/sched/hotplug.h>
 #include <linux/smp.h>
 #include <linux/cpufreq.h>
 #include <asm/processor.h>
diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c
index a83a6d26090d..078097a0b09d 100644
--- a/arch/powerpc/platforms/85xx/smp.c
+++ b/arch/powerpc/platforms/85xx/smp.c
@@ -12,6 +12,7 @@
 
 #include <linux/stddef.h>
 #include <linux/kernel.h>
+#include <linux/sched/hotplug.h>
 #include <linux/init.h>
 #include <linux/delay.h>
 #include <linux/of.h>
diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c
index c9eb7d6540ea..746ca7321b03 100644
--- a/arch/powerpc/platforms/powermac/smp.c
+++ b/arch/powerpc/platforms/powermac/smp.c
@@ -23,6 +23,7 @@
  */
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/hotplug.h>
 #include <linux/smp.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index e39e6c428af1..8b67e1eefb5c 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/sched/hotplug.h>
 #include <linux/smp.h>
 #include <linux/interrupt.h>
 #include <linux/delay.h>
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index a1b63e00b2f7..7bc0e91f8715 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -24,6 +24,7 @@
 #include <linux/interrupt.h>
 #include <linux/delay.h>
 #include <linux/sched.h>	/* for idle_task_exit */
+#include <linux/sched/hotplug.h>
 #include <linux/cpu.h>
 #include <linux/of.h>
 #include <linux/slab.h>
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index d0a74d7ce433..b5766e03cdcb 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -31,6 +31,7 @@
 #include <linux/irqflags.h>
 #include <linux/cpu.h>
 #include <linux/slab.h>
+#include <linux/sched/hotplug.h>
 #include <linux/crash_dump.h>
 #include <linux/memblock.h>
 #include <asm/asm-offsets.h>
diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c
index edc4769b047e..4abf119c129c 100644
--- a/arch/sh/kernel/smp.c
+++ b/arch/sh/kernel/smp.c
@@ -21,6 +21,7 @@
 #include <linux/cpu.h>
 #include <linux/interrupt.h>
 #include <linux/sched.h>
+#include <linux/sched/hotplug.h>
 #include <linux/atomic.h>
 #include <linux/clockchips.h>
 #include <asm/processor.h>
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index 8e3e13924594..15052d364e04 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -6,6 +6,7 @@
 #include <linux/export.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/hotplug.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/threads.h>
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index fe7a66e6b5a0..f3eb266d75ff 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -46,6 +46,7 @@
 #include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/sched/topology.h>
+#include <linux/sched/hotplug.h>
 #include <linux/percpu.h>
 #include <linux/bootmem.h>
 #include <linux/err.h>
diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c
index fcea72019df7..f2b3e1725349 100644
--- a/arch/xtensa/kernel/smp.c
+++ b/arch/xtensa/kernel/smp.c
@@ -21,6 +21,7 @@
 #include <linux/irq.h>
 #include <linux/kdebug.h>
 #include <linux/module.h>
+#include <linux/sched/hotplug.h>
 #include <linux/reboot.h>
 #include <linux/seq_file.h>
 #include <linux/smp.h>
diff --git a/include/linux/sched/hotplug.h b/include/linux/sched/hotplug.h
new file mode 100644
index 000000000000..34670ed24894
--- /dev/null
+++ b/include/linux/sched/hotplug.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SCHED_HOTPLUG_H
+#define _LINUX_SCHED_HOTPLUG_H
+
+#include <linux/sched.h>
+
+#endif /* _LINUX_SCHED_HOTPLUG_H */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 0aae3183b029..78c206579d01 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -8,6 +8,7 @@
 #include <linux/init.h>
 #include <linux/notifier.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/hotplug.h>
 #include <linux/unistd.h>
 #include <linux/cpu.h>
 #include <linux/oom.h>
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 11a0684a29a7..956383844116 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9,6 +9,7 @@
 #include <linux/sched/clock.h>
 #include <uapi/linux/sched/types.h>
 #include <linux/sched/loadavg.h>
+#include <linux/sched/hotplug.h>
 #include <linux/cpuset.h>
 #include <linux/delayacct.h>
 #include <linux/init_task.h>
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e1e819f731b2..0974eb2ef50d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3,6 +3,7 @@
 #include <linux/sched/sysctl.h>
 #include <linux/sched/topology.h>
 #include <linux/sched/rt.h>
+#include <linux/sched/deadline.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/wake_q.h>
 #include <linux/sched/signal.h>
@@ -12,8 +13,9 @@
 #include <linux/sched/stat.h>
 #include <linux/sched/nohz.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/hotplug.h>
+
 #include <linux/u64_stats_sync.h>
-#include <linux/sched/deadline.h>
 #include <linux/kernel_stat.h>
 #include <linux/binfmts.h>
 #include <linux/mutex.h>
-- 
cgit v1.2.3


From 299300258d1bc4e997b7db340a2e06636757fe2e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 18:51:36 +0100
Subject: sched/headers: Prepare for new header dependencies before moving code
 to <linux/sched/task.h>

We are going to split <linux/sched/task.h> out of <linux/sched.h>, which
will have to be picked up from other headers and a couple of .c files.

Create a trivial placeholder <linux/sched/task.h> file that just
maps to <linux/sched.h> to make this patch obviously correct and
bisectable.

Include the new header in the files that are going to need it.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/alpha/kernel/process.c              | 1 +
 arch/arc/kernel/process.c                | 2 ++
 arch/arm/kernel/process.c                | 1 +
 arch/arm/mm/init.c                       | 1 +
 arch/arm64/kernel/process.c              | 1 +
 arch/avr32/kernel/process.c              | 1 +
 arch/blackfin/kernel/process.c           | 1 +
 arch/blackfin/kernel/trace.c             | 1 +
 arch/c6x/kernel/process.c                | 1 +
 arch/cris/arch-v10/kernel/process.c      | 1 +
 arch/cris/arch-v32/kernel/process.c      | 1 +
 arch/cris/kernel/process.c               | 1 +
 arch/frv/kernel/process.c                | 1 +
 arch/frv/mm/mmu-context.c                | 1 +
 arch/h8300/kernel/process.c              | 1 +
 arch/hexagon/kernel/process.c            | 1 +
 arch/ia64/kernel/mca.c                   | 1 +
 arch/ia64/kernel/perfmon.c               | 1 +
 arch/ia64/kernel/process.c               | 1 +
 arch/ia64/kernel/ptrace.c                | 1 +
 arch/m32r/kernel/process.c               | 1 +
 arch/m32r/kernel/smpboot.c               | 1 +
 arch/m68k/kernel/process.c               | 1 +
 arch/metag/kernel/process.c              | 1 +
 arch/metag/kernel/traps.c                | 1 +
 arch/microblaze/kernel/process.c         | 1 +
 arch/mips/kernel/mips-mt-fpaff.c         | 1 +
 arch/mips/kernel/process.c               | 1 +
 arch/mn10300/kernel/process.c            | 1 +
 arch/mn10300/kernel/smp.c                | 1 +
 arch/nios2/kernel/process.c              | 1 +
 arch/openrisc/kernel/process.c           | 1 +
 arch/parisc/kernel/process.c             | 1 +
 arch/powerpc/kernel/process.c            | 1 +
 arch/s390/kernel/process.c               | 1 +
 arch/s390/kernel/setup.c                 | 1 +
 arch/score/kernel/process.c              | 1 +
 arch/sh/kernel/cpu/fpu.c                 | 1 +
 arch/sh/kernel/process_32.c              | 1 +
 arch/sh/kernel/process_64.c              | 1 +
 arch/sh/mm/asids-debugfs.c               | 1 +
 arch/sparc/kernel/process_32.c           | 1 +
 arch/sparc/kernel/process_64.c           | 1 +
 arch/tile/kernel/process.c               | 1 +
 arch/tile/kernel/smpboot.c               | 1 +
 arch/tile/kernel/unaligned.c             | 1 +
 arch/tile/mm/fault.c                     | 2 ++
 arch/um/kernel/exec.c                    | 1 +
 arch/um/kernel/process.c                 | 1 +
 arch/um/kernel/reboot.c                  | 1 +
 arch/unicore32/kernel/process.c          | 1 +
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 1 +
 arch/x86/kernel/fpu/init.c               | 1 +
 arch/x86/kernel/process.c                | 1 +
 arch/x86/kernel/process_32.c             | 1 +
 arch/x86/kernel/process_64.c             | 1 +
 arch/x86/kernel/unwind_frame.c           | 1 +
 arch/xtensa/kernel/process.c             | 1 +
 drivers/misc/kgdbts.c                    | 2 ++
 drivers/tty/sysrq.c                      | 2 +-
 drivers/tty/tty_io.c                     | 1 +
 fs/exec.c                                | 1 +
 fs/fcntl.c                               | 1 +
 fs/fs_struct.c                           | 1 +
 fs/proc/array.c                          | 1 +
 fs/proc/kcore.c                          | 1 +
 include/linux/sched/task.h               | 6 ++++++
 init/main.c                              | 1 +
 kernel/cgroup/cgroup.c                   | 1 +
 kernel/cpu.c                             | 1 +
 kernel/exit.c                            | 1 +
 kernel/fork.c                            | 1 +
 kernel/kmod.c                            | 1 +
 kernel/kthread.c                         | 1 +
 kernel/locking/lockdep.c                 | 1 +
 kernel/pid.c                             | 1 +
 kernel/pid_namespace.c                   | 1 +
 kernel/power/process.c                   | 1 +
 kernel/ptrace.c                          | 1 +
 kernel/sched/sched.h                     | 1 +
 kernel/signal.c                          | 1 +
 kernel/smpboot.c                         | 1 +
 kernel/sys.c                             | 1 +
 kernel/trace/ftrace.c                    | 1 +
 kernel/tracepoint.c                      | 1 +
 lib/dma-debug.c                          | 1 +
 mm/kmemleak.c                            | 1 +
 mm/memory-failure.c                      | 1 +
 mm/memory.c                              | 1 +
 mm/oom_kill.c                            | 1 +
 mm/rmap.c                                | 1 +
 mm/swapfile.c                            | 1 +
 mm/usercopy.c                            | 2 ++
 security/keys/keyctl.c                   | 1 +
 security/selinux/hooks.c                 | 1 +
 95 files changed, 104 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/sched/task.h

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index 88f7a974d311..713b4fac998e 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -12,6 +12,7 @@
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c
index a41a79a4f4fe..d618d26721ab 100644
--- a/arch/arc/kernel/process.c
+++ b/arch/arc/kernel/process.c
@@ -11,6 +11,8 @@
 #include <linux/errno.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
+
 #include <linux/mm.h>
 #include <linux/fs.h>
 #include <linux/unistd.h>
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 03d46395d1ff..d4c7c9a1afa9 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -13,6 +13,7 @@
 #include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/stddef.h>
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 2a9040dcf47e..1d8558ff9827 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -14,6 +14,7 @@
 #include <linux/bootmem.h>
 #include <linux/mman.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/task.h>
 #include <linux/export.h>
 #include <linux/nodemask.h>
 #include <linux/initrd.h>
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 40a16cdd9873..f9b8e74ff8f2 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -25,6 +25,7 @@
 #include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/stddef.h>
diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c
index dac022d88d9d..75b944a5107c 100644
--- a/arch/avr32/kernel/process.c
+++ b/arch/avr32/kernel/process.c
@@ -7,6 +7,7 @@
  */
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/fs.h>
diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c
index e95d094c20a6..1a1f444004b1 100644
--- a/arch/blackfin/kernel/process.c
+++ b/arch/blackfin/kernel/process.c
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/tick.h>
 #include <linux/fs.h>
 #include <linux/err.h>
diff --git a/arch/blackfin/kernel/trace.c b/arch/blackfin/kernel/trace.c
index 23c05b8effb3..151f22196ab6 100644
--- a/arch/blackfin/kernel/trace.c
+++ b/arch/blackfin/kernel/trace.c
@@ -13,6 +13,7 @@
 #include <linux/oom.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/uaccess.h>
 #include <linux/module.h>
 #include <linux/kallsyms.h>
diff --git a/arch/c6x/kernel/process.c b/arch/c6x/kernel/process.c
index 0ee7686a78f3..6b61779d426a 100644
--- a/arch/c6x/kernel/process.c
+++ b/arch/c6x/kernel/process.c
@@ -17,6 +17,7 @@
 #include <linux/mqueue.h>
 #include <linux/syscalls.h>
 #include <linux/reboot.h>
+#include <linux/sched/task.h>
 
 #include <asm/syscalls.h>
 
diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c
index 068a40374200..808c2186b704 100644
--- a/arch/cris/arch-v10/kernel/process.c
+++ b/arch/cris/arch-v10/kernel/process.c
@@ -12,6 +12,7 @@
 
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/slab.h>
 #include <linux/err.h>
 #include <linux/fs.h>
diff --git a/arch/cris/arch-v32/kernel/process.c b/arch/cris/arch-v32/kernel/process.c
index 613c2d71bf10..c852df262948 100644
--- a/arch/cris/arch-v32/kernel/process.c
+++ b/arch/cris/arch-v32/kernel/process.c
@@ -10,6 +10,7 @@
 
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/slab.h>
 #include <linux/err.h>
 #include <linux/fs.h>
diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c
index 50a7dd451456..0bbd3a0c3d70 100644
--- a/arch/cris/kernel/process.c
+++ b/arch/cris/kernel/process.c
@@ -20,6 +20,7 @@
 #include <linux/spinlock.h>
 #include <linux/init_task.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/fs.h>
 #include <linux/user.h>
 #include <linux/elfcore.h>
diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c
index cf4c34c09ab3..c96dbd4b8626 100644
--- a/arch/frv/kernel/process.c
+++ b/arch/frv/kernel/process.c
@@ -14,6 +14,7 @@
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
diff --git a/arch/frv/mm/mmu-context.c b/arch/frv/mm/mmu-context.c
index 4031289bf2ec..16946a58f64d 100644
--- a/arch/frv/mm/mmu-context.c
+++ b/arch/frv/mm/mmu-context.c
@@ -11,6 +11,7 @@
 
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/task.h>
 #include <linux/mm.h>
 #include <asm/tlbflush.h>
 
diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c
index d7cabf373fe3..54e4d6f01865 100644
--- a/arch/h8300/kernel/process.c
+++ b/arch/h8300/kernel/process.c
@@ -26,6 +26,7 @@
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c
index 3bdd9592d025..a2a822a875b6 100644
--- a/arch/hexagon/kernel/process.c
+++ b/arch/hexagon/kernel/process.c
@@ -20,6 +20,7 @@
 
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/types.h>
 #include <linux/module.h>
 #include <linux/tick.h>
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index f9fe3ac64242..79c7c46d7dc1 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -74,6 +74,7 @@
 #include <linux/init.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
 #include <linux/bootmem.h>
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 677a86826771..7e943d3c05ed 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -22,6 +22,7 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/interrupt.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index 2204ae450d65..054facf22156 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -22,6 +22,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/hotplug.h>
+#include <linux/sched/task.h>
 #include <linux/stddef.h>
 #include <linux/thread_info.h>
 #include <linux/unistd.h>
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
index 0b1153e610ea..04fe1436e1cc 100644
--- a/arch/ia64/kernel/ptrace.c
+++ b/arch/ia64/kernel/ptrace.c
@@ -11,6 +11,7 @@
  */
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/mm.h>
 #include <linux/errno.h>
 #include <linux/ptrace.h>
diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c
index 1a227c1fdb1a..2a450382934c 100644
--- a/arch/m32r/kernel/process.c
+++ b/arch/m32r/kernel/process.c
@@ -23,6 +23,7 @@
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/module.h>
 #include <linux/ptrace.h>
 #include <linux/unistd.h>
diff --git a/arch/m32r/kernel/smpboot.c b/arch/m32r/kernel/smpboot.c
index f98d2f6519d6..a7d04684d2c7 100644
--- a/arch/m32r/kernel/smpboot.c
+++ b/arch/m32r/kernel/smpboot.c
@@ -45,6 +45,7 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/err.h>
 #include <linux/irq.h>
 #include <linux/bootmem.h>
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index b4a4675f4119..5d335d178706 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
diff --git a/arch/metag/kernel/process.c b/arch/metag/kernel/process.c
index 2e611bd37515..801e6f927e62 100644
--- a/arch/metag/kernel/process.c
+++ b/arch/metag/kernel/process.c
@@ -9,6 +9,7 @@
 #include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/unistd.h>
diff --git a/arch/metag/kernel/traps.c b/arch/metag/kernel/traps.c
index 5ce67f9124aa..23c5ac33a93f 100644
--- a/arch/metag/kernel/traps.c
+++ b/arch/metag/kernel/traps.c
@@ -11,6 +11,7 @@
 #include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/signal.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c
index 8c5fdb2e2850..a642fe5ac5ff 100644
--- a/arch/microblaze/kernel/process.c
+++ b/arch/microblaze/kernel/process.c
@@ -12,6 +12,7 @@
 #include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/pm.h>
 #include <linux/tick.h>
 #include <linux/bitops.h>
diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c
index ffc6bd3464d9..8cab633e0e5a 100644
--- a/arch/mips/kernel/mips-mt-fpaff.c
+++ b/arch/mips/kernel/mips-mt-fpaff.c
@@ -9,6 +9,7 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/cred.h>
 #include <linux/security.h>
 #include <linux/types.h>
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index 97574cdb532d..8bfb833b3158 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -12,6 +12,7 @@
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/tick.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c
index a1e2f8301d94..8ca7a9de09a5 100644
--- a/arch/mn10300/kernel/process.c
+++ b/arch/mn10300/kernel/process.c
@@ -12,6 +12,7 @@
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
diff --git a/arch/mn10300/kernel/smp.c b/arch/mn10300/kernel/smp.c
index e65b5cc2fa67..d924f7a79708 100644
--- a/arch/mn10300/kernel/smp.c
+++ b/arch/mn10300/kernel/smp.c
@@ -22,6 +22,7 @@
 #include <linux/kernel.h>
 #include <linux/delay.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/profile.h>
 #include <linux/smp.h>
 #include <linux/cpu.h>
diff --git a/arch/nios2/kernel/process.c b/arch/nios2/kernel/process.c
index 3ad87a6b119b..5ee1ddf6a924 100644
--- a/arch/nios2/kernel/process.c
+++ b/arch/nios2/kernel/process.c
@@ -15,6 +15,7 @@
 #include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/tick.h>
 #include <linux/uaccess.h>
 
diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c
index 899339dac938..0c5ad3bde0f5 100644
--- a/arch/openrisc/kernel/process.c
+++ b/arch/openrisc/kernel/process.c
@@ -23,6 +23,7 @@
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/mm.h>
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index 8c283caf2b4d..e5f97239cc5e 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -44,6 +44,7 @@
 #include <linux/ptrace.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/slab.h>
 #include <linux/stddef.h>
 #include <linux/unistd.h>
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 76f58b87dcb2..b99b12656f6f 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -17,6 +17,7 @@
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index ed99ff911b67..250c41f05721 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -12,6 +12,7 @@
 #include <linux/cpu.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/elfcore.h>
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index e4d811f17971..3de07004c02e 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -18,6 +18,7 @@
 #include <linux/errno.h>
 #include <linux/export.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/kernel.h>
 #include <linux/memblock.h>
 #include <linux/mm.h>
diff --git a/arch/score/kernel/process.c b/arch/score/kernel/process.c
index aae9480706c2..32924159d8c9 100644
--- a/arch/score/kernel/process.c
+++ b/arch/score/kernel/process.c
@@ -28,6 +28,7 @@
 #include <linux/elfcore.h>
 #include <linux/pm.h>
 #include <linux/rcupdate.h>
+#include <linux/sched/task.h>
 
 void (*pm_power_off)(void);
 EXPORT_SYMBOL(pm_power_off);
diff --git a/arch/sh/kernel/cpu/fpu.c b/arch/sh/kernel/cpu/fpu.c
index 37e35330aae6..b76370a47bf9 100644
--- a/arch/sh/kernel/cpu/fpu.c
+++ b/arch/sh/kernel/cpu/fpu.c
@@ -1,4 +1,5 @@
 #include <linux/sched/signal.h>
+#include <linux/sched/task.h>
 #include <linux/slab.h>
 #include <asm/processor.h>
 #include <asm/fpu.h>
diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c
index 5098274e0f23..493c3eb86aa5 100644
--- a/arch/sh/kernel/process_32.c
+++ b/arch/sh/kernel/process_32.c
@@ -16,6 +16,7 @@
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/slab.h>
 #include <linux/elfcore.h>
 #include <linux/kallsyms.h>
diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c
index ced21a417d9d..056607185158 100644
--- a/arch/sh/kernel/process_64.c
+++ b/arch/sh/kernel/process_64.c
@@ -26,6 +26,7 @@
 #include <linux/module.h>
 #include <linux/io.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <asm/syscalls.h>
 #include <linux/uaccess.h>
 #include <asm/pgtable.h>
diff --git a/arch/sh/mm/asids-debugfs.c b/arch/sh/mm/asids-debugfs.c
index 110bd35165bf..e5539e0f8e3b 100644
--- a/arch/sh/mm/asids-debugfs.c
+++ b/arch/sh/mm/asids-debugfs.c
@@ -21,6 +21,7 @@
 #include <linux/seq_file.h>
 #include <linux/spinlock.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/task.h>
 
 #include <asm/processor.h>
 #include <asm/mmu_context.h>
diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c
index 237a471c8ed5..ed87c45a785b 100644
--- a/arch/sparc/kernel/process_32.c
+++ b/arch/sparc/kernel/process_32.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/stddef.h>
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index 98f6ff6eaa55..4c82a73af893 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -15,6 +15,7 @@
 #include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index 557b4c8435d0..dbb4fa76d1dd 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -14,6 +14,7 @@
 
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/preempt.h>
 #include <linux/module.h>
 #include <linux/fs.h>
diff --git a/arch/tile/kernel/smpboot.c b/arch/tile/kernel/smpboot.c
index 53ce940a5016..f3fdd0c39b12 100644
--- a/arch/tile/kernel/smpboot.c
+++ b/arch/tile/kernel/smpboot.c
@@ -17,6 +17,7 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/kernel_stat.h>
 #include <linux/bootmem.h>
 #include <linux/notifier.h>
diff --git a/arch/tile/kernel/unaligned.c b/arch/tile/kernel/unaligned.c
index 142acae78316..8149c38f67b6 100644
--- a/arch/tile/kernel/unaligned.c
+++ b/arch/tile/kernel/unaligned.c
@@ -18,6 +18,7 @@
 #include <linux/ptrace.h>
 #include <linux/slab.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/thread_info.h>
 #include <linux/uaccess.h>
 #include <linux/mman.h>
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index 005c91c2adca..f58fa06a2214 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -17,6 +17,8 @@
 #include <linux/signal.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/string.h>
diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c
index 770ec07b6a6a..2df4e5d11939 100644
--- a/arch/um/kernel/exec.c
+++ b/arch/um/kernel/exec.c
@@ -8,6 +8,7 @@
 #include <linux/fs.h>
 #include <linux/ptrace.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/slab.h>
 #include <asm/current.h>
 #include <asm/processor.h>
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index c6c45ea4021f..e76dc7c11251 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -18,6 +18,7 @@
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/seq_file.h>
 #include <linux/tick.h>
 #include <linux/threads.h>
diff --git a/arch/um/kernel/reboot.c b/arch/um/kernel/reboot.c
index 79218106a033..0bc921cee0b1 100644
--- a/arch/um/kernel/reboot.c
+++ b/arch/um/kernel/reboot.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/sched/signal.h>
+#include <linux/sched/task.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/oom.h>
diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c
index c1a0eec88f1f..d58f1600b477 100644
--- a/arch/unicore32/kernel/process.c
+++ b/arch/unicore32/kernel/process.c
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/stddef.h>
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index d48af18e7baf..0bbe0f3a039f 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -26,6 +26,7 @@
 #include <linux/kernfs.h>
 #include <linux/seq_file.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/task.h>
 #include <linux/slab.h>
 #include <linux/cpu.h>
 #include <linux/task_work.h>
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 19bdd1bf8160..c2f8dde3255c 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -7,6 +7,7 @@
 #include <asm/cmdline.h>
 
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/init.h>
 
 /*
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 505be5589e52..441f00e7be8f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,6 +9,7 @@
 #include <linux/sched.h>
 #include <linux/sched/idle.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/init.h>
 #include <linux/export.h>
 #include <linux/pm.h>
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index a0ac3e81518a..d79ffa47aab8 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -12,6 +12,7 @@
 #include <linux/cpu.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/fs.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index a61e141b6891..124f5652ae3c 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -17,6 +17,7 @@
 #include <linux/cpu.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/fs.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index 23d15565d02a..7c0abdc9a732 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -1,4 +1,5 @@
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <asm/ptrace.h>
 #include <asm/bitops.h>
 #include <asm/stacktrace.h>
diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c
index afaa3588d869..af33f9d4c624 100644
--- a/arch/xtensa/kernel/process.c
+++ b/arch/xtensa/kernel/process.c
@@ -18,6 +18,7 @@
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
diff --git a/drivers/misc/kgdbts.c b/drivers/misc/kgdbts.c
index 99635dd9dbac..fc7efedbc4be 100644
--- a/drivers/misc/kgdbts.c
+++ b/drivers/misc/kgdbts.c
@@ -103,6 +103,8 @@
 #include <linux/delay.h>
 #include <linux/kthread.h>
 #include <linux/module.h>
+#include <linux/sched/task.h>
+
 #include <asm/sections.h>
 
 #define v1printk(a...) do { \
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index e5f0c7d0fe8d..c6fc7141d7b2 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -17,7 +17,7 @@
 #include <linux/sched/signal.h>
 #include <linux/sched/rt.h>
 #include <linux/sched/debug.h>
-#include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/interrupt.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 985d33f0f315..e6d1a6510886 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -70,6 +70,7 @@
 #include <linux/signal.h>
 #include <linux/fcntl.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/task.h>
 #include <linux/interrupt.h>
 #include <linux/tty.h>
 #include <linux/tty_driver.h>
diff --git a/fs/exec.c b/fs/exec.c
index e857c7260b1f..65145a3df065 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -36,6 +36,7 @@
 #include <linux/sched/coredump.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/numa_balancing.h>
+#include <linux/sched/task.h>
 #include <linux/pagemap.h>
 #include <linux/perf_event.h>
 #include <linux/highmem.h>
diff --git a/fs/fcntl.c b/fs/fcntl.c
index e1c54f20325c..be8fbe289087 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -7,6 +7,7 @@
 #include <linux/syscalls.h>
 #include <linux/init.h>
 #include <linux/mm.h>
+#include <linux/sched/task.h>
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/fdtable.h>
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 543ed50f0387..be0250788b73 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -1,5 +1,6 @@
 #include <linux/export.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/task.h>
 #include <linux/fs.h>
 #include <linux/path.h>
 #include <linux/slab.h>
diff --git a/fs/proc/array.c b/fs/proc/array.c
index cfe7f934a300..f3169b58af38 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -62,6 +62,7 @@
 #include <linux/mman.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/numa_balancing.h>
+#include <linux/sched/task.h>
 #include <linux/proc_fs.h>
 #include <linux/ioport.h>
 #include <linux/uaccess.h>
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index ea9f3d1ae830..4ee55274f155 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -28,6 +28,7 @@
 #include <linux/list.h>
 #include <linux/ioport.h>
 #include <linux/memory.h>
+#include <linux/sched/task.h>
 #include <asm/sections.h>
 #include "internal.h"
 
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
new file mode 100644
index 000000000000..0023c91ff821
--- /dev/null
+++ b/include/linux/sched/task.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SCHED_TASK_H
+#define _LINUX_SCHED_TASK_H
+
+#include <linux/sched.h>
+
+#endif /* _LINUX_SCHED_TASK_H */
diff --git a/init/main.c b/init/main.c
index c891cfb8928f..47956b22add1 100644
--- a/init/main.c
+++ b/init/main.c
@@ -76,6 +76,7 @@
 #include <linux/blkdev.h>
 #include <linux/elevator.h>
 #include <linux/sched_clock.h>
+#include <linux/sched/task.h>
 #include <linux/context_tracking.h>
 #include <linux/random.h>
 #include <linux/list.h>
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index e8f87bf9840c..0125589c7428 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -41,6 +41,7 @@
 #include <linux/proc_fs.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/percpu-rwsem.h>
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 78c206579d01..f7c063239fa5 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -9,6 +9,7 @@
 #include <linux/notifier.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/hotplug.h>
+#include <linux/sched/task.h>
 #include <linux/unistd.h>
 #include <linux/cpu.h>
 #include <linux/oom.h>
diff --git a/kernel/exit.c b/kernel/exit.c
index 5bad7dce1b7b..e53408d156df 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -9,6 +9,7 @@
 #include <linux/sched/autogroup.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/stat.h>
+#include <linux/sched/task.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/capability.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 8632905f64c5..0af64d5d8b73 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -18,6 +18,7 @@
 #include <linux/sched/user.h>
 #include <linux/sched/numa_balancing.h>
 #include <linux/sched/stat.h>
+#include <linux/sched/task.h>
 #include <linux/rtmutex.h>
 #include <linux/init.h>
 #include <linux/unistd.h>
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 0c407f905ca4..ac5f5c2d098d 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -20,6 +20,7 @@
 */
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/syscalls.h>
 #include <linux/unistd.h>
 #include <linux/kmod.h>
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ef9b9eb809c7..2f26adea0f84 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -7,6 +7,7 @@
  */
 #include <uapi/linux/sched/types.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/kthread.h>
 #include <linux/completion.h>
 #include <linux/err.h>
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index cdafaff926ca..12e38c213b70 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -29,6 +29,7 @@
 #include <linux/mutex.h>
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
+#include <linux/sched/task.h>
 #include <linux/delay.h>
 #include <linux/module.h>
 #include <linux/proc_fs.h>
diff --git a/kernel/pid.c b/kernel/pid.c
index 0291804151b5..0143ac0ddceb 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -38,6 +38,7 @@
 #include <linux/syscalls.h>
 #include <linux/proc_ns.h>
 #include <linux/proc_fs.h>
+#include <linux/sched/task.h>
 
 #define pid_hashfn(nr, ns)	\
 	hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 25314c96fbe6..7c8367854dc4 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -19,6 +19,7 @@
 #include <linux/proc_ns.h>
 #include <linux/reboot.h>
 #include <linux/export.h>
+#include <linux/sched/task.h>
 
 struct pid_cache {
 	int nr_ids;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 85e2915d8961..c7209f060eeb 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -13,6 +13,7 @@
 #include <linux/suspend.h>
 #include <linux/module.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/syscalls.h>
 #include <linux/freezer.h>
 #include <linux/delay.h>
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index bcdbdc19eb35..0af928712174 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -12,6 +12,7 @@
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/coredump.h>
+#include <linux/sched/task.h>
 #include <linux/errno.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0974eb2ef50d..6b7155ae5c33 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -14,6 +14,7 @@
 #include <linux/sched/nohz.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/hotplug.h>
+#include <linux/sched/task.h>
 
 #include <linux/u64_stats_sync.h>
 #include <linux/kernel_stat.h>
diff --git a/kernel/signal.c b/kernel/signal.c
index 15092fd8c7b1..25d099fd80f7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -16,6 +16,7 @@
 #include <linux/sched.h>
 #include <linux/sched/user.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task.h>
 #include <linux/fs.h>
 #include <linux/tty.h>
 #include <linux/binfmts.h>
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 4a5c6e73ecd4..1d71c051a951 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -9,6 +9,7 @@
 #include <linux/list.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/export.h>
 #include <linux/percpu.h>
 #include <linux/kthread.h>
diff --git a/kernel/sys.c b/kernel/sys.c
index 8c0392c8be51..7f1ecd2e41c1 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -54,6 +54,7 @@
 #include <linux/sched/stat.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/coredump.h>
+#include <linux/sched/task.h>
 #include <linux/rcupdate.h>
 #include <linux/uidgid.h>
 #include <linux/cred.h>
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 0c0609326391..0d1597c9ee30 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -15,6 +15,7 @@
 
 #include <linux/stop_machine.h>
 #include <linux/clocksource.h>
+#include <linux/sched/task.h>
 #include <linux/kallsyms.h>
 #include <linux/seq_file.h>
 #include <linux/suspend.h>
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 9f6984d52fec..685c50ae6300 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -25,6 +25,7 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/task.h>
 #include <linux/static_key.h>
 
 extern struct tracepoint * const __start___tracepoints_ptrs[];
diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index 60c57ec936db..942adf13418d 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -19,6 +19,7 @@
 
 #include <linux/scatterlist.h>
 #include <linux/dma-mapping.h>
+#include <linux/sched/task.h>
 #include <linux/stacktrace.h>
 #include <linux/dma-debug.h>
 #include <linux/spinlock.h>
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 2df6d3687b2a..a3970573359e 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -74,6 +74,7 @@
 #include <linux/kernel.h>
 #include <linux/list.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/task.h>
 #include <linux/jiffies.h>
 #include <linux/delay.h>
 #include <linux/export.h>
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index b74984db386e..27f7210e7fab 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -41,6 +41,7 @@
 #include <linux/page-flags.h>
 #include <linux/kernel-page-flags.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/task.h>
 #include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/export.h>
diff --git a/mm/memory.c b/mm/memory.c
index d4994a28dc85..a97a4cec2e1f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -43,6 +43,7 @@
 #include <linux/sched/mm.h>
 #include <linux/sched/coredump.h>
 #include <linux/sched/numa_balancing.h>
+#include <linux/sched/task.h>
 #include <linux/hugetlb.h>
 #include <linux/mman.h>
 #include <linux/swap.h>
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 69f75b014607..d083714a2bb9 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -24,6 +24,7 @@
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/coredump.h>
+#include <linux/sched/task.h>
 #include <linux/swap.h>
 #include <linux/timex.h>
 #include <linux/jiffies.h>
diff --git a/mm/rmap.c b/mm/rmap.c
index 0367a0506667..2da487d6cea8 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -47,6 +47,7 @@
 
 #include <linux/mm.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/task.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ff2bf3f61b14..521ef9b6064f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -7,6 +7,7 @@
 
 #include <linux/mm.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/task.h>
 #include <linux/hugetlb.h>
 #include <linux/mman.h>
 #include <linux/slab.h>
diff --git a/mm/usercopy.c b/mm/usercopy.c
index 7ccad05c0d5c..d155e12563b1 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -17,6 +17,8 @@
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <asm/sections.h>
 
 enum {
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index bcb0b597c391..52c34532c785 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -12,6 +12,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/slab.h>
 #include <linux/syscalls.h>
 #include <linux/key.h>
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index b12f873f92ba..57ff53696144 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -29,6 +29,7 @@
 #include <linux/tracehook.h>
 #include <linux/errno.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/task.h>
 #include <linux/lsm_hooks.h>
 #include <linux/xattr.h>
 #include <linux/capability.h>
-- 
cgit v1.2.3


From 68db0cf10678630d286f4bbbbdfa102951a35faa Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 18:51:37 +0100
Subject: sched/headers: Prepare for new header dependencies before moving code
 to <linux/sched/task_stack.h>

We are going to split <linux/sched/task_stack.h> out of <linux/sched.h>, which
will have to be picked up from other headers and a couple of .c files.

Create a trivial placeholder <linux/sched/task_stack.h> file that just
maps to <linux/sched.h> to make this patch obviously correct and
bisectable.

Include the new header in the files that are going to need it.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/alpha/kernel/osf_sys.c            | 1 +
 arch/alpha/kernel/process.c            | 1 +
 arch/alpha/kernel/ptrace.c             | 1 +
 arch/alpha/kernel/signal.c             | 1 +
 arch/arc/kernel/kgdb.c                 | 1 +
 arch/arc/kernel/process.c              | 1 +
 arch/arc/kernel/ptrace.c               | 1 +
 arch/arc/kernel/signal.c               | 2 ++
 arch/arm/kernel/perf_regs.c            | 1 +
 arch/arm/kernel/process.c              | 1 +
 arch/arm/kernel/ptrace.c               | 1 +
 arch/arm/kernel/smp.c                  | 1 +
 arch/arm/kernel/traps.c                | 1 +
 arch/arm64/include/asm/compat.h        | 1 +
 arch/arm64/kernel/debug-monitors.c     | 1 +
 arch/arm64/kernel/kgdb.c               | 2 ++
 arch/arm64/kernel/perf_regs.c          | 1 +
 arch/arm64/kernel/process.c            | 1 +
 arch/arm64/kernel/ptrace.c             | 1 +
 arch/arm64/kernel/smp.c                | 1 +
 arch/arm64/kernel/stacktrace.c         | 1 +
 arch/arm64/kernel/traps.c              | 1 +
 arch/avr32/kernel/process.c            | 1 +
 arch/avr32/kernel/ptrace.c             | 1 +
 arch/avr32/kernel/stacktrace.c         | 1 +
 arch/blackfin/kernel/process.c         | 1 +
 arch/blackfin/kernel/ptrace.c          | 1 +
 arch/blackfin/kernel/signal.c          | 1 +
 arch/blackfin/kernel/stacktrace.c      | 1 +
 arch/blackfin/mach-common/smp.c        | 1 +
 arch/c6x/kernel/process.c              | 1 +
 arch/c6x/kernel/ptrace.c               | 1 +
 arch/cris/arch-v10/kernel/process.c    | 1 +
 arch/cris/arch-v10/kernel/ptrace.c     | 1 +
 arch/cris/arch-v10/kernel/signal.c     | 1 +
 arch/cris/arch-v32/kernel/process.c    | 1 +
 arch/cris/arch-v32/kernel/ptrace.c     | 1 +
 arch/cris/arch-v32/kernel/signal.c     | 1 +
 arch/frv/kernel/process.c              | 1 +
 arch/h8300/kernel/process.c            | 1 +
 arch/h8300/kernel/signal.c             | 1 +
 arch/hexagon/kernel/kgdb.c             | 1 +
 arch/hexagon/kernel/process.c          | 1 +
 arch/hexagon/kernel/ptrace.c           | 1 +
 arch/hexagon/kernel/signal.c           | 2 ++
 arch/hexagon/kernel/stacktrace.c       | 1 +
 arch/hexagon/kernel/traps.c            | 1 +
 arch/ia64/kernel/perfmon.c             | 1 +
 arch/ia64/kernel/process.c             | 1 +
 arch/ia64/kernel/ptrace.c              | 1 +
 arch/ia64/kernel/setup.c               | 1 +
 arch/ia64/kernel/sys_ia64.c            | 1 +
 arch/m32r/kernel/process.c             | 1 +
 arch/m32r/kernel/ptrace.c              | 1 +
 arch/m68k/kernel/process.c             | 1 +
 arch/m68k/kernel/ptrace.c              | 1 +
 arch/metag/kernel/process.c            | 1 +
 arch/metag/kernel/ptrace.c             | 2 ++
 arch/metag/kernel/signal.c             | 1 +
 arch/metag/kernel/smp.c                | 1 +
 arch/metag/kernel/traps.c              | 1 +
 arch/microblaze/kernel/process.c       | 1 +
 arch/microblaze/kernel/ptrace.c        | 1 +
 arch/microblaze/kernel/unwind.c        | 1 +
 arch/mips/include/asm/fpu.h            | 1 +
 arch/mips/kernel/crash.c               | 1 +
 arch/mips/kernel/perf_event.c          | 1 +
 arch/mips/kernel/process.c             | 1 +
 arch/mips/kernel/ptrace.c              | 1 +
 arch/mips/kernel/ptrace32.c            | 1 +
 arch/mips/kernel/stacktrace.c          | 1 +
 arch/mips/kernel/syscall.c             | 1 +
 arch/mips/loongson64/loongson-3/smp.c  | 1 +
 arch/mips/paravirt/paravirt-smp.c      | 1 +
 arch/mips/sibyte/bcm1480/smp.c         | 1 +
 arch/mn10300/kernel/process.c          | 1 +
 arch/mn10300/kernel/ptrace.c           | 1 +
 arch/nios2/kernel/process.c            | 1 +
 arch/nios2/kernel/ptrace.c             | 1 +
 arch/openrisc/kernel/process.c         | 1 +
 arch/openrisc/kernel/ptrace.c          | 1 +
 arch/parisc/kernel/process.c           | 1 +
 arch/powerpc/kernel/process.c          | 1 +
 arch/powerpc/mm/fault.c                | 1 +
 arch/powerpc/perf/perf_regs.c          | 1 +
 arch/s390/include/asm/compat.h         | 1 +
 arch/s390/include/asm/kprobes.h        | 1 +
 arch/s390/kernel/compat_signal.c       | 1 +
 arch/s390/kernel/dumpstack.c           | 1 +
 arch/s390/kernel/process.c             | 1 +
 arch/s390/kernel/ptrace.c              | 1 +
 arch/s390/kernel/runtime_instr.c       | 2 ++
 arch/s390/kernel/signal.c              | 1 +
 arch/s390/kernel/smp.c                 | 1 +
 arch/s390/kernel/uprobes.c             | 2 ++
 arch/score/kernel/process.c            | 1 +
 arch/score/kernel/ptrace.c             | 1 +
 arch/sh/kernel/cpu/fpu.c               | 1 +
 arch/sh/kernel/dumpstack.c             | 1 +
 arch/sh/kernel/kgdb.c                  | 2 ++
 arch/sh/kernel/process.c               | 1 +
 arch/sh/kernel/process_32.c            | 1 +
 arch/sh/kernel/process_64.c            | 1 +
 arch/sh/kernel/ptrace_32.c             | 1 +
 arch/sh/kernel/ptrace_64.c             | 1 +
 arch/sh/kernel/signal_32.c             | 1 +
 arch/sh/kernel/sys_sh32.c              | 1 +
 arch/sh/kernel/traps.c                 | 1 +
 arch/sh/kernel/traps_32.c              | 2 ++
 arch/sparc/kernel/process_32.c         | 1 +
 arch/sparc/kernel/process_64.c         | 1 +
 arch/sparc/kernel/ptrace_64.c          | 1 +
 arch/tile/kernel/compat_signal.c       | 1 +
 arch/tile/kernel/kgdb.c                | 2 ++
 arch/tile/kernel/process.c             | 1 +
 arch/tile/kernel/ptrace.c              | 2 ++
 arch/tile/kernel/signal.c              | 1 +
 arch/tile/kernel/stack.c               | 1 +
 arch/um/kernel/exec.c                  | 1 +
 arch/um/kernel/process.c               | 1 +
 arch/um/kernel/skas/process.c          | 1 +
 arch/unicore32/kernel/process.c        | 1 +
 arch/unicore32/kernel/ptrace.c         | 1 +
 arch/unicore32/kernel/traps.c          | 1 +
 arch/x86/entry/common.c                | 1 +
 arch/x86/entry/vdso/vma.c              | 1 +
 arch/x86/ia32/ia32_aout.c              | 1 +
 arch/x86/ia32/ia32_signal.c            | 1 +
 arch/x86/kernel/dumpstack.c            | 1 +
 arch/x86/kernel/fpu/regset.c           | 1 +
 arch/x86/kernel/ioport.c               | 1 +
 arch/x86/kernel/irq_64.c               | 1 +
 arch/x86/kernel/perf_regs.c            | 1 +
 arch/x86/kernel/process.c              | 1 +
 arch/x86/kernel/process_32.c           | 1 +
 arch/x86/kernel/process_64.c           | 1 +
 arch/x86/kernel/ptrace.c               | 1 +
 arch/x86/kernel/signal.c               | 1 +
 arch/x86/kernel/smpboot.c              | 1 +
 arch/x86/kernel/stacktrace.c           | 1 +
 arch/x86/kernel/step.c                 | 1 +
 arch/x86/kernel/traps.c                | 1 +
 arch/x86/kernel/unwind_frame.c         | 1 +
 arch/x86/kernel/vm86_32.c              | 1 +
 arch/x86/mm/fault.c                    | 1 +
 arch/xtensa/kernel/process.c           | 1 +
 arch/xtensa/kernel/ptrace.c            | 1 +
 arch/xtensa/kernel/signal.c            | 1 +
 arch/xtensa/kernel/smp.c               | 1 +
 block/blk-map.c                        | 1 +
 drivers/hv/vmbus_drv.c                 | 2 ++
 drivers/ide/ide-cd.c                   | 1 +
 drivers/md/bcache/closure.h            | 1 +
 drivers/misc/lkdtm_usercopy.c          | 1 +
 drivers/mtd/nand/gpmi-nand/gpmi-nand.c | 1 +
 fs/binfmt_aout.c                       | 1 +
 fs/binfmt_elf.c                        | 1 +
 fs/binfmt_elf_fdpic.c                  | 3 ++-
 fs/binfmt_flat.c                       | 1 +
 fs/coredump.c                          | 1 +
 include/linux/elfcore.h                | 2 ++
 include/linux/perf_regs.h              | 2 ++
 include/linux/sched/task_stack.h       | 6 ++++++
 init/main.c                            | 1 +
 kernel/events/callchain.c              | 2 ++
 kernel/exit.c                          | 1 +
 kernel/fork.c                          | 1 +
 kernel/printk/printk.c                 | 1 +
 kernel/sched/sched.h                   | 1 +
 kernel/seccomp.c                       | 1 +
 kernel/signal.c                        | 1 +
 kernel/trace/trace_stack.c             | 1 +
 lib/debugobjects.c                     | 1 +
 lib/dma-debug.c                        | 1 +
 lib/syscall.c                          | 1 +
 mm/kasan/kasan.c                       | 1 +
 mm/kmemleak.c                          | 1 +
 mm/util.c                              | 1 +
 178 files changed, 198 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/sched/task_stack.h

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 3b9b2a382ba2..73446baa632e 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -13,6 +13,7 @@
 #include <linux/errno.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/task_stack.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index 713b4fac998e..0b9635040721 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -13,6 +13,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c
index bc4d2cdcf21d..285a82d491ef 100644
--- a/arch/alpha/kernel/ptrace.c
+++ b/arch/alpha/kernel/ptrace.c
@@ -6,6 +6,7 @@
 
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/errno.h>
diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c
index b8221f112eee..8129dd92cadc 100644
--- a/arch/alpha/kernel/signal.c
+++ b/arch/alpha/kernel/signal.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/sched/signal.h>
+#include <linux/sched/task_stack.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
 #include <linux/errno.h>
diff --git a/arch/arc/kernel/kgdb.c b/arch/arc/kernel/kgdb.c
index ecf6a7869375..9a3c34af2ae8 100644
--- a/arch/arc/kernel/kgdb.c
+++ b/arch/arc/kernel/kgdb.c
@@ -10,6 +10,7 @@
 
 #include <linux/kgdb.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <asm/disasm.h>
 #include <asm/cacheflush.h>
 
diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c
index d618d26721ab..2a018de6d6cd 100644
--- a/arch/arc/kernel/process.c
+++ b/arch/arc/kernel/process.c
@@ -12,6 +12,7 @@
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 
 #include <linux/mm.h>
 #include <linux/fs.h>
diff --git a/arch/arc/kernel/ptrace.c b/arch/arc/kernel/ptrace.c
index 4442204fe238..31150060d38b 100644
--- a/arch/arc/kernel/ptrace.c
+++ b/arch/arc/kernel/ptrace.c
@@ -8,6 +8,7 @@
 
 #include <linux/ptrace.h>
 #include <linux/tracehook.h>
+#include <linux/sched/task_stack.h>
 #include <linux/regset.h>
 #include <linux/unistd.h>
 #include <linux/elf.h>
diff --git a/arch/arc/kernel/signal.c b/arch/arc/kernel/signal.c
index d347bbc086fe..48685445002e 100644
--- a/arch/arc/kernel/signal.c
+++ b/arch/arc/kernel/signal.c
@@ -53,6 +53,8 @@
 #include <linux/uaccess.h>
 #include <linux/syscalls.h>
 #include <linux/tracehook.h>
+#include <linux/sched/task_stack.h>
+
 #include <asm/ucontext.h>
 
 struct rt_sigframe {
diff --git a/arch/arm/kernel/perf_regs.c b/arch/arm/kernel/perf_regs.c
index 592dda3f21ff..c366b83bf955 100644
--- a/arch/arm/kernel/perf_regs.c
+++ b/arch/arm/kernel/perf_regs.c
@@ -3,6 +3,7 @@
 #include <linux/kernel.h>
 #include <linux/perf_event.h>
 #include <linux/bug.h>
+#include <linux/sched/task_stack.h>
 #include <asm/perf_regs.h>
 #include <asm/ptrace.h>
 
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index d4c7c9a1afa9..939e8b58c59d 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -14,6 +14,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/stddef.h>
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 46f7bab81c40..58e3771e4c5b 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -11,6 +11,7 @@
  */
 #include <linux/kernel.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/elf.h>
 #include <linux/smp.h>
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 2083a5370308..b724cff7ad60 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -13,6 +13,7 @@
 #include <linux/spinlock.h>
 #include <linux/sched.h>
 #include <linux/sched/hotplug.h>
+#include <linux/sched/task_stack.h>
 #include <linux/interrupt.h>
 #include <linux/cache.h>
 #include <linux/profile.h>
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index a9dad001b97d..948c648fea00 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -26,6 +26,7 @@
 #include <linux/init.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
 #include <linux/irq.h>
 
 #include <linux/atomic.h>
diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h
index eb8432bb82b8..e39d487bf724 100644
--- a/arch/arm64/include/asm/compat.h
+++ b/arch/arm64/include/asm/compat.h
@@ -23,6 +23,7 @@
  */
 #include <linux/types.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 
 #define COMPAT_USER_HZ		100
 #ifdef __AARCH64EB__
diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c
index 2bd426448fc1..32913567da08 100644
--- a/arch/arm64/kernel/debug-monitors.c
+++ b/arch/arm64/kernel/debug-monitors.c
@@ -26,6 +26,7 @@
 #include <linux/kprobes.h>
 #include <linux/stat.h>
 #include <linux/uaccess.h>
+#include <linux/sched/task_stack.h>
 
 #include <asm/cpufeature.h>
 #include <asm/cputype.h>
diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c
index d217c9e95b06..2122cd187f19 100644
--- a/arch/arm64/kernel/kgdb.c
+++ b/arch/arm64/kernel/kgdb.c
@@ -24,6 +24,8 @@
 #include <linux/kdebug.h>
 #include <linux/kgdb.h>
 #include <linux/kprobes.h>
+#include <linux/sched/task_stack.h>
+
 #include <asm/debug-monitors.h>
 #include <asm/insn.h>
 #include <asm/traps.h>
diff --git a/arch/arm64/kernel/perf_regs.c b/arch/arm64/kernel/perf_regs.c
index 3f62b35fb6f1..bd1b74c2436f 100644
--- a/arch/arm64/kernel/perf_regs.c
+++ b/arch/arm64/kernel/perf_regs.c
@@ -2,6 +2,7 @@
 #include <linux/kernel.h>
 #include <linux/perf_event.h>
 #include <linux/bug.h>
+#include <linux/sched/task_stack.h>
 
 #include <asm/compat.h>
 #include <asm/perf_regs.h>
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index f9b8e74ff8f2..043d373b8369 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -26,6 +26,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/stddef.h>
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 64fc32ea3422..c142459a88f3 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -23,6 +23,7 @@
 #include <linux/compat.h>
 #include <linux/kernel.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/ptrace.h>
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index f66e58523b96..83c0a839a6ad 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -23,6 +23,7 @@
 #include <linux/spinlock.h>
 #include <linux/sched.h>
 #include <linux/sched/hotplug.h>
+#include <linux/sched/task_stack.h>
 #include <linux/interrupt.h>
 #include <linux/cache.h>
 #include <linux/profile.h>
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 7597e42feeea..feac80c22f61 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -20,6 +20,7 @@
 #include <linux/ftrace.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
 #include <linux/stacktrace.h>
 
 #include <asm/irq.h>
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 5a5d83791837..bdbd0c3febf4 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -31,6 +31,7 @@
 #include <linux/init.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
 #include <linux/syscalls.h>
 
 #include <asm/atomic.h>
diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c
index 75b944a5107c..ad0dfccedb79 100644
--- a/arch/avr32/kernel/process.c
+++ b/arch/avr32/kernel/process.c
@@ -8,6 +8,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/fs.h>
diff --git a/arch/avr32/kernel/ptrace.c b/arch/avr32/kernel/ptrace.c
index a89b893279bb..41a14e96a1db 100644
--- a/arch/avr32/kernel/ptrace.c
+++ b/arch/avr32/kernel/ptrace.c
@@ -8,6 +8,7 @@
 #undef DEBUG
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/ptrace.h>
 #include <linux/errno.h>
diff --git a/arch/avr32/kernel/stacktrace.c b/arch/avr32/kernel/stacktrace.c
index c09f0d8dd679..f8cc995cf0e0 100644
--- a/arch/avr32/kernel/stacktrace.c
+++ b/arch/avr32/kernel/stacktrace.c
@@ -8,6 +8,7 @@
  * published by the Free Software Foundation.
  */
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/stacktrace.h>
 #include <linux/thread_info.h>
 #include <linux/module.h>
diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c
index 1a1f444004b1..b691ef875a40 100644
--- a/arch/blackfin/kernel/process.c
+++ b/arch/blackfin/kernel/process.c
@@ -14,6 +14,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/tick.h>
 #include <linux/fs.h>
 #include <linux/err.h>
diff --git a/arch/blackfin/kernel/ptrace.c b/arch/blackfin/kernel/ptrace.c
index 360d99645163..a6827095b99a 100644
--- a/arch/blackfin/kernel/ptrace.c
+++ b/arch/blackfin/kernel/ptrace.c
@@ -7,6 +7,7 @@
 
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/elf.h>
diff --git a/arch/blackfin/kernel/signal.c b/arch/blackfin/kernel/signal.c
index ea570db598e5..5f5172779204 100644
--- a/arch/blackfin/kernel/signal.c
+++ b/arch/blackfin/kernel/signal.c
@@ -12,6 +12,7 @@
 #include <linux/binfmts.h>
 #include <linux/uaccess.h>
 #include <linux/tracehook.h>
+#include <linux/sched/task_stack.h>
 
 #include <asm/cacheflush.h>
 #include <asm/ucontext.h>
diff --git a/arch/blackfin/kernel/stacktrace.c b/arch/blackfin/kernel/stacktrace.c
index 30301e1eace5..17198f3650b6 100644
--- a/arch/blackfin/kernel/stacktrace.c
+++ b/arch/blackfin/kernel/stacktrace.c
@@ -6,6 +6,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/stacktrace.h>
 #include <linux/thread_info.h>
 #include <linux/module.h>
diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c
index a2e6db2ce811..3ac98252d299 100644
--- a/arch/blackfin/mach-common/smp.c
+++ b/arch/blackfin/mach-common/smp.c
@@ -12,6 +12,7 @@
 #include <linux/init.h>
 #include <linux/spinlock.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/interrupt.h>
 #include <linux/cache.h>
 #include <linux/clockchips.h>
diff --git a/arch/c6x/kernel/process.c b/arch/c6x/kernel/process.c
index 6b61779d426a..c4ecb24c2d5c 100644
--- a/arch/c6x/kernel/process.c
+++ b/arch/c6x/kernel/process.c
@@ -18,6 +18,7 @@
 #include <linux/syscalls.h>
 #include <linux/reboot.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 
 #include <asm/syscalls.h>
 
diff --git a/arch/c6x/kernel/ptrace.c b/arch/c6x/kernel/ptrace.c
index 3c494e84444d..a27e1f02ce18 100644
--- a/arch/c6x/kernel/ptrace.c
+++ b/arch/c6x/kernel/ptrace.c
@@ -14,6 +14,7 @@
 #include <linux/tracehook.h>
 #include <linux/regset.h>
 #include <linux/elf.h>
+#include <linux/sched/task_stack.h>
 
 #include <asm/cacheflush.h>
 
diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c
index 808c2186b704..e299d30105b5 100644
--- a/arch/cris/arch-v10/kernel/process.c
+++ b/arch/cris/arch-v10/kernel/process.c
@@ -13,6 +13,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/slab.h>
 #include <linux/err.h>
 #include <linux/fs.h>
diff --git a/arch/cris/arch-v10/kernel/ptrace.c b/arch/cris/arch-v10/kernel/ptrace.c
index eca94c7d56e7..c2f2b9b83cc4 100644
--- a/arch/cris/arch-v10/kernel/ptrace.c
+++ b/arch/cris/arch-v10/kernel/ptrace.c
@@ -4,6 +4,7 @@
 
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/errno.h>
diff --git a/arch/cris/arch-v10/kernel/signal.c b/arch/cris/arch-v10/kernel/signal.c
index db30c98e4926..bab4a8dd6bfd 100644
--- a/arch/cris/arch-v10/kernel/signal.c
+++ b/arch/cris/arch-v10/kernel/signal.c
@@ -14,6 +14,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/kernel.h>
diff --git a/arch/cris/arch-v32/kernel/process.c b/arch/cris/arch-v32/kernel/process.c
index c852df262948..c530a8fa87ce 100644
--- a/arch/cris/arch-v32/kernel/process.c
+++ b/arch/cris/arch-v32/kernel/process.c
@@ -11,6 +11,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/slab.h>
 #include <linux/err.h>
 #include <linux/fs.h>
diff --git a/arch/cris/arch-v32/kernel/ptrace.c b/arch/cris/arch-v32/kernel/ptrace.c
index c366bc05466a..0461e95bbb62 100644
--- a/arch/cris/arch-v32/kernel/ptrace.c
+++ b/arch/cris/arch-v32/kernel/ptrace.c
@@ -4,6 +4,7 @@
 
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/errno.h>
diff --git a/arch/cris/arch-v32/kernel/signal.c b/arch/cris/arch-v32/kernel/signal.c
index 816bf2ca93ef..ea2e8e1398e8 100644
--- a/arch/cris/arch-v32/kernel/signal.c
+++ b/arch/cris/arch-v32/kernel/signal.c
@@ -3,6 +3,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/kernel.h>
diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c
index c96dbd4b8626..5a4c92abc99e 100644
--- a/arch/frv/kernel/process.c
+++ b/arch/frv/kernel/process.c
@@ -15,6 +15,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c
index 54e4d6f01865..0f5db5bb561b 100644
--- a/arch/h8300/kernel/process.c
+++ b/arch/h8300/kernel/process.c
@@ -27,6 +27,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c
index d784f7117f9a..1e8070d08770 100644
--- a/arch/h8300/kernel/signal.c
+++ b/arch/h8300/kernel/signal.c
@@ -25,6 +25,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
diff --git a/arch/hexagon/kernel/kgdb.c b/arch/hexagon/kernel/kgdb.c
index 62dece3ad827..16c24b22d0b2 100644
--- a/arch/hexagon/kernel/kgdb.c
+++ b/arch/hexagon/kernel/kgdb.c
@@ -20,6 +20,7 @@
 
 #include <linux/irq.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/kdebug.h>
 #include <linux/kgdb.h>
 
diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c
index a2a822a875b6..de715bab7956 100644
--- a/arch/hexagon/kernel/process.c
+++ b/arch/hexagon/kernel/process.c
@@ -21,6 +21,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/types.h>
 #include <linux/module.h>
 #include <linux/tick.h>
diff --git a/arch/hexagon/kernel/ptrace.c b/arch/hexagon/kernel/ptrace.c
index 390a9ad14ca1..ecd75e2e8eb3 100644
--- a/arch/hexagon/kernel/ptrace.c
+++ b/arch/hexagon/kernel/ptrace.c
@@ -22,6 +22,7 @@
 
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/errno.h>
diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c
index c6b22b9945a7..78aa7304a5c9 100644
--- a/arch/hexagon/kernel/signal.c
+++ b/arch/hexagon/kernel/signal.c
@@ -21,6 +21,8 @@
 #include <linux/linkage.h>
 #include <linux/syscalls.h>
 #include <linux/tracehook.h>
+#include <linux/sched/task_stack.h>
+
 #include <asm/registers.h>
 #include <asm/thread_info.h>
 #include <asm/unistd.h>
diff --git a/arch/hexagon/kernel/stacktrace.c b/arch/hexagon/kernel/stacktrace.c
index f94918b449a8..41866a06adf7 100644
--- a/arch/hexagon/kernel/stacktrace.c
+++ b/arch/hexagon/kernel/stacktrace.c
@@ -19,6 +19,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/stacktrace.h>
 #include <linux/thread_info.h>
 #include <linux/module.h>
diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c
index b55f13c70b34..2942a9204a9a 100644
--- a/arch/hexagon/kernel/traps.c
+++ b/arch/hexagon/kernel/traps.c
@@ -21,6 +21,7 @@
 #include <linux/init.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/kdebug.h>
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 7e943d3c05ed..09f86ebfcc7b 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -23,6 +23,7 @@
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/interrupt.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index 054facf22156..d344d0d691aa 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -23,6 +23,7 @@
 #include <linux/sched/debug.h>
 #include <linux/sched/hotplug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/stddef.h>
 #include <linux/thread_info.h>
 #include <linux/unistd.h>
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
index 04fe1436e1cc..3f8293378a83 100644
--- a/arch/ia64/kernel/ptrace.c
+++ b/arch/ia64/kernel/ptrace.c
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/errno.h>
 #include <linux/ptrace.h>
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index 32a6cc36296f..b2b4f5216180 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -33,6 +33,7 @@
 #include <linux/reboot.h>
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
+#include <linux/sched/task_stack.h>
 #include <linux/seq_file.h>
 #include <linux/string.h>
 #include <linux/threads.h>
diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c
index ce4cc60d519b..5ce927c854a6 100644
--- a/arch/ia64/kernel/sys_ia64.c
+++ b/arch/ia64/kernel/sys_ia64.c
@@ -11,6 +11,7 @@
 #include <linux/mman.h>
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/task_stack.h>
 #include <linux/shm.h>
 #include <linux/file.h>		/* doh, must come after sched.h... */
 #include <linux/smp.h>
diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c
index 2a450382934c..d8ffcfec599c 100644
--- a/arch/m32r/kernel/process.c
+++ b/arch/m32r/kernel/process.c
@@ -24,6 +24,7 @@
 #include <linux/slab.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/module.h>
 #include <linux/ptrace.h>
 #include <linux/unistd.h>
diff --git a/arch/m32r/kernel/ptrace.c b/arch/m32r/kernel/ptrace.c
index a68acb9fa515..2d887400e30e 100644
--- a/arch/m32r/kernel/ptrace.c
+++ b/arch/m32r/kernel/ptrace.c
@@ -16,6 +16,7 @@
 
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/err.h>
 #include <linux/smp.h>
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index 5d335d178706..e475c945c8b2 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -15,6 +15,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
diff --git a/arch/m68k/kernel/ptrace.c b/arch/m68k/kernel/ptrace.c
index 9cd86d7343a6..748c63bd0081 100644
--- a/arch/m68k/kernel/ptrace.c
+++ b/arch/m68k/kernel/ptrace.c
@@ -12,6 +12,7 @@
 
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/errno.h>
diff --git a/arch/metag/kernel/process.c b/arch/metag/kernel/process.c
index 801e6f927e62..c4606ce743d2 100644
--- a/arch/metag/kernel/process.c
+++ b/arch/metag/kernel/process.c
@@ -10,6 +10,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/unistd.h>
diff --git a/arch/metag/kernel/ptrace.c b/arch/metag/kernel/ptrace.c
index 7563628822bd..5fd16ee5280c 100644
--- a/arch/metag/kernel/ptrace.c
+++ b/arch/metag/kernel/ptrace.c
@@ -15,6 +15,8 @@
 #include <linux/tracehook.h>
 #include <linux/elf.h>
 #include <linux/uaccess.h>
+#include <linux/sched/task_stack.h>
+
 #include <trace/syscall.h>
 
 #define CREATE_TRACE_POINTS
diff --git a/arch/metag/kernel/signal.c b/arch/metag/kernel/signal.c
index ce49d429c74a..338925d808e6 100644
--- a/arch/metag/kernel/signal.c
+++ b/arch/metag/kernel/signal.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/kernel.h>
diff --git a/arch/metag/kernel/smp.c b/arch/metag/kernel/smp.c
index 198eda75846b..cab2aa64ca82 100644
--- a/arch/metag/kernel/smp.c
+++ b/arch/metag/kernel/smp.c
@@ -14,6 +14,7 @@
 #include <linux/spinlock.h>
 #include <linux/sched.h>
 #include <linux/sched/hotplug.h>
+#include <linux/sched/task_stack.h>
 #include <linux/interrupt.h>
 #include <linux/cache.h>
 #include <linux/profile.h>
diff --git a/arch/metag/kernel/traps.c b/arch/metag/kernel/traps.c
index 23c5ac33a93f..444851e510d5 100644
--- a/arch/metag/kernel/traps.c
+++ b/arch/metag/kernel/traps.c
@@ -12,6 +12,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/signal.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c
index a642fe5ac5ff..e92a817e645f 100644
--- a/arch/microblaze/kernel/process.c
+++ b/arch/microblaze/kernel/process.c
@@ -13,6 +13,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/pm.h>
 #include <linux/tick.h>
 #include <linux/bitops.h>
diff --git a/arch/microblaze/kernel/ptrace.c b/arch/microblaze/kernel/ptrace.c
index 8cfa98cadf3d..badd286882ae 100644
--- a/arch/microblaze/kernel/ptrace.c
+++ b/arch/microblaze/kernel/ptrace.c
@@ -27,6 +27,7 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/ptrace.h>
 #include <linux/signal.h>
 #include <linux/elf.h>
diff --git a/arch/microblaze/kernel/unwind.c b/arch/microblaze/kernel/unwind.c
index 61c04eed14d5..34c270cb11fc 100644
--- a/arch/microblaze/kernel/unwind.c
+++ b/arch/microblaze/kernel/unwind.c
@@ -17,6 +17,7 @@
 #include <linux/kallsyms.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/stacktrace.h>
 #include <linux/types.h>
 #include <linux/errno.h>
diff --git a/arch/mips/include/asm/fpu.h b/arch/mips/include/asm/fpu.h
index f06f97bd62df..321752bcbab6 100644
--- a/arch/mips/include/asm/fpu.h
+++ b/arch/mips/include/asm/fpu.h
@@ -11,6 +11,7 @@
 #define _ASM_FPU_H
 
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/thread_info.h>
 #include <linux/bitops.h>
 
diff --git a/arch/mips/kernel/crash.c b/arch/mips/kernel/crash.c
index 5a71518be0f1..ca25cd393b1c 100644
--- a/arch/mips/kernel/crash.c
+++ b/arch/mips/kernel/crash.c
@@ -8,6 +8,7 @@
 #include <linux/irq.h>
 #include <linux/types.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 
 /* This keeps a track of which one is crashing cpu. */
 static int crashing_cpu = -1;
diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c
index d64056e0bb56..f298eb2ff6c2 100644
--- a/arch/mips/kernel/perf_event.c
+++ b/arch/mips/kernel/perf_event.c
@@ -15,6 +15,7 @@
  */
 
 #include <linux/perf_event.h>
+#include <linux/sched/task_stack.h>
 
 #include <asm/stacktrace.h>
 
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index 8bfb833b3158..fb6b6b650719 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -13,6 +13,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/tick.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index fdef26382c37..339601267265 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -19,6 +19,7 @@
 #include <linux/elf.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/errno.h>
 #include <linux/ptrace.h>
diff --git a/arch/mips/kernel/ptrace32.c b/arch/mips/kernel/ptrace32.c
index 4f0998525626..40e212d6b26b 100644
--- a/arch/mips/kernel/ptrace32.c
+++ b/arch/mips/kernel/ptrace32.c
@@ -18,6 +18,7 @@
 #include <linux/compat.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/errno.h>
 #include <linux/ptrace.h>
diff --git a/arch/mips/kernel/stacktrace.c b/arch/mips/kernel/stacktrace.c
index 986f910961d9..7c7c902249f2 100644
--- a/arch/mips/kernel/stacktrace.c
+++ b/arch/mips/kernel/stacktrace.c
@@ -5,6 +5,7 @@
  */
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
 #include <linux/stacktrace.h>
 #include <linux/export.h>
 #include <asm/stacktrace.h>
diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c
index c86ddbaa4598..f1d17ece4181 100644
--- a/arch/mips/kernel/syscall.c
+++ b/arch/mips/kernel/syscall.c
@@ -26,6 +26,7 @@
 #include <linux/uaccess.h>
 #include <linux/slab.h>
 #include <linux/elf.h>
+#include <linux/sched/task_stack.h>
 
 #include <asm/asm.h>
 #include <asm/branch.h>
diff --git a/arch/mips/loongson64/loongson-3/smp.c b/arch/mips/loongson64/loongson-3/smp.c
index 66ede5b11355..64659fc73940 100644
--- a/arch/mips/loongson64/loongson-3/smp.c
+++ b/arch/mips/loongson64/loongson-3/smp.c
@@ -18,6 +18,7 @@
 #include <linux/cpu.h>
 #include <linux/sched.h>
 #include <linux/sched/hotplug.h>
+#include <linux/sched/task_stack.h>
 #include <linux/smp.h>
 #include <linux/cpufreq.h>
 #include <asm/processor.h>
diff --git a/arch/mips/paravirt/paravirt-smp.c b/arch/mips/paravirt/paravirt-smp.c
index f8d3e081b2eb..72eb1a56c645 100644
--- a/arch/mips/paravirt/paravirt-smp.c
+++ b/arch/mips/paravirt/paravirt-smp.c
@@ -10,6 +10,7 @@
 #include <linux/cpumask.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 
 #include <asm/mipsregs.h>
 #include <asm/setup.h>
diff --git a/arch/mips/sibyte/bcm1480/smp.c b/arch/mips/sibyte/bcm1480/smp.c
index 4c71aea25663..d0e94ffcc1b8 100644
--- a/arch/mips/sibyte/bcm1480/smp.c
+++ b/arch/mips/sibyte/bcm1480/smp.c
@@ -21,6 +21,7 @@
 #include <linux/smp.h>
 #include <linux/kernel_stat.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 
 #include <asm/mmu_context.h>
 #include <asm/io.h>
diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c
index 8ca7a9de09a5..c9fa42619c6a 100644
--- a/arch/mn10300/kernel/process.c
+++ b/arch/mn10300/kernel/process.c
@@ -13,6 +13,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
diff --git a/arch/mn10300/kernel/ptrace.c b/arch/mn10300/kernel/ptrace.c
index 976020f469c1..8009876a7ac4 100644
--- a/arch/mn10300/kernel/ptrace.c
+++ b/arch/mn10300/kernel/ptrace.c
@@ -11,6 +11,7 @@
  */
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/errno.h>
diff --git a/arch/nios2/kernel/process.c b/arch/nios2/kernel/process.c
index 5ee1ddf6a924..869a4d59de32 100644
--- a/arch/nios2/kernel/process.c
+++ b/arch/nios2/kernel/process.c
@@ -16,6 +16,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/tick.h>
 #include <linux/uaccess.h>
 
diff --git a/arch/nios2/kernel/ptrace.c b/arch/nios2/kernel/ptrace.c
index 681dda92eff1..de97bcb7dd44 100644
--- a/arch/nios2/kernel/ptrace.c
+++ b/arch/nios2/kernel/ptrace.c
@@ -14,6 +14,7 @@
 #include <linux/ptrace.h>
 #include <linux/regset.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/tracehook.h>
 #include <linux/uaccess.h>
 #include <linux/user.h>
diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c
index 0c5ad3bde0f5..828a29110459 100644
--- a/arch/openrisc/kernel/process.c
+++ b/arch/openrisc/kernel/process.c
@@ -24,6 +24,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/mm.h>
diff --git a/arch/openrisc/kernel/ptrace.c b/arch/openrisc/kernel/ptrace.c
index 228288887d74..eb97a8e7c8aa 100644
--- a/arch/openrisc/kernel/ptrace.c
+++ b/arch/openrisc/kernel/ptrace.c
@@ -18,6 +18,7 @@
 
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/string.h>
 
 #include <linux/mm.h>
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index e5f97239cc5e..06f7ca7fe70b 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -45,6 +45,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/slab.h>
 #include <linux/stddef.h>
 #include <linux/unistd.h>
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index b99b12656f6f..d645da302bf2 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -18,6 +18,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 8dc758658972..51def8a515be 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -17,6 +17,7 @@
 
 #include <linux/signal.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/string.h>
diff --git a/arch/powerpc/perf/perf_regs.c b/arch/powerpc/perf/perf_regs.c
index d24a8a3668fa..cbd82fde5770 100644
--- a/arch/powerpc/perf/perf_regs.c
+++ b/arch/powerpc/perf/perf_regs.c
@@ -10,6 +10,7 @@
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/perf_event.h>
 #include <linux/bug.h>
 #include <linux/stddef.h>
diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h
index 352f7bdaf11f..0ddd37e6c29d 100644
--- a/arch/s390/include/asm/compat.h
+++ b/arch/s390/include/asm/compat.h
@@ -5,6 +5,7 @@
  */
 #include <linux/types.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/thread_info.h>
 
 #define __TYPE_IS_PTR(t) (!__builtin_types_compatible_p(typeof(0?(t)0:0ULL), u64))
diff --git a/arch/s390/include/asm/kprobes.h b/arch/s390/include/asm/kprobes.h
index 84c0f9086483..1293c4066cfc 100644
--- a/arch/s390/include/asm/kprobes.h
+++ b/arch/s390/include/asm/kprobes.h
@@ -35,6 +35,7 @@
 #include <linux/types.h>
 #include <linux/ptrace.h>
 #include <linux/percpu.h>
+#include <linux/sched/task_stack.h>
 
 #define __ARCH_WANT_KPROBES_INSN_SLOT
 
diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c
index 362350cc485c..c620049c61f2 100644
--- a/arch/s390/kernel/compat_signal.c
+++ b/arch/s390/kernel/compat_signal.c
@@ -10,6 +10,7 @@
 
 #include <linux/compat.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/kernel.h>
diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c
index 72c584d62f59..829e1c53005c 100644
--- a/arch/s390/kernel/dumpstack.c
+++ b/arch/s390/kernel/dumpstack.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
 #include <asm/processor.h>
 #include <asm/debug.h>
 #include <asm/dis.h>
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 250c41f05721..20cd339e11ae 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -13,6 +13,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/elfcore.h>
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 12020b55887b..c14df0a1ec3c 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -8,6 +8,7 @@
 
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/errno.h>
diff --git a/arch/s390/kernel/runtime_instr.c b/arch/s390/kernel/runtime_instr.c
index fffa0e5462af..429d3a782f1c 100644
--- a/arch/s390/kernel/runtime_instr.c
+++ b/arch/s390/kernel/runtime_instr.c
@@ -11,6 +11,8 @@
 #include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/kernel_stat.h>
+#include <linux/sched/task_stack.h>
+
 #include <asm/runtime_instr.h>
 #include <asm/cpu_mf.h>
 #include <asm/irq.h>
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index 62a4c263e887..289dd50f9744 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -10,6 +10,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/kernel.h>
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index b5766e03cdcb..47a973b5b4f1 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -32,6 +32,7 @@
 #include <linux/cpu.h>
 #include <linux/slab.h>
 #include <linux/sched/hotplug.h>
+#include <linux/sched/task_stack.h>
 #include <linux/crash_dump.h>
 #include <linux/memblock.h>
 #include <asm/asm-offsets.h>
diff --git a/arch/s390/kernel/uprobes.c b/arch/s390/kernel/uprobes.c
index 66956c09d5bf..314e0ee3016a 100644
--- a/arch/s390/kernel/uprobes.c
+++ b/arch/s390/kernel/uprobes.c
@@ -9,6 +9,8 @@
 #include <linux/uprobes.h>
 #include <linux/compat.h>
 #include <linux/kdebug.h>
+#include <linux/sched/task_stack.h>
+
 #include <asm/switch_to.h>
 #include <asm/facility.h>
 #include <asm/kprobes.h>
diff --git a/arch/score/kernel/process.c b/arch/score/kernel/process.c
index 32924159d8c9..eb64d7a677cb 100644
--- a/arch/score/kernel/process.c
+++ b/arch/score/kernel/process.c
@@ -29,6 +29,7 @@
 #include <linux/pm.h>
 #include <linux/rcupdate.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 
 void (*pm_power_off)(void);
 EXPORT_SYMBOL(pm_power_off);
diff --git a/arch/score/kernel/ptrace.c b/arch/score/kernel/ptrace.c
index 8b75e54816c1..d8455e60bce0 100644
--- a/arch/score/kernel/ptrace.c
+++ b/arch/score/kernel/ptrace.c
@@ -28,6 +28,7 @@
 #include <linux/mm.h>
 #include <linux/ptrace.h>
 #include <linux/regset.h>
+#include <linux/sched/task_stack.h>
 
 #include <linux/uaccess.h>
 
diff --git a/arch/sh/kernel/cpu/fpu.c b/arch/sh/kernel/cpu/fpu.c
index b76370a47bf9..547c73478459 100644
--- a/arch/sh/kernel/cpu/fpu.c
+++ b/arch/sh/kernel/cpu/fpu.c
@@ -1,5 +1,6 @@
 #include <linux/sched/signal.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/slab.h>
 #include <asm/processor.h>
 #include <asm/fpu.h>
diff --git a/arch/sh/kernel/dumpstack.c b/arch/sh/kernel/dumpstack.c
index d00cab2f50f9..b564b1eae4ae 100644
--- a/arch/sh/kernel/dumpstack.c
+++ b/arch/sh/kernel/dumpstack.c
@@ -12,6 +12,7 @@
 #include <linux/ftrace.h>
 #include <linux/debug_locks.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
 #include <linux/kdebug.h>
 #include <linux/export.h>
 #include <linux/uaccess.h>
diff --git a/arch/sh/kernel/kgdb.c b/arch/sh/kernel/kgdb.c
index adad46e41a1d..4f04c6638a4d 100644
--- a/arch/sh/kernel/kgdb.c
+++ b/arch/sh/kernel/kgdb.c
@@ -14,6 +14,8 @@
 #include <linux/irq.h>
 #include <linux/io.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+
 #include <asm/cacheflush.h>
 #include <asm/traps.h>
 
diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c
index 80a61c5e3015..f8a695a223dd 100644
--- a/arch/sh/kernel/process.c
+++ b/arch/sh/kernel/process.c
@@ -2,6 +2,7 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/task_stack.h>
 #include <linux/export.h>
 #include <linux/stackprotector.h>
 #include <asm/fpu.h>
diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c
index 493c3eb86aa5..2c7bdf8cb934 100644
--- a/arch/sh/kernel/process_32.c
+++ b/arch/sh/kernel/process_32.c
@@ -17,6 +17,7 @@
 #include <linux/mm.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/slab.h>
 #include <linux/elfcore.h>
 #include <linux/kallsyms.h>
diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c
index 056607185158..ee2abe96f9f3 100644
--- a/arch/sh/kernel/process_64.c
+++ b/arch/sh/kernel/process_64.c
@@ -27,6 +27,7 @@
 #include <linux/io.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <asm/syscalls.h>
 #include <linux/uaccess.h>
 #include <asm/pgtable.h>
diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c
index 1aabfd356b35..5fc3ff606210 100644
--- a/arch/sh/kernel/ptrace_32.c
+++ b/arch/sh/kernel/ptrace_32.c
@@ -12,6 +12,7 @@
  */
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/errno.h>
diff --git a/arch/sh/kernel/ptrace_64.c b/arch/sh/kernel/ptrace_64.c
index c49d0d05a215..1e0656d9e7af 100644
--- a/arch/sh/kernel/ptrace_64.c
+++ b/arch/sh/kernel/ptrace_64.c
@@ -18,6 +18,7 @@
 #include <linux/kernel.h>
 #include <linux/rwsem.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/bitops.h>
diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c
index 5128d3001ee5..08bce11badc6 100644
--- a/arch/sh/kernel/signal_32.c
+++ b/arch/sh/kernel/signal_32.c
@@ -9,6 +9,7 @@
  *
  */
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/kernel.h>
diff --git a/arch/sh/kernel/sys_sh32.c b/arch/sh/kernel/sys_sh32.c
index d5287d76809c..a2e1231a90a3 100644
--- a/arch/sh/kernel/sys_sh32.c
+++ b/arch/sh/kernel/sys_sh32.c
@@ -1,5 +1,6 @@
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/sem.h>
diff --git a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c
index bc5c9f347b9e..b32d1c3a4655 100644
--- a/arch/sh/kernel/traps.c
+++ b/arch/sh/kernel/traps.c
@@ -5,6 +5,7 @@
 #include <linux/signal.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
 #include <linux/uaccess.h>
 #include <linux/hardirq.h>
 #include <linux/kernel.h>
diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c
index ff639342a8be..57cff00cad17 100644
--- a/arch/sh/kernel/traps_32.c
+++ b/arch/sh/kernel/traps_32.c
@@ -25,6 +25,8 @@
 #include <linux/sysfs.h>
 #include <linux/uaccess.h>
 #include <linux/perf_event.h>
+#include <linux/sched/task_stack.h>
+
 #include <asm/alignment.h>
 #include <asm/fpu.h>
 #include <asm/kprobes.h>
diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c
index ed87c45a785b..b6dac8e980f0 100644
--- a/arch/sparc/kernel/process_32.c
+++ b/arch/sparc/kernel/process_32.c
@@ -16,6 +16,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/stddef.h>
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index 4c82a73af893..1badc493e62e 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -16,6 +16,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c
index 901063c1cf7e..df9e731a76f5 100644
--- a/arch/sparc/kernel/ptrace_64.c
+++ b/arch/sparc/kernel/ptrace_64.c
@@ -12,6 +12,7 @@
 
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/errno.h>
 #include <linux/export.h>
diff --git a/arch/tile/kernel/compat_signal.c b/arch/tile/kernel/compat_signal.c
index c667e104a0c2..0e863f1ee08c 100644
--- a/arch/tile/kernel/compat_signal.c
+++ b/arch/tile/kernel/compat_signal.c
@@ -13,6 +13,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/kernel.h>
diff --git a/arch/tile/kernel/kgdb.c b/arch/tile/kernel/kgdb.c
index 9247d6b562f4..d4eb5fb2df9d 100644
--- a/arch/tile/kernel/kgdb.c
+++ b/arch/tile/kernel/kgdb.c
@@ -19,6 +19,8 @@
 #include <linux/kdebug.h>
 #include <linux/uaccess.h>
 #include <linux/module.h>
+#include <linux/sched/task_stack.h>
+
 #include <asm/cacheflush.h>
 
 static tile_bundle_bits singlestep_insn = TILEGX_BPT_BUNDLE | DIE_SSTEPBP;
diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index dbb4fa76d1dd..f0a0e18e4dfb 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -15,6 +15,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/preempt.h>
 #include <linux/module.h>
 #include <linux/fs.h>
diff --git a/arch/tile/kernel/ptrace.c b/arch/tile/kernel/ptrace.c
index e279572824b1..e1a078e6828e 100644
--- a/arch/tile/kernel/ptrace.c
+++ b/arch/tile/kernel/ptrace.c
@@ -23,6 +23,8 @@
 #include <linux/elf.h>
 #include <linux/tracehook.h>
 #include <linux/context_tracking.h>
+#include <linux/sched/task_stack.h>
+
 #include <asm/traps.h>
 #include <arch/chip.h>
 
diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c
index 8cc92ae6eb27..f2bf557bb005 100644
--- a/arch/tile/kernel/signal.c
+++ b/arch/tile/kernel/signal.c
@@ -15,6 +15,7 @@
 
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/kernel.h>
diff --git a/arch/tile/kernel/stack.c b/arch/tile/kernel/stack.c
index a1c0787fe9d8..94ecbc6676e5 100644
--- a/arch/tile/kernel/stack.c
+++ b/arch/tile/kernel/stack.c
@@ -14,6 +14,7 @@
 
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
 #include <linux/kernel.h>
 #include <linux/kprobes.h>
 #include <linux/module.h>
diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c
index 2df4e5d11939..31968677a0d0 100644
--- a/arch/um/kernel/exec.c
+++ b/arch/um/kernel/exec.c
@@ -9,6 +9,7 @@
 #include <linux/ptrace.h>
 #include <linux/sched.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/slab.h>
 #include <asm/current.h>
 #include <asm/processor.h>
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index e76dc7c11251..a9bd61820042 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -19,6 +19,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/seq_file.h>
 #include <linux/tick.h>
 #include <linux/threads.h>
diff --git a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c
index 527fa5881915..ddecf326dab2 100644
--- a/arch/um/kernel/skas/process.c
+++ b/arch/um/kernel/skas/process.c
@@ -5,6 +5,7 @@
 
 #include <linux/init.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <as-layout.h>
 #include <kern.h>
 #include <os.h>
diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c
index d58f1600b477..d22c1dc7e39e 100644
--- a/arch/unicore32/kernel/process.c
+++ b/arch/unicore32/kernel/process.c
@@ -15,6 +15,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/stddef.h>
diff --git a/arch/unicore32/kernel/ptrace.c b/arch/unicore32/kernel/ptrace.c
index 9f07c08da050..a102c2b4f358 100644
--- a/arch/unicore32/kernel/ptrace.c
+++ b/arch/unicore32/kernel/ptrace.c
@@ -15,6 +15,7 @@
 #include <linux/ptrace.h>
 #include <linux/signal.h>
 #include <linux/uaccess.h>
+#include <linux/sched/task_stack.h>
 
 /*
  * this routine will get a word off of the processes privileged stack.
diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c
index 506e1a2127c6..5f25b39f04d4 100644
--- a/arch/unicore32/kernel/traps.c
+++ b/arch/unicore32/kernel/traps.c
@@ -16,6 +16,7 @@
 #include <linux/signal.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
 #include <linux/spinlock.h>
 #include <linux/personality.h>
 #include <linux/kallsyms.h>
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index b83c61cfd154..370c42c7f046 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -9,6 +9,7 @@
 
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/errno.h>
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 572cee3fccff..226ca70dc6bd 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -7,6 +7,7 @@
 #include <linux/mm.h>
 #include <linux/err.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/random.h>
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 7c0a711989d2..8d0879f1d42c 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -26,6 +26,7 @@
 #include <linux/init.h>
 #include <linux/jiffies.h>
 #include <linux/perf_event.h>
+#include <linux/sched/task_stack.h>
 
 #include <linux/uaccess.h>
 #include <asm/pgalloc.h>
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 95c0b4ae09b0..724153797209 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -9,6 +9,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/kernel.h>
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 7b9e7e68f316..09d4ac0d2661 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -11,6 +11,7 @@
 #include <linux/module.h>
 #include <linux/ptrace.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
 #include <linux/ftrace.h>
 #include <linux/kexec.h>
 #include <linux/bug.h>
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index c114b132d121..b188b16841e3 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -5,6 +5,7 @@
 #include <asm/fpu/signal.h>
 #include <asm/fpu/regset.h>
 #include <asm/fpu/xstate.h>
+#include <linux/sched/task_stack.h>
 
 /*
  * The xstateregs_active() routine is the same as the regset_fpregs_active() routine,
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index b01bc8517450..ca49bab3e467 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/kernel.h>
 #include <linux/capability.h>
 #include <linux/errno.h>
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 6b0678a541e2..3be74fbdeff2 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -15,6 +15,7 @@
 #include <linux/ftrace.h>
 #include <linux/uaccess.h>
 #include <linux/smp.h>
+#include <linux/sched/task_stack.h>
 #include <asm/io_apic.h>
 #include <asm/apic.h>
 
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index da8cb987b973..587d887f7f17 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -1,6 +1,7 @@
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/perf_event.h>
 #include <linux/bug.h>
 #include <linux/stddef.h>
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 441f00e7be8f..56b059486c3b 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -10,6 +10,7 @@
 #include <linux/sched/idle.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/init.h>
 #include <linux/export.h>
 #include <linux/pm.h>
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index d79ffa47aab8..4c818f8bc135 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -13,6 +13,7 @@
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/fs.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 124f5652ae3c..d6b784a5520d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -18,6 +18,7 @@
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/fs.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 9cc7d5a330ef..2364b23ea3e5 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -6,6 +6,7 @@
 
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/errno.h>
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 763af1d0de64..396c042e9d0e 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -10,6 +10,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/kernel.h>
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f3eb266d75ff..bd1f1ad35284 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -47,6 +47,7 @@
 #include <linux/sched.h>
 #include <linux/sched/topology.h>
 #include <linux/sched/hotplug.h>
+#include <linux/sched/task_stack.h>
 #include <linux/percpu.h>
 #include <linux/bootmem.h>
 #include <linux/err.h>
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 330cae0025d0..8e2b79b88e51 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -5,6 +5,7 @@
  */
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
 #include <linux/stacktrace.h>
 #include <linux/export.h>
 #include <linux/uaccess.h>
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index a23ce84a3f6c..f07f83b3611b 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -2,6 +2,7 @@
  * x86 single-step support code, common to 32-bit and 64-bit.
  */
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/ptrace.h>
 #include <asm/desc.h>
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1dc86ee60a03..948443e115c1 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -29,6 +29,7 @@
 #include <linux/errno.h>
 #include <linux/kexec.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/timer.h>
 #include <linux/init.h>
 #include <linux/bug.h>
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index 7c0abdc9a732..478d15dbaee4 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -1,5 +1,6 @@
 #include <linux/sched.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <asm/ptrace.h>
 #include <asm/bitops.h>
 #include <asm/stacktrace.h>
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 0442d98367ae..23ee89ce59a9 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -35,6 +35,7 @@
 #include <linux/interrupt.h>
 #include <linux/syscalls.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
 #include <linux/string.h>
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index e3254ca0eec4..428e31763cb9 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -4,6 +4,7 @@
  *  Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
  */
 #include <linux/sched.h>		/* test_thread_flag(), ...	*/
+#include <linux/sched/task_stack.h>	/* task_stack_*(), ...		*/
 #include <linux/kdebug.h>		/* oops_begin/end, ...		*/
 #include <linux/extable.h>		/* search_exception_tables	*/
 #include <linux/bootmem.h>		/* max_low_pfn			*/
diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c
index af33f9d4c624..58f96d1230d4 100644
--- a/arch/xtensa/kernel/process.c
+++ b/arch/xtensa/kernel/process.c
@@ -19,6 +19,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
diff --git a/arch/xtensa/kernel/ptrace.c b/arch/xtensa/kernel/ptrace.c
index 32519b71d914..e0f583fed06a 100644
--- a/arch/xtensa/kernel/ptrace.c
+++ b/arch/xtensa/kernel/ptrace.c
@@ -20,6 +20,7 @@
 #include <linux/perf_event.h>
 #include <linux/ptrace.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/security.h>
 #include <linux/signal.h>
 #include <linux/smp.h>
diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c
index c41294745731..70a131945443 100644
--- a/arch/xtensa/kernel/signal.c
+++ b/arch/xtensa/kernel/signal.c
@@ -20,6 +20,7 @@
 #include <linux/ptrace.h>
 #include <linux/personality.h>
 #include <linux/tracehook.h>
+#include <linux/sched/task_stack.h>
 
 #include <asm/ucontext.h>
 #include <linux/uaccess.h>
diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c
index f2b3e1725349..fd894eaa63f3 100644
--- a/arch/xtensa/kernel/smp.c
+++ b/arch/xtensa/kernel/smp.c
@@ -22,6 +22,7 @@
 #include <linux/kdebug.h>
 #include <linux/module.h>
 #include <linux/sched/hotplug.h>
+#include <linux/sched/task_stack.h>
 #include <linux/reboot.h>
 #include <linux/seq_file.h>
 #include <linux/smp.h>
diff --git a/block/blk-map.c b/block/blk-map.c
index 2f18c2a0be1b..3b5cb863318f 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -2,6 +2,7 @@
  * Functions related to mapping data to requests
  */
 #include <linux/kernel.h>
+#include <linux/sched/task_stack.h>
 #include <linux/module.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index f7f6b9144b07..da6b59ba5940 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -34,6 +34,8 @@
 #include <linux/kernel_stat.h>
 #include <linux/clockchips.h>
 #include <linux/cpu.h>
+#include <linux/sched/task_stack.h>
+
 #include <asm/hyperv.h>
 #include <asm/hypervisor.h>
 #include <asm/mshyperv.h>
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index aef00511ca86..74f1b7dc03f7 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -28,6 +28,7 @@
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
+#include <linux/sched/task_stack.h>
 #include <linux/delay.h>
 #include <linux/timer.h>
 #include <linux/seq_file.h>
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 9b2fe2d3e3a9..1ec84ca81146 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -3,6 +3,7 @@
 
 #include <linux/llist.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/workqueue.h>
 
 /*
diff --git a/drivers/misc/lkdtm_usercopy.c b/drivers/misc/lkdtm_usercopy.c
index 1dd611423d8b..df6ac985fbb5 100644
--- a/drivers/misc/lkdtm_usercopy.c
+++ b/drivers/misc/lkdtm_usercopy.c
@@ -5,6 +5,7 @@
 #include "lkdtm.h"
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mman.h>
 #include <linux/uaccess.h>
 #include <asm/cacheflush.h>
diff --git a/drivers/mtd/nand/gpmi-nand/gpmi-nand.c b/drivers/mtd/nand/gpmi-nand/gpmi-nand.c
index 6c062b8251d2..d52139635b67 100644
--- a/drivers/mtd/nand/gpmi-nand/gpmi-nand.c
+++ b/drivers/mtd/nand/gpmi-nand/gpmi-nand.c
@@ -20,6 +20,7 @@
  */
 #include <linux/clk.h>
 #include <linux/slab.h>
+#include <linux/sched/task_stack.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/mtd/partitions.h>
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 2a59139f520b..9be82c4e14a4 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -25,6 +25,7 @@
 #include <linux/init.h>
 #include <linux/coredump.h>
 #include <linux/slab.h>
+#include <linux/sched/task_stack.h>
 
 #include <linux/uaccess.h>
 #include <asm/cacheflush.h>
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 2c95257fa4da..92c00a13b28b 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -36,6 +36,7 @@
 #include <linux/coredump.h>
 #include <linux/sched.h>
 #include <linux/sched/coredump.h>
+#include <linux/sched/task_stack.h>
 #include <linux/cred.h>
 #include <linux/dax.h>
 #include <linux/uaccess.h>
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 222873bb689c..6103a8149ccd 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -15,6 +15,8 @@
 #include <linux/fs.h>
 #include <linux/stat.h>
 #include <linux/sched.h>
+#include <linux/sched/coredump.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/errno.h>
@@ -35,7 +37,6 @@
 #include <linux/elf-fdpic.h>
 #include <linux/elfcore.h>
 #include <linux/coredump.h>
-#include <linux/sched/coredump.h>
 #include <linux/dax.h>
 
 #include <linux/uaccess.h>
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 9b2917a30294..2edcefc0a294 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -19,6 +19,7 @@
 
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/errno.h>
diff --git a/fs/coredump.c b/fs/coredump.c
index c74ab43b8383..592683711c64 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -18,6 +18,7 @@
 #include <linux/coredump.h>
 #include <linux/sched/coredump.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/task_stack.h>
 #include <linux/utsname.h>
 #include <linux/pid_namespace.h>
 #include <linux/module.h>
diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h
index 698d51a0eea3..c8240a12c42d 100644
--- a/include/linux/elfcore.h
+++ b/include/linux/elfcore.h
@@ -3,6 +3,8 @@
 
 #include <linux/user.h>
 #include <linux/bug.h>
+#include <linux/sched/task_stack.h>
+
 #include <asm/elf.h>
 #include <uapi/linux/elfcore.h>
 
diff --git a/include/linux/perf_regs.h b/include/linux/perf_regs.h
index a5f98d53d732..9b7dd59fe28d 100644
--- a/include/linux/perf_regs.h
+++ b/include/linux/perf_regs.h
@@ -1,6 +1,8 @@
 #ifndef _LINUX_PERF_REGS_H
 #define _LINUX_PERF_REGS_H
 
+#include <linux/sched/task_stack.h>
+
 struct perf_regs {
 	__u64		abi;
 	struct pt_regs	*regs;
diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
new file mode 100644
index 000000000000..933cd49824c2
--- /dev/null
+++ b/include/linux/sched/task_stack.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SCHED_TASK_STACK_H
+#define _LINUX_SCHED_TASK_STACK_H
+
+#include <linux/sched.h>
+
+#endif /* _LINUX_SCHED_TASK_STACK_H */
diff --git a/init/main.c b/init/main.c
index 47956b22add1..c2d337a136d1 100644
--- a/init/main.c
+++ b/init/main.c
@@ -77,6 +77,7 @@
 #include <linux/elevator.h>
 #include <linux/sched_clock.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/context_tracking.h>
 #include <linux/random.h>
 #include <linux/list.h>
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index e9fdb5203de5..c04917cad1bf 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -11,6 +11,8 @@
 
 #include <linux/perf_event.h>
 #include <linux/slab.h>
+#include <linux/sched/task_stack.h>
+
 #include "internal.h"
 
 struct callchain_cpus_entries {
diff --git a/kernel/exit.c b/kernel/exit.c
index e53408d156df..771c33fc9952 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -10,6 +10,7 @@
 #include <linux/sched/mm.h>
 #include <linux/sched/stat.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/capability.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 0af64d5d8b73..c87b6f03c710 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -19,6 +19,7 @@
 #include <linux/sched/numa_balancing.h>
 #include <linux/sched/stat.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/rtmutex.h>
 #include <linux/init.h>
 #include <linux/unistd.h>
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 3caaaa04b92e..2984fb0f0257 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -47,6 +47,7 @@
 #include <linux/uio.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
 
 #include <linux/uaccess.h>
 #include <asm/sections.h>
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6b7155ae5c33..4e2fec3f12a0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -15,6 +15,7 @@
 #include <linux/sched/debug.h>
 #include <linux/sched/hotplug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 
 #include <linux/u64_stats_sync.h>
 #include <linux/kernel_stat.h>
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index e15185c28de5..65f61077ad50 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -18,6 +18,7 @@
 #include <linux/compat.h>
 #include <linux/coredump.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/seccomp.h>
 #include <linux/slab.h>
 #include <linux/syscalls.h>
diff --git a/kernel/signal.c b/kernel/signal.c
index 25d099fd80f7..e6d470a5f75a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -17,6 +17,7 @@
 #include <linux/sched/user.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/fs.h>
 #include <linux/tty.h>
 #include <linux/binfmts.h>
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 2a1abbaca10e..1d68b5b7ad41 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -2,6 +2,7 @@
  * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
  *
  */
+#include <linux/sched/task_stack.h>
 #include <linux/stacktrace.h>
 #include <linux/kallsyms.h>
 #include <linux/seq_file.h>
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index 8c28cbd7e104..17afb0430161 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -13,6 +13,7 @@
 #include <linux/debugobjects.h>
 #include <linux/interrupt.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/seq_file.h>
 #include <linux/debugfs.h>
 #include <linux/slab.h>
diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index 942adf13418d..b157b46cc9a6 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -17,6 +17,7 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  */
 
+#include <linux/sched/task_stack.h>
 #include <linux/scatterlist.h>
 #include <linux/dma-mapping.h>
 #include <linux/sched/task.h>
diff --git a/lib/syscall.c b/lib/syscall.c
index 63239e097b13..17d5ff5fa6a3 100644
--- a/lib/syscall.c
+++ b/lib/syscall.c
@@ -1,5 +1,6 @@
 #include <linux/ptrace.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/export.h>
 #include <asm/syscall.h>
 
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index d99312ed7c22..98b27195e38b 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -29,6 +29,7 @@
 #include <linux/module.h>
 #include <linux/printk.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/slab.h>
 #include <linux/stacktrace.h>
 #include <linux/string.h>
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index a3970573359e..26c874e90b12 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -75,6 +75,7 @@
 #include <linux/list.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/jiffies.h>
 #include <linux/delay.h>
 #include <linux/export.h>
diff --git a/mm/util.c b/mm/util.c
index 14f4e13323e2..656dc5e37a87 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -6,6 +6,7 @@
 #include <linux/err.h>
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/task_stack.h>
 #include <linux/security.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
-- 
cgit v1.2.3


From dfc3401a33086a3fd465468e171ea0e82430569b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 23:15:21 +0100
Subject: sched/headers: Prepare to move the 'root_task_group' declaration to
 <linux/sched/autogroup.h>

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/init_task.h | 1 +
 kernel/sched/sched.h      | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 3a85d61f7614..452c9799318c 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -12,6 +12,7 @@
 #include <linux/securebits.h>
 #include <linux/seqlock.h>
 #include <linux/rbtree.h>
+#include <linux/sched/autogroup.h>
 #include <net/net_namespace.h>
 #include <linux/sched/rt.h>
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4e2fec3f12a0..b1f1c8443837 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,5 +1,6 @@
 
 #include <linux/sched.h>
+#include <linux/sched/autogroup.h>
 #include <linux/sched/sysctl.h>
 #include <linux/sched/topology.h>
 #include <linux/sched/rt.h>
-- 
cgit v1.2.3


From e6d930b4e0115eb0732eec0efb11c3b09a8000fc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 23:43:50 +0100
Subject: signals: Prepare to split out <linux/signal_types.h> from
 <linux/signal.h>

Introduce dummy header and add dependencies to places that will depend on it.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/mips/include/asm/abi.h  | 2 ++
 include/linux/signal.h       | 3 +--
 include/linux/signal_types.h | 7 +++++++
 3 files changed, 10 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/signal_types.h

(limited to 'include/linux')

diff --git a/arch/mips/include/asm/abi.h b/arch/mips/include/asm/abi.h
index 940760844e2f..dba7f4b6bebf 100644
--- a/arch/mips/include/asm/abi.h
+++ b/arch/mips/include/asm/abi.h
@@ -9,6 +9,8 @@
 #ifndef _ASM_ABI_H
 #define _ASM_ABI_H
 
+#include <linux/signal_types.h>
+
 #include <asm/signal.h>
 #include <asm/siginfo.h>
 #include <asm/vdso.h>
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 5308304993be..d1c2b05a7a55 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -1,9 +1,8 @@
 #ifndef _LINUX_SIGNAL_H
 #define _LINUX_SIGNAL_H
 
-#include <linux/list.h>
 #include <linux/bug.h>
-#include <uapi/linux/signal.h>
+#include <linux/signal_types.h>
 
 struct task_struct;
 
diff --git a/include/linux/signal_types.h b/include/linux/signal_types.h
new file mode 100644
index 000000000000..28799c88f490
--- /dev/null
+++ b/include/linux/signal_types.h
@@ -0,0 +1,7 @@
+#ifndef _LINUX_SIGNAL_TYPES_H
+#define _LINUX_SIGNAL_TYPES_H
+
+#include <linux/list.h>
+#include <uapi/linux/signal.h>
+
+#endif /* _LINUX_SIGNAL_TYPES_H */
-- 
cgit v1.2.3


From f361bf4a66c9bfabace46f6ff5d97005c9b524fe Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 23:47:37 +0100
Subject: sched/headers: Prepare for the reduction of <linux/sched.h>'s signal
 API dependency

Instead of including the full <linux/signal.h>, we are going to include the
types-only <linux/signal_types.h> header in <linux/sched.h>, to further
decouple the scheduler header from the signal headers.

This means that various files which relied on the full <linux/signal.h> need
to be updated to gain an explicit dependency on it.

Update the code that relies on sched.h's inclusion of the <linux/signal.h> header.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arm64/kernel/sys_compat.c          | 1 +
 drivers/gpu/drm/i915/i915_gem_request.c | 1 +
 drivers/isdn/mISDN/stack.c              | 2 ++
 fs/btrfs/extent-tree.c                  | 1 +
 fs/btrfs/free-space-cache.c             | 1 +
 fs/buffer.c                             | 1 +
 fs/ceph/addr.c                          | 1 +
 fs/dax.c                                | 1 +
 fs/ioctl.c                              | 2 ++
 fs/iomap.c                              | 2 ++
 fs/ocfs2/super.c                        | 1 +
 include/linux/sched.h                   | 1 +
 include/linux/sched/signal.h            | 1 +
 kernel/pid_namespace.c                  | 1 +
 mm/page-writeback.c                     | 1 +
 15 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c
index abaf582fc7a8..8b8bbd3eaa52 100644
--- a/arch/arm64/kernel/sys_compat.c
+++ b/arch/arm64/kernel/sys_compat.c
@@ -21,6 +21,7 @@
 #include <linux/compat.h>
 #include <linux/personality.h>
 #include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/syscalls.h>
 #include <linux/uaccess.h>
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index df3fef393dbe..e7c3c0318ff6 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -26,6 +26,7 @@
 #include <linux/dma-fence-array.h>
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
+#include <linux/sched/signal.h>
 
 #include "i915_drv.h"
 
diff --git a/drivers/isdn/mISDN/stack.c b/drivers/isdn/mISDN/stack.c
index b324474c0c12..696f22fd5ab4 100644
--- a/drivers/isdn/mISDN/stack.c
+++ b/drivers/isdn/mISDN/stack.c
@@ -19,6 +19,8 @@
 #include <linux/mISDNif.h>
 #include <linux/kthread.h>
 #include <linux/sched.h>
+#include <linux/signal.h>
+
 #include "core.h"
 
 static u_int	*debug;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c35b96633554..dad395b0b9fc 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -16,6 +16,7 @@
  * Boston, MA 021110-1307, USA.
  */
 #include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 1a131f7d6c1b..493a654b6012 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,7 @@
 
 #include <linux/pagemap.h>
 #include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/math64.h>
 #include <linux/ratelimit.h>
diff --git a/fs/buffer.c b/fs/buffer.c
index 28484b3ebc98..9196f2a270da 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -19,6 +19,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/sched/signal.h>
 #include <linux/syscalls.h>
 #include <linux/fs.h>
 #include <linux/iomap.h>
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index f297a9e18642..1a3e1b40799a 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -8,6 +8,7 @@
 #include <linux/slab.h>
 #include <linux/pagevec.h>
 #include <linux/task_io_accounting_ops.h>
+#include <linux/signal.h>
 
 #include "super.h"
 #include "mds_client.h"
diff --git a/fs/dax.c b/fs/dax.c
index 7436c98b92c8..de622d4282a6 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -27,6 +27,7 @@
 #include <linux/pagevec.h>
 #include <linux/pmem.h>
 #include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/uio.h>
 #include <linux/vmstat.h>
 #include <linux/pfn_t.h>
diff --git a/fs/ioctl.c b/fs/ioctl.c
index cb9b02940805..569db68d02b3 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -15,6 +15,8 @@
 #include <linux/writeback.h>
 #include <linux/buffer_head.h>
 #include <linux/falloc.h>
+#include <linux/sched/signal.h>
+
 #include "internal.h"
 
 #include <asm/ioctls.h>
diff --git a/fs/iomap.c b/fs/iomap.c
index 0f85f2410605..3ca1a8e44135 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -26,6 +26,8 @@
 #include <linux/buffer_head.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/dax.h>
+#include <linux/sched/signal.h>
+
 #include "internal.h"
 
 /*
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index a24e42f95341..ca1646fbcaef 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -42,6 +42,7 @@
 #include <linux/seq_file.h>
 #include <linux/quotaops.h>
 #include <linux/cleancache.h>
+#include <linux/signal.h>
 
 #define CREATE_TRACE_POINTS
 #include "ocfs2_trace.h"
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8ae7b3d85658..ea05116bc3c2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -36,6 +36,7 @@ struct sched_param {
 #include <linux/signal.h>
 #include <linux/compiler.h>
 #include <linux/completion.h>
+#include <linux/signal_types.h>
 #include <linux/pid.h>
 #include <linux/percpu.h>
 #include <linux/topology.h>
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 0f4e9f4a43fd..7e10a7824523 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -1,6 +1,7 @@
 #ifndef _LINUX_SCHED_SIGNAL_H
 #define _LINUX_SCHED_SIGNAL_H
 
+#include <linux/signal.h>
 #include <linux/cred.h>
 #include <linux/sched.h>
 #include <linux/sched/jobctl.h>
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 7c8367854dc4..de461aa0bf9a 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -20,6 +20,7 @@
 #include <linux/reboot.h>
 #include <linux/export.h>
 #include <linux/sched/task.h>
+#include <linux/sched/signal.h>
 
 struct pid_cache {
 	int nr_ids;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 26a60818a8fc..d8ac2a7fb9e7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,6 +36,7 @@
 #include <linux/pagevec.h>
 #include <linux/timer.h>
 #include <linux/sched/rt.h>
+#include <linux/sched/signal.h>
 #include <linux/mm_inline.h>
 #include <trace/events/writeback.h>
 
-- 
cgit v1.2.3


From 2e58f173ab89b29a1373088b8727133dbf7322b0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 4 Feb 2017 00:12:19 +0100
Subject: mm/headers, sched/headers: Prepare to split <linux/mm_types_task.h>
 out of <linux/mm_types.h>

We are going to separate all the MM types that are embedded directly in 'struct task_struct'
into the new <linux/mm_types_task.h> header.

Create a new <linux/mm_types_task.h> that only contains some includes from mm_types.h itself.

This should be trivially correct and easy to bisect to.

(This patch does not materially move the types yet.)

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/mm_types.h      |  6 +++---
 include/linux/mm_types_task.h | 10 ++++++++++
 2 files changed, 13 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/mm_types_task.h

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 137797cd7b50..6daaf0a51968 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1,9 +1,9 @@
 #ifndef _LINUX_MM_TYPES_H
 #define _LINUX_MM_TYPES_H
 
+#include <linux/mm_types_task.h>
+
 #include <linux/auxvec.h>
-#include <linux/types.h>
-#include <linux/threads.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/rbtree.h>
@@ -13,7 +13,7 @@
 #include <linux/uprobes.h>
 #include <linux/page-flags-layout.h>
 #include <linux/workqueue.h>
-#include <asm/page.h>
+
 #include <asm/mmu.h>
 
 #ifndef AT_VECTOR_SIZE_ARCH
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
new file mode 100644
index 000000000000..2419d302f03c
--- /dev/null
+++ b/include/linux/mm_types_task.h
@@ -0,0 +1,10 @@
+#ifndef _LINUX_MM_TYPES_TASK_H
+#define _LINUX_MM_TYPES_TASK_H
+
+#include <linux/types.h>
+#include <linux/threads.h>
+#include <linux/atomic.h>
+
+#include <asm/page.h>
+
+#endif /* _LINUX_MM_TYPES_TASK_H */
-- 
cgit v1.2.3


From 589ee62844e042b0b7d19ef57fb4cff77f3ca294 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 4 Feb 2017 00:16:44 +0100
Subject: sched/headers: Prepare to remove the <linux/mm_types.h> dependency
 from <linux/sched.h>

Update code that relied on sched.h including various MM types for them.

This will allow us to remove the <linux/mm_types.h> include from <linux/sched.h>.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/alpha/include/asm/a.out-core.h              | 1 +
 arch/alpha/include/asm/mmu_context.h             | 2 ++
 arch/arc/mm/tlb.c                                | 2 ++
 arch/arm/include/asm/mmu_context.h               | 2 ++
 arch/arm/include/asm/tlbflush.h                  | 7 ++++---
 arch/arm/kernel/suspend.c                        | 1 +
 arch/arm/kernel/swp_emulate.c                    | 1 +
 arch/arm/mm/idmap.c                              | 1 +
 arch/arm64/include/asm/mmu_context.h             | 1 +
 arch/arm64/kernel/traps.c                        | 1 +
 arch/avr32/include/asm/mmu_context.h             | 2 ++
 arch/blackfin/include/asm/mmu_context.h          | 2 ++
 arch/blackfin/kernel/flat.c                      | 1 +
 arch/blackfin/kernel/process.c                   | 1 +
 arch/blackfin/mm/sram-alloc.c                    | 2 ++
 arch/cris/arch-v10/mm/tlb.c                      | 2 ++
 arch/cris/arch-v32/mm/tlb.c                      | 1 +
 arch/cris/include/asm/pgtable.h                  | 2 +-
 arch/cris/mm/tlb.c                               | 2 ++
 arch/h8300/kernel/traps.c                        | 1 +
 arch/hexagon/include/asm/mmu_context.h           | 2 ++
 arch/hexagon/kernel/smp.c                        | 1 +
 arch/ia64/include/asm/mmu_context.h              | 1 +
 arch/ia64/include/asm/pgtable.h                  | 2 +-
 arch/ia64/sn/kernel/sn2/sn2_smp.c                | 1 +
 arch/m32r/include/asm/mmu_context.h              | 2 ++
 arch/m68k/include/asm/a.out-core.h               | 1 +
 arch/m68k/include/asm/mmu_context.h              | 1 +
 arch/metag/include/asm/mmu_context.h             | 1 +
 arch/microblaze/include/asm/mmu_context_mm.h     | 2 ++
 arch/microblaze/mm/pgtable.c                     | 1 +
 arch/mips/include/asm/elf.h                      | 2 ++
 arch/mips/include/asm/mmu_context.h              | 2 ++
 arch/mips/kernel/smp.c                           | 2 +-
 arch/mips/math-emu/dsemul.c                      | 1 +
 arch/mips/mm/ioremap.c                           | 1 +
 arch/mn10300/include/asm/mmu_context.h           | 2 ++
 arch/mn10300/kernel/smp.c                        | 2 +-
 arch/mn10300/mm/tlb-smp.c                        | 2 +-
 arch/nios2/include/asm/mmu_context.h             | 2 ++
 arch/nios2/kernel/process.c                      | 1 +
 arch/powerpc/kernel/io-workarounds.c             | 2 +-
 arch/powerpc/kvm/e500_mmu_host.c                 | 2 +-
 arch/powerpc/lib/feature-fixups.c                | 1 +
 arch/powerpc/mm/hash_utils_64.c                  | 2 +-
 arch/powerpc/mm/pgtable-book3s64.c               | 2 ++
 arch/powerpc/mm/pgtable-hash64.c                 | 2 ++
 arch/powerpc/mm/pgtable-radix.c                  | 2 +-
 arch/powerpc/mm/slb.c                            | 2 ++
 arch/s390/include/asm/elf.h                      | 2 +-
 arch/s390/include/asm/mmu_context.h              | 1 +
 arch/s390/kernel/processor.c                     | 1 +
 arch/s390/kvm/gaccess.c                          | 2 ++
 arch/s390/kvm/priv.c                             | 2 ++
 arch/score/include/asm/mmu_context.h             | 2 ++
 arch/score/kernel/traps.c                        | 1 +
 arch/sh/include/asm/mmu_context.h                | 2 ++
 arch/sparc/include/asm/mmu_context_64.h          | 2 ++
 arch/sparc/include/asm/pgtable_64.h              | 3 +++
 arch/sparc/kernel/asm-offsets.c                  | 1 +
 arch/sparc/kernel/traps_32.c                     | 1 +
 arch/sparc/mm/tsb.c                              | 2 ++
 arch/tile/include/asm/mmu_context.h              | 2 ++
 arch/um/include/asm/mmu_context.h                | 2 ++
 arch/um/kernel/exec.c                            | 2 +-
 arch/um/kernel/reboot.c                          | 1 +
 arch/um/kernel/skas/process.c                    | 3 ++-
 arch/x86/entry/vsyscall/vsyscall_64.c            | 1 +
 arch/x86/events/core.c                           | 2 +-
 arch/x86/include/asm/a.out-core.h                | 2 ++
 arch/x86/include/asm/mpx.h                       | 2 ++
 arch/x86/mm/mpx.c                                | 1 +
 arch/x86/um/syscalls_64.c                        | 1 +
 arch/x86/xen/mmu.c                               | 2 +-
 arch/xtensa/include/asm/mmu_context.h            | 1 +
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 2 ++
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c  | 2 ++
 drivers/infiniband/hw/cxgb3/iwch_provider.c      | 2 +-
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h           | 2 +-
 drivers/media/v4l2-core/videobuf-dma-sg.c        | 2 +-
 fs/binfmt_misc.c                                 | 2 +-
 fs/kernfs/file.c                                 | 2 +-
 include/drm/drm_mm.h                             | 1 +
 include/linux/init_task.h                        | 1 +
 include/linux/sched/mm.h                         | 1 +
 kernel/sched/debug.c                             | 2 +-
 kernel/sched/fair.c                              | 2 +-
 kernel/signal.c                                  | 2 +-
 lib/is_single_threaded.c                         | 1 +
 89 files changed, 125 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/a.out-core.h b/arch/alpha/include/asm/a.out-core.h
index 9e33e92e524c..1610d078b064 100644
--- a/arch/alpha/include/asm/a.out-core.h
+++ b/arch/alpha/include/asm/a.out-core.h
@@ -15,6 +15,7 @@
 #ifdef __KERNEL__
 
 #include <linux/user.h>
+#include <linux/mm_types.h>
 
 /*
  * Fill in the user structure for an ECOFF core dump.
diff --git a/arch/alpha/include/asm/mmu_context.h b/arch/alpha/include/asm/mmu_context.h
index 4c51c05333c6..384bd47b5187 100644
--- a/arch/alpha/include/asm/mmu_context.h
+++ b/arch/alpha/include/asm/mmu_context.h
@@ -7,6 +7,8 @@
  * Copyright (C) 1996, Linus Torvalds
  */
 
+#include <linux/mm_types.h>
+
 #include <asm/machvec.h>
 #include <asm/compiler.h>
 #include <asm-generic/mm_hooks.h>
diff --git a/arch/arc/mm/tlb.c b/arch/arc/mm/tlb.c
index bdb295e09160..d0126fdfe2d8 100644
--- a/arch/arc/mm/tlb.c
+++ b/arch/arc/mm/tlb.c
@@ -53,6 +53,8 @@
 
 #include <linux/module.h>
 #include <linux/bug.h>
+#include <linux/mm_types.h>
+
 #include <asm/arcregs.h>
 #include <asm/setup.h>
 #include <asm/mmu_context.h>
diff --git a/arch/arm/include/asm/mmu_context.h b/arch/arm/include/asm/mmu_context.h
index 3cc14dd8587c..7f303295ef19 100644
--- a/arch/arm/include/asm/mmu_context.h
+++ b/arch/arm/include/asm/mmu_context.h
@@ -15,7 +15,9 @@
 
 #include <linux/compiler.h>
 #include <linux/sched.h>
+#include <linux/mm_types.h>
 #include <linux/preempt.h>
+
 #include <asm/cacheflush.h>
 #include <asm/cachetype.h>
 #include <asm/proc-fns.h>
diff --git a/arch/arm/include/asm/tlbflush.h b/arch/arm/include/asm/tlbflush.h
index def9e570199f..1897b5196fb5 100644
--- a/arch/arm/include/asm/tlbflush.h
+++ b/arch/arm/include/asm/tlbflush.h
@@ -10,6 +10,10 @@
 #ifndef _ASMARM_TLBFLUSH_H
 #define _ASMARM_TLBFLUSH_H
 
+#ifndef __ASSEMBLY__
+# include <linux/mm_types.h>
+#endif
+
 #ifdef CONFIG_MMU
 
 #include <asm/glue.h>
@@ -644,9 +648,6 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
 #elif defined(CONFIG_SMP)	/* !CONFIG_MMU */
 
 #ifndef __ASSEMBLY__
-
-#include <linux/mm_types.h>
-
 static inline void local_flush_tlb_all(void)									{ }
 static inline void local_flush_tlb_mm(struct mm_struct *mm)							{ }
 static inline void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)			{ }
diff --git a/arch/arm/kernel/suspend.c b/arch/arm/kernel/suspend.c
index 9a2f882a0a2d..ef794c799cb6 100644
--- a/arch/arm/kernel/suspend.c
+++ b/arch/arm/kernel/suspend.c
@@ -1,5 +1,6 @@
 #include <linux/init.h>
 #include <linux/slab.h>
+#include <linux/mm_types.h>
 
 #include <asm/cacheflush.h>
 #include <asm/idmap.h>
diff --git a/arch/arm/kernel/swp_emulate.c b/arch/arm/kernel/swp_emulate.c
index 853221f81104..3bda08bee674 100644
--- a/arch/arm/kernel/swp_emulate.c
+++ b/arch/arm/kernel/swp_emulate.c
@@ -23,6 +23,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/syscalls.h>
 #include <linux/perf_event.h>
 
diff --git a/arch/arm/mm/idmap.c b/arch/arm/mm/idmap.c
index c1a48f88764e..3e511bec69b8 100644
--- a/arch/arm/mm/idmap.c
+++ b/arch/arm/mm/idmap.c
@@ -1,6 +1,7 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
+#include <linux/mm_types.h>
 
 #include <asm/cputype.h>
 #include <asm/idmap.h>
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index 3c9f7d18a7e6..3257895a9b5e 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -26,6 +26,7 @@
 #include <linux/compiler.h>
 #include <linux/sched.h>
 #include <linux/sched/hotplug.h>
+#include <linux/mm_types.h>
 
 #include <asm/cacheflush.h>
 #include <asm/cpufeature.h>
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index bdbd0c3febf4..e52be6aa44ee 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -33,6 +33,7 @@
 #include <linux/sched/debug.h>
 #include <linux/sched/task_stack.h>
 #include <linux/syscalls.h>
+#include <linux/mm_types.h>
 
 #include <asm/atomic.h>
 #include <asm/bug.h>
diff --git a/arch/avr32/include/asm/mmu_context.h b/arch/avr32/include/asm/mmu_context.h
index 27ff23407100..cd87abba8db7 100644
--- a/arch/avr32/include/asm/mmu_context.h
+++ b/arch/avr32/include/asm/mmu_context.h
@@ -12,6 +12,8 @@
 #ifndef __ASM_AVR32_MMU_CONTEXT_H
 #define __ASM_AVR32_MMU_CONTEXT_H
 
+#include <linux/mm_types.h>
+
 #include <asm/tlbflush.h>
 #include <asm/sysreg.h>
 #include <asm-generic/mm_hooks.h>
diff --git a/arch/blackfin/include/asm/mmu_context.h b/arch/blackfin/include/asm/mmu_context.h
index 15b16d3e8de8..0ce6de873b27 100644
--- a/arch/blackfin/include/asm/mmu_context.h
+++ b/arch/blackfin/include/asm/mmu_context.h
@@ -9,6 +9,8 @@
 
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/mm_types.h>
+
 #include <asm/setup.h>
 #include <asm/page.h>
 #include <asm/pgalloc.h>
diff --git a/arch/blackfin/kernel/flat.c b/arch/blackfin/kernel/flat.c
index a88daddbf074..b5b658449616 100644
--- a/arch/blackfin/kernel/flat.c
+++ b/arch/blackfin/kernel/flat.c
@@ -6,6 +6,7 @@
 
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/mm_types.h>
 #include <linux/flat.h>
 
 #define FLAT_BFIN_RELOC_TYPE_16_BIT 0
diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c
index b691ef875a40..89d5162d4ca6 100644
--- a/arch/blackfin/kernel/process.c
+++ b/arch/blackfin/kernel/process.c
@@ -15,6 +15,7 @@
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
+#include <linux/mm_types.h>
 #include <linux/tick.h>
 #include <linux/fs.h>
 #include <linux/err.h>
diff --git a/arch/blackfin/mm/sram-alloc.c b/arch/blackfin/mm/sram-alloc.c
index 1f3b3ef3e103..d2a96c2c02a3 100644
--- a/arch/blackfin/mm/sram-alloc.c
+++ b/arch/blackfin/mm/sram-alloc.c
@@ -19,6 +19,8 @@
 #include <linux/spinlock.h>
 #include <linux/rtc.h>
 #include <linux/slab.h>
+#include <linux/mm_types.h>
+
 #include <asm/blackfin.h>
 #include <asm/mem_map.h>
 #include "blackfin_sram.h"
diff --git a/arch/cris/arch-v10/mm/tlb.c b/arch/cris/arch-v10/mm/tlb.c
index 21d78c599bab..3225d38bdaea 100644
--- a/arch/cris/arch-v10/mm/tlb.c
+++ b/arch/cris/arch-v10/mm/tlb.c
@@ -10,6 +10,8 @@
  *
  */
 
+#include <linux/mm_types.h>
+
 #include <asm/tlb.h>
 #include <asm/mmu_context.h>
 #include <arch/svinto.h>
diff --git a/arch/cris/arch-v32/mm/tlb.c b/arch/cris/arch-v32/mm/tlb.c
index c030d020660a..bc3de5b5e27c 100644
--- a/arch/cris/arch-v32/mm/tlb.c
+++ b/arch/cris/arch-v32/mm/tlb.c
@@ -6,6 +6,7 @@
  * Authors:   Bjorn Wesen <bjornw@axis.com>
  *            Tobias Anderberg <tobiasa@axis.com>, CRISv32 port.
  */
+#include <linux/mm_types.h>
 
 #include <asm/tlb.h>
 #include <asm/mmu_context.h>
diff --git a/arch/cris/include/asm/pgtable.h b/arch/cris/include/asm/pgtable.h
index ceefc314d64d..2a3210ba4c72 100644
--- a/arch/cris/include/asm/pgtable.h
+++ b/arch/cris/include/asm/pgtable.h
@@ -9,7 +9,7 @@
 #include <asm-generic/pgtable-nopmd.h>
 
 #ifndef __ASSEMBLY__
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <asm/mmu.h>
 #endif
 #include <arch/pgtable.h>
diff --git a/arch/cris/mm/tlb.c b/arch/cris/mm/tlb.c
index b7f8de576777..8413741cfa0f 100644
--- a/arch/cris/mm/tlb.c
+++ b/arch/cris/mm/tlb.c
@@ -9,6 +9,8 @@
 
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/mm_types.h>
+
 #include <asm/tlb.h>
 
 #define D(x)
diff --git a/arch/h8300/kernel/traps.c b/arch/h8300/kernel/traps.c
index ee7e3ce40d78..e47a9e0dc278 100644
--- a/arch/h8300/kernel/traps.c
+++ b/arch/h8300/kernel/traps.c
@@ -17,6 +17,7 @@
 #include <linux/types.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/mm_types.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/init.h>
diff --git a/arch/hexagon/include/asm/mmu_context.h b/arch/hexagon/include/asm/mmu_context.h
index d423d2e73c30..d8a071afdd1d 100644
--- a/arch/hexagon/include/asm/mmu_context.h
+++ b/arch/hexagon/include/asm/mmu_context.h
@@ -21,6 +21,8 @@
 #ifndef _ASM_MMU_CONTEXT_H
 #define _ASM_MMU_CONTEXT_H
 
+#include <linux/mm_types.h>
+
 #include <asm/setup.h>
 #include <asm/page.h>
 #include <asm/pgalloc.h>
diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c
index c02a6455839e..1f63e91a353b 100644
--- a/arch/hexagon/kernel/smp.c
+++ b/arch/hexagon/kernel/smp.c
@@ -29,6 +29,7 @@
 #include <linux/smp.h>
 #include <linux/spinlock.h>
 #include <linux/cpu.h>
+#include <linux/mm_types.h>
 
 #include <asm/time.h>    /*  timer_interrupt  */
 #include <asm/hexagon_vm.h>
diff --git a/arch/ia64/include/asm/mmu_context.h b/arch/ia64/include/asm/mmu_context.h
index 7f2a456603cb..9b99368633b5 100644
--- a/arch/ia64/include/asm/mmu_context.h
+++ b/arch/ia64/include/asm/mmu_context.h
@@ -26,6 +26,7 @@
 #include <linux/compiler.h>
 #include <linux/percpu.h>
 #include <linux/sched.h>
+#include <linux/mm_types.h>
 #include <linux/spinlock.h>
 
 #include <asm/processor.h>
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index 9f3ed9ee8f13..384794e665fc 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -147,7 +147,7 @@
 
 # ifndef __ASSEMBLY__
 
-#include <linux/sched.h>	/* for mm_struct */
+#include <linux/sched/mm.h>	/* for mm_struct */
 #include <linux/bitops.h>
 #include <asm/cacheflush.h>
 #include <asm/mmu_context.h>
diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c
index c98dc965fe82..b73b0ebf8214 100644
--- a/arch/ia64/sn/kernel/sn2/sn2_smp.c
+++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c
@@ -13,6 +13,7 @@
 #include <linux/spinlock.h>
 #include <linux/threads.h>
 #include <linux/sched.h>
+#include <linux/mm_types.h>
 #include <linux/smp.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
diff --git a/arch/m32r/include/asm/mmu_context.h b/arch/m32r/include/asm/mmu_context.h
index 9fc78fc44445..1230b7050d8e 100644
--- a/arch/m32r/include/asm/mmu_context.h
+++ b/arch/m32r/include/asm/mmu_context.h
@@ -12,6 +12,8 @@
 #ifndef __ASSEMBLY__
 
 #include <linux/atomic.h>
+#include <linux/mm_types.h>
+
 #include <asm/pgalloc.h>
 #include <asm/mmu.h>
 #include <asm/tlbflush.h>
diff --git a/arch/m68k/include/asm/a.out-core.h b/arch/m68k/include/asm/a.out-core.h
index f6bfc1d63ff6..ae91ea6bb303 100644
--- a/arch/m68k/include/asm/a.out-core.h
+++ b/arch/m68k/include/asm/a.out-core.h
@@ -16,6 +16,7 @@
 
 #include <linux/user.h>
 #include <linux/elfcore.h>
+#include <linux/mm_types.h>
 
 /*
  * fill in the user structure for an a.out core dump
diff --git a/arch/m68k/include/asm/mmu_context.h b/arch/m68k/include/asm/mmu_context.h
index dc3be991d634..4a6ae6dffa34 100644
--- a/arch/m68k/include/asm/mmu_context.h
+++ b/arch/m68k/include/asm/mmu_context.h
@@ -2,6 +2,7 @@
 #define __M68K_MMU_CONTEXT_H
 
 #include <asm-generic/mm_hooks.h>
+#include <linux/mm_types.h>
 
 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 {
diff --git a/arch/metag/include/asm/mmu_context.h b/arch/metag/include/asm/mmu_context.h
index ae2a71b5e0be..2e0312748197 100644
--- a/arch/metag/include/asm/mmu_context.h
+++ b/arch/metag/include/asm/mmu_context.h
@@ -9,6 +9,7 @@
 #include <asm/cacheflush.h>
 
 #include <linux/io.h>
+#include <linux/mm_types.h>
 
 static inline void enter_lazy_tlb(struct mm_struct *mm,
 				  struct task_struct *tsk)
diff --git a/arch/microblaze/include/asm/mmu_context_mm.h b/arch/microblaze/include/asm/mmu_context_mm.h
index d68647746448..99472d2ca340 100644
--- a/arch/microblaze/include/asm/mmu_context_mm.h
+++ b/arch/microblaze/include/asm/mmu_context_mm.h
@@ -12,6 +12,8 @@
 #define _ASM_MICROBLAZE_MMU_CONTEXT_H
 
 #include <linux/atomic.h>
+#include <linux/mm_types.h>
+
 #include <asm/bitops.h>
 #include <asm/mmu.h>
 #include <asm-generic/mm_hooks.h>
diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c
index cc732fe357ad..4c0599239915 100644
--- a/arch/microblaze/mm/pgtable.c
+++ b/arch/microblaze/mm/pgtable.c
@@ -31,6 +31,7 @@
 #include <linux/types.h>
 #include <linux/vmalloc.h>
 #include <linux/init.h>
+#include <linux/mm_types.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
diff --git a/arch/mips/include/asm/elf.h b/arch/mips/include/asm/elf.h
index 7a6c466e5f2a..0eb1a75be105 100644
--- a/arch/mips/include/asm/elf.h
+++ b/arch/mips/include/asm/elf.h
@@ -10,6 +10,8 @@
 
 #include <linux/auxvec.h>
 #include <linux/fs.h>
+#include <linux/mm_types.h>
+
 #include <uapi/linux/elf.h>
 
 #include <asm/current.h>
diff --git a/arch/mips/include/asm/mmu_context.h b/arch/mips/include/asm/mmu_context.h
index 2abf94f72c0a..da2004cef2d5 100644
--- a/arch/mips/include/asm/mmu_context.h
+++ b/arch/mips/include/asm/mmu_context.h
@@ -13,8 +13,10 @@
 
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/mm_types.h>
 #include <linux/smp.h>
 #include <linux/slab.h>
+
 #include <asm/cacheflush.h>
 #include <asm/dsemul.h>
 #include <asm/hazards.h>
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index 8c60a296294c..6e71130549ea 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -28,7 +28,7 @@
 #include <linux/export.h>
 #include <linux/time.h>
 #include <linux/timex.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/cpumask.h>
 #include <linux/cpu.h>
 #include <linux/err.h>
diff --git a/arch/mips/math-emu/dsemul.c b/arch/mips/math-emu/dsemul.c
index c4469ff4a996..6664908514b3 100644
--- a/arch/mips/math-emu/dsemul.c
+++ b/arch/mips/math-emu/dsemul.c
@@ -1,5 +1,6 @@
 #include <linux/err.h>
 #include <linux/slab.h>
+#include <linux/mm_types.h>
 
 #include <asm/branch.h>
 #include <asm/cacheflush.h>
diff --git a/arch/mips/mm/ioremap.c b/arch/mips/mm/ioremap.c
index 1f189627440f..1986e09fb457 100644
--- a/arch/mips/mm/ioremap.c
+++ b/arch/mips/mm/ioremap.c
@@ -12,6 +12,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
+#include <linux/mm_types.h>
 #include <asm/cacheflush.h>
 #include <asm/io.h>
 #include <asm/tlbflush.h>
diff --git a/arch/mn10300/include/asm/mmu_context.h b/arch/mn10300/include/asm/mmu_context.h
index 75dbe696f830..d2034f5e6eda 100644
--- a/arch/mn10300/include/asm/mmu_context.h
+++ b/arch/mn10300/include/asm/mmu_context.h
@@ -23,6 +23,8 @@
 #define _ASM_MMU_CONTEXT_H
 
 #include <linux/atomic.h>
+#include <linux/mm_types.h>
+
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm-generic/mm_hooks.h>
diff --git a/arch/mn10300/kernel/smp.c b/arch/mn10300/kernel/smp.c
index d924f7a79708..35d2c3fe6f76 100644
--- a/arch/mn10300/kernel/smp.c
+++ b/arch/mn10300/kernel/smp.c
@@ -21,7 +21,7 @@
 #include <linux/err.h>
 #include <linux/kernel.h>
 #include <linux/delay.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/sched/task.h>
 #include <linux/profile.h>
 #include <linux/smp.h>
diff --git a/arch/mn10300/mm/tlb-smp.c b/arch/mn10300/mm/tlb-smp.c
index 9a39ea9031d4..085f2bb691ac 100644
--- a/arch/mn10300/mm/tlb-smp.c
+++ b/arch/mn10300/mm/tlb-smp.c
@@ -20,7 +20,7 @@
 #include <linux/err.h>
 #include <linux/kernel.h>
 #include <linux/delay.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/profile.h>
 #include <linux/smp.h>
 #include <asm/tlbflush.h>
diff --git a/arch/nios2/include/asm/mmu_context.h b/arch/nios2/include/asm/mmu_context.h
index 294b4b1f81d4..78ab3dacf579 100644
--- a/arch/nios2/include/asm/mmu_context.h
+++ b/arch/nios2/include/asm/mmu_context.h
@@ -13,6 +13,8 @@
 #ifndef _ASM_NIOS2_MMU_CONTEXT_H
 #define _ASM_NIOS2_MMU_CONTEXT_H
 
+#include <linux/mm_types.h>
+
 #include <asm-generic/mm_hooks.h>
 
 extern void mmu_context_init(void);
diff --git a/arch/nios2/kernel/process.c b/arch/nios2/kernel/process.c
index 869a4d59de32..509e7855e8dc 100644
--- a/arch/nios2/kernel/process.c
+++ b/arch/nios2/kernel/process.c
@@ -17,6 +17,7 @@
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
+#include <linux/mm_types.h>
 #include <linux/tick.h>
 #include <linux/uaccess.h>
 
diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c
index 5f8613ceb97f..a582e0d42525 100644
--- a/arch/powerpc/kernel/io-workarounds.c
+++ b/arch/powerpc/kernel/io-workarounds.c
@@ -12,7 +12,7 @@
 #undef DEBUG
 
 #include <linux/kernel.h>
-#include <linux/sched.h>	/* for init_mm */
+#include <linux/sched/mm.h>	/* for init_mm */
 
 #include <asm/io.h>
 #include <asm/machdep.h>
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index b0333cc737dd..0fda4230f6c0 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -25,7 +25,7 @@
 #include <linux/highmem.h>
 #include <linux/log2.h>
 #include <linux/uaccess.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/rwsem.h>
 #include <linux/vmalloc.h>
 #include <linux/hugetlb.h>
diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c
index 043415f0bdb1..f3917705c686 100644
--- a/arch/powerpc/lib/feature-fixups.c
+++ b/arch/powerpc/lib/feature-fixups.c
@@ -17,6 +17,7 @@
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/init.h>
+#include <linux/sched/mm.h>
 #include <asm/cputable.h>
 #include <asm/code-patching.h>
 #include <asm/page.h>
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 12d679df50bd..c554768b1fa2 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -23,7 +23,7 @@
 
 #include <linux/spinlock.h>
 #include <linux/errno.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
 #include <linux/sysctl.h>
diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c
index b798ff674fab..5fcb3dd74c13 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -8,6 +8,8 @@
  */
 
 #include <linux/sched.h>
+#include <linux/mm_types.h>
+
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
 
diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c
index c23e286a6b8f..8b85a14b08ea 100644
--- a/arch/powerpc/mm/pgtable-hash64.c
+++ b/arch/powerpc/mm/pgtable-hash64.c
@@ -10,6 +10,8 @@
  */
 
 #include <linux/sched.h>
+#include <linux/mm_types.h>
+
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
 
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index feeda90cd06d..2a590a98e652 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -8,7 +8,7 @@
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
  */
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/memblock.h>
 #include <linux/of_fdt.h>
 
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 48fc28bab544..5e01b2ece1d0 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -22,6 +22,8 @@
 #include <asm/cacheflush.h>
 #include <asm/smp.h>
 #include <linux/compiler.h>
+#include <linux/mm_types.h>
+
 #include <asm/udbg.h>
 #include <asm/code-patching.h>
 
diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h
index 83aaefed2a7b..1d48880b3cc1 100644
--- a/arch/s390/include/asm/elf.h
+++ b/arch/s390/include/asm/elf.h
@@ -132,7 +132,7 @@ typedef s390_fp_regs compat_elf_fpregset_t;
 typedef s390_compat_regs compat_elf_gregset_t;
 
 #include <linux/compat.h>
-#include <linux/sched.h>	/* for task_struct */
+#include <linux/sched/mm.h>	/* for task_struct */
 #include <asm/mmu_context.h>
 
 #include <asm/vdso.h>
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 9b828c073176..6e31d87fb669 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -9,6 +9,7 @@
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
+#include <linux/mm_types.h>
 #include <asm/tlbflush.h>
 #include <asm/ctl_reg.h>
 
diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c
index bc2b60dcb178..bf7854523831 100644
--- a/arch/s390/kernel/processor.c
+++ b/arch/s390/kernel/processor.c
@@ -10,6 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/seq_file.h>
+#include <linux/mm_types.h>
 #include <linux/delay.h>
 #include <linux/cpu.h>
 #include <asm/diag.h>
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 4492c9363178..d55c829a5944 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -6,7 +6,9 @@
  */
 
 #include <linux/vmalloc.h>
+#include <linux/mm_types.h>
 #include <linux/err.h>
+
 #include <asm/pgtable.h>
 #include <asm/gmap.h>
 #include "kvm-s390.h"
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index fb4b494cde9b..64b6a309f2c4 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -15,6 +15,8 @@
 #include <linux/gfp.h>
 #include <linux/errno.h>
 #include <linux/compat.h>
+#include <linux/mm_types.h>
+
 #include <asm/asm-offsets.h>
 #include <asm/facility.h>
 #include <asm/current.h>
diff --git a/arch/score/include/asm/mmu_context.h b/arch/score/include/asm/mmu_context.h
index 2644577c96e8..073f95d350de 100644
--- a/arch/score/include/asm/mmu_context.h
+++ b/arch/score/include/asm/mmu_context.h
@@ -3,7 +3,9 @@
 
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/mm_types.h>
 #include <linux/slab.h>
+
 #include <asm-generic/mm_hooks.h>
 
 #include <asm/cacheflush.h>
diff --git a/arch/score/kernel/traps.c b/arch/score/kernel/traps.c
index 8a54d320fb41..fe68de6c746c 100644
--- a/arch/score/kernel/traps.c
+++ b/arch/score/kernel/traps.c
@@ -26,6 +26,7 @@
 #include <linux/extable.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/debug.h>
+#include <linux/mm_types.h>
 
 #include <asm/cacheflush.h>
 #include <asm/irq.h>
diff --git a/arch/sh/include/asm/mmu_context.h b/arch/sh/include/asm/mmu_context.h
index 35ffdd081d26..eb6ac3c10c44 100644
--- a/arch/sh/include/asm/mmu_context.h
+++ b/arch/sh/include/asm/mmu_context.h
@@ -11,6 +11,8 @@
 #include <cpu/mmu_context.h>
 #include <asm/tlbflush.h>
 #include <linux/uaccess.h>
+#include <linux/mm_types.h>
+
 #include <asm/io.h>
 #include <asm-generic/mm_hooks.h>
 
diff --git a/arch/sparc/include/asm/mmu_context_64.h b/arch/sparc/include/asm/mmu_context_64.h
index d0317993e947..22fede6eba11 100644
--- a/arch/sparc/include/asm/mmu_context_64.h
+++ b/arch/sparc/include/asm/mmu_context_64.h
@@ -6,6 +6,8 @@
 #ifndef __ASSEMBLY__
 
 #include <linux/spinlock.h>
+#include <linux/mm_types.h>
+
 #include <asm/spitfire.h>
 #include <asm-generic/mm_hooks.h>
 
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 7932a4a37817..56e49c8f770d 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -878,6 +878,9 @@ static inline unsigned long pud_pfn(pud_t pud)
 #define pte_offset_map			pte_index
 #define pte_unmap(pte)			do { } while (0)
 
+/* We cannot include <linux/mm_types.h> at this point yet: */
+extern struct mm_struct init_mm;
+
 /* Actual page table PTE updates.  */
 void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr,
 		   pte_t *ptep, pte_t orig, int fullmm,
diff --git a/arch/sparc/kernel/asm-offsets.c b/arch/sparc/kernel/asm-offsets.c
index f76389a32342..3f09e1c83f58 100644
--- a/arch/sparc/kernel/asm-offsets.c
+++ b/arch/sparc/kernel/asm-offsets.c
@@ -11,6 +11,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/mm_types.h>
 // #include <linux/mm.h>
 #include <linux/kbuild.h>
 
diff --git a/arch/sparc/kernel/traps_32.c b/arch/sparc/kernel/traps_32.c
index 26740a2f285d..2de72a49308a 100644
--- a/arch/sparc/kernel/traps_32.c
+++ b/arch/sparc/kernel/traps_32.c
@@ -11,6 +11,7 @@
 
 #include <linux/sched.h>  /* for jiffies */
 #include <linux/sched/debug.h>
+#include <linux/mm_types.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
 #include <linux/smp.h>
diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c
index 23479c3d39f0..0a04811f06b7 100644
--- a/arch/sparc/mm/tsb.c
+++ b/arch/sparc/mm/tsb.c
@@ -6,6 +6,8 @@
 #include <linux/kernel.h>
 #include <linux/preempt.h>
 #include <linux/slab.h>
+#include <linux/mm_types.h>
+
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/mmu_context.h>
diff --git a/arch/tile/include/asm/mmu_context.h b/arch/tile/include/asm/mmu_context.h
index f67753db1f78..45a4b4c424cf 100644
--- a/arch/tile/include/asm/mmu_context.h
+++ b/arch/tile/include/asm/mmu_context.h
@@ -16,6 +16,8 @@
 #define _ASM_TILE_MMU_CONTEXT_H
 
 #include <linux/smp.h>
+#include <linux/mm_types.h>
+
 #include <asm/setup.h>
 #include <asm/page.h>
 #include <asm/pgalloc.h>
diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h
index 1a60e1328e2f..94ac2739918c 100644
--- a/arch/um/include/asm/mmu_context.h
+++ b/arch/um/include/asm/mmu_context.h
@@ -7,6 +7,8 @@
 #define __UM_MMU_CONTEXT_H
 
 #include <linux/sched.h>
+#include <linux/mm_types.h>
+
 #include <asm/mmu.h>
 
 extern void uml_setup_stubs(struct mm_struct *mm);
diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c
index 31968677a0d0..a43d42bf0a86 100644
--- a/arch/um/kernel/exec.c
+++ b/arch/um/kernel/exec.c
@@ -7,7 +7,7 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/ptrace.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
 #include <linux/slab.h>
diff --git a/arch/um/kernel/reboot.c b/arch/um/kernel/reboot.c
index 0bc921cee0b1..71f3e9217cf2 100644
--- a/arch/um/kernel/reboot.c
+++ b/arch/um/kernel/reboot.c
@@ -5,6 +5,7 @@
 
 #include <linux/sched/signal.h>
 #include <linux/sched/task.h>
+#include <linux/sched/mm.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/oom.h>
diff --git a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c
index ddecf326dab2..3a952a4f7965 100644
--- a/arch/um/kernel/skas/process.c
+++ b/arch/um/kernel/skas/process.c
@@ -4,8 +4,9 @@
  */
 
 #include <linux/init.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/sched/task_stack.h>
+
 #include <as-layout.h>
 #include <kern.h>
 #include <os.h>
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index df91fb393a01..ce1d7534fa53 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -28,6 +28,7 @@
 #include <linux/kernel.h>
 #include <linux/timer.h>
 #include <linux/sched/signal.h>
+#include <linux/mm_types.h>
 #include <linux/syscalls.h>
 #include <linux/ratelimit.h>
 
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 2ecc0e97772b..349d4d17aa7f 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -20,7 +20,7 @@
 #include <linux/export.h>
 #include <linux/init.h>
 #include <linux/kdebug.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/sched/clock.h>
 #include <linux/uaccess.h>
 #include <linux/slab.h>
diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h
index 7a15588e45d4..7d3ece8bfb61 100644
--- a/arch/x86/include/asm/a.out-core.h
+++ b/arch/x86/include/asm/a.out-core.h
@@ -17,6 +17,8 @@
 
 #include <linux/user.h>
 #include <linux/elfcore.h>
+#include <linux/mm_types.h>
+
 #include <asm/debugreg.h>
 
 /*
diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h
index 0b416d4cf73b..a0d662be4c5b 100644
--- a/arch/x86/include/asm/mpx.h
+++ b/arch/x86/include/asm/mpx.h
@@ -2,6 +2,8 @@
 #define _ASM_X86_MPX_H
 
 #include <linux/types.h>
+#include <linux/mm_types.h>
+
 #include <asm/ptrace.h>
 #include <asm/insn.h>
 
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index c98079684bdb..5126dfd52b18 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -7,6 +7,7 @@
  */
 #include <linux/kernel.h>
 #include <linux/slab.h>
+#include <linux/mm_types.h>
 #include <linux/syscalls.h>
 #include <linux/sched/sysctl.h>
 
diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c
index e6552275320b..10d907098c26 100644
--- a/arch/x86/um/syscalls_64.c
+++ b/arch/x86/um/syscalls_64.c
@@ -6,6 +6,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/uaccess.h>
 #include <asm/prctl.h> /* XXX This should get the constants from libc */
 #include <os.h>
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index f6740b5b1738..37cb5aad71de 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -38,7 +38,7 @@
  *
  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  */
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/highmem.h>
 #include <linux/debugfs.h>
 #include <linux/bug.h>
diff --git a/arch/xtensa/include/asm/mmu_context.h b/arch/xtensa/include/asm/mmu_context.h
index 04c8ebdc4517..f7e186dfc4e4 100644
--- a/arch/xtensa/include/asm/mmu_context.h
+++ b/arch/xtensa/include/asm/mmu_context.h
@@ -17,6 +17,7 @@
 
 #include <linux/stringify.h>
 #include <linux/sched.h>
+#include <linux/mm_types.h>
 
 #include <asm/vectors.h>
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
index d83de985e88c..6acc4313363e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
@@ -23,6 +23,8 @@
 
 #include <linux/printk.h>
 #include <linux/slab.h>
+#include <linux/mm_types.h>
+
 #include "kfd_priv.h"
 #include "kfd_mqd_manager.h"
 #include "cik_regs.h"
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
index fa32c32fa1c2..a9b9882a9a77 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
@@ -23,6 +23,8 @@
 
 #include <linux/printk.h>
 #include <linux/slab.h>
+#include <linux/mm_types.h>
+
 #include "kfd_priv.h"
 #include "kfd_mqd_manager.h"
 #include "vi_structs.h"
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index 318ec5267bdf..86ecd3ea6a4b 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -37,7 +37,7 @@
 #include <linux/delay.h>
 #include <linux/errno.h>
 #include <linux/list.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/spinlock.h>
 #include <linux/ethtool.h>
 #include <linux/rtnetlink.h>
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index d19662f635b1..5846c47c8d55 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -37,7 +37,7 @@
 #include <linux/idr.h>
 #include <linux/completion.h>
 #include <linux/netdevice.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/pci.h>
 #include <linux/dma-mapping.h>
 #include <linux/inet.h>
diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c
index 36bd904946bd..0b5c43f7e020 100644
--- a/drivers/media/v4l2-core/videobuf-dma-sg.c
+++ b/drivers/media/v4l2-core/videobuf-dma-sg.c
@@ -21,7 +21,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
 
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 9b4688ab1d8e..bee1a36bc2ec 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -12,7 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/magic.h>
 #include <linux/binfmts.h>
 #include <linux/slab.h>
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 35043a8c4529..8e4dc7ab584c 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -13,7 +13,7 @@
 #include <linux/slab.h>
 #include <linux/poll.h>
 #include <linux/pagemap.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/fsnotify.h>
 
 #include "kernfs-internal.h"
diff --git a/include/drm/drm_mm.h b/include/drm/drm_mm.h
index d81b0ba9921f..2ef16bf25826 100644
--- a/include/drm/drm_mm.h
+++ b/include/drm/drm_mm.h
@@ -40,6 +40,7 @@
 #include <linux/bug.h>
 #include <linux/rbtree.h>
 #include <linux/kernel.h>
+#include <linux/mm_types.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #ifdef CONFIG_DRM_DEBUG_MM
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 452c9799318c..f6841c19c913 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -15,6 +15,7 @@
 #include <linux/sched/autogroup.h>
 #include <net/net_namespace.h>
 #include <linux/sched/rt.h>
+#include <linux/mm_types.h>
 
 #include <asm/thread_info.h>
 
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 1cf7941bb946..d32e3932b2e3 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -2,6 +2,7 @@
 #define _LINUX_SCHED_MM_H
 
 #include <linux/sched.h>
+#include <linux/mm_types.h>
 #include <linux/gfp.h>
 
 #endif /* _LINUX_SCHED_MM_H */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 109adc0e9cb9..e865d8bfb881 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -11,7 +11,7 @@
  */
 
 #include <linux/proc_fs.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/seq_file.h>
 #include <linux/kallsyms.h>
 #include <linux/utsname.h>
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 11e0ab57748a..3e88b35ac157 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,7 +20,7 @@
  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
  */
 
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/sched/topology.h>
 
 #include <linux/latencytop.h>
diff --git a/kernel/signal.c b/kernel/signal.c
index e6d470a5f75a..2008aa999976 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -13,7 +13,7 @@
 #include <linux/slab.h>
 #include <linux/export.h>
 #include <linux/init.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/sched/user.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
diff --git a/lib/is_single_threaded.c b/lib/is_single_threaded.c
index 9745cfffcb69..8d4678edcc0e 100644
--- a/lib/is_single_threaded.c
+++ b/lib/is_single_threaded.c
@@ -10,6 +10,7 @@
  * 2 of the Licence, or (at your option) any later version.
  */
 #include <linux/sched/signal.h>
+#include <linux/sched/mm.h>
 
 /*
  * Returns true if the task does not share ->mm with another thread/process.
-- 
cgit v1.2.3


From 9164bb4a18dfa592cd0aca455ea57abf89ca4526 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 4 Feb 2017 01:20:53 +0100
Subject: sched/headers: Prepare to move 'init_task' and 'init_thread_union'
 from <linux/sched.h> to <linux/sched/task.h>

Update all usage sites first.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arm64/kernel/setup.c     | 1 +
 arch/arm64/mm/kasan_init.c    | 1 +
 arch/frv/mm/init.c            | 1 +
 arch/metag/mm/init.c          | 1 +
 arch/nios2/kernel/setup.c     | 1 +
 arch/powerpc/kernel/paca.c    | 1 +
 arch/um/kernel/skas/process.c | 1 +
 arch/um/kernel/um_arch.c      | 2 ++
 arch/x86/kernel/cpu/common.c  | 1 +
 arch/x86/mm/kasan_init_64.c   | 1 +
 fs/namespace.c                | 2 ++
 include/linux/sched/signal.h  | 1 +
 init/init_task.c              | 1 +
 kernel/delayacct.c            | 1 +
 lib/is_single_threaded.c      | 1 +
 mm/vmacache.c                 | 1 +
 16 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 952e2c0dabd5..42274bda0ccb 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -42,6 +42,7 @@
 #include <linux/of_fdt.h>
 #include <linux/efi.h>
 #include <linux/psci.h>
+#include <linux/sched/task.h>
 #include <linux/mm.h>
 
 #include <asm/acpi.h>
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index 201d918e7575..55d1e9205543 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -13,6 +13,7 @@
 #define pr_fmt(fmt) "kasan: " fmt
 #include <linux/kasan.h>
 #include <linux/kernel.h>
+#include <linux/sched/task.h>
 #include <linux/memblock.h>
 #include <linux/start_kernel.h>
 #include <linux/mm.h>
diff --git a/arch/frv/mm/init.c b/arch/frv/mm/init.c
index 88a159743528..328f0a292316 100644
--- a/arch/frv/mm/init.c
+++ b/arch/frv/mm/init.c
@@ -18,6 +18,7 @@
 
 #include <linux/signal.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/pagemap.h>
 #include <linux/gfp.h>
 #include <linux/swap.h>
diff --git a/arch/metag/mm/init.c b/arch/metag/mm/init.c
index c0ec116b3993..188d4d9fbed4 100644
--- a/arch/metag/mm/init.c
+++ b/arch/metag/mm/init.c
@@ -12,6 +12,7 @@
 #include <linux/percpu.h>
 #include <linux/memblock.h>
 #include <linux/initrd.h>
+#include <linux/sched/task.h>
 
 #include <asm/setup.h>
 #include <asm/page.h>
diff --git a/arch/nios2/kernel/setup.c b/arch/nios2/kernel/setup.c
index a3fa80d1aacc..6e57ffa5db27 100644
--- a/arch/nios2/kernel/setup.c
+++ b/arch/nios2/kernel/setup.c
@@ -14,6 +14,7 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/console.h>
 #include <linux/bootmem.h>
 #include <linux/initrd.h>
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index fa20060ff7a5..dfc479df9634 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -10,6 +10,7 @@
 #include <linux/smp.h>
 #include <linux/export.h>
 #include <linux/memblock.h>
+#include <linux/sched/task.h>
 
 #include <asm/lppaca.h>
 #include <asm/paca.h>
diff --git a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c
index 3a952a4f7965..d4dbf08722d6 100644
--- a/arch/um/kernel/skas/process.c
+++ b/arch/um/kernel/skas/process.c
@@ -6,6 +6,7 @@
 #include <linux/init.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/task_stack.h>
+#include <linux/sched/task.h>
 
 #include <as-layout.h>
 #include <kern.h>
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index e8175a8aa22c..4b85acd4020c 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -11,7 +11,9 @@
 #include <linux/string.h>
 #include <linux/utsname.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/kmsg_dump.h>
+
 #include <asm/pgtable.h>
 #include <asm/processor.h>
 #include <asm/sections.h>
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c7e2ca966386..f2fd8fefc589 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -9,6 +9,7 @@
 #include <linux/delay.h>
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
+#include <linux/sched/task.h>
 #include <linux/init.h>
 #include <linux/kprobes.h>
 #include <linux/kgdb.h>
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 0493c17b8a51..8d63d7a104c3 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -4,6 +4,7 @@
 #include <linux/kdebug.h>
 #include <linux/mm.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/vmalloc.h>
 
 #include <asm/tlbflush.h>
diff --git a/fs/namespace.c b/fs/namespace.c
index 131cd7b94f47..cc1375eff88c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -25,6 +25,8 @@
 #include <linux/magic.h>
 #include <linux/bootmem.h>
 #include <linux/task_work.h>
+#include <linux/sched/task.h>
+
 #include "pnode.h"
 #include "internal.h"
 
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 7e10a7824523..320eca0446cd 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -5,5 +5,6 @@
 #include <linux/cred.h>
 #include <linux/sched.h>
 #include <linux/sched/jobctl.h>
+#include <linux/sched/task.h>
 
 #endif /* _LINUX_SCHED_SIGNAL_H */
diff --git a/init/init_task.c b/init/init_task.c
index 53d4ce942a88..66787e30a419 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -4,6 +4,7 @@
 #include <linux/sched.h>
 #include <linux/sched/sysctl.h>
 #include <linux/sched/rt.h>
+#include <linux/sched/task.h>
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 660549656991..c94135fc2698 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -14,6 +14,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/slab.h>
 #include <linux/taskstats.h>
 #include <linux/time.h>
diff --git a/lib/is_single_threaded.c b/lib/is_single_threaded.c
index 8d4678edcc0e..9c7d89df40ed 100644
--- a/lib/is_single_threaded.c
+++ b/lib/is_single_threaded.c
@@ -10,6 +10,7 @@
  * 2 of the Licence, or (at your option) any later version.
  */
 #include <linux/sched/signal.h>
+#include <linux/sched/task.h>
 #include <linux/sched/mm.h>
 
 /*
diff --git a/mm/vmacache.c b/mm/vmacache.c
index 4355d34c68a6..7ffa0ee341b5 100644
--- a/mm/vmacache.c
+++ b/mm/vmacache.c
@@ -2,6 +2,7 @@
  * Copyright (C) 2014 Davidlohr Bueso.
  */
 #include <linux/sched/signal.h>
+#include <linux/sched/task.h>
 #include <linux/mm.h>
 #include <linux/vmacache.h>
 
-- 
cgit v1.2.3


From b2d091031075ac9a1598e3cc3a29c28f02e64c0d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 4 Feb 2017 01:27:20 +0100
Subject: sched/headers: Prepare to use <linux/rcuupdate.h> instead of
 <linux/rculist.h> in <linux/sched.h>

We don't actually need the full rculist.h header in sched.h anymore,
we will be able to include the smaller rcupdate.h header instead.

But first update code that relied on the implicit header inclusion.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/powerpc/kvm/book3s_mmu_hpte.c  | 1 +
 arch/x86/kvm/irq_comm.c             | 2 ++
 arch/x86/kvm/page_track.c           | 2 ++
 drivers/md/bcache/btree.c           | 2 ++
 drivers/misc/vmw_vmci/vmci_event.c  | 1 +
 drivers/nvme/target/admin-cmd.c     | 2 ++
 drivers/nvme/target/core.c          | 2 ++
 drivers/s390/cio/qdio_thinint.c     | 2 ++
 drivers/scsi/libfc/fc_disc.c        | 2 ++
 drivers/scsi/libfc/fc_rport.c       | 2 ++
 include/linux/dmar.h                | 2 +-
 include/linux/pid.h                 | 2 +-
 include/linux/rhashtable.h          | 2 +-
 include/linux/sched/signal.h        | 1 +
 include/net/bluetooth/hci_core.h    | 2 ++
 kernel/trace/trace_events_hist.c    | 1 +
 kernel/trace/trace_events_trigger.c | 1 +
 kernel/trace/trace_kprobe.c         | 1 +
 kernel/trace/trace_uprobe.c         | 1 +
 lib/bug.c                           | 1 +
 lib/rhashtable.c                    | 1 +
 net/mac80211/mesh_plink.c           | 2 ++
 net/mac802154/llsec.c               | 2 ++
 security/apparmor/policy.c          | 1 +
 security/tomoyo/domain.c            | 2 ++
 security/tomoyo/group.c             | 2 ++
 security/tomoyo/util.c              | 2 ++
 27 files changed, 41 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c
index 5a1ab1250a05..905a934c1ef4 100644
--- a/arch/powerpc/kvm/book3s_mmu_hpte.c
+++ b/arch/powerpc/kvm/book3s_mmu_hpte.c
@@ -21,6 +21,7 @@
 #include <linux/kvm_host.h>
 #include <linux/hash.h>
 #include <linux/slab.h>
+#include <linux/rculist.h>
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index b96d3893f121..6825cd36d13b 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -23,6 +23,8 @@
 #include <linux/kvm_host.h>
 #include <linux/slab.h>
 #include <linux/export.h>
+#include <linux/rculist.h>
+
 #include <trace/events/kvm.h>
 
 #include <asm/msidef.h>
diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c
index 4a1c13eaa518..37942e419c32 100644
--- a/arch/x86/kvm/page_track.c
+++ b/arch/x86/kvm/page_track.c
@@ -14,6 +14,8 @@
  */
 
 #include <linux/kvm_host.h>
+#include <linux/rculist.h>
+
 #include <asm/kvm_host.h>
 #include <asm/kvm_page_track.h>
 
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 384559af310e..450d0e848ae4 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -33,6 +33,8 @@
 #include <linux/random.h>
 #include <linux/rcupdate.h>
 #include <linux/sched/clock.h>
+#include <linux/rculist.h>
+
 #include <trace/events/bcache.h>
 
 /*
diff --git a/drivers/misc/vmw_vmci/vmci_event.c b/drivers/misc/vmw_vmci/vmci_event.c
index 8449516d6ac6..84258a48029d 100644
--- a/drivers/misc/vmw_vmci/vmci_event.c
+++ b/drivers/misc/vmw_vmci/vmci_event.c
@@ -19,6 +19,7 @@
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/rculist.h>
 
 #include "vmci_driver.h"
 #include "vmci_event.h"
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 94e524fea568..a7bcff45f437 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -13,6 +13,8 @@
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/module.h>
+#include <linux/rculist.h>
+
 #include <generated/utsrelease.h>
 #include <asm/unaligned.h>
 #include "nvmet.h"
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 5267ce20c12d..11b0a0a5f661 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -14,6 +14,8 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/module.h>
 #include <linux/random.h>
+#include <linux/rculist.h>
+
 #include "nvmet.h"
 
 static struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX];
diff --git a/drivers/s390/cio/qdio_thinint.c b/drivers/s390/cio/qdio_thinint.c
index 8ad98a902a91..c61164f4528e 100644
--- a/drivers/s390/cio/qdio_thinint.c
+++ b/drivers/s390/cio/qdio_thinint.c
@@ -8,6 +8,8 @@
 #include <linux/slab.h>
 #include <linux/kernel_stat.h>
 #include <linux/atomic.h>
+#include <linux/rculist.h>
+
 #include <asm/debug.h>
 #include <asm/qdio.h>
 #include <asm/airq.h>
diff --git a/drivers/scsi/libfc/fc_disc.c b/drivers/scsi/libfc/fc_disc.c
index 6103231104da..fd501f8dbb11 100644
--- a/drivers/scsi/libfc/fc_disc.c
+++ b/drivers/scsi/libfc/fc_disc.c
@@ -36,6 +36,8 @@
 #include <linux/slab.h>
 #include <linux/err.h>
 #include <linux/export.h>
+#include <linux/rculist.h>
+
 #include <asm/unaligned.h>
 
 #include <scsi/fc/fc_gs.h>
diff --git a/drivers/scsi/libfc/fc_rport.c b/drivers/scsi/libfc/fc_rport.c
index c991f3b822f8..b44c3136eb51 100644
--- a/drivers/scsi/libfc/fc_rport.c
+++ b/drivers/scsi/libfc/fc_rport.c
@@ -65,6 +65,8 @@
 #include <linux/timer.h>
 #include <linux/workqueue.h>
 #include <linux/export.h>
+#include <linux/rculist.h>
+
 #include <asm/unaligned.h>
 
 #include <scsi/libfc.h>
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index e9bc9292bd3a..e8ffba1052d3 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -26,7 +26,7 @@
 #include <linux/msi.h>
 #include <linux/irqreturn.h>
 #include <linux/rwsem.h>
-#include <linux/rcupdate.h>
+#include <linux/rculist.h>
 
 struct acpi_dmar_header;
 
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 298ead5512e5..4d179316e431 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -1,7 +1,7 @@
 #ifndef _LINUX_PID_H
 #define _LINUX_PID_H
 
-#include <linux/rcupdate.h>
+#include <linux/rculist.h>
 
 enum pid_type
 {
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index f2e12a845910..092292b6675e 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -25,7 +25,7 @@
 #include <linux/list_nulls.h>
 #include <linux/workqueue.h>
 #include <linux/mutex.h>
-#include <linux/rcupdate.h>
+#include <linux/rculist.h>
 
 /*
  * The end of the chain is marked with a special nulls marks which has
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 320eca0446cd..c6958a53fef3 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -1,6 +1,7 @@
 #ifndef _LINUX_SCHED_SIGNAL_H
 #define _LINUX_SCHED_SIGNAL_H
 
+#include <linux/rculist.h>
 #include <linux/signal.h>
 #include <linux/cred.h>
 #include <linux/sched.h>
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 90708f68cc02..95ccc1eef558 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -26,6 +26,8 @@
 #define __HCI_CORE_H
 
 #include <linux/leds.h>
+#include <linux/rculist.h>
+
 #include <net/bluetooth/hci.h>
 #include <net/bluetooth/hci_sock.h>
 
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index f3a960ed75a1..1c21d0e2a145 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -19,6 +19,7 @@
 #include <linux/mutex.h>
 #include <linux/slab.h>
 #include <linux/stacktrace.h>
+#include <linux/rculist.h>
 
 #include "tracing_map.h"
 #include "trace.h"
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 6721a1e89f39..f2ac9d44f6c4 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -22,6 +22,7 @@
 #include <linux/ctype.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
+#include <linux/rculist.h>
 
 #include "trace.h"
 
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index eadd96ef772f..5f688cc724f0 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -20,6 +20,7 @@
 
 #include <linux/module.h>
 #include <linux/uaccess.h>
+#include <linux/rculist.h>
 
 #include "trace_probe.h"
 
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index f4379e772171..a7581fec9681 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -24,6 +24,7 @@
 #include <linux/uprobes.h>
 #include <linux/namei.h>
 #include <linux/string.h>
+#include <linux/rculist.h>
 
 #include "trace_probe.h"
 
diff --git a/lib/bug.c b/lib/bug.c
index bc3656e944d2..06edbbef0623 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -45,6 +45,7 @@
 #include <linux/kernel.h>
 #include <linux/bug.h>
 #include <linux/sched.h>
+#include <linux/rculist.h>
 
 extern const struct bug_entry __start___bug_table[], __stop___bug_table[];
 
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index c5b9b9351cec..f8635fd57442 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -19,6 +19,7 @@
 #include <linux/init.h>
 #include <linux/log2.h>
 #include <linux/sched.h>
+#include <linux/rculist.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/mm.h>
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index fcba70e57073..953d71e784a9 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -9,6 +9,8 @@
 #include <linux/gfp.h>
 #include <linux/kernel.h>
 #include <linux/random.h>
+#include <linux/rculist.h>
+
 #include "ieee80211_i.h"
 #include "rate.h"
 #include "mesh.h"
diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c
index 6a3e1c2181d3..1e1c9b20bab7 100644
--- a/net/mac802154/llsec.c
+++ b/net/mac802154/llsec.c
@@ -18,6 +18,8 @@
 #include <linux/bug.h>
 #include <linux/completion.h>
 #include <linux/ieee802154.h>
+#include <linux/rculist.h>
+
 #include <crypto/aead.h>
 #include <crypto/skcipher.h>
 
diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c
index 462c5d36b871..def1fbd6bdfd 100644
--- a/security/apparmor/policy.c
+++ b/security/apparmor/policy.c
@@ -77,6 +77,7 @@
 #include <linux/spinlock.h>
 #include <linux/string.h>
 #include <linux/cred.h>
+#include <linux/rculist.h>
 #include <linux/user_namespace.h>
 
 #include "include/apparmor.h"
diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c
index 838ffa78cfda..00d223e9fb37 100644
--- a/security/tomoyo/domain.c
+++ b/security/tomoyo/domain.c
@@ -5,8 +5,10 @@
  */
 
 #include "common.h"
+
 #include <linux/binfmts.h>
 #include <linux/slab.h>
+#include <linux/rculist.h>
 
 /* Variables definitions.*/
 
diff --git a/security/tomoyo/group.c b/security/tomoyo/group.c
index 50092534ec54..944ad77d8fba 100644
--- a/security/tomoyo/group.c
+++ b/security/tomoyo/group.c
@@ -5,6 +5,8 @@
  */
 
 #include <linux/slab.h>
+#include <linux/rculist.h>
+
 #include "common.h"
 
 /**
diff --git a/security/tomoyo/util.c b/security/tomoyo/util.c
index 5fe3679137ae..848317fea704 100644
--- a/security/tomoyo/util.c
+++ b/security/tomoyo/util.c
@@ -5,6 +5,8 @@
  */
 
 #include <linux/slab.h>
+#include <linux/rculist.h>
+
 #include "common.h"
 
 /* Lock for protecting policy. */
-- 
cgit v1.2.3


From f719ff9bcee2a422647790f12d53d3755f47c727 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 6 Feb 2017 10:57:33 +0100
Subject: sched/headers: Prepare to move the task_lock()/unlock() APIs to
 <linux/sched/task.h>

But first update the code that uses these facilities with the
new header.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/mips/math-emu/dsemul.c                | 1 +
 block/blk-ioc.c                            | 1 +
 block/ioprio.c                             | 1 +
 drivers/staging/android/ion/ion.c          | 1 +
 drivers/staging/lustre/lustre/ptlrpc/sec.c | 1 +
 fs/proc/internal.h                         | 1 +
 fs/proc/proc_net.c                         | 1 +
 fs/proc_namespace.c                        | 2 ++
 include/linux/cpuset.h                     | 1 +
 ipc/namespace.c                            | 1 +
 kernel/cgroup/cpuset.c                     | 1 +
 kernel/sched/debug.c                       | 1 +
 kernel/utsname.c                           | 1 +
 mm/mempolicy.c                             | 1 +
 mm/mmu_context.c                           | 1 +
 net/core/net_namespace.c                   | 2 ++
 net/core/netclassid_cgroup.c               | 2 ++
 net/core/netprio_cgroup.c                  | 2 ++
 18 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/arch/mips/math-emu/dsemul.c b/arch/mips/math-emu/dsemul.c
index 6664908514b3..b6bfd3625369 100644
--- a/arch/mips/math-emu/dsemul.c
+++ b/arch/mips/math-emu/dsemul.c
@@ -1,6 +1,7 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/mm_types.h>
+#include <linux/sched/task.h>
 
 #include <asm/branch.h>
 #include <asm/cacheflush.h>
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index b12f9c87b4c3..6bfa39675337 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -7,6 +7,7 @@
 #include <linux/bio.h>
 #include <linux/blkdev.h>
 #include <linux/slab.h>
+#include <linux/sched/task.h>
 
 #include "blk.h"
 
diff --git a/block/ioprio.c b/block/ioprio.c
index 4b9ab8367dda..0c47a00f92a8 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -27,6 +27,7 @@
 #include <linux/blkdev.h>
 #include <linux/capability.h>
 #include <linux/sched/user.h>
+#include <linux/sched/task.h>
 #include <linux/syscalls.h>
 #include <linux/security.h>
 #include <linux/pid_namespace.h>
diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c
index 2c3ffbcbd621..f45115fce4eb 100644
--- a/drivers/staging/android/ion/ion.c
+++ b/drivers/staging/android/ion/ion.c
@@ -36,6 +36,7 @@
 #include <linux/debugfs.h>
 #include <linux/dma-buf.h>
 #include <linux/idr.h>
+#include <linux/sched/task.h>
 
 #include "ion.h"
 #include "ion_priv.h"
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec.c b/drivers/staging/lustre/lustre/ptlrpc/sec.c
index 49f34fd655c3..366f2ce20f5e 100644
--- a/drivers/staging/lustre/lustre/ptlrpc/sec.c
+++ b/drivers/staging/lustre/lustre/ptlrpc/sec.c
@@ -40,6 +40,7 @@
 #include <linux/crypto.h>
 #include <linux/cred.h>
 #include <linux/key.h>
+#include <linux/sched/task.h>
 
 #include "../include/obd.h"
 #include "../include/obd_class.h"
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 550e8b183abe..26a6daf02185 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -15,6 +15,7 @@
 #include <linux/atomic.h>
 #include <linux/binfmts.h>
 #include <linux/sched/coredump.h>
+#include <linux/sched/task.h>
 
 struct ctl_table_header;
 struct mempolicy;
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index ffd72a6c6e04..5cbc65d7a1e1 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -17,6 +17,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/module.h>
 #include <linux/bitops.h>
 #include <linux/mount.h>
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 3f1190d18991..b5713fefb4c1 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -10,6 +10,8 @@
 #include <linux/nsproxy.h>
 #include <linux/security.h>
 #include <linux/fs_struct.h>
+#include <linux/sched/task.h>
+
 #include "proc/internal.h" /* only for get_proc_task() in ->open() */
 
 #include "pnode.h"
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index c608c39cb161..611fce58d670 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -10,6 +10,7 @@
 
 #include <linux/sched.h>
 #include <linux/sched/topology.h>
+#include <linux/sched/task.h>
 #include <linux/cpumask.h>
 #include <linux/nodemask.h>
 #include <linux/mm.h>
diff --git a/ipc/namespace.c b/ipc/namespace.c
index 1f1d713ac19c..b4d80f9f7246 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -14,6 +14,7 @@
 #include <linux/mount.h>
 #include <linux/user_namespace.h>
 #include <linux/proc_ns.h>
+#include <linux/sched/task.h>
 
 #include "util.h"
 
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index a5c46db2855c..0f41292be0fb 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -45,6 +45,7 @@
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/task.h>
 #include <linux/seq_file.h>
 #include <linux/security.h>
 #include <linux/slab.h>
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index e865d8bfb881..38f019324f1a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -12,6 +12,7 @@
 
 #include <linux/proc_fs.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/task.h>
 #include <linux/seq_file.h>
 #include <linux/kallsyms.h>
 #include <linux/utsname.h>
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 06585ad296ff..913fe4336d2b 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -17,6 +17,7 @@
 #include <linux/cred.h>
 #include <linux/user_namespace.h>
 #include <linux/proc_ns.h>
+#include <linux/sched/task.h>
 
 static struct ucounts *inc_uts_namespaces(struct user_namespace *ns)
 {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 18e0105810df..75b2745bac41 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -75,6 +75,7 @@
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/numa_balancing.h>
+#include <linux/sched/task.h>
 #include <linux/nodemask.h>
 #include <linux/cpuset.h>
 #include <linux/slab.h>
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index 8888b124a386..3e612ae748e9 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -6,6 +6,7 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/task.h>
 #include <linux/mmu_context.h>
 #include <linux/export.h>
 
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 3c4bbec39713..652468ff65b7 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -16,6 +16,8 @@
 #include <linux/export.h>
 #include <linux/user_namespace.h>
 #include <linux/net_namespace.h>
+#include <linux/sched/task.h>
+
 #include <net/sock.h>
 #include <net/netlink.h>
 #include <net/net_namespace.h>
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 11fce17274f6..6ae56037bb13 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -12,6 +12,8 @@
 #include <linux/slab.h>
 #include <linux/cgroup.h>
 #include <linux/fdtable.h>
+#include <linux/sched/task.h>
+
 #include <net/cls_cgroup.h>
 #include <net/sock.h>
 
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 756637dc7a57..0f9275ee5595 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -20,6 +20,8 @@
 #include <linux/cgroup.h>
 #include <linux/rcupdate.h>
 #include <linux/atomic.h>
+#include <linux/sched/task.h>
+
 #include <net/rtnetlink.h>
 #include <net/pkt_cls.h>
 #include <net/sock.h>
-- 
cgit v1.2.3


From 32ef5517c298042ed58408545f475df43afe1f24 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 5 Feb 2017 11:48:36 +0100
Subject: sched/headers: Prepare to move cputime functionality from
 <linux/sched.h> into <linux/sched/cputime.h>

Introduce a trivial, mostly empty <linux/sched/cputime.h> header
to prepare for the moving of cputime functionality out of sched.h.

Update all code that relies on these facilities.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/alpha/kernel/osf_sys.c    | 1 +
 arch/ia64/kernel/time.c        | 2 +-
 arch/powerpc/kernel/time.c     | 2 +-
 arch/s390/kernel/idle.c        | 2 +-
 arch/s390/kernel/vtime.c       | 2 +-
 arch/x86/kernel/apm_32.c       | 1 +
 arch/x86/kvm/hyperv.c          | 2 ++
 drivers/isdn/mISDN/stack.c     | 1 +
 drivers/s390/cio/cio.c         | 2 +-
 fs/binfmt_elf.c                | 1 +
 fs/binfmt_elf_fdpic.c          | 1 +
 fs/proc/array.c                | 1 +
 fs/proc/stat.c                 | 2 +-
 include/linux/sched/cputime.h  | 7 +++++++
 kernel/acct.c                  | 2 ++
 kernel/delayacct.c             | 1 +
 kernel/exit.c                  | 1 +
 kernel/fork.c                  | 1 +
 kernel/sched/cputime.c         | 2 +-
 kernel/sched/sched.h           | 1 +
 kernel/signal.c                | 1 +
 kernel/sys.c                   | 1 +
 kernel/time/itimer.c           | 1 +
 kernel/time/posix-cpu-timers.c | 1 +
 kernel/tsacct.c                | 1 +
 25 files changed, 33 insertions(+), 7 deletions(-)
 create mode 100644 include/linux/sched/cputime.h

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 73446baa632e..0b961093ca5c 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -14,6 +14,7 @@
 #include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/task_stack.h>
+#include <linux/sched/cputime.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 144f9db7a876..aa7be020a904 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -22,7 +22,7 @@
 #include <linux/timex.h>
 #include <linux/timekeeper_internal.h>
 #include <linux/platform_device.h>
-#include <linux/cputime.h>
+#include <linux/sched/cputime.h>
 
 #include <asm/machvec.h>
 #include <asm/delay.h>
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 37833c5dc274..07b90725855e 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -58,7 +58,7 @@
 #include <linux/clk-provider.h>
 #include <linux/suspend.h>
 #include <linux/rtc.h>
-#include <linux/cputime.h>
+#include <linux/sched/cputime.h>
 #include <asm/trace.h>
 
 #include <asm/io.h>
diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c
index fb07a70820af..9340b2a07935 100644
--- a/arch/s390/kernel/idle.c
+++ b/arch/s390/kernel/idle.c
@@ -12,7 +12,7 @@
 #include <linux/notifier.h>
 #include <linux/init.h>
 #include <linux/cpu.h>
-#include <linux/cputime.h>
+#include <linux/sched/cputime.h>
 #include <asm/nmi.h>
 #include <asm/smp.h>
 #include "entry.h"
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index 31bd96e81167..c14fc9029912 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -6,7 +6,7 @@
  */
 
 #include <linux/kernel_stat.h>
-#include <linux/cputime.h>
+#include <linux/sched/cputime.h>
 #include <linux/export.h>
 #include <linux/kernel.h>
 #include <linux/timex.h>
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index dc04b30cbd60..5a414545e8a3 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -219,6 +219,7 @@
 #include <linux/init.h>
 #include <linux/time.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/cputime.h>
 #include <linux/pm.h>
 #include <linux/capability.h>
 #include <linux/device.h>
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index f701d4430727..ebae57ac5902 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -28,6 +28,8 @@
 
 #include <linux/kvm_host.h>
 #include <linux/highmem.h>
+#include <linux/sched/cputime.h>
+
 #include <asm/apicdef.h>
 #include <trace/events/kvm.h>
 
diff --git a/drivers/isdn/mISDN/stack.c b/drivers/isdn/mISDN/stack.c
index 696f22fd5ab4..8b7faea2ddf8 100644
--- a/drivers/isdn/mISDN/stack.c
+++ b/drivers/isdn/mISDN/stack.c
@@ -19,6 +19,7 @@
 #include <linux/mISDNif.h>
 #include <linux/kthread.h>
 #include <linux/sched.h>
+#include <linux/sched/cputime.h>
 #include <linux/signal.h>
 
 #include "core.h"
diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c
index de6fccc13124..1b350665c823 100644
--- a/drivers/s390/cio/cio.c
+++ b/drivers/s390/cio/cio.c
@@ -29,7 +29,7 @@
 #include <asm/chpid.h>
 #include <asm/airq.h>
 #include <asm/isc.h>
-#include <linux/cputime.h>
+#include <linux/sched/cputime.h>
 #include <asm/fcx.h>
 #include <asm/nmi.h>
 #include <asm/crw.h>
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 92c00a13b28b..5075fd5c62c8 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -37,6 +37,7 @@
 #include <linux/sched.h>
 #include <linux/sched/coredump.h>
 #include <linux/sched/task_stack.h>
+#include <linux/sched/cputime.h>
 #include <linux/cred.h>
 #include <linux/dax.h>
 #include <linux/uaccess.h>
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 6103a8149ccd..cf93a4fad012 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -17,6 +17,7 @@
 #include <linux/sched.h>
 #include <linux/sched/coredump.h>
 #include <linux/sched/task_stack.h>
+#include <linux/sched/cputime.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/errno.h>
diff --git a/fs/proc/array.c b/fs/proc/array.c
index f3169b58af38..88c355574aa0 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -63,6 +63,7 @@
 #include <linux/sched/mm.h>
 #include <linux/sched/numa_balancing.h>
 #include <linux/sched/task.h>
+#include <linux/sched/cputime.h>
 #include <linux/proc_fs.h>
 #include <linux/ioport.h>
 #include <linux/uaccess.h>
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index b95556e036bb..bd4e55f4aa20 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -10,7 +10,7 @@
 #include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/irqnr.h>
-#include <linux/cputime.h>
+#include <linux/sched/cputime.h>
 #include <linux/tick.h>
 
 #ifndef arch_irq_stat_cpu
diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h
new file mode 100644
index 000000000000..6ed4fe43de28
--- /dev/null
+++ b/include/linux/sched/cputime.h
@@ -0,0 +1,7 @@
+#ifndef _LINUX_SCHED_CPUTIME_H
+#define _LINUX_SCHED_CPUTIME_H
+
+#include <linux/sched/signal.h>
+#include <linux/cputime.h>
+
+#endif /* _LINUX_SCHED_CPUTIME_H */
diff --git a/kernel/acct.c b/kernel/acct.c
index ca9cb55b5855..5b1284370367 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -56,6 +56,8 @@
 #include <linux/syscalls.h>
 #include <linux/mount.h>
 #include <linux/uaccess.h>
+#include <linux/sched/cputime.h>
+
 #include <asm/div64.h>
 #include <linux/blkdev.h> /* sector_div */
 #include <linux/pid_namespace.h>
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index c94135fc2698..4a1c33416b6a 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -15,6 +15,7 @@
 
 #include <linux/sched.h>
 #include <linux/sched/task.h>
+#include <linux/sched/cputime.h>
 #include <linux/slab.h>
 #include <linux/taskstats.h>
 #include <linux/time.h>
diff --git a/kernel/exit.c b/kernel/exit.c
index 771c33fc9952..e126ebf2400c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -11,6 +11,7 @@
 #include <linux/sched/stat.h>
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
+#include <linux/sched/cputime.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/capability.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index c87b6f03c710..916e78004b8f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -20,6 +20,7 @@
 #include <linux/sched/stat.h>
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
+#include <linux/sched/cputime.h>
 #include <linux/rtmutex.h>
 #include <linux/init.h>
 #include <linux/unistd.h>
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 54031fa681e2..f3778e2b46c8 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -4,7 +4,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/static_key.h>
 #include <linux/context_tracking.h>
-#include <linux/cputime.h>
+#include <linux/sched/cputime.h>
 #include "sched.h"
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b1f1c8443837..76e3af3f39f4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -17,6 +17,7 @@
 #include <linux/sched/hotplug.h>
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
+#include <linux/sched/cputime.h>
 
 #include <linux/u64_stats_sync.h>
 #include <linux/kernel_stat.h>
diff --git a/kernel/signal.c b/kernel/signal.c
index 2008aa999976..7e59ebc2c25e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -18,6 +18,7 @@
 #include <linux/sched/debug.h>
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
+#include <linux/sched/cputime.h>
 #include <linux/fs.h>
 #include <linux/tty.h>
 #include <linux/binfmts.h>
diff --git a/kernel/sys.c b/kernel/sys.c
index 7f1ecd2e41c1..7ff6d1b10cec 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -55,6 +55,7 @@
 #include <linux/sched/mm.h>
 #include <linux/sched/coredump.h>
 #include <linux/sched/task.h>
+#include <linux/sched/cputime.h>
 #include <linux/rcupdate.h>
 #include <linux/uidgid.h>
 #include <linux/cred.h>
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index f6b961c5e58c..087d6a1279b8 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -11,6 +11,7 @@
 #include <linux/syscalls.h>
 #include <linux/time.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/cputime.h>
 #include <linux/posix-timers.h>
 #include <linux/hrtimer.h>
 #include <trace/events/timer.h>
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index a2475a9f57d8..4513ad16a253 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -3,6 +3,7 @@
  */
 
 #include <linux/sched/signal.h>
+#include <linux/sched/cputime.h>
 #include <linux/posix-timers.h>
 #include <linux/errno.h>
 #include <linux/math64.h>
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index d9a03b80b75d..370724b45391 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -19,6 +19,7 @@
 #include <linux/kernel.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/cputime.h>
 #include <linux/tsacct_kern.h>
 #include <linux/acct.h>
 #include <linux/jiffies.h>
-- 
cgit v1.2.3


From 1777e4635507265ba53d8dc4cd248e7d7c306fa0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 5 Feb 2017 14:47:12 +0100
Subject: sched/headers: Prepare to move _init() prototypes from
 <linux/sched.h> to <linux/sched/init.h>

But first introduce a trivial header and update usage sites.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/ia64/kernel/setup.c   | 1 +
 arch/m32r/kernel/traps.c   | 1 +
 arch/s390/kernel/setup.c   | 1 +
 include/linux/cpu.h        | 2 ++
 include/linux/sched/init.h | 6 ++++++
 init/main.c                | 1 +
 kernel/sched/sched.h       | 1 +
 7 files changed, 13 insertions(+)
 create mode 100644 include/linux/sched/init.h

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index b2b4f5216180..63bef6fc0f3e 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -29,6 +29,7 @@
 #include <linux/bootmem.h>
 #include <linux/console.h>
 #include <linux/delay.h>
+#include <linux/cpu.h>
 #include <linux/kernel.h>
 #include <linux/reboot.h>
 #include <linux/sched.h>
diff --git a/arch/m32r/kernel/traps.c b/arch/m32r/kernel/traps.c
index 832bb2bb29bd..647dd94a0c39 100644
--- a/arch/m32r/kernel/traps.c
+++ b/arch/m32r/kernel/traps.c
@@ -17,6 +17,7 @@
 #include <linux/sched/debug.h>
 #include <linux/sched/task_stack.h>
 #include <linux/mm.h>
+#include <linux/cpu.h>
 
 #include <asm/page.h>
 #include <asm/processor.h>
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 3de07004c02e..911dc0b49be0 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -19,6 +19,7 @@
 #include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/sched/task.h>
+#include <linux/cpu.h>
 #include <linux/kernel.h>
 #include <linux/memblock.h>
 #include <linux/mm.h>
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 21f9c74496e7..f92081234afd 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -30,6 +30,8 @@ struct cpu {
 
 extern void boot_cpu_init(void);
 extern void boot_cpu_state_init(void);
+extern void cpu_init(void);
+extern void trap_init(void);
 
 extern int register_cpu(struct cpu *cpu, int num);
 extern struct device *get_cpu_device(unsigned cpu);
diff --git a/include/linux/sched/init.h b/include/linux/sched/init.h
new file mode 100644
index 000000000000..15e46313e55e
--- /dev/null
+++ b/include/linux/sched/init.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SCHED_INIT_H
+#define _LINUX_SCHED_INIT_H
+
+#include <linux/sched.h>
+
+#endif /* _LINUX_SCHED_INIT_H */
diff --git a/init/main.c b/init/main.c
index ca9f53fb4125..eae2f15657c6 100644
--- a/init/main.c
+++ b/init/main.c
@@ -63,6 +63,7 @@
 #include <linux/device.h>
 #include <linux/kthread.h>
 #include <linux/sched.h>
+#include <linux/sched/init.h>
 #include <linux/signal.h>
 #include <linux/idr.h>
 #include <linux/kgdb.h>
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 76e3af3f39f4..5cbf92214ad8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -18,6 +18,7 @@
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
 #include <linux/sched/cputime.h>
+#include <linux/sched/init.h>
 
 #include <linux/u64_stats_sync.h>
 #include <linux/kernel_stat.h>
-- 
cgit v1.2.3


From 50d34394cee68dd12c5e01fff073d1167700bfce Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 5 Feb 2017 16:03:58 +0100
Subject: sched/headers: Prepare to remove the <linux/magic.h> include from
 <linux/sched/task_stack.h>

Update files that depend on the magic.h inclusion.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/dax/dax.c                 | 1 +
 drivers/virtio/virtio_balloon.c   | 1 +
 include/linux/sched/task_stack.h  | 1 +
 mm/zsmalloc.c                     | 1 +
 security/integrity/evm/evm_main.c | 2 ++
 5 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index b75c77254fdb..8d9829ff2a78 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -13,6 +13,7 @@
 #include <linux/pagemap.h>
 #include <linux/module.h>
 #include <linux/device.h>
+#include <linux/magic.h>
 #include <linux/mount.h>
 #include <linux/pfn_t.h>
 #include <linux/hash.h>
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 9d2738e9217f..a610061fabf6 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -31,6 +31,7 @@
 #include <linux/wait.h>
 #include <linux/mm.h>
 #include <linux/mount.h>
+#include <linux/magic.h>
 
 /*
  * Balloon device works in 4K page units.  So each page is pointed to by
diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
index 933cd49824c2..d2d578e85f7d 100644
--- a/include/linux/sched/task_stack.h
+++ b/include/linux/sched/task_stack.h
@@ -2,5 +2,6 @@
 #define _LINUX_SCHED_TASK_STACK_H
 
 #include <linux/sched.h>
+#include <linux/magic.h>
 
 #endif /* _LINUX_SCHED_TASK_STACK_H */
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index b7b1fb6c8c21..b7ee9c34dbd6 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -33,6 +33,7 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/magic.h>
 #include <linux/bitops.h>
 #include <linux/errno.h>
 #include <linux/highmem.h>
diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c
index e2ed498c0f5f..063d38aef64e 100644
--- a/security/integrity/evm/evm_main.c
+++ b/security/integrity/evm/evm_main.c
@@ -22,6 +22,8 @@
 #include <linux/xattr.h>
 #include <linux/integrity.h>
 #include <linux/evm.h>
+#include <linux/magic.h>
+
 #include <crypto/hash.h>
 #include <crypto/algapi.h>
 #include "evm.h"
-- 
cgit v1.2.3


From b69339ba109549beeaf45c45c52ac7025bfcd954 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 5 Feb 2017 16:15:03 +0100
Subject: sched/headers: Prepare to remove spurious <linux/sched.h> inclusion
 dependencies

In the following patches we are going to remove various headers
from sched.h and other headers that sched.h includes.

To make those patches build cleanly prepare the scene by adding
dependencies to various files that learned to rely on those
to-be-removed dependencies.

These changes all make sense standalone: they add a header for
a data type that a particular .c or .h file is using.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/misc/vmw_vmci/vmci_resource.c | 1 +
 include/linux/sched.h                 | 1 +
 include/sound/control.h               | 1 +
 include/target/target_core_base.h     | 1 +
 4 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/misc/vmw_vmci/vmci_resource.c b/drivers/misc/vmw_vmci/vmci_resource.c
index 9a53a30de445..1ab6e8737a5f 100644
--- a/drivers/misc/vmw_vmci/vmci_resource.c
+++ b/drivers/misc/vmw_vmci/vmci_resource.c
@@ -17,6 +17,7 @@
 #include <linux/hash.h>
 #include <linux/types.h>
 #include <linux/rculist.h>
+#include <linux/completion.h>
 
 #include "vmci_resource.h"
 #include "vmci_driver.h"
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ea05116bc3c2..b5c6d602dfe9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -18,6 +18,7 @@ struct sched_param {
 #include <linux/types.h>
 #include <linux/timex.h>
 #include <linux/jiffies.h>
+#include <linux/mutex.h>
 #include <linux/plist.h>
 #include <linux/rbtree.h>
 #include <linux/thread_info.h>
diff --git a/include/sound/control.h b/include/sound/control.h
index 21d047f229a1..bd7246de58e7 100644
--- a/include/sound/control.h
+++ b/include/sound/control.h
@@ -22,6 +22,7 @@
  *
  */
 
+#include <linux/wait.h>
 #include <sound/asound.h>
 
 #define snd_kcontrol_chip(kcontrol) ((kcontrol)->private_data)
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index 878560e60c75..8be9ba73383d 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -5,6 +5,7 @@
 #include <linux/dma-direction.h> /* enum dma_data_direction */
 #include <linux/percpu_ida.h>    /* struct percpu_ida */
 #include <linux/semaphore.h>     /* struct semaphore */
+#include <linux/completion.h>
 
 #define TARGET_CORE_VERSION		"v5.0"
 
-- 
cgit v1.2.3


From a60b9eda67beac2318fa159545ba7d022f780736 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 1 Feb 2017 16:36:40 +0100
Subject: sched/headers: Move scheduler topology interfaces to
 <linux/sched/topology.h>

The vast majority of sched.h users does not require the topology types and
interfaces, so split them out into <linux/sched/topology.h>.

This reduces the size of linux/sched.h by ~6%.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h          | 212 ----------------------------------------
 include/linux/sched/topology.h | 213 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 213 insertions(+), 212 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b5c6d602dfe9..55480782ce7f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -960,12 +960,6 @@ enum cpu_idle_type {
 # define SCHED_FIXEDPOINT_SHIFT	10
 # define SCHED_FIXEDPOINT_SCALE	(1L << SCHED_FIXEDPOINT_SHIFT)
 
-/*
- * Increase resolution of cpu_capacity calculations
- */
-#define SCHED_CAPACITY_SHIFT	SCHED_FIXEDPOINT_SHIFT
-#define SCHED_CAPACITY_SCALE	(1L << SCHED_CAPACITY_SHIFT)
-
 /*
  * Wake-queues are lists of tasks with a pending wakeup, whose
  * callers have already marked the task as woken internally,
@@ -1016,214 +1010,8 @@ extern void wake_q_add(struct wake_q_head *head,
 		       struct task_struct *task);
 extern void wake_up_q(struct wake_q_head *head);
 
-/*
- * sched-domains (multiprocessor balancing) declarations:
- */
-#ifdef CONFIG_SMP
-#define SD_LOAD_BALANCE		0x0001	/* Do load balancing on this domain. */
-#define SD_BALANCE_NEWIDLE	0x0002	/* Balance when about to become idle */
-#define SD_BALANCE_EXEC		0x0004	/* Balance on exec */
-#define SD_BALANCE_FORK		0x0008	/* Balance on fork, clone */
-#define SD_BALANCE_WAKE		0x0010  /* Balance on wakeup */
-#define SD_WAKE_AFFINE		0x0020	/* Wake task to waking CPU */
-#define SD_ASYM_CPUCAPACITY	0x0040  /* Groups have different max cpu capacities */
-#define SD_SHARE_CPUCAPACITY	0x0080	/* Domain members share cpu capacity */
-#define SD_SHARE_POWERDOMAIN	0x0100	/* Domain members share power domain */
-#define SD_SHARE_PKG_RESOURCES	0x0200	/* Domain members share cpu pkg resources */
-#define SD_SERIALIZE		0x0400	/* Only a single load balancing instance */
-#define SD_ASYM_PACKING		0x0800  /* Place busy groups earlier in the domain */
-#define SD_PREFER_SIBLING	0x1000	/* Prefer to place tasks in a sibling domain */
-#define SD_OVERLAP		0x2000	/* sched_domains of this level overlap */
-#define SD_NUMA			0x4000	/* cross-node balancing */
-
-#ifdef CONFIG_SCHED_SMT
-static inline int cpu_smt_flags(void)
-{
-	return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
-}
-#endif
-
-#ifdef CONFIG_SCHED_MC
-static inline int cpu_core_flags(void)
-{
-	return SD_SHARE_PKG_RESOURCES;
-}
-#endif
-
-#ifdef CONFIG_NUMA
-static inline int cpu_numa_flags(void)
-{
-	return SD_NUMA;
-}
-#endif
-
-extern int arch_asym_cpu_priority(int cpu);
-
-struct sched_domain_attr {
-	int relax_domain_level;
-};
-
-#define SD_ATTR_INIT	(struct sched_domain_attr) {	\
-	.relax_domain_level = -1,			\
-}
-
-extern int sched_domain_level_max;
-
-struct sched_group;
-
-struct sched_domain_shared {
-	atomic_t	ref;
-	atomic_t	nr_busy_cpus;
-	int		has_idle_cores;
-};
-
-struct sched_domain {
-	/* These fields must be setup */
-	struct sched_domain *parent;	/* top domain must be null terminated */
-	struct sched_domain *child;	/* bottom domain must be null terminated */
-	struct sched_group *groups;	/* the balancing groups of the domain */
-	unsigned long min_interval;	/* Minimum balance interval ms */
-	unsigned long max_interval;	/* Maximum balance interval ms */
-	unsigned int busy_factor;	/* less balancing by factor if busy */
-	unsigned int imbalance_pct;	/* No balance until over watermark */
-	unsigned int cache_nice_tries;	/* Leave cache hot tasks for # tries */
-	unsigned int busy_idx;
-	unsigned int idle_idx;
-	unsigned int newidle_idx;
-	unsigned int wake_idx;
-	unsigned int forkexec_idx;
-	unsigned int smt_gain;
-
-	int nohz_idle;			/* NOHZ IDLE status */
-	int flags;			/* See SD_* */
-	int level;
-
-	/* Runtime fields. */
-	unsigned long last_balance;	/* init to jiffies. units in jiffies */
-	unsigned int balance_interval;	/* initialise to 1. units in ms. */
-	unsigned int nr_balance_failed; /* initialise to 0 */
-
-	/* idle_balance() stats */
-	u64 max_newidle_lb_cost;
-	unsigned long next_decay_max_lb_cost;
-
-	u64 avg_scan_cost;		/* select_idle_sibling */
-
-#ifdef CONFIG_SCHEDSTATS
-	/* load_balance() stats */
-	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
-	unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
-	unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
-	unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
-	unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
-	unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
-	unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
-	unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];
-
-	/* Active load balancing */
-	unsigned int alb_count;
-	unsigned int alb_failed;
-	unsigned int alb_pushed;
-
-	/* SD_BALANCE_EXEC stats */
-	unsigned int sbe_count;
-	unsigned int sbe_balanced;
-	unsigned int sbe_pushed;
-
-	/* SD_BALANCE_FORK stats */
-	unsigned int sbf_count;
-	unsigned int sbf_balanced;
-	unsigned int sbf_pushed;
-
-	/* try_to_wake_up() stats */
-	unsigned int ttwu_wake_remote;
-	unsigned int ttwu_move_affine;
-	unsigned int ttwu_move_balance;
-#endif
-#ifdef CONFIG_SCHED_DEBUG
-	char *name;
-#endif
-	union {
-		void *private;		/* used during construction */
-		struct rcu_head rcu;	/* used during destruction */
-	};
-	struct sched_domain_shared *shared;
-
-	unsigned int span_weight;
-	/*
-	 * Span of all CPUs in this domain.
-	 *
-	 * NOTE: this field is variable length. (Allocated dynamically
-	 * by attaching extra space to the end of the structure,
-	 * depending on how many CPUs the kernel has booted up with)
-	 */
-	unsigned long span[0];
-};
-
-static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
-{
-	return to_cpumask(sd->span);
-}
-
-extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
-				    struct sched_domain_attr *dattr_new);
-
-/* Allocate an array of sched domains, for partition_sched_domains(). */
-cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
-void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
-
-bool cpus_share_cache(int this_cpu, int that_cpu);
-
-typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
-typedef int (*sched_domain_flags_f)(void);
-
-#define SDTL_OVERLAP	0x01
-
-struct sd_data {
-	struct sched_domain **__percpu sd;
-	struct sched_domain_shared **__percpu sds;
-	struct sched_group **__percpu sg;
-	struct sched_group_capacity **__percpu sgc;
-};
-
-struct sched_domain_topology_level {
-	sched_domain_mask_f mask;
-	sched_domain_flags_f sd_flags;
-	int		    flags;
-	int		    numa_level;
-	struct sd_data      data;
-#ifdef CONFIG_SCHED_DEBUG
-	char                *name;
-#endif
-};
-
-extern void set_sched_topology(struct sched_domain_topology_level *tl);
 extern void wake_up_if_idle(int cpu);
 
-#ifdef CONFIG_SCHED_DEBUG
-# define SD_INIT_NAME(type)		.name = #type
-#else
-# define SD_INIT_NAME(type)
-#endif
-
-#else /* CONFIG_SMP */
-
-struct sched_domain_attr;
-
-static inline void
-partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
-			struct sched_domain_attr *dattr_new)
-{
-}
-
-static inline bool cpus_share_cache(int this_cpu, int that_cpu)
-{
-	return true;
-}
-
-#endif	/* !CONFIG_SMP */
-
-
 struct io_context;			/* See blkdev.h */
 
 
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 184b56b8aa64..b3550b36e65d 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -5,4 +5,217 @@
 
 #include <linux/sched/idle.h>
 
+/*
+ * sched-domains (multiprocessor balancing) declarations:
+ */
+#ifdef CONFIG_SMP
+
+#define SD_LOAD_BALANCE		0x0001	/* Do load balancing on this domain. */
+#define SD_BALANCE_NEWIDLE	0x0002	/* Balance when about to become idle */
+#define SD_BALANCE_EXEC		0x0004	/* Balance on exec */
+#define SD_BALANCE_FORK		0x0008	/* Balance on fork, clone */
+#define SD_BALANCE_WAKE		0x0010  /* Balance on wakeup */
+#define SD_WAKE_AFFINE		0x0020	/* Wake task to waking CPU */
+#define SD_ASYM_CPUCAPACITY	0x0040  /* Groups have different max cpu capacities */
+#define SD_SHARE_CPUCAPACITY	0x0080	/* Domain members share cpu capacity */
+#define SD_SHARE_POWERDOMAIN	0x0100	/* Domain members share power domain */
+#define SD_SHARE_PKG_RESOURCES	0x0200	/* Domain members share cpu pkg resources */
+#define SD_SERIALIZE		0x0400	/* Only a single load balancing instance */
+#define SD_ASYM_PACKING		0x0800  /* Place busy groups earlier in the domain */
+#define SD_PREFER_SIBLING	0x1000	/* Prefer to place tasks in a sibling domain */
+#define SD_OVERLAP		0x2000	/* sched_domains of this level overlap */
+#define SD_NUMA			0x4000	/* cross-node balancing */
+
+/*
+ * Increase resolution of cpu_capacity calculations
+ */
+#define SCHED_CAPACITY_SHIFT	SCHED_FIXEDPOINT_SHIFT
+#define SCHED_CAPACITY_SCALE	(1L << SCHED_CAPACITY_SHIFT)
+
+#ifdef CONFIG_SCHED_SMT
+static inline int cpu_smt_flags(void)
+{
+	return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
+}
+#endif
+
+#ifdef CONFIG_SCHED_MC
+static inline int cpu_core_flags(void)
+{
+	return SD_SHARE_PKG_RESOURCES;
+}
+#endif
+
+#ifdef CONFIG_NUMA
+static inline int cpu_numa_flags(void)
+{
+	return SD_NUMA;
+}
+#endif
+
+extern int arch_asym_cpu_priority(int cpu);
+
+struct sched_domain_attr {
+	int relax_domain_level;
+};
+
+#define SD_ATTR_INIT	(struct sched_domain_attr) {	\
+	.relax_domain_level = -1,			\
+}
+
+extern int sched_domain_level_max;
+
+struct sched_group;
+
+struct sched_domain_shared {
+	atomic_t	ref;
+	atomic_t	nr_busy_cpus;
+	int		has_idle_cores;
+};
+
+struct sched_domain {
+	/* These fields must be setup */
+	struct sched_domain *parent;	/* top domain must be null terminated */
+	struct sched_domain *child;	/* bottom domain must be null terminated */
+	struct sched_group *groups;	/* the balancing groups of the domain */
+	unsigned long min_interval;	/* Minimum balance interval ms */
+	unsigned long max_interval;	/* Maximum balance interval ms */
+	unsigned int busy_factor;	/* less balancing by factor if busy */
+	unsigned int imbalance_pct;	/* No balance until over watermark */
+	unsigned int cache_nice_tries;	/* Leave cache hot tasks for # tries */
+	unsigned int busy_idx;
+	unsigned int idle_idx;
+	unsigned int newidle_idx;
+	unsigned int wake_idx;
+	unsigned int forkexec_idx;
+	unsigned int smt_gain;
+
+	int nohz_idle;			/* NOHZ IDLE status */
+	int flags;			/* See SD_* */
+	int level;
+
+	/* Runtime fields. */
+	unsigned long last_balance;	/* init to jiffies. units in jiffies */
+	unsigned int balance_interval;	/* initialise to 1. units in ms. */
+	unsigned int nr_balance_failed; /* initialise to 0 */
+
+	/* idle_balance() stats */
+	u64 max_newidle_lb_cost;
+	unsigned long next_decay_max_lb_cost;
+
+	u64 avg_scan_cost;		/* select_idle_sibling */
+
+#ifdef CONFIG_SCHEDSTATS
+	/* load_balance() stats */
+	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
+	unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
+	unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
+	unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
+	unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
+	unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
+	unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
+	unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];
+
+	/* Active load balancing */
+	unsigned int alb_count;
+	unsigned int alb_failed;
+	unsigned int alb_pushed;
+
+	/* SD_BALANCE_EXEC stats */
+	unsigned int sbe_count;
+	unsigned int sbe_balanced;
+	unsigned int sbe_pushed;
+
+	/* SD_BALANCE_FORK stats */
+	unsigned int sbf_count;
+	unsigned int sbf_balanced;
+	unsigned int sbf_pushed;
+
+	/* try_to_wake_up() stats */
+	unsigned int ttwu_wake_remote;
+	unsigned int ttwu_move_affine;
+	unsigned int ttwu_move_balance;
+#endif
+#ifdef CONFIG_SCHED_DEBUG
+	char *name;
+#endif
+	union {
+		void *private;		/* used during construction */
+		struct rcu_head rcu;	/* used during destruction */
+	};
+	struct sched_domain_shared *shared;
+
+	unsigned int span_weight;
+	/*
+	 * Span of all CPUs in this domain.
+	 *
+	 * NOTE: this field is variable length. (Allocated dynamically
+	 * by attaching extra space to the end of the structure,
+	 * depending on how many CPUs the kernel has booted up with)
+	 */
+	unsigned long span[0];
+};
+
+static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
+{
+	return to_cpumask(sd->span);
+}
+
+extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+				    struct sched_domain_attr *dattr_new);
+
+/* Allocate an array of sched domains, for partition_sched_domains(). */
+cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
+void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
+
+bool cpus_share_cache(int this_cpu, int that_cpu);
+
+typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
+typedef int (*sched_domain_flags_f)(void);
+
+#define SDTL_OVERLAP	0x01
+
+struct sd_data {
+	struct sched_domain **__percpu sd;
+	struct sched_domain_shared **__percpu sds;
+	struct sched_group **__percpu sg;
+	struct sched_group_capacity **__percpu sgc;
+};
+
+struct sched_domain_topology_level {
+	sched_domain_mask_f mask;
+	sched_domain_flags_f sd_flags;
+	int		    flags;
+	int		    numa_level;
+	struct sd_data      data;
+#ifdef CONFIG_SCHED_DEBUG
+	char                *name;
+#endif
+};
+
+extern void set_sched_topology(struct sched_domain_topology_level *tl);
+
+#ifdef CONFIG_SCHED_DEBUG
+# define SD_INIT_NAME(type)		.name = #type
+#else
+# define SD_INIT_NAME(type)
+#endif
+
+#else /* CONFIG_SMP */
+
+struct sched_domain_attr;
+
+static inline void
+partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+			struct sched_domain_attr *dattr_new)
+{
+}
+
+static inline bool cpus_share_cache(int this_cpu, int that_cpu)
+{
+	return true;
+}
+
+#endif	/* !CONFIG_SMP */
+
 #endif /* _LINUX_SCHED_TOPOLOGY_H */
-- 
cgit v1.2.3


From b768917d2c08edc4b5616c86a569e524d190434b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 1 Feb 2017 16:51:00 +0100
Subject: sched/headers: Move the 'cpu_idle_type' enum from <linux/sched.h> to
 <linux/sched/idle.h>

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h      | 7 -------
 include/linux/sched/idle.h | 7 +++++++
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 55480782ce7f..8a8252e9e0c7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -943,13 +943,6 @@ static inline int sched_info_on(void)
 void force_schedstat_enabled(void);
 #endif
 
-enum cpu_idle_type {
-	CPU_IDLE,
-	CPU_NOT_IDLE,
-	CPU_NEWLY_IDLE,
-	CPU_MAX_IDLE_TYPES
-};
-
 /*
  * Integer metrics need fixed point arithmetic, e.g., sched/fair
  * has a few: load, load_avg, util_avg, freq, and capacity.
diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h
index a3cf79b86986..6aabc9968cba 100644
--- a/include/linux/sched/idle.h
+++ b/include/linux/sched/idle.h
@@ -3,4 +3,11 @@
 
 #include <linux/sched.h>
 
+enum cpu_idle_type {
+	CPU_IDLE,
+	CPU_NOT_IDLE,
+	CPU_NEWLY_IDLE,
+	CPU_MAX_IDLE_TYPES
+};
+
 #endif /* _LINUX_SCHED_IDLE_H */
-- 
cgit v1.2.3


From 4437722b0486c36dc90a1adf584fca4cea6cf1de Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 1 Feb 2017 16:58:52 +0100
Subject: sched/headers: Move the wake_up_if_idle() prototype to
 <linux/sched/idle.h>

No need to clutter <linux/sched.h> with this rarely used prototype.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h      | 2 --
 include/linux/sched/idle.h | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8a8252e9e0c7..c30705c3e553 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1003,8 +1003,6 @@ extern void wake_q_add(struct wake_q_head *head,
 		       struct task_struct *task);
 extern void wake_up_q(struct wake_q_head *head);
 
-extern void wake_up_if_idle(int cpu);
-
 struct io_context;			/* See blkdev.h */
 
 
diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h
index 6aabc9968cba..72b6b6ab6354 100644
--- a/include/linux/sched/idle.h
+++ b/include/linux/sched/idle.h
@@ -10,4 +10,6 @@ enum cpu_idle_type {
 	CPU_MAX_IDLE_TYPES
 };
 
+extern void wake_up_if_idle(int cpu);
+
 #endif /* _LINUX_SCHED_IDLE_H */
-- 
cgit v1.2.3


From 5dbe91de599423d243738a205367506444ebb4a4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 1 Feb 2017 18:47:28 +0100
Subject: sched/headers: Move idle polling methods to <linux/sched/idle.h>

Further reduce the size of <linux/sched.h> by moving these APIs:

	tsk_is_polling()
	__current_set_polling()
	current_set_polling_and_test()
	__current_clr_polling()
	current_clr_polling_and_test()
	current_clr_polling()

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h      | 76 ----------------------------------------------
 include/linux/sched/idle.h | 76 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+), 76 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c30705c3e553..4f512e337584 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3176,82 +3176,6 @@ static inline int spin_needbreak(spinlock_t *lock)
 #endif
 }
 
-/*
- * Idle thread specific functions to determine the need_resched
- * polling state.
- */
-#ifdef TIF_POLLING_NRFLAG
-static inline int tsk_is_polling(struct task_struct *p)
-{
-	return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
-}
-
-static inline void __current_set_polling(void)
-{
-	set_thread_flag(TIF_POLLING_NRFLAG);
-}
-
-static inline bool __must_check current_set_polling_and_test(void)
-{
-	__current_set_polling();
-
-	/*
-	 * Polling state must be visible before we test NEED_RESCHED,
-	 * paired by resched_curr()
-	 */
-	smp_mb__after_atomic();
-
-	return unlikely(tif_need_resched());
-}
-
-static inline void __current_clr_polling(void)
-{
-	clear_thread_flag(TIF_POLLING_NRFLAG);
-}
-
-static inline bool __must_check current_clr_polling_and_test(void)
-{
-	__current_clr_polling();
-
-	/*
-	 * Polling state must be visible before we test NEED_RESCHED,
-	 * paired by resched_curr()
-	 */
-	smp_mb__after_atomic();
-
-	return unlikely(tif_need_resched());
-}
-
-#else
-static inline int tsk_is_polling(struct task_struct *p) { return 0; }
-static inline void __current_set_polling(void) { }
-static inline void __current_clr_polling(void) { }
-
-static inline bool __must_check current_set_polling_and_test(void)
-{
-	return unlikely(tif_need_resched());
-}
-static inline bool __must_check current_clr_polling_and_test(void)
-{
-	return unlikely(tif_need_resched());
-}
-#endif
-
-static inline void current_clr_polling(void)
-{
-	__current_clr_polling();
-
-	/*
-	 * Ensure we check TIF_NEED_RESCHED after we clear the polling bit.
-	 * Once the bit is cleared, we'll get IPIs with every new
-	 * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
-	 * fold.
-	 */
-	smp_mb(); /* paired with resched_curr() */
-
-	preempt_fold_need_resched();
-}
-
 static __always_inline bool need_resched(void)
 {
 	return unlikely(tif_need_resched());
diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h
index 72b6b6ab6354..66ee32f87f0b 100644
--- a/include/linux/sched/idle.h
+++ b/include/linux/sched/idle.h
@@ -12,4 +12,80 @@ enum cpu_idle_type {
 
 extern void wake_up_if_idle(int cpu);
 
+/*
+ * Idle thread specific functions to determine the need_resched
+ * polling state.
+ */
+#ifdef TIF_POLLING_NRFLAG
+static inline int tsk_is_polling(struct task_struct *p)
+{
+	return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
+}
+
+static inline void __current_set_polling(void)
+{
+	set_thread_flag(TIF_POLLING_NRFLAG);
+}
+
+static inline bool __must_check current_set_polling_and_test(void)
+{
+	__current_set_polling();
+
+	/*
+	 * Polling state must be visible before we test NEED_RESCHED,
+	 * paired by resched_curr()
+	 */
+	smp_mb__after_atomic();
+
+	return unlikely(tif_need_resched());
+}
+
+static inline void __current_clr_polling(void)
+{
+	clear_thread_flag(TIF_POLLING_NRFLAG);
+}
+
+static inline bool __must_check current_clr_polling_and_test(void)
+{
+	__current_clr_polling();
+
+	/*
+	 * Polling state must be visible before we test NEED_RESCHED,
+	 * paired by resched_curr()
+	 */
+	smp_mb__after_atomic();
+
+	return unlikely(tif_need_resched());
+}
+
+#else
+static inline int tsk_is_polling(struct task_struct *p) { return 0; }
+static inline void __current_set_polling(void) { }
+static inline void __current_clr_polling(void) { }
+
+static inline bool __must_check current_set_polling_and_test(void)
+{
+	return unlikely(tif_need_resched());
+}
+static inline bool __must_check current_clr_polling_and_test(void)
+{
+	return unlikely(tif_need_resched());
+}
+#endif
+
+static inline void current_clr_polling(void)
+{
+	__current_clr_polling();
+
+	/*
+	 * Ensure we check TIF_NEED_RESCHED after we clear the polling bit.
+	 * Once the bit is cleared, we'll get IPIs with every new
+	 * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
+	 * fold.
+	 */
+	smp_mb(); /* paired with resched_curr() */
+
+	preempt_fold_need_resched();
+}
+
 #endif /* _LINUX_SCHED_IDLE_H */
-- 
cgit v1.2.3


From eb61baf69871b9836783a81bc451189edb0d9de2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 1 Feb 2017 17:09:06 +0100
Subject: sched/headers: Move the wake-queue types and interfaces from sched.h
 into <linux/sched/wake_q.h>

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h        | 54 ++++----------------------------------------
 include/linux/sched/wake_q.h | 47 ++++++++++++++++++++++++++++++++++++++
 ipc/msg.c                    |  1 -
 3 files changed, 51 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4f512e337584..5cda7cf09505 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -953,56 +953,6 @@ void force_schedstat_enabled(void);
 # define SCHED_FIXEDPOINT_SHIFT	10
 # define SCHED_FIXEDPOINT_SCALE	(1L << SCHED_FIXEDPOINT_SHIFT)
 
-/*
- * Wake-queues are lists of tasks with a pending wakeup, whose
- * callers have already marked the task as woken internally,
- * and can thus carry on. A common use case is being able to
- * do the wakeups once the corresponding user lock as been
- * released.
- *
- * We hold reference to each task in the list across the wakeup,
- * thus guaranteeing that the memory is still valid by the time
- * the actual wakeups are performed in wake_up_q().
- *
- * One per task suffices, because there's never a need for a task to be
- * in two wake queues simultaneously; it is forbidden to abandon a task
- * in a wake queue (a call to wake_up_q() _must_ follow), so if a task is
- * already in a wake queue, the wakeup will happen soon and the second
- * waker can just skip it.
- *
- * The DEFINE_WAKE_Q macro declares and initializes the list head.
- * wake_up_q() does NOT reinitialize the list; it's expected to be
- * called near the end of a function. Otherwise, the list can be
- * re-initialized for later re-use by wake_q_init().
- *
- * Note that this can cause spurious wakeups. schedule() callers
- * must ensure the call is done inside a loop, confirming that the
- * wakeup condition has in fact occurred.
- */
-struct wake_q_node {
-	struct wake_q_node *next;
-};
-
-struct wake_q_head {
-	struct wake_q_node *first;
-	struct wake_q_node **lastp;
-};
-
-#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01)
-
-#define DEFINE_WAKE_Q(name)				\
-	struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
-
-static inline void wake_q_init(struct wake_q_head *head)
-{
-	head->first = WAKE_Q_TAIL;
-	head->lastp = &head->first;
-}
-
-extern void wake_q_add(struct wake_q_head *head,
-		       struct task_struct *task);
-extern void wake_up_q(struct wake_q_head *head);
-
 struct io_context;			/* See blkdev.h */
 
 
@@ -1234,6 +1184,10 @@ enum perf_event_task_context {
 	perf_nr_task_contexts,
 };
 
+struct wake_q_node {
+	struct wake_q_node *next;
+};
+
 /* Track pages that require TLB flushes */
 struct tlbflush_unmap_batch {
 	/*
diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h
index 6383b35e4eba..d03d8a9047dc 100644
--- a/include/linux/sched/wake_q.h
+++ b/include/linux/sched/wake_q.h
@@ -1,6 +1,53 @@
 #ifndef _LINUX_SCHED_WAKE_Q_H
 #define _LINUX_SCHED_WAKE_Q_H
 
+/*
+ * Wake-queues are lists of tasks with a pending wakeup, whose
+ * callers have already marked the task as woken internally,
+ * and can thus carry on. A common use case is being able to
+ * do the wakeups once the corresponding user lock as been
+ * released.
+ *
+ * We hold reference to each task in the list across the wakeup,
+ * thus guaranteeing that the memory is still valid by the time
+ * the actual wakeups are performed in wake_up_q().
+ *
+ * One per task suffices, because there's never a need for a task to be
+ * in two wake queues simultaneously; it is forbidden to abandon a task
+ * in a wake queue (a call to wake_up_q() _must_ follow), so if a task is
+ * already in a wake queue, the wakeup will happen soon and the second
+ * waker can just skip it.
+ *
+ * The DEFINE_WAKE_Q macro declares and initializes the list head.
+ * wake_up_q() does NOT reinitialize the list; it's expected to be
+ * called near the end of a function. Otherwise, the list can be
+ * re-initialized for later re-use by wake_q_init().
+ *
+ * Note that this can cause spurious wakeups. schedule() callers
+ * must ensure the call is done inside a loop, confirming that the
+ * wakeup condition has in fact occurred.
+ */
+
 #include <linux/sched.h>
 
+struct wake_q_head {
+	struct wake_q_node *first;
+	struct wake_q_node **lastp;
+};
+
+#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01)
+
+#define DEFINE_WAKE_Q(name)				\
+	struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
+
+static inline void wake_q_init(struct wake_q_head *head)
+{
+	head->first = WAKE_Q_TAIL;
+	head->lastp = &head->first;
+}
+
+extern void wake_q_add(struct wake_q_head *head,
+		       struct task_struct *task);
+extern void wake_up_q(struct wake_q_head *head);
+
 #endif /* _LINUX_SCHED_WAKE_Q_H */
diff --git a/ipc/msg.c b/ipc/msg.c
index ecc387e573f6..104926dc72be 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -30,7 +30,6 @@
 #include <linux/proc_fs.h>
 #include <linux/list.h>
 #include <linux/security.h>
-#include <linux/sched.h>
 #include <linux/sched/wake_q.h>
 #include <linux/syscalls.h>
 #include <linux/audit.h>
-- 
cgit v1.2.3


From 5689810360c2e88ce1619e8bcfa859852f9a1d1a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Tue, 7 Feb 2017 12:07:18 +0100
Subject: sched/headers: Move scheduler clock interfaces to
 <linux/sched/clock.h>

Move the sched_clock interfaces into a separate header file, to reduce
the size of sched.h.

Include <linux/sched/clock.h> in all files that made use of one of the
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h       | 100 --------------------------------------------
 include/linux/sched/clock.h |  98 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 98 insertions(+), 100 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5cda7cf09505..b42d2f9d673a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2190,103 +2190,6 @@ static inline void calc_load_exit_idle(void) { }
 #define cpu_relax_yield() cpu_relax()
 #endif
 
-/*
- * Do not use outside of architecture code which knows its limitations.
- *
- * sched_clock() has no promise of monotonicity or bounded drift between
- * CPUs, use (which you should not) requires disabling IRQs.
- *
- * Please use one of the three interfaces below.
- */
-extern unsigned long long notrace sched_clock(void);
-/*
- * See the comment in kernel/sched/clock.c
- */
-extern u64 running_clock(void);
-extern u64 sched_clock_cpu(int cpu);
-
-
-extern void sched_clock_init(void);
-
-#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-static inline void sched_clock_init_late(void)
-{
-}
-
-static inline void sched_clock_tick(void)
-{
-}
-
-static inline void clear_sched_clock_stable(void)
-{
-}
-
-static inline void sched_clock_idle_sleep_event(void)
-{
-}
-
-static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
-{
-}
-
-static inline u64 cpu_clock(int cpu)
-{
-	return sched_clock();
-}
-
-static inline u64 local_clock(void)
-{
-	return sched_clock();
-}
-#else
-extern void sched_clock_init_late(void);
-/*
- * Architectures can set this to 1 if they have specified
- * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
- * but then during bootup it turns out that sched_clock()
- * is reliable after all:
- */
-extern int sched_clock_stable(void);
-extern void clear_sched_clock_stable(void);
-
-extern void sched_clock_tick(void);
-extern void sched_clock_idle_sleep_event(void);
-extern void sched_clock_idle_wakeup_event(u64 delta_ns);
-
-/*
- * As outlined in clock.c, provides a fast, high resolution, nanosecond
- * time source that is monotonic per cpu argument and has bounded drift
- * between cpus.
- *
- * ######################### BIG FAT WARNING ##########################
- * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
- * # go backwards !!                                                  #
- * ####################################################################
- */
-static inline u64 cpu_clock(int cpu)
-{
-	return sched_clock_cpu(cpu);
-}
-
-static inline u64 local_clock(void)
-{
-	return sched_clock_cpu(raw_smp_processor_id());
-}
-#endif
-
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-/*
- * An i/f to runtime opt-in for irq time accounting based off of sched_clock.
- * The reason for this explicit opt-in is not to have perf penalty with
- * slow sched_clocks.
- */
-extern void enable_sched_clock_irqtime(void);
-extern void disable_sched_clock_irqtime(void);
-#else
-static inline void enable_sched_clock_irqtime(void) {}
-static inline void disable_sched_clock_irqtime(void) {}
-#endif
-
 extern unsigned long long
 task_sched_runtime(struct task_struct *task);
 
@@ -2297,9 +2200,6 @@ extern void sched_exec(void);
 #define sched_exec()   {}
 #endif
 
-extern void sched_clock_idle_sleep_event(void);
-extern void sched_clock_idle_wakeup_event(u64 delta_ns);
-
 #ifdef CONFIG_HOTPLUG_CPU
 extern void idle_task_exit(void);
 #else
diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h
index 7cb7d79b2cf9..ac12f71d359c 100644
--- a/include/linux/sched/clock.h
+++ b/include/linux/sched/clock.h
@@ -3,4 +3,102 @@
 
 #include <linux/sched.h>
 
+/*
+ * Do not use outside of architecture code which knows its limitations.
+ *
+ * sched_clock() has no promise of monotonicity or bounded drift between
+ * CPUs, use (which you should not) requires disabling IRQs.
+ *
+ * Please use one of the three interfaces below.
+ */
+extern unsigned long long notrace sched_clock(void);
+
+/*
+ * See the comment in kernel/sched/clock.c
+ */
+extern u64 running_clock(void);
+extern u64 sched_clock_cpu(int cpu);
+
+
+extern void sched_clock_init(void);
+
+#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+static inline void sched_clock_init_late(void)
+{
+}
+
+static inline void sched_clock_tick(void)
+{
+}
+
+static inline void clear_sched_clock_stable(void)
+{
+}
+
+static inline void sched_clock_idle_sleep_event(void)
+{
+}
+
+static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
+{
+}
+
+static inline u64 cpu_clock(int cpu)
+{
+	return sched_clock();
+}
+
+static inline u64 local_clock(void)
+{
+	return sched_clock();
+}
+#else
+extern void sched_clock_init_late(void);
+/*
+ * Architectures can set this to 1 if they have specified
+ * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
+ * but then during bootup it turns out that sched_clock()
+ * is reliable after all:
+ */
+extern int sched_clock_stable(void);
+extern void clear_sched_clock_stable(void);
+
+extern void sched_clock_tick(void);
+extern void sched_clock_idle_sleep_event(void);
+extern void sched_clock_idle_wakeup_event(u64 delta_ns);
+
+/*
+ * As outlined in clock.c, provides a fast, high resolution, nanosecond
+ * time source that is monotonic per cpu argument and has bounded drift
+ * between cpus.
+ *
+ * ######################### BIG FAT WARNING ##########################
+ * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
+ * # go backwards !!                                                  #
+ * ####################################################################
+ */
+static inline u64 cpu_clock(int cpu)
+{
+	return sched_clock_cpu(cpu);
+}
+
+static inline u64 local_clock(void)
+{
+	return sched_clock_cpu(raw_smp_processor_id());
+}
+#endif
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * An i/f to runtime opt-in for irq time accounting based off of sched_clock.
+ * The reason for this explicit opt-in is not to have perf penalty with
+ * slow sched_clocks.
+ */
+extern void enable_sched_clock_irqtime(void);
+extern void disable_sched_clock_irqtime(void);
+#else
+static inline void enable_sched_clock_irqtime(void) {}
+static inline void disable_sched_clock_irqtime(void) {}
+#endif
+
 #endif /* _LINUX_SCHED_CLOCK_H */
-- 
cgit v1.2.3


From 47913d4ebd99c827c82c4f29eb282a119c3f2aeb Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 1 Feb 2017 18:00:26 +0100
Subject: sched/headers, delayacct: Move the 'struct task_delay_info'
 definition from <linux/sched.h> to <linux/delayacct.h>

The 'struct task_delay_info' definition does not have to be in sched.h,
because task_struct only has a pointer to it.

So move it to <linux/delayacct.h> to reduce the size of <linux/sched.h>.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/delayacct.h | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/sched.h     | 39 ++++-----------------------------------
 2 files changed, 40 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 00e60f79a9cc..6b769cbd1000 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -30,7 +30,43 @@
 #define DELAYACCT_PF_BLKIO	0x00000002	/* I am waiting on IO */
 
 #ifdef CONFIG_TASK_DELAY_ACCT
+struct task_delay_info {
+	spinlock_t	lock;
+	unsigned int	flags;	/* Private per-task flags */
+
+	/* For each stat XXX, add following, aligned appropriately
+	 *
+	 * struct timespec XXX_start, XXX_end;
+	 * u64 XXX_delay;
+	 * u32 XXX_count;
+	 *
+	 * Atomicity of updates to XXX_delay, XXX_count protected by
+	 * single lock above (split into XXX_lock if contention is an issue).
+	 */
+
+	/*
+	 * XXX_count is incremented on every XXX operation, the delay
+	 * associated with the operation is added to XXX_delay.
+	 * XXX_delay contains the accumulated delay time in nanoseconds.
+	 */
+	u64 blkio_start;	/* Shared by blkio, swapin */
+	u64 blkio_delay;	/* wait for sync block io completion */
+	u64 swapin_delay;	/* wait for swapin block io completion */
+	u32 blkio_count;	/* total count of the number of sync block */
+				/* io operations performed */
+	u32 swapin_count;	/* total count of the number of swapin block */
+				/* io operations performed */
+
+	u64 freepages_start;
+	u64 freepages_delay;	/* wait for memory reclaim */
+	u32 freepages_count;	/* total count of memory reclaim */
+};
+#endif
 
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#ifdef CONFIG_TASK_DELAY_ACCT
 extern int delayacct_on;	/* Delay accounting turned on/off */
 extern struct kmem_cache *delayacct_cache;
 extern void delayacct_init(void);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b42d2f9d673a..7d6998858fa3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -893,39 +893,7 @@ struct sched_info {
 };
 #endif /* CONFIG_SCHED_INFO */
 
-#ifdef CONFIG_TASK_DELAY_ACCT
-struct task_delay_info {
-	spinlock_t	lock;
-	unsigned int	flags;	/* Private per-task flags */
-
-	/* For each stat XXX, add following, aligned appropriately
-	 *
-	 * struct timespec XXX_start, XXX_end;
-	 * u64 XXX_delay;
-	 * u32 XXX_count;
-	 *
-	 * Atomicity of updates to XXX_delay, XXX_count protected by
-	 * single lock above (split into XXX_lock if contention is an issue).
-	 */
-
-	/*
-	 * XXX_count is incremented on every XXX operation, the delay
-	 * associated with the operation is added to XXX_delay.
-	 * XXX_delay contains the accumulated delay time in nanoseconds.
-	 */
-	u64 blkio_start;	/* Shared by blkio, swapin */
-	u64 blkio_delay;	/* wait for sync block io completion */
-	u64 swapin_delay;	/* wait for swapin block io completion */
-	u32 blkio_count;	/* total count of the number of sync block */
-				/* io operations performed */
-	u32 swapin_count;	/* total count of the number of swapin block */
-				/* io operations performed */
-
-	u64 freepages_start;
-	u64 freepages_delay;	/* wait for memory reclaim */
-	u32 freepages_count;	/* total count of memory reclaim */
-};
-#endif	/* CONFIG_TASK_DELAY_ACCT */
+struct task_delay_info;
 
 static inline int sched_info_on(void)
 {
@@ -1612,9 +1580,10 @@ struct task_struct {
 
 	struct page_frag task_frag;
 
-#ifdef	CONFIG_TASK_DELAY_ACCT
-	struct task_delay_info *delays;
+#ifdef CONFIG_TASK_DELAY_ACCT
+	struct task_delay_info		*delays;
 #endif
+
 #ifdef CONFIG_FAULT_INJECTION
 	int make_it_fail;
 #endif
-- 
cgit v1.2.3


From e2d1e2aec572a2138dea74d53be54a1406d419c0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 1 Feb 2017 18:07:51 +0100
Subject: sched/headers: Move various ABI definitions to
 <uapi/linux/sched/types.h>

Move scheduler ABI types (struct sched_attr, struct sched_param, etc.) into
the new UAPI header.

This further reduces the size and complexity of <linux/sched.h>.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h            | 70 ++--------------------------------------
 include/uapi/linux/sched/types.h | 68 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+), 68 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7d6998858fa3..5c481906e835 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -5,11 +5,6 @@
 
 #include <linux/sched/prio.h>
 
-
-struct sched_param {
-	int sched_priority;
-};
-
 #include <asm/param.h>	/* for HZ */
 
 #include <linux/capability.h>
@@ -64,69 +59,8 @@ struct sched_param {
 
 #include <asm/processor.h>
 
-#define SCHED_ATTR_SIZE_VER0	48	/* sizeof first published struct */
-
-/*
- * Extended scheduling parameters data structure.
- *
- * This is needed because the original struct sched_param can not be
- * altered without introducing ABI issues with legacy applications
- * (e.g., in sched_getparam()).
- *
- * However, the possibility of specifying more than just a priority for
- * the tasks may be useful for a wide variety of application fields, e.g.,
- * multimedia, streaming, automation and control, and many others.
- *
- * This variant (sched_attr) is meant at describing a so-called
- * sporadic time-constrained task. In such model a task is specified by:
- *  - the activation period or minimum instance inter-arrival time;
- *  - the maximum (or average, depending on the actual scheduling
- *    discipline) computation time of all instances, a.k.a. runtime;
- *  - the deadline (relative to the actual activation time) of each
- *    instance.
- * Very briefly, a periodic (sporadic) task asks for the execution of
- * some specific computation --which is typically called an instance--
- * (at most) every period. Moreover, each instance typically lasts no more
- * than the runtime and must be completed by time instant t equal to
- * the instance activation time + the deadline.
- *
- * This is reflected by the actual fields of the sched_attr structure:
- *
- *  @size		size of the structure, for fwd/bwd compat.
- *
- *  @sched_policy	task's scheduling policy
- *  @sched_flags	for customizing the scheduler behaviour
- *  @sched_nice		task's nice value      (SCHED_NORMAL/BATCH)
- *  @sched_priority	task's static priority (SCHED_FIFO/RR)
- *  @sched_deadline	representative of the task's deadline
- *  @sched_runtime	representative of the task's runtime
- *  @sched_period	representative of the task's period
- *
- * Given this task model, there are a multiplicity of scheduling algorithms
- * and policies, that can be used to ensure all the tasks will make their
- * timing constraints.
- *
- * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the
- * only user of this new interface. More information about the algorithm
- * available in the scheduling class file or in Documentation/.
- */
-struct sched_attr {
-	u32 size;
-
-	u32 sched_policy;
-	u64 sched_flags;
-
-	/* SCHED_NORMAL, SCHED_BATCH */
-	s32 sched_nice;
-
-	/* SCHED_FIFO, SCHED_RR */
-	u32 sched_priority;
-
-	/* SCHED_DEADLINE */
-	u64 sched_runtime;
-	u64 sched_deadline;
-	u64 sched_period;
-};
+struct sched_attr;
+struct sched_param;
 
 struct futex_pi_state;
 struct robust_list_head;
diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
index d162d315f4b5..307acbc82d80 100644
--- a/include/uapi/linux/sched/types.h
+++ b/include/uapi/linux/sched/types.h
@@ -3,4 +3,72 @@
 
 #include <linux/types.h>
 
+struct sched_param {
+	int sched_priority;
+};
+
+#define SCHED_ATTR_SIZE_VER0	48	/* sizeof first published struct */
+
+/*
+ * Extended scheduling parameters data structure.
+ *
+ * This is needed because the original struct sched_param can not be
+ * altered without introducing ABI issues with legacy applications
+ * (e.g., in sched_getparam()).
+ *
+ * However, the possibility of specifying more than just a priority for
+ * the tasks may be useful for a wide variety of application fields, e.g.,
+ * multimedia, streaming, automation and control, and many others.
+ *
+ * This variant (sched_attr) is meant at describing a so-called
+ * sporadic time-constrained task. In such model a task is specified by:
+ *  - the activation period or minimum instance inter-arrival time;
+ *  - the maximum (or average, depending on the actual scheduling
+ *    discipline) computation time of all instances, a.k.a. runtime;
+ *  - the deadline (relative to the actual activation time) of each
+ *    instance.
+ * Very briefly, a periodic (sporadic) task asks for the execution of
+ * some specific computation --which is typically called an instance--
+ * (at most) every period. Moreover, each instance typically lasts no more
+ * than the runtime and must be completed by time instant t equal to
+ * the instance activation time + the deadline.
+ *
+ * This is reflected by the actual fields of the sched_attr structure:
+ *
+ *  @size		size of the structure, for fwd/bwd compat.
+ *
+ *  @sched_policy	task's scheduling policy
+ *  @sched_flags	for customizing the scheduler behaviour
+ *  @sched_nice		task's nice value      (SCHED_NORMAL/BATCH)
+ *  @sched_priority	task's static priority (SCHED_FIFO/RR)
+ *  @sched_deadline	representative of the task's deadline
+ *  @sched_runtime	representative of the task's runtime
+ *  @sched_period	representative of the task's period
+ *
+ * Given this task model, there are a multiplicity of scheduling algorithms
+ * and policies, that can be used to ensure all the tasks will make their
+ * timing constraints.
+ *
+ * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the
+ * only user of this new interface. More information about the algorithm
+ * available in the scheduling class file or in Documentation/.
+ */
+struct sched_attr {
+	u32 size;
+
+	u32 sched_policy;
+	u64 sched_flags;
+
+	/* SCHED_NORMAL, SCHED_BATCH */
+	s32 sched_nice;
+
+	/* SCHED_FIFO, SCHED_RR */
+	u32 sched_priority;
+
+	/* SCHED_DEADLINE */
+	u64 sched_runtime;
+	u64 sched_deadline;
+	u64 sched_period;
+};
+
 #endif /* _UAPI_LINUX_SCHED_TYPES_H */
-- 
cgit v1.2.3


From dea38c74cb9205341f52b8d8ae18f61247a43ea8 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 1 Feb 2017 18:25:37 +0100
Subject: sched/headers: Move loadavg related definitions from <linux/sched.h>
 to <linux/sched/loadavg.h>

Move these bits to <linux/sched/loadavg.h>, to reduce the size and
complexity of <linux/sched.h>.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h         | 27 ---------------------------
 include/linux/sched/loadavg.h | 27 +++++++++++++++++++++++++++
 2 files changed, 27 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5c481906e835..78adf7a0cac1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -71,31 +71,6 @@ struct blk_plug;
 struct filename;
 struct nameidata;
 
-/*
- * These are the constant used to fake the fixed-point load-average
- * counting. Some notes:
- *  - 11 bit fractions expand to 22 bits by the multiplies: this gives
- *    a load-average precision of 10 bits integer + 11 bits fractional
- *  - if you want to count load-averages more often, you need more
- *    precision, or rounding will get you. With 2-second counting freq,
- *    the EXP_n values would be 1981, 2034 and 2043 if still using only
- *    11 bit fractions.
- */
-extern unsigned long avenrun[];		/* Load averages */
-extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
-
-#define FSHIFT		11		/* nr of bits of precision */
-#define FIXED_1		(1<<FSHIFT)	/* 1.0 as fixed-point */
-#define LOAD_FREQ	(5*HZ+1)	/* 5 sec intervals */
-#define EXP_1		1884		/* 1/exp(5sec/1min) as fixed-point */
-#define EXP_5		2014		/* 1/exp(5sec/5min) */
-#define EXP_15		2037		/* 1/exp(5sec/15min) */
-
-#define CALC_LOAD(load,exp,n) \
-	load *= exp; \
-	load += n*(FIXED_1-exp); \
-	load >>= FSHIFT;
-
 extern unsigned long total_forks;
 extern int nr_threads;
 DECLARE_PER_CPU(unsigned long, process_counts);
@@ -106,8 +81,6 @@ extern unsigned long nr_iowait(void);
 extern unsigned long nr_iowait_cpu(int cpu);
 extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
 
-extern void calc_global_load(unsigned long ticks);
-
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 extern void cpu_load_update_nohz_start(void);
 extern void cpu_load_update_nohz_stop(void);
diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h
index 3ae74be327e8..c392e28ce0ac 100644
--- a/include/linux/sched/loadavg.h
+++ b/include/linux/sched/loadavg.h
@@ -3,4 +3,31 @@
 
 #include <linux/sched.h>
 
+/*
+ * These are the constant used to fake the fixed-point load-average
+ * counting. Some notes:
+ *  - 11 bit fractions expand to 22 bits by the multiplies: this gives
+ *    a load-average precision of 10 bits integer + 11 bits fractional
+ *  - if you want to count load-averages more often, you need more
+ *    precision, or rounding will get you. With 2-second counting freq,
+ *    the EXP_n values would be 1981, 2034 and 2043 if still using only
+ *    11 bit fractions.
+ */
+extern unsigned long avenrun[];		/* Load averages */
+extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
+
+#define FSHIFT		11		/* nr of bits of precision */
+#define FIXED_1		(1<<FSHIFT)	/* 1.0 as fixed-point */
+#define LOAD_FREQ	(5*HZ+1)	/* 5 sec intervals */
+#define EXP_1		1884		/* 1/exp(5sec/1min) as fixed-point */
+#define EXP_5		2014		/* 1/exp(5sec/5min) */
+#define EXP_15		2037		/* 1/exp(5sec/15min) */
+
+#define CALC_LOAD(load,exp,n) \
+	load *= exp; \
+	load += n*(FIXED_1-exp); \
+	load >>= FSHIFT;
+
+extern void calc_global_load(unsigned long ticks);
+
 #endif /* _LINUX_SCHED_LOADAVG_H */
-- 
cgit v1.2.3


From de8f1c77313d8c908b2897f268d466c13df161d4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 1 Feb 2017 18:41:24 +0100
Subject: sched/headers: Move autogroup APIs into <linux/sched/autogroup.h>

Further reduce the size of sched.h.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h           | 18 ------------------
 include/linux/sched/autogroup.h | 21 +++++++++++++++++++++
 2 files changed, 21 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 78adf7a0cac1..042620729230 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2092,24 +2092,6 @@ static inline void wake_up_nohz_cpu(int cpu) { }
 extern u64 scheduler_tick_max_deferment(void);
 #endif
 
-#ifdef CONFIG_SCHED_AUTOGROUP
-extern void sched_autogroup_create_attach(struct task_struct *p);
-extern void sched_autogroup_detach(struct task_struct *p);
-extern void sched_autogroup_fork(struct signal_struct *sig);
-extern void sched_autogroup_exit(struct signal_struct *sig);
-extern void sched_autogroup_exit_task(struct task_struct *p);
-#ifdef CONFIG_PROC_FS
-extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m);
-extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice);
-#endif
-#else
-static inline void sched_autogroup_create_attach(struct task_struct *p) { }
-static inline void sched_autogroup_detach(struct task_struct *p) { }
-static inline void sched_autogroup_fork(struct signal_struct *sig) { }
-static inline void sched_autogroup_exit(struct signal_struct *sig) { }
-static inline void sched_autogroup_exit_task(struct task_struct *p) { }
-#endif
-
 extern int yield_to(struct task_struct *p, bool preempt);
 extern void set_user_nice(struct task_struct *p, long nice);
 extern int task_prio(const struct task_struct *p);
diff --git a/include/linux/sched/autogroup.h b/include/linux/sched/autogroup.h
index d745f22e57dc..586bdf37330d 100644
--- a/include/linux/sched/autogroup.h
+++ b/include/linux/sched/autogroup.h
@@ -3,4 +3,25 @@
 
 #include <linux/sched.h>
 
+struct signal_struct;
+struct seq_file;
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+extern void sched_autogroup_create_attach(struct task_struct *p);
+extern void sched_autogroup_detach(struct task_struct *p);
+extern void sched_autogroup_fork(struct signal_struct *sig);
+extern void sched_autogroup_exit(struct signal_struct *sig);
+extern void sched_autogroup_exit_task(struct task_struct *p);
+#ifdef CONFIG_PROC_FS
+extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m);
+extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice);
+#endif
+#else
+static inline void sched_autogroup_create_attach(struct task_struct *p) { }
+static inline void sched_autogroup_detach(struct task_struct *p) { }
+static inline void sched_autogroup_fork(struct signal_struct *sig) { }
+static inline void sched_autogroup_exit(struct signal_struct *sig) { }
+static inline void sched_autogroup_exit_task(struct task_struct *p) { }
+#endif
+
 #endif /* _LINUX_SCHED_AUTOGROUP_H */
-- 
cgit v1.2.3


From 27ddb689909cd0bab30524a5f720ae3a3e55acac Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Wed, 1 Feb 2017 09:53:15 -0800
Subject: PCI: add an API to get node from vector

Next patch will use the API to get the node from vector for nvme device

Signed-off-by: Shaohua Li <shli@fb.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/pci/msi.c   | 16 ++++++++++++++++
 include/linux/pci.h |  6 ++++++
 2 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 980eaf588281..d571bc330686 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1298,6 +1298,22 @@ const struct cpumask *pci_irq_get_affinity(struct pci_dev *dev, int nr)
 }
 EXPORT_SYMBOL(pci_irq_get_affinity);
 
+/**
+ * pci_irq_get_node - return the numa node of a particular msi vector
+ * @pdev:	PCI device to operate on
+ * @vec:	device-relative interrupt vector index (0-based).
+ */
+int pci_irq_get_node(struct pci_dev *pdev, int vec)
+{
+	const struct cpumask *mask;
+
+	mask = pci_irq_get_affinity(pdev, vec);
+	if (mask)
+		return local_memory_node(cpu_to_node(cpumask_first(mask)));
+	return dev_to_node(&pdev->dev);
+}
+EXPORT_SYMBOL(pci_irq_get_node);
+
 struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc)
 {
 	return to_pci_dev(desc->dev);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 282ed32244ce..eb3da1a04e6c 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1323,6 +1323,7 @@ int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
 void pci_free_irq_vectors(struct pci_dev *dev);
 int pci_irq_vector(struct pci_dev *dev, unsigned int nr);
 const struct cpumask *pci_irq_get_affinity(struct pci_dev *pdev, int vec);
+int pci_irq_get_node(struct pci_dev *pdev, int vec);
 
 #else
 static inline int pci_msi_vec_count(struct pci_dev *dev) { return -ENOSYS; }
@@ -1370,6 +1371,11 @@ static inline const struct cpumask *pci_irq_get_affinity(struct pci_dev *pdev,
 {
 	return cpu_possible_mask;
 }
+
+static inline int pci_irq_get_node(struct pci_dev *pdev, int vec)
+{
+	return first_online_node;
+}
 #endif
 
 static inline int
-- 
cgit v1.2.3


From 6bae363ee3057a14eec93440826813603559273a Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Wed, 1 Mar 2017 14:22:10 -0500
Subject: blk-mq: Export blk_mq_freeze_queue_wait

Drivers can start a freeze, so this provides a way to wait for frozen.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c         | 3 ++-
 include/linux/blk-mq.h | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 94593c6282d8..8da2c04bb88f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -75,10 +75,11 @@ void blk_mq_freeze_queue_start(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
 
-static void blk_mq_freeze_queue_wait(struct request_queue *q)
+void blk_mq_freeze_queue_wait(struct request_queue *q)
 {
 	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
 }
+EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
 
 /*
  * Guarantee no request is in use, so we can change any data structure of
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 001d30d727c5..8dacf680c851 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -245,6 +245,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
 void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_unfreeze_queue(struct request_queue *q);
 void blk_mq_freeze_queue_start(struct request_queue *q);
+void blk_mq_freeze_queue_wait(struct request_queue *q);
 int blk_mq_reinit_tagset(struct blk_mq_tag_set *set);
 
 int blk_mq_map_queues(struct blk_mq_tag_set *set);
-- 
cgit v1.2.3


From f91328c40a559362b6e7b7bfee01ca17fda87592 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Wed, 1 Mar 2017 14:22:11 -0500
Subject: blk-mq: Provide freeze queue timeout

A driver may wish to take corrective action if queued requests do not
complete within a set time.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c         | 9 +++++++++
 include/linux/blk-mq.h | 2 ++
 2 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8da2c04bb88f..a5e66a7a3506 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -81,6 +81,15 @@ void blk_mq_freeze_queue_wait(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
 
+int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
+				     unsigned long timeout)
+{
+	return wait_event_timeout(q->mq_freeze_wq,
+					percpu_ref_is_zero(&q->q_usage_counter),
+					timeout);
+}
+EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
+
 /*
  * Guarantee no request is in use, so we can change any data structure of
  * the queue afterward.
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 8dacf680c851..b296a9006117 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -246,6 +246,8 @@ void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_unfreeze_queue(struct request_queue *q);
 void blk_mq_freeze_queue_start(struct request_queue *q);
 void blk_mq_freeze_queue_wait(struct request_queue *q);
+int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
+				     unsigned long timeout);
 int blk_mq_reinit_tagset(struct blk_mq_tag_set *set);
 
 int blk_mq_map_queues(struct blk_mq_tag_set *set);
-- 
cgit v1.2.3


From 474c90156c8dcc2fa815e6716cc9394d7930cb9c Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 2 Mar 2017 12:17:22 -0800
Subject: give up on gcc ilog2() constant optimizations

gcc-7 has an "optimization" pass that completely screws up, and
generates the code expansion for the (impossible) case of calling
ilog2() with a zero constant, even when the code gcc compiles does not
actually have a zero constant.

And we try to generate a compile-time error for anybody doing ilog2() on
a constant where that doesn't make sense (be it zero or negative).  So
now gcc7 will fail the build due to our sanity checking, because it
created that constant-zero case that didn't actually exist in the source
code.

There's a whole long discussion on the kernel mailing about how to work
around this gcc bug.  The gcc people themselevs have discussed their
"feature" in

   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72785

but it's all water under the bridge, because while it looked at one
point like it would be solved by the time gcc7 was released, that was
not to be.

So now we have to deal with this compiler braindamage.

And the only simple approach seems to be to just delete the code that
tries to warn about bad uses of ilog2().

So now "ilog2()" will just return 0 not just for the value 1, but for
any non-positive value too.

It's not like I can recall anybody having ever actually tried to use
this function on any invalid value, but maybe the sanity check just
meant that such code never made it out in public.

Reported-by: Laura Abbott <labbott@redhat.com>
Cc: John Stultz <john.stultz@linaro.org>,
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/log2.h       | 13 ++-----------
 tools/include/linux/log2.h | 13 ++-----------
 2 files changed, 4 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/log2.h b/include/linux/log2.h
index ef3d4f67118c..c373295f359f 100644
--- a/include/linux/log2.h
+++ b/include/linux/log2.h
@@ -15,12 +15,6 @@
 #include <linux/types.h>
 #include <linux/bitops.h>
 
-/*
- * deal with unrepresentable constant logarithms
- */
-extern __attribute__((const, noreturn))
-int ____ilog2_NaN(void);
-
 /*
  * non-constant log of base 2 calculators
  * - the arch may override these in asm/bitops.h if they can be implemented
@@ -85,7 +79,7 @@ unsigned long __rounddown_pow_of_two(unsigned long n)
 #define ilog2(n)				\
 (						\
 	__builtin_constant_p(n) ? (		\
-		(n) < 1 ? ____ilog2_NaN() :	\
+		(n) < 2 ? 0 :			\
 		(n) & (1ULL << 63) ? 63 :	\
 		(n) & (1ULL << 62) ? 62 :	\
 		(n) & (1ULL << 61) ? 61 :	\
@@ -148,10 +142,7 @@ unsigned long __rounddown_pow_of_two(unsigned long n)
 		(n) & (1ULL <<  4) ?  4 :	\
 		(n) & (1ULL <<  3) ?  3 :	\
 		(n) & (1ULL <<  2) ?  2 :	\
-		(n) & (1ULL <<  1) ?  1 :	\
-		(n) & (1ULL <<  0) ?  0 :	\
-		____ilog2_NaN()			\
-				   ) :		\
+		1 ) :				\
 	(sizeof(n) <= 4) ?			\
 	__ilog2_u32(n) :			\
 	__ilog2_u64(n)				\
diff --git a/tools/include/linux/log2.h b/tools/include/linux/log2.h
index 41446668ccce..d5677d39c1e4 100644
--- a/tools/include/linux/log2.h
+++ b/tools/include/linux/log2.h
@@ -12,12 +12,6 @@
 #ifndef _TOOLS_LINUX_LOG2_H
 #define _TOOLS_LINUX_LOG2_H
 
-/*
- * deal with unrepresentable constant logarithms
- */
-extern __attribute__((const, noreturn))
-int ____ilog2_NaN(void);
-
 /*
  * non-constant log of base 2 calculators
  * - the arch may override these in asm/bitops.h if they can be implemented
@@ -78,7 +72,7 @@ unsigned long __rounddown_pow_of_two(unsigned long n)
 #define ilog2(n)				\
 (						\
 	__builtin_constant_p(n) ? (		\
-		(n) < 1 ? ____ilog2_NaN() :	\
+		(n) < 2 ? 0 :			\
 		(n) & (1ULL << 63) ? 63 :	\
 		(n) & (1ULL << 62) ? 62 :	\
 		(n) & (1ULL << 61) ? 61 :	\
@@ -141,10 +135,7 @@ unsigned long __rounddown_pow_of_two(unsigned long n)
 		(n) & (1ULL <<  4) ?  4 :	\
 		(n) & (1ULL <<  3) ?  3 :	\
 		(n) & (1ULL <<  2) ?  2 :	\
-		(n) & (1ULL <<  1) ?  1 :	\
-		(n) & (1ULL <<  0) ?  0 :	\
-		____ilog2_NaN()			\
-				   ) :		\
+		1 ) :				\
 	(sizeof(n) <= 4) ?			\
 	__ilog2_u32(n) :			\
 	__ilog2_u64(n)				\
-- 
cgit v1.2.3


From 68e21be2916b359fd8afb536c1911dc014cfd03e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 1 Feb 2017 19:08:20 +0100
Subject: sched/headers: Move task->mm handling methods to <linux/sched/mm.h>

Move the following task->mm helper APIs into a new header file,
<linux/sched/mm.h>, to further reduce the size and complexity
of <linux/sched.h>.

Here are how the APIs are used in various kernel files:

  # mm_alloc():
  arch/arm/mach-rpc/ecard.c
  fs/exec.c
  include/linux/sched/mm.h
  kernel/fork.c

  # __mmdrop():
  arch/arc/include/asm/mmu_context.h
  include/linux/sched/mm.h
  kernel/fork.c

  # mmdrop():
  arch/arm/mach-rpc/ecard.c
  arch/m68k/sun3/mmu_emu.c
  arch/x86/mm/tlb.c
  drivers/gpu/drm/amd/amdkfd/kfd_process.c
  drivers/gpu/drm/i915/i915_gem_userptr.c
  drivers/infiniband/hw/hfi1/file_ops.c
  drivers/vfio/vfio_iommu_spapr_tce.c
  fs/exec.c
  fs/proc/base.c
  fs/proc/task_mmu.c
  fs/proc/task_nommu.c
  fs/userfaultfd.c
  include/linux/mmu_notifier.h
  include/linux/sched/mm.h
  kernel/fork.c
  kernel/futex.c
  kernel/sched/core.c
  mm/khugepaged.c
  mm/ksm.c
  mm/mmu_context.c
  mm/mmu_notifier.c
  mm/oom_kill.c
  virt/kvm/kvm_main.c

  # mmdrop_async_fn():
  include/linux/sched/mm.h

  # mmdrop_async():
  include/linux/sched/mm.h
  kernel/fork.c

  # mmget_not_zero():
  fs/userfaultfd.c
  include/linux/sched/mm.h
  mm/oom_kill.c

  # mmput():
  arch/arc/include/asm/mmu_context.h
  arch/arc/kernel/troubleshoot.c
  arch/frv/mm/mmu-context.c
  arch/powerpc/platforms/cell/spufs/context.c
  arch/sparc/include/asm/mmu_context_32.h
  drivers/android/binder.c
  drivers/gpu/drm/etnaviv/etnaviv_gem.c
  drivers/gpu/drm/i915/i915_gem_userptr.c
  drivers/infiniband/core/umem.c
  drivers/infiniband/core/umem_odp.c
  drivers/infiniband/core/uverbs_main.c
  drivers/infiniband/hw/mlx4/main.c
  drivers/infiniband/hw/mlx5/main.c
  drivers/infiniband/hw/usnic/usnic_uiom.c
  drivers/iommu/amd_iommu_v2.c
  drivers/iommu/intel-svm.c
  drivers/lguest/lguest_user.c
  drivers/misc/cxl/fault.c
  drivers/misc/mic/scif/scif_rma.c
  drivers/oprofile/buffer_sync.c
  drivers/vfio/vfio_iommu_type1.c
  drivers/vhost/vhost.c
  drivers/xen/gntdev.c
  fs/exec.c
  fs/proc/array.c
  fs/proc/base.c
  fs/proc/task_mmu.c
  fs/proc/task_nommu.c
  fs/userfaultfd.c
  include/linux/sched/mm.h
  kernel/cpuset.c
  kernel/events/core.c
  kernel/events/uprobes.c
  kernel/exit.c
  kernel/fork.c
  kernel/ptrace.c
  kernel/sys.c
  kernel/trace/trace_output.c
  kernel/tsacct.c
  mm/memcontrol.c
  mm/memory.c
  mm/mempolicy.c
  mm/migrate.c
  mm/mmu_notifier.c
  mm/nommu.c
  mm/oom_kill.c
  mm/process_vm_access.c
  mm/rmap.c
  mm/swapfile.c
  mm/util.c
  virt/kvm/async_pf.c

  # mmput_async():
  include/linux/sched/mm.h
  kernel/fork.c
  mm/oom_kill.c

  # get_task_mm():
  arch/arc/kernel/troubleshoot.c
  arch/powerpc/platforms/cell/spufs/context.c
  drivers/android/binder.c
  drivers/gpu/drm/etnaviv/etnaviv_gem.c
  drivers/infiniband/core/umem.c
  drivers/infiniband/core/umem_odp.c
  drivers/infiniband/hw/mlx4/main.c
  drivers/infiniband/hw/mlx5/main.c
  drivers/infiniband/hw/usnic/usnic_uiom.c
  drivers/iommu/amd_iommu_v2.c
  drivers/iommu/intel-svm.c
  drivers/lguest/lguest_user.c
  drivers/misc/cxl/fault.c
  drivers/misc/mic/scif/scif_rma.c
  drivers/oprofile/buffer_sync.c
  drivers/vfio/vfio_iommu_type1.c
  drivers/vhost/vhost.c
  drivers/xen/gntdev.c
  fs/proc/array.c
  fs/proc/base.c
  fs/proc/task_mmu.c
  include/linux/sched/mm.h
  kernel/cpuset.c
  kernel/events/core.c
  kernel/exit.c
  kernel/fork.c
  kernel/ptrace.c
  kernel/sys.c
  kernel/trace/trace_output.c
  kernel/tsacct.c
  mm/memcontrol.c
  mm/memory.c
  mm/mempolicy.c
  mm/migrate.c
  mm/mmu_notifier.c
  mm/nommu.c
  mm/util.c

  # mm_access():
  fs/proc/base.c
  include/linux/sched/mm.h
  kernel/fork.c
  mm/process_vm_access.c

  # mm_release():
  arch/arc/include/asm/mmu_context.h
  fs/exec.c
  include/linux/sched/mm.h
  include/uapi/linux/sched.h
  kernel/exit.c
  kernel/fork.c

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/alpha/kernel/smp.c         |  2 +-
 arch/arc/kernel/smp.c           |  2 +-
 arch/arm/kernel/smp.c           |  2 +-
 arch/arm64/kernel/smp.c         |  2 +-
 arch/blackfin/mach-common/smp.c |  2 +-
 arch/hexagon/kernel/smp.c       |  2 +-
 arch/ia64/kernel/setup.c        |  2 +-
 arch/m32r/kernel/setup.c        |  2 +-
 arch/metag/kernel/smp.c         |  2 +-
 arch/mips/kernel/traps.c        |  2 +-
 arch/parisc/kernel/smp.c        |  2 +-
 arch/powerpc/kernel/smp.c       |  2 +-
 arch/s390/kernel/processor.c    |  2 +
 arch/score/kernel/traps.c       |  1 +
 arch/sh/kernel/smp.c            |  2 +-
 arch/sparc/kernel/leon_smp.c    |  2 +-
 arch/sparc/kernel/smp_64.c      |  2 +-
 arch/sparc/kernel/sun4d_smp.c   |  2 +-
 arch/sparc/kernel/sun4m_smp.c   |  2 +-
 arch/sparc/kernel/traps_32.c    |  2 +-
 arch/sparc/kernel/traps_64.c    |  2 +-
 arch/tile/kernel/smpboot.c      |  2 +-
 arch/x86/kernel/cpu/common.c    |  2 +-
 arch/xtensa/kernel/smp.c        |  1 +
 include/linux/sched.h           | 95 -----------------------------------------
 include/linux/sched/mm.h        | 95 +++++++++++++++++++++++++++++++++++++++++
 26 files changed, 120 insertions(+), 116 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index acb4b146a607..9fc560459ebd 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -14,7 +14,7 @@
 #include <linux/kernel.h>
 #include <linux/kernel_stat.h>
 #include <linux/module.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/mm.h>
 #include <linux/err.h>
 #include <linux/threads.h>
diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c
index b8e8d3944481..f46267153ec2 100644
--- a/arch/arc/kernel/smp.c
+++ b/arch/arc/kernel/smp.c
@@ -13,7 +13,7 @@
  */
 
 #include <linux/spinlock.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/interrupt.h>
 #include <linux/profile.h>
 #include <linux/mm.h>
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index b724cff7ad60..572a8df1b766 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -11,7 +11,7 @@
 #include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/spinlock.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/sched/hotplug.h>
 #include <linux/sched/task_stack.h>
 #include <linux/interrupt.h>
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 83c0a839a6ad..ef1caae02110 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -21,7 +21,7 @@
 #include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/spinlock.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/sched/hotplug.h>
 #include <linux/sched/task_stack.h>
 #include <linux/interrupt.h>
diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c
index 3ac98252d299..b32ddab7966c 100644
--- a/arch/blackfin/mach-common/smp.c
+++ b/arch/blackfin/mach-common/smp.c
@@ -11,7 +11,7 @@
 #include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/spinlock.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/sched/task_stack.h>
 #include <linux/interrupt.h>
 #include <linux/cache.h>
diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c
index 1f63e91a353b..5dbc15549e01 100644
--- a/arch/hexagon/kernel/smp.c
+++ b/arch/hexagon/kernel/smp.c
@@ -25,7 +25,7 @@
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/smp.h>
 #include <linux/spinlock.h>
 #include <linux/cpu.h>
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index 63bef6fc0f3e..23e3fd61e335 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -32,7 +32,7 @@
 #include <linux/cpu.h>
 #include <linux/kernel.h>
 #include <linux/reboot.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/task_stack.h>
 #include <linux/seq_file.h>
diff --git a/arch/m32r/kernel/setup.c b/arch/m32r/kernel/setup.c
index b18bc0bd6544..1a9e977287e6 100644
--- a/arch/m32r/kernel/setup.c
+++ b/arch/m32r/kernel/setup.c
@@ -11,7 +11,7 @@
 #include <linux/kernel.h>
 #include <linux/stddef.h>
 #include <linux/fs.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/ioport.h>
 #include <linux/mm.h>
 #include <linux/bootmem.h>
diff --git a/arch/metag/kernel/smp.c b/arch/metag/kernel/smp.c
index cab2aa64ca82..232a12bf3f99 100644
--- a/arch/metag/kernel/smp.c
+++ b/arch/metag/kernel/smp.c
@@ -12,7 +12,7 @@
 #include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/spinlock.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/sched/hotplug.h>
 #include <linux/sched/task_stack.h>
 #include <linux/interrupt.h>
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index 1a9366a157cb..c7d17cfb32f6 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -23,7 +23,7 @@
 #include <linux/module.h>
 #include <linux/extable.h>
 #include <linux/mm.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/sched/debug.h>
 #include <linux/smp.h>
 #include <linux/spinlock.h>
diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c
index 67b452b41ff6..63365106ea19 100644
--- a/arch/parisc/kernel/smp.c
+++ b/arch/parisc/kernel/smp.c
@@ -21,7 +21,7 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/smp.h>
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index fce17789c675..46f89e66a273 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -19,7 +19,7 @@
 
 #include <linux/kernel.h>
 #include <linux/export.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/sched/topology.h>
 #include <linux/smp.h>
 #include <linux/interrupt.h>
diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c
index bf7854523831..928b929a6261 100644
--- a/arch/s390/kernel/processor.c
+++ b/arch/s390/kernel/processor.c
@@ -8,11 +8,13 @@
 
 #include <linux/cpufeature.h>
 #include <linux/kernel.h>
+#include <linux/sched/mm.h>
 #include <linux/init.h>
 #include <linux/seq_file.h>
 #include <linux/mm_types.h>
 #include <linux/delay.h>
 #include <linux/cpu.h>
+
 #include <asm/diag.h>
 #include <asm/facility.h>
 #include <asm/elf.h>
diff --git a/arch/score/kernel/traps.c b/arch/score/kernel/traps.c
index fe68de6c746c..e359ec675869 100644
--- a/arch/score/kernel/traps.c
+++ b/arch/score/kernel/traps.c
@@ -24,6 +24,7 @@
  */
 
 #include <linux/extable.h>
+#include <linux/sched/mm.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/debug.h>
 #include <linux/mm_types.h>
diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c
index 4abf119c129c..c483422ea4d0 100644
--- a/arch/sh/kernel/smp.c
+++ b/arch/sh/kernel/smp.c
@@ -20,7 +20,7 @@
 #include <linux/module.h>
 #include <linux/cpu.h>
 #include <linux/interrupt.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/sched/hotplug.h>
 #include <linux/atomic.h>
 #include <linux/clockchips.h>
diff --git a/arch/sparc/kernel/leon_smp.c b/arch/sparc/kernel/leon_smp.c
index b99d33797e1d..db7acf27bea2 100644
--- a/arch/sparc/kernel/leon_smp.c
+++ b/arch/sparc/kernel/leon_smp.c
@@ -9,7 +9,7 @@
 #include <asm/head.h>
 
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/threads.h>
 #include <linux/smp.h>
 #include <linux/interrupt.h>
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index 15052d364e04..b3bc0ac757cc 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -5,7 +5,7 @@
 
 #include <linux/export.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/sched/hotplug.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
diff --git a/arch/sparc/kernel/sun4d_smp.c b/arch/sparc/kernel/sun4d_smp.c
index 7b55c50eabe5..af93b50e3ce4 100644
--- a/arch/sparc/kernel/sun4d_smp.c
+++ b/arch/sparc/kernel/sun4d_smp.c
@@ -10,7 +10,7 @@
 #include <linux/interrupt.h>
 #include <linux/profile.h>
 #include <linux/delay.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/cpu.h>
 
 #include <asm/cacheflush.h>
diff --git a/arch/sparc/kernel/sun4m_smp.c b/arch/sparc/kernel/sun4m_smp.c
index 633c4cf6fdb0..5547fcb1d72d 100644
--- a/arch/sparc/kernel/sun4m_smp.c
+++ b/arch/sparc/kernel/sun4m_smp.c
@@ -8,7 +8,7 @@
 #include <linux/interrupt.h>
 #include <linux/profile.h>
 #include <linux/delay.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/cpu.h>
 
 #include <asm/cacheflush.h>
diff --git a/arch/sparc/kernel/traps_32.c b/arch/sparc/kernel/traps_32.c
index 2de72a49308a..466d4aed06c7 100644
--- a/arch/sparc/kernel/traps_32.c
+++ b/arch/sparc/kernel/traps_32.c
@@ -9,7 +9,7 @@
  * I hate traps on the sparc, grrr...
  */
 
-#include <linux/sched.h>  /* for jiffies */
+#include <linux/sched/mm.h>
 #include <linux/sched/debug.h>
 #include <linux/mm_types.h>
 #include <linux/kernel.h>
diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
index 4ff4c35f76b2..196ee5eb4d48 100644
--- a/arch/sparc/kernel/traps_64.c
+++ b/arch/sparc/kernel/traps_64.c
@@ -9,7 +9,7 @@
  */
 
 #include <linux/extable.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/sched/debug.h>
 #include <linux/linkage.h>
 #include <linux/kernel.h>
diff --git a/arch/tile/kernel/smpboot.c b/arch/tile/kernel/smpboot.c
index f3fdd0c39b12..869c22e57561 100644
--- a/arch/tile/kernel/smpboot.c
+++ b/arch/tile/kernel/smpboot.c
@@ -16,7 +16,7 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/sched/task.h>
 #include <linux/kernel_stat.h>
 #include <linux/bootmem.h>
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f2fd8fefc589..b11b38c3b0bd 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -7,7 +7,7 @@
 #include <linux/string.h>
 #include <linux/ctype.h>
 #include <linux/delay.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/task.h>
 #include <linux/init.h>
diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c
index fd894eaa63f3..932d64689bac 100644
--- a/arch/xtensa/kernel/smp.c
+++ b/arch/xtensa/kernel/smp.c
@@ -21,6 +21,7 @@
 #include <linux/irq.h>
 #include <linux/kdebug.h>
 #include <linux/module.h>
+#include <linux/sched/mm.h>
 #include <linux/sched/hotplug.h>
 #include <linux/sched/task_stack.h>
 #include <linux/reboot.h>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 042620729230..d29bbe0ee41f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2379,101 +2379,6 @@ static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig)
 	return sp;
 }
 
-/*
- * Routines for handling mm_structs
- */
-extern struct mm_struct * mm_alloc(void);
-
-/**
- * mmgrab() - Pin a &struct mm_struct.
- * @mm: The &struct mm_struct to pin.
- *
- * Make sure that @mm will not get freed even after the owning task
- * exits. This doesn't guarantee that the associated address space
- * will still exist later on and mmget_not_zero() has to be used before
- * accessing it.
- *
- * This is a preferred way to to pin @mm for a longer/unbounded amount
- * of time.
- *
- * Use mmdrop() to release the reference acquired by mmgrab().
- *
- * See also <Documentation/vm/active_mm.txt> for an in-depth explanation
- * of &mm_struct.mm_count vs &mm_struct.mm_users.
- */
-static inline void mmgrab(struct mm_struct *mm)
-{
-	atomic_inc(&mm->mm_count);
-}
-
-/* mmdrop drops the mm and the page tables */
-extern void __mmdrop(struct mm_struct *);
-static inline void mmdrop(struct mm_struct *mm)
-{
-	if (unlikely(atomic_dec_and_test(&mm->mm_count)))
-		__mmdrop(mm);
-}
-
-static inline void mmdrop_async_fn(struct work_struct *work)
-{
-	struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
-	__mmdrop(mm);
-}
-
-static inline void mmdrop_async(struct mm_struct *mm)
-{
-	if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
-		INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
-		schedule_work(&mm->async_put_work);
-	}
-}
-
-/**
- * mmget() - Pin the address space associated with a &struct mm_struct.
- * @mm: The address space to pin.
- *
- * Make sure that the address space of the given &struct mm_struct doesn't
- * go away. This does not protect against parts of the address space being
- * modified or freed, however.
- *
- * Never use this function to pin this address space for an
- * unbounded/indefinite amount of time.
- *
- * Use mmput() to release the reference acquired by mmget().
- *
- * See also <Documentation/vm/active_mm.txt> for an in-depth explanation
- * of &mm_struct.mm_count vs &mm_struct.mm_users.
- */
-static inline void mmget(struct mm_struct *mm)
-{
-	atomic_inc(&mm->mm_users);
-}
-
-static inline bool mmget_not_zero(struct mm_struct *mm)
-{
-	return atomic_inc_not_zero(&mm->mm_users);
-}
-
-/* mmput gets rid of the mappings and all user-space */
-extern void mmput(struct mm_struct *);
-#ifdef CONFIG_MMU
-/* same as above but performs the slow path from the async context. Can
- * be called from the atomic context as well
- */
-extern void mmput_async(struct mm_struct *);
-#endif
-
-/* Grab a reference to a task's mm, if it is not already going away */
-extern struct mm_struct *get_task_mm(struct task_struct *task);
-/*
- * Grab a reference to a task's mm, if it is not already going away
- * and ptrace_may_access with the mode parameter passed to it
- * succeeds.
- */
-extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
-/* Remove the current tasks stale references to the old mm_struct */
-extern void mm_release(struct task_struct *, struct mm_struct *);
-
 #ifdef CONFIG_HAVE_COPY_THREAD_TLS
 extern int copy_thread_tls(unsigned long, unsigned long, unsigned long,
 			struct task_struct *, unsigned long);
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index d32e3932b2e3..be1ae55f5ab9 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -5,4 +5,99 @@
 #include <linux/mm_types.h>
 #include <linux/gfp.h>
 
+/*
+ * Routines for handling mm_structs
+ */
+extern struct mm_struct * mm_alloc(void);
+
+/**
+ * mmgrab() - Pin a &struct mm_struct.
+ * @mm: The &struct mm_struct to pin.
+ *
+ * Make sure that @mm will not get freed even after the owning task
+ * exits. This doesn't guarantee that the associated address space
+ * will still exist later on and mmget_not_zero() has to be used before
+ * accessing it.
+ *
+ * This is a preferred way to to pin @mm for a longer/unbounded amount
+ * of time.
+ *
+ * Use mmdrop() to release the reference acquired by mmgrab().
+ *
+ * See also <Documentation/vm/active_mm.txt> for an in-depth explanation
+ * of &mm_struct.mm_count vs &mm_struct.mm_users.
+ */
+static inline void mmgrab(struct mm_struct *mm)
+{
+	atomic_inc(&mm->mm_count);
+}
+
+/* mmdrop drops the mm and the page tables */
+extern void __mmdrop(struct mm_struct *);
+static inline void mmdrop(struct mm_struct *mm)
+{
+	if (unlikely(atomic_dec_and_test(&mm->mm_count)))
+		__mmdrop(mm);
+}
+
+static inline void mmdrop_async_fn(struct work_struct *work)
+{
+	struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
+	__mmdrop(mm);
+}
+
+static inline void mmdrop_async(struct mm_struct *mm)
+{
+	if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
+		INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
+		schedule_work(&mm->async_put_work);
+	}
+}
+
+/**
+ * mmget() - Pin the address space associated with a &struct mm_struct.
+ * @mm: The address space to pin.
+ *
+ * Make sure that the address space of the given &struct mm_struct doesn't
+ * go away. This does not protect against parts of the address space being
+ * modified or freed, however.
+ *
+ * Never use this function to pin this address space for an
+ * unbounded/indefinite amount of time.
+ *
+ * Use mmput() to release the reference acquired by mmget().
+ *
+ * See also <Documentation/vm/active_mm.txt> for an in-depth explanation
+ * of &mm_struct.mm_count vs &mm_struct.mm_users.
+ */
+static inline void mmget(struct mm_struct *mm)
+{
+	atomic_inc(&mm->mm_users);
+}
+
+static inline bool mmget_not_zero(struct mm_struct *mm)
+{
+	return atomic_inc_not_zero(&mm->mm_users);
+}
+
+/* mmput gets rid of the mappings and all user-space */
+extern void mmput(struct mm_struct *);
+#ifdef CONFIG_MMU
+/* same as above but performs the slow path from the async context. Can
+ * be called from the atomic context as well
+ */
+extern void mmput_async(struct mm_struct *);
+#endif
+
+/* Grab a reference to a task's mm, if it is not already going away */
+extern struct mm_struct *get_task_mm(struct task_struct *task);
+/*
+ * Grab a reference to a task's mm, if it is not already going away
+ * and ptrace_may_access with the mode parameter passed to it
+ * succeeds.
+ */
+extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
+/* Remove the current tasks stale references to the old mm_struct */
+extern void mm_release(struct task_struct *, struct mm_struct *);
+
 #endif /* _LINUX_SCHED_MM_H */
-- 
cgit v1.2.3


From 11701c6768367294c5086738d49196192aaf3d60 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 1 Feb 2017 19:21:47 +0100
Subject: sched/headers: Move task->mm coredumping related defines and methods
 from <linux/sched.h> to <linux/sched/coredump.h>

This further reduces the size and complexity of <linux/sched.h>.

These are the definitions and APIs that are moved:

  # MMF_*:
  fs/binfmt_elf.c
  fs/binfmt_elf_fdpic.c
  fs/exec.c
  fs/proc/base.c
  include/linux/khugepaged.h
  include/linux/ksm.h
  include/linux/sched/coredump.h
  kernel/events/uprobes.c
  kernel/fork.c
  mm/huge_memory.c
  mm/khugepaged.c
  mm/ksm.c
  mm/memory.c
  mm/oom_kill.c

  # SUID_DUMP_*:
  arch/ia64/include/asm/processor.h
  fs/coredump.c
  fs/exec.c
  fs/proc/internal.h
  include/linux/sched/coredump.h
  kernel/ptrace.c
  kernel/sys.c
  kernel/sysctl.c

  # get_dumpable():
  arch/ia64/include/asm/processor.h
  fs/coredump.c
  fs/exec.c
  fs/proc/internal.h
  include/linux/sched/coredump.h
  kernel/ptrace.c
  kernel/sys.c

  # set_dumpable():
  fs/exec.c
  include/linux/sched/coredump.h
  kernel/cred.c
  kernel/sys.c

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h          | 68 -----------------------------------------
 include/linux/sched/coredump.h | 69 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+), 68 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d29bbe0ee41f..7934cd0acbc7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -361,74 +361,6 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
 static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
 #endif
 
-#define SUID_DUMP_DISABLE	0	/* No setuid dumping */
-#define SUID_DUMP_USER		1	/* Dump as user of process */
-#define SUID_DUMP_ROOT		2	/* Dump as root */
-
-/* mm flags */
-
-/* for SUID_DUMP_* above */
-#define MMF_DUMPABLE_BITS 2
-#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1)
-
-extern void set_dumpable(struct mm_struct *mm, int value);
-/*
- * This returns the actual value of the suid_dumpable flag. For things
- * that are using this for checking for privilege transitions, it must
- * test against SUID_DUMP_USER rather than treating it as a boolean
- * value.
- */
-static inline int __get_dumpable(unsigned long mm_flags)
-{
-	return mm_flags & MMF_DUMPABLE_MASK;
-}
-
-static inline int get_dumpable(struct mm_struct *mm)
-{
-	return __get_dumpable(mm->flags);
-}
-
-/* coredump filter bits */
-#define MMF_DUMP_ANON_PRIVATE	2
-#define MMF_DUMP_ANON_SHARED	3
-#define MMF_DUMP_MAPPED_PRIVATE	4
-#define MMF_DUMP_MAPPED_SHARED	5
-#define MMF_DUMP_ELF_HEADERS	6
-#define MMF_DUMP_HUGETLB_PRIVATE 7
-#define MMF_DUMP_HUGETLB_SHARED  8
-#define MMF_DUMP_DAX_PRIVATE	9
-#define MMF_DUMP_DAX_SHARED	10
-
-#define MMF_DUMP_FILTER_SHIFT	MMF_DUMPABLE_BITS
-#define MMF_DUMP_FILTER_BITS	9
-#define MMF_DUMP_FILTER_MASK \
-	(((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
-#define MMF_DUMP_FILTER_DEFAULT \
-	((1 << MMF_DUMP_ANON_PRIVATE) |	(1 << MMF_DUMP_ANON_SHARED) |\
-	 (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
-
-#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
-# define MMF_DUMP_MASK_DEFAULT_ELF	(1 << MMF_DUMP_ELF_HEADERS)
-#else
-# define MMF_DUMP_MASK_DEFAULT_ELF	0
-#endif
-					/* leave room for more dump flags */
-#define MMF_VM_MERGEABLE	16	/* KSM may merge identical pages */
-#define MMF_VM_HUGEPAGE		17	/* set when VM_HUGEPAGE is set on vma */
-/*
- * This one-shot flag is dropped due to necessity of changing exe once again
- * on NFS restore
- */
-//#define MMF_EXE_FILE_CHANGED	18	/* see prctl_set_mm_exe_file() */
-
-#define MMF_HAS_UPROBES		19	/* has uprobes */
-#define MMF_RECALC_UPROBES	20	/* MMF_HAS_UPROBES can be wrong */
-#define MMF_OOM_SKIP		21	/* mm is of no interest for the OOM killer */
-#define MMF_UNSTABLE		22	/* mm is unstable for copy_from_user */
-#define MMF_HUGE_ZERO_PAGE	23      /* mm has ever used the global huge zero page */
-
-#define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
-
 struct sighand_struct {
 	atomic_t		count;
 	struct k_sigaction	action[_NSIG];
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index ab81e564e076..f46912aa4f4c 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -2,5 +2,74 @@
 #define _LINUX_SCHED_COREDUMP_H
 
 #include <linux/sched.h>
+#include <linux/mm_types.h>
+
+#define SUID_DUMP_DISABLE	0	/* No setuid dumping */
+#define SUID_DUMP_USER		1	/* Dump as user of process */
+#define SUID_DUMP_ROOT		2	/* Dump as root */
+
+/* mm flags */
+
+/* for SUID_DUMP_* above */
+#define MMF_DUMPABLE_BITS 2
+#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1)
+
+extern void set_dumpable(struct mm_struct *mm, int value);
+/*
+ * This returns the actual value of the suid_dumpable flag. For things
+ * that are using this for checking for privilege transitions, it must
+ * test against SUID_DUMP_USER rather than treating it as a boolean
+ * value.
+ */
+static inline int __get_dumpable(unsigned long mm_flags)
+{
+	return mm_flags & MMF_DUMPABLE_MASK;
+}
+
+static inline int get_dumpable(struct mm_struct *mm)
+{
+	return __get_dumpable(mm->flags);
+}
+
+/* coredump filter bits */
+#define MMF_DUMP_ANON_PRIVATE	2
+#define MMF_DUMP_ANON_SHARED	3
+#define MMF_DUMP_MAPPED_PRIVATE	4
+#define MMF_DUMP_MAPPED_SHARED	5
+#define MMF_DUMP_ELF_HEADERS	6
+#define MMF_DUMP_HUGETLB_PRIVATE 7
+#define MMF_DUMP_HUGETLB_SHARED  8
+#define MMF_DUMP_DAX_PRIVATE	9
+#define MMF_DUMP_DAX_SHARED	10
+
+#define MMF_DUMP_FILTER_SHIFT	MMF_DUMPABLE_BITS
+#define MMF_DUMP_FILTER_BITS	9
+#define MMF_DUMP_FILTER_MASK \
+	(((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
+#define MMF_DUMP_FILTER_DEFAULT \
+	((1 << MMF_DUMP_ANON_PRIVATE) |	(1 << MMF_DUMP_ANON_SHARED) |\
+	 (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
+
+#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
+# define MMF_DUMP_MASK_DEFAULT_ELF	(1 << MMF_DUMP_ELF_HEADERS)
+#else
+# define MMF_DUMP_MASK_DEFAULT_ELF	0
+#endif
+					/* leave room for more dump flags */
+#define MMF_VM_MERGEABLE	16	/* KSM may merge identical pages */
+#define MMF_VM_HUGEPAGE		17	/* set when VM_HUGEPAGE is set on vma */
+/*
+ * This one-shot flag is dropped due to necessity of changing exe once again
+ * on NFS restore
+ */
+//#define MMF_EXE_FILE_CHANGED	18	/* see prctl_set_mm_exe_file() */
+
+#define MMF_HAS_UPROBES		19	/* has uprobes */
+#define MMF_RECALC_UPROBES	20	/* MMF_HAS_UPROBES can be wrong */
+#define MMF_OOM_SKIP		21	/* mm is of no interest for the OOM killer */
+#define MMF_UNSTABLE		22	/* mm is unstable for copy_from_user */
+#define MMF_HUGE_ZERO_PAGE	23      /* mm has ever used the global huge zero page */
+
+#define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
 
 #endif /* _LINUX_SCHED_COREDUMP_H */
-- 
cgit v1.2.3


From c3edc4010e9d102eb7b8f17d15c2ebc425fed63c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 2 Feb 2017 08:35:14 +0100
Subject: sched/headers: Move task_struct::signal and task_struct::sighand
 types and accessors into <linux/sched/signal.h>
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

task_struct::signal and task_struct::sighand are pointers, which would normally make it
straightforward to not define those types in sched.h.

That is not so, because the types are accompanied by a myriad of APIs (macros and inline
functions) that dereference them.

Split the types and the APIs out of sched.h and move them into a new header, <linux/sched/signal.h>.

With this change sched.h does not know about 'struct signal' and 'struct sighand' anymore,
trying to put accessors into sched.h as a test fails the following way:

  ./include/linux/sched.h: In function ‘test_signal_types’:
  ./include/linux/sched.h:2461:18: error: dereferencing pointer to incomplete type ‘struct signal_struct’
                    ^

This reduces the size and complexity of sched.h significantly.

Update all headers and .c code that relied on getting the signal handling
functionality from <linux/sched.h> to include <linux/sched/signal.h>.

The list of affected files in the preparatory patch was partly generated by
grepping for the APIs, and partly by doing coverage build testing, both
all[yes|mod|def|no]config builds on 64-bit and 32-bit x86, and an array of
cross-architecture builds.

Nevertheless some (trivial) build breakage is still expected related to rare
Kconfig combinations and in-flight patches to various kernel code, but most
of it should be handled by this patch.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arm/nwfpe/fpmodule.c    |   2 +-
 arch/sh/kernel/cpu/sh4/fpu.c |   3 +-
 drivers/net/tap.c            |   2 +-
 include/linux/sched.h        | 499 +-----------------------------------------
 include/linux/sched/signal.h | 502 +++++++++++++++++++++++++++++++++++++++++++
 kernel/cgroup/cgroup-v1.c    |   1 +
 mm/vmalloc.c                 |   2 +-
 net/smc/af_smc.c             |   2 +
 net/smc/smc_clc.c            |   2 +
 net/smc/smc_close.c          |   2 +
 net/smc/smc_rx.c             |   2 +
 net/smc/smc_tx.c             |   2 +
 12 files changed, 520 insertions(+), 501 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/nwfpe/fpmodule.c b/arch/arm/nwfpe/fpmodule.c
index ec717c190e2c..1365e8650843 100644
--- a/arch/arm/nwfpe/fpmodule.c
+++ b/arch/arm/nwfpe/fpmodule.c
@@ -31,7 +31,7 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/init.h>
 
 #include <asm/thread_notify.h>
diff --git a/arch/sh/kernel/cpu/sh4/fpu.c b/arch/sh/kernel/cpu/sh4/fpu.c
index 69ab4d3c8d41..95fd2dcb83da 100644
--- a/arch/sh/kernel/cpu/sh4/fpu.c
+++ b/arch/sh/kernel/cpu/sh4/fpu.c
@@ -10,8 +10,7 @@
  *
  * FIXME! These routines have not been tested for big endian case.
  */
-#include <linux/sched.h>
-#include <linux/signal.h>
+#include <linux/sched/signal.h>
 #include <linux/io.h>
 #include <cpu/fpu.h>
 #include <asm/processor.h>
diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 35b55a2fa1a1..4d4173d25dd0 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -8,7 +8,7 @@
 #include <linux/module.h>
 #include <linux/skbuff.h>
 #include <linux/cache.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/wait.h>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7934cd0acbc7..c1586104d4c0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -71,6 +71,9 @@ struct blk_plug;
 struct filename;
 struct nameidata;
 
+struct signal_struct;
+struct sighand_struct;
+
 extern unsigned long total_forks;
 extern int nr_threads;
 DECLARE_PER_CPU(unsigned long, process_counts);
@@ -361,13 +364,6 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
 static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
 #endif
 
-struct sighand_struct {
-	atomic_t		count;
-	struct k_sigaction	action[_NSIG];
-	spinlock_t		siglock;
-	wait_queue_head_t	signalfd_wqh;
-};
-
 struct pacct_struct {
 	int			ac_flag;
 	long			ac_exitcode;
@@ -485,195 +481,6 @@ struct thread_group_cputimer {
 #include <linux/rwsem.h>
 struct autogroup;
 
-/*
- * NOTE! "signal_struct" does not have its own
- * locking, because a shared signal_struct always
- * implies a shared sighand_struct, so locking
- * sighand_struct is always a proper superset of
- * the locking of signal_struct.
- */
-struct signal_struct {
-	atomic_t		sigcnt;
-	atomic_t		live;
-	int			nr_threads;
-	struct list_head	thread_head;
-
-	wait_queue_head_t	wait_chldexit;	/* for wait4() */
-
-	/* current thread group signal load-balancing target: */
-	struct task_struct	*curr_target;
-
-	/* shared signal handling: */
-	struct sigpending	shared_pending;
-
-	/* thread group exit support */
-	int			group_exit_code;
-	/* overloaded:
-	 * - notify group_exit_task when ->count is equal to notify_count
-	 * - everyone except group_exit_task is stopped during signal delivery
-	 *   of fatal signals, group_exit_task processes the signal.
-	 */
-	int			notify_count;
-	struct task_struct	*group_exit_task;
-
-	/* thread group stop support, overloads group_exit_code too */
-	int			group_stop_count;
-	unsigned int		flags; /* see SIGNAL_* flags below */
-
-	/*
-	 * PR_SET_CHILD_SUBREAPER marks a process, like a service
-	 * manager, to re-parent orphan (double-forking) child processes
-	 * to this process instead of 'init'. The service manager is
-	 * able to receive SIGCHLD signals and is able to investigate
-	 * the process until it calls wait(). All children of this
-	 * process will inherit a flag if they should look for a
-	 * child_subreaper process at exit.
-	 */
-	unsigned int		is_child_subreaper:1;
-	unsigned int		has_child_subreaper:1;
-
-#ifdef CONFIG_POSIX_TIMERS
-
-	/* POSIX.1b Interval Timers */
-	int			posix_timer_id;
-	struct list_head	posix_timers;
-
-	/* ITIMER_REAL timer for the process */
-	struct hrtimer real_timer;
-	ktime_t it_real_incr;
-
-	/*
-	 * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use
-	 * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these
-	 * values are defined to 0 and 1 respectively
-	 */
-	struct cpu_itimer it[2];
-
-	/*
-	 * Thread group totals for process CPU timers.
-	 * See thread_group_cputimer(), et al, for details.
-	 */
-	struct thread_group_cputimer cputimer;
-
-	/* Earliest-expiration cache. */
-	struct task_cputime cputime_expires;
-
-	struct list_head cpu_timers[3];
-
-#endif
-
-	struct pid *leader_pid;
-
-#ifdef CONFIG_NO_HZ_FULL
-	atomic_t tick_dep_mask;
-#endif
-
-	struct pid *tty_old_pgrp;
-
-	/* boolean value for session group leader */
-	int leader;
-
-	struct tty_struct *tty; /* NULL if no tty */
-
-#ifdef CONFIG_SCHED_AUTOGROUP
-	struct autogroup *autogroup;
-#endif
-	/*
-	 * Cumulative resource counters for dead threads in the group,
-	 * and for reaped dead child processes forked by this group.
-	 * Live threads maintain their own counters and add to these
-	 * in __exit_signal, except for the group leader.
-	 */
-	seqlock_t stats_lock;
-	u64 utime, stime, cutime, cstime;
-	u64 gtime;
-	u64 cgtime;
-	struct prev_cputime prev_cputime;
-	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
-	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
-	unsigned long inblock, oublock, cinblock, coublock;
-	unsigned long maxrss, cmaxrss;
-	struct task_io_accounting ioac;
-
-	/*
-	 * Cumulative ns of schedule CPU time fo dead threads in the
-	 * group, not including a zombie group leader, (This only differs
-	 * from jiffies_to_ns(utime + stime) if sched_clock uses something
-	 * other than jiffies.)
-	 */
-	unsigned long long sum_sched_runtime;
-
-	/*
-	 * We don't bother to synchronize most readers of this at all,
-	 * because there is no reader checking a limit that actually needs
-	 * to get both rlim_cur and rlim_max atomically, and either one
-	 * alone is a single word that can safely be read normally.
-	 * getrlimit/setrlimit use task_lock(current->group_leader) to
-	 * protect this instead of the siglock, because they really
-	 * have no need to disable irqs.
-	 */
-	struct rlimit rlim[RLIM_NLIMITS];
-
-#ifdef CONFIG_BSD_PROCESS_ACCT
-	struct pacct_struct pacct;	/* per-process accounting information */
-#endif
-#ifdef CONFIG_TASKSTATS
-	struct taskstats *stats;
-#endif
-#ifdef CONFIG_AUDIT
-	unsigned audit_tty;
-	struct tty_audit_buf *tty_audit_buf;
-#endif
-
-	/*
-	 * Thread is the potential origin of an oom condition; kill first on
-	 * oom
-	 */
-	bool oom_flag_origin;
-	short oom_score_adj;		/* OOM kill score adjustment */
-	short oom_score_adj_min;	/* OOM kill score adjustment min value.
-					 * Only settable by CAP_SYS_RESOURCE. */
-	struct mm_struct *oom_mm;	/* recorded mm when the thread group got
-					 * killed by the oom killer */
-
-	struct mutex cred_guard_mutex;	/* guard against foreign influences on
-					 * credential calculations
-					 * (notably. ptrace) */
-};
-
-/*
- * Bits in flags field of signal_struct.
- */
-#define SIGNAL_STOP_STOPPED	0x00000001 /* job control stop in effect */
-#define SIGNAL_STOP_CONTINUED	0x00000002 /* SIGCONT since WCONTINUED reap */
-#define SIGNAL_GROUP_EXIT	0x00000004 /* group exit in progress */
-#define SIGNAL_GROUP_COREDUMP	0x00000008 /* coredump in progress */
-/*
- * Pending notifications to parent.
- */
-#define SIGNAL_CLD_STOPPED	0x00000010
-#define SIGNAL_CLD_CONTINUED	0x00000020
-#define SIGNAL_CLD_MASK		(SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED)
-
-#define SIGNAL_UNKILLABLE	0x00000040 /* for init: ignore fatal signals */
-
-#define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \
-			  SIGNAL_STOP_CONTINUED)
-
-static inline void signal_set_stop_flags(struct signal_struct *sig,
-					 unsigned int flags)
-{
-	WARN_ON(sig->flags & (SIGNAL_GROUP_EXIT|SIGNAL_GROUP_COREDUMP));
-	sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags;
-}
-
-/* If true, all threads except ->group_exit_task have pending SIGKILL */
-static inline int signal_group_exit(const struct signal_struct *sig)
-{
-	return	(sig->flags & SIGNAL_GROUP_EXIT) ||
-		(sig->group_exit_task != NULL);
-}
-
 /*
  * Some day this will be a full-fledged user tracking system..
  */
@@ -2126,190 +1933,8 @@ extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
 extern void sched_dead(struct task_struct *p);
 
 extern void proc_caches_init(void);
-extern void flush_signals(struct task_struct *);
-extern void ignore_signals(struct task_struct *);
-extern void flush_signal_handlers(struct task_struct *, int force_default);
-extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
-
-static inline int kernel_dequeue_signal(siginfo_t *info)
-{
-	struct task_struct *tsk = current;
-	siginfo_t __info;
-	int ret;
-
-	spin_lock_irq(&tsk->sighand->siglock);
-	ret = dequeue_signal(tsk, &tsk->blocked, info ?: &__info);
-	spin_unlock_irq(&tsk->sighand->siglock);
-
-	return ret;
-}
-
-static inline void kernel_signal_stop(void)
-{
-	spin_lock_irq(&current->sighand->siglock);
-	if (current->jobctl & JOBCTL_STOP_DEQUEUED)
-		__set_current_state(TASK_STOPPED);
-	spin_unlock_irq(&current->sighand->siglock);
-
-	schedule();
-}
 
 extern void release_task(struct task_struct * p);
-extern int send_sig_info(int, struct siginfo *, struct task_struct *);
-extern int force_sigsegv(int, struct task_struct *);
-extern int force_sig_info(int, struct siginfo *, struct task_struct *);
-extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
-extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid);
-extern int kill_pid_info_as_cred(int, struct siginfo *, struct pid *,
-				const struct cred *, u32);
-extern int kill_pgrp(struct pid *pid, int sig, int priv);
-extern int kill_pid(struct pid *pid, int sig, int priv);
-extern int kill_proc_info(int, struct siginfo *, pid_t);
-extern __must_check bool do_notify_parent(struct task_struct *, int);
-extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
-extern void force_sig(int, struct task_struct *);
-extern int send_sig(int, struct task_struct *, int);
-extern int zap_other_threads(struct task_struct *p);
-extern struct sigqueue *sigqueue_alloc(void);
-extern void sigqueue_free(struct sigqueue *);
-extern int send_sigqueue(struct sigqueue *,  struct task_struct *, int group);
-extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
-
-#ifdef TIF_RESTORE_SIGMASK
-/*
- * Legacy restore_sigmask accessors.  These are inefficient on
- * SMP architectures because they require atomic operations.
- */
-
-/**
- * set_restore_sigmask() - make sure saved_sigmask processing gets done
- *
- * This sets TIF_RESTORE_SIGMASK and ensures that the arch signal code
- * will run before returning to user mode, to process the flag.  For
- * all callers, TIF_SIGPENDING is already set or it's no harm to set
- * it.  TIF_RESTORE_SIGMASK need not be in the set of bits that the
- * arch code will notice on return to user mode, in case those bits
- * are scarce.  We set TIF_SIGPENDING here to ensure that the arch
- * signal code always gets run when TIF_RESTORE_SIGMASK is set.
- */
-static inline void set_restore_sigmask(void)
-{
-	set_thread_flag(TIF_RESTORE_SIGMASK);
-	WARN_ON(!test_thread_flag(TIF_SIGPENDING));
-}
-static inline void clear_restore_sigmask(void)
-{
-	clear_thread_flag(TIF_RESTORE_SIGMASK);
-}
-static inline bool test_restore_sigmask(void)
-{
-	return test_thread_flag(TIF_RESTORE_SIGMASK);
-}
-static inline bool test_and_clear_restore_sigmask(void)
-{
-	return test_and_clear_thread_flag(TIF_RESTORE_SIGMASK);
-}
-
-#else	/* TIF_RESTORE_SIGMASK */
-
-/* Higher-quality implementation, used if TIF_RESTORE_SIGMASK doesn't exist. */
-static inline void set_restore_sigmask(void)
-{
-	current->restore_sigmask = true;
-	WARN_ON(!test_thread_flag(TIF_SIGPENDING));
-}
-static inline void clear_restore_sigmask(void)
-{
-	current->restore_sigmask = false;
-}
-static inline bool test_restore_sigmask(void)
-{
-	return current->restore_sigmask;
-}
-static inline bool test_and_clear_restore_sigmask(void)
-{
-	if (!current->restore_sigmask)
-		return false;
-	current->restore_sigmask = false;
-	return true;
-}
-#endif
-
-static inline void restore_saved_sigmask(void)
-{
-	if (test_and_clear_restore_sigmask())
-		__set_current_blocked(&current->saved_sigmask);
-}
-
-static inline sigset_t *sigmask_to_save(void)
-{
-	sigset_t *res = &current->blocked;
-	if (unlikely(test_restore_sigmask()))
-		res = &current->saved_sigmask;
-	return res;
-}
-
-static inline int kill_cad_pid(int sig, int priv)
-{
-	return kill_pid(cad_pid, sig, priv);
-}
-
-/* These can be the second arg to send_sig_info/send_group_sig_info.  */
-#define SEND_SIG_NOINFO ((struct siginfo *) 0)
-#define SEND_SIG_PRIV	((struct siginfo *) 1)
-#define SEND_SIG_FORCED	((struct siginfo *) 2)
-
-/*
- * True if we are on the alternate signal stack.
- */
-static inline int on_sig_stack(unsigned long sp)
-{
-	/*
-	 * If the signal stack is SS_AUTODISARM then, by construction, we
-	 * can't be on the signal stack unless user code deliberately set
-	 * SS_AUTODISARM when we were already on it.
-	 *
-	 * This improves reliability: if user state gets corrupted such that
-	 * the stack pointer points very close to the end of the signal stack,
-	 * then this check will enable the signal to be handled anyway.
-	 */
-	if (current->sas_ss_flags & SS_AUTODISARM)
-		return 0;
-
-#ifdef CONFIG_STACK_GROWSUP
-	return sp >= current->sas_ss_sp &&
-		sp - current->sas_ss_sp < current->sas_ss_size;
-#else
-	return sp > current->sas_ss_sp &&
-		sp - current->sas_ss_sp <= current->sas_ss_size;
-#endif
-}
-
-static inline int sas_ss_flags(unsigned long sp)
-{
-	if (!current->sas_ss_size)
-		return SS_DISABLE;
-
-	return on_sig_stack(sp) ? SS_ONSTACK : 0;
-}
-
-static inline void sas_ss_reset(struct task_struct *p)
-{
-	p->sas_ss_sp = 0;
-	p->sas_ss_size = 0;
-	p->sas_ss_flags = SS_DISABLE;
-}
-
-static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig)
-{
-	if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! sas_ss_flags(sp))
-#ifdef CONFIG_STACK_GROWSUP
-		return current->sas_ss_sp;
-#else
-		return current->sas_ss_sp + current->sas_ss_size;
-#endif
-	return sp;
-}
 
 #ifdef CONFIG_HAVE_COPY_THREAD_TLS
 extern int copy_thread_tls(unsigned long, unsigned long, unsigned long,
@@ -2338,10 +1963,8 @@ static inline void exit_thread(struct task_struct *tsk)
 #endif
 
 extern void exit_files(struct task_struct *);
-extern void __cleanup_sighand(struct sighand_struct *);
 
 extern void exit_itimers(struct signal_struct *);
-extern void flush_itimer_signals(void);
 
 extern void do_group_exit(int);
 
@@ -2376,81 +1999,6 @@ static inline unsigned long wait_task_inactive(struct task_struct *p,
 }
 #endif
 
-#define tasklist_empty() \
-	list_empty(&init_task.tasks)
-
-#define next_task(p) \
-	list_entry_rcu((p)->tasks.next, struct task_struct, tasks)
-
-#define for_each_process(p) \
-	for (p = &init_task ; (p = next_task(p)) != &init_task ; )
-
-extern bool current_is_single_threaded(void);
-
-/*
- * Careful: do_each_thread/while_each_thread is a double loop so
- *          'break' will not work as expected - use goto instead.
- */
-#define do_each_thread(g, t) \
-	for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do
-
-#define while_each_thread(g, t) \
-	while ((t = next_thread(t)) != g)
-
-#define __for_each_thread(signal, t)	\
-	list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node)
-
-#define for_each_thread(p, t)		\
-	__for_each_thread((p)->signal, t)
-
-/* Careful: this is a double loop, 'break' won't work as expected. */
-#define for_each_process_thread(p, t)	\
-	for_each_process(p) for_each_thread(p, t)
-
-typedef int (*proc_visitor)(struct task_struct *p, void *data);
-void walk_process_tree(struct task_struct *top, proc_visitor, void *);
-
-static inline int get_nr_threads(struct task_struct *tsk)
-{
-	return tsk->signal->nr_threads;
-}
-
-static inline bool thread_group_leader(struct task_struct *p)
-{
-	return p->exit_signal >= 0;
-}
-
-/* Do to the insanities of de_thread it is possible for a process
- * to have the pid of the thread group leader without actually being
- * the thread group leader.  For iteration through the pids in proc
- * all we care about is that we have a task with the appropriate
- * pid, we don't actually care if we have the right task.
- */
-static inline bool has_group_leader_pid(struct task_struct *p)
-{
-	return task_pid(p) == p->signal->leader_pid;
-}
-
-static inline
-bool same_thread_group(struct task_struct *p1, struct task_struct *p2)
-{
-	return p1->signal == p2->signal;
-}
-
-static inline struct task_struct *next_thread(const struct task_struct *p)
-{
-	return list_entry_rcu(p->thread_group.next,
-			      struct task_struct, thread_group);
-}
-
-static inline int thread_group_empty(struct task_struct *p)
-{
-	return list_empty(&p->thread_group);
-}
-
-#define delay_group_leader(p) \
-		(thread_group_leader(p) && !thread_group_empty(p))
-
 /*
  * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
  * subscriptions and synchronises with wait4().  Also used in procfs.  Also
@@ -2471,25 +2019,6 @@ static inline void task_unlock(struct task_struct *p)
 	spin_unlock(&p->alloc_lock);
 }
 
-extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
-							unsigned long *flags);
-
-static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
-						       unsigned long *flags)
-{
-	struct sighand_struct *ret;
-
-	ret = __lock_task_sighand(tsk, flags);
-	(void)__cond_lock(&tsk->sighand->siglock, ret);
-	return ret;
-}
-
-static inline void unlock_task_sighand(struct task_struct *tsk,
-						unsigned long *flags)
-{
-	spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
-}
-
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 
 static inline struct thread_info *task_thread_info(struct task_struct *task)
@@ -2862,28 +2391,6 @@ static inline void mm_update_next_owner(struct mm_struct *mm)
 }
 #endif /* CONFIG_MEMCG */
 
-static inline unsigned long task_rlimit(const struct task_struct *tsk,
-		unsigned int limit)
-{
-	return READ_ONCE(tsk->signal->rlim[limit].rlim_cur);
-}
-
-static inline unsigned long task_rlimit_max(const struct task_struct *tsk,
-		unsigned int limit)
-{
-	return READ_ONCE(tsk->signal->rlim[limit].rlim_max);
-}
-
-static inline unsigned long rlimit(unsigned int limit)
-{
-	return task_rlimit(current, limit);
-}
-
-static inline unsigned long rlimit_max(unsigned int limit)
-{
-	return task_rlimit_max(current, limit);
-}
-
 #define SCHED_CPUFREQ_RT	(1U << 0)
 #define SCHED_CPUFREQ_DL	(1U << 1)
 #define SCHED_CPUFREQ_IOWAIT	(1U << 2)
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index c6958a53fef3..53fe5450f431 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -8,4 +8,506 @@
 #include <linux/sched/jobctl.h>
 #include <linux/sched/task.h>
 
+/*
+ * Types defining task->signal and task->sighand and APIs using them:
+ */
+
+struct sighand_struct {
+	atomic_t		count;
+	struct k_sigaction	action[_NSIG];
+	spinlock_t		siglock;
+	wait_queue_head_t	signalfd_wqh;
+};
+
+/*
+ * NOTE! "signal_struct" does not have its own
+ * locking, because a shared signal_struct always
+ * implies a shared sighand_struct, so locking
+ * sighand_struct is always a proper superset of
+ * the locking of signal_struct.
+ */
+struct signal_struct {
+	atomic_t		sigcnt;
+	atomic_t		live;
+	int			nr_threads;
+	struct list_head	thread_head;
+
+	wait_queue_head_t	wait_chldexit;	/* for wait4() */
+
+	/* current thread group signal load-balancing target: */
+	struct task_struct	*curr_target;
+
+	/* shared signal handling: */
+	struct sigpending	shared_pending;
+
+	/* thread group exit support */
+	int			group_exit_code;
+	/* overloaded:
+	 * - notify group_exit_task when ->count is equal to notify_count
+	 * - everyone except group_exit_task is stopped during signal delivery
+	 *   of fatal signals, group_exit_task processes the signal.
+	 */
+	int			notify_count;
+	struct task_struct	*group_exit_task;
+
+	/* thread group stop support, overloads group_exit_code too */
+	int			group_stop_count;
+	unsigned int		flags; /* see SIGNAL_* flags below */
+
+	/*
+	 * PR_SET_CHILD_SUBREAPER marks a process, like a service
+	 * manager, to re-parent orphan (double-forking) child processes
+	 * to this process instead of 'init'. The service manager is
+	 * able to receive SIGCHLD signals and is able to investigate
+	 * the process until it calls wait(). All children of this
+	 * process will inherit a flag if they should look for a
+	 * child_subreaper process at exit.
+	 */
+	unsigned int		is_child_subreaper:1;
+	unsigned int		has_child_subreaper:1;
+
+#ifdef CONFIG_POSIX_TIMERS
+
+	/* POSIX.1b Interval Timers */
+	int			posix_timer_id;
+	struct list_head	posix_timers;
+
+	/* ITIMER_REAL timer for the process */
+	struct hrtimer real_timer;
+	ktime_t it_real_incr;
+
+	/*
+	 * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use
+	 * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these
+	 * values are defined to 0 and 1 respectively
+	 */
+	struct cpu_itimer it[2];
+
+	/*
+	 * Thread group totals for process CPU timers.
+	 * See thread_group_cputimer(), et al, for details.
+	 */
+	struct thread_group_cputimer cputimer;
+
+	/* Earliest-expiration cache. */
+	struct task_cputime cputime_expires;
+
+	struct list_head cpu_timers[3];
+
+#endif
+
+	struct pid *leader_pid;
+
+#ifdef CONFIG_NO_HZ_FULL
+	atomic_t tick_dep_mask;
+#endif
+
+	struct pid *tty_old_pgrp;
+
+	/* boolean value for session group leader */
+	int leader;
+
+	struct tty_struct *tty; /* NULL if no tty */
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+	struct autogroup *autogroup;
+#endif
+	/*
+	 * Cumulative resource counters for dead threads in the group,
+	 * and for reaped dead child processes forked by this group.
+	 * Live threads maintain their own counters and add to these
+	 * in __exit_signal, except for the group leader.
+	 */
+	seqlock_t stats_lock;
+	u64 utime, stime, cutime, cstime;
+	u64 gtime;
+	u64 cgtime;
+	struct prev_cputime prev_cputime;
+	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
+	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
+	unsigned long inblock, oublock, cinblock, coublock;
+	unsigned long maxrss, cmaxrss;
+	struct task_io_accounting ioac;
+
+	/*
+	 * Cumulative ns of schedule CPU time fo dead threads in the
+	 * group, not including a zombie group leader, (This only differs
+	 * from jiffies_to_ns(utime + stime) if sched_clock uses something
+	 * other than jiffies.)
+	 */
+	unsigned long long sum_sched_runtime;
+
+	/*
+	 * We don't bother to synchronize most readers of this at all,
+	 * because there is no reader checking a limit that actually needs
+	 * to get both rlim_cur and rlim_max atomically, and either one
+	 * alone is a single word that can safely be read normally.
+	 * getrlimit/setrlimit use task_lock(current->group_leader) to
+	 * protect this instead of the siglock, because they really
+	 * have no need to disable irqs.
+	 */
+	struct rlimit rlim[RLIM_NLIMITS];
+
+#ifdef CONFIG_BSD_PROCESS_ACCT
+	struct pacct_struct pacct;	/* per-process accounting information */
+#endif
+#ifdef CONFIG_TASKSTATS
+	struct taskstats *stats;
+#endif
+#ifdef CONFIG_AUDIT
+	unsigned audit_tty;
+	struct tty_audit_buf *tty_audit_buf;
+#endif
+
+	/*
+	 * Thread is the potential origin of an oom condition; kill first on
+	 * oom
+	 */
+	bool oom_flag_origin;
+	short oom_score_adj;		/* OOM kill score adjustment */
+	short oom_score_adj_min;	/* OOM kill score adjustment min value.
+					 * Only settable by CAP_SYS_RESOURCE. */
+	struct mm_struct *oom_mm;	/* recorded mm when the thread group got
+					 * killed by the oom killer */
+
+	struct mutex cred_guard_mutex;	/* guard against foreign influences on
+					 * credential calculations
+					 * (notably. ptrace) */
+};
+
+/*
+ * Bits in flags field of signal_struct.
+ */
+#define SIGNAL_STOP_STOPPED	0x00000001 /* job control stop in effect */
+#define SIGNAL_STOP_CONTINUED	0x00000002 /* SIGCONT since WCONTINUED reap */
+#define SIGNAL_GROUP_EXIT	0x00000004 /* group exit in progress */
+#define SIGNAL_GROUP_COREDUMP	0x00000008 /* coredump in progress */
+/*
+ * Pending notifications to parent.
+ */
+#define SIGNAL_CLD_STOPPED	0x00000010
+#define SIGNAL_CLD_CONTINUED	0x00000020
+#define SIGNAL_CLD_MASK		(SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED)
+
+#define SIGNAL_UNKILLABLE	0x00000040 /* for init: ignore fatal signals */
+
+#define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \
+			  SIGNAL_STOP_CONTINUED)
+
+static inline void signal_set_stop_flags(struct signal_struct *sig,
+					 unsigned int flags)
+{
+	WARN_ON(sig->flags & (SIGNAL_GROUP_EXIT|SIGNAL_GROUP_COREDUMP));
+	sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags;
+}
+
+/* If true, all threads except ->group_exit_task have pending SIGKILL */
+static inline int signal_group_exit(const struct signal_struct *sig)
+{
+	return	(sig->flags & SIGNAL_GROUP_EXIT) ||
+		(sig->group_exit_task != NULL);
+}
+
+extern void flush_signals(struct task_struct *);
+extern void ignore_signals(struct task_struct *);
+extern void flush_signal_handlers(struct task_struct *, int force_default);
+extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
+
+static inline int kernel_dequeue_signal(siginfo_t *info)
+{
+	struct task_struct *tsk = current;
+	siginfo_t __info;
+	int ret;
+
+	spin_lock_irq(&tsk->sighand->siglock);
+	ret = dequeue_signal(tsk, &tsk->blocked, info ?: &__info);
+	spin_unlock_irq(&tsk->sighand->siglock);
+
+	return ret;
+}
+
+static inline void kernel_signal_stop(void)
+{
+	spin_lock_irq(&current->sighand->siglock);
+	if (current->jobctl & JOBCTL_STOP_DEQUEUED)
+		__set_current_state(TASK_STOPPED);
+	spin_unlock_irq(&current->sighand->siglock);
+
+	schedule();
+}
+extern int send_sig_info(int, struct siginfo *, struct task_struct *);
+extern int force_sigsegv(int, struct task_struct *);
+extern int force_sig_info(int, struct siginfo *, struct task_struct *);
+extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
+extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid);
+extern int kill_pid_info_as_cred(int, struct siginfo *, struct pid *,
+				const struct cred *, u32);
+extern int kill_pgrp(struct pid *pid, int sig, int priv);
+extern int kill_pid(struct pid *pid, int sig, int priv);
+extern int kill_proc_info(int, struct siginfo *, pid_t);
+extern __must_check bool do_notify_parent(struct task_struct *, int);
+extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
+extern void force_sig(int, struct task_struct *);
+extern int send_sig(int, struct task_struct *, int);
+extern int zap_other_threads(struct task_struct *p);
+extern struct sigqueue *sigqueue_alloc(void);
+extern void sigqueue_free(struct sigqueue *);
+extern int send_sigqueue(struct sigqueue *,  struct task_struct *, int group);
+extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
+
+#ifdef TIF_RESTORE_SIGMASK
+/*
+ * Legacy restore_sigmask accessors.  These are inefficient on
+ * SMP architectures because they require atomic operations.
+ */
+
+/**
+ * set_restore_sigmask() - make sure saved_sigmask processing gets done
+ *
+ * This sets TIF_RESTORE_SIGMASK and ensures that the arch signal code
+ * will run before returning to user mode, to process the flag.  For
+ * all callers, TIF_SIGPENDING is already set or it's no harm to set
+ * it.  TIF_RESTORE_SIGMASK need not be in the set of bits that the
+ * arch code will notice on return to user mode, in case those bits
+ * are scarce.  We set TIF_SIGPENDING here to ensure that the arch
+ * signal code always gets run when TIF_RESTORE_SIGMASK is set.
+ */
+static inline void set_restore_sigmask(void)
+{
+	set_thread_flag(TIF_RESTORE_SIGMASK);
+	WARN_ON(!test_thread_flag(TIF_SIGPENDING));
+}
+static inline void clear_restore_sigmask(void)
+{
+	clear_thread_flag(TIF_RESTORE_SIGMASK);
+}
+static inline bool test_restore_sigmask(void)
+{
+	return test_thread_flag(TIF_RESTORE_SIGMASK);
+}
+static inline bool test_and_clear_restore_sigmask(void)
+{
+	return test_and_clear_thread_flag(TIF_RESTORE_SIGMASK);
+}
+
+#else	/* TIF_RESTORE_SIGMASK */
+
+/* Higher-quality implementation, used if TIF_RESTORE_SIGMASK doesn't exist. */
+static inline void set_restore_sigmask(void)
+{
+	current->restore_sigmask = true;
+	WARN_ON(!test_thread_flag(TIF_SIGPENDING));
+}
+static inline void clear_restore_sigmask(void)
+{
+	current->restore_sigmask = false;
+}
+static inline bool test_restore_sigmask(void)
+{
+	return current->restore_sigmask;
+}
+static inline bool test_and_clear_restore_sigmask(void)
+{
+	if (!current->restore_sigmask)
+		return false;
+	current->restore_sigmask = false;
+	return true;
+}
+#endif
+
+static inline void restore_saved_sigmask(void)
+{
+	if (test_and_clear_restore_sigmask())
+		__set_current_blocked(&current->saved_sigmask);
+}
+
+static inline sigset_t *sigmask_to_save(void)
+{
+	sigset_t *res = &current->blocked;
+	if (unlikely(test_restore_sigmask()))
+		res = &current->saved_sigmask;
+	return res;
+}
+
+static inline int kill_cad_pid(int sig, int priv)
+{
+	return kill_pid(cad_pid, sig, priv);
+}
+
+/* These can be the second arg to send_sig_info/send_group_sig_info.  */
+#define SEND_SIG_NOINFO ((struct siginfo *) 0)
+#define SEND_SIG_PRIV	((struct siginfo *) 1)
+#define SEND_SIG_FORCED	((struct siginfo *) 2)
+
+/*
+ * True if we are on the alternate signal stack.
+ */
+static inline int on_sig_stack(unsigned long sp)
+{
+	/*
+	 * If the signal stack is SS_AUTODISARM then, by construction, we
+	 * can't be on the signal stack unless user code deliberately set
+	 * SS_AUTODISARM when we were already on it.
+	 *
+	 * This improves reliability: if user state gets corrupted such that
+	 * the stack pointer points very close to the end of the signal stack,
+	 * then this check will enable the signal to be handled anyway.
+	 */
+	if (current->sas_ss_flags & SS_AUTODISARM)
+		return 0;
+
+#ifdef CONFIG_STACK_GROWSUP
+	return sp >= current->sas_ss_sp &&
+		sp - current->sas_ss_sp < current->sas_ss_size;
+#else
+	return sp > current->sas_ss_sp &&
+		sp - current->sas_ss_sp <= current->sas_ss_size;
+#endif
+}
+
+static inline int sas_ss_flags(unsigned long sp)
+{
+	if (!current->sas_ss_size)
+		return SS_DISABLE;
+
+	return on_sig_stack(sp) ? SS_ONSTACK : 0;
+}
+
+static inline void sas_ss_reset(struct task_struct *p)
+{
+	p->sas_ss_sp = 0;
+	p->sas_ss_size = 0;
+	p->sas_ss_flags = SS_DISABLE;
+}
+
+static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig)
+{
+	if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! sas_ss_flags(sp))
+#ifdef CONFIG_STACK_GROWSUP
+		return current->sas_ss_sp;
+#else
+		return current->sas_ss_sp + current->sas_ss_size;
+#endif
+	return sp;
+}
+
+extern void __cleanup_sighand(struct sighand_struct *);
+extern void flush_itimer_signals(void);
+
+#define tasklist_empty() \
+	list_empty(&init_task.tasks)
+
+#define next_task(p) \
+	list_entry_rcu((p)->tasks.next, struct task_struct, tasks)
+
+#define for_each_process(p) \
+	for (p = &init_task ; (p = next_task(p)) != &init_task ; )
+
+extern bool current_is_single_threaded(void);
+
+/*
+ * Careful: do_each_thread/while_each_thread is a double loop so
+ *          'break' will not work as expected - use goto instead.
+ */
+#define do_each_thread(g, t) \
+	for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do
+
+#define while_each_thread(g, t) \
+	while ((t = next_thread(t)) != g)
+
+#define __for_each_thread(signal, t)	\
+	list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node)
+
+#define for_each_thread(p, t)		\
+	__for_each_thread((p)->signal, t)
+
+/* Careful: this is a double loop, 'break' won't work as expected. */
+#define for_each_process_thread(p, t)	\
+	for_each_process(p) for_each_thread(p, t)
+
+typedef int (*proc_visitor)(struct task_struct *p, void *data);
+void walk_process_tree(struct task_struct *top, proc_visitor, void *);
+
+static inline int get_nr_threads(struct task_struct *tsk)
+{
+	return tsk->signal->nr_threads;
+}
+
+static inline bool thread_group_leader(struct task_struct *p)
+{
+	return p->exit_signal >= 0;
+}
+
+/* Do to the insanities of de_thread it is possible for a process
+ * to have the pid of the thread group leader without actually being
+ * the thread group leader.  For iteration through the pids in proc
+ * all we care about is that we have a task with the appropriate
+ * pid, we don't actually care if we have the right task.
+ */
+static inline bool has_group_leader_pid(struct task_struct *p)
+{
+	return task_pid(p) == p->signal->leader_pid;
+}
+
+static inline
+bool same_thread_group(struct task_struct *p1, struct task_struct *p2)
+{
+	return p1->signal == p2->signal;
+}
+
+static inline struct task_struct *next_thread(const struct task_struct *p)
+{
+	return list_entry_rcu(p->thread_group.next,
+			      struct task_struct, thread_group);
+}
+
+static inline int thread_group_empty(struct task_struct *p)
+{
+	return list_empty(&p->thread_group);
+}
+
+#define delay_group_leader(p) \
+		(thread_group_leader(p) && !thread_group_empty(p))
+
+extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
+							unsigned long *flags);
+
+static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
+						       unsigned long *flags)
+{
+	struct sighand_struct *ret;
+
+	ret = __lock_task_sighand(tsk, flags);
+	(void)__cond_lock(&tsk->sighand->siglock, ret);
+	return ret;
+}
+
+static inline void unlock_task_sighand(struct task_struct *tsk,
+						unsigned long *flags)
+{
+	spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
+}
+
+static inline unsigned long task_rlimit(const struct task_struct *tsk,
+		unsigned int limit)
+{
+	return READ_ONCE(tsk->signal->rlim[limit].rlim_cur);
+}
+
+static inline unsigned long task_rlimit_max(const struct task_struct *tsk,
+		unsigned int limit)
+{
+	return READ_ONCE(tsk->signal->rlim[limit].rlim_max);
+}
+
+static inline unsigned long rlimit(unsigned int limit)
+{
+	return task_rlimit(current, limit);
+}
+
+static inline unsigned long rlimit_max(unsigned int limit)
+{
+	return task_rlimit_max(current, limit);
+}
+
 #endif /* _LINUX_SCHED_SIGNAL_H */
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index fc34bcf2329f..08d2cb605101 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -5,6 +5,7 @@
 #include <linux/sort.h>
 #include <linux/delay.h>
 #include <linux/mm.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/delayacct.h>
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index be93949b4885..b4024d688f38 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -12,7 +12,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/highmem.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 5d4208ad029e..85837ab90e89 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -27,6 +27,8 @@
 #include <linux/inetdevice.h>
 #include <linux/workqueue.h>
 #include <linux/in.h>
+#include <linux/sched/signal.h>
+
 #include <net/sock.h>
 #include <net/tcp.h>
 #include <net/smc.h>
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index cc6b6f8651eb..e41f594a1e1d 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -11,6 +11,8 @@
 
 #include <linux/in.h>
 #include <linux/if_ether.h>
+#include <linux/sched/signal.h>
+
 #include <net/sock.h>
 #include <net/tcp.h>
 
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index 03dfcc6b7661..67a71d170bed 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -9,6 +9,8 @@
  */
 
 #include <linux/workqueue.h>
+#include <linux/sched/signal.h>
+
 #include <net/sock.h>
 
 #include "smc.h"
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
index 5d1878732f46..c4ef9a4ec569 100644
--- a/net/smc/smc_rx.c
+++ b/net/smc/smc_rx.c
@@ -11,6 +11,8 @@
 
 #include <linux/net.h>
 #include <linux/rcupdate.h>
+#include <linux/sched/signal.h>
+
 #include <net/sock.h>
 
 #include "smc.h"
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 6e73b28915ea..69a0013dd25c 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -15,6 +15,8 @@
 #include <linux/net.h>
 #include <linux/rcupdate.h>
 #include <linux/workqueue.h>
+#include <linux/sched/signal.h>
+
 #include <net/sock.h>
 
 #include "smc.h"
-- 
cgit v1.2.3


From bcbb6a5bf7df6e37ba652d1f426eab042ec4f56b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 2 Feb 2017 10:22:42 +0100
Subject: sched/headers: Move 'struct user_struct' definition and APIs to the
 new <linux/sched/user.h> header

'struct user_struct' was added to sched.h historically, but it's actually
entirely independent of task_struct and of scheduler details, so move
it to its own header.

Fix up .c files using those facilities.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h      | 53 ---------------------------------------------
 include/linux/sched/user.h | 54 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c1586104d4c0..71efbcafaa31 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -349,7 +349,6 @@ extern void io_schedule(void);
 void __noreturn do_task_dead(void);
 
 struct nsproxy;
-struct user_namespace;
 
 #ifdef CONFIG_MMU
 extern void arch_pick_mmap_layout(struct mm_struct *mm);
@@ -481,49 +480,6 @@ struct thread_group_cputimer {
 #include <linux/rwsem.h>
 struct autogroup;
 
-/*
- * Some day this will be a full-fledged user tracking system..
- */
-struct user_struct {
-	atomic_t __count;	/* reference count */
-	atomic_t processes;	/* How many processes does this user have? */
-	atomic_t sigpending;	/* How many pending signals does this user have? */
-#ifdef CONFIG_FANOTIFY
-	atomic_t fanotify_listeners;
-#endif
-#ifdef CONFIG_EPOLL
-	atomic_long_t epoll_watches; /* The number of file descriptors currently watched */
-#endif
-#ifdef CONFIG_POSIX_MQUEUE
-	/* protected by mq_lock	*/
-	unsigned long mq_bytes;	/* How many bytes can be allocated to mqueue? */
-#endif
-	unsigned long locked_shm; /* How many pages of mlocked shm ? */
-	unsigned long unix_inflight;	/* How many files in flight in unix sockets */
-	atomic_long_t pipe_bufs;  /* how many pages are allocated in pipe buffers */
-
-#ifdef CONFIG_KEYS
-	struct key *uid_keyring;	/* UID specific keyring */
-	struct key *session_keyring;	/* UID's default session keyring */
-#endif
-
-	/* Hash table maintenance information */
-	struct hlist_node uidhash_node;
-	kuid_t uid;
-
-#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL)
-	atomic_long_t locked_vm;
-#endif
-};
-
-extern int uids_sysfs_init(void);
-
-extern struct user_struct *find_user(kuid_t);
-
-extern struct user_struct root_user;
-#define INIT_USER (&root_user)
-
-
 struct backing_dev_info;
 struct reclaim_state;
 
@@ -1908,15 +1864,6 @@ extern struct task_struct *find_task_by_vpid(pid_t nr);
 extern struct task_struct *find_task_by_pid_ns(pid_t nr,
 		struct pid_namespace *ns);
 
-/* per-UID process charging. */
-extern struct user_struct * alloc_uid(kuid_t);
-static inline struct user_struct *get_uid(struct user_struct *u)
-{
-	atomic_inc(&u->__count);
-	return u;
-}
-extern void free_uid(struct user_struct *);
-
 #include <asm/current.h>
 
 extern void xtime_update(unsigned long ticks);
diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index 83238e5ddadd..40c6363da5ef 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -3,4 +3,58 @@
 
 #include <linux/sched.h>
 
+struct key;
+
+/*
+ * Some day this will be a full-fledged user tracking system..
+ */
+struct user_struct {
+	atomic_t __count;	/* reference count */
+	atomic_t processes;	/* How many processes does this user have? */
+	atomic_t sigpending;	/* How many pending signals does this user have? */
+#ifdef CONFIG_FANOTIFY
+	atomic_t fanotify_listeners;
+#endif
+#ifdef CONFIG_EPOLL
+	atomic_long_t epoll_watches; /* The number of file descriptors currently watched */
+#endif
+#ifdef CONFIG_POSIX_MQUEUE
+	/* protected by mq_lock	*/
+	unsigned long mq_bytes;	/* How many bytes can be allocated to mqueue? */
+#endif
+	unsigned long locked_shm; /* How many pages of mlocked shm ? */
+	unsigned long unix_inflight;	/* How many files in flight in unix sockets */
+	atomic_long_t pipe_bufs;  /* how many pages are allocated in pipe buffers */
+
+#ifdef CONFIG_KEYS
+	struct key *uid_keyring;	/* UID specific keyring */
+	struct key *session_keyring;	/* UID's default session keyring */
+#endif
+
+	/* Hash table maintenance information */
+	struct hlist_node uidhash_node;
+	kuid_t uid;
+
+#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL)
+	atomic_long_t locked_vm;
+#endif
+};
+
+extern int uids_sysfs_init(void);
+
+extern struct user_struct *find_user(kuid_t);
+
+extern struct user_struct root_user;
+#define INIT_USER (&root_user)
+
+
+/* per-UID process charging. */
+extern struct user_struct * alloc_uid(kuid_t);
+static inline struct user_struct *get_uid(struct user_struct *u)
+{
+	atomic_inc(&u->__count);
+	return u;
+}
+extern void free_uid(struct user_struct *);
+
 #endif /* _LINUX_SCHED_USER_H */
-- 
cgit v1.2.3


From d151b27d3f86589308cd8c891fdfd2db5f8e80d6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 2 Feb 2017 11:17:23 +0100
Subject: sched/headers: Move softlockup detector watchdog methods to
 <linux/nmi.h>

These methods don't belong into <linux/sched.h>, they are neither directly
related to task_struct or are scheduler functionality.

Put them next to the other watchdog methods in <linux/nmi.h>.

( Arguably that header's name is a misnomer, and this patch makes it
  more so - but it should be renamed in another patch. )

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/nmi.h   | 37 +++++++++++++++++++++++++++++++++++++
 include/linux/sched.h | 37 -------------------------------------
 2 files changed, 37 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 0a3fadc32693..aa3cd0878270 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -7,6 +7,43 @@
 #include <linux/sched.h>
 #include <asm/irq.h>
 
+#ifdef CONFIG_LOCKUP_DETECTOR
+extern void touch_softlockup_watchdog_sched(void);
+extern void touch_softlockup_watchdog(void);
+extern void touch_softlockup_watchdog_sync(void);
+extern void touch_all_softlockup_watchdogs(void);
+extern int proc_dowatchdog_thresh(struct ctl_table *table, int write,
+				  void __user *buffer,
+				  size_t *lenp, loff_t *ppos);
+extern unsigned int  softlockup_panic;
+extern unsigned int  hardlockup_panic;
+void lockup_detector_init(void);
+#else
+static inline void touch_softlockup_watchdog_sched(void)
+{
+}
+static inline void touch_softlockup_watchdog(void)
+{
+}
+static inline void touch_softlockup_watchdog_sync(void)
+{
+}
+static inline void touch_all_softlockup_watchdogs(void)
+{
+}
+static inline void lockup_detector_init(void)
+{
+}
+#endif
+
+#ifdef CONFIG_DETECT_HUNG_TASK
+void reset_hung_task_detector(void);
+#else
+static inline void reset_hung_task_detector(void)
+{
+}
+#endif
+
 /*
  * The run state of the lockup detectors is controlled by the content of the
  * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 71efbcafaa31..8a1d296c53a0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -286,43 +286,6 @@ extern int sched_cpu_dying(unsigned int cpu);
 
 extern void sched_show_task(struct task_struct *p);
 
-#ifdef CONFIG_LOCKUP_DETECTOR
-extern void touch_softlockup_watchdog_sched(void);
-extern void touch_softlockup_watchdog(void);
-extern void touch_softlockup_watchdog_sync(void);
-extern void touch_all_softlockup_watchdogs(void);
-extern int proc_dowatchdog_thresh(struct ctl_table *table, int write,
-				  void __user *buffer,
-				  size_t *lenp, loff_t *ppos);
-extern unsigned int  softlockup_panic;
-extern unsigned int  hardlockup_panic;
-void lockup_detector_init(void);
-#else
-static inline void touch_softlockup_watchdog_sched(void)
-{
-}
-static inline void touch_softlockup_watchdog(void)
-{
-}
-static inline void touch_softlockup_watchdog_sync(void)
-{
-}
-static inline void touch_all_softlockup_watchdogs(void)
-{
-}
-static inline void lockup_detector_init(void)
-{
-}
-#endif
-
-#ifdef CONFIG_DETECT_HUNG_TASK
-void reset_hung_task_detector(void);
-#else
-static inline void reset_hung_task_detector(void)
-{
-}
-#endif
-
 /* Attach to any functions which should be ignored in wchan output. */
 #define __sched		__attribute__((__section__(".sched.text")))
 
-- 
cgit v1.2.3


From 8d88460edc05224b8d7ae3372173153876a02825 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 2 Feb 2017 12:06:10 +0100
Subject: sched/headers: Move 'struct pacct_struct' and 'struct cpu_itimer'
 form <linux/sched.h> to <linux/sched/signal.h>

These structures are actually part of 'struct signal', so move them to <linux/sched/signal.h>
where they belong.

This further decreases the size and complexity of <linux/sched.h>.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h        | 13 -------------
 include/linux/sched/signal.h | 16 ++++++++++++++++
 2 files changed, 16 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8a1d296c53a0..8bf111efe98e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -326,19 +326,6 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
 static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
 #endif
 
-struct pacct_struct {
-	int			ac_flag;
-	long			ac_exitcode;
-	unsigned long		ac_mem;
-	u64			ac_utime, ac_stime;
-	unsigned long		ac_minflt, ac_majflt;
-};
-
-struct cpu_itimer {
-	u64 expires;
-	u64 incr;
-};
-
 /**
  * struct prev_cputime - snaphsot of system and user cputime
  * @utime: time spent in user mode
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 53fe5450f431..30e7ceed3ef6 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -19,6 +19,22 @@ struct sighand_struct {
 	wait_queue_head_t	signalfd_wqh;
 };
 
+/*
+ * Per-process accounting stats:
+ */
+struct pacct_struct {
+	int			ac_flag;
+	long			ac_exitcode;
+	unsigned long		ac_mem;
+	u64			ac_utime, ac_stime;
+	unsigned long		ac_minflt, ac_majflt;
+};
+
+struct cpu_itimer {
+	u64 expires;
+	u64 incr;
+};
+
 /*
  * NOTE! "signal_struct" does not have its own
  * locking, because a shared signal_struct always
-- 
cgit v1.2.3


From 7284c6d419b5a6ae9806927f1fd4f0cfd19a16f5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 2 Feb 2017 12:14:19 +0100
Subject: sched/headers: Move the cpufreq interfaces to <linux/sched/cpufreq.h>

No need to have this in the generic <linux/sched.h> header.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h         | 17 -----------------
 include/linux/sched/cpufreq.h | 21 +++++++++++++++++++++
 2 files changed, 21 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8bf111efe98e..aae79706ec4f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2288,21 +2288,4 @@ static inline void mm_update_next_owner(struct mm_struct *mm)
 }
 #endif /* CONFIG_MEMCG */
 
-#define SCHED_CPUFREQ_RT	(1U << 0)
-#define SCHED_CPUFREQ_DL	(1U << 1)
-#define SCHED_CPUFREQ_IOWAIT	(1U << 2)
-
-#define SCHED_CPUFREQ_RT_DL	(SCHED_CPUFREQ_RT | SCHED_CPUFREQ_DL)
-
-#ifdef CONFIG_CPU_FREQ
-struct update_util_data {
-       void (*func)(struct update_util_data *data, u64 time, unsigned int flags);
-};
-
-void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
-                       void (*func)(struct update_util_data *data, u64 time,
-				    unsigned int flags));
-void cpufreq_remove_update_util_hook(int cpu);
-#endif /* CONFIG_CPU_FREQ */
-
 #endif
diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h
index 07ed9664fa20..2bdcfbfc4c30 100644
--- a/include/linux/sched/cpufreq.h
+++ b/include/linux/sched/cpufreq.h
@@ -3,4 +3,25 @@
 
 #include <linux/sched.h>
 
+/*
+ * Interface between cpufreq drivers and the scheduler:
+ */
+
+#define SCHED_CPUFREQ_RT	(1U << 0)
+#define SCHED_CPUFREQ_DL	(1U << 1)
+#define SCHED_CPUFREQ_IOWAIT	(1U << 2)
+
+#define SCHED_CPUFREQ_RT_DL	(SCHED_CPUFREQ_RT | SCHED_CPUFREQ_DL)
+
+#ifdef CONFIG_CPU_FREQ
+struct update_util_data {
+       void (*func)(struct update_util_data *data, u64 time, unsigned int flags);
+};
+
+void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
+                       void (*func)(struct update_util_data *data, u64 time,
+				    unsigned int flags));
+void cpufreq_remove_update_util_hook(int cpu);
+#endif /* CONFIG_CPU_FREQ */
+
 #endif /* _LINUX_SCHED_CPUFREQ_H */
-- 
cgit v1.2.3


From 4240c8bf877f1145571106a2934c5cea0b51b178 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 2 Feb 2017 12:18:24 +0100
Subject: sched/headers: Move more mm_struct related functionality from
 <linux/sched.h> to <linux/sched/mm.h>

Neither the mmap_layout nor the mm_update_next_owner() methods need to be
in <linux/sched.h> - move them to the more appropriate <linux/sched/mm.h> header.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h    | 21 ---------------------
 include/linux/sched/mm.h | 21 +++++++++++++++++++++
 2 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index aae79706ec4f..1ddb82be6b50 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -313,19 +313,6 @@ void __noreturn do_task_dead(void);
 
 struct nsproxy;
 
-#ifdef CONFIG_MMU
-extern void arch_pick_mmap_layout(struct mm_struct *mm);
-extern unsigned long
-arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
-		       unsigned long, unsigned long);
-extern unsigned long
-arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
-			  unsigned long len, unsigned long pgoff,
-			  unsigned long flags);
-#else
-static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
-#endif
-
 /**
  * struct prev_cputime - snaphsot of system and user cputime
  * @utime: time spent in user mode
@@ -2280,12 +2267,4 @@ static inline void inc_syscw(struct task_struct *tsk)
 #define TASK_SIZE_OF(tsk)	TASK_SIZE
 #endif
 
-#ifdef CONFIG_MEMCG
-extern void mm_update_next_owner(struct mm_struct *mm);
-#else
-static inline void mm_update_next_owner(struct mm_struct *mm)
-{
-}
-#endif /* CONFIG_MEMCG */
-
 #endif
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index be1ae55f5ab9..93fad9f0f466 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -100,4 +100,25 @@ extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
 /* Remove the current tasks stale references to the old mm_struct */
 extern void mm_release(struct task_struct *, struct mm_struct *);
 
+#ifdef CONFIG_MEMCG
+extern void mm_update_next_owner(struct mm_struct *mm);
+#else
+static inline void mm_update_next_owner(struct mm_struct *mm)
+{
+}
+#endif /* CONFIG_MEMCG */
+
+#ifdef CONFIG_MMU
+extern void arch_pick_mmap_layout(struct mm_struct *mm);
+extern unsigned long
+arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
+		       unsigned long, unsigned long);
+extern unsigned long
+arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
+			  unsigned long len, unsigned long pgoff,
+			  unsigned long flags);
+#else
+static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
+#endif
+
 #endif /* _LINUX_SCHED_MM_H */
-- 
cgit v1.2.3


From abe722a1c59728c6c0ea4e4d5efcfe397c8abebc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 2 Feb 2017 12:27:56 +0100
Subject: sched/headers: Move the 'init_mm' declaration from <linux/sched.h> to
 <linux/mm_types.h>

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/mm_types.h | 2 ++
 include/linux/sched.h    | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6daaf0a51968..28bc710d3467 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -546,6 +546,8 @@ struct mm_struct {
 	struct work_struct async_put_work;
 };
 
+extern struct mm_struct init_mm;
+
 static inline void mm_init_cpumask(struct mm_struct *mm)
 {
 #ifdef CONFIG_CPUMASK_OFFSTACK
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1ddb82be6b50..253d8ab6256c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1782,8 +1782,6 @@ static inline int kstack_end(void *addr)
 extern union thread_union init_thread_union;
 extern struct task_struct init_task;
 
-extern struct   mm_struct init_mm;
-
 extern struct pid_namespace init_pid_ns;
 
 /*
-- 
cgit v1.2.3


From d026ce796cbca3c49678a68bb4a39fb4b9cf8192 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 2 Feb 2017 12:32:21 +0100
Subject: sched/headers: Move in_vfork() from <linux/sched.h> to
 <linux/sched/mm.h>

The in_vfork() function deals with task->mm, so it better belongs
into <linux/sched/mm.h>.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h    | 26 --------------------------
 include/linux/sched/mm.h | 26 ++++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 253d8ab6256c..aa60812b4b7a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1242,32 +1242,6 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
 #define TNF_FAULT_LOCAL	0x08
 #define TNF_MIGRATE_FAIL 0x10
 
-static inline bool in_vfork(struct task_struct *tsk)
-{
-	bool ret;
-
-	/*
-	 * need RCU to access ->real_parent if CLONE_VM was used along with
-	 * CLONE_PARENT.
-	 *
-	 * We check real_parent->mm == tsk->mm because CLONE_VFORK does not
-	 * imply CLONE_VM
-	 *
-	 * CLONE_VFORK can be used with CLONE_PARENT/CLONE_THREAD and thus
-	 * ->real_parent is not necessarily the task doing vfork(), so in
-	 * theory we can't rely on task_lock() if we want to dereference it.
-	 *
-	 * And in this case we can't trust the real_parent->mm == tsk->mm
-	 * check, it can be false negative. But we do not care, if init or
-	 * another oom-unkillable task does this it should blame itself.
-	 */
-	rcu_read_lock();
-	ret = tsk->vfork_done && tsk->real_parent->mm == tsk->mm;
-	rcu_read_unlock();
-
-	return ret;
-}
-
 #ifdef CONFIG_NUMA_BALANCING
 extern void task_numa_fault(int last_node, int node, int pages, int flags);
 extern pid_t task_numa_group_id(struct task_struct *p);
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 93fad9f0f466..64d83fa8d93a 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -121,4 +121,30 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
 static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
 #endif
 
+static inline bool in_vfork(struct task_struct *tsk)
+{
+	bool ret;
+
+	/*
+	 * need RCU to access ->real_parent if CLONE_VM was used along with
+	 * CLONE_PARENT.
+	 *
+	 * We check real_parent->mm == tsk->mm because CLONE_VFORK does not
+	 * imply CLONE_VM
+	 *
+	 * CLONE_VFORK can be used with CLONE_PARENT/CLONE_THREAD and thus
+	 * ->real_parent is not necessarily the task doing vfork(), so in
+	 * theory we can't rely on task_lock() if we want to dereference it.
+	 *
+	 * And in this case we can't trust the real_parent->mm == tsk->mm
+	 * check, it can be false negative. But we do not care, if init or
+	 * another oom-unkillable task does this it should blame itself.
+	 */
+	rcu_read_lock();
+	ret = tsk->vfork_done && tsk->real_parent->mm == tsk->mm;
+	rcu_read_unlock();
+
+	return ret;
+}
+
 #endif /* _LINUX_SCHED_MM_H */
-- 
cgit v1.2.3


From 5647028d550c959577527cec9c0f29fbcea029ea Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 2 Feb 2017 12:39:17 +0100
Subject: sched/headers: Move the NUMA balancing interfaces from
 <linux/sched.h> to <linux/sched/numa_balancing.h>

Split out the interface between the scheduler and the MM which
deals with page fault driven NUMA balancing, into the new
<linux/sched/numa_balancing.h> header.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h                | 35 -------------------------------
 include/linux/sched/numa_balancing.h | 40 ++++++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index aa60812b4b7a..fcaea1e7b08a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1236,41 +1236,6 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
 }
 #endif
 
-#define TNF_MIGRATED	0x01
-#define TNF_NO_GROUP	0x02
-#define TNF_SHARED	0x04
-#define TNF_FAULT_LOCAL	0x08
-#define TNF_MIGRATE_FAIL 0x10
-
-#ifdef CONFIG_NUMA_BALANCING
-extern void task_numa_fault(int last_node, int node, int pages, int flags);
-extern pid_t task_numa_group_id(struct task_struct *p);
-extern void set_numabalancing_state(bool enabled);
-extern void task_numa_free(struct task_struct *p);
-extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
-					int src_nid, int dst_cpu);
-#else
-static inline void task_numa_fault(int last_node, int node, int pages,
-				   int flags)
-{
-}
-static inline pid_t task_numa_group_id(struct task_struct *p)
-{
-	return 0;
-}
-static inline void set_numabalancing_state(bool enabled)
-{
-}
-static inline void task_numa_free(struct task_struct *p)
-{
-}
-static inline bool should_numa_migrate_memory(struct task_struct *p,
-				struct page *page, int src_nid, int dst_cpu)
-{
-	return true;
-}
-#endif
-
 static inline struct pid *task_pid(struct task_struct *task)
 {
 	return task->pids[PIDTYPE_PID].pid;
diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index 999182279f78..35d5fc77b4be 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -1,6 +1,46 @@
 #ifndef _LINUX_SCHED_NUMA_BALANCING_H
 #define _LINUX_SCHED_NUMA_BALANCING_H
 
+/*
+ * This is the interface between the scheduler and the MM that
+ * implements memory access pattern based NUMA-balancing:
+ */
+
 #include <linux/sched.h>
 
+#define TNF_MIGRATED	0x01
+#define TNF_NO_GROUP	0x02
+#define TNF_SHARED	0x04
+#define TNF_FAULT_LOCAL	0x08
+#define TNF_MIGRATE_FAIL 0x10
+
+#ifdef CONFIG_NUMA_BALANCING
+extern void task_numa_fault(int last_node, int node, int pages, int flags);
+extern pid_t task_numa_group_id(struct task_struct *p);
+extern void set_numabalancing_state(bool enabled);
+extern void task_numa_free(struct task_struct *p);
+extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
+					int src_nid, int dst_cpu);
+#else
+static inline void task_numa_fault(int last_node, int node, int pages,
+				   int flags)
+{
+}
+static inline pid_t task_numa_group_id(struct task_struct *p)
+{
+	return 0;
+}
+static inline void set_numabalancing_state(bool enabled)
+{
+}
+static inline void task_numa_free(struct task_struct *p)
+{
+}
+static inline bool should_numa_migrate_memory(struct task_struct *p,
+				struct page *page, int src_nid, int dst_cpu)
+{
+	return true;
+}
+#endif
+
 #endif /* _LINUX_SCHED_NUMA_BALANCING_H */
-- 
cgit v1.2.3


From c41cfc6c5ba46050b416c0b0b2621cbe68c4669c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 2 Feb 2017 12:45:58 +0100
Subject: sched/headers: Move the JOBCTL_ defines and methods from
 <linux/sched.h> to <linux/sched/jobctl.h>

Only a small fraction of sched.h users actually utilizes these defines,
and they are not scheduler functionality in any case, so move them
into their separate header.

(Also make <linux/sched/jobctl.h> a self-contained header.)

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h        | 30 ------------------------------
 include/linux/sched/jobctl.h | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index fcaea1e7b08a..46ba8f1b5f1f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1549,36 +1549,6 @@ TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab)
 TASK_PFA_TEST(LMK_WAITING, lmk_waiting)
 TASK_PFA_SET(LMK_WAITING, lmk_waiting)
 
-/*
- * task->jobctl flags
- */
-#define JOBCTL_STOP_SIGMASK	0xffff	/* signr of the last group stop */
-
-#define JOBCTL_STOP_DEQUEUED_BIT 16	/* stop signal dequeued */
-#define JOBCTL_STOP_PENDING_BIT	17	/* task should stop for group stop */
-#define JOBCTL_STOP_CONSUME_BIT	18	/* consume group stop count */
-#define JOBCTL_TRAP_STOP_BIT	19	/* trap for STOP */
-#define JOBCTL_TRAP_NOTIFY_BIT	20	/* trap for NOTIFY */
-#define JOBCTL_TRAPPING_BIT	21	/* switching to TRACED */
-#define JOBCTL_LISTENING_BIT	22	/* ptracer is listening for events */
-
-#define JOBCTL_STOP_DEQUEUED	(1UL << JOBCTL_STOP_DEQUEUED_BIT)
-#define JOBCTL_STOP_PENDING	(1UL << JOBCTL_STOP_PENDING_BIT)
-#define JOBCTL_STOP_CONSUME	(1UL << JOBCTL_STOP_CONSUME_BIT)
-#define JOBCTL_TRAP_STOP	(1UL << JOBCTL_TRAP_STOP_BIT)
-#define JOBCTL_TRAP_NOTIFY	(1UL << JOBCTL_TRAP_NOTIFY_BIT)
-#define JOBCTL_TRAPPING		(1UL << JOBCTL_TRAPPING_BIT)
-#define JOBCTL_LISTENING	(1UL << JOBCTL_LISTENING_BIT)
-
-#define JOBCTL_TRAP_MASK	(JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
-#define JOBCTL_PENDING_MASK	(JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
-
-extern bool task_set_jobctl_pending(struct task_struct *task,
-				    unsigned long mask);
-extern void task_clear_jobctl_trapping(struct task_struct *task);
-extern void task_clear_jobctl_pending(struct task_struct *task,
-				      unsigned long mask);
-
 static inline void rcu_copy_process(struct task_struct *p)
 {
 #ifdef CONFIG_PREEMPT_RCU
diff --git a/include/linux/sched/jobctl.h b/include/linux/sched/jobctl.h
index 228e4644937b..016afa0fb3bb 100644
--- a/include/linux/sched/jobctl.h
+++ b/include/linux/sched/jobctl.h
@@ -1,4 +1,36 @@
 #ifndef _LINUX_SCHED_JOBCTL_H
 #define _LINUX_SCHED_JOBCTL_H
 
+#include <linux/types.h>
+
+struct task_struct;
+
+/*
+ * task->jobctl flags
+ */
+#define JOBCTL_STOP_SIGMASK	0xffff	/* signr of the last group stop */
+
+#define JOBCTL_STOP_DEQUEUED_BIT 16	/* stop signal dequeued */
+#define JOBCTL_STOP_PENDING_BIT	17	/* task should stop for group stop */
+#define JOBCTL_STOP_CONSUME_BIT	18	/* consume group stop count */
+#define JOBCTL_TRAP_STOP_BIT	19	/* trap for STOP */
+#define JOBCTL_TRAP_NOTIFY_BIT	20	/* trap for NOTIFY */
+#define JOBCTL_TRAPPING_BIT	21	/* switching to TRACED */
+#define JOBCTL_LISTENING_BIT	22	/* ptracer is listening for events */
+
+#define JOBCTL_STOP_DEQUEUED	(1UL << JOBCTL_STOP_DEQUEUED_BIT)
+#define JOBCTL_STOP_PENDING	(1UL << JOBCTL_STOP_PENDING_BIT)
+#define JOBCTL_STOP_CONSUME	(1UL << JOBCTL_STOP_CONSUME_BIT)
+#define JOBCTL_TRAP_STOP	(1UL << JOBCTL_TRAP_STOP_BIT)
+#define JOBCTL_TRAP_NOTIFY	(1UL << JOBCTL_TRAP_NOTIFY_BIT)
+#define JOBCTL_TRAPPING		(1UL << JOBCTL_TRAPPING_BIT)
+#define JOBCTL_LISTENING	(1UL << JOBCTL_LISTENING_BIT)
+
+#define JOBCTL_TRAP_MASK	(JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
+#define JOBCTL_PENDING_MASK	(JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
+
+extern bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask);
+extern void task_clear_jobctl_trapping(struct task_struct *task);
+extern void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask);
+
 #endif /* _LINUX_SCHED_JOBCTL_H */
-- 
cgit v1.2.3


From 30a1baab81189f01c427c4939610b2dde2dbec37 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 10:06:45 +0100
Subject: sched/headers: Remove various unrelated headers from <linux/sched.h>

Remove the following header inclusions from <linux/sched.h>:

	#include <asm/param.h>
	#include <linux/threads.h>
	#include <linux/kernel.h>
	#include <linux/types.h>
	#include <linux/timex.h>
	#include <linux/jiffies.h>
	#include <linux/rbtree.h>
	#include <linux/thread_info.h>
	#include <linux/cpumask.h>
	#include <linux/errno.h>
	#include <linux/nodemask.h>
	#include <linux/preempt.h>
	#include <asm/page.h>
	#include <linux/smp.h>
	#include <linux/compiler.h>
	#include <linux/completion.h>
	#include <linux/percpu.h>
	#include <linux/topology.h>
	#include <linux/rcupdate.h>
	#include <linux/time.h>
	#include <linux/timer.h>
	#include <linux/llist.h>
	#include <linux/uidgid.h>
	#include <asm/processor.h>

because they are either not required, or are already included
naturally as part of the remaining headers.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 28 ----------------------------
 1 file changed, 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 46ba8f1b5f1f..7752025679c0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -5,60 +5,32 @@
 
 #include <linux/sched/prio.h>
 
-#include <asm/param.h>	/* for HZ */
-
 #include <linux/capability.h>
-#include <linux/threads.h>
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/timex.h>
-#include <linux/jiffies.h>
 #include <linux/mutex.h>
 #include <linux/plist.h>
-#include <linux/rbtree.h>
-#include <linux/thread_info.h>
-#include <linux/cpumask.h>
-#include <linux/errno.h>
-#include <linux/nodemask.h>
 #include <linux/mm_types.h>
-#include <linux/preempt.h>
-
-#include <asm/page.h>
 #include <asm/ptrace.h>
 
-#include <linux/smp.h>
 #include <linux/sem.h>
 #include <linux/shm.h>
 #include <linux/signal.h>
-#include <linux/compiler.h>
-#include <linux/completion.h>
 #include <linux/signal_types.h>
 #include <linux/pid.h>
-#include <linux/percpu.h>
-#include <linux/topology.h>
 #include <linux/seccomp.h>
-#include <linux/rcupdate.h>
 #include <linux/rculist.h>
 #include <linux/rtmutex.h>
 
-#include <linux/time.h>
-#include <linux/param.h>
 #include <linux/resource.h>
-#include <linux/timer.h>
 #include <linux/hrtimer.h>
 #include <linux/kcov.h>
 #include <linux/task_io_accounting.h>
 #include <linux/latencytop.h>
 #include <linux/cred.h>
-#include <linux/llist.h>
-#include <linux/uidgid.h>
 #include <linux/gfp.h>
 #include <linux/topology.h>
 #include <linux/magic.h>
 #include <linux/cgroup-defs.h>
 
-#include <asm/processor.h>
-
 struct sched_attr;
 struct sched_param;
 
-- 
cgit v1.2.3


From 9a07000400c853c57477c2a5138ae9f556c40897 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 2 Feb 2017 19:10:05 +0100
Subject: sched/headers: Move CONFIG_TASK_XACCT bits from <linux/sched.h> to
 <linux/sched/xacct.h>

The CONFIG_TASK_XACCT=y accounting inline functions are only used by
fs/read_write.c, so move them into their separate header.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h       | 38 --------------------------------------
 include/linux/sched/xacct.h | 42 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7752025679c0..1201312e111e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2104,44 +2104,6 @@ extern struct task_group root_task_group;
 extern int task_can_switch_user(struct user_struct *up,
 					struct task_struct *tsk);
 
-#ifdef CONFIG_TASK_XACCT
-static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
-{
-	tsk->ioac.rchar += amt;
-}
-
-static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
-{
-	tsk->ioac.wchar += amt;
-}
-
-static inline void inc_syscr(struct task_struct *tsk)
-{
-	tsk->ioac.syscr++;
-}
-
-static inline void inc_syscw(struct task_struct *tsk)
-{
-	tsk->ioac.syscw++;
-}
-#else
-static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
-{
-}
-
-static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
-{
-}
-
-static inline void inc_syscr(struct task_struct *tsk)
-{
-}
-
-static inline void inc_syscw(struct task_struct *tsk)
-{
-}
-#endif
-
 #ifndef TASK_SIZE_OF
 #define TASK_SIZE_OF(tsk)	TASK_SIZE
 #endif
diff --git a/include/linux/sched/xacct.h b/include/linux/sched/xacct.h
index 890f7ce5cd27..a28156a0d34a 100644
--- a/include/linux/sched/xacct.h
+++ b/include/linux/sched/xacct.h
@@ -1,6 +1,48 @@
 #ifndef _LINUX_SCHED_XACCT_H
 #define _LINUX_SCHED_XACCT_H
 
+/*
+ * Extended task accounting methods:
+ */
+
 #include <linux/sched.h>
 
+#ifdef CONFIG_TASK_XACCT
+static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
+{
+	tsk->ioac.rchar += amt;
+}
+
+static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
+{
+	tsk->ioac.wchar += amt;
+}
+
+static inline void inc_syscr(struct task_struct *tsk)
+{
+	tsk->ioac.syscr++;
+}
+
+static inline void inc_syscw(struct task_struct *tsk)
+{
+	tsk->ioac.syscw++;
+}
+#else
+static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
+{
+}
+
+static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
+{
+}
+
+static inline void inc_syscr(struct task_struct *tsk)
+{
+}
+
+static inline void inc_syscw(struct task_struct *tsk)
+{
+}
+#endif
+
 #endif /* _LINUX_SCHED_XACCT_H */
-- 
cgit v1.2.3


From 2a1f062a4acf0be50516ceece92a7182a173d55a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 2 Feb 2017 19:15:33 +0100
Subject: sched/headers: Move signal wakeup & sigpending methods from
 <linux/sched.h> into <linux/sched/signal.h>

This reduces the size of <linux/sched.h>.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h        | 51 ------------------------------------------
 include/linux/sched/signal.h | 53 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 52 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1201312e111e..1e0400069e95 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1931,37 +1931,6 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
 	return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
 }
 
-static inline int restart_syscall(void)
-{
-	set_tsk_thread_flag(current, TIF_SIGPENDING);
-	return -ERESTARTNOINTR;
-}
-
-static inline int signal_pending(struct task_struct *p)
-{
-	return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
-}
-
-static inline int __fatal_signal_pending(struct task_struct *p)
-{
-	return unlikely(sigismember(&p->pending.signal, SIGKILL));
-}
-
-static inline int fatal_signal_pending(struct task_struct *p)
-{
-	return signal_pending(p) && __fatal_signal_pending(p);
-}
-
-static inline int signal_pending_state(long state, struct task_struct *p)
-{
-	if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
-		return 0;
-	if (!signal_pending(p))
-		return 0;
-
-	return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
-}
-
 /*
  * cond_resched() and cond_resched_lock(): latency reduction via
  * explicit rescheduling in places that are safe. The return
@@ -2028,26 +1997,6 @@ static __always_inline bool need_resched(void)
 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
 void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
 
-/*
- * Reevaluate whether the task has signals pending delivery.
- * Wake the task if so.
- * This is required every time the blocked sigset_t changes.
- * callers must hold sighand->siglock.
- */
-extern void recalc_sigpending_and_wake(struct task_struct *t);
-extern void recalc_sigpending(void);
-
-extern void signal_wake_up_state(struct task_struct *t, unsigned int state);
-
-static inline void signal_wake_up(struct task_struct *t, bool resume)
-{
-	signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0);
-}
-static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
-{
-	signal_wake_up_state(t, resume ? __TASK_TRACED : 0);
-}
-
 /*
  * Wrappers for p->thread_info->cpu access. No-op on UP.
  */
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 30e7ceed3ef6..b3545fb4333a 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -3,10 +3,10 @@
 
 #include <linux/rculist.h>
 #include <linux/signal.h>
-#include <linux/cred.h>
 #include <linux/sched.h>
 #include <linux/sched/jobctl.h>
 #include <linux/sched/task.h>
+#include <linux/cred.h>
 
 /*
  * Types defining task->signal and task->sighand and APIs using them:
@@ -271,6 +271,57 @@ extern void sigqueue_free(struct sigqueue *);
 extern int send_sigqueue(struct sigqueue *,  struct task_struct *, int group);
 extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
 
+static inline int restart_syscall(void)
+{
+	set_tsk_thread_flag(current, TIF_SIGPENDING);
+	return -ERESTARTNOINTR;
+}
+
+static inline int signal_pending(struct task_struct *p)
+{
+	return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
+}
+
+static inline int __fatal_signal_pending(struct task_struct *p)
+{
+	return unlikely(sigismember(&p->pending.signal, SIGKILL));
+}
+
+static inline int fatal_signal_pending(struct task_struct *p)
+{
+	return signal_pending(p) && __fatal_signal_pending(p);
+}
+
+static inline int signal_pending_state(long state, struct task_struct *p)
+{
+	if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
+		return 0;
+	if (!signal_pending(p))
+		return 0;
+
+	return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
+}
+
+/*
+ * Reevaluate whether the task has signals pending delivery.
+ * Wake the task if so.
+ * This is required every time the blocked sigset_t changes.
+ * callers must hold sighand->siglock.
+ */
+extern void recalc_sigpending_and_wake(struct task_struct *t);
+extern void recalc_sigpending(void);
+
+extern void signal_wake_up_state(struct task_struct *t, unsigned int state);
+
+static inline void signal_wake_up(struct task_struct *t, bool resume)
+{
+	signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0);
+}
+static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
+{
+	signal_wake_up_state(t, resume ? __TASK_TRACED : 0);
+}
+
 #ifdef TIF_RESTORE_SIGMASK
 /*
  * Legacy restore_sigmask accessors.  These are inefficient on
-- 
cgit v1.2.3


From 74444edaa0b2ef946e846d5ddf1ba90efcd7c200 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 2 Feb 2017 20:43:54 +0100
Subject: sched/headers: Move the memalloc_noio_*() APIs to <linux/sched/mm.h>

In preparation to remove the <linux/gfp.h> include from <linux/sched.h>.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h    | 22 ----------------------
 include/linux/sched/mm.h | 22 ++++++++++++++++++++++
 2 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1e0400069e95..dbc3ff04750a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1468,28 +1468,6 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *s
 #define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
 #define used_math() tsk_used_math(current)
 
-/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags
- * __GFP_FS is also cleared as it implies __GFP_IO.
- */
-static inline gfp_t memalloc_noio_flags(gfp_t flags)
-{
-	if (unlikely(current->flags & PF_MEMALLOC_NOIO))
-		flags &= ~(__GFP_IO | __GFP_FS);
-	return flags;
-}
-
-static inline unsigned int memalloc_noio_save(void)
-{
-	unsigned int flags = current->flags & PF_MEMALLOC_NOIO;
-	current->flags |= PF_MEMALLOC_NOIO;
-	return flags;
-}
-
-static inline void memalloc_noio_restore(unsigned int flags)
-{
-	current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags;
-}
-
 /* Per-process atomic flags. */
 #define PFA_NO_NEW_PRIVS 0	/* May not gain new privileges. */
 #define PFA_SPREAD_PAGE  1      /* Spread page cache over cpuset */
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 64d83fa8d93a..62dca4e0b4e0 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -147,4 +147,26 @@ static inline bool in_vfork(struct task_struct *tsk)
 	return ret;
 }
 
+/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags
+ * __GFP_FS is also cleared as it implies __GFP_IO.
+ */
+static inline gfp_t memalloc_noio_flags(gfp_t flags)
+{
+	if (unlikely(current->flags & PF_MEMALLOC_NOIO))
+		flags &= ~(__GFP_IO | __GFP_FS);
+	return flags;
+}
+
+static inline unsigned int memalloc_noio_save(void)
+{
+	unsigned int flags = current->flags & PF_MEMALLOC_NOIO;
+	current->flags |= PF_MEMALLOC_NOIO;
+	return flags;
+}
+
+static inline void memalloc_noio_restore(unsigned int flags)
+{
+	current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags;
+}
+
 #endif /* _LINUX_SCHED_MM_H */
-- 
cgit v1.2.3


From 3605df49556ebb7641d1f8ec2e7eeecf4dd734bf Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 11:03:12 +0100
Subject: sched/headers: Move task statistics APIs from <linux/sched.h> to
 <linux/sched/stat.h>

There are a number of task statistics related variables and methods exported
via sched.h - collect them into <linux/sched/stat.h> and include it from
their usage sites.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/s390/appldata/appldata_base.c |  1 +
 include/linux/sched.h              | 10 ----------
 include/linux/sched/stat.h         | 18 ++++++++++++++++++
 3 files changed, 19 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c
index 5a8dfa22da7c..ef3fb1b9201f 100644
--- a/arch/s390/appldata/appldata_base.c
+++ b/arch/s390/appldata/appldata_base.c
@@ -12,6 +12,7 @@
 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
 
 #include <linux/module.h>
+#include <linux/sched/stat.h>
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/errno.h>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dbc3ff04750a..415baf253517 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -46,16 +46,6 @@ struct nameidata;
 struct signal_struct;
 struct sighand_struct;
 
-extern unsigned long total_forks;
-extern int nr_threads;
-DECLARE_PER_CPU(unsigned long, process_counts);
-extern int nr_processes(void);
-extern unsigned long nr_running(void);
-extern bool single_task_running(void);
-extern unsigned long nr_iowait(void);
-extern unsigned long nr_iowait_cpu(int cpu);
-extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
-
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 extern void cpu_load_update_nohz_start(void);
 extern void cpu_load_update_nohz_stop(void);
diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h
index 30fe012aecb0..0d52ceb6d3ae 100644
--- a/include/linux/sched/stat.h
+++ b/include/linux/sched/stat.h
@@ -3,4 +3,22 @@
 
 #include <linux/sched.h>
 
+/*
+ * Various counters maintained by the scheduler and fork(),
+ * exposed via /proc, sys.c or used by drivers via these APIs.
+ *
+ * ( Note that all these values are aquired without locking,
+ *   so they can only be relied on in narrow circumstances. )
+ */
+
+extern unsigned long total_forks;
+extern int nr_threads;
+DECLARE_PER_CPU(unsigned long, process_counts);
+extern int nr_processes(void);
+extern unsigned long nr_running(void);
+extern bool single_task_running(void);
+extern unsigned long nr_iowait(void);
+extern unsigned long nr_iowait_cpu(int cpu);
+extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
+
 #endif /* _LINUX_SCHED_STAT_H */
-- 
cgit v1.2.3


From 752b3ca734cbc17c0456b846ae886c0ed39c5703 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 11:12:59 +0100
Subject: sched/headers: Move the NOHZ APIs from <linux/sched.h> to
 <linux/sched/nohz.h>

There's a number of NOHZ/dyntics related functionality in <linux/sched.h>,
but only a handful of timer files are making use of them.

Move them into their own header. This better documents these APIs
and unclutters <linux/sched.h>.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h      | 35 -----------------------------------
 include/linux/sched/nohz.h | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 415baf253517..5f9bfc603d19 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -46,14 +46,6 @@ struct nameidata;
 struct signal_struct;
 struct sighand_struct;
 
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
-extern void cpu_load_update_nohz_start(void);
-extern void cpu_load_update_nohz_stop(void);
-#else
-static inline void cpu_load_update_nohz_start(void) { }
-static inline void cpu_load_update_nohz_stop(void) { }
-#endif
-
 extern void dump_cpu_task(int cpu);
 
 struct seq_file;
@@ -204,15 +196,6 @@ extern cpumask_var_t cpu_isolated_map;
 
 extern int runqueue_is_locked(int cpu);
 
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
-extern void nohz_balance_enter_idle(int cpu);
-extern void set_cpu_sd_state_idle(void);
-extern int get_nohz_timer_target(void);
-#else
-static inline void nohz_balance_enter_idle(int cpu) { }
-static inline void set_cpu_sd_state_idle(void) { }
-#endif
-
 /*
  * Only dump TASK_* tasks. (0 for all tasks)
  */
@@ -1535,14 +1518,6 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
 }
 #endif
 
-#ifdef CONFIG_NO_HZ_COMMON
-void calc_load_enter_idle(void);
-void calc_load_exit_idle(void);
-#else
-static inline void calc_load_enter_idle(void) { }
-static inline void calc_load_exit_idle(void) { }
-#endif /* CONFIG_NO_HZ_COMMON */
-
 #ifndef cpu_relax_yield
 #define cpu_relax_yield() cpu_relax()
 #endif
@@ -1563,16 +1538,6 @@ extern void idle_task_exit(void);
 static inline void idle_task_exit(void) {}
 #endif
 
-#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)
-extern void wake_up_nohz_cpu(int cpu);
-#else
-static inline void wake_up_nohz_cpu(int cpu) { }
-#endif
-
-#ifdef CONFIG_NO_HZ_FULL
-extern u64 scheduler_tick_max_deferment(void);
-#endif
-
 extern int yield_to(struct task_struct *p, bool preempt);
 extern void set_user_nice(struct task_struct *p, long nice);
 extern int task_prio(const struct task_struct *p);
diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h
index fe6caf0dab10..9471f0736a3a 100644
--- a/include/linux/sched/nohz.h
+++ b/include/linux/sched/nohz.h
@@ -3,4 +3,43 @@
 
 #include <linux/sched.h>
 
+/*
+ * This is the interface between the scheduler and nohz/dyntics:
+ */
+
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+extern void cpu_load_update_nohz_start(void);
+extern void cpu_load_update_nohz_stop(void);
+#else
+static inline void cpu_load_update_nohz_start(void) { }
+static inline void cpu_load_update_nohz_stop(void) { }
+#endif
+
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+extern void nohz_balance_enter_idle(int cpu);
+extern void set_cpu_sd_state_idle(void);
+extern int get_nohz_timer_target(void);
+#else
+static inline void nohz_balance_enter_idle(int cpu) { }
+static inline void set_cpu_sd_state_idle(void) { }
+#endif
+
+#ifdef CONFIG_NO_HZ_COMMON
+void calc_load_enter_idle(void);
+void calc_load_exit_idle(void);
+#else
+static inline void calc_load_enter_idle(void) { }
+static inline void calc_load_exit_idle(void) { }
+#endif /* CONFIG_NO_HZ_COMMON */
+
+#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)
+extern void wake_up_nohz_cpu(int cpu);
+#else
+static inline void wake_up_nohz_cpu(int cpu) { }
+#endif
+
+#ifdef CONFIG_NO_HZ_FULL
+extern u64 scheduler_tick_max_deferment(void);
+#endif
+
 #endif /* _LINUX_SCHED_NOHZ_H */
-- 
cgit v1.2.3


From d3d1e320d43a7bad9603acf0214406a1e8795c63 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 11:16:09 +0100
Subject: sched/headers: Move debugging functions from <linux/sched.h> to
 <linux/sched/debug.h>

Collect the various scheduler and task state debugging APIs scattered
around <linux/sched.h> into the new <linux/sched/debug.h> header.

In particular the show_regs() and show_stack() prototype affects many files,
update them.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h       | 36 -----------------------------------
 include/linux/sched/debug.h | 46 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5f9bfc603d19..76a2e522be29 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -46,15 +46,9 @@ struct nameidata;
 struct signal_struct;
 struct sighand_struct;
 
-extern void dump_cpu_task(int cpu);
-
 struct seq_file;
 struct cfs_rq;
 struct task_group;
-#ifdef CONFIG_SCHED_DEBUG
-extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
-extern void proc_sched_set_task(struct task_struct *p);
-#endif
 
 /*
  * Task state bitmask. NOTE! These bits are also
@@ -196,25 +190,6 @@ extern cpumask_var_t cpu_isolated_map;
 
 extern int runqueue_is_locked(int cpu);
 
-/*
- * Only dump TASK_* tasks. (0 for all tasks)
- */
-extern void show_state_filter(unsigned long state_filter);
-
-static inline void show_state(void)
-{
-	show_state_filter(0);
-}
-
-extern void show_regs(struct pt_regs *);
-
-/*
- * TASK is a pointer to the task whose backtrace we want to see (or NULL for current
- * task), SP is the stack pointer of the first frame that should be shown in the back
- * trace (or NULL if the entire call-chain of the task should be shown).
- */
-extern void show_stack(struct task_struct *task, unsigned long *sp);
-
 extern void cpu_init (void);
 extern void trap_init(void);
 extern void update_process_times(int user);
@@ -229,17 +204,6 @@ extern int sched_cpu_dying(unsigned int cpu);
 # define sched_cpu_dying	NULL
 #endif
 
-extern void sched_show_task(struct task_struct *p);
-
-/* Attach to any functions which should be ignored in wchan output. */
-#define __sched		__attribute__((__section__(".sched.text")))
-
-/* Linker adds these: start and end of __sched functions */
-extern char __sched_text_start[], __sched_text_end[];
-
-/* Is this address in the __sched functions? */
-extern int in_sched_functions(unsigned long addr);
-
 #define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
 extern signed long schedule_timeout(signed long timeout);
 extern signed long schedule_timeout_interruptible(signed long timeout);
diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h
index 55bc0cd35fd5..853bbef0b47b 100644
--- a/include/linux/sched/debug.h
+++ b/include/linux/sched/debug.h
@@ -3,4 +3,50 @@
 
 #include <linux/sched.h>
 
+/*
+ * Various scheduler/task debugging interfaces:
+ */
+
+struct task_struct;
+
+extern void dump_cpu_task(int cpu);
+
+/*
+ * Only dump TASK_* tasks. (0 for all tasks)
+ */
+extern void show_state_filter(unsigned long state_filter);
+
+static inline void show_state(void)
+{
+	show_state_filter(0);
+}
+
+struct pt_regs;
+
+extern void show_regs(struct pt_regs *);
+
+/*
+ * TASK is a pointer to the task whose backtrace we want to see (or NULL for current
+ * task), SP is the stack pointer of the first frame that should be shown in the back
+ * trace (or NULL if the entire call-chain of the task should be shown).
+ */
+extern void show_stack(struct task_struct *task, unsigned long *sp);
+
+extern void sched_show_task(struct task_struct *p);
+
+#ifdef CONFIG_SCHED_DEBUG
+struct seq_file;
+extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
+extern void proc_sched_set_task(struct task_struct *p);
+#endif
+
+/* Attach to any functions which should be ignored in wchan output. */
+#define __sched		__attribute__((__section__(".sched.text")))
+
+/* Linker adds these: start and end of __sched functions */
+extern char __sched_text_start[], __sched_text_end[];
+
+/* Is this address in the __sched functions? */
+extern int in_sched_functions(unsigned long addr);
+
 #endif /* _LINUX_SCHED_DEBUG_H */
-- 
cgit v1.2.3


From 63cc9d6fca8c220c2406f1376100a9acb55197af Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 12:04:39 +0100
Subject: sched/headers, time/timekeeping: Move the xtime_update() prototype
 from <linux/sched.h> to <linux/time.h>

This was in <linux/sched.h> only for hysterical raisins.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 2 --
 include/linux/time.h  | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 76a2e522be29..5a4fbc76feb4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1579,8 +1579,6 @@ extern struct task_struct *find_task_by_pid_ns(pid_t nr,
 
 #include <asm/current.h>
 
-extern void xtime_update(unsigned long ticks);
-
 extern int wake_up_state(struct task_struct *tsk, unsigned int state);
 extern int wake_up_process(struct task_struct *tsk);
 extern void wake_up_new_task(struct task_struct *tsk);
diff --git a/include/linux/time.h b/include/linux/time.h
index 23f0f5ce3090..4f4ab65b6e61 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -167,6 +167,8 @@ static inline bool timespec_inject_offset_valid(const struct timespec *ts)
 extern u32 (*arch_gettimeoffset)(void);
 #endif
 
+extern void xtime_update(unsigned long ticks);
+
 struct itimerval;
 extern int do_setitimer(int which, struct itimerval *value,
 			struct itimerval *ovalue);
-- 
cgit v1.2.3


From 70b8157e61d0143fb44ae9482557d7aca365da3d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 12:11:00 +0100
Subject: sched/headers: Move <asm/current.h> include from the middle of
 <linux/sched.h> to the header portion

Linux-0.01 already defined 'current' in the middle of sched.h, so this
is an ancient historical precedent - but still in a modern kernel it
looks a bit weird that we have:

	#include <asm/current.h>

in the middle of the header.

Move it further up. If this was done for some obscure dependency
reasons then we'll trigger and document it.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5a4fbc76feb4..ac0fed4c3130 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -31,6 +31,8 @@
 #include <linux/magic.h>
 #include <linux/cgroup-defs.h>
 
+#include <asm/current.h>
+
 struct sched_attr;
 struct sched_param;
 
@@ -1577,8 +1579,6 @@ extern struct task_struct *find_task_by_vpid(pid_t nr);
 extern struct task_struct *find_task_by_pid_ns(pid_t nr,
 		struct pid_namespace *ns);
 
-#include <asm/current.h>
-
 extern int wake_up_state(struct task_struct *tsk, unsigned int state);
 extern int wake_up_process(struct task_struct *tsk);
 extern void wake_up_new_task(struct task_struct *tsk);
-- 
cgit v1.2.3


From 0ca0156973a47e689f3bc817e26e15fff3f84eec Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 14:52:01 +0100
Subject: sched/headers: Split hotplug CPU interfaces out of <linux/sched.h>
 into <linux/sched/hotplug.h>

Split the CPU hotplug scheduler APIs out of the common header.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h         | 15 ---------------
 include/linux/sched/hotplug.h | 20 ++++++++++++++++++++
 2 files changed, 20 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ac0fed4c3130..25cc0adb3e08 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -196,15 +196,6 @@ extern void cpu_init (void);
 extern void trap_init(void);
 extern void update_process_times(int user);
 extern void scheduler_tick(void);
-extern int sched_cpu_starting(unsigned int cpu);
-extern int sched_cpu_activate(unsigned int cpu);
-extern int sched_cpu_deactivate(unsigned int cpu);
-
-#ifdef CONFIG_HOTPLUG_CPU
-extern int sched_cpu_dying(unsigned int cpu);
-#else
-# define sched_cpu_dying	NULL
-#endif
 
 #define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
 extern signed long schedule_timeout(signed long timeout);
@@ -1498,12 +1489,6 @@ extern void sched_exec(void);
 #define sched_exec()   {}
 #endif
 
-#ifdef CONFIG_HOTPLUG_CPU
-extern void idle_task_exit(void);
-#else
-static inline void idle_task_exit(void) {}
-#endif
-
 extern int yield_to(struct task_struct *p, bool preempt);
 extern void set_user_nice(struct task_struct *p, long nice);
 extern int task_prio(const struct task_struct *p);
diff --git a/include/linux/sched/hotplug.h b/include/linux/sched/hotplug.h
index 34670ed24894..c608d3c1ddb8 100644
--- a/include/linux/sched/hotplug.h
+++ b/include/linux/sched/hotplug.h
@@ -3,4 +3,24 @@
 
 #include <linux/sched.h>
 
+/*
+ * Scheduler interfaces for hotplug CPU support:
+ */
+
+extern int sched_cpu_starting(unsigned int cpu);
+extern int sched_cpu_activate(unsigned int cpu);
+extern int sched_cpu_deactivate(unsigned int cpu);
+
+#ifdef CONFIG_HOTPLUG_CPU
+extern int sched_cpu_dying(unsigned int cpu);
+#else
+# define sched_cpu_dying	NULL
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+extern void idle_task_exit(void);
+#else
+static inline void idle_task_exit(void) {}
+#endif
+
 #endif /* _LINUX_SCHED_HOTPLUG_H */
-- 
cgit v1.2.3


From 901b14bd946a8b7ea211105b6207e082ddd36846 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 15:24:12 +0100
Subject: sched/headers: Move task lifetime APIs from <linux/sched.h> to
 <linux/sched/task.h>

There's a fair amount of task lifetime management (a.k.a fork()/exit())
related APIs in <linux/sched.h>, but only a small fraction of
the users of the generic sched.h header make use of them.

Move these functions to the <linux/sched/task.h> header.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h      | 91 -------------------------------------------
 include/linux/sched/task.h | 97 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+), 91 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 25cc0adb3e08..b1677c8db03f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -165,28 +165,10 @@ struct task_group;
 /* Task command name length */
 #define TASK_COMM_LEN 16
 
-#include <linux/spinlock.h>
-
-/*
- * This serializes "schedule()" and also protects
- * the run-queue from deletions/modifications (but
- * _adding_ to the beginning of the run-queue has
- * a separate lock).
- */
-extern rwlock_t tasklist_lock;
-extern spinlock_t mmlist_lock;
-
 struct task_struct;
 
-#ifdef CONFIG_PROVE_RCU
-extern int lockdep_tasklist_lock_is_held(void);
-#endif /* #ifdef CONFIG_PROVE_RCU */
-
 extern void sched_init(void);
 extern void sched_init_smp(void);
-extern asmlinkage void schedule_tail(struct task_struct *prev);
-extern void init_idle(struct task_struct *idle, int cpu);
-extern void init_idle_bootup_task(struct task_struct *idle);
 
 extern cpumask_var_t cpu_isolated_map;
 
@@ -211,8 +193,6 @@ extern void io_schedule_finish(int token);
 extern long io_schedule_timeout(long timeout);
 extern void io_schedule(void);
 
-void __noreturn do_task_dead(void);
-
 struct nsproxy;
 
 /**
@@ -1120,24 +1100,6 @@ struct task_struct {
  */
 };
 
-#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
-extern int arch_task_struct_size __read_mostly;
-#else
-# define arch_task_struct_size (sizeof(struct task_struct))
-#endif
-
-#ifdef CONFIG_VMAP_STACK
-static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
-{
-	return t->stack_vm_area;
-}
-#else
-static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
-{
-	return NULL;
-}
-#endif
-
 static inline struct pid *task_pid(struct task_struct *task)
 {
 	return task->pids[PIDTYPE_PID].pid;
@@ -1429,21 +1391,6 @@ TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab)
 TASK_PFA_TEST(LMK_WAITING, lmk_waiting)
 TASK_PFA_SET(LMK_WAITING, lmk_waiting)
 
-static inline void rcu_copy_process(struct task_struct *p)
-{
-#ifdef CONFIG_PREEMPT_RCU
-	p->rcu_read_lock_nesting = 0;
-	p->rcu_read_unlock_special.s = 0;
-	p->rcu_blocked_node = NULL;
-	INIT_LIST_HEAD(&p->rcu_node_entry);
-#endif /* #ifdef CONFIG_PREEMPT_RCU */
-#ifdef CONFIG_TASKS_RCU
-	p->rcu_tasks_holdout = false;
-	INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
-	p->rcu_tasks_idle_cpu = -1;
-#endif /* #ifdef CONFIG_TASKS_RCU */
-}
-
 static inline void tsk_restore_flags(struct task_struct *task,
 				unsigned long orig_flags, unsigned long flags)
 {
@@ -1572,45 +1519,11 @@ extern void wake_up_new_task(struct task_struct *tsk);
 #else
  static inline void kick_process(struct task_struct *tsk) { }
 #endif
-extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
-extern void sched_dead(struct task_struct *p);
-
-extern void proc_caches_init(void);
-
-extern void release_task(struct task_struct * p);
-
-#ifdef CONFIG_HAVE_COPY_THREAD_TLS
-extern int copy_thread_tls(unsigned long, unsigned long, unsigned long,
-			struct task_struct *, unsigned long);
-#else
-extern int copy_thread(unsigned long, unsigned long, unsigned long,
-			struct task_struct *);
-
-/* Architectures that haven't opted into copy_thread_tls get the tls argument
- * via pt_regs, so ignore the tls argument passed via C. */
-static inline int copy_thread_tls(
-		unsigned long clone_flags, unsigned long sp, unsigned long arg,
-		struct task_struct *p, unsigned long tls)
-{
-	return copy_thread(clone_flags, sp, arg, p);
-}
-#endif
-extern void flush_thread(void);
-
-#ifdef CONFIG_HAVE_EXIT_THREAD
-extern void exit_thread(struct task_struct *tsk);
-#else
-static inline void exit_thread(struct task_struct *tsk)
-{
-}
-#endif
 
 extern void exit_files(struct task_struct *);
 
 extern void exit_itimers(struct signal_struct *);
 
-extern void do_group_exit(int);
-
 extern int do_execve(struct filename *,
 		     const char __user * const __user *,
 		     const char __user * const __user *);
@@ -1618,10 +1531,6 @@ extern int do_execveat(int, struct filename *,
 		       const char __user * const __user *,
 		       const char __user * const __user *,
 		       int);
-extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long);
-extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
-struct task_struct *fork_idle(int);
-extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
 
 extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec);
 static inline void set_task_comm(struct task_struct *tsk, const char *from)
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 0023c91ff821..e93638a03515 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -1,6 +1,103 @@
 #ifndef _LINUX_SCHED_TASK_H
 #define _LINUX_SCHED_TASK_H
 
+/*
+ * Interface between the scheduler and various task lifetime (fork()/exit())
+ * functionality:
+ */
+
 #include <linux/sched.h>
 
+/*
+ * This serializes "schedule()" and also protects
+ * the run-queue from deletions/modifications (but
+ * _adding_ to the beginning of the run-queue has
+ * a separate lock).
+ */
+extern rwlock_t tasklist_lock;
+extern spinlock_t mmlist_lock;
+
+#ifdef CONFIG_PROVE_RCU
+extern int lockdep_tasklist_lock_is_held(void);
+#endif /* #ifdef CONFIG_PROVE_RCU */
+
+extern asmlinkage void schedule_tail(struct task_struct *prev);
+extern void init_idle(struct task_struct *idle, int cpu);
+extern void init_idle_bootup_task(struct task_struct *idle);
+
+static inline void rcu_copy_process(struct task_struct *p)
+{
+#ifdef CONFIG_PREEMPT_RCU
+	p->rcu_read_lock_nesting = 0;
+	p->rcu_read_unlock_special.s = 0;
+	p->rcu_blocked_node = NULL;
+	INIT_LIST_HEAD(&p->rcu_node_entry);
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
+#ifdef CONFIG_TASKS_RCU
+	p->rcu_tasks_holdout = false;
+	INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
+	p->rcu_tasks_idle_cpu = -1;
+#endif /* #ifdef CONFIG_TASKS_RCU */
+}
+
+extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
+extern void sched_dead(struct task_struct *p);
+
+void __noreturn do_task_dead(void);
+
+extern void proc_caches_init(void);
+
+extern void release_task(struct task_struct * p);
+
+#ifdef CONFIG_HAVE_COPY_THREAD_TLS
+extern int copy_thread_tls(unsigned long, unsigned long, unsigned long,
+			struct task_struct *, unsigned long);
+#else
+extern int copy_thread(unsigned long, unsigned long, unsigned long,
+			struct task_struct *);
+
+/* Architectures that haven't opted into copy_thread_tls get the tls argument
+ * via pt_regs, so ignore the tls argument passed via C. */
+static inline int copy_thread_tls(
+		unsigned long clone_flags, unsigned long sp, unsigned long arg,
+		struct task_struct *p, unsigned long tls)
+{
+	return copy_thread(clone_flags, sp, arg, p);
+}
+#endif
+extern void flush_thread(void);
+
+#ifdef CONFIG_HAVE_EXIT_THREAD
+extern void exit_thread(struct task_struct *tsk);
+#else
+static inline void exit_thread(struct task_struct *tsk)
+{
+}
+#endif
+extern void do_group_exit(int);
+
+extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long);
+extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
+struct task_struct *fork_idle(int);
+extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
+
+
+#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
+extern int arch_task_struct_size __read_mostly;
+#else
+# define arch_task_struct_size (sizeof(struct task_struct))
+#endif
+
+#ifdef CONFIG_VMAP_STACK
+static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
+{
+	return t->stack_vm_area;
+}
+#else
+static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
+{
+	return NULL;
+}
+#endif
+
 #endif /* _LINUX_SCHED_TASK_H */
-- 
cgit v1.2.3


From 6bfbaa51ed47774492d83d182a86068cc35aa4c6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 21:37:49 +0100
Subject: sched/headers, RCU: Move rcu_copy_process() from <linux/sched/task.h>
 to kernel/fork.c

Move rcu_copy_process() into kernel/fork.c, which is the only
user of this inline function.

This simplifies <linux/sched/task.h> to the level that <linux/sched.h>
does not have to be included in it anymore - which change is done
in a subsequent patch.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/task.h | 15 ---------------
 kernel/fork.c              | 15 +++++++++++++++
 2 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index e93638a03515..20ed9108f261 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -25,21 +25,6 @@ extern asmlinkage void schedule_tail(struct task_struct *prev);
 extern void init_idle(struct task_struct *idle, int cpu);
 extern void init_idle_bootup_task(struct task_struct *idle);
 
-static inline void rcu_copy_process(struct task_struct *p)
-{
-#ifdef CONFIG_PREEMPT_RCU
-	p->rcu_read_lock_nesting = 0;
-	p->rcu_read_unlock_special.s = 0;
-	p->rcu_blocked_node = NULL;
-	INIT_LIST_HEAD(&p->rcu_node_entry);
-#endif /* #ifdef CONFIG_PREEMPT_RCU */
-#ifdef CONFIG_TASKS_RCU
-	p->rcu_tasks_holdout = false;
-	INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
-	p->rcu_tasks_idle_cpu = -1;
-#endif /* #ifdef CONFIG_TASKS_RCU */
-}
-
 extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
 extern void sched_dead(struct task_struct *p);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 916e78004b8f..6c463c80e93d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1465,6 +1465,21 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
 	 task->pids[type].pid = pid;
 }
 
+static inline void rcu_copy_process(struct task_struct *p)
+{
+#ifdef CONFIG_PREEMPT_RCU
+	p->rcu_read_lock_nesting = 0;
+	p->rcu_read_unlock_special.s = 0;
+	p->rcu_blocked_node = NULL;
+	INIT_LIST_HEAD(&p->rcu_node_entry);
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
+#ifdef CONFIG_TASKS_RCU
+	p->rcu_tasks_holdout = false;
+	INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
+	p->rcu_tasks_idle_cpu = -1;
+#endif /* #ifdef CONFIG_TASKS_RCU */
+}
+
 /*
  * This creates a new process as a copy of the old one,
  * but does not actually start it yet.
-- 
cgit v1.2.3


From c7af7877eeacfeaaf6a1b6f54c481292ef116837 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 22:01:58 +0100
Subject: sched/core: Move, sort and clean up <linux/sched.h> structure
 predeclarations

Most of the structure predeclarations were at the head of sched.h, but not
all of them - there were a number of lines spread around sched.h, in
random places.

Move them to the head, and also sort them alphabetically.

Remove unused entries.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 56 +++++++++++++++++++++------------------------------
 1 file changed, 23 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b1677c8db03f..5398356e33c7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -33,24 +33,35 @@
 
 #include <asm/current.h>
 
-struct sched_attr;
-struct sched_param;
-
-struct futex_pi_state;
-struct robust_list_head;
+/* task_struct member predeclarations: */
+struct audit_context;
+struct autogroup;
+struct backing_dev_info;
 struct bio_list;
-struct fs_struct;
-struct perf_event_context;
 struct blk_plug;
+struct cfs_rq;
 struct filename;
+struct fs_struct;
+struct futex_pi_state;
+struct io_context;
+struct mempolicy;
 struct nameidata;
-
-struct signal_struct;
-struct sighand_struct;
-
+struct nsproxy;
+struct perf_event_context;
+struct pid_namespace;
+struct pipe_inode_info;
+struct rcu_node;
+struct reclaim_state;
+struct robust_list_head;
+struct sched_attr;
+struct sched_param;
 struct seq_file;
-struct cfs_rq;
+struct sighand_struct;
+struct signal_struct;
+struct task_delay_info;
 struct task_group;
+struct task_struct;
+struct uts_namespace;
 
 /*
  * Task state bitmask. NOTE! These bits are also
@@ -165,8 +176,6 @@ struct task_group;
 /* Task command name length */
 #define TASK_COMM_LEN 16
 
-struct task_struct;
-
 extern void sched_init(void);
 extern void sched_init_smp(void);
 
@@ -193,8 +202,6 @@ extern void io_schedule_finish(int token);
 extern long io_schedule_timeout(long timeout);
 extern void io_schedule(void);
 
-struct nsproxy;
-
 /**
  * struct prev_cputime - snaphsot of system and user cputime
  * @utime: time spent in user mode
@@ -297,10 +304,6 @@ struct thread_group_cputimer {
 };
 
 #include <linux/rwsem.h>
-struct autogroup;
-
-struct backing_dev_info;
-struct reclaim_state;
 
 #ifdef CONFIG_SCHED_INFO
 struct sched_info {
@@ -314,8 +317,6 @@ struct sched_info {
 };
 #endif /* CONFIG_SCHED_INFO */
 
-struct task_delay_info;
-
 static inline int sched_info_on(void)
 {
 #ifdef CONFIG_SCHEDSTATS
@@ -342,20 +343,12 @@ void force_schedstat_enabled(void);
 # define SCHED_FIXEDPOINT_SHIFT	10
 # define SCHED_FIXEDPOINT_SCALE	(1L << SCHED_FIXEDPOINT_SHIFT)
 
-struct io_context;			/* See blkdev.h */
-
-
 #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
 extern void prefetch_stack(struct task_struct *t);
 #else
 static inline void prefetch_stack(struct task_struct *t) { }
 #endif
 
-struct audit_context;		/* See audit.c */
-struct mempolicy;
-struct pipe_inode_info;
-struct uts_namespace;
-
 struct load_weight {
 	unsigned long weight;
 	u32 inv_weight;
@@ -564,7 +557,6 @@ union rcu_special {
 	} b; /* Bits. */
 	u32 s; /* Set of bits. */
 };
-struct rcu_node;
 
 enum perf_event_task_context {
 	perf_invalid_context = -1,
@@ -1125,8 +1117,6 @@ static inline struct pid *task_session(struct task_struct *task)
 	return task->group_leader->pids[PIDTYPE_SID].pid;
 }
 
-struct pid_namespace;
-
 /*
  * the helpers to get the task's different pids as they are seen
  * from various namespaces
-- 
cgit v1.2.3


From d04b0ad37e4b6ac39a56c823ae76ab37cd044dc7 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 22:07:57 +0100
Subject: sched/headers: Move the PREEMPT_COUNT defines from <linux/sched.h> to
 <linux/preempt.h>

These defines are not really part of the scheduler's driver API, but are
related to the preempt count - so move them to <linux/preempt.h>.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/preempt.h | 21 +++++++++++++++++++++
 include/linux/sched.h   | 21 ---------------------
 2 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 7eeceac52dea..cae461224948 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -55,6 +55,27 @@
 /* We use the MSB mostly because its available */
 #define PREEMPT_NEED_RESCHED	0x80000000
 
+#define PREEMPT_DISABLED	(PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
+
+/*
+ * Disable preemption until the scheduler is running -- use an unconditional
+ * value so that it also works on !PREEMPT_COUNT kernels.
+ *
+ * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
+ */
+#define INIT_PREEMPT_COUNT	PREEMPT_OFFSET
+
+/*
+ * Initial preempt_count value; reflects the preempt_count schedule invariant
+ * which states that during context switches:
+ *
+ *    preempt_count() == 2*PREEMPT_DISABLE_OFFSET
+ *
+ * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
+ * Note: See finish_task_switch().
+ */
+#define FORK_PREEMPT_COUNT	(2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
+
 /* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */
 #include <asm/preempt.h>
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5398356e33c7..3b3e31da416e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -265,27 +265,6 @@ struct task_cputime_atomic {
 		.sum_exec_runtime = ATOMIC64_INIT(0),		\
 	}
 
-#define PREEMPT_DISABLED	(PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
-
-/*
- * Disable preemption until the scheduler is running -- use an unconditional
- * value so that it also works on !PREEMPT_COUNT kernels.
- *
- * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
- */
-#define INIT_PREEMPT_COUNT	PREEMPT_OFFSET
-
-/*
- * Initial preempt_count value; reflects the preempt_count schedule invariant
- * which states that during context switches:
- *
- *    preempt_count() == 2*PREEMPT_DISABLE_OFFSET
- *
- * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
- * Note: See finish_task_switch().
- */
-#define FORK_PREEMPT_COUNT	(2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
-
 /**
  * struct thread_group_cputimer - thread group interval timer counts
  * @cputime_atomic:	atomic thread group interval timers.
-- 
cgit v1.2.3


From f3ac60671954c8d413532627b1be13a76f394c49 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 22:59:33 +0100
Subject: sched/headers: Move task-stack related APIs from <linux/sched.h> to
 <linux/sched/task_stack.h>

Split out the task->stack related functionality, which is not really
part of the core scheduler APIs.

Only keep task_thread_info() because it's used by sched.h.

Update the code that uses those facilities.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/mips/kernel/smp-cps.c       |   2 +-
 arch/mips/sibyte/sb1250/smp.c    |   2 +-
 include/linux/sched.h            | 115 +++------------------------------------
 include/linux/sched/task_stack.h | 104 +++++++++++++++++++++++++++++++++++
 4 files changed, 115 insertions(+), 108 deletions(-)

(limited to 'include/linux')

diff --git a/arch/mips/kernel/smp-cps.c b/arch/mips/kernel/smp-cps.c
index 1d3188c23bb8..6d45f05538c8 100644
--- a/arch/mips/kernel/smp-cps.c
+++ b/arch/mips/kernel/smp-cps.c
@@ -11,7 +11,7 @@
 #include <linux/delay.h>
 #include <linux/io.h>
 #include <linux/irqchip/mips-gic.h>
-#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/sched/hotplug.h>
 #include <linux/slab.h>
 #include <linux/smp.h>
diff --git a/arch/mips/sibyte/sb1250/smp.c b/arch/mips/sibyte/sb1250/smp.c
index 1cf66f5ff23d..0a4a2c3982d8 100644
--- a/arch/mips/sibyte/sb1250/smp.c
+++ b/arch/mips/sibyte/sb1250/smp.c
@@ -21,7 +21,7 @@
 #include <linux/interrupt.h>
 #include <linux/smp.h>
 #include <linux/kernel_stat.h>
-#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 
 #include <asm/mmu_context.h>
 #include <asm/io.h>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3b3e31da416e..af9590c8bfb0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1450,6 +1450,15 @@ union thread_union {
 	unsigned long stack[THREAD_SIZE/sizeof(long)];
 };
 
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+static inline struct thread_info *task_thread_info(struct task_struct *task)
+{
+	return &task->thread_info;
+}
+#elif !defined(__HAVE_THREAD_FUNCTIONS)
+# define task_thread_info(task)	((struct thread_info *)(task)->stack)
+#endif
+
 #ifndef __HAVE_ARCH_KSTACK_END
 static inline int kstack_end(void *addr)
 {
@@ -1540,112 +1549,6 @@ static inline void task_unlock(struct task_struct *p)
 	spin_unlock(&p->alloc_lock);
 }
 
-#ifdef CONFIG_THREAD_INFO_IN_TASK
-
-static inline struct thread_info *task_thread_info(struct task_struct *task)
-{
-	return &task->thread_info;
-}
-
-/*
- * When accessing the stack of a non-current task that might exit, use
- * try_get_task_stack() instead.  task_stack_page will return a pointer
- * that could get freed out from under you.
- */
-static inline void *task_stack_page(const struct task_struct *task)
-{
-	return task->stack;
-}
-
-#define setup_thread_stack(new,old)	do { } while(0)
-
-static inline unsigned long *end_of_stack(const struct task_struct *task)
-{
-	return task->stack;
-}
-
-#elif !defined(__HAVE_THREAD_FUNCTIONS)
-
-#define task_thread_info(task)	((struct thread_info *)(task)->stack)
-#define task_stack_page(task)	((void *)(task)->stack)
-
-static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
-{
-	*task_thread_info(p) = *task_thread_info(org);
-	task_thread_info(p)->task = p;
-}
-
-/*
- * Return the address of the last usable long on the stack.
- *
- * When the stack grows down, this is just above the thread
- * info struct. Going any lower will corrupt the threadinfo.
- *
- * When the stack grows up, this is the highest address.
- * Beyond that position, we corrupt data on the next page.
- */
-static inline unsigned long *end_of_stack(struct task_struct *p)
-{
-#ifdef CONFIG_STACK_GROWSUP
-	return (unsigned long *)((unsigned long)task_thread_info(p) + THREAD_SIZE) - 1;
-#else
-	return (unsigned long *)(task_thread_info(p) + 1);
-#endif
-}
-
-#endif
-
-#ifdef CONFIG_THREAD_INFO_IN_TASK
-static inline void *try_get_task_stack(struct task_struct *tsk)
-{
-	return atomic_inc_not_zero(&tsk->stack_refcount) ?
-		task_stack_page(tsk) : NULL;
-}
-
-extern void put_task_stack(struct task_struct *tsk);
-#else
-static inline void *try_get_task_stack(struct task_struct *tsk)
-{
-	return task_stack_page(tsk);
-}
-
-static inline void put_task_stack(struct task_struct *tsk) {}
-#endif
-
-#define task_stack_end_corrupted(task) \
-		(*(end_of_stack(task)) != STACK_END_MAGIC)
-
-static inline int object_is_on_stack(void *obj)
-{
-	void *stack = task_stack_page(current);
-
-	return (obj >= stack) && (obj < (stack + THREAD_SIZE));
-}
-
-extern void thread_stack_cache_init(void);
-
-#ifdef CONFIG_DEBUG_STACK_USAGE
-static inline unsigned long stack_not_used(struct task_struct *p)
-{
-	unsigned long *n = end_of_stack(p);
-
-	do { 	/* Skip over canary */
-# ifdef CONFIG_STACK_GROWSUP
-		n--;
-# else
-		n++;
-# endif
-	} while (!*n);
-
-# ifdef CONFIG_STACK_GROWSUP
-	return (unsigned long)end_of_stack(p) - (unsigned long)n;
-# else
-	return (unsigned long)n - (unsigned long)end_of_stack(p);
-# endif
-}
-#endif
-extern void set_task_stack_end_magic(struct task_struct *tsk);
-
 /* set thread flags in other task's structures
  * - see asm/thread_info.h for TIF_xxxx flags available
  */
diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
index d2d578e85f7d..aaa5c2a6a0e9 100644
--- a/include/linux/sched/task_stack.h
+++ b/include/linux/sched/task_stack.h
@@ -1,7 +1,111 @@
 #ifndef _LINUX_SCHED_TASK_STACK_H
 #define _LINUX_SCHED_TASK_STACK_H
 
+/*
+ * task->stack (kernel stack) handling interfaces:
+ */
+
 #include <linux/sched.h>
 #include <linux/magic.h>
 
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+
+/*
+ * When accessing the stack of a non-current task that might exit, use
+ * try_get_task_stack() instead.  task_stack_page will return a pointer
+ * that could get freed out from under you.
+ */
+static inline void *task_stack_page(const struct task_struct *task)
+{
+	return task->stack;
+}
+
+#define setup_thread_stack(new,old)	do { } while(0)
+
+static inline unsigned long *end_of_stack(const struct task_struct *task)
+{
+	return task->stack;
+}
+
+#elif !defined(__HAVE_THREAD_FUNCTIONS)
+
+#define task_stack_page(task)	((void *)(task)->stack)
+
+static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
+{
+	*task_thread_info(p) = *task_thread_info(org);
+	task_thread_info(p)->task = p;
+}
+
+/*
+ * Return the address of the last usable long on the stack.
+ *
+ * When the stack grows down, this is just above the thread
+ * info struct. Going any lower will corrupt the threadinfo.
+ *
+ * When the stack grows up, this is the highest address.
+ * Beyond that position, we corrupt data on the next page.
+ */
+static inline unsigned long *end_of_stack(struct task_struct *p)
+{
+#ifdef CONFIG_STACK_GROWSUP
+	return (unsigned long *)((unsigned long)task_thread_info(p) + THREAD_SIZE) - 1;
+#else
+	return (unsigned long *)(task_thread_info(p) + 1);
+#endif
+}
+
+#endif
+
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+static inline void *try_get_task_stack(struct task_struct *tsk)
+{
+	return atomic_inc_not_zero(&tsk->stack_refcount) ?
+		task_stack_page(tsk) : NULL;
+}
+
+extern void put_task_stack(struct task_struct *tsk);
+#else
+static inline void *try_get_task_stack(struct task_struct *tsk)
+{
+	return task_stack_page(tsk);
+}
+
+static inline void put_task_stack(struct task_struct *tsk) {}
+#endif
+
+#define task_stack_end_corrupted(task) \
+		(*(end_of_stack(task)) != STACK_END_MAGIC)
+
+static inline int object_is_on_stack(void *obj)
+{
+	void *stack = task_stack_page(current);
+
+	return (obj >= stack) && (obj < (stack + THREAD_SIZE));
+}
+
+extern void thread_stack_cache_init(void);
+
+#ifdef CONFIG_DEBUG_STACK_USAGE
+static inline unsigned long stack_not_used(struct task_struct *p)
+{
+	unsigned long *n = end_of_stack(p);
+
+	do { 	/* Skip over canary */
+# ifdef CONFIG_STACK_GROWSUP
+		n--;
+# else
+		n++;
+# endif
+	} while (!*n);
+
+# ifdef CONFIG_STACK_GROWSUP
+	return (unsigned long)end_of_stack(p) - (unsigned long)n;
+# else
+	return (unsigned long)n - (unsigned long)end_of_stack(p);
+# endif
+}
+#endif
+extern void set_task_stack_end_magic(struct task_struct *tsk);
+
 #endif /* _LINUX_SCHED_TASK_STACK_H */
-- 
cgit v1.2.3


From 70806b21e4d64f01193a2b64a053b75ff53a2b10 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 23:15:21 +0100
Subject: sched/headers: Move the 'root_task_group' declaration to
 <linux/sched/autogroup.h>

Also remove the duplicate declaration from <linux/init_task.h>.

( That declaration was originally duplicated for dependency hell reasons,
  but there's no problem including the much smaller <linux/sched/autogroup.h>
  header now, to pick up the right prototype. )

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/init_task.h       | 2 --
 include/linux/sched.h           | 4 ----
 include/linux/sched/autogroup.h | 5 +++++
 3 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index f6841c19c913..91d9049f0039 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -151,8 +151,6 @@ extern struct group_info init_groups;
 
 extern struct cred init_cred;
 
-extern struct task_group root_task_group;
-
 #ifdef CONFIG_CGROUP_SCHED
 # define INIT_CGROUP_SCHED(tsk)						\
 	.sched_task_group = &root_task_group,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index af9590c8bfb0..4934733bd6cb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1707,10 +1707,6 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
 extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 
-#ifdef CONFIG_CGROUP_SCHED
-extern struct task_group root_task_group;
-#endif /* CONFIG_CGROUP_SCHED */
-
 extern int task_can_switch_user(struct user_struct *up,
 					struct task_struct *tsk);
 
diff --git a/include/linux/sched/autogroup.h b/include/linux/sched/autogroup.h
index 586bdf37330d..fd6855548d0c 100644
--- a/include/linux/sched/autogroup.h
+++ b/include/linux/sched/autogroup.h
@@ -4,6 +4,7 @@
 #include <linux/sched.h>
 
 struct signal_struct;
+struct task_group;
 struct seq_file;
 
 #ifdef CONFIG_SCHED_AUTOGROUP
@@ -24,4 +25,8 @@ static inline void sched_autogroup_exit(struct signal_struct *sig) { }
 static inline void sched_autogroup_exit_task(struct task_struct *p) { }
 #endif
 
+#ifdef CONFIG_CGROUP_SCHED
+extern struct task_group root_task_group;
+#endif /* CONFIG_CGROUP_SCHED */
+
 #endif /* _LINUX_SCHED_AUTOGROUP_H */
-- 
cgit v1.2.3


From 76960dbf9b235b957d9d730be2c6c2a70f709026 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 23:43:50 +0100
Subject: signals: Move signal data types from <linux/signal.h> to
 <linux/signal_types.h>

Separate out just the pure data types - sched.h will be able to use
this reduced size header.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/signal.h       | 54 ----------------------------------------
 include/linux/signal_types.h | 59 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/signal.h b/include/linux/signal.h
index d1c2b05a7a55..94ad6eea9550 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -8,24 +8,6 @@ struct task_struct;
 
 /* for sysctl */
 extern int print_fatal_signals;
-/*
- * Real Time signals may be queued.
- */
-
-struct sigqueue {
-	struct list_head list;
-	int flags;
-	siginfo_t info;
-	struct user_struct *user;
-};
-
-/* flags values. */
-#define SIGQUEUE_PREALLOC	1
-
-struct sigpending {
-	struct list_head list;
-	sigset_t signal;
-};
 
 #ifndef HAVE_ARCH_COPY_SIGINFO
 
@@ -271,42 +253,6 @@ extern void set_current_blocked(sigset_t *);
 extern void __set_current_blocked(const sigset_t *);
 extern int show_unhandled_signals;
 
-struct sigaction {
-#ifndef __ARCH_HAS_IRIX_SIGACTION
-	__sighandler_t	sa_handler;
-	unsigned long	sa_flags;
-#else
-	unsigned int	sa_flags;
-	__sighandler_t	sa_handler;
-#endif
-#ifdef __ARCH_HAS_SA_RESTORER
-	__sigrestore_t sa_restorer;
-#endif
-	sigset_t	sa_mask;	/* mask last for extensibility */
-};
-
-struct k_sigaction {
-	struct sigaction sa;
-#ifdef __ARCH_HAS_KA_RESTORER
-	__sigrestore_t ka_restorer;
-#endif
-};
- 
-#ifdef CONFIG_OLD_SIGACTION
-struct old_sigaction {
-	__sighandler_t sa_handler;
-	old_sigset_t sa_mask;
-	unsigned long sa_flags;
-	__sigrestore_t sa_restorer;
-};
-#endif
-
-struct ksignal {
-	struct k_sigaction ka;
-	siginfo_t info;
-	int sig;
-};
-
 extern int get_signal(struct ksignal *ksig);
 extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping);
 extern void exit_signals(struct task_struct *tsk);
diff --git a/include/linux/signal_types.h b/include/linux/signal_types.h
index 28799c88f490..16d862a3d8f3 100644
--- a/include/linux/signal_types.h
+++ b/include/linux/signal_types.h
@@ -1,7 +1,66 @@
 #ifndef _LINUX_SIGNAL_TYPES_H
 #define _LINUX_SIGNAL_TYPES_H
 
+/*
+ * Basic signal handling related data type definitions:
+ */
+
 #include <linux/list.h>
 #include <uapi/linux/signal.h>
 
+/*
+ * Real Time signals may be queued.
+ */
+
+struct sigqueue {
+	struct list_head list;
+	int flags;
+	siginfo_t info;
+	struct user_struct *user;
+};
+
+/* flags values. */
+#define SIGQUEUE_PREALLOC	1
+
+struct sigpending {
+	struct list_head list;
+	sigset_t signal;
+};
+
+struct sigaction {
+#ifndef __ARCH_HAS_IRIX_SIGACTION
+	__sighandler_t	sa_handler;
+	unsigned long	sa_flags;
+#else
+	unsigned int	sa_flags;
+	__sighandler_t	sa_handler;
+#endif
+#ifdef __ARCH_HAS_SA_RESTORER
+	__sigrestore_t sa_restorer;
+#endif
+	sigset_t	sa_mask;	/* mask last for extensibility */
+};
+
+struct k_sigaction {
+	struct sigaction sa;
+#ifdef __ARCH_HAS_KA_RESTORER
+	__sigrestore_t ka_restorer;
+#endif
+};
+
+#ifdef CONFIG_OLD_SIGACTION
+struct old_sigaction {
+	__sighandler_t sa_handler;
+	old_sigset_t sa_mask;
+	unsigned long sa_flags;
+	__sigrestore_t sa_restorer;
+};
+#endif
+
+struct ksignal {
+	struct k_sigaction ka;
+	siginfo_t info;
+	int sig;
+};
+
 #endif /* _LINUX_SIGNAL_TYPES_H */
-- 
cgit v1.2.3


From 9e7d2e44dd88ba7e29c165b6fca428e384afa5a8 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 4 Feb 2017 00:12:19 +0100
Subject: mm/headers, sched/headers: Move task related MM types from
 <linux/mm_types.> to <linux/mm_types_task.h>

Separate all the MM types that are embedded directly in 'struct task_struct'
into the <linux/mm_types_task.h> header.

The goal is to include this header in <linux/sched.h>, not the full <linux/mm_types.h>
header, to reduce the size, complexity and coupling of <linux/sched.h>.

(This patch does not change <linux/sched.h> yet.)

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/mm_types.h      | 49 --------------------------------------
 include/linux/mm_types_task.h | 55 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 28bc710d3467..f60f45fe226f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -24,11 +24,6 @@
 struct address_space;
 struct mem_cgroup;
 
-#define USE_SPLIT_PTE_PTLOCKS	(NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
-#define USE_SPLIT_PMD_PTLOCKS	(USE_SPLIT_PTE_PTLOCKS && \
-		IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK))
-#define ALLOC_SPLIT_PTLOCKS	(SPINLOCK_SIZE > BITS_PER_LONG/8)
-
 /*
  * Each physical page in the system has a struct page associated with
  * it to keep track of whatever it is we are using the page for at the
@@ -231,17 +226,6 @@ struct page {
 #endif
 ;
 
-struct page_frag {
-	struct page *page;
-#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
-	__u32 offset;
-	__u32 size;
-#else
-	__u16 offset;
-	__u16 size;
-#endif
-};
-
 #define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
 #define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
 
@@ -360,18 +344,6 @@ struct vm_area_struct {
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
 };
 
-/*
- * The per task VMA cache array:
- */
-#define VMACACHE_BITS 2
-#define VMACACHE_SIZE (1U << VMACACHE_BITS)
-#define VMACACHE_MASK (VMACACHE_SIZE - 1)
-
-struct vmacache {
-	u32 seqnum;
-	struct vm_area_struct *vmas[VMACACHE_SIZE];
-};
-
 struct core_thread {
 	struct task_struct *task;
 	struct core_thread *next;
@@ -383,27 +355,6 @@ struct core_state {
 	struct completion startup;
 };
 
-enum {
-	MM_FILEPAGES,	/* Resident file mapping pages */
-	MM_ANONPAGES,	/* Resident anonymous pages */
-	MM_SWAPENTS,	/* Anonymous swap entries */
-	MM_SHMEMPAGES,	/* Resident shared memory pages */
-	NR_MM_COUNTERS
-};
-
-#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU)
-#define SPLIT_RSS_COUNTING
-/* per-thread cached information, */
-struct task_rss_stat {
-	int events;	/* for synchronization threshold */
-	int count[NR_MM_COUNTERS];
-};
-#endif /* USE_SPLIT_PTE_PTLOCKS */
-
-struct mm_rss_stat {
-	atomic_long_t count[NR_MM_COUNTERS];
-};
-
 struct kioctx_table;
 struct mm_struct {
 	struct vm_area_struct *mmap;		/* list of VMAs */
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index 2419d302f03c..9526d8b9fe0e 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -1,10 +1,65 @@
 #ifndef _LINUX_MM_TYPES_TASK_H
 #define _LINUX_MM_TYPES_TASK_H
 
+/*
+ * Here are the definitions of the MM data types that are embedded in 'struct task_struct'.
+ *
+ * (These are defined separately to decouple sched.h from mm_types.h as much as possible.)
+ */
+
 #include <linux/types.h>
 #include <linux/threads.h>
 #include <linux/atomic.h>
 
 #include <asm/page.h>
 
+#define USE_SPLIT_PTE_PTLOCKS	(NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
+#define USE_SPLIT_PMD_PTLOCKS	(USE_SPLIT_PTE_PTLOCKS && \
+		IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK))
+#define ALLOC_SPLIT_PTLOCKS	(SPINLOCK_SIZE > BITS_PER_LONG/8)
+
+/*
+ * The per task VMA cache array:
+ */
+#define VMACACHE_BITS 2
+#define VMACACHE_SIZE (1U << VMACACHE_BITS)
+#define VMACACHE_MASK (VMACACHE_SIZE - 1)
+
+struct vmacache {
+	u32 seqnum;
+	struct vm_area_struct *vmas[VMACACHE_SIZE];
+};
+
+enum {
+	MM_FILEPAGES,	/* Resident file mapping pages */
+	MM_ANONPAGES,	/* Resident anonymous pages */
+	MM_SWAPENTS,	/* Anonymous swap entries */
+	MM_SHMEMPAGES,	/* Resident shared memory pages */
+	NR_MM_COUNTERS
+};
+
+#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU)
+#define SPLIT_RSS_COUNTING
+/* per-thread cached information, */
+struct task_rss_stat {
+	int events;	/* for synchronization threshold */
+	int count[NR_MM_COUNTERS];
+};
+#endif /* USE_SPLIT_PTE_PTLOCKS */
+
+struct mm_rss_stat {
+	atomic_long_t count[NR_MM_COUNTERS];
+};
+
+struct page_frag {
+	struct page *page;
+#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
+	__u32 offset;
+	__u32 size;
+#else
+	__u16 offset;
+	__u16 size;
+#endif
+};
+
 #endif /* _LINUX_MM_TYPES_TASK_H */
-- 
cgit v1.2.3


From 77ba809e8b39b4e384df0433e2bd3dd0907dad29 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 4 Feb 2017 00:16:44 +0100
Subject: sched/headers: Remove the <linux/mm_types.h> dependency from
 <linux/sched.h>

Use the freshly introduced, reduced size <linux/mm_types_task.h> header instead.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/video/fbdev/auo_k190x.c | 1 +
 include/linux/sched.h           | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/video/fbdev/auo_k190x.c b/drivers/video/fbdev/auo_k190x.c
index 9580374667ba..0d06038324e0 100644
--- a/drivers/video/fbdev/auo_k190x.c
+++ b/drivers/video/fbdev/auo_k190x.c
@@ -9,6 +9,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/sched/mm.h>
 #include <linux/kernel.h>
 #include <linux/gpio.h>
 #include <linux/platform_device.h>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4934733bd6cb..3eb284741790 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -8,7 +8,7 @@
 #include <linux/capability.h>
 #include <linux/mutex.h>
 #include <linux/plist.h>
-#include <linux/mm_types.h>
+#include <linux/mm_types_task.h>
 #include <asm/ptrace.h>
 
 #include <linux/sem.h>
-- 
cgit v1.2.3


From cdc75e9f7b14f29efcf4b162a3c673733e96db79 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 4 Feb 2017 01:20:53 +0100
Subject: sched/headers: Move 'init_task' and 'init_thread_union' from
 <linux/sched.h> to <linux/sched/task.h>

'init_task' is really not part of core scheduler APIs but part of
the fork() interface between the scheduler and process management.

So move the declarations.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h      | 3 ---
 include/linux/sched/task.h | 6 ++++++
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3eb284741790..4ab105a2a8f9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1469,9 +1469,6 @@ static inline int kstack_end(void *addr)
 }
 #endif
 
-extern union thread_union init_thread_union;
-extern struct task_struct init_task;
-
 extern struct pid_namespace init_pid_ns;
 
 /*
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 20ed9108f261..1be049a18d1b 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -8,6 +8,9 @@
 
 #include <linux/sched.h>
 
+struct task_struct;
+union thread_union;
+
 /*
  * This serializes "schedule()" and also protects
  * the run-queue from deletions/modifications (but
@@ -17,6 +20,9 @@
 extern rwlock_t tasklist_lock;
 extern spinlock_t mmlist_lock;
 
+extern union thread_union init_thread_union;
+extern struct task_struct init_task;
+
 #ifdef CONFIG_PROVE_RCU
 extern int lockdep_tasklist_lock_is_held(void);
 #endif /* #ifdef CONFIG_PROVE_RCU */
-- 
cgit v1.2.3


From 56cd697366b6d6f67acb6c58ac7f3b185d11ef07 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 6 Feb 2017 10:57:33 +0100
Subject: sched/headers: Move the task_lock()/unlock() APIs to
 <linux/sched/task.h>

The task_lock()/task_unlock() APIs are not realated to core scheduling,
they are task lifetime APIs, i.e. they belong into <linux/sched/task.h>.

Move them.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h      | 20 --------------------
 include/linux/sched/task.h | 20 ++++++++++++++++++++
 kernel/cgroup/cgroup-v1.c  |  1 +
 kernel/cgroup/namespace.c  |  2 +-
 4 files changed, 22 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4ab105a2a8f9..d481c129a822 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1526,26 +1526,6 @@ static inline unsigned long wait_task_inactive(struct task_struct *p,
 }
 #endif
 
-/*
- * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
- * subscriptions and synchronises with wait4().  Also used in procfs.  Also
- * pins the final release of task.io_context.  Also protects ->cpuset and
- * ->cgroup.subsys[]. And ->vfork_done.
- *
- * Nests both inside and outside of read_lock(&tasklist_lock).
- * It must not be nested with write_lock_irq(&tasklist_lock),
- * neither inside nor outside.
- */
-static inline void task_lock(struct task_struct *p)
-{
-	spin_lock(&p->alloc_lock);
-}
-
-static inline void task_unlock(struct task_struct *p)
-{
-	spin_unlock(&p->alloc_lock);
-}
-
 /* set thread flags in other task's structures
  * - see asm/thread_info.h for TIF_xxxx flags available
  */
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 1be049a18d1b..2be9fde588a7 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -91,4 +91,24 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
 }
 #endif
 
+/*
+ * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
+ * subscriptions and synchronises with wait4().  Also used in procfs.  Also
+ * pins the final release of task.io_context.  Also protects ->cpuset and
+ * ->cgroup.subsys[]. And ->vfork_done.
+ *
+ * Nests both inside and outside of read_lock(&tasklist_lock).
+ * It must not be nested with write_lock_irq(&tasklist_lock),
+ * neither inside nor outside.
+ */
+static inline void task_lock(struct task_struct *p)
+{
+	spin_lock(&p->alloc_lock);
+}
+
+static inline void task_unlock(struct task_struct *p)
+{
+	spin_unlock(&p->alloc_lock);
+}
+
 #endif /* _LINUX_SCHED_TASK_H */
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 08d2cb605101..abc585858685 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -6,6 +6,7 @@
 #include <linux/delay.h>
 #include <linux/mm.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/task.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/delayacct.h>
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index cff7ea62c38f..96d38dab6fb2 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -1,6 +1,6 @@
 #include "cgroup-internal.h"
 
-#include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/slab.h>
 #include <linux/nsproxy.h>
 #include <linux/proc_ns.h>
-- 
cgit v1.2.3


From 1050b27c52f615bc0e772b3119881e5304ccde6b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 5 Feb 2017 11:48:36 +0100
Subject: sched/headers: Move cputime functionality from <linux/sched.h> and
 <linux/cputime.h> into <linux/sched/cputime.h>

Move cputime related functionality out of <linux/sched.h>, as most code
that includes <linux/sched.h> does not use that functionality.

Move data types that are not included in task_struct directly to
the signal definitions, into <linux/sched/signal.h>.

Also merge the (small) existing <linux/cputime.h> header into <linux/sched/cputime.h>.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/cputime.h       |  13 ---
 include/linux/sched.h         |  89 ---------------------
 include/linux/sched/cputime.h | 182 +++++++++++++++++++++++++++++++++++++++++-
 include/linux/sched/signal.h  |  33 ++++++++
 kernel/sched/stats.h          | 111 --------------------------
 5 files changed, 214 insertions(+), 214 deletions(-)
 delete mode 100644 include/linux/cputime.h

(limited to 'include/linux')

diff --git a/include/linux/cputime.h b/include/linux/cputime.h
deleted file mode 100644
index a691dc4ddc13..000000000000
--- a/include/linux/cputime.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef __LINUX_CPUTIME_H
-#define __LINUX_CPUTIME_H
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-#include <asm/cputime.h>
-
-#ifndef cputime_to_nsecs
-# define cputime_to_nsecs(__ct)	\
-	(cputime_to_usecs(__ct) * NSEC_PER_USEC)
-#endif
-
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
-#endif /* __LINUX_CPUTIME_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d481c129a822..2b43f55e4956 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -219,14 +219,6 @@ struct prev_cputime {
 #endif
 };
 
-static inline void prev_cputime_init(struct prev_cputime *prev)
-{
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	prev->utime = prev->stime = 0;
-	raw_spin_lock_init(&prev->lock);
-#endif
-}
-
 /**
  * struct task_cputime - collected CPU time counts
  * @utime:		time spent in user mode, in nanoseconds
@@ -248,40 +240,6 @@ struct task_cputime {
 #define prof_exp	stime
 #define sched_exp	sum_exec_runtime
 
-/*
- * This is the atomic variant of task_cputime, which can be used for
- * storing and updating task_cputime statistics without locking.
- */
-struct task_cputime_atomic {
-	atomic64_t utime;
-	atomic64_t stime;
-	atomic64_t sum_exec_runtime;
-};
-
-#define INIT_CPUTIME_ATOMIC \
-	(struct task_cputime_atomic) {				\
-		.utime = ATOMIC64_INIT(0),			\
-		.stime = ATOMIC64_INIT(0),			\
-		.sum_exec_runtime = ATOMIC64_INIT(0),		\
-	}
-
-/**
- * struct thread_group_cputimer - thread group interval timer counts
- * @cputime_atomic:	atomic thread group interval timers.
- * @running:		true when there are timers running and
- *			@cputime_atomic receives updates.
- * @checking_timer:	true when a thread in the group is in the
- *			process of checking for thread group timers.
- *
- * This structure contains the version of task_cputime, above, that is
- * used for thread group CPU timer calculations.
- */
-struct thread_group_cputimer {
-	struct task_cputime_atomic cputime_atomic;
-	bool running;
-	bool checking_timer;
-};
-
 #include <linux/rwsem.h>
 
 #ifdef CONFIG_SCHED_INFO
@@ -1234,44 +1192,6 @@ static inline void put_task_struct(struct task_struct *t)
 struct task_struct *task_rcu_dereference(struct task_struct **ptask);
 struct task_struct *try_get_task_struct(struct task_struct **ptask);
 
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-extern void task_cputime(struct task_struct *t,
-			 u64 *utime, u64 *stime);
-extern u64 task_gtime(struct task_struct *t);
-#else
-static inline void task_cputime(struct task_struct *t,
-				u64 *utime, u64 *stime)
-{
-	*utime = t->utime;
-	*stime = t->stime;
-}
-
-static inline u64 task_gtime(struct task_struct *t)
-{
-	return t->gtime;
-}
-#endif
-
-#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
-static inline void task_cputime_scaled(struct task_struct *t,
-				       u64 *utimescaled,
-				       u64 *stimescaled)
-{
-	*utimescaled = t->utimescaled;
-	*stimescaled = t->stimescaled;
-}
-#else
-static inline void task_cputime_scaled(struct task_struct *t,
-				       u64 *utimescaled,
-				       u64 *stimescaled)
-{
-	task_cputime(t, utimescaled, stimescaled);
-}
-#endif
-
-extern void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st);
-extern void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st);
-
 /*
  * Per process flags
  */
@@ -1395,9 +1315,6 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
 #define cpu_relax_yield() cpu_relax()
 #endif
 
-extern unsigned long long
-task_sched_runtime(struct task_struct *task);
-
 /* sched_exec is called by processes performing an exec */
 #ifdef CONFIG_SMP
 extern void sched_exec(void);
@@ -1629,12 +1546,6 @@ static __always_inline bool need_resched(void)
 	return unlikely(tif_need_resched());
 }
 
-/*
- * Thread group CPU time accounting.
- */
-void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
-void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
-
 /*
  * Wrappers for p->thread_info->cpu access. No-op on UP.
  */
diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h
index 6ed4fe43de28..4c5b9735c1ae 100644
--- a/include/linux/sched/cputime.h
+++ b/include/linux/sched/cputime.h
@@ -2,6 +2,186 @@
 #define _LINUX_SCHED_CPUTIME_H
 
 #include <linux/sched/signal.h>
-#include <linux/cputime.h>
+
+/*
+ * cputime accounting APIs:
+ */
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+#include <asm/cputime.h>
+
+#ifndef cputime_to_nsecs
+# define cputime_to_nsecs(__ct)	\
+	(cputime_to_usecs(__ct) * NSEC_PER_USEC)
+#endif
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+extern void task_cputime(struct task_struct *t,
+			 u64 *utime, u64 *stime);
+extern u64 task_gtime(struct task_struct *t);
+#else
+static inline void task_cputime(struct task_struct *t,
+				u64 *utime, u64 *stime)
+{
+	*utime = t->utime;
+	*stime = t->stime;
+}
+
+static inline u64 task_gtime(struct task_struct *t)
+{
+	return t->gtime;
+}
+#endif
+
+#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
+static inline void task_cputime_scaled(struct task_struct *t,
+				       u64 *utimescaled,
+				       u64 *stimescaled)
+{
+	*utimescaled = t->utimescaled;
+	*stimescaled = t->stimescaled;
+}
+#else
+static inline void task_cputime_scaled(struct task_struct *t,
+				       u64 *utimescaled,
+				       u64 *stimescaled)
+{
+	task_cputime(t, utimescaled, stimescaled);
+}
+#endif
+
+extern void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st);
+extern void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st);
+
+
+/*
+ * Thread group CPU time accounting.
+ */
+void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
+void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
+
+
+/*
+ * The following are functions that support scheduler-internal time accounting.
+ * These functions are generally called at the timer tick.  None of this depends
+ * on CONFIG_SCHEDSTATS.
+ */
+
+/**
+ * get_running_cputimer - return &tsk->signal->cputimer if cputimer is running
+ *
+ * @tsk:	Pointer to target task.
+ */
+#ifdef CONFIG_POSIX_TIMERS
+static inline
+struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk)
+{
+	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+
+	/* Check if cputimer isn't running. This is accessed without locking. */
+	if (!READ_ONCE(cputimer->running))
+		return NULL;
+
+	/*
+	 * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime
+	 * in __exit_signal(), we won't account to the signal struct further
+	 * cputime consumed by that task, even though the task can still be
+	 * ticking after __exit_signal().
+	 *
+	 * In order to keep a consistent behaviour between thread group cputime
+	 * and thread group cputimer accounting, lets also ignore the cputime
+	 * elapsing after __exit_signal() in any thread group timer running.
+	 *
+	 * This makes sure that POSIX CPU clocks and timers are synchronized, so
+	 * that a POSIX CPU timer won't expire while the corresponding POSIX CPU
+	 * clock delta is behind the expiring timer value.
+	 */
+	if (unlikely(!tsk->sighand))
+		return NULL;
+
+	return cputimer;
+}
+#else
+static inline
+struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk)
+{
+	return NULL;
+}
+#endif
+
+/**
+ * account_group_user_time - Maintain utime for a thread group.
+ *
+ * @tsk:	Pointer to task structure.
+ * @cputime:	Time value by which to increment the utime field of the
+ *		thread_group_cputime structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the utime field there.
+ */
+static inline void account_group_user_time(struct task_struct *tsk,
+					   u64 cputime)
+{
+	struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);
+
+	if (!cputimer)
+		return;
+
+	atomic64_add(cputime, &cputimer->cputime_atomic.utime);
+}
+
+/**
+ * account_group_system_time - Maintain stime for a thread group.
+ *
+ * @tsk:	Pointer to task structure.
+ * @cputime:	Time value by which to increment the stime field of the
+ *		thread_group_cputime structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the stime field there.
+ */
+static inline void account_group_system_time(struct task_struct *tsk,
+					     u64 cputime)
+{
+	struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);
+
+	if (!cputimer)
+		return;
+
+	atomic64_add(cputime, &cputimer->cputime_atomic.stime);
+}
+
+/**
+ * account_group_exec_runtime - Maintain exec runtime for a thread group.
+ *
+ * @tsk:	Pointer to task structure.
+ * @ns:		Time value by which to increment the sum_exec_runtime field
+ *		of the thread_group_cputime structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the sum_exec_runtime field there.
+ */
+static inline void account_group_exec_runtime(struct task_struct *tsk,
+					      unsigned long long ns)
+{
+	struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);
+
+	if (!cputimer)
+		return;
+
+	atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
+}
+
+static inline void prev_cputime_init(struct prev_cputime *prev)
+{
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+	prev->utime = prev->stime = 0;
+	raw_spin_lock_init(&prev->lock);
+#endif
+}
+
+extern unsigned long long
+task_sched_runtime(struct task_struct *task);
 
 #endif /* _LINUX_SCHED_CPUTIME_H */
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index b3545fb4333a..2cf446704cd4 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -35,6 +35,39 @@ struct cpu_itimer {
 	u64 incr;
 };
 
+/*
+ * This is the atomic variant of task_cputime, which can be used for
+ * storing and updating task_cputime statistics without locking.
+ */
+struct task_cputime_atomic {
+	atomic64_t utime;
+	atomic64_t stime;
+	atomic64_t sum_exec_runtime;
+};
+
+#define INIT_CPUTIME_ATOMIC \
+	(struct task_cputime_atomic) {				\
+		.utime = ATOMIC64_INIT(0),			\
+		.stime = ATOMIC64_INIT(0),			\
+		.sum_exec_runtime = ATOMIC64_INIT(0),		\
+	}
+/**
+ * struct thread_group_cputimer - thread group interval timer counts
+ * @cputime_atomic:	atomic thread group interval timers.
+ * @running:		true when there are timers running and
+ *			@cputime_atomic receives updates.
+ * @checking_timer:	true when a thread in the group is in the
+ *			process of checking for thread group timers.
+ *
+ * This structure contains the version of task_cputime, above, that is
+ * used for thread group CPU timer calculations.
+ */
+struct thread_group_cputimer {
+	struct task_cputime_atomic cputime_atomic;
+	bool running;
+	bool checking_timer;
+};
+
 /*
  * NOTE! "signal_struct" does not have its own
  * locking, because a shared signal_struct always
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index bf0da0aa0a14..d5710651043b 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -164,114 +164,3 @@ sched_info_switch(struct rq *rq,
 #define sched_info_arrive(rq, next)		do { } while (0)
 #define sched_info_switch(rq, t, next)		do { } while (0)
 #endif /* CONFIG_SCHED_INFO */
-
-/*
- * The following are functions that support scheduler-internal time accounting.
- * These functions are generally called at the timer tick.  None of this depends
- * on CONFIG_SCHEDSTATS.
- */
-
-/**
- * get_running_cputimer - return &tsk->signal->cputimer if cputimer is running
- *
- * @tsk:	Pointer to target task.
- */
-#ifdef CONFIG_POSIX_TIMERS
-static inline
-struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk)
-{
-	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
-
-	/* Check if cputimer isn't running. This is accessed without locking. */
-	if (!READ_ONCE(cputimer->running))
-		return NULL;
-
-	/*
-	 * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime
-	 * in __exit_signal(), we won't account to the signal struct further
-	 * cputime consumed by that task, even though the task can still be
-	 * ticking after __exit_signal().
-	 *
-	 * In order to keep a consistent behaviour between thread group cputime
-	 * and thread group cputimer accounting, lets also ignore the cputime
-	 * elapsing after __exit_signal() in any thread group timer running.
-	 *
-	 * This makes sure that POSIX CPU clocks and timers are synchronized, so
-	 * that a POSIX CPU timer won't expire while the corresponding POSIX CPU
-	 * clock delta is behind the expiring timer value.
-	 */
-	if (unlikely(!tsk->sighand))
-		return NULL;
-
-	return cputimer;
-}
-#else
-static inline
-struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk)
-{
-	return NULL;
-}
-#endif
-
-/**
- * account_group_user_time - Maintain utime for a thread group.
- *
- * @tsk:	Pointer to task structure.
- * @cputime:	Time value by which to increment the utime field of the
- *		thread_group_cputime structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the utime field there.
- */
-static inline void account_group_user_time(struct task_struct *tsk,
-					   u64 cputime)
-{
-	struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);
-
-	if (!cputimer)
-		return;
-
-	atomic64_add(cputime, &cputimer->cputime_atomic.utime);
-}
-
-/**
- * account_group_system_time - Maintain stime for a thread group.
- *
- * @tsk:	Pointer to task structure.
- * @cputime:	Time value by which to increment the stime field of the
- *		thread_group_cputime structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the stime field there.
- */
-static inline void account_group_system_time(struct task_struct *tsk,
-					     u64 cputime)
-{
-	struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);
-
-	if (!cputimer)
-		return;
-
-	atomic64_add(cputime, &cputimer->cputime_atomic.stime);
-}
-
-/**
- * account_group_exec_runtime - Maintain exec runtime for a thread group.
- *
- * @tsk:	Pointer to task structure.
- * @ns:		Time value by which to increment the sum_exec_runtime field
- *		of the thread_group_cputime structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the sum_exec_runtime field there.
- */
-static inline void account_group_exec_runtime(struct task_struct *tsk,
-					      unsigned long long ns)
-{
-	struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);
-
-	if (!cputimer)
-		return;
-
-	atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
-}
-- 
cgit v1.2.3


From a2d7a7463c38dd14b845721aac3583a26a97c66d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 5 Feb 2017 12:07:04 +0100
Subject: sched/headers: Move sched_info_on() and force_schedstat_enabled()
 from <linux/sched.h> to <linux/sched/stat.h>

These APIs are not core scheduler but scheduler statistics related.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h      | 16 ----------------
 include/linux/sched/stat.h | 16 ++++++++++++++++
 2 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2b43f55e4956..707eee099332 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -254,22 +254,6 @@ struct sched_info {
 };
 #endif /* CONFIG_SCHED_INFO */
 
-static inline int sched_info_on(void)
-{
-#ifdef CONFIG_SCHEDSTATS
-	return 1;
-#elif defined(CONFIG_TASK_DELAY_ACCT)
-	extern int delayacct_on;
-	return delayacct_on;
-#else
-	return 0;
-#endif
-}
-
-#ifdef CONFIG_SCHEDSTATS
-void force_schedstat_enabled(void);
-#endif
-
 /*
  * Integer metrics need fixed point arithmetic, e.g., sched/fair
  * has a few: load, load_avg, util_avg, freq, and capacity.
diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h
index 0d52ceb6d3ae..d8cedf27083e 100644
--- a/include/linux/sched/stat.h
+++ b/include/linux/sched/stat.h
@@ -21,4 +21,20 @@ extern unsigned long nr_iowait(void);
 extern unsigned long nr_iowait_cpu(int cpu);
 extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
 
+static inline int sched_info_on(void)
+{
+#ifdef CONFIG_SCHEDSTATS
+	return 1;
+#elif defined(CONFIG_TASK_DELAY_ACCT)
+	extern int delayacct_on;
+	return delayacct_on;
+#else
+	return 0;
+#endif
+}
+
+#ifdef CONFIG_SCHEDSTATS
+void force_schedstat_enabled(void);
+#endif
+
 #endif /* _LINUX_SCHED_STAT_H */
-- 
cgit v1.2.3


From 28851755990c832d7050d5e1e2c91ed9d9e6fe59 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 5 Feb 2017 14:24:31 +0100
Subject: sched/headers, vfs/execve: Move the do_execve*() prototypes from
 <linux/sched.h> to <linux/binfmts.h>

These are not scheduler related.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/binfmts.h | 10 ++++++++++
 include/linux/sched.h   |  8 --------
 2 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 1303b570b18c..05488da3aee9 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -6,6 +6,8 @@
 #include <asm/exec.h>
 #include <uapi/linux/binfmts.h>
 
+struct filename;
+
 #define CORENAME_MAX_SIZE 128
 
 /*
@@ -123,4 +125,12 @@ extern void install_exec_creds(struct linux_binprm *bprm);
 extern void set_binfmt(struct linux_binfmt *new);
 extern ssize_t read_code(struct file *, unsigned long, loff_t, size_t);
 
+extern int do_execve(struct filename *,
+		     const char __user * const __user *,
+		     const char __user * const __user *);
+extern int do_execveat(int, struct filename *,
+		       const char __user * const __user *,
+		       const char __user * const __user *,
+		       int);
+
 #endif /* _LINUX_BINFMTS_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 707eee099332..5f2267e41fde 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1400,14 +1400,6 @@ extern void exit_files(struct task_struct *);
 
 extern void exit_itimers(struct signal_struct *);
 
-extern int do_execve(struct filename *,
-		     const char __user * const __user *,
-		     const char __user * const __user *);
-extern int do_execveat(int, struct filename *,
-		       const char __user * const __user *,
-		       const char __user * const __user *,
-		       int);
-
 extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec);
 static inline void set_task_comm(struct task_struct *tsk, const char *from)
 {
-- 
cgit v1.2.3


From 9049863a32fad5d7158318e45f0a41a0f2192c0c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 5 Feb 2017 14:31:22 +0100
Subject: sched/headers: Move kstack_end() from <linux/sched.h> to
 <linux/sched/task_stack.h>

This is a task-stack related API, not core scheduler functionality.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h            | 10 ----------
 include/linux/sched/task_stack.h | 10 ++++++++++
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5f2267e41fde..309eb76b7eab 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1360,16 +1360,6 @@ static inline struct thread_info *task_thread_info(struct task_struct *task)
 # define task_thread_info(task)	((struct thread_info *)(task)->stack)
 #endif
 
-#ifndef __HAVE_ARCH_KSTACK_END
-static inline int kstack_end(void *addr)
-{
-	/* Reliable end of stack detection:
-	 * Some APM bios versions misalign the stack
-	 */
-	return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*)));
-}
-#endif
-
 extern struct pid_namespace init_pid_ns;
 
 /*
diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
index aaa5c2a6a0e9..df6ea6665b31 100644
--- a/include/linux/sched/task_stack.h
+++ b/include/linux/sched/task_stack.h
@@ -108,4 +108,14 @@ static inline unsigned long stack_not_used(struct task_struct *p)
 #endif
 extern void set_task_stack_end_magic(struct task_struct *tsk);
 
+#ifndef __HAVE_ARCH_KSTACK_END
+static inline int kstack_end(void *addr)
+{
+	/* Reliable end of stack detection:
+	 * Some APM bios versions misalign the stack
+	 */
+	return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*)));
+}
+#endif
+
 #endif /* _LINUX_SCHED_TASK_STACK_H */
-- 
cgit v1.2.3


From 42011db0ed5a9c92b1281e1300eb3d026f3764a8 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 5 Feb 2017 14:35:41 +0100
Subject: sched/headers: Move exit_files() and exit_itimers() from
 <linux/sched.h> to <linux/sched/task.h>

These two functions are task management related, not core scheduler APIs.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h      | 4 ----
 include/linux/sched/task.h | 3 +++
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 309eb76b7eab..4a03c49e3bcc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1386,10 +1386,6 @@ extern void wake_up_new_task(struct task_struct *tsk);
  static inline void kick_process(struct task_struct *tsk) { }
 #endif
 
-extern void exit_files(struct task_struct *);
-
-extern void exit_itimers(struct signal_struct *);
-
 extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec);
 static inline void set_task_comm(struct task_struct *tsk, const char *from)
 {
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 2be9fde588a7..a33a8121da9a 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -67,6 +67,9 @@ static inline void exit_thread(struct task_struct *tsk)
 #endif
 extern void do_group_exit(int);
 
+extern void exit_files(struct task_struct *);
+extern void exit_itimers(struct signal_struct *);
+
 extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long);
 extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
 struct task_struct *fork_idle(int);
-- 
cgit v1.2.3


From c5a21921d55a2aee9d1e031a78cef6cda3eab78b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 5 Feb 2017 14:47:12 +0100
Subject: sched/headers: Move _init() prototypes from <linux/sched.h> to
 <linux/sched/init.h>

trap_init() and cpu_init() belong into <linux/cpu.h>, sched_init*() into <linux/sched/init.h>.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h      | 5 -----
 include/linux/sched/init.h | 7 +++++++
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4a03c49e3bcc..0fb56ae0abc7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -176,15 +176,10 @@ struct uts_namespace;
 /* Task command name length */
 #define TASK_COMM_LEN 16
 
-extern void sched_init(void);
-extern void sched_init_smp(void);
-
 extern cpumask_var_t cpu_isolated_map;
 
 extern int runqueue_is_locked(int cpu);
 
-extern void cpu_init (void);
-extern void trap_init(void);
 extern void update_process_times(int user);
 extern void scheduler_tick(void);
 
diff --git a/include/linux/sched/init.h b/include/linux/sched/init.h
index 15e46313e55e..f31d16075857 100644
--- a/include/linux/sched/init.h
+++ b/include/linux/sched/init.h
@@ -3,4 +3,11 @@
 
 #include <linux/sched.h>
 
+/*
+ * Scheduler init related prototypes:
+ */
+
+extern void sched_init(void);
+extern void sched_init_smp(void);
+
 #endif /* _LINUX_SCHED_INIT_H */
-- 
cgit v1.2.3


From 93b5a9a7051e51ce50109046af0235268a261ba0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 5 Feb 2017 14:53:36 +0100
Subject: sched/headers, timekeeping: Move the timer tick function prototypes
 to <linux/timekeeping.h>

Move the update_process_times() and xtime_update() prototypes to <linux/timekeeping.h>.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h       | 1 -
 include/linux/time.h        | 2 --
 include/linux/timekeeping.h | 4 ++++
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0fb56ae0abc7..dc8d94dc140f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -180,7 +180,6 @@ extern cpumask_var_t cpu_isolated_map;
 
 extern int runqueue_is_locked(int cpu);
 
-extern void update_process_times(int user);
 extern void scheduler_tick(void);
 
 #define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
diff --git a/include/linux/time.h b/include/linux/time.h
index 4f4ab65b6e61..23f0f5ce3090 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -167,8 +167,6 @@ static inline bool timespec_inject_offset_valid(const struct timespec *ts)
 extern u32 (*arch_gettimeoffset)(void);
 #endif
 
-extern void xtime_update(unsigned long ticks);
-
 struct itimerval;
 extern int do_setitimer(int which, struct itimerval *value,
 			struct itimerval *ovalue);
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index d2e804e15c3e..b598cbc7b576 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -8,6 +8,10 @@
 void timekeeping_init(void);
 extern int timekeeping_suspended;
 
+/* Architecture timer tick functions: */
+extern void update_process_times(int user);
+extern void xtime_update(unsigned long ticks);
+
 /*
  * Get and set timeofday
  */
-- 
cgit v1.2.3


From dcc2dc45f7cf267d37843059ff75b70260596c69 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 5 Feb 2017 15:05:49 +0100
Subject: sched/headers, mm: Move 'struct tlbflush_unmap_batch' from
 <linux/sched.h> to <linux/mm_types_task.h>

Unclutter <linux/sched.h> some more.

Also move the CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH condition inside the
structure body definition, to remove a pair of #ifdefs from sched.h.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/mm_types_task.h | 22 ++++++++++++++++++++++
 include/linux/sched.h         | 21 ---------------------
 2 files changed, 22 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index 9526d8b9fe0e..136dfdf63ba1 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -10,6 +10,7 @@
 #include <linux/types.h>
 #include <linux/threads.h>
 #include <linux/atomic.h>
+#include <linux/cpumask.h>
 
 #include <asm/page.h>
 
@@ -62,4 +63,25 @@ struct page_frag {
 #endif
 };
 
+/* Track pages that require TLB flushes */
+struct tlbflush_unmap_batch {
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+	/*
+	 * Each bit set is a CPU that potentially has a TLB entry for one of
+	 * the PFNs being flushed. See set_tlb_ubc_flush_pending().
+	 */
+	struct cpumask cpumask;
+
+	/* True if any bit in cpumask is set */
+	bool flush_required;
+
+	/*
+	 * If true then the PTE was dirty when unmapped. The entry must be
+	 * flushed before IO is initiated or a stale TLB entry potentially
+	 * allows an update without redirtying the page.
+	 */
+	bool writable;
+#endif
+};
+
 #endif /* _LINUX_MM_TYPES_TASK_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dc8d94dc140f..8d6b7aa7f3ac 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -484,25 +484,6 @@ struct wake_q_node {
 	struct wake_q_node *next;
 };
 
-/* Track pages that require TLB flushes */
-struct tlbflush_unmap_batch {
-	/*
-	 * Each bit set is a CPU that potentially has a TLB entry for one of
-	 * the PFNs being flushed. See set_tlb_ubc_flush_pending().
-	 */
-	struct cpumask cpumask;
-
-	/* True if any bit in cpumask is set */
-	bool flush_required;
-
-	/*
-	 * If true then the PTE was dirty when unmapped. The entry must be
-	 * flushed before IO is initiated or a stale TLB entry potentially
-	 * allows an update without redirtying the page.
-	 */
-	bool writable;
-};
-
 struct task_struct {
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 	/*
@@ -895,9 +876,7 @@ struct task_struct {
 	unsigned long numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
 
-#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 	struct tlbflush_unmap_batch tlb_ubc;
-#endif
 
 	struct rcu_head rcu;
 
-- 
cgit v1.2.3


From cda66725c1444db67127115d611c982b62b45d8c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 5 Feb 2017 15:30:50 +0100
Subject: sched/headers: Move the get_task_struct()/put_task_struct() and
 related APIs from <linux/sched.h> to <linux/sched/task.h>

These belong into the task lifetime API header.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h      | 14 --------------
 include/linux/sched/task.h | 15 +++++++++++++++
 2 files changed, 15 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8d6b7aa7f3ac..b68358c60560 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1135,20 +1135,6 @@ static inline int is_global_init(struct task_struct *tsk)
 
 extern struct pid *cad_pid;
 
-extern void free_task(struct task_struct *tsk);
-#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
-
-extern void __put_task_struct(struct task_struct *t);
-
-static inline void put_task_struct(struct task_struct *t)
-{
-	if (atomic_dec_and_test(&t->usage))
-		__put_task_struct(t);
-}
-
-struct task_struct *task_rcu_dereference(struct task_struct **ptask);
-struct task_struct *try_get_task_struct(struct task_struct **ptask);
-
 /*
  * Per process flags
  */
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index a33a8121da9a..3886ae64148f 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -75,6 +75,21 @@ extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, i
 struct task_struct *fork_idle(int);
 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
 
+extern void free_task(struct task_struct *tsk);
+
+#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
+
+extern void __put_task_struct(struct task_struct *t);
+
+static inline void put_task_struct(struct task_struct *t)
+{
+	if (atomic_dec_and_test(&t->usage))
+		__put_task_struct(t);
+}
+
+struct task_struct *task_rcu_dereference(struct task_struct **ptask);
+struct task_struct *try_get_task_struct(struct task_struct **ptask);
+
 
 #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
 extern int arch_task_struct_size __read_mostly;
-- 
cgit v1.2.3


From 6f175fc9536355d8aa5c2d4854848a97c244a031 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 6 Feb 2017 11:12:45 +0100
Subject: sched/headers: Move the sched_exec() prototype to
 <linux/sched/task.h>

sched_exec() better fits into the task lifetime APIs than into the core scheduler
APIs.

This reduces the size of <linux/sched.h> a bit.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h      | 7 -------
 include/linux/sched/task.h | 7 +++++++
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b68358c60560..bd89fc17767c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1258,13 +1258,6 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
 #define cpu_relax_yield() cpu_relax()
 #endif
 
-/* sched_exec is called by processes performing an exec */
-#ifdef CONFIG_SMP
-extern void sched_exec(void);
-#else
-#define sched_exec()   {}
-#endif
-
 extern int yield_to(struct task_struct *p, bool preempt);
 extern void set_user_nice(struct task_struct *p, long nice);
 extern int task_prio(const struct task_struct *p);
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 3886ae64148f..a978d7189cfd 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -77,6 +77,13 @@ extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
 
 extern void free_task(struct task_struct *tsk);
 
+/* sched_exec is called by processes performing an exec */
+#ifdef CONFIG_SMP
+extern void sched_exec(void);
+#else
+#define sched_exec()   {}
+#endif
+
 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
 
 extern void __put_task_struct(struct task_struct *t);
-- 
cgit v1.2.3


From c03cb28e7c44055cd00d11b8581b9fa46a3e3764 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 1 Feb 2017 16:36:40 +0100
Subject: sched/headers: Remove <linux/sched.h> from <linux/sched/topology.h>

The <linux/sched/topology.h> file is a self-contained header and users of
it either don't need <linux/sched.h> - or have already included it.

This reduces the size of the header dependency graph.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/topology.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index b3550b36e65d..0d6fceff37bb 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -1,8 +1,6 @@
 #ifndef _LINUX_SCHED_TOPOLOGY_H
 #define _LINUX_SCHED_TOPOLOGY_H
 
-#include <linux/sched.h>
-
 #include <linux/sched/idle.h>
 
 /*
-- 
cgit v1.2.3


From f411229e1dd623a3dee623824d094546789977e0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 1 Feb 2017 18:49:05 +0100
Subject: sched/headers: Remove tsk_is_polling()

It's not used by anything.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/idle.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h
index 66ee32f87f0b..5ca63ebad6b4 100644
--- a/include/linux/sched/idle.h
+++ b/include/linux/sched/idle.h
@@ -17,10 +17,6 @@ extern void wake_up_if_idle(int cpu);
  * polling state.
  */
 #ifdef TIF_POLLING_NRFLAG
-static inline int tsk_is_polling(struct task_struct *p)
-{
-	return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
-}
 
 static inline void __current_set_polling(void)
 {
@@ -59,7 +55,6 @@ static inline bool __must_check current_clr_polling_and_test(void)
 }
 
 #else
-static inline int tsk_is_polling(struct task_struct *p) { return 0; }
 static inline void __current_set_polling(void) { }
 static inline void __current_clr_polling(void) { }
 
-- 
cgit v1.2.3


From ea94763950e49d1aa622c59218ee2fc3b434ba6b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 1 Feb 2017 16:36:40 +0100
Subject: sched/headers: Remove <linux/sched.h> from <linux/sched/clock.h>

The <linux/sched/clock.h> file is a largely self-contained header and users of
it either don't need <linux/sched.h> - or have already included it.

This reduces the size of the header dependency graph.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/clock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h
index ac12f71d359c..4a68c6791207 100644
--- a/include/linux/sched/clock.h
+++ b/include/linux/sched/clock.h
@@ -1,7 +1,7 @@
 #ifndef _LINUX_SCHED_CLOCK_H
 #define _LINUX_SCHED_CLOCK_H
 
-#include <linux/sched.h>
+#include <linux/smp.h>
 
 /*
  * Do not use outside of architecture code which knows its limitations.
-- 
cgit v1.2.3


From cc689c5b352d4a7510b11c975c5c94d4785b37e7 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 1 Feb 2017 16:36:40 +0100
Subject: sched/headers: Remove <linux/sched.h> and <linux/slab.h> from
 <linux/delayacct.h>

The <linux/delayacct.h> file is a self-contained header and users of
it either don't need <linux/sched.h> and <linux/slab.h> - or have
already included it.

This reduces the size of the header dependency graph.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/delayacct.h    | 2 --
 include/linux/fault-inject.h | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 6b769cbd1000..4178d2493547 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -18,8 +18,6 @@
 #define _LINUX_DELAYACCT_H
 
 #include <uapi/linux/taskstats.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
 
 /*
  * Per-task flags relevant to delay accounting
diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h
index 9f4956d8601c..728d4e0292aa 100644
--- a/include/linux/fault-inject.h
+++ b/include/linux/fault-inject.h
@@ -61,6 +61,8 @@ static inline struct dentry *fault_create_debugfs_attr(const char *name,
 
 #endif /* CONFIG_FAULT_INJECTION */
 
+struct kmem_cache;
+
 #ifdef CONFIG_FAILSLAB
 extern bool should_failslab(struct kmem_cache *s, gfp_t gfpflags);
 #else
-- 
cgit v1.2.3


From 5a2d6880f461faa416c0d329d46a128cf342c1eb Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 18:51:54 +0100
Subject: sched/headers: Remove <linux/sched.h> from <linux/sched/loadavg.h>

The <linux/sched/loadavg.h> file is a self-contained header and users of
it either don't need <linux/sched.h> - or have already included it.

This reduces the size of the header dependency graph.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/loadavg.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h
index c392e28ce0ac..4264bc6b2c27 100644
--- a/include/linux/sched/loadavg.h
+++ b/include/linux/sched/loadavg.h
@@ -1,8 +1,6 @@
 #ifndef _LINUX_SCHED_LOADAVG_H
 #define _LINUX_SCHED_LOADAVG_H
 
-#include <linux/sched.h>
-
 /*
  * These are the constant used to fake the fixed-point load-average
  * counting. Some notes:
-- 
cgit v1.2.3


From 5fd73157bdfb57370b69be7c0067f08e610e7daa Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 18:51:54 +0100
Subject: sched/headers: Remove <linux/sched.h> from <linux/sched/autogroup.h>

The <linux/sched/autogroup.h> file is a largely self-contained header and users of
it either don't need <linux/sched.h> - or have already included it.

Add a 'task_struct' predeclaration to make it build standalone.

This reduces the size of the header dependency graph.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/autogroup.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/autogroup.h b/include/linux/sched/autogroup.h
index fd6855548d0c..55cd496df884 100644
--- a/include/linux/sched/autogroup.h
+++ b/include/linux/sched/autogroup.h
@@ -1,9 +1,8 @@
 #ifndef _LINUX_SCHED_AUTOGROUP_H
 #define _LINUX_SCHED_AUTOGROUP_H
 
-#include <linux/sched.h>
-
 struct signal_struct;
+struct task_struct;
 struct task_group;
 struct seq_file;
 
-- 
cgit v1.2.3


From b8d6d80b37a98e3910f1b4e799f8ebea88e94bfa Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 18:51:54 +0100
Subject: sched/headers: Remove <linux/sched.h> from <linux/sched/mm.h>

The <linux/sched/mm.h> file is a self-contained header and users of
it either don't need <linux/sched.h> - or have already included it.

Include kernel.h and atomic.h.

This reduces the size of the header dependency graph.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/mm.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 62dca4e0b4e0..830953ebb391 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -1,6 +1,8 @@
 #ifndef _LINUX_SCHED_MM_H
 #define _LINUX_SCHED_MM_H
 
+#include <linux/kernel.h>
+#include <linux/atomic.h>
 #include <linux/sched.h>
 #include <linux/mm_types.h>
 #include <linux/gfp.h>
-- 
cgit v1.2.3


From ae1cc8823204bb938d039afa43c28a84591ada9f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 18:51:54 +0100
Subject: sched/headers: Remove <linux/sched.h> from <linux/sched/coredump.h>

The <linux/sched/coredump.h> file is a self-contained header and users of
it either don't need <linux/sched.h> - or have already included it.

Include <linux/mm_types.h>.

This reduces the size of the header dependency graph.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/coredump.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index f46912aa4f4c..69eedcef8f03 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -1,7 +1,6 @@
 #ifndef _LINUX_SCHED_COREDUMP_H
 #define _LINUX_SCHED_COREDUMP_H
 
-#include <linux/sched.h>
 #include <linux/mm_types.h>
 
 #define SUID_DUMP_DISABLE	0	/* No setuid dumping */
-- 
cgit v1.2.3


From 556725839e543bc2b45bd73121c0b1c1d6c23714 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 2 Feb 2017 10:31:24 +0100
Subject: sched/headers: Remove unused 'task_can_switch_user()' prototype

The function does not exist anymore.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index bd89fc17767c..43ed34ccb53a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1509,9 +1509,6 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
 extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 
-extern int task_can_switch_user(struct user_struct *up,
-					struct task_struct *tsk);
-
 #ifndef TASK_SIZE_OF
 #define TASK_SIZE_OF(tsk)	TASK_SIZE
 #endif
-- 
cgit v1.2.3


From de8deb0ad79f8a49cea5110c006c8676a11611f7 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 18:51:55 +0100
Subject: sched/headers: Remove <linux/sched.h> from <linux/sched/user.h>

If we add <linux/uidgid.h> then <linux/sched/user.h> becomes a
self-contained header and users of it either don't need <linux/sched.h>
or have already included it.

This reduces the size of the header dependency graph.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/user.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index 40c6363da5ef..5d5415e129d4 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -1,7 +1,8 @@
 #ifndef _LINUX_SCHED_USER_H
 #define _LINUX_SCHED_USER_H
 
-#include <linux/sched.h>
+#include <linux/uidgid.h>
+#include <linux/atomic.h>
 
 struct key;
 
-- 
cgit v1.2.3


From ac4e620967bb72f86371154935aa8b67cf54e225 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 2 Feb 2017 11:44:31 +0100
Subject: sched/headers: Remove #include <linux/capability.h> from
 <linux/sched.h>

The <linux/sched.h> header does not actually make use of any
types or APIs defined in <linux/capability.h>, so remove its inclusion.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 43ed34ccb53a..54eefb2ad603 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -5,7 +5,6 @@
 
 #include <linux/sched/prio.h>
 
-#include <linux/capability.h>
 #include <linux/mutex.h>
 #include <linux/plist.h>
 #include <linux/mm_types_task.h>
-- 
cgit v1.2.3


From aa829c7679a13fca667f772db1f0ea03adc29122 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 2 Feb 2017 11:54:05 +0100
Subject: sched/headers: Remove <linux/cgroup-defs.h> from <linux/sched.h>

It's not used by anything in <linux/sched.h> anymore.

This reduces the preprocessed size of <linux/sched.h> and
speeds up the build a bit.

Also fix code that implicitly relied on headers included by <linux/cgroup-defs.h>.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h             | 1 -
 include/target/target_core_base.h | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 54eefb2ad603..8991e4d7cd0f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -28,7 +28,6 @@
 #include <linux/gfp.h>
 #include <linux/topology.h>
 #include <linux/magic.h>
-#include <linux/cgroup-defs.h>
 
 #include <asm/current.h>
 
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index 8be9ba73383d..774c29b57e82 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -4,6 +4,7 @@
 #include <linux/configfs.h>      /* struct config_group */
 #include <linux/dma-direction.h> /* enum dma_data_direction */
 #include <linux/percpu_ida.h>    /* struct percpu_ida */
+#include <linux/percpu-refcount.h>
 #include <linux/semaphore.h>     /* struct semaphore */
 #include <linux/completion.h>
 
-- 
cgit v1.2.3


From ed53742d793bd92ed32114081370b199dc94467d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 18:51:55 +0100
Subject: sched/headers: Remove <linux/sched.h> from <linux/sched/cpufreq.h>

Make the <linux/sched/cpufreq.h> file a self-contained header and
remove the <linux/sched.h> dependency: users of it either don't
need <linux/sched.h> - or have already included it.

This reduces the size of the header dependency graph.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/cpufreq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h
index 2bdcfbfc4c30..d2be2ccbb372 100644
--- a/include/linux/sched/cpufreq.h
+++ b/include/linux/sched/cpufreq.h
@@ -1,7 +1,7 @@
 #ifndef _LINUX_SCHED_CPUFREQ_H
 #define _LINUX_SCHED_CPUFREQ_H
 
-#include <linux/sched.h>
+#include <linux/types.h>
 
 /*
  * Interface between cpufreq drivers and the scheduler:
-- 
cgit v1.2.3


From 71af2ed5eeea639339e3a1497a0196bab7de4b57 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 09:57:00 +0100
Subject: kasan, sched/headers: Remove <linux/sched.h> from <linux/kasan.h>

<linux/kasan.h> is a low level header that is included early
in affected kernel headers. But it includes <linux/sched.h>
which complicates the cleanup of sched.h dependencies.

Remove it.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/kasan.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 7793036eac80..ceb3fe78a0d3 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -1,7 +1,6 @@
 #ifndef _LINUX_KASAN_H
 #define _LINUX_KASAN_H
 
-#include <linux/sched.h>
 #include <linux/types.h>
 
 struct kmem_cache;
-- 
cgit v1.2.3


From e26512fea5bcd6602dbf02a551ed073cd4529449 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 2 Feb 2017 17:54:15 +0100
Subject: sched/headers: Remove <linux/cred.h> inclusion from <linux/sched.h>

This reduces header dependencies and speeds up the build.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8991e4d7cd0f..1be69735cef3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -24,7 +24,6 @@
 #include <linux/kcov.h>
 #include <linux/task_io_accounting.h>
 #include <linux/latencytop.h>
-#include <linux/cred.h>
 #include <linux/gfp.h>
 #include <linux/topology.h>
 #include <linux/magic.h>
-- 
cgit v1.2.3


From f780d89a0e820a529cf91fb78b52565e1b37b774 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 10:03:42 +0100
Subject: sched/headers: Remove <asm/ptrace.h> from <linux/sched.h>

This reduces header dependencies.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/sh/include/asm/fpu.h | 2 ++
 arch/sh/mm/extable_32.c   | 2 ++
 include/linux/sched.h     | 1 -
 3 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/sh/include/asm/fpu.h b/arch/sh/include/asm/fpu.h
index 09fc2bc8a790..50921c7cc3f0 100644
--- a/arch/sh/include/asm/fpu.h
+++ b/arch/sh/include/asm/fpu.h
@@ -3,6 +3,8 @@
 
 #ifndef __ASSEMBLY__
 
+#include <asm/ptrace.h>
+
 struct task_struct;
 
 #ifdef CONFIG_SH_FPU
diff --git a/arch/sh/mm/extable_32.c b/arch/sh/mm/extable_32.c
index 24a75d315dcb..940e871bc816 100644
--- a/arch/sh/mm/extable_32.c
+++ b/arch/sh/mm/extable_32.c
@@ -7,6 +7,8 @@
 #include <linux/extable.h>
 #include <linux/uaccess.h>
 
+#include <asm/ptrace.h>
+
 int fixup_exception(struct pt_regs *regs)
 {
 	const struct exception_table_entry *fixup;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1be69735cef3..9dfa9c113570 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -8,7 +8,6 @@
 #include <linux/mutex.h>
 #include <linux/plist.h>
 #include <linux/mm_types_task.h>
-#include <asm/ptrace.h>
 
 #include <linux/sem.h>
 #include <linux/shm.h>
-- 
cgit v1.2.3


From 9c6da18109d603a99915b257929c0370c9d3ae40 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 10:08:30 +0100
Subject: sched/headers: Remove <linux/rtmutex.h> from <linux/sched.h>

This reduces header dependencies.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9dfa9c113570..cc5c99a27e21 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -16,7 +16,6 @@
 #include <linux/pid.h>
 #include <linux/seccomp.h>
 #include <linux/rculist.h>
-#include <linux/rtmutex.h>
 
 #include <linux/resource.h>
 #include <linux/hrtimer.h>
-- 
cgit v1.2.3


From f0a0eb699967f4f51567b563818bafe4017bef73 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 2 Feb 2017 20:56:33 +0100
Subject: sched/headers: Remove the <linux/gfp.h> include from <linux/sched.h>

This reduces sched.h header dependencies.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index cc5c99a27e21..ab80596dddfd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -22,7 +22,6 @@
 #include <linux/kcov.h>
 #include <linux/task_io_accounting.h>
 #include <linux/latencytop.h>
-#include <linux/gfp.h>
 #include <linux/topology.h>
 #include <linux/magic.h>
 
-- 
cgit v1.2.3


From dc1995392d79b0eeee147a792482b991f74c1494 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 18:51:57 +0100
Subject: sched/headers: Remove <linux/sched.h> from <linux/sched/stat.h>

The <linux/sched/stat.h> file is a largely self-contained header and users of
it either don't need <linux/sched.h> - or have already included it.

( Keep the <linux/percpu.h> dependency.)

This reduces the size of the header dependency graph.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/stat.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h
index d8cedf27083e..141b74c53fad 100644
--- a/include/linux/sched/stat.h
+++ b/include/linux/sched/stat.h
@@ -1,7 +1,7 @@
 #ifndef _LINUX_SCHED_STAT_H
 #define _LINUX_SCHED_STAT_H
 
-#include <linux/sched.h>
+#include <linux/percpu.h>
 
 /*
  * Various counters maintained by the scheduler and fork(),
-- 
cgit v1.2.3


From b0145d9b332a5117bd0abbb980ab89586a1d5f6c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 18:51:57 +0100
Subject: sched/headers: Remove <linux/sched.h> from <linux/sched/nohz.h>

The <linux/sched/nohz.h> file is a self-contained header and users of
it either don't need <linux/sched.h> - or have already included it.

This reduces the size of the header dependency graph.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/nohz.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h
index 9471f0736a3a..4995b717500b 100644
--- a/include/linux/sched/nohz.h
+++ b/include/linux/sched/nohz.h
@@ -1,8 +1,6 @@
 #ifndef _LINUX_SCHED_NOHZ_H
 #define _LINUX_SCHED_NOHZ_H
 
-#include <linux/sched.h>
-
 /*
  * This is the interface between the scheduler and nohz/dyntics:
  */
-- 
cgit v1.2.3


From 4f079e98a0db5f067c0981a526ff8938e21c52e2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 18:51:57 +0100
Subject: sched/headers: Remove <linux/sched.h> from <linux/sched/debug.h>

The <linux/sched/debug.h> file is a self-contained header and users of
it either don't need <linux/sched.h> - or have already included it.

This reduces the size of the header dependency graph.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/debug.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h
index 853bbef0b47b..e0eaee54c5a4 100644
--- a/include/linux/sched/debug.h
+++ b/include/linux/sched/debug.h
@@ -1,8 +1,6 @@
 #ifndef _LINUX_SCHED_DEBUG_H
 #define _LINUX_SCHED_DEBUG_H
 
-#include <linux/sched.h>
-
 /*
  * Various scheduler/task debugging interfaces:
  */
-- 
cgit v1.2.3


From a906086ef35129d522047a296e7c4237af75fdcb Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 18:51:57 +0100
Subject: sched/headers: Remove <linux/sched.h> from <linux/sched/hotplug.h>

The <linux/sched/hotplug.h> file is a self-contained header and users of
it either don't need <linux/sched.h> - or have already included it.

This reduces the size of the header dependency graph.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/hotplug.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/hotplug.h b/include/linux/sched/hotplug.h
index c608d3c1ddb8..752ac7e628d7 100644
--- a/include/linux/sched/hotplug.h
+++ b/include/linux/sched/hotplug.h
@@ -1,8 +1,6 @@
 #ifndef _LINUX_SCHED_HOTPLUG_H
 #define _LINUX_SCHED_HOTPLUG_H
 
-#include <linux/sched.h>
-
 /*
  * Scheduler interfaces for hotplug CPU support:
  */
-- 
cgit v1.2.3


From fae3c30c2d1ae3ff93877f64e5d55e2443d8f82f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 12:00:08 +0100
Subject: sched/headers: Remove the runqueue_is_locked() prototype

Remove the runqueue_is_locked() prototype, the function does not exist anymore.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ab80596dddfd..ac98255d00fb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -172,8 +172,6 @@ struct uts_namespace;
 
 extern cpumask_var_t cpu_isolated_map;
 
-extern int runqueue_is_locked(int cpu);
-
 extern void scheduler_tick(void);
 
 #define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
-- 
cgit v1.2.3


From cd9c513be34ceaae8bf211474b91b6897574efdd Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 18:51:58 +0100
Subject: sched/headers: Remove <linux/rwsem.h> from <linux/sched.h>

This is a stray header that is not needed by anything in sched.h,
so remove it.

Update files that relied on the stray inclusion.

This reduces the size of the header dependency graph.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h          | 2 --
 include/linux/user_namespace.h | 1 +
 kernel/utsname_sysctl.c        | 1 +
 3 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ac98255d00fb..b361f881fe44 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -226,8 +226,6 @@ struct task_cputime {
 #define prof_exp	stime
 #define sched_exp	sum_exec_runtime
 
-#include <linux/rwsem.h>
-
 #ifdef CONFIG_SCHED_INFO
 struct sched_info {
 	/* cumulative counters */
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 08264641b502..faa9bfb827da 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -5,6 +5,7 @@
 #include <linux/nsproxy.h>
 #include <linux/ns_common.h>
 #include <linux/sched.h>
+#include <linux/rwsem.h>
 #include <linux/sysctl.h>
 #include <linux/err.h>
 
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index c8eac43267e9..233cd8fc6910 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -14,6 +14,7 @@
 #include <linux/utsname.h>
 #include <linux/sysctl.h>
 #include <linux/wait.h>
+#include <linux/rwsem.h>
 
 #ifdef CONFIG_PROC_SYSCTL
 
-- 
cgit v1.2.3


From 192c9414516e7178a44f9ed48d47501c4c19771c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Feb 2017 23:47:37 +0100
Subject: sched/headers: Remove <linux/signal.h> from <linux/sched.h>

Instead of including the full <linux/signal.h>, only include the types-only
<linux/signal_types.h> header in <linux/sched.h>, to further decouple the
scheduler header from the signal headers.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b361f881fe44..905d4f148d65 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -11,7 +11,6 @@
 
 #include <linux/sem.h>
 #include <linux/shm.h>
-#include <linux/signal.h>
 #include <linux/signal_types.h>
 #include <linux/pid.h>
 #include <linux/seccomp.h>
-- 
cgit v1.2.3


From 1d1954e038435765a623884b626731784cde768f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 4 Feb 2017 01:16:15 +0100
Subject: sched/headers: Remove the 'init_pid_ns' prototype from
 <linux/sched.h>

pid.h already defines it.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 905d4f148d65..057b2a85ecc7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1301,8 +1301,6 @@ static inline struct thread_info *task_thread_info(struct task_struct *task)
 # define task_thread_info(task)	((struct thread_info *)(task)->stack)
 #endif
 
-extern struct pid_namespace init_pid_ns;
-
 /*
  * find a task by one of its numerical ids
  *
-- 
cgit v1.2.3


From b68070e146b9c2b4ece8d869a4fab9a4f14bbfb4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 4 Feb 2017 01:27:20 +0100
Subject: sched/headers: Remove <linux/rculist.h> from <linux/sched.h>

We don't actually need the full rculist.h header anymore, include
the smaller rcupdate.h header instead.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 057b2a85ecc7..5720d11f3224 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -14,7 +14,7 @@
 #include <linux/signal_types.h>
 #include <linux/pid.h>
 #include <linux/seccomp.h>
-#include <linux/rculist.h>
+#include <linux/rcupdate.h>
 
 #include <linux/resource.h>
 #include <linux/hrtimer.h>
-- 
cgit v1.2.3


From 2c873d55cd838deef8218b6d5fe9bd336cb3113a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 5 Feb 2017 12:11:03 +0100
Subject: sched/core: Remove unused prefetch_stack()

prefetch_stack() is defined by IA64, but not actually used anywhere anymore.

Remove it.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/ia64/include/asm/processor.h |  2 --
 arch/ia64/kernel/entry.S          | 23 -----------------------
 include/linux/sched.h             |  6 ------
 3 files changed, 31 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h
index 03911a336406..26a63d69c599 100644
--- a/arch/ia64/include/asm/processor.h
+++ b/arch/ia64/include/asm/processor.h
@@ -19,8 +19,6 @@
 #include <asm/ptrace.h>
 #include <asm/ustack.h>
 
-#define ARCH_HAS_PREFETCH_SWITCH_STACK
-
 #define IA64_NUM_PHYS_STACK_REG	96
 #define IA64_NUM_DBG_REGS	8
 
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index 6f27a663177c..e7a716b09350 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -455,29 +455,6 @@ GLOBAL_ENTRY(load_switch_stack)
 	br.cond.sptk.many b7
 END(load_switch_stack)
 
-GLOBAL_ENTRY(prefetch_stack)
-	add r14 = -IA64_SWITCH_STACK_SIZE, sp
-	add r15 = IA64_TASK_THREAD_KSP_OFFSET, in0
-	;;
-	ld8 r16 = [r15]				// load next's stack pointer
-	lfetch.fault.excl [r14], 128
-	;;
-	lfetch.fault.excl [r14], 128
-	lfetch.fault [r16], 128
-	;;
-	lfetch.fault.excl [r14], 128
-	lfetch.fault [r16], 128
-	;;
-	lfetch.fault.excl [r14], 128
-	lfetch.fault [r16], 128
-	;;
-	lfetch.fault.excl [r14], 128
-	lfetch.fault [r16], 128
-	;;
-	lfetch.fault [r16], 128
-	br.ret.sptk.many rp
-END(prefetch_stack)
-
 	/*
 	 * Invoke a system call, but do some tracing before and after the call.
 	 * We MUST preserve the current register frame throughout this routine
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5720d11f3224..bd0111a06aa2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -247,12 +247,6 @@ struct sched_info {
 # define SCHED_FIXEDPOINT_SHIFT	10
 # define SCHED_FIXEDPOINT_SCALE	(1L << SCHED_FIXEDPOINT_SHIFT)
 
-#ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
-extern void prefetch_stack(struct task_struct *t);
-#else
-static inline void prefetch_stack(struct task_struct *t) { }
-#endif
-
 struct load_weight {
 	unsigned long weight;
 	u32 inv_weight;
-- 
cgit v1.2.3


From 5c0d0f36414f9f8a292b42e797f9284b127d79c2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 8 Feb 2017 18:51:59 +0100
Subject: sched/headers: Remove <linux/sched.h> from <linux/sched/init.h>

The <linux/sched/init.h> file is a self-contained header and users of
it either don't need <linux/sched.h> - or have already included it.

This reduces the size of the header dependency graph.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/init.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/init.h b/include/linux/sched/init.h
index f31d16075857..127215045285 100644
--- a/include/linux/sched/init.h
+++ b/include/linux/sched/init.h
@@ -1,8 +1,6 @@
 #ifndef _LINUX_SCHED_INIT_H
 #define _LINUX_SCHED_INIT_H
 
-#include <linux/sched.h>
-
 /*
  * Scheduler init related prototypes:
  */
-- 
cgit v1.2.3


From 50ff9d130014f7b19541dbf607a615a72b75b778 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 5 Feb 2017 16:03:58 +0100
Subject: sched/headers: Remove <linux/magic.h> from <linux/sched/task_stack.h>

It's not used by any of the scheduler methods, but <linux/sched/task_stack.h>
needs it to pick up STACK_END_MAGIC.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h     | 1 -
 kernel/cgroup/cgroup-v1.c | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index bd0111a06aa2..559be4f1aee5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -22,7 +22,6 @@
 #include <linux/task_io_accounting.h>
 #include <linux/latencytop.h>
 #include <linux/topology.h>
-#include <linux/magic.h>
 
 #include <asm/current.h>
 
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index abc585858685..56eba9caa632 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -7,6 +7,7 @@
 #include <linux/mm.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/task.h>
+#include <linux/magic.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/delayacct.h>
-- 
cgit v1.2.3


From b2d5bfea2d00a0000da18f7667c2b0e2c2f168d9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 6 Feb 2017 09:56:40 +0100
Subject: sched/headers, timers: Remove the <linux/sysctl.h> include from
 <linux/timer.h>

So we want to simplify <linux/sched.h>'s header dependencies, but one
roadblock of that is <linux/timer.h>'s inclusion of sysctl.h,
which brings in other, problematic headers.

Note that timer.h's inclusion of sysctl.h can be avoided if we
pre-declare ctl_table - so do that.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/timer.h          | 2 +-
 include/linux/user_namespace.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index c7bdf895179c..e6789b8757d5 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -212,7 +212,7 @@ struct hrtimer;
 extern enum hrtimer_restart it_real_fn(struct hrtimer *);
 
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
-#include <linux/sysctl.h>
+struct ctl_table;
 
 extern unsigned int sysctl_timer_migration;
 int timer_migration_handler(struct ctl_table *table, int write,
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index faa9bfb827da..be765234c0a2 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -5,6 +5,7 @@
 #include <linux/nsproxy.h>
 #include <linux/ns_common.h>
 #include <linux/sched.h>
+#include <linux/workqueue.h>
 #include <linux/rwsem.h>
 #include <linux/sysctl.h>
 #include <linux/err.h>
-- 
cgit v1.2.3


From 283cb90305cf1686594ed537c7a8cb188eba1a4d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 6 Feb 2017 09:59:47 +0100
Subject: sched/headers, hrtimer: Remove the <linux/wait.h> include from
 <linux/hrtimer.h>

In our quest to simplify <linux/sched.h>'s header dependencies, remove
the <linux/wait.h> inclusion from <linux/hrtimer.h> - which does
not appear to be necessary, as hrtimer.h does not use waitqueues.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/hrtimer.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index e52b427223ba..249e579ecd4c 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -19,7 +19,6 @@
 #include <linux/ktime.h>
 #include <linux/init.h>
 #include <linux/list.h>
-#include <linux/wait.h>
 #include <linux/percpu.h>
 #include <linux/timer.h>
 #include <linux/timerqueue.h>
-- 
cgit v1.2.3


From ee6a3d19f15b9b10075481088b8d4537f286d7b4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 6 Feb 2017 10:01:09 +0100
Subject: sched/headers: Remove the <linux/topology.h> include from
 <linux/sched.h>

It's used only by a single (rarely used) inline function (task_node(p)),
which we can move to <linux/sched/topology.h>.

( Add <linux/nodemask.h>, because we rely on that. )

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/mips/sgi-ip27/ip27-smp.c  | 1 +
 include/linux/sched.h          | 7 +------
 include/linux/sched/topology.h | 7 +++++++
 3 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/arch/mips/sgi-ip27/ip27-smp.c b/arch/mips/sgi-ip27/ip27-smp.c
index f9ae6a8fa7c7..f5ed45e8f442 100644
--- a/arch/mips/sgi-ip27/ip27-smp.c
+++ b/arch/mips/sgi-ip27/ip27-smp.c
@@ -8,6 +8,7 @@
  */
 #include <linux/init.h>
 #include <linux/sched.h>
+#include <linux/topology.h>
 #include <linux/nodemask.h>
 #include <asm/page.h>
 #include <asm/processor.h>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 559be4f1aee5..f9dc9cfaf079 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -4,6 +4,7 @@
 #include <uapi/linux/sched.h>
 
 #include <linux/sched/prio.h>
+#include <linux/nodemask.h>
 
 #include <linux/mutex.h>
 #include <linux/plist.h>
@@ -21,7 +22,6 @@
 #include <linux/kcov.h>
 #include <linux/task_io_accounting.h>
 #include <linux/latencytop.h>
-#include <linux/topology.h>
 
 #include <asm/current.h>
 
@@ -1454,11 +1454,6 @@ static inline unsigned int task_cpu(const struct task_struct *p)
 #endif
 }
 
-static inline int task_node(const struct task_struct *p)
-{
-	return cpu_to_node(task_cpu(p));
-}
-
 extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
 
 #else
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 0d6fceff37bb..7d065abc7a47 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -1,6 +1,8 @@
 #ifndef _LINUX_SCHED_TOPOLOGY_H
 #define _LINUX_SCHED_TOPOLOGY_H
 
+#include <linux/topology.h>
+
 #include <linux/sched/idle.h>
 
 /*
@@ -216,4 +218,9 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
 
 #endif	/* !CONFIG_SMP */
 
+static inline int task_node(const struct task_struct *p)
+{
+	return cpu_to_node(task_cpu(p));
+}
+
 #endif /* _LINUX_SCHED_TOPOLOGY_H */
-- 
cgit v1.2.3


From 7f5f8e8d97d77edf33f2836259d1f19c6f4d94f5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 6 Feb 2017 11:44:12 +0100
Subject: sched/headers: Remove #ifdefs from <linux/sched.h>

We can remove two pairs of #ifdefs by defining structures in a smarter way.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f9dc9cfaf079..341f5792fbe0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -224,8 +224,8 @@ struct task_cputime {
 #define prof_exp	stime
 #define sched_exp	sum_exec_runtime
 
-#ifdef CONFIG_SCHED_INFO
 struct sched_info {
+#ifdef CONFIG_SCHED_INFO
 	/* cumulative counters */
 	unsigned long pcount;	      /* # of times run on this cpu */
 	unsigned long long run_delay; /* time spent waiting on a runqueue */
@@ -233,8 +233,8 @@ struct sched_info {
 	/* timestamps */
 	unsigned long long last_arrival,/* when we last ran on a cpu */
 			   last_queued;	/* when we were last queued to run */
-};
 #endif /* CONFIG_SCHED_INFO */
+};
 
 /*
  * Integer metrics need fixed point arithmetic, e.g., sched/fair
@@ -309,8 +309,8 @@ struct sched_avg {
 	unsigned long load_avg, util_avg;
 };
 
-#ifdef CONFIG_SCHEDSTATS
 struct sched_statistics {
+#ifdef CONFIG_SCHEDSTATS
 	u64			wait_start;
 	u64			wait_max;
 	u64			wait_count;
@@ -342,8 +342,8 @@ struct sched_statistics {
 	u64			nr_wakeups_affine_attempts;
 	u64			nr_wakeups_passive;
 	u64			nr_wakeups_idle;
-};
 #endif
+};
 
 struct sched_entity {
 	struct load_weight	load;		/* for load-balancing */
@@ -358,9 +358,7 @@ struct sched_entity {
 
 	u64			nr_migrations;
 
-#ifdef CONFIG_SCHEDSTATS
 	struct sched_statistics statistics;
-#endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	int			depth;
@@ -530,9 +528,7 @@ struct task_struct {
 	int rcu_tasks_idle_cpu;
 #endif /* #ifdef CONFIG_TASKS_RCU */
 
-#ifdef CONFIG_SCHED_INFO
 	struct sched_info sched_info;
-#endif
 
 	struct list_head tasks;
 #ifdef CONFIG_SMP
-- 
cgit v1.2.3


From 5eca1c10cbaa9c366c18ca79f81f21c731e3dcc7 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 6 Feb 2017 22:06:35 +0100
Subject: sched/headers: Clean up <linux/sched.h>

Now that <linux/sched.h> dependencies have been sorted out,
do various trivial cleanups:

 - remove unnecessary structure predeclarations
 - fix various typos
 - update comments where necessary
 - remove pointless comments
 - use consistent types
 - tabulate consistently
 - use a consistent comment style
 - clean up the header section a bit
 - use a consistent style of a single field per line
 - remove line-breaks where they make the code look worse
 - etc ...

No change in functionality.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 1222 ++++++++++++++++++++++++++-----------------------
 1 file changed, 651 insertions(+), 571 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 341f5792fbe0..d67eee84fd43 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1,38 +1,38 @@
 #ifndef _LINUX_SCHED_H
 #define _LINUX_SCHED_H
 
-#include <uapi/linux/sched.h>
+/*
+ * Define 'struct task_struct' and provide the main scheduler
+ * APIs (schedule(), wakeup variants, etc.)
+ */
 
-#include <linux/sched/prio.h>
-#include <linux/nodemask.h>
+#include <uapi/linux/sched.h>
 
-#include <linux/mutex.h>
-#include <linux/plist.h>
-#include <linux/mm_types_task.h>
+#include <asm/current.h>
 
+#include <linux/pid.h>
 #include <linux/sem.h>
 #include <linux/shm.h>
-#include <linux/signal_types.h>
-#include <linux/pid.h>
+#include <linux/kcov.h>
+#include <linux/mutex.h>
+#include <linux/plist.h>
+#include <linux/hrtimer.h>
 #include <linux/seccomp.h>
+#include <linux/nodemask.h>
 #include <linux/rcupdate.h>
-
 #include <linux/resource.h>
-#include <linux/hrtimer.h>
-#include <linux/kcov.h>
-#include <linux/task_io_accounting.h>
 #include <linux/latencytop.h>
+#include <linux/sched/prio.h>
+#include <linux/signal_types.h>
+#include <linux/mm_types_task.h>
+#include <linux/task_io_accounting.h>
 
-#include <asm/current.h>
-
-/* task_struct member predeclarations: */
+/* task_struct member predeclarations (sorted alphabetically): */
 struct audit_context;
-struct autogroup;
 struct backing_dev_info;
 struct bio_list;
 struct blk_plug;
 struct cfs_rq;
-struct filename;
 struct fs_struct;
 struct futex_pi_state;
 struct io_context;
@@ -52,8 +52,6 @@ struct sighand_struct;
 struct signal_struct;
 struct task_delay_info;
 struct task_group;
-struct task_struct;
-struct uts_namespace;
 
 /*
  * Task state bitmask. NOTE! These bits are also
@@ -65,50 +63,53 @@ struct uts_namespace;
  * modifying one set can't modify the other one by
  * mistake.
  */
-#define TASK_RUNNING		0
-#define TASK_INTERRUPTIBLE	1
-#define TASK_UNINTERRUPTIBLE	2
-#define __TASK_STOPPED		4
-#define __TASK_TRACED		8
-/* in tsk->exit_state */
-#define EXIT_DEAD		16
-#define EXIT_ZOMBIE		32
-#define EXIT_TRACE		(EXIT_ZOMBIE | EXIT_DEAD)
-/* in tsk->state again */
-#define TASK_DEAD		64
-#define TASK_WAKEKILL		128
-#define TASK_WAKING		256
-#define TASK_PARKED		512
-#define TASK_NOLOAD		1024
-#define TASK_NEW		2048
-#define TASK_STATE_MAX		4096
-
-#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn"
-
-/* Convenience macros for the sake of set_current_state */
-#define TASK_KILLABLE		(TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
-#define TASK_STOPPED		(TASK_WAKEKILL | __TASK_STOPPED)
-#define TASK_TRACED		(TASK_WAKEKILL | __TASK_TRACED)
-
-#define TASK_IDLE		(TASK_UNINTERRUPTIBLE | TASK_NOLOAD)
-
-/* Convenience macros for the sake of wake_up */
-#define TASK_NORMAL		(TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
-#define TASK_ALL		(TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
-
-/* get_task_state() */
-#define TASK_REPORT		(TASK_RUNNING | TASK_INTERRUPTIBLE | \
-				 TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
-				 __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
-
-#define task_is_traced(task)	((task->state & __TASK_TRACED) != 0)
-#define task_is_stopped(task)	((task->state & __TASK_STOPPED) != 0)
-#define task_is_stopped_or_traced(task)	\
-			((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
-#define task_contributes_to_load(task)	\
-				((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
-				 (task->flags & PF_FROZEN) == 0 && \
-				 (task->state & TASK_NOLOAD) == 0)
+
+/* Used in tsk->state: */
+#define TASK_RUNNING			0
+#define TASK_INTERRUPTIBLE		1
+#define TASK_UNINTERRUPTIBLE		2
+#define __TASK_STOPPED			4
+#define __TASK_TRACED			8
+/* Used in tsk->exit_state: */
+#define EXIT_DEAD			16
+#define EXIT_ZOMBIE			32
+#define EXIT_TRACE			(EXIT_ZOMBIE | EXIT_DEAD)
+/* Used in tsk->state again: */
+#define TASK_DEAD			64
+#define TASK_WAKEKILL			128
+#define TASK_WAKING			256
+#define TASK_PARKED			512
+#define TASK_NOLOAD			1024
+#define TASK_NEW			2048
+#define TASK_STATE_MAX			4096
+
+#define TASK_STATE_TO_CHAR_STR		"RSDTtXZxKWPNn"
+
+/* Convenience macros for the sake of set_current_state: */
+#define TASK_KILLABLE			(TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
+#define TASK_STOPPED			(TASK_WAKEKILL | __TASK_STOPPED)
+#define TASK_TRACED			(TASK_WAKEKILL | __TASK_TRACED)
+
+#define TASK_IDLE			(TASK_UNINTERRUPTIBLE | TASK_NOLOAD)
+
+/* Convenience macros for the sake of wake_up(): */
+#define TASK_NORMAL			(TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
+#define TASK_ALL			(TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
+
+/* get_task_state(): */
+#define TASK_REPORT			(TASK_RUNNING | TASK_INTERRUPTIBLE | \
+					 TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
+					 __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
+
+#define task_is_traced(task)		((task->state & __TASK_TRACED) != 0)
+
+#define task_is_stopped(task)		((task->state & __TASK_STOPPED) != 0)
+
+#define task_is_stopped_or_traced(task)	((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
+
+#define task_contributes_to_load(task)	((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
+					 (task->flags & PF_FROZEN) == 0 && \
+					 (task->state & TASK_NOLOAD) == 0)
 
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 
@@ -158,26 +159,24 @@ struct uts_namespace;
  *
  * Also see the comments of try_to_wake_up().
  */
-#define __set_current_state(state_value)		\
-	do { current->state = (state_value); } while (0)
-#define set_current_state(state_value)			\
-	smp_store_mb(current->state, (state_value))
-
+#define __set_current_state(state_value) do { current->state = (state_value); } while (0)
+#define set_current_state(state_value)	 smp_store_mb(current->state, (state_value))
 #endif
 
-/* Task command name length */
-#define TASK_COMM_LEN 16
+/* Task command name length: */
+#define TASK_COMM_LEN			16
 
-extern cpumask_var_t cpu_isolated_map;
+extern cpumask_var_t			cpu_isolated_map;
 
 extern void scheduler_tick(void);
 
-#define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
-extern signed long schedule_timeout(signed long timeout);
-extern signed long schedule_timeout_interruptible(signed long timeout);
-extern signed long schedule_timeout_killable(signed long timeout);
-extern signed long schedule_timeout_uninterruptible(signed long timeout);
-extern signed long schedule_timeout_idle(signed long timeout);
+#define	MAX_SCHEDULE_TIMEOUT		LONG_MAX
+
+extern long schedule_timeout(long timeout);
+extern long schedule_timeout_interruptible(long timeout);
+extern long schedule_timeout_killable(long timeout);
+extern long schedule_timeout_uninterruptible(long timeout);
+extern long schedule_timeout_idle(long timeout);
 asmlinkage void schedule(void);
 extern void schedule_preempt_disabled(void);
 
@@ -197,9 +196,9 @@ extern void io_schedule(void);
  */
 struct prev_cputime {
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	u64 utime;
-	u64 stime;
-	raw_spinlock_t lock;
+	u64				utime;
+	u64				stime;
+	raw_spinlock_t			lock;
 #endif
 };
 
@@ -214,25 +213,34 @@ struct prev_cputime {
  * these counts together and treat all three of them in parallel.
  */
 struct task_cputime {
-	u64 utime;
-	u64 stime;
-	unsigned long long sum_exec_runtime;
+	u64				utime;
+	u64				stime;
+	unsigned long long		sum_exec_runtime;
 };
 
-/* Alternate field names when used to cache expirations. */
-#define virt_exp	utime
-#define prof_exp	stime
-#define sched_exp	sum_exec_runtime
+/* Alternate field names when used on cache expirations: */
+#define virt_exp			utime
+#define prof_exp			stime
+#define sched_exp			sum_exec_runtime
 
 struct sched_info {
 #ifdef CONFIG_SCHED_INFO
-	/* cumulative counters */
-	unsigned long pcount;	      /* # of times run on this cpu */
-	unsigned long long run_delay; /* time spent waiting on a runqueue */
+	/* Cumulative counters: */
+
+	/* # of times we have run on this CPU: */
+	unsigned long			pcount;
+
+	/* Time spent waiting on a runqueue: */
+	unsigned long long		run_delay;
+
+	/* Timestamps: */
+
+	/* When did we last run on a CPU? */
+	unsigned long long		last_arrival;
+
+	/* When were we last queued to run? */
+	unsigned long long		last_queued;
 
-	/* timestamps */
-	unsigned long long last_arrival,/* when we last ran on a cpu */
-			   last_queued;	/* when we were last queued to run */
 #endif /* CONFIG_SCHED_INFO */
 };
 
@@ -243,12 +251,12 @@ struct sched_info {
  * We define a basic fixed point arithmetic range, and then formalize
  * all these metrics based on that basic range.
  */
-# define SCHED_FIXEDPOINT_SHIFT	10
-# define SCHED_FIXEDPOINT_SCALE	(1L << SCHED_FIXEDPOINT_SHIFT)
+# define SCHED_FIXEDPOINT_SHIFT		10
+# define SCHED_FIXEDPOINT_SCALE		(1L << SCHED_FIXEDPOINT_SHIFT)
 
 struct load_weight {
-	unsigned long weight;
-	u32 inv_weight;
+	unsigned long			weight;
+	u32				inv_weight;
 };
 
 /*
@@ -304,69 +312,73 @@ struct load_weight {
  * issues.
  */
 struct sched_avg {
-	u64 last_update_time, load_sum;
-	u32 util_sum, period_contrib;
-	unsigned long load_avg, util_avg;
+	u64				last_update_time;
+	u64				load_sum;
+	u32				util_sum;
+	u32				period_contrib;
+	unsigned long			load_avg;
+	unsigned long			util_avg;
 };
 
 struct sched_statistics {
 #ifdef CONFIG_SCHEDSTATS
-	u64			wait_start;
-	u64			wait_max;
-	u64			wait_count;
-	u64			wait_sum;
-	u64			iowait_count;
-	u64			iowait_sum;
-
-	u64			sleep_start;
-	u64			sleep_max;
-	s64			sum_sleep_runtime;
-
-	u64			block_start;
-	u64			block_max;
-	u64			exec_max;
-	u64			slice_max;
-
-	u64			nr_migrations_cold;
-	u64			nr_failed_migrations_affine;
-	u64			nr_failed_migrations_running;
-	u64			nr_failed_migrations_hot;
-	u64			nr_forced_migrations;
-
-	u64			nr_wakeups;
-	u64			nr_wakeups_sync;
-	u64			nr_wakeups_migrate;
-	u64			nr_wakeups_local;
-	u64			nr_wakeups_remote;
-	u64			nr_wakeups_affine;
-	u64			nr_wakeups_affine_attempts;
-	u64			nr_wakeups_passive;
-	u64			nr_wakeups_idle;
+	u64				wait_start;
+	u64				wait_max;
+	u64				wait_count;
+	u64				wait_sum;
+	u64				iowait_count;
+	u64				iowait_sum;
+
+	u64				sleep_start;
+	u64				sleep_max;
+	s64				sum_sleep_runtime;
+
+	u64				block_start;
+	u64				block_max;
+	u64				exec_max;
+	u64				slice_max;
+
+	u64				nr_migrations_cold;
+	u64				nr_failed_migrations_affine;
+	u64				nr_failed_migrations_running;
+	u64				nr_failed_migrations_hot;
+	u64				nr_forced_migrations;
+
+	u64				nr_wakeups;
+	u64				nr_wakeups_sync;
+	u64				nr_wakeups_migrate;
+	u64				nr_wakeups_local;
+	u64				nr_wakeups_remote;
+	u64				nr_wakeups_affine;
+	u64				nr_wakeups_affine_attempts;
+	u64				nr_wakeups_passive;
+	u64				nr_wakeups_idle;
 #endif
 };
 
 struct sched_entity {
-	struct load_weight	load;		/* for load-balancing */
-	struct rb_node		run_node;
-	struct list_head	group_node;
-	unsigned int		on_rq;
+	/* For load-balancing: */
+	struct load_weight		load;
+	struct rb_node			run_node;
+	struct list_head		group_node;
+	unsigned int			on_rq;
 
-	u64			exec_start;
-	u64			sum_exec_runtime;
-	u64			vruntime;
-	u64			prev_sum_exec_runtime;
+	u64				exec_start;
+	u64				sum_exec_runtime;
+	u64				vruntime;
+	u64				prev_sum_exec_runtime;
 
-	u64			nr_migrations;
+	u64				nr_migrations;
 
-	struct sched_statistics statistics;
+	struct sched_statistics		statistics;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	int			depth;
-	struct sched_entity	*parent;
+	int				depth;
+	struct sched_entity		*parent;
 	/* rq on which this entity is (to be) queued: */
-	struct cfs_rq		*cfs_rq;
+	struct cfs_rq			*cfs_rq;
 	/* rq "owned" by this entity/group: */
-	struct cfs_rq		*my_q;
+	struct cfs_rq			*my_q;
 #endif
 
 #ifdef CONFIG_SMP
@@ -376,49 +388,49 @@ struct sched_entity {
 	 * Put into separate cache line so it does not
 	 * collide with read-mostly values above.
 	 */
-	struct sched_avg	avg ____cacheline_aligned_in_smp;
+	struct sched_avg		avg ____cacheline_aligned_in_smp;
 #endif
 };
 
 struct sched_rt_entity {
-	struct list_head run_list;
-	unsigned long timeout;
-	unsigned long watchdog_stamp;
-	unsigned int time_slice;
-	unsigned short on_rq;
-	unsigned short on_list;
-
-	struct sched_rt_entity *back;
+	struct list_head		run_list;
+	unsigned long			timeout;
+	unsigned long			watchdog_stamp;
+	unsigned int			time_slice;
+	unsigned short			on_rq;
+	unsigned short			on_list;
+
+	struct sched_rt_entity		*back;
 #ifdef CONFIG_RT_GROUP_SCHED
-	struct sched_rt_entity	*parent;
+	struct sched_rt_entity		*parent;
 	/* rq on which this entity is (to be) queued: */
-	struct rt_rq		*rt_rq;
+	struct rt_rq			*rt_rq;
 	/* rq "owned" by this entity/group: */
-	struct rt_rq		*my_q;
+	struct rt_rq			*my_q;
 #endif
 };
 
 struct sched_dl_entity {
-	struct rb_node	rb_node;
+	struct rb_node			rb_node;
 
 	/*
 	 * Original scheduling parameters. Copied here from sched_attr
 	 * during sched_setattr(), they will remain the same until
 	 * the next sched_setattr().
 	 */
-	u64 dl_runtime;		/* maximum runtime for each instance	*/
-	u64 dl_deadline;	/* relative deadline of each instance	*/
-	u64 dl_period;		/* separation of two instances (period) */
-	u64 dl_bw;		/* dl_runtime / dl_deadline		*/
+	u64				dl_runtime;	/* Maximum runtime for each instance	*/
+	u64				dl_deadline;	/* Relative deadline of each instance	*/
+	u64				dl_period;	/* Separation of two instances (period) */
+	u64				dl_bw;		/* dl_runtime / dl_deadline		*/
 
 	/*
 	 * Actual scheduling parameters. Initialized with the values above,
 	 * they are continously updated during task execution. Note that
 	 * the remaining runtime could be < 0 in case we are in overrun.
 	 */
-	s64 runtime;		/* remaining runtime for this instance	*/
-	u64 deadline;		/* absolute deadline for this instance	*/
-	unsigned int flags;	/* specifying the scheduler behaviour	*/
+	s64				runtime;	/* Remaining runtime for this instance	*/
+	u64				deadline;	/* Absolute deadline for this instance	*/
+	unsigned int			flags;		/* Specifying the scheduler behaviour	*/
 
 	/*
 	 * Some bool flags:
@@ -431,24 +443,28 @@ struct sched_dl_entity {
 	 * outside bandwidth enforcement mechanism (but only until we
 	 * exit the critical section);
 	 *
-	 * @dl_yielded tells if task gave up the cpu before consuming
+	 * @dl_yielded tells if task gave up the CPU before consuming
 	 * all its available runtime during the last job.
 	 */
-	int dl_throttled, dl_boosted, dl_yielded;
+	int				dl_throttled;
+	int				dl_boosted;
+	int				dl_yielded;
 
 	/*
 	 * Bandwidth enforcement timer. Each -deadline task has its
 	 * own bandwidth to be enforced, thus we need one timer per task.
 	 */
-	struct hrtimer dl_timer;
+	struct hrtimer			dl_timer;
 };
 
 union rcu_special {
 	struct {
-		u8 blocked;
-		u8 need_qs;
-		u8 exp_need_qs;
-		u8 pad;	/* Otherwise the compiler can store garbage here. */
+		u8			blocked;
+		u8			need_qs;
+		u8			exp_need_qs;
+
+		/* Otherwise the compiler can store garbage here: */
+		u8			pad;
 	} b; /* Bits. */
 	u32 s; /* Set of bits. */
 };
@@ -470,361 +486,417 @@ struct task_struct {
 	 * For reasons of header soup (see current_thread_info()), this
 	 * must be the first element of task_struct.
 	 */
-	struct thread_info thread_info;
+	struct thread_info		thread_info;
 #endif
-	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
-	void *stack;
-	atomic_t usage;
-	unsigned int flags;	/* per process flags, defined below */
-	unsigned int ptrace;
+	/* -1 unrunnable, 0 runnable, >0 stopped: */
+	volatile long			state;
+	void				*stack;
+	atomic_t			usage;
+	/* Per task flags (PF_*), defined further below: */
+	unsigned int			flags;
+	unsigned int			ptrace;
 
 #ifdef CONFIG_SMP
-	struct llist_node wake_entry;
-	int on_cpu;
+	struct llist_node		wake_entry;
+	int				on_cpu;
 #ifdef CONFIG_THREAD_INFO_IN_TASK
-	unsigned int cpu;	/* current CPU */
+	/* Current CPU: */
+	unsigned int			cpu;
 #endif
-	unsigned int wakee_flips;
-	unsigned long wakee_flip_decay_ts;
-	struct task_struct *last_wakee;
+	unsigned int			wakee_flips;
+	unsigned long			wakee_flip_decay_ts;
+	struct task_struct		*last_wakee;
 
-	int wake_cpu;
+	int				wake_cpu;
 #endif
-	int on_rq;
+	int				on_rq;
+
+	int				prio;
+	int				static_prio;
+	int				normal_prio;
+	unsigned int			rt_priority;
 
-	int prio, static_prio, normal_prio;
-	unsigned int rt_priority;
-	const struct sched_class *sched_class;
-	struct sched_entity se;
-	struct sched_rt_entity rt;
+	const struct sched_class	*sched_class;
+	struct sched_entity		se;
+	struct sched_rt_entity		rt;
 #ifdef CONFIG_CGROUP_SCHED
-	struct task_group *sched_task_group;
+	struct task_group		*sched_task_group;
 #endif
-	struct sched_dl_entity dl;
+	struct sched_dl_entity		dl;
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
-	/* list of struct preempt_notifier: */
-	struct hlist_head preempt_notifiers;
+	/* List of struct preempt_notifier: */
+	struct hlist_head		preempt_notifiers;
 #endif
 
 #ifdef CONFIG_BLK_DEV_IO_TRACE
-	unsigned int btrace_seq;
+	unsigned int			btrace_seq;
 #endif
 
-	unsigned int policy;
-	int nr_cpus_allowed;
-	cpumask_t cpus_allowed;
+	unsigned int			policy;
+	int				nr_cpus_allowed;
+	cpumask_t			cpus_allowed;
 
 #ifdef CONFIG_PREEMPT_RCU
-	int rcu_read_lock_nesting;
-	union rcu_special rcu_read_unlock_special;
-	struct list_head rcu_node_entry;
-	struct rcu_node *rcu_blocked_node;
+	int				rcu_read_lock_nesting;
+	union rcu_special		rcu_read_unlock_special;
+	struct list_head		rcu_node_entry;
+	struct rcu_node			*rcu_blocked_node;
 #endif /* #ifdef CONFIG_PREEMPT_RCU */
+
 #ifdef CONFIG_TASKS_RCU
-	unsigned long rcu_tasks_nvcsw;
-	bool rcu_tasks_holdout;
-	struct list_head rcu_tasks_holdout_list;
-	int rcu_tasks_idle_cpu;
+	unsigned long			rcu_tasks_nvcsw;
+	bool				rcu_tasks_holdout;
+	struct list_head		rcu_tasks_holdout_list;
+	int				rcu_tasks_idle_cpu;
 #endif /* #ifdef CONFIG_TASKS_RCU */
 
-	struct sched_info sched_info;
+	struct sched_info		sched_info;
 
-	struct list_head tasks;
+	struct list_head		tasks;
 #ifdef CONFIG_SMP
-	struct plist_node pushable_tasks;
-	struct rb_node pushable_dl_tasks;
+	struct plist_node		pushable_tasks;
+	struct rb_node			pushable_dl_tasks;
 #endif
 
-	struct mm_struct *mm, *active_mm;
+	struct mm_struct		*mm;
+	struct mm_struct		*active_mm;
 
 	/* Per-thread vma caching: */
-	struct vmacache vmacache;
-
-#if defined(SPLIT_RSS_COUNTING)
-	struct task_rss_stat	rss_stat;
-#endif
-/* task state */
-	int exit_state;
-	int exit_code, exit_signal;
-	int pdeath_signal;  /*  The signal sent when the parent dies  */
-	unsigned long jobctl;	/* JOBCTL_*, siglock protected */
-
-	/* Used for emulating ABI behavior of previous Linux versions */
-	unsigned int personality;
-
-	/* scheduler bits, serialized by scheduler locks */
-	unsigned sched_reset_on_fork:1;
-	unsigned sched_contributes_to_load:1;
-	unsigned sched_migrated:1;
-	unsigned sched_remote_wakeup:1;
-	unsigned :0; /* force alignment to the next boundary */
-
-	/* unserialized, strictly 'current' */
-	unsigned in_execve:1; /* bit to tell LSMs we're in execve */
-	unsigned in_iowait:1;
-#if !defined(TIF_RESTORE_SIGMASK)
-	unsigned restore_sigmask:1;
+	struct vmacache			vmacache;
+
+#ifdef SPLIT_RSS_COUNTING
+	struct task_rss_stat		rss_stat;
+#endif
+	int				exit_state;
+	int				exit_code;
+	int				exit_signal;
+	/* The signal sent when the parent dies: */
+	int				pdeath_signal;
+	/* JOBCTL_*, siglock protected: */
+	unsigned long			jobctl;
+
+	/* Used for emulating ABI behavior of previous Linux versions: */
+	unsigned int			personality;
+
+	/* Scheduler bits, serialized by scheduler locks: */
+	unsigned			sched_reset_on_fork:1;
+	unsigned			sched_contributes_to_load:1;
+	unsigned			sched_migrated:1;
+	unsigned			sched_remote_wakeup:1;
+	/* Force alignment to the next boundary: */
+	unsigned			:0;
+
+	/* Unserialized, strictly 'current' */
+
+	/* Bit to tell LSMs we're in execve(): */
+	unsigned			in_execve:1;
+	unsigned			in_iowait:1;
+#ifndef TIF_RESTORE_SIGMASK
+	unsigned			restore_sigmask:1;
 #endif
 #ifdef CONFIG_MEMCG
-	unsigned memcg_may_oom:1;
+	unsigned			memcg_may_oom:1;
 #ifndef CONFIG_SLOB
-	unsigned memcg_kmem_skip_account:1;
+	unsigned			memcg_kmem_skip_account:1;
 #endif
 #endif
 #ifdef CONFIG_COMPAT_BRK
-	unsigned brk_randomized:1;
+	unsigned			brk_randomized:1;
 #endif
 
-	unsigned long atomic_flags; /* Flags needing atomic access. */
+	unsigned long			atomic_flags; /* Flags requiring atomic access. */
 
-	struct restart_block restart_block;
+	struct restart_block		restart_block;
 
-	pid_t pid;
-	pid_t tgid;
+	pid_t				pid;
+	pid_t				tgid;
 
 #ifdef CONFIG_CC_STACKPROTECTOR
-	/* Canary value for the -fstack-protector gcc feature */
-	unsigned long stack_canary;
+	/* Canary value for the -fstack-protector GCC feature: */
+	unsigned long			stack_canary;
 #endif
 	/*
-	 * pointers to (original) parent process, youngest child, younger sibling,
+	 * Pointers to the (original) parent process, youngest child, younger sibling,
 	 * older sibling, respectively.  (p->father can be replaced with
 	 * p->real_parent->pid)
 	 */
-	struct task_struct __rcu *real_parent; /* real parent process */
-	struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */
+
+	/* Real parent process: */
+	struct task_struct __rcu	*real_parent;
+
+	/* Recipient of SIGCHLD, wait4() reports: */
+	struct task_struct __rcu	*parent;
+
 	/*
-	 * children/sibling forms the list of my natural children
+	 * Children/sibling form the list of natural children:
 	 */
-	struct list_head children;	/* list of my children */
-	struct list_head sibling;	/* linkage in my parent's children list */
-	struct task_struct *group_leader;	/* threadgroup leader */
+	struct list_head		children;
+	struct list_head		sibling;
+	struct task_struct		*group_leader;
 
 	/*
-	 * ptraced is the list of tasks this task is using ptrace on.
+	 * 'ptraced' is the list of tasks this task is using ptrace() on.
+	 *
 	 * This includes both natural children and PTRACE_ATTACH targets.
-	 * p->ptrace_entry is p's link on the p->parent->ptraced list.
+	 * 'ptrace_entry' is this task's link on the p->parent->ptraced list.
 	 */
-	struct list_head ptraced;
-	struct list_head ptrace_entry;
+	struct list_head		ptraced;
+	struct list_head		ptrace_entry;
 
 	/* PID/PID hash table linkage. */
-	struct pid_link pids[PIDTYPE_MAX];
-	struct list_head thread_group;
-	struct list_head thread_node;
+	struct pid_link			pids[PIDTYPE_MAX];
+	struct list_head		thread_group;
+	struct list_head		thread_node;
+
+	struct completion		*vfork_done;
 
-	struct completion *vfork_done;		/* for vfork() */
-	int __user *set_child_tid;		/* CLONE_CHILD_SETTID */
-	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
+	/* CLONE_CHILD_SETTID: */
+	int __user			*set_child_tid;
 
-	u64 utime, stime;
+	/* CLONE_CHILD_CLEARTID: */
+	int __user			*clear_child_tid;
+
+	u64				utime;
+	u64				stime;
 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
-	u64 utimescaled, stimescaled;
+	u64				utimescaled;
+	u64				stimescaled;
 #endif
-	u64 gtime;
-	struct prev_cputime prev_cputime;
+	u64				gtime;
+	struct prev_cputime		prev_cputime;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-	seqcount_t vtime_seqcount;
-	unsigned long long vtime_snap;
+	seqcount_t			vtime_seqcount;
+	unsigned long long		vtime_snap;
 	enum {
-		/* Task is sleeping or running in a CPU with VTIME inactive */
+		/* Task is sleeping or running in a CPU with VTIME inactive: */
 		VTIME_INACTIVE = 0,
-		/* Task runs in userspace in a CPU with VTIME active */
+		/* Task runs in userspace in a CPU with VTIME active: */
 		VTIME_USER,
-		/* Task runs in kernelspace in a CPU with VTIME active */
+		/* Task runs in kernelspace in a CPU with VTIME active: */
 		VTIME_SYS,
 	} vtime_snap_whence;
 #endif
 
 #ifdef CONFIG_NO_HZ_FULL
-	atomic_t tick_dep_mask;
+	atomic_t			tick_dep_mask;
 #endif
-	unsigned long nvcsw, nivcsw; /* context switch counts */
-	u64 start_time;		/* monotonic time in nsec */
-	u64 real_start_time;	/* boot based time in nsec */
-/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
-	unsigned long min_flt, maj_flt;
+	/* Context switch counts: */
+	unsigned long			nvcsw;
+	unsigned long			nivcsw;
+
+	/* Monotonic time in nsecs: */
+	u64				start_time;
+
+	/* Boot based time in nsecs: */
+	u64				real_start_time;
+
+	/* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */
+	unsigned long			min_flt;
+	unsigned long			maj_flt;
 
 #ifdef CONFIG_POSIX_TIMERS
-	struct task_cputime cputime_expires;
-	struct list_head cpu_timers[3];
-#endif
-
-/* process credentials */
-	const struct cred __rcu *ptracer_cred; /* Tracer's credentials at attach */
-	const struct cred __rcu *real_cred; /* objective and real subjective task
-					 * credentials (COW) */
-	const struct cred __rcu *cred;	/* effective (overridable) subjective task
-					 * credentials (COW) */
-	char comm[TASK_COMM_LEN]; /* executable name excluding path
-				     - access with [gs]et_task_comm (which lock
-				       it with task_lock())
-				     - initialized normally by setup_new_exec */
-/* file system info */
-	struct nameidata *nameidata;
+	struct task_cputime		cputime_expires;
+	struct list_head		cpu_timers[3];
+#endif
+
+	/* Process credentials: */
+
+	/* Tracer's credentials at attach: */
+	const struct cred __rcu		*ptracer_cred;
+
+	/* Objective and real subjective task credentials (COW): */
+	const struct cred __rcu		*real_cred;
+
+	/* Effective (overridable) subjective task credentials (COW): */
+	const struct cred __rcu		*cred;
+
+	/*
+	 * executable name, excluding path.
+	 *
+	 * - normally initialized setup_new_exec()
+	 * - access it with [gs]et_task_comm()
+	 * - lock it with task_lock()
+	 */
+	char				comm[TASK_COMM_LEN];
+
+	struct nameidata		*nameidata;
+
 #ifdef CONFIG_SYSVIPC
-/* ipc stuff */
-	struct sysv_sem sysvsem;
-	struct sysv_shm sysvshm;
+	struct sysv_sem			sysvsem;
+	struct sysv_shm			sysvshm;
 #endif
 #ifdef CONFIG_DETECT_HUNG_TASK
-/* hung task detection */
-	unsigned long last_switch_count;
-#endif
-/* filesystem information */
-	struct fs_struct *fs;
-/* open file information */
-	struct files_struct *files;
-/* namespaces */
-	struct nsproxy *nsproxy;
-/* signal handlers */
-	struct signal_struct *signal;
-	struct sighand_struct *sighand;
-
-	sigset_t blocked, real_blocked;
-	sigset_t saved_sigmask;	/* restored if set_restore_sigmask() was used */
-	struct sigpending pending;
-
-	unsigned long sas_ss_sp;
-	size_t sas_ss_size;
-	unsigned sas_ss_flags;
-
-	struct callback_head *task_works;
-
-	struct audit_context *audit_context;
+	unsigned long			last_switch_count;
+#endif
+	/* Filesystem information: */
+	struct fs_struct		*fs;
+
+	/* Open file information: */
+	struct files_struct		*files;
+
+	/* Namespaces: */
+	struct nsproxy			*nsproxy;
+
+	/* Signal handlers: */
+	struct signal_struct		*signal;
+	struct sighand_struct		*sighand;
+	sigset_t			blocked;
+	sigset_t			real_blocked;
+	/* Restored if set_restore_sigmask() was used: */
+	sigset_t			saved_sigmask;
+	struct sigpending		pending;
+	unsigned long			sas_ss_sp;
+	size_t				sas_ss_size;
+	unsigned int			sas_ss_flags;
+
+	struct callback_head		*task_works;
+
+	struct audit_context		*audit_context;
 #ifdef CONFIG_AUDITSYSCALL
-	kuid_t loginuid;
-	unsigned int sessionid;
+	kuid_t				loginuid;
+	unsigned int			sessionid;
 #endif
-	struct seccomp seccomp;
+	struct seccomp			seccomp;
 
-/* Thread group tracking */
-   	u32 parent_exec_id;
-   	u32 self_exec_id;
-/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
- * mempolicy */
-	spinlock_t alloc_lock;
+	/* Thread group tracking: */
+	u32				parent_exec_id;
+	u32				self_exec_id;
+
+	/* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
+	spinlock_t			alloc_lock;
 
 	/* Protection of the PI data structures: */
-	raw_spinlock_t pi_lock;
+	raw_spinlock_t			pi_lock;
 
-	struct wake_q_node wake_q;
+	struct wake_q_node		wake_q;
 
 #ifdef CONFIG_RT_MUTEXES
-	/* PI waiters blocked on a rt_mutex held by this task */
-	struct rb_root pi_waiters;
-	struct rb_node *pi_waiters_leftmost;
-	/* Deadlock detection and priority inheritance handling */
-	struct rt_mutex_waiter *pi_blocked_on;
+	/* PI waiters blocked on a rt_mutex held by this task: */
+	struct rb_root			pi_waiters;
+	struct rb_node			*pi_waiters_leftmost;
+	/* Deadlock detection and priority inheritance handling: */
+	struct rt_mutex_waiter		*pi_blocked_on;
 #endif
 
 #ifdef CONFIG_DEBUG_MUTEXES
-	/* mutex deadlock detection */
-	struct mutex_waiter *blocked_on;
+	/* Mutex deadlock detection: */
+	struct mutex_waiter		*blocked_on;
 #endif
+
 #ifdef CONFIG_TRACE_IRQFLAGS
-	unsigned int irq_events;
-	unsigned long hardirq_enable_ip;
-	unsigned long hardirq_disable_ip;
-	unsigned int hardirq_enable_event;
-	unsigned int hardirq_disable_event;
-	int hardirqs_enabled;
-	int hardirq_context;
-	unsigned long softirq_disable_ip;
-	unsigned long softirq_enable_ip;
-	unsigned int softirq_disable_event;
-	unsigned int softirq_enable_event;
-	int softirqs_enabled;
-	int softirq_context;
+	unsigned int			irq_events;
+	unsigned long			hardirq_enable_ip;
+	unsigned long			hardirq_disable_ip;
+	unsigned int			hardirq_enable_event;
+	unsigned int			hardirq_disable_event;
+	int				hardirqs_enabled;
+	int				hardirq_context;
+	unsigned long			softirq_disable_ip;
+	unsigned long			softirq_enable_ip;
+	unsigned int			softirq_disable_event;
+	unsigned int			softirq_enable_event;
+	int				softirqs_enabled;
+	int				softirq_context;
 #endif
+
 #ifdef CONFIG_LOCKDEP
-# define MAX_LOCK_DEPTH 48UL
-	u64 curr_chain_key;
-	int lockdep_depth;
-	unsigned int lockdep_recursion;
-	struct held_lock held_locks[MAX_LOCK_DEPTH];
-	gfp_t lockdep_reclaim_gfp;
+# define MAX_LOCK_DEPTH			48UL
+	u64				curr_chain_key;
+	int				lockdep_depth;
+	unsigned int			lockdep_recursion;
+	struct held_lock		held_locks[MAX_LOCK_DEPTH];
+	gfp_t				lockdep_reclaim_gfp;
 #endif
+
 #ifdef CONFIG_UBSAN
-	unsigned int in_ubsan;
+	unsigned int			in_ubsan;
 #endif
 
-/* journalling filesystem info */
-	void *journal_info;
+	/* Journalling filesystem info: */
+	void				*journal_info;
 
-/* stacked block device info */
-	struct bio_list *bio_list;
+	/* Stacked block device info: */
+	struct bio_list			*bio_list;
 
 #ifdef CONFIG_BLOCK
-/* stack plugging */
-	struct blk_plug *plug;
+	/* Stack plugging: */
+	struct blk_plug			*plug;
 #endif
 
-/* VM state */
-	struct reclaim_state *reclaim_state;
+	/* VM state: */
+	struct reclaim_state		*reclaim_state;
+
+	struct backing_dev_info		*backing_dev_info;
 
-	struct backing_dev_info *backing_dev_info;
+	struct io_context		*io_context;
 
-	struct io_context *io_context;
+	/* Ptrace state: */
+	unsigned long			ptrace_message;
+	siginfo_t			*last_siginfo;
 
-	unsigned long ptrace_message;
-	siginfo_t *last_siginfo; /* For ptrace use.  */
-	struct task_io_accounting ioac;
-#if defined(CONFIG_TASK_XACCT)
-	u64 acct_rss_mem1;	/* accumulated rss usage */
-	u64 acct_vm_mem1;	/* accumulated virtual memory usage */
-	u64 acct_timexpd;	/* stime + utime since last update */
+	struct task_io_accounting	ioac;
+#ifdef CONFIG_TASK_XACCT
+	/* Accumulated RSS usage: */
+	u64				acct_rss_mem1;
+	/* Accumulated virtual memory usage: */
+	u64				acct_vm_mem1;
+	/* stime + utime since last update: */
+	u64				acct_timexpd;
 #endif
 #ifdef CONFIG_CPUSETS
-	nodemask_t mems_allowed;	/* Protected by alloc_lock */
-	seqcount_t mems_allowed_seq;	/* Seqence no to catch updates */
-	int cpuset_mem_spread_rotor;
-	int cpuset_slab_spread_rotor;
+	/* Protected by ->alloc_lock: */
+	nodemask_t			mems_allowed;
+	/* Seqence number to catch updates: */
+	seqcount_t			mems_allowed_seq;
+	int				cpuset_mem_spread_rotor;
+	int				cpuset_slab_spread_rotor;
 #endif
 #ifdef CONFIG_CGROUPS
-	/* Control Group info protected by css_set_lock */
-	struct css_set __rcu *cgroups;
-	/* cg_list protected by css_set_lock and tsk->alloc_lock */
-	struct list_head cg_list;
+	/* Control Group info protected by css_set_lock: */
+	struct css_set __rcu		*cgroups;
+	/* cg_list protected by css_set_lock and tsk->alloc_lock: */
+	struct list_head		cg_list;
 #endif
 #ifdef CONFIG_INTEL_RDT_A
-	int closid;
+	int				closid;
 #endif
 #ifdef CONFIG_FUTEX
-	struct robust_list_head __user *robust_list;
+	struct robust_list_head __user	*robust_list;
 #ifdef CONFIG_COMPAT
 	struct compat_robust_list_head __user *compat_robust_list;
 #endif
-	struct list_head pi_state_list;
-	struct futex_pi_state *pi_state_cache;
+	struct list_head		pi_state_list;
+	struct futex_pi_state		*pi_state_cache;
 #endif
 #ifdef CONFIG_PERF_EVENTS
-	struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
-	struct mutex perf_event_mutex;
-	struct list_head perf_event_list;
+	struct perf_event_context	*perf_event_ctxp[perf_nr_task_contexts];
+	struct mutex			perf_event_mutex;
+	struct list_head		perf_event_list;
 #endif
 #ifdef CONFIG_DEBUG_PREEMPT
-	unsigned long preempt_disable_ip;
+	unsigned long			preempt_disable_ip;
 #endif
 #ifdef CONFIG_NUMA
-	struct mempolicy *mempolicy;	/* Protected by alloc_lock */
-	short il_next;
-	short pref_node_fork;
+	/* Protected by alloc_lock: */
+	struct mempolicy		*mempolicy;
+	short				il_next;
+	short				pref_node_fork;
 #endif
 #ifdef CONFIG_NUMA_BALANCING
-	int numa_scan_seq;
-	unsigned int numa_scan_period;
-	unsigned int numa_scan_period_max;
-	int numa_preferred_nid;
-	unsigned long numa_migrate_retry;
-	u64 node_stamp;			/* migration stamp  */
-	u64 last_task_numa_placement;
-	u64 last_sum_exec_runtime;
-	struct callback_head numa_work;
-
-	struct list_head numa_entry;
-	struct numa_group *numa_group;
+	int				numa_scan_seq;
+	unsigned int			numa_scan_period;
+	unsigned int			numa_scan_period_max;
+	int				numa_preferred_nid;
+	unsigned long			numa_migrate_retry;
+	/* Migration stamp: */
+	u64				node_stamp;
+	u64				last_task_numa_placement;
+	u64				last_sum_exec_runtime;
+	struct callback_head		numa_work;
+
+	struct list_head		numa_entry;
+	struct numa_group		*numa_group;
 
 	/*
 	 * numa_faults is an array split into four regions:
@@ -840,8 +912,8 @@ struct task_struct {
 	 * during the current scan window. When the scan completes, the counts
 	 * in faults_memory and faults_cpu decay and these values are copied.
 	 */
-	unsigned long *numa_faults;
-	unsigned long total_numa_faults;
+	unsigned long			*numa_faults;
+	unsigned long			total_numa_faults;
 
 	/*
 	 * numa_faults_locality tracks if faults recorded during the last
@@ -849,119 +921,132 @@ struct task_struct {
 	 * period is adapted based on the locality of the faults with different
 	 * weights depending on whether they were shared or private faults
 	 */
-	unsigned long numa_faults_locality[3];
+	unsigned long			numa_faults_locality[3];
 
-	unsigned long numa_pages_migrated;
+	unsigned long			numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
 
-	struct tlbflush_unmap_batch tlb_ubc;
+	struct tlbflush_unmap_batch	tlb_ubc;
 
-	struct rcu_head rcu;
+	struct rcu_head			rcu;
 
-	/*
-	 * cache last used pipe for splice
-	 */
-	struct pipe_inode_info *splice_pipe;
+	/* Cache last used pipe for splice(): */
+	struct pipe_inode_info		*splice_pipe;
 
-	struct page_frag task_frag;
+	struct page_frag		task_frag;
 
 #ifdef CONFIG_TASK_DELAY_ACCT
 	struct task_delay_info		*delays;
 #endif
 
 #ifdef CONFIG_FAULT_INJECTION
-	int make_it_fail;
+	int				make_it_fail;
 #endif
 	/*
-	 * when (nr_dirtied >= nr_dirtied_pause), it's time to call
-	 * balance_dirty_pages() for some dirty throttling pause
+	 * When (nr_dirtied >= nr_dirtied_pause), it's time to call
+	 * balance_dirty_pages() for a dirty throttling pause:
 	 */
-	int nr_dirtied;
-	int nr_dirtied_pause;
-	unsigned long dirty_paused_when; /* start of a write-and-pause period */
+	int				nr_dirtied;
+	int				nr_dirtied_pause;
+	/* Start of a write-and-pause period: */
+	unsigned long			dirty_paused_when;
 
 #ifdef CONFIG_LATENCYTOP
-	int latency_record_count;
-	struct latency_record latency_record[LT_SAVECOUNT];
+	int				latency_record_count;
+	struct latency_record		latency_record[LT_SAVECOUNT];
 #endif
 	/*
-	 * time slack values; these are used to round up poll() and
+	 * Time slack values; these are used to round up poll() and
 	 * select() etc timeout values. These are in nanoseconds.
 	 */
-	u64 timer_slack_ns;
-	u64 default_timer_slack_ns;
+	u64				timer_slack_ns;
+	u64				default_timer_slack_ns;
 
 #ifdef CONFIG_KASAN
-	unsigned int kasan_depth;
+	unsigned int			kasan_depth;
 #endif
+
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	/* Index of current stored address in ret_stack */
-	int curr_ret_stack;
-	/* Stack of return addresses for return function tracing */
-	struct ftrace_ret_stack	*ret_stack;
-	/* time stamp for last schedule */
-	unsigned long long ftrace_timestamp;
+	/* Index of current stored address in ret_stack: */
+	int				curr_ret_stack;
+
+	/* Stack of return addresses for return function tracing: */
+	struct ftrace_ret_stack		*ret_stack;
+
+	/* Timestamp for last schedule: */
+	unsigned long long		ftrace_timestamp;
+
 	/*
 	 * Number of functions that haven't been traced
-	 * because of depth overrun.
+	 * because of depth overrun:
 	 */
-	atomic_t trace_overrun;
-	/* Pause for the tracing */
-	atomic_t tracing_graph_pause;
+	atomic_t			trace_overrun;
+
+	/* Pause tracing: */
+	atomic_t			tracing_graph_pause;
 #endif
+
 #ifdef CONFIG_TRACING
-	/* state flags for use by tracers */
-	unsigned long trace;
-	/* bitmask and counter of trace recursion */
-	unsigned long trace_recursion;
+	/* State flags for use by tracers: */
+	unsigned long			trace;
+
+	/* Bitmask and counter of trace recursion: */
+	unsigned long			trace_recursion;
 #endif /* CONFIG_TRACING */
+
 #ifdef CONFIG_KCOV
-	/* Coverage collection mode enabled for this task (0 if disabled). */
-	enum kcov_mode kcov_mode;
-	/* Size of the kcov_area. */
-	unsigned	kcov_size;
-	/* Buffer for coverage collection. */
-	void		*kcov_area;
-	/* kcov desciptor wired with this task or NULL. */
-	struct kcov	*kcov;
+	/* Coverage collection mode enabled for this task (0 if disabled): */
+	enum kcov_mode			kcov_mode;
+
+	/* Size of the kcov_area: */
+	unsigned int			kcov_size;
+
+	/* Buffer for coverage collection: */
+	void				*kcov_area;
+
+	/* KCOV descriptor wired with this task or NULL: */
+	struct kcov			*kcov;
 #endif
+
 #ifdef CONFIG_MEMCG
-	struct mem_cgroup *memcg_in_oom;
-	gfp_t memcg_oom_gfp_mask;
-	int memcg_oom_order;
+	struct mem_cgroup		*memcg_in_oom;
+	gfp_t				memcg_oom_gfp_mask;
+	int				memcg_oom_order;
 
-	/* number of pages to reclaim on returning to userland */
-	unsigned int memcg_nr_pages_over_high;
+	/* Number of pages to reclaim on returning to userland: */
+	unsigned int			memcg_nr_pages_over_high;
 #endif
+
 #ifdef CONFIG_UPROBES
-	struct uprobe_task *utask;
+	struct uprobe_task		*utask;
 #endif
 #if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
-	unsigned int	sequential_io;
-	unsigned int	sequential_io_avg;
+	unsigned int			sequential_io;
+	unsigned int			sequential_io_avg;
 #endif
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
-	unsigned long	task_state_change;
+	unsigned long			task_state_change;
 #endif
-	int pagefault_disabled;
+	int				pagefault_disabled;
 #ifdef CONFIG_MMU
-	struct task_struct *oom_reaper_list;
+	struct task_struct		*oom_reaper_list;
 #endif
 #ifdef CONFIG_VMAP_STACK
-	struct vm_struct *stack_vm_area;
+	struct vm_struct		*stack_vm_area;
 #endif
 #ifdef CONFIG_THREAD_INFO_IN_TASK
-	/* A live task holds one reference. */
-	atomic_t stack_refcount;
+	/* A live task holds one reference: */
+	atomic_t			stack_refcount;
 #endif
-/* CPU-specific state of this task */
-	struct thread_struct thread;
-/*
- * WARNING: on x86, 'thread_struct' contains a variable-sized
- * structure.  It *MUST* be at the end of 'task_struct'.
- *
- * Do not put anything below here!
- */
+	/* CPU-specific state of this task: */
+	struct thread_struct		thread;
+
+	/*
+	 * WARNING: on x86, 'thread_struct' contains a variable-sized
+	 * structure.  It *MUST* be at the end of 'task_struct'.
+	 *
+	 * Do not put anything below here!
+	 */
 };
 
 static inline struct pid *task_pid(struct task_struct *task)
@@ -975,7 +1060,7 @@ static inline struct pid *task_tgid(struct task_struct *task)
 }
 
 /*
- * Without tasklist or rcu lock it is not safe to dereference
+ * Without tasklist or RCU lock it is not safe to dereference
  * the result of task_pgrp/task_session even if task == current,
  * we can race with another thread doing sys_setsid/sys_setpgid.
  */
@@ -1002,16 +1087,14 @@ static inline struct pid *task_session(struct task_struct *task)
  *
  * see also pid_nr() etc in include/linux/pid.h
  */
-pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
-			struct pid_namespace *ns);
+pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns);
 
 static inline pid_t task_pid_nr(struct task_struct *tsk)
 {
 	return tsk->pid;
 }
 
-static inline pid_t task_pid_nr_ns(struct task_struct *tsk,
-					struct pid_namespace *ns)
+static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
 {
 	return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
 }
@@ -1027,15 +1110,28 @@ static inline pid_t task_tgid_nr(struct task_struct *tsk)
 	return tsk->tgid;
 }
 
-pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
+extern pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
 
 static inline pid_t task_tgid_vnr(struct task_struct *tsk)
 {
 	return pid_vnr(task_tgid(tsk));
 }
 
+/**
+ * pid_alive - check that a task structure is not stale
+ * @p: Task structure to be checked.
+ *
+ * Test if a process is not yet dead (at most zombie state)
+ * If pid_alive fails, then pointers within the task structure
+ * can be stale and must not be dereferenced.
+ *
+ * Return: 1 if the process is alive. 0 otherwise.
+ */
+static inline int pid_alive(const struct task_struct *p)
+{
+	return p->pids[PIDTYPE_PID].pid != NULL;
+}
 
-static inline int pid_alive(const struct task_struct *p);
 static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
 {
 	pid_t pid = 0;
@@ -1053,8 +1149,7 @@ static inline pid_t task_ppid_nr(const struct task_struct *tsk)
 	return task_ppid_nr_ns(tsk, &init_pid_ns);
 }
 
-static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk,
-					struct pid_namespace *ns)
+static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
 {
 	return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
 }
@@ -1065,8 +1160,7 @@ static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
 }
 
 
-static inline pid_t task_session_nr_ns(struct task_struct *tsk,
-					struct pid_namespace *ns)
+static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
 {
 	return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
 }
@@ -1076,27 +1170,12 @@ static inline pid_t task_session_vnr(struct task_struct *tsk)
 	return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
 }
 
-/* obsolete, do not use */
+/* Obsolete, do not use: */
 static inline pid_t task_pgrp_nr(struct task_struct *tsk)
 {
 	return task_pgrp_nr_ns(tsk, &init_pid_ns);
 }
 
-/**
- * pid_alive - check that a task structure is not stale
- * @p: Task structure to be checked.
- *
- * Test if a process is not yet dead (at most zombie state)
- * If pid_alive fails, then pointers within the task structure
- * can be stale and must not be dereferenced.
- *
- * Return: 1 if the process is alive. 0 otherwise.
- */
-static inline int pid_alive(const struct task_struct *p)
-{
-	return p->pids[PIDTYPE_PID].pid != NULL;
-}
-
 /**
  * is_global_init - check if a task structure is init. Since init
  * is free to have sub-threads we need to check tgid.
@@ -1116,34 +1195,34 @@ extern struct pid *cad_pid;
 /*
  * Per process flags
  */
-#define PF_IDLE		0x00000002	/* I am an IDLE thread */
-#define PF_EXITING	0x00000004	/* getting shut down */
-#define PF_EXITPIDONE	0x00000008	/* pi exit done on shut down */
-#define PF_VCPU		0x00000010	/* I'm a virtual CPU */
-#define PF_WQ_WORKER	0x00000020	/* I'm a workqueue worker */
-#define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
-#define PF_MCE_PROCESS  0x00000080      /* process policy on mce errors */
-#define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
-#define PF_DUMPCORE	0x00000200	/* dumped core */
-#define PF_SIGNALED	0x00000400	/* killed by a signal */
-#define PF_MEMALLOC	0x00000800	/* Allocating memory */
-#define PF_NPROC_EXCEEDED 0x00001000	/* set_user noticed that RLIMIT_NPROC was exceeded */
-#define PF_USED_MATH	0x00002000	/* if unset the fpu must be initialized before use */
-#define PF_USED_ASYNC	0x00004000	/* used async_schedule*(), used by module init */
-#define PF_NOFREEZE	0x00008000	/* this thread should not be frozen */
-#define PF_FROZEN	0x00010000	/* frozen for system suspend */
-#define PF_FSTRANS	0x00020000	/* inside a filesystem transaction */
-#define PF_KSWAPD	0x00040000	/* I am kswapd */
-#define PF_MEMALLOC_NOIO 0x00080000	/* Allocating memory without IO involved */
-#define PF_LESS_THROTTLE 0x00100000	/* Throttle me less: I clean memory */
-#define PF_KTHREAD	0x00200000	/* I am a kernel thread */
-#define PF_RANDOMIZE	0x00400000	/* randomize virtual address space */
-#define PF_SWAPWRITE	0x00800000	/* Allowed to write to swap */
-#define PF_NO_SETAFFINITY 0x04000000	/* Userland is not allowed to meddle with cpus_allowed */
-#define PF_MCE_EARLY    0x08000000      /* Early kill for mce process policy */
-#define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
-#define PF_FREEZER_SKIP	0x40000000	/* Freezer should not count it as freezable */
-#define PF_SUSPEND_TASK 0x80000000      /* this thread called freeze_processes and should not be frozen */
+#define PF_IDLE			0x00000002	/* I am an IDLE thread */
+#define PF_EXITING		0x00000004	/* Getting shut down */
+#define PF_EXITPIDONE		0x00000008	/* PI exit done on shut down */
+#define PF_VCPU			0x00000010	/* I'm a virtual CPU */
+#define PF_WQ_WORKER		0x00000020	/* I'm a workqueue worker */
+#define PF_FORKNOEXEC		0x00000040	/* Forked but didn't exec */
+#define PF_MCE_PROCESS		0x00000080      /* Process policy on mce errors */
+#define PF_SUPERPRIV		0x00000100	/* Used super-user privileges */
+#define PF_DUMPCORE		0x00000200	/* Dumped core */
+#define PF_SIGNALED		0x00000400	/* Killed by a signal */
+#define PF_MEMALLOC		0x00000800	/* Allocating memory */
+#define PF_NPROC_EXCEEDED	0x00001000	/* set_user() noticed that RLIMIT_NPROC was exceeded */
+#define PF_USED_MATH		0x00002000	/* If unset the fpu must be initialized before use */
+#define PF_USED_ASYNC		0x00004000	/* Used async_schedule*(), used by module init */
+#define PF_NOFREEZE		0x00008000	/* This thread should not be frozen */
+#define PF_FROZEN		0x00010000	/* Frozen for system suspend */
+#define PF_FSTRANS		0x00020000	/* Inside a filesystem transaction */
+#define PF_KSWAPD		0x00040000	/* I am kswapd */
+#define PF_MEMALLOC_NOIO	0x00080000	/* Allocating memory without IO involved */
+#define PF_LESS_THROTTLE	0x00100000	/* Throttle me less: I clean memory */
+#define PF_KTHREAD		0x00200000	/* I am a kernel thread */
+#define PF_RANDOMIZE		0x00400000	/* Randomize virtual address space */
+#define PF_SWAPWRITE		0x00800000	/* Allowed to write to swap */
+#define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_allowed */
+#define PF_MCE_EARLY		0x08000000      /* Early kill for mce process policy */
+#define PF_MUTEX_TESTER		0x20000000	/* Thread belongs to the rt mutex tester */
+#define PF_FREEZER_SKIP		0x40000000	/* Freezer should not count it as freezable */
+#define PF_SUSPEND_TASK		0x80000000      /* This thread called freeze_processes() and should not be frozen */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
@@ -1156,33 +1235,38 @@ extern struct pid *cad_pid;
  * child is not running and in turn not changing child->flags
  * at the same time the parent does it.
  */
-#define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0)
-#define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0)
-#define clear_used_math() clear_stopped_child_used_math(current)
-#define set_used_math() set_stopped_child_used_math(current)
+#define clear_stopped_child_used_math(child)	do { (child)->flags &= ~PF_USED_MATH; } while (0)
+#define set_stopped_child_used_math(child)	do { (child)->flags |= PF_USED_MATH; } while (0)
+#define clear_used_math()			clear_stopped_child_used_math(current)
+#define set_used_math()				set_stopped_child_used_math(current)
+
 #define conditional_stopped_child_used_math(condition, child) \
 	do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0)
-#define conditional_used_math(condition) \
-	conditional_stopped_child_used_math(condition, current)
+
+#define conditional_used_math(condition)	conditional_stopped_child_used_math(condition, current)
+
 #define copy_to_stopped_child_used_math(child) \
 	do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0)
+
 /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
-#define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
-#define used_math() tsk_used_math(current)
+#define tsk_used_math(p)			((p)->flags & PF_USED_MATH)
+#define used_math()				tsk_used_math(current)
 
 /* Per-process atomic flags. */
-#define PFA_NO_NEW_PRIVS 0	/* May not gain new privileges. */
-#define PFA_SPREAD_PAGE  1      /* Spread page cache over cpuset */
-#define PFA_SPREAD_SLAB  2      /* Spread some slab caches over cpuset */
-#define PFA_LMK_WAITING  3      /* Lowmemorykiller is waiting */
+#define PFA_NO_NEW_PRIVS		0	/* May not gain new privileges. */
+#define PFA_SPREAD_PAGE			1	/* Spread page cache over cpuset */
+#define PFA_SPREAD_SLAB			2	/* Spread some slab caches over cpuset */
+#define PFA_LMK_WAITING			3	/* Lowmemorykiller is waiting */
 
 
 #define TASK_PFA_TEST(name, func)					\
 	static inline bool task_##func(struct task_struct *p)		\
 	{ return test_bit(PFA_##name, &p->atomic_flags); }
+
 #define TASK_PFA_SET(name, func)					\
 	static inline void task_set_##func(struct task_struct *p)	\
 	{ set_bit(PFA_##name, &p->atomic_flags); }
+
 #define TASK_PFA_CLEAR(name, func)					\
 	static inline void task_clear_##func(struct task_struct *p)	\
 	{ clear_bit(PFA_##name, &p->atomic_flags); }
@@ -1201,30 +1285,23 @@ TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab)
 TASK_PFA_TEST(LMK_WAITING, lmk_waiting)
 TASK_PFA_SET(LMK_WAITING, lmk_waiting)
 
-static inline void tsk_restore_flags(struct task_struct *task,
-				unsigned long orig_flags, unsigned long flags)
+static inline void
+tsk_restore_flags(struct task_struct *task, unsigned long orig_flags, unsigned long flags)
 {
 	task->flags &= ~flags;
 	task->flags |= orig_flags & flags;
 }
 
-extern int cpuset_cpumask_can_shrink(const struct cpumask *cur,
-				     const struct cpumask *trial);
-extern int task_can_attach(struct task_struct *p,
-			   const struct cpumask *cs_cpus_allowed);
+extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
+extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
 #ifdef CONFIG_SMP
-extern void do_set_cpus_allowed(struct task_struct *p,
-			       const struct cpumask *new_mask);
-
-extern int set_cpus_allowed_ptr(struct task_struct *p,
-				const struct cpumask *new_mask);
+extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
+extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
 #else
-static inline void do_set_cpus_allowed(struct task_struct *p,
-				      const struct cpumask *new_mask)
+static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
 }
-static inline int set_cpus_allowed_ptr(struct task_struct *p,
-				       const struct cpumask *new_mask)
+static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 {
 	if (!cpumask_test_cpu(0, new_mask))
 		return -EINVAL;
@@ -1239,6 +1316,7 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
 extern int yield_to(struct task_struct *p, bool preempt);
 extern void set_user_nice(struct task_struct *p, long nice);
 extern int task_prio(const struct task_struct *p);
+
 /**
  * task_nice - return the nice value of a given task.
  * @p: the task in question.
@@ -1249,16 +1327,15 @@ static inline int task_nice(const struct task_struct *p)
 {
 	return PRIO_TO_NICE((p)->static_prio);
 }
+
 extern int can_nice(const struct task_struct *p, const int nice);
 extern int task_curr(const struct task_struct *p);
 extern int idle_cpu(int cpu);
-extern int sched_setscheduler(struct task_struct *, int,
-			      const struct sched_param *);
-extern int sched_setscheduler_nocheck(struct task_struct *, int,
-				      const struct sched_param *);
-extern int sched_setattr(struct task_struct *,
-			 const struct sched_attr *);
+extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
+extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
+extern int sched_setattr(struct task_struct *, const struct sched_attr *);
 extern struct task_struct *idle_task(int cpu);
+
 /**
  * is_idle_task - is the specified task an idle task?
  * @p: the task in question.
@@ -1269,6 +1346,7 @@ static inline bool is_idle_task(const struct task_struct *p)
 {
 	return !!(p->flags & PF_IDLE);
 }
+
 extern struct task_struct *curr_task(int cpu);
 extern void ia64_set_curr_task(int cpu, struct task_struct *p);
 
@@ -1302,23 +1380,25 @@ static inline struct thread_info *task_thread_info(struct task_struct *task)
  */
 
 extern struct task_struct *find_task_by_vpid(pid_t nr);
-extern struct task_struct *find_task_by_pid_ns(pid_t nr,
-		struct pid_namespace *ns);
+extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns);
 
 extern int wake_up_state(struct task_struct *tsk, unsigned int state);
 extern int wake_up_process(struct task_struct *tsk);
 extern void wake_up_new_task(struct task_struct *tsk);
+
 #ifdef CONFIG_SMP
- extern void kick_process(struct task_struct *tsk);
+extern void kick_process(struct task_struct *tsk);
 #else
- static inline void kick_process(struct task_struct *tsk) { }
+static inline void kick_process(struct task_struct *tsk) { }
 #endif
 
 extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec);
+
 static inline void set_task_comm(struct task_struct *tsk, const char *from)
 {
 	__set_task_comm(tsk, from, false);
 }
+
 extern char *get_task_comm(char *to, struct task_struct *tsk);
 
 #ifdef CONFIG_SMP
@@ -1326,15 +1406,15 @@ void scheduler_ipi(void);
 extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
 #else
 static inline void scheduler_ipi(void) { }
-static inline unsigned long wait_task_inactive(struct task_struct *p,
-					       long match_state)
+static inline unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 {
 	return 1;
 }
 #endif
 
-/* set thread flags in other task's structures
- * - see asm/thread_info.h for TIF_xxxx flags available
+/*
+ * Set thread flags in other task's structures.
+ * See asm/thread_info.h for TIF_xxxx flags available:
  */
 static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
 {
-- 
cgit v1.2.3


From a528d35e8bfcc521d7cb70aaf03e1bd296c8493f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 31 Jan 2017 16:46:22 +0000
Subject: statx: Add a system call to make enhanced file info available

Add a system call to make extended file information available, including
file creation and some attribute flags where available through the
underlying filesystem.

The getattr inode operation is altered to take two additional arguments: a
u32 request_mask and an unsigned int flags that indicate the
synchronisation mode.  This change is propagated to the vfs_getattr*()
function.

Functions like vfs_stat() are now inline wrappers around new functions
vfs_statx() and vfs_statx_fd() to reduce stack usage.

========
OVERVIEW
========

The idea was initially proposed as a set of xattrs that could be retrieved
with getxattr(), but the general preference proved to be for a new syscall
with an extended stat structure.

A number of requests were gathered for features to be included.  The
following have been included:

 (1) Make the fields a consistent size on all arches and make them large.

 (2) Spare space, request flags and information flags are provided for
     future expansion.

 (3) Better support for the y2038 problem [Arnd Bergmann] (tv_sec is an
     __s64).

 (4) Creation time: The SMB protocol carries the creation time, which could
     be exported by Samba, which will in turn help CIFS make use of
     FS-Cache as that can be used for coherency data (stx_btime).

     This is also specified in NFSv4 as a recommended attribute and could
     be exported by NFSD [Steve French].

 (5) Lightweight stat: Ask for just those details of interest, and allow a
     netfs (such as NFS) to approximate anything not of interest, possibly
     without going to the server [Trond Myklebust, Ulrich Drepper, Andreas
     Dilger] (AT_STATX_DONT_SYNC).

 (6) Heavyweight stat: Force a netfs to go to the server, even if it thinks
     its cached attributes are up to date [Trond Myklebust]
     (AT_STATX_FORCE_SYNC).

And the following have been left out for future extension:

 (7) Data version number: Could be used by userspace NFS servers [Aneesh
     Kumar].

     Can also be used to modify fill_post_wcc() in NFSD which retrieves
     i_version directly, but has just called vfs_getattr().  It could get
     it from the kstat struct if it used vfs_xgetattr() instead.

     (There's disagreement on the exact semantics of a single field, since
     not all filesystems do this the same way).

 (8) BSD stat compatibility: Including more fields from the BSD stat such
     as creation time (st_btime) and inode generation number (st_gen)
     [Jeremy Allison, Bernd Schubert].

 (9) Inode generation number: Useful for FUSE and userspace NFS servers
     [Bernd Schubert].

     (This was asked for but later deemed unnecessary with the
     open-by-handle capability available and caused disagreement as to
     whether it's a security hole or not).

(10) Extra coherency data may be useful in making backups [Andreas Dilger].

     (No particular data were offered, but things like last backup
     timestamp, the data version number and the DOS archive bit would come
     into this category).

(11) Allow the filesystem to indicate what it can/cannot provide: A
     filesystem can now say it doesn't support a standard stat feature if
     that isn't available, so if, for instance, inode numbers or UIDs don't
     exist or are fabricated locally...

     (This requires a separate system call - I have an fsinfo() call idea
     for this).

(12) Store a 16-byte volume ID in the superblock that can be returned in
     struct xstat [Steve French].

     (Deferred to fsinfo).

(13) Include granularity fields in the time data to indicate the
     granularity of each of the times (NFSv4 time_delta) [Steve French].

     (Deferred to fsinfo).

(14) FS_IOC_GETFLAGS value.  These could be translated to BSD's st_flags.
     Note that the Linux IOC flags are a mess and filesystems such as Ext4
     define flags that aren't in linux/fs.h, so translation in the kernel
     may be a necessity (or, possibly, we provide the filesystem type too).

     (Some attributes are made available in stx_attributes, but the general
     feeling was that the IOC flags were to ext[234]-specific and shouldn't
     be exposed through statx this way).

(15) Mask of features available on file (eg: ACLs, seclabel) [Brad Boyer,
     Michael Kerrisk].

     (Deferred, probably to fsinfo.  Finding out if there's an ACL or
     seclabal might require extra filesystem operations).

(16) Femtosecond-resolution timestamps [Dave Chinner].

     (A __reserved field has been left in the statx_timestamp struct for
     this - if there proves to be a need).

(17) A set multiple attributes syscall to go with this.

===============
NEW SYSTEM CALL
===============

The new system call is:

	int ret = statx(int dfd,
			const char *filename,
			unsigned int flags,
			unsigned int mask,
			struct statx *buffer);

The dfd, filename and flags parameters indicate the file to query, in a
similar way to fstatat().  There is no equivalent of lstat() as that can be
emulated with statx() by passing AT_SYMLINK_NOFOLLOW in flags.  There is
also no equivalent of fstat() as that can be emulated by passing a NULL
filename to statx() with the fd of interest in dfd.

Whether or not statx() synchronises the attributes with the backing store
can be controlled by OR'ing a value into the flags argument (this typically
only affects network filesystems):

 (1) AT_STATX_SYNC_AS_STAT tells statx() to behave as stat() does in this
     respect.

 (2) AT_STATX_FORCE_SYNC will require a network filesystem to synchronise
     its attributes with the server - which might require data writeback to
     occur to get the timestamps correct.

 (3) AT_STATX_DONT_SYNC will suppress synchronisation with the server in a
     network filesystem.  The resulting values should be considered
     approximate.

mask is a bitmask indicating the fields in struct statx that are of
interest to the caller.  The user should set this to STATX_BASIC_STATS to
get the basic set returned by stat().  It should be noted that asking for
more information may entail extra I/O operations.

buffer points to the destination for the data.  This must be 256 bytes in
size.

======================
MAIN ATTRIBUTES RECORD
======================

The following structures are defined in which to return the main attribute
set:

	struct statx_timestamp {
		__s64	tv_sec;
		__s32	tv_nsec;
		__s32	__reserved;
	};

	struct statx {
		__u32	stx_mask;
		__u32	stx_blksize;
		__u64	stx_attributes;
		__u32	stx_nlink;
		__u32	stx_uid;
		__u32	stx_gid;
		__u16	stx_mode;
		__u16	__spare0[1];
		__u64	stx_ino;
		__u64	stx_size;
		__u64	stx_blocks;
		__u64	__spare1[1];
		struct statx_timestamp	stx_atime;
		struct statx_timestamp	stx_btime;
		struct statx_timestamp	stx_ctime;
		struct statx_timestamp	stx_mtime;
		__u32	stx_rdev_major;
		__u32	stx_rdev_minor;
		__u32	stx_dev_major;
		__u32	stx_dev_minor;
		__u64	__spare2[14];
	};

The defined bits in request_mask and stx_mask are:

	STATX_TYPE		Want/got stx_mode & S_IFMT
	STATX_MODE		Want/got stx_mode & ~S_IFMT
	STATX_NLINK		Want/got stx_nlink
	STATX_UID		Want/got stx_uid
	STATX_GID		Want/got stx_gid
	STATX_ATIME		Want/got stx_atime{,_ns}
	STATX_MTIME		Want/got stx_mtime{,_ns}
	STATX_CTIME		Want/got stx_ctime{,_ns}
	STATX_INO		Want/got stx_ino
	STATX_SIZE		Want/got stx_size
	STATX_BLOCKS		Want/got stx_blocks
	STATX_BASIC_STATS	[The stuff in the normal stat struct]
	STATX_BTIME		Want/got stx_btime{,_ns}
	STATX_ALL		[All currently available stuff]

stx_btime is the file creation time, stx_mask is a bitmask indicating the
data provided and __spares*[] are where as-yet undefined fields can be
placed.

Time fields are structures with separate seconds and nanoseconds fields
plus a reserved field in case we want to add even finer resolution.  Note
that times will be negative if before 1970; in such a case, the nanosecond
fields will also be negative if not zero.

The bits defined in the stx_attributes field convey information about a
file, how it is accessed, where it is and what it does.  The following
attributes map to FS_*_FL flags and are the same numerical value:

	STATX_ATTR_COMPRESSED		File is compressed by the fs
	STATX_ATTR_IMMUTABLE		File is marked immutable
	STATX_ATTR_APPEND		File is append-only
	STATX_ATTR_NODUMP		File is not to be dumped
	STATX_ATTR_ENCRYPTED		File requires key to decrypt in fs

Within the kernel, the supported flags are listed by:

	KSTAT_ATTR_FS_IOC_FLAGS

[Are any other IOC flags of sufficient general interest to be exposed
through this interface?]

New flags include:

	STATX_ATTR_AUTOMOUNT		Object is an automount trigger

These are for the use of GUI tools that might want to mark files specially,
depending on what they are.

Fields in struct statx come in a number of classes:

 (0) stx_dev_*, stx_blksize.

     These are local system information and are always available.

 (1) stx_mode, stx_nlinks, stx_uid, stx_gid, stx_[amc]time, stx_ino,
     stx_size, stx_blocks.

     These will be returned whether the caller asks for them or not.  The
     corresponding bits in stx_mask will be set to indicate whether they
     actually have valid values.

     If the caller didn't ask for them, then they may be approximated.  For
     example, NFS won't waste any time updating them from the server,
     unless as a byproduct of updating something requested.

     If the values don't actually exist for the underlying object (such as
     UID or GID on a DOS file), then the bit won't be set in the stx_mask,
     even if the caller asked for the value.  In such a case, the returned
     value will be a fabrication.

     Note that there are instances where the type might not be valid, for
     instance Windows reparse points.

 (2) stx_rdev_*.

     This will be set only if stx_mode indicates we're looking at a
     blockdev or a chardev, otherwise will be 0.

 (3) stx_btime.

     Similar to (1), except this will be set to 0 if it doesn't exist.

=======
TESTING
=======

The following test program can be used to test the statx system call:

	samples/statx/test-statx.c

Just compile and run, passing it paths to the files you want to examine.
The file is built automatically if CONFIG_SAMPLES is enabled.

Here's some example output.  Firstly, an NFS directory that crosses to
another FSID.  Note that the AUTOMOUNT attribute is set because transiting
this directory will cause d_automount to be invoked by the VFS.

	[root@andromeda ~]# /tmp/test-statx -A /warthog/data
	statx(/warthog/data) = 0
	results=7ff
	  Size: 4096            Blocks: 8          IO Block: 1048576  directory
	Device: 00:26           Inode: 1703937     Links: 125
	Access: (3777/drwxrwxrwx)  Uid:     0   Gid:  4041
	Access: 2016-11-24 09:02:12.219699527+0000
	Modify: 2016-11-17 10:44:36.225653653+0000
	Change: 2016-11-17 10:44:36.225653653+0000
	Attributes: 0000000000001000 (-------- -------- -------- -------- -------- -------- ---m---- --------)

Secondly, the result of automounting on that directory.

	[root@andromeda ~]# /tmp/test-statx /warthog/data
	statx(/warthog/data) = 0
	results=7ff
	  Size: 4096            Blocks: 8          IO Block: 1048576  directory
	Device: 00:27           Inode: 2           Links: 125
	Access: (3777/drwxrwxrwx)  Uid:     0   Gid:  4041
	Access: 2016-11-24 09:02:12.219699527+0000
	Modify: 2016-11-17 10:44:36.225653653+0000
	Change: 2016-11-17 10:44:36.225653653+0000

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/Locking                  |   3 +-
 Documentation/filesystems/vfs.txt                  |   3 +-
 arch/x86/entry/syscalls/syscall_32.tbl             |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl             |   1 +
 drivers/base/devtmpfs.c                            |   3 +-
 drivers/block/loop.c                               |   3 +-
 drivers/mtd/ubi/build.c                            |   2 +-
 drivers/mtd/ubi/kapi.c                             |   2 +-
 drivers/staging/lustre/lustre/llite/file.c         |   9 +-
 .../staging/lustre/lustre/llite/llite_internal.h   |   3 +-
 fs/9p/vfs_inode.c                                  |  10 +-
 fs/9p/vfs_inode_dotl.c                             |   5 +-
 fs/afs/inode.c                                     |   8 +-
 fs/afs/internal.h                                  |   2 +-
 fs/bad_inode.c                                     |   4 +-
 fs/btrfs/inode.c                                   |   6 +-
 fs/ceph/inode.c                                    |   6 +-
 fs/ceph/super.h                                    |   4 +-
 fs/cifs/cifsfs.h                                   |   2 +-
 fs/cifs/inode.c                                    |   5 +-
 fs/coda/coda_linux.h                               |   2 +-
 fs/coda/inode.c                                    |   7 +-
 fs/ecryptfs/inode.c                                |  13 +-
 fs/exportfs/expfs.c                                |   3 +-
 fs/ext4/ext4.h                                     |   3 +-
 fs/ext4/inode.c                                    |   6 +-
 fs/f2fs/f2fs.h                                     |   4 +-
 fs/f2fs/file.c                                     |   6 +-
 fs/fat/fat.h                                       |   4 +-
 fs/fat/file.c                                      |   5 +-
 fs/fuse/dir.c                                      |   6 +-
 fs/gfs2/inode.c                                    |  11 +-
 fs/kernfs/inode.c                                  |   8 +-
 fs/kernfs/kernfs-internal.h                        |   4 +-
 fs/libfs.c                                         |  12 +-
 fs/minix/inode.c                                   |  11 +-
 fs/minix/minix.h                                   |   2 +-
 fs/nfs/inode.c                                     |  13 +-
 fs/nfs/namespace.c                                 |   9 +-
 fs/nfsd/nfs4xdr.c                                  |   4 +-
 fs/nfsd/vfs.h                                      |   3 +-
 fs/ocfs2/file.c                                    |  11 +-
 fs/ocfs2/file.h                                    |   4 +-
 fs/orangefs/inode.c                                |  13 +-
 fs/orangefs/orangefs-kernel.h                      |   5 +-
 fs/overlayfs/copy_up.c                             |   6 +-
 fs/overlayfs/dir.c                                 |  10 +-
 fs/overlayfs/inode.c                               |   7 +-
 fs/proc/base.c                                     |  12 +-
 fs/proc/generic.c                                  |   6 +-
 fs/proc/internal.h                                 |   2 +-
 fs/proc/proc_net.c                                 |   6 +-
 fs/proc/proc_sysctl.c                              |   5 +-
 fs/proc/root.c                                     |   6 +-
 fs/stat.c                                          | 214 ++++++++++++++---
 fs/sysv/itree.c                                    |   7 +-
 fs/sysv/sysv.h                                     |   2 +-
 fs/ubifs/dir.c                                     |   6 +-
 fs/ubifs/ubifs.h                                   |   4 +-
 fs/udf/symlink.c                                   |   5 +-
 fs/xfs/xfs_iops.c                                  |   9 +-
 include/linux/fs.h                                 |  35 ++-
 include/linux/nfs_fs.h                             |   2 +-
 include/linux/stat.h                               |  24 +-
 include/linux/syscalls.h                           |   3 +
 include/uapi/linux/fcntl.h                         |   5 +
 include/uapi/linux/stat.h                          | 131 +++++++++++
 mm/shmem.c                                         |   6 +-
 samples/Kconfig                                    |   6 +
 samples/Makefile                                   |   2 +-
 samples/statx/Makefile                             |  10 +
 samples/statx/test-statx.c                         | 254 +++++++++++++++++++++
 72 files changed, 822 insertions(+), 214 deletions(-)
 create mode 100644 samples/statx/Makefile
 create mode 100644 samples/statx/test-statx.c

(limited to 'include/linux')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index ace63cd7af8c..fdcfdd79682a 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -58,7 +58,8 @@ prototypes:
 	int (*permission) (struct inode *, int, unsigned int);
 	int (*get_acl)(struct inode *, int);
 	int (*setattr) (struct dentry *, struct iattr *);
-	int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *);
+	int (*getattr) (const struct path *, struct dentry *, struct kstat *,
+			u32, unsigned int);
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
 	int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len);
 	void (*update_time)(struct inode *, struct timespec *, int);
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index b968084eeac1..569211703721 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -382,7 +382,8 @@ struct inode_operations {
 	int (*permission) (struct inode *, int);
 	int (*get_acl)(struct inode *, int);
 	int (*setattr) (struct dentry *, struct iattr *);
-	int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
+	int (*getattr) (const struct path *, struct dentry *, struct kstat *,
+			u32, unsigned int);
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
 	void (*update_time)(struct inode *, struct timespec *, int);
 	int (*atomic_open)(struct inode *, struct dentry *, struct file *,
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 2b3618542544..9ba050fe47f3 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -389,3 +389,4 @@
 380	i386	pkey_mprotect		sys_pkey_mprotect
 381	i386	pkey_alloc		sys_pkey_alloc
 382	i386	pkey_free		sys_pkey_free
+383	i386	statx			sys_statx
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index e93ef0b38db8..5aef183e2f85 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -338,6 +338,7 @@
 329	common	pkey_mprotect		sys_pkey_mprotect
 330	common	pkey_alloc		sys_pkey_alloc
 331	common	pkey_free		sys_pkey_free
+332	common	statx			sys_statx
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index 44a74cf1372c..d2fb9c8ed205 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -309,7 +309,8 @@ static int handle_remove(const char *nodename, struct device *dev)
 	if (d_really_is_positive(dentry)) {
 		struct kstat stat;
 		struct path p = {.mnt = parent.mnt, .dentry = dentry};
-		err = vfs_getattr(&p, &stat);
+		err = vfs_getattr(&p, &stat, STATX_TYPE | STATX_MODE,
+				  AT_STATX_SYNC_AS_STAT);
 		if (!err && dev_mynode(dev, d_inode(dentry), &stat)) {
 			struct iattr newattrs;
 			/*
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index eeb1db73f44e..8f4051999741 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1175,7 +1175,8 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info)
 
 	if (lo->lo_state != Lo_bound)
 		return -ENXIO;
-	error = vfs_getattr(&file->f_path, &stat);
+	error = vfs_getattr(&file->f_path, &stat,
+			    STATX_INO, AT_STATX_SYNC_AS_STAT);
 	if (error)
 		return error;
 	memset(info, 0, sizeof(*info));
diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c
index 85d54f37e28f..77513195f50e 100644
--- a/drivers/mtd/ubi/build.c
+++ b/drivers/mtd/ubi/build.c
@@ -1159,7 +1159,7 @@ static struct mtd_info * __init open_mtd_by_chdev(const char *mtd_dev)
 	if (err)
 		return ERR_PTR(err);
 
-	err = vfs_getattr(&path, &stat);
+	err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
 	path_put(&path);
 	if (err)
 		return ERR_PTR(err);
diff --git a/drivers/mtd/ubi/kapi.c b/drivers/mtd/ubi/kapi.c
index 88b1897aeb40..d4b2e8744498 100644
--- a/drivers/mtd/ubi/kapi.c
+++ b/drivers/mtd/ubi/kapi.c
@@ -314,7 +314,7 @@ struct ubi_volume_desc *ubi_open_volume_path(const char *pathname, int mode)
 	if (error)
 		return ERR_PTR(error);
 
-	error = vfs_getattr(&path, &stat);
+	error = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
 	path_put(&path);
 	if (error)
 		return ERR_PTR(error);
diff --git a/drivers/staging/lustre/lustre/llite/file.c b/drivers/staging/lustre/lustre/llite/file.c
index 10adfcdd7035..481c0d01d4c6 100644
--- a/drivers/staging/lustre/lustre/llite/file.c
+++ b/drivers/staging/lustre/lustre/llite/file.c
@@ -2952,15 +2952,16 @@ static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
 	return rc;
 }
 
-int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
+int ll_getattr(const struct path *path, struct kstat *stat,
+	       u32 request_mask, unsigned int flags)
 {
-	struct inode *inode = d_inode(de);
+	struct inode *inode = d_inode(path->dentry);
 	struct ll_sb_info *sbi = ll_i2sbi(inode);
 	struct ll_inode_info *lli = ll_i2info(inode);
 	int res;
 
-	res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
-				      MDS_INODELOCK_LOOKUP);
+	res = ll_inode_revalidate(path->dentry,
+				  MDS_INODELOCK_UPDATE | MDS_INODELOCK_LOOKUP);
 	ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
 
 	if (res)
diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h
index ecdfd0c29b7f..55f68acd85d1 100644
--- a/drivers/staging/lustre/lustre/llite/llite_internal.h
+++ b/drivers/staging/lustre/lustre/llite/llite_internal.h
@@ -750,7 +750,8 @@ int ll_file_open(struct inode *inode, struct file *file);
 int ll_file_release(struct inode *inode, struct file *file);
 int ll_release_openhandle(struct inode *, struct lookup_intent *);
 int ll_md_real_close(struct inode *inode, fmode_t fmode);
-int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat);
+int ll_getattr(const struct path *path, struct kstat *stat,
+	       u32 request_mask, unsigned int flags);
 struct posix_acl *ll_get_acl(struct inode *inode, int type);
 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
 	       const char *name, int namelen);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index f4f4450119e4..f1d96233670c 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1047,16 +1047,18 @@ done:
 
 /**
  * v9fs_vfs_getattr - retrieve file metadata
- * @mnt: mount information
- * @dentry: file to get attributes on
+ * @path: Object to query
  * @stat: metadata structure to populate
+ * @request_mask: Mask of STATX_xxx flags indicating the caller's interests
+ * @flags: AT_STATX_xxx setting
  *
  */
 
 static int
-v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		 struct kstat *stat)
+v9fs_vfs_getattr(const struct path *path, struct kstat *stat,
+		 u32 request_mask, unsigned int flags)
 {
+	struct dentry *dentry = path->dentry;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid;
 	struct p9_wstat *st;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 5999bd050678..570e63ee5b71 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -468,9 +468,10 @@ error:
 }
 
 static int
-v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
-		 struct kstat *stat)
+v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat,
+		 u32 request_mask, unsigned int flags)
 {
+	struct dentry *dentry = path->dentry;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid;
 	struct p9_stat_dotl *st;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 86cc7264c21c..1e4897a048d2 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -375,12 +375,10 @@ error_unlock:
 /*
  * read the attributes of an inode
  */
-int afs_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		      struct kstat *stat)
+int afs_getattr(const struct path *path, struct kstat *stat,
+		u32 request_mask, unsigned int query_flags)
 {
-	struct inode *inode;
-
-	inode = d_inode(dentry);
+	struct inode *inode = d_inode(path->dentry);
 
 	_enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation);
 
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 8acf3670e756..5dfa56903a2d 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -533,7 +533,7 @@ extern struct inode *afs_iget(struct super_block *, struct key *,
 			      struct afs_callback *);
 extern void afs_zap_data(struct afs_vnode *);
 extern int afs_validate(struct afs_vnode *, struct key *);
-extern int afs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+extern int afs_getattr(const struct path *, struct kstat *, u32, unsigned int);
 extern int afs_setattr(struct dentry *, struct iattr *);
 extern void afs_evict_inode(struct inode *);
 extern int afs_drop_inode(struct inode *);
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 5f685c819298..bb53728c7a31 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -89,8 +89,8 @@ static int bad_inode_permission(struct inode *inode, int mask)
 	return -EIO;
 }
 
-static int bad_inode_getattr(struct vfsmount *mnt, struct dentry *dentry,
-			struct kstat *stat)
+static int bad_inode_getattr(const struct path *path, struct kstat *stat,
+			     u32 request_mask, unsigned int query_flags)
 {
 	return -EIO;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ee6978d80491..c40060cc481f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9413,11 +9413,11 @@ fail:
 	return -ENOMEM;
 }
 
-static int btrfs_getattr(struct vfsmount *mnt,
-			 struct dentry *dentry, struct kstat *stat)
+static int btrfs_getattr(const struct path *path, struct kstat *stat,
+			 u32 request_mask, unsigned int flags)
 {
 	u64 delalloc_bytes;
-	struct inode *inode = d_inode(dentry);
+	struct inode *inode = d_inode(path->dentry);
 	u32 blocksize = inode->i_sb->s_blocksize;
 
 	generic_fillattr(inode, stat);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index fd8f771f99b7..d449e1c03cbd 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2187,10 +2187,10 @@ int ceph_permission(struct inode *inode, int mask)
  * Get all attributes.  Hopefully somedata we'll have a statlite()
  * and can limit the fields we require to be accurate.
  */
-int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		 struct kstat *stat)
+int ceph_getattr(const struct path *path, struct kstat *stat,
+		 u32 request_mask, unsigned int flags)
 {
-	struct inode *inode = d_inode(dentry);
+	struct inode *inode = d_inode(path->dentry);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int err;
 
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index e9410bcf4113..fe6b9cfc4013 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -784,8 +784,8 @@ static inline int ceph_do_getattr(struct inode *inode, int mask, bool force)
 extern int ceph_permission(struct inode *inode, int mask);
 extern int __ceph_setattr(struct inode *inode, struct iattr *attr);
 extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
-extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
-			struct kstat *stat);
+extern int ceph_getattr(const struct path *path, struct kstat *stat,
+			u32 request_mask, unsigned int flags);
 
 /* xattr.c */
 int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index c9c00a862036..da717fee3026 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -83,7 +83,7 @@ extern int cifs_revalidate_dentry(struct dentry *);
 extern int cifs_invalidate_mapping(struct inode *inode);
 extern int cifs_revalidate_mapping(struct inode *inode);
 extern int cifs_zap_mapping(struct inode *inode);
-extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+extern int cifs_getattr(const struct path *, struct kstat *, u32, unsigned int);
 extern int cifs_setattr(struct dentry *, struct iattr *);
 
 extern const struct inode_operations cifs_file_inode_ops;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 7ab5be7944aa..1363fff460b9 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1990,9 +1990,10 @@ int cifs_revalidate_dentry(struct dentry *dentry)
 	return cifs_revalidate_mapping(inode);
 }
 
-int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		 struct kstat *stat)
+int cifs_getattr(const struct path *path, struct kstat *stat,
+		 u32 request_mask, unsigned int flags)
 {
+	struct dentry *dentry = path->dentry;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
 	struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
 	struct inode *inode = d_inode(dentry);
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
index 5104d84c4f64..d3c361883c28 100644
--- a/fs/coda/coda_linux.h
+++ b/fs/coda/coda_linux.h
@@ -47,7 +47,7 @@ int coda_open(struct inode *i, struct file *f);
 int coda_release(struct inode *i, struct file *f);
 int coda_permission(struct inode *inode, int mask);
 int coda_revalidate_inode(struct inode *);
-int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+int coda_getattr(const struct path *, struct kstat *, u32, unsigned int);
 int coda_setattr(struct dentry *, struct iattr *);
 
 /* this file:  heloers */
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 71dbe7e287ce..2dea594da199 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -255,11 +255,12 @@ static void coda_evict_inode(struct inode *inode)
 	coda_cache_clear_inode(inode);
 }
 
-int coda_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+int coda_getattr(const struct path *path, struct kstat *stat,
+		 u32 request_mask, unsigned int flags)
 {
-	int err = coda_revalidate_inode(d_inode(dentry));
+	int err = coda_revalidate_inode(d_inode(path->dentry));
 	if (!err)
-		generic_fillattr(d_inode(dentry), stat);
+		generic_fillattr(d_inode(path->dentry), stat);
 	return err;
 }
 
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index e7413f82d27b..efc2db42d175 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -959,9 +959,10 @@ out:
 	return rc;
 }
 
-static int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry,
-				 struct kstat *stat)
+static int ecryptfs_getattr_link(const struct path *path, struct kstat *stat,
+				 u32 request_mask, unsigned int flags)
 {
+	struct dentry *dentry = path->dentry;
 	struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
 	int rc = 0;
 
@@ -983,13 +984,15 @@ static int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry,
 	return rc;
 }
 
-static int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
-			    struct kstat *stat)
+static int ecryptfs_getattr(const struct path *path, struct kstat *stat,
+			    u32 request_mask, unsigned int flags)
 {
+	struct dentry *dentry = path->dentry;
 	struct kstat lower_stat;
 	int rc;
 
-	rc = vfs_getattr(ecryptfs_dentry_to_lower_path(dentry), &lower_stat);
+	rc = vfs_getattr(ecryptfs_dentry_to_lower_path(dentry), &lower_stat,
+			 request_mask, flags);
 	if (!rc) {
 		fsstack_copy_attr_all(d_inode(dentry),
 				      ecryptfs_inode_to_lower(d_inode(dentry)));
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index a4b531be9168..f2d24bb8d745 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -299,7 +299,8 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
 	 * filesystem supports 64-bit inode numbers.  So we need to
 	 * actually call ->getattr, not just read i_ino:
 	 */
-	error = vfs_getattr_nosec(&child_path, &stat);
+	error = vfs_getattr_nosec(&child_path, &stat,
+				  STATX_INO, AT_STATX_SYNC_AS_STAT);
 	if (error)
 		return error;
 	buffer.ino = stat.ino;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2fd17e8e4984..025d2e85f454 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2462,8 +2462,7 @@ extern struct inode *ext4_iget(struct super_block *, unsigned long);
 extern struct inode *ext4_iget_normal(struct super_block *, unsigned long);
 extern int  ext4_write_inode(struct inode *, struct writeback_control *);
 extern int  ext4_setattr(struct dentry *, struct iattr *);
-extern int  ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
-				struct kstat *stat);
+extern int  ext4_getattr(const struct path *, struct kstat *, u32, unsigned int);
 extern void ext4_evict_inode(struct inode *);
 extern void ext4_clear_inode(struct inode *);
 extern int  ext4_sync_inode(handle_t *, struct inode *);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 971f66342080..7385e6a6b6cb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5387,13 +5387,13 @@ err_out:
 	return error;
 }
 
-int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		 struct kstat *stat)
+int ext4_getattr(const struct path *path, struct kstat *stat,
+		 u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode;
 	unsigned long long delalloc_blocks;
 
-	inode = d_inode(dentry);
+	inode = d_inode(path->dentry);
 	generic_fillattr(inode, stat);
 
 	/*
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index d1483136fed6..e849f83d6114 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -2040,8 +2040,8 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
 void truncate_data_blocks(struct dnode_of_data *dn);
 int truncate_blocks(struct inode *inode, u64 from, bool lock);
 int f2fs_truncate(struct inode *inode);
-int f2fs_getattr(struct vfsmount *mnt, struct dentry *dentry,
-			struct kstat *stat);
+int f2fs_getattr(const struct path *path, struct kstat *stat,
+			u32 request_mask, unsigned int flags);
 int f2fs_setattr(struct dentry *dentry, struct iattr *attr);
 int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end);
 int truncate_data_blocks_range(struct dnode_of_data *dn, int count);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 78e65288f2b2..5f7317875a67 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -633,10 +633,10 @@ int f2fs_truncate(struct inode *inode)
 	return 0;
 }
 
-int f2fs_getattr(struct vfsmount *mnt,
-			 struct dentry *dentry, struct kstat *stat)
+int f2fs_getattr(const struct path *path, struct kstat *stat,
+		 u32 request_mask, unsigned int flags)
 {
-	struct inode *inode = d_inode(dentry);
+	struct inode *inode = d_inode(path->dentry);
 	generic_fillattr(inode, stat);
 	stat->blocks <<= 3;
 	return 0;
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index e6b764a17a9c..051dac1ce3be 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -364,8 +364,8 @@ extern const struct file_operations fat_file_operations;
 extern const struct inode_operations fat_file_inode_operations;
 extern int fat_setattr(struct dentry *dentry, struct iattr *attr);
 extern void fat_truncate_blocks(struct inode *inode, loff_t offset);
-extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		       struct kstat *stat);
+extern int fat_getattr(const struct path *path, struct kstat *stat,
+		       u32 request_mask, unsigned int flags);
 extern int fat_file_fsync(struct file *file, loff_t start, loff_t end,
 			  int datasync);
 
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 3d04b124bce0..4724cc9ad650 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -365,9 +365,10 @@ void fat_truncate_blocks(struct inode *inode, loff_t offset)
 	fat_flush_inodes(inode->i_sb, inode, NULL);
 }
 
-int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+int fat_getattr(const struct path *path, struct kstat *stat,
+		u32 request_mask, unsigned int flags)
 {
-	struct inode *inode = d_inode(dentry);
+	struct inode *inode = d_inode(path->dentry);
 	generic_fillattr(inode, stat);
 	stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size;
 
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 811fd8929a18..beb3d64f16e2 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1777,10 +1777,10 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr)
 	return ret;
 }
 
-static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
-			struct kstat *stat)
+static int fuse_getattr(const struct path *path, struct kstat *stat,
+			u32 request_mask, unsigned int flags)
 {
-	struct inode *inode = d_inode(entry);
+	struct inode *inode = d_inode(path->dentry);
 	struct fuse_conn *fc = get_fuse_conn(inode);
 
 	if (!fuse_allow_current_process(fc))
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index eb7724b8578a..288c15f385bd 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1959,9 +1959,10 @@ out:
 
 /**
  * gfs2_getattr - Read out an inode's attributes
- * @mnt: The vfsmount the inode is being accessed from
- * @dentry: The dentry to stat
+ * @path: Object to query
  * @stat: The inode's stats
+ * @request_mask: Mask of STATX_xxx flags indicating the caller's interests
+ * @flags: AT_STATX_xxx setting
  *
  * This may be called from the VFS directly, or from within GFS2 with the
  * inode locked, so we look to see if the glock is already locked and only
@@ -1972,10 +1973,10 @@ out:
  * Returns: errno
  */
 
-static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
-			struct kstat *stat)
+static int gfs2_getattr(const struct path *path, struct kstat *stat,
+			u32 request_mask, unsigned int flags)
 {
-	struct inode *inode = d_inode(dentry);
+	struct inode *inode = d_inode(path->dentry);
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_holder gh;
 	int error;
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index ac9e108ce1ea..fb4b4a79a0d6 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -200,11 +200,11 @@ static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode)
 		set_nlink(inode, kn->dir.subdirs + 2);
 }
 
-int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		   struct kstat *stat)
+int kernfs_iop_getattr(const struct path *path, struct kstat *stat,
+		       u32 request_mask, unsigned int query_flags)
 {
-	struct kernfs_node *kn = dentry->d_fsdata;
-	struct inode *inode = d_inode(dentry);
+	struct kernfs_node *kn = path->dentry->d_fsdata;
+	struct inode *inode = d_inode(path->dentry);
 
 	mutex_lock(&kernfs_mutex);
 	kernfs_refresh_inode(kn, inode);
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 3100987cf8ba..2d5144ab4251 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -80,8 +80,8 @@ extern const struct xattr_handler *kernfs_xattr_handlers[];
 void kernfs_evict_inode(struct inode *inode);
 int kernfs_iop_permission(struct inode *inode, int mask);
 int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr);
-int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		       struct kstat *stat);
+int kernfs_iop_getattr(const struct path *path, struct kstat *stat,
+		       u32 request_mask, unsigned int query_flags);
 ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
 
 /*
diff --git a/fs/libfs.c b/fs/libfs.c
index 28d6f35feed6..1dfaf8f606c0 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -20,10 +20,10 @@
 
 #include "internal.h"
 
-int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		   struct kstat *stat)
+int simple_getattr(const struct path *path, struct kstat *stat,
+		   u32 request_mask, unsigned int query_flags)
 {
-	struct inode *inode = d_inode(dentry);
+	struct inode *inode = d_inode(path->dentry);
 	generic_fillattr(inode, stat);
 	stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9);
 	return 0;
@@ -1143,10 +1143,10 @@ static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry,
 	return ERR_PTR(-ENOENT);
 }
 
-static int empty_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
-				 struct kstat *stat)
+static int empty_dir_getattr(const struct path *path, struct kstat *stat,
+			     u32 request_mask, unsigned int query_flags)
 {
-	struct inode *inode = d_inode(dentry);
+	struct inode *inode = d_inode(path->dentry);
 	generic_fillattr(inode, stat);
 	return 0;
 }
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index e7d9bf86d975..6ac76b0434e9 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -622,11 +622,14 @@ static int minix_write_inode(struct inode *inode, struct writeback_control *wbc)
 	return err;
 }
 
-int minix_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+int minix_getattr(const struct path *path, struct kstat *stat,
+		  u32 request_mask, unsigned int flags)
 {
-	struct super_block *sb = dentry->d_sb;
-	generic_fillattr(d_inode(dentry), stat);
-	if (INODE_VERSION(d_inode(dentry)) == MINIX_V1)
+	struct super_block *sb = path->dentry->d_sb;
+	struct inode *inode = d_inode(path->dentry);
+
+	generic_fillattr(inode, stat);
+	if (INODE_VERSION(inode) == MINIX_V1)
 		stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size, sb);
 	else
 		stat->blocks = (sb->s_blocksize / 512) * V2_minix_blocks(stat->size, sb);
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 01ad81dcacc5..663d66138d06 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -51,7 +51,7 @@ extern unsigned long minix_count_free_inodes(struct super_block *sb);
 extern int minix_new_block(struct inode * inode);
 extern void minix_free_block(struct inode *inode, unsigned long block);
 extern unsigned long minix_count_free_blocks(struct super_block *sb);
-extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+extern int minix_getattr(const struct path *, struct kstat *, u32, unsigned int);
 extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len);
 
 extern void V1_minix_truncate(struct inode *);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 5ca4d96b1942..b5425315adcc 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -703,9 +703,10 @@ static bool nfs_need_revalidate_inode(struct inode *inode)
 	return false;
 }
 
-int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+int nfs_getattr(const struct path *path, struct kstat *stat,
+		u32 request_mask, unsigned int query_flags)
 {
-	struct inode *inode = d_inode(dentry);
+	struct inode *inode = d_inode(path->dentry);
 	int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
 	int err = 0;
 
@@ -726,17 +727,17 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 	 *  - NFS never sets MS_NOATIME or MS_NODIRATIME so there is
 	 *    no point in checking those.
 	 */
- 	if ((mnt->mnt_flags & MNT_NOATIME) ||
- 	    ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
+	if ((path->mnt->mnt_flags & MNT_NOATIME) ||
+	    ((path->mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
 		need_atime = 0;
 
 	if (need_atime || nfs_need_revalidate_inode(inode)) {
 		struct nfs_server *server = NFS_SERVER(inode);
 
-		nfs_readdirplus_parent_cache_miss(dentry);
+		nfs_readdirplus_parent_cache_miss(path->dentry);
 		err = __nfs_revalidate_inode(server, inode);
 	} else
-		nfs_readdirplus_parent_cache_hit(dentry);
+		nfs_readdirplus_parent_cache_hit(path->dentry);
 	if (!err) {
 		generic_fillattr(inode, stat);
 		stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index e49d831c4e85..786f17580582 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -178,11 +178,12 @@ out_nofree:
 }
 
 static int
-nfs_namespace_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+nfs_namespace_getattr(const struct path *path, struct kstat *stat,
+			u32 request_mask, unsigned int query_flags)
 {
-	if (NFS_FH(d_inode(dentry))->size != 0)
-		return nfs_getattr(mnt, dentry, stat);
-	generic_fillattr(d_inode(dentry), stat);
+	if (NFS_FH(d_inode(path->dentry))->size != 0)
+		return nfs_getattr(path, stat, request_mask, query_flags);
+	generic_fillattr(d_inode(path->dentry), stat);
 	return 0;
 }
 
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 382c1fd05b4c..33017d652b1d 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2301,7 +2301,7 @@ static int get_parent_attributes(struct svc_export *exp, struct kstat *stat)
 		if (path.dentry != path.mnt->mnt_root)
 			break;
 	}
-	err = vfs_getattr(&path, stat);
+	err = vfs_getattr(&path, stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
 	path_put(&path);
 	return err;
 }
@@ -2385,7 +2385,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
 			goto out;
 	}
 
-	err = vfs_getattr(&path, &stat);
+	err = vfs_getattr(&path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
 	if (err)
 		goto out_nfserr;
 	if ((bmval0 & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE |
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index db98c48c735a..1bbdccecbf3d 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -135,7 +135,8 @@ static inline __be32 fh_getattr(struct svc_fh *fh, struct kstat *stat)
 {
 	struct path p = {.mnt = fh->fh_export->ex_path.mnt,
 			 .dentry = fh->fh_dentry};
-	return nfserrno(vfs_getattr(&p, stat));
+	return nfserrno(vfs_getattr(&p, stat, STATX_BASIC_STATS,
+				    AT_STATX_SYNC_AS_STAT));
 }
 
 static inline int nfsd_create_is_exclusive(int createmode)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 8836305eb378..bfeb647459d9 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1306,16 +1306,15 @@ bail:
 	return status;
 }
 
-int ocfs2_getattr(struct vfsmount *mnt,
-		  struct dentry *dentry,
-		  struct kstat *stat)
+int ocfs2_getattr(const struct path *path, struct kstat *stat,
+		  u32 request_mask, unsigned int flags)
 {
-	struct inode *inode = d_inode(dentry);
-	struct super_block *sb = dentry->d_sb;
+	struct inode *inode = d_inode(path->dentry);
+	struct super_block *sb = path->dentry->d_sb;
 	struct ocfs2_super *osb = sb->s_fs_info;
 	int err;
 
-	err = ocfs2_inode_revalidate(dentry);
+	err = ocfs2_inode_revalidate(path->dentry);
 	if (err) {
 		if (err != -ENOENT)
 			mlog_errno(err);
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 897fd9a2e51d..1fdc9839cd93 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -68,8 +68,8 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
 int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 		u32 clusters_to_add, int mark_unwritten);
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
-int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		  struct kstat *stat);
+int ocfs2_getattr(const struct path *path, struct kstat *stat,
+		  u32 request_mask, unsigned int flags);
 int ocfs2_permission(struct inode *inode, int mask);
 
 int ocfs2_should_update_atime(struct inode *inode,
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 5cd617980fbf..a304bf34b212 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -245,25 +245,24 @@ out:
 /*
  * Obtain attributes of an object given a dentry
  */
-int orangefs_getattr(struct vfsmount *mnt,
-		  struct dentry *dentry,
-		  struct kstat *kstat)
+int orangefs_getattr(const struct path *path, struct kstat *stat,
+		     u32 request_mask, unsigned int flags)
 {
 	int ret = -ENOENT;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = path->dentry->d_inode;
 	struct orangefs_inode_s *orangefs_inode = NULL;
 
 	gossip_debug(GOSSIP_INODE_DEBUG,
 		     "orangefs_getattr: called on %pd\n",
-		     dentry);
+		     path->dentry);
 
 	ret = orangefs_inode_getattr(inode, 0, 0);
 	if (ret == 0) {
-		generic_fillattr(inode, kstat);
+		generic_fillattr(inode, stat);
 
 		/* override block size reported to stat */
 		orangefs_inode = ORANGEFS_I(inode);
-		kstat->blksize = orangefs_inode->blksize;
+		stat->blksize = orangefs_inode->blksize;
 	}
 	return ret;
 }
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 70355a9a2596..0c4f03c22ce0 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -439,9 +439,8 @@ struct inode *orangefs_new_inode(struct super_block *sb,
 
 int orangefs_setattr(struct dentry *dentry, struct iattr *iattr);
 
-int orangefs_getattr(struct vfsmount *mnt,
-		  struct dentry *dentry,
-		  struct kstat *kstat);
+int orangefs_getattr(const struct path *path, struct kstat *stat,
+		     u32 request_mask, unsigned int flags);
 
 int orangefs_permission(struct inode *inode, int mask);
 
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index f57043dace62..a6f9ca621e0b 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -346,7 +346,8 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
 	ovl_path_upper(parent, &parentpath);
 	upperdir = parentpath.dentry;
 
-	err = vfs_getattr(&parentpath, &pstat);
+	err = vfs_getattr(&parentpath, &pstat,
+			  STATX_ATIME | STATX_MTIME, AT_STATX_SYNC_AS_STAT);
 	if (err)
 		return err;
 
@@ -409,7 +410,8 @@ int ovl_copy_up_flags(struct dentry *dentry, int flags)
 		}
 
 		ovl_path_lower(next, &lowerpath);
-		err = vfs_getattr(&lowerpath, &stat);
+		err = vfs_getattr(&lowerpath, &stat,
+				  STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
 		/* maybe truncate regular file. this has no effect on dirs */
 		if (flags & O_TRUNC)
 			stat.size = 0;
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 16e06dd89457..6515796460df 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -138,9 +138,10 @@ static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry)
 	return err;
 }
 
-static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
-			 struct kstat *stat)
+static int ovl_dir_getattr(const struct path *path, struct kstat *stat,
+			   u32 request_mask, unsigned int flags)
 {
+	struct dentry *dentry = path->dentry;
 	int err;
 	enum ovl_path_type type;
 	struct path realpath;
@@ -148,7 +149,7 @@ static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
 
 	type = ovl_path_real(dentry, &realpath);
 	old_cred = ovl_override_creds(dentry->d_sb);
-	err = vfs_getattr(&realpath, stat);
+	err = vfs_getattr(&realpath, stat, request_mask, flags);
 	revert_creds(old_cred);
 	if (err)
 		return err;
@@ -264,7 +265,8 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
 		goto out;
 
 	ovl_path_upper(dentry, &upperpath);
-	err = vfs_getattr(&upperpath, &stat);
+	err = vfs_getattr(&upperpath, &stat,
+			  STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
 	if (err)
 		goto out_unlock;
 
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 08643ac44a02..d4bb54f7b6b4 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -56,16 +56,17 @@ out:
 	return err;
 }
 
-static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
-			 struct kstat *stat)
+static int ovl_getattr(const struct path *path, struct kstat *stat,
+		       u32 request_mask, unsigned int flags)
 {
+	struct dentry *dentry = path->dentry;
 	struct path realpath;
 	const struct cred *old_cred;
 	int err;
 
 	ovl_path_real(dentry, &realpath);
 	old_cred = ovl_override_creds(dentry->d_sb);
-	err = vfs_getattr(&realpath, stat);
+	err = vfs_getattr(&realpath, stat, request_mask, flags);
 	revert_creds(old_cred);
 	return err;
 }
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 1e1e182d571b..3b5e6aa2a326 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1724,11 +1724,12 @@ out_unlock:
 	return NULL;
 }
 
-int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+int pid_getattr(const struct path *path, struct kstat *stat,
+		u32 request_mask, unsigned int query_flags)
 {
-	struct inode *inode = d_inode(dentry);
+	struct inode *inode = d_inode(path->dentry);
 	struct task_struct *task;
-	struct pid_namespace *pid = dentry->d_sb->s_fs_info;
+	struct pid_namespace *pid = path->dentry->d_sb->s_fs_info;
 
 	generic_fillattr(inode, stat);
 
@@ -3511,9 +3512,10 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
 	return 0;
 }
 
-static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+static int proc_task_getattr(const struct path *path, struct kstat *stat,
+			     u32 request_mask, unsigned int query_flags)
 {
-	struct inode *inode = d_inode(dentry);
+	struct inode *inode = d_inode(path->dentry);
 	struct task_struct *p = get_proc_task(inode);
 	generic_fillattr(inode, stat);
 
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 06c73904d497..ee27feb34cf4 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -118,10 +118,10 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
 	return 0;
 }
 
-static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry,
-			struct kstat *stat)
+static int proc_getattr(const struct path *path, struct kstat *stat,
+			u32 request_mask, unsigned int query_flags)
 {
-	struct inode *inode = d_inode(dentry);
+	struct inode *inode = d_inode(path->dentry);
 	struct proc_dir_entry *de = PDE(inode);
 	if (de && de->nlink)
 		set_nlink(inode, de->nlink);
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 5d6960f5f1c0..e93cdc6ddb31 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -149,7 +149,7 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
  * base.c
  */
 extern const struct dentry_operations pid_dentry_operations;
-extern int pid_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int);
 extern int proc_setattr(struct dentry *, struct iattr *);
 extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
 extern int pid_revalidate(struct dentry *, unsigned int);
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index ffd72a6c6e04..9db1df2537fc 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -140,10 +140,10 @@ static struct dentry *proc_tgid_net_lookup(struct inode *dir,
 	return de;
 }
 
-static int proc_tgid_net_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		struct kstat *stat)
+static int proc_tgid_net_getattr(const struct path *path, struct kstat *stat,
+				 u32 request_mask, unsigned int query_flags)
 {
-	struct inode *inode = d_inode(dentry);
+	struct inode *inode = d_inode(path->dentry);
 	struct net *net;
 
 	net = get_proc_task_net(inode);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 3e64c6502dc8..3d8726445ad1 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -801,9 +801,10 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
 	return 0;
 }
 
-static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+static int proc_sys_getattr(const struct path *path, struct kstat *stat,
+			    u32 request_mask, unsigned int query_flags)
 {
-	struct inode *inode = d_inode(dentry);
+	struct inode *inode = d_inode(path->dentry);
 	struct ctl_table_header *head = grab_header(inode);
 	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
 
diff --git a/fs/proc/root.c b/fs/proc/root.c
index b90da888b81a..fb1955c82274 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -149,10 +149,10 @@ void __init proc_root_init(void)
 	proc_sys_init();
 }
 
-static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat
-)
+static int proc_root_getattr(const struct path *path, struct kstat *stat,
+			     u32 request_mask, unsigned int query_flags)
 {
-	generic_fillattr(d_inode(dentry), stat);
+	generic_fillattr(d_inode(path->dentry), stat);
 	stat->nlink = proc_root.nlink + nr_processes();
 	return 0;
 }
diff --git a/fs/stat.c b/fs/stat.c
index 3f14d1ef0868..a3804feadade 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -18,6 +18,15 @@
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
 
+/**
+ * generic_fillattr - Fill in the basic attributes from the inode struct
+ * @inode: Inode to use as the source
+ * @stat: Where to fill in the attributes
+ *
+ * Fill in the basic attributes in the kstat structure from data that's to be
+ * found on the VFS inode structure.  This is the default if no getattr inode
+ * operation is supplied.
+ */
 void generic_fillattr(struct inode *inode, struct kstat *stat)
 {
 	stat->dev = inode->i_sb->s_dev;
@@ -33,81 +42,147 @@ void generic_fillattr(struct inode *inode, struct kstat *stat)
 	stat->ctime = inode->i_ctime;
 	stat->blksize = i_blocksize(inode);
 	stat->blocks = inode->i_blocks;
-}
 
+	if (IS_NOATIME(inode))
+		stat->result_mask &= ~STATX_ATIME;
+	if (IS_AUTOMOUNT(inode))
+		stat->attributes |= STATX_ATTR_AUTOMOUNT;
+}
 EXPORT_SYMBOL(generic_fillattr);
 
 /**
  * vfs_getattr_nosec - getattr without security checks
  * @path: file to get attributes from
  * @stat: structure to return attributes in
+ * @request_mask: STATX_xxx flags indicating what the caller wants
+ * @query_flags: Query mode (KSTAT_QUERY_FLAGS)
  *
  * Get attributes without calling security_inode_getattr.
  *
  * Currently the only caller other than vfs_getattr is internal to the
- * filehandle lookup code, which uses only the inode number and returns
- * no attributes to any user.  Any other code probably wants
- * vfs_getattr.
+ * filehandle lookup code, which uses only the inode number and returns no
+ * attributes to any user.  Any other code probably wants vfs_getattr.
  */
-int vfs_getattr_nosec(struct path *path, struct kstat *stat)
+int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
+		      u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_backing_inode(path->dentry);
 
+	memset(stat, 0, sizeof(*stat));
+	stat->result_mask |= STATX_BASIC_STATS;
+	request_mask &= STATX_ALL;
+	query_flags &= KSTAT_QUERY_FLAGS;
 	if (inode->i_op->getattr)
-		return inode->i_op->getattr(path->mnt, path->dentry, stat);
+		return inode->i_op->getattr(path, stat, request_mask,
+					    query_flags);
 
 	generic_fillattr(inode, stat);
 	return 0;
 }
-
 EXPORT_SYMBOL(vfs_getattr_nosec);
 
-int vfs_getattr(struct path *path, struct kstat *stat)
+/*
+ * vfs_getattr - Get the enhanced basic attributes of a file
+ * @path: The file of interest
+ * @stat: Where to return the statistics
+ * @request_mask: STATX_xxx flags indicating what the caller wants
+ * @query_flags: Query mode (KSTAT_QUERY_FLAGS)
+ *
+ * Ask the filesystem for a file's attributes.  The caller must indicate in
+ * request_mask and query_flags to indicate what they want.
+ *
+ * If the file is remote, the filesystem can be forced to update the attributes
+ * from the backing store by passing AT_STATX_FORCE_SYNC in query_flags or can
+ * suppress the update by passing AT_STATX_DONT_SYNC.
+ *
+ * Bits must have been set in request_mask to indicate which attributes the
+ * caller wants retrieving.  Any such attribute not requested may be returned
+ * anyway, but the value may be approximate, and, if remote, may not have been
+ * synchronised with the server.
+ *
+ * 0 will be returned on success, and a -ve error code if unsuccessful.
+ */
+int vfs_getattr(const struct path *path, struct kstat *stat,
+		u32 request_mask, unsigned int query_flags)
 {
 	int retval;
 
 	retval = security_inode_getattr(path);
 	if (retval)
 		return retval;
-	return vfs_getattr_nosec(path, stat);
+	return vfs_getattr_nosec(path, stat, request_mask, query_flags);
 }
-
 EXPORT_SYMBOL(vfs_getattr);
 
-int vfs_fstat(unsigned int fd, struct kstat *stat)
+/**
+ * vfs_statx_fd - Get the enhanced basic attributes by file descriptor
+ * @fd: The file descriptor referring to the file of interest
+ * @stat: The result structure to fill in.
+ * @request_mask: STATX_xxx flags indicating what the caller wants
+ * @query_flags: Query mode (KSTAT_QUERY_FLAGS)
+ *
+ * This function is a wrapper around vfs_getattr().  The main difference is
+ * that it uses a file descriptor to determine the file location.
+ *
+ * 0 will be returned on success, and a -ve error code if unsuccessful.
+ */
+int vfs_statx_fd(unsigned int fd, struct kstat *stat,
+		 u32 request_mask, unsigned int query_flags)
 {
 	struct fd f = fdget_raw(fd);
 	int error = -EBADF;
 
 	if (f.file) {
-		error = vfs_getattr(&f.file->f_path, stat);
+		error = vfs_getattr(&f.file->f_path, stat,
+				    request_mask, query_flags);
 		fdput(f);
 	}
 	return error;
 }
-EXPORT_SYMBOL(vfs_fstat);
+EXPORT_SYMBOL(vfs_statx_fd);
 
-int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
-		int flag)
+/**
+ * vfs_statx - Get basic and extra attributes by filename
+ * @dfd: A file descriptor representing the base dir for a relative filename
+ * @filename: The name of the file of interest
+ * @flags: Flags to control the query
+ * @stat: The result structure to fill in.
+ * @request_mask: STATX_xxx flags indicating what the caller wants
+ *
+ * This function is a wrapper around vfs_getattr().  The main difference is
+ * that it uses a filename and base directory to determine the file location.
+ * Additionally, the use of AT_SYMLINK_NOFOLLOW in flags will prevent a symlink
+ * at the given name from being referenced.
+ *
+ * The caller must have preset stat->request_mask as for vfs_getattr().  The
+ * flags are also used to load up stat->query_flags.
+ *
+ * 0 will be returned on success, and a -ve error code if unsuccessful.
+ */
+int vfs_statx(int dfd, const char __user *filename, int flags,
+	      struct kstat *stat, u32 request_mask)
 {
 	struct path path;
 	int error = -EINVAL;
-	unsigned int lookup_flags = 0;
+	unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
 
-	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
-		      AT_EMPTY_PATH)) != 0)
-		goto out;
+	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
+		       AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0)
+		return -EINVAL;
 
-	if (!(flag & AT_SYMLINK_NOFOLLOW))
-		lookup_flags |= LOOKUP_FOLLOW;
-	if (flag & AT_EMPTY_PATH)
+	if (flags & AT_SYMLINK_NOFOLLOW)
+		lookup_flags &= ~LOOKUP_FOLLOW;
+	if (flags & AT_NO_AUTOMOUNT)
+		lookup_flags &= ~LOOKUP_AUTOMOUNT;
+	if (flags & AT_EMPTY_PATH)
 		lookup_flags |= LOOKUP_EMPTY;
+
 retry:
 	error = user_path_at(dfd, filename, lookup_flags, &path);
 	if (error)
 		goto out;
 
-	error = vfs_getattr(&path, stat);
+	error = vfs_getattr(&path, stat, request_mask, flags);
 	path_put(&path);
 	if (retry_estale(error, lookup_flags)) {
 		lookup_flags |= LOOKUP_REVAL;
@@ -116,19 +191,7 @@ retry:
 out:
 	return error;
 }
-EXPORT_SYMBOL(vfs_fstatat);
-
-int vfs_stat(const char __user *name, struct kstat *stat)
-{
-	return vfs_fstatat(AT_FDCWD, name, stat, 0);
-}
-EXPORT_SYMBOL(vfs_stat);
-
-int vfs_lstat(const char __user *name, struct kstat *stat)
-{
-	return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW);
-}
-EXPORT_SYMBOL(vfs_lstat);
+EXPORT_SYMBOL(vfs_statx);
 
 
 #ifdef __ARCH_WANT_OLD_STAT
@@ -141,7 +204,7 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta
 {
 	static int warncount = 5;
 	struct __old_kernel_stat tmp;
-	
+
 	if (warncount > 0) {
 		warncount--;
 		printk(KERN_WARNING "VFS: Warning: %s using old stat() call. Recompile your binary.\n",
@@ -166,7 +229,7 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta
 #if BITS_PER_LONG == 32
 	if (stat->size > MAX_NON_LFS)
 		return -EOVERFLOW;
-#endif	
+#endif
 	tmp.st_size = stat->size;
 	tmp.st_atime = stat->atime.tv_sec;
 	tmp.st_mtime = stat->mtime.tv_sec;
@@ -445,6 +508,81 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename,
 }
 #endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */
 
+static inline int __put_timestamp(struct timespec *kts,
+				  struct statx_timestamp __user *uts)
+{
+	return (__put_user(kts->tv_sec,		&uts->tv_sec		) ||
+		__put_user(kts->tv_nsec,	&uts->tv_nsec		) ||
+		__put_user(0,			&uts->__reserved	));
+}
+
+/*
+ * Set the statx results.
+ */
+static long statx_set_result(struct kstat *stat, struct statx __user *buffer)
+{
+	uid_t uid = from_kuid_munged(current_user_ns(), stat->uid);
+	gid_t gid = from_kgid_munged(current_user_ns(), stat->gid);
+
+	if (__put_user(stat->result_mask,	&buffer->stx_mask	) ||
+	    __put_user(stat->mode,		&buffer->stx_mode	) ||
+	    __clear_user(&buffer->__spare0, sizeof(buffer->__spare0))	  ||
+	    __put_user(stat->nlink,		&buffer->stx_nlink	) ||
+	    __put_user(uid,			&buffer->stx_uid	) ||
+	    __put_user(gid,			&buffer->stx_gid	) ||
+	    __put_user(stat->attributes,	&buffer->stx_attributes	) ||
+	    __put_user(stat->blksize,		&buffer->stx_blksize	) ||
+	    __put_user(MAJOR(stat->rdev),	&buffer->stx_rdev_major	) ||
+	    __put_user(MINOR(stat->rdev),	&buffer->stx_rdev_minor	) ||
+	    __put_user(MAJOR(stat->dev),	&buffer->stx_dev_major	) ||
+	    __put_user(MINOR(stat->dev),	&buffer->stx_dev_minor	) ||
+	    __put_timestamp(&stat->atime,	&buffer->stx_atime	) ||
+	    __put_timestamp(&stat->btime,	&buffer->stx_btime	) ||
+	    __put_timestamp(&stat->ctime,	&buffer->stx_ctime	) ||
+	    __put_timestamp(&stat->mtime,	&buffer->stx_mtime	) ||
+	    __put_user(stat->ino,		&buffer->stx_ino	) ||
+	    __put_user(stat->size,		&buffer->stx_size	) ||
+	    __put_user(stat->blocks,		&buffer->stx_blocks	) ||
+	    __clear_user(&buffer->__spare1, sizeof(buffer->__spare1))	  ||
+	    __clear_user(&buffer->__spare2, sizeof(buffer->__spare2)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/**
+ * sys_statx - System call to get enhanced stats
+ * @dfd: Base directory to pathwalk from *or* fd to stat.
+ * @filename: File to stat *or* NULL.
+ * @flags: AT_* flags to control pathwalk.
+ * @mask: Parts of statx struct actually required.
+ * @buffer: Result buffer.
+ *
+ * Note that if filename is NULL, then it does the equivalent of fstat() using
+ * dfd to indicate the file of interest.
+ */
+SYSCALL_DEFINE5(statx,
+		int, dfd, const char __user *, filename, unsigned, flags,
+		unsigned int, mask,
+		struct statx __user *, buffer)
+{
+	struct kstat stat;
+	int error;
+
+	if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE)
+		return -EINVAL;
+	if (!access_ok(VERIFY_WRITE, buffer, sizeof(*buffer)))
+		return -EFAULT;
+
+	if (filename)
+		error = vfs_statx(dfd, filename, flags, &stat, mask);
+	else
+		error = vfs_statx_fd(dfd, &stat, mask, flags);
+	if (error)
+		return error;
+	return statx_set_result(&stat, buffer);
+}
+
 /* Caller is here responsible for sufficient locking (ie. inode->i_lock) */
 void __inode_add_bytes(struct inode *inode, loff_t bytes)
 {
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 08d3e630b49c..83809f5b5eca 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -440,10 +440,11 @@ static unsigned sysv_nblocks(struct super_block *s, loff_t size)
 	return blocks;
 }
 
-int sysv_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+int sysv_getattr(const struct path *path, struct kstat *stat,
+		 u32 request_mask, unsigned int flags)
 {
-	struct super_block *s = dentry->d_sb;
-	generic_fillattr(d_inode(dentry), stat);
+	struct super_block *s = path->dentry->d_sb;
+	generic_fillattr(d_inode(path->dentry), stat);
 	stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size);
 	stat->blksize = s->s_blocksize;
 	return 0;
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 6c212288adcb..1e7e27c729af 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -142,7 +142,7 @@ extern struct inode *sysv_iget(struct super_block *, unsigned int);
 extern int sysv_write_inode(struct inode *, struct writeback_control *wbc);
 extern int sysv_sync_inode(struct inode *);
 extern void sysv_set_inode(struct inode *, dev_t);
-extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+extern int sysv_getattr(const struct path *, struct kstat *, u32, unsigned int);
 extern int sysv_init_icache(void);
 extern void sysv_destroy_icache(void);
 
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 528369f3e472..30825d882aa9 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1622,11 +1622,11 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	return do_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
 }
 
-int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		  struct kstat *stat)
+int ubifs_getattr(const struct path *path, struct kstat *stat,
+		  u32 request_mask, unsigned int flags)
 {
 	loff_t size;
-	struct inode *inode = d_inode(dentry);
+	struct inode *inode = d_inode(path->dentry);
 	struct ubifs_inode *ui = ubifs_inode(inode);
 
 	mutex_lock(&ui->ui_mutex);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index f0c86f076535..4d57e488038e 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1749,8 +1749,8 @@ int ubifs_update_time(struct inode *inode, struct timespec *time, int flags);
 /* dir.c */
 struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
 			      umode_t mode);
-int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		  struct kstat *stat);
+int ubifs_getattr(const struct path *path, struct kstat *stat,
+		  u32 request_mask, unsigned int flags);
 int ubifs_check_dir_empty(struct inode *dir);
 
 /* xattr.c */
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index f7dfef53f739..6023c97c6da2 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -152,9 +152,10 @@ out_unmap:
 	return err;
 }
 
-static int udf_symlink_getattr(struct vfsmount *mnt, struct dentry *dentry,
-			       struct kstat *stat)
+static int udf_symlink_getattr(const struct path *path, struct kstat *stat,
+				u32 request_mask, unsigned int flags)
 {
+	struct dentry *dentry = path->dentry;
 	struct inode *inode = d_backing_inode(dentry);
 	struct page *page;
 
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 22c16155f1b4..229cc6a6d8ef 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -489,11 +489,12 @@ xfs_vn_get_link_inline(
 
 STATIC int
 xfs_vn_getattr(
-	struct vfsmount		*mnt,
-	struct dentry		*dentry,
-	struct kstat		*stat)
+	const struct path	*path,
+	struct kstat		*stat,
+	u32			request_mask,
+	unsigned int		query_flags)
 {
-	struct inode		*inode = d_inode(dentry);
+	struct inode		*inode = d_inode(path->dentry);
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 52350947c670..aad3fd0ff5f8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1709,7 +1709,7 @@ struct inode_operations {
 	int (*rename) (struct inode *, struct dentry *,
 			struct inode *, struct dentry *, unsigned int);
 	int (*setattr) (struct dentry *, struct iattr *);
-	int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
+	int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
 	int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
 		      u64 len);
@@ -2902,8 +2902,8 @@ extern int page_symlink(struct inode *inode, const char *symname, int len);
 extern const struct inode_operations page_symlink_inode_operations;
 extern void kfree_link(void *);
 extern void generic_fillattr(struct inode *, struct kstat *);
-int vfs_getattr_nosec(struct path *path, struct kstat *stat);
-extern int vfs_getattr(struct path *, struct kstat *);
+extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
+extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
 void __inode_add_bytes(struct inode *inode, loff_t bytes);
 void inode_add_bytes(struct inode *inode, loff_t bytes);
 void __inode_sub_bytes(struct inode *inode, loff_t bytes);
@@ -2916,10 +2916,29 @@ extern const struct inode_operations simple_symlink_inode_operations;
 
 extern int iterate_dir(struct file *, struct dir_context *);
 
-extern int vfs_stat(const char __user *, struct kstat *);
-extern int vfs_lstat(const char __user *, struct kstat *);
-extern int vfs_fstat(unsigned int, struct kstat *);
-extern int vfs_fstatat(int , const char __user *, struct kstat *, int);
+extern int vfs_statx(int, const char __user *, int, struct kstat *, u32);
+extern int vfs_statx_fd(unsigned int, struct kstat *, u32, unsigned int);
+
+static inline int vfs_stat(const char __user *filename, struct kstat *stat)
+{
+	return vfs_statx(AT_FDCWD, filename, 0, stat, STATX_BASIC_STATS);
+}
+static inline int vfs_lstat(const char __user *name, struct kstat *stat)
+{
+	return vfs_statx(AT_FDCWD, name, AT_SYMLINK_NOFOLLOW,
+			 stat, STATX_BASIC_STATS);
+}
+static inline int vfs_fstatat(int dfd, const char __user *filename,
+			      struct kstat *stat, int flags)
+{
+	return vfs_statx(dfd, filename, flags, stat, STATX_BASIC_STATS);
+}
+static inline int vfs_fstat(int fd, struct kstat *stat)
+{
+	return vfs_statx_fd(fd, stat, STATX_BASIC_STATS, 0);
+}
+
+
 extern const char *vfs_get_link(struct dentry *, struct delayed_call *);
 extern int vfs_readlink(struct dentry *, char __user *, int);
 
@@ -2949,7 +2968,7 @@ extern int dcache_dir_close(struct inode *, struct file *);
 extern loff_t dcache_dir_lseek(struct file *, loff_t, int);
 extern int dcache_readdir(struct file *, struct dir_context *);
 extern int simple_setattr(struct dentry *, struct iattr *);
-extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+extern int simple_getattr(const struct path *, struct kstat *, u32, unsigned int);
 extern int simple_statfs(struct dentry *, struct kstatfs *);
 extern int simple_open(struct inode *inode, struct file *file);
 extern int simple_link(struct dentry *, struct inode *, struct dentry *);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index f1da8c8dd473..287f34161086 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -335,7 +335,7 @@ extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *);
 extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr);
 extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr);
 extern int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr);
-extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+extern int nfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
 extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *);
 extern void nfs_access_set_mask(struct nfs_access_entry *, u32);
 extern int nfs_permission(struct inode *, int);
diff --git a/include/linux/stat.h b/include/linux/stat.h
index 075cb0c7eb2a..c76e524fb34b 100644
--- a/include/linux/stat.h
+++ b/include/linux/stat.h
@@ -18,20 +18,32 @@
 #include <linux/time.h>
 #include <linux/uidgid.h>
 
+#define KSTAT_QUERY_FLAGS (AT_STATX_SYNC_TYPE)
+
 struct kstat {
-	u64		ino;
-	dev_t		dev;
+	u32		result_mask;	/* What fields the user got */
 	umode_t		mode;
 	unsigned int	nlink;
+	uint32_t	blksize;	/* Preferred I/O size */
+	u64		attributes;
+#define KSTAT_ATTR_FS_IOC_FLAGS				\
+	(STATX_ATTR_COMPRESSED |			\
+	 STATX_ATTR_IMMUTABLE |				\
+	 STATX_ATTR_APPEND |				\
+	 STATX_ATTR_NODUMP |				\
+	 STATX_ATTR_ENCRYPTED				\
+	 )/* Attrs corresponding to FS_*_FL flags */
+	u64		ino;
+	dev_t		dev;
+	dev_t		rdev;
 	kuid_t		uid;
 	kgid_t		gid;
-	dev_t		rdev;
 	loff_t		size;
-	struct timespec  atime;
+	struct timespec	atime;
 	struct timespec	mtime;
 	struct timespec	ctime;
-	unsigned long	blksize;
-	unsigned long long	blocks;
+	struct timespec	btime;			/* File creation time */
+	u64		blocks;
 };
 
 #endif
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 91a740f6b884..980c3c9b06f8 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -48,6 +48,7 @@ struct stat;
 struct stat64;
 struct statfs;
 struct statfs64;
+struct statx;
 struct __sysctl_args;
 struct sysinfo;
 struct timespec;
@@ -902,5 +903,7 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
 				  unsigned long prot, int pkey);
 asmlinkage long sys_pkey_alloc(unsigned long flags, unsigned long init_val);
 asmlinkage long sys_pkey_free(int pkey);
+asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
+			  unsigned mask, struct statx __user *buffer);
 
 #endif
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index beed138bd359..813afd6eee71 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -63,5 +63,10 @@
 #define AT_NO_AUTOMOUNT		0x800	/* Suppress terminal automount traversal */
 #define AT_EMPTY_PATH		0x1000	/* Allow empty relative pathname */
 
+#define AT_STATX_SYNC_TYPE	0x6000	/* Type of synchronisation required from statx() */
+#define AT_STATX_SYNC_AS_STAT	0x0000	/* - Do whatever stat() does */
+#define AT_STATX_FORCE_SYNC	0x2000	/* - Force the attributes to be sync'd with the server */
+#define AT_STATX_DONT_SYNC	0x4000	/* - Don't sync attributes with the server */
+
 
 #endif /* _UAPI_LINUX_FCNTL_H */
diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h
index 7fec7e36d921..51a6b86e3700 100644
--- a/include/uapi/linux/stat.h
+++ b/include/uapi/linux/stat.h
@@ -1,6 +1,7 @@
 #ifndef _UAPI_LINUX_STAT_H
 #define _UAPI_LINUX_STAT_H
 
+#include <linux/types.h>
 
 #if defined(__KERNEL__) || !defined(__GLIBC__) || (__GLIBC__ < 2)
 
@@ -41,5 +42,135 @@
 
 #endif
 
+/*
+ * Timestamp structure for the timestamps in struct statx.
+ *
+ * tv_sec holds the number of seconds before (negative) or after (positive)
+ * 00:00:00 1st January 1970 UTC.
+ *
+ * tv_nsec holds a number of nanoseconds before (0..-999,999,999 if tv_sec is
+ * negative) or after (0..999,999,999 if tv_sec is positive) the tv_sec time.
+ *
+ * Note that if both tv_sec and tv_nsec are non-zero, then the two values must
+ * either be both positive or both negative.
+ *
+ * __reserved is held in case we need a yet finer resolution.
+ */
+struct statx_timestamp {
+	__s64	tv_sec;
+	__s32	tv_nsec;
+	__s32	__reserved;
+};
+
+/*
+ * Structures for the extended file attribute retrieval system call
+ * (statx()).
+ *
+ * The caller passes a mask of what they're specifically interested in as a
+ * parameter to statx().  What statx() actually got will be indicated in
+ * st_mask upon return.
+ *
+ * For each bit in the mask argument:
+ *
+ * - if the datum is not supported:
+ *
+ *   - the bit will be cleared, and
+ *
+ *   - the datum will be set to an appropriate fabricated value if one is
+ *     available (eg. CIFS can take a default uid and gid), otherwise
+ *
+ *   - the field will be cleared;
+ *
+ * - otherwise, if explicitly requested:
+ *
+ *   - the datum will be synchronised to the server if AT_STATX_FORCE_SYNC is
+ *     set or if the datum is considered out of date, and
+ *
+ *   - the field will be filled in and the bit will be set;
+ *
+ * - otherwise, if not requested, but available in approximate form without any
+ *   effort, it will be filled in anyway, and the bit will be set upon return
+ *   (it might not be up to date, however, and no attempt will be made to
+ *   synchronise the internal state first);
+ *
+ * - otherwise the field and the bit will be cleared before returning.
+ *
+ * Items in STATX_BASIC_STATS may be marked unavailable on return, but they
+ * will have values installed for compatibility purposes so that stat() and
+ * co. can be emulated in userspace.
+ */
+struct statx {
+	/* 0x00 */
+	__u32	stx_mask;	/* What results were written [uncond] */
+	__u32	stx_blksize;	/* Preferred general I/O size [uncond] */
+	__u64	stx_attributes;	/* Flags conveying information about the file [uncond] */
+	/* 0x10 */
+	__u32	stx_nlink;	/* Number of hard links */
+	__u32	stx_uid;	/* User ID of owner */
+	__u32	stx_gid;	/* Group ID of owner */
+	__u16	stx_mode;	/* File mode */
+	__u16	__spare0[1];
+	/* 0x20 */
+	__u64	stx_ino;	/* Inode number */
+	__u64	stx_size;	/* File size */
+	__u64	stx_blocks;	/* Number of 512-byte blocks allocated */
+	__u64	__spare1[1];
+	/* 0x40 */
+	struct statx_timestamp	stx_atime;	/* Last access time */
+	struct statx_timestamp	stx_btime;	/* File creation time */
+	struct statx_timestamp	stx_ctime;	/* Last attribute change time */
+	struct statx_timestamp	stx_mtime;	/* Last data modification time */
+	/* 0x80 */
+	__u32	stx_rdev_major;	/* Device ID of special file [if bdev/cdev] */
+	__u32	stx_rdev_minor;
+	__u32	stx_dev_major;	/* ID of device containing file [uncond] */
+	__u32	stx_dev_minor;
+	/* 0x90 */
+	__u64	__spare2[14];	/* Spare space for future expansion */
+	/* 0x100 */
+};
+
+/*
+ * Flags to be stx_mask
+ *
+ * Query request/result mask for statx() and struct statx::stx_mask.
+ *
+ * These bits should be set in the mask argument of statx() to request
+ * particular items when calling statx().
+ */
+#define STATX_TYPE		0x00000001U	/* Want/got stx_mode & S_IFMT */
+#define STATX_MODE		0x00000002U	/* Want/got stx_mode & ~S_IFMT */
+#define STATX_NLINK		0x00000004U	/* Want/got stx_nlink */
+#define STATX_UID		0x00000008U	/* Want/got stx_uid */
+#define STATX_GID		0x00000010U	/* Want/got stx_gid */
+#define STATX_ATIME		0x00000020U	/* Want/got stx_atime */
+#define STATX_MTIME		0x00000040U	/* Want/got stx_mtime */
+#define STATX_CTIME		0x00000080U	/* Want/got stx_ctime */
+#define STATX_INO		0x00000100U	/* Want/got stx_ino */
+#define STATX_SIZE		0x00000200U	/* Want/got stx_size */
+#define STATX_BLOCKS		0x00000400U	/* Want/got stx_blocks */
+#define STATX_BASIC_STATS	0x000007ffU	/* The stuff in the normal stat struct */
+#define STATX_BTIME		0x00000800U	/* Want/got stx_btime */
+#define STATX_ALL		0x00000fffU	/* All currently supported flags */
+
+/*
+ * Attributes to be found in stx_attributes
+ *
+ * These give information about the features or the state of a file that might
+ * be of use to ordinary userspace programs such as GUIs or ls rather than
+ * specialised tools.
+ *
+ * Note that the flags marked [I] correspond to generic FS_IOC_FLAGS
+ * semantically.  Where possible, the numerical value is picked to correspond
+ * also.
+ */
+#define STATX_ATTR_COMPRESSED		0x00000004 /* [I] File is compressed by the fs */
+#define STATX_ATTR_IMMUTABLE		0x00000010 /* [I] File is marked immutable */
+#define STATX_ATTR_APPEND		0x00000020 /* [I] File is append-only */
+#define STATX_ATTR_NODUMP		0x00000040 /* [I] File is not to be dumped */
+#define STATX_ATTR_ENCRYPTED		0x00000800 /* [I] File requires key to decrypt in fs */
+
+#define STATX_ATTR_AUTOMOUNT		0x00001000 /* Dir: Automount trigger */
+
 
 #endif /* _UAPI_LINUX_STAT_H */
diff --git a/mm/shmem.c b/mm/shmem.c
index a26649a6633f..e07728f716b2 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -958,10 +958,10 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 }
 EXPORT_SYMBOL_GPL(shmem_truncate_range);
 
-static int shmem_getattr(struct vfsmount *mnt, struct dentry *dentry,
-			 struct kstat *stat)
+static int shmem_getattr(const struct path *path, struct kstat *stat,
+			 u32 request_mask, unsigned int query_flags)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = path->dentry->d_inode;
 	struct shmem_inode_info *info = SHMEM_I(inode);
 
 	if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
diff --git a/samples/Kconfig b/samples/Kconfig
index b124f62ed6cb..9cb63188d3ef 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -112,4 +112,10 @@ config SAMPLE_VFIO_MDEV_MTTY
 	  Build a virtual tty sample driver for use as a VFIO
 	  mediated device
 
+config SAMPLE_STATX
+	bool "Build example extended-stat using code"
+	depends on BROKEN
+	help
+	  Build example userspace program to use the new extended-stat syscall.
+
 endif # SAMPLES
diff --git a/samples/Makefile b/samples/Makefile
index 86a137e451d9..db54e766ddb1 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -3,4 +3,4 @@
 obj-$(CONFIG_SAMPLES)	+= kobject/ kprobes/ trace_events/ livepatch/ \
 			   hw_breakpoint/ kfifo/ kdb/ hidraw/ rpmsg/ seccomp/ \
 			   configfs/ connector/ v4l/ trace_printk/ blackfin/ \
-			   vfio-mdev/
+			   vfio-mdev/ statx/
diff --git a/samples/statx/Makefile b/samples/statx/Makefile
new file mode 100644
index 000000000000..1f80a3d8cf45
--- /dev/null
+++ b/samples/statx/Makefile
@@ -0,0 +1,10 @@
+# kbuild trick to avoid linker error. Can be omitted if a module is built.
+obj- := dummy.o
+
+# List of programs to build
+hostprogs-$(CONFIG_SAMPLE_STATX) := test-statx
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
+
+HOSTCFLAGS_test-statx.o += -I$(objtree)/usr/include
diff --git a/samples/statx/test-statx.c b/samples/statx/test-statx.c
new file mode 100644
index 000000000000..8571d766331d
--- /dev/null
+++ b/samples/statx/test-statx.c
@@ -0,0 +1,254 @@
+/* Test the statx() system call.
+ *
+ * Note that the output of this program is intended to look like the output of
+ * /bin/stat where possible.
+ *
+ * Copyright (C) 2015 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define _GNU_SOURCE
+#define _ATFILE_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <errno.h>
+#include <time.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <linux/stat.h>
+#include <linux/fcntl.h>
+#include <sys/stat.h>
+
+#define AT_STATX_SYNC_TYPE	0x6000
+#define AT_STATX_SYNC_AS_STAT	0x0000
+#define AT_STATX_FORCE_SYNC	0x2000
+#define AT_STATX_DONT_SYNC	0x4000
+
+static __attribute__((unused))
+ssize_t statx(int dfd, const char *filename, unsigned flags,
+	      unsigned int mask, struct statx *buffer)
+{
+	return syscall(__NR_statx, dfd, filename, flags, mask, buffer);
+}
+
+static void print_time(const char *field, struct statx_timestamp *ts)
+{
+	struct tm tm;
+	time_t tim;
+	char buffer[100];
+	int len;
+
+	tim = ts->tv_sec;
+	if (!localtime_r(&tim, &tm)) {
+		perror("localtime_r");
+		exit(1);
+	}
+	len = strftime(buffer, 100, "%F %T", &tm);
+	if (len == 0) {
+		perror("strftime");
+		exit(1);
+	}
+	printf("%s", field);
+	fwrite(buffer, 1, len, stdout);
+	printf(".%09u", ts->tv_nsec);
+	len = strftime(buffer, 100, "%z", &tm);
+	if (len == 0) {
+		perror("strftime2");
+		exit(1);
+	}
+	fwrite(buffer, 1, len, stdout);
+	printf("\n");
+}
+
+static void dump_statx(struct statx *stx)
+{
+	char buffer[256], ft = '?';
+
+	printf("results=%x\n", stx->stx_mask);
+
+	printf(" ");
+	if (stx->stx_mask & STATX_SIZE)
+		printf(" Size: %-15llu", (unsigned long long)stx->stx_size);
+	if (stx->stx_mask & STATX_BLOCKS)
+		printf(" Blocks: %-10llu", (unsigned long long)stx->stx_blocks);
+	printf(" IO Block: %-6llu", (unsigned long long)stx->stx_blksize);
+	if (stx->stx_mask & STATX_TYPE) {
+		switch (stx->stx_mode & S_IFMT) {
+		case S_IFIFO:	printf("  FIFO\n");			ft = 'p'; break;
+		case S_IFCHR:	printf("  character special file\n");	ft = 'c'; break;
+		case S_IFDIR:	printf("  directory\n");		ft = 'd'; break;
+		case S_IFBLK:	printf("  block special file\n");	ft = 'b'; break;
+		case S_IFREG:	printf("  regular file\n");		ft = '-'; break;
+		case S_IFLNK:	printf("  symbolic link\n");		ft = 'l'; break;
+		case S_IFSOCK:	printf("  socket\n");			ft = 's'; break;
+		default:
+			printf(" unknown type (%o)\n", stx->stx_mode & S_IFMT);
+			break;
+		}
+	} else {
+		printf(" no type\n");
+	}
+
+	sprintf(buffer, "%02x:%02x", stx->stx_dev_major, stx->stx_dev_minor);
+	printf("Device: %-15s", buffer);
+	if (stx->stx_mask & STATX_INO)
+		printf(" Inode: %-11llu", (unsigned long long) stx->stx_ino);
+	if (stx->stx_mask & STATX_NLINK)
+		printf(" Links: %-5u", stx->stx_nlink);
+	if (stx->stx_mask & STATX_TYPE) {
+		switch (stx->stx_mode & S_IFMT) {
+		case S_IFBLK:
+		case S_IFCHR:
+			printf(" Device type: %u,%u",
+			       stx->stx_rdev_major, stx->stx_rdev_minor);
+			break;
+		}
+	}
+	printf("\n");
+
+	if (stx->stx_mask & STATX_MODE)
+		printf("Access: (%04o/%c%c%c%c%c%c%c%c%c%c)  ",
+		       stx->stx_mode & 07777,
+		       ft,
+		       stx->stx_mode & S_IRUSR ? 'r' : '-',
+		       stx->stx_mode & S_IWUSR ? 'w' : '-',
+		       stx->stx_mode & S_IXUSR ? 'x' : '-',
+		       stx->stx_mode & S_IRGRP ? 'r' : '-',
+		       stx->stx_mode & S_IWGRP ? 'w' : '-',
+		       stx->stx_mode & S_IXGRP ? 'x' : '-',
+		       stx->stx_mode & S_IROTH ? 'r' : '-',
+		       stx->stx_mode & S_IWOTH ? 'w' : '-',
+		       stx->stx_mode & S_IXOTH ? 'x' : '-');
+	if (stx->stx_mask & STATX_UID)
+		printf("Uid: %5d   ", stx->stx_uid);
+	if (stx->stx_mask & STATX_GID)
+		printf("Gid: %5d\n", stx->stx_gid);
+
+	if (stx->stx_mask & STATX_ATIME)
+		print_time("Access: ", &stx->stx_atime);
+	if (stx->stx_mask & STATX_MTIME)
+		print_time("Modify: ", &stx->stx_mtime);
+	if (stx->stx_mask & STATX_CTIME)
+		print_time("Change: ", &stx->stx_ctime);
+	if (stx->stx_mask & STATX_BTIME)
+		print_time(" Birth: ", &stx->stx_btime);
+
+	if (stx->stx_attributes) {
+		unsigned char bits;
+		int loop, byte;
+
+		static char attr_representation[64 + 1] =
+			/* STATX_ATTR_ flags: */
+			"????????"	/* 63-56 */
+			"????????"	/* 55-48 */
+			"????????"	/* 47-40 */
+			"????????"	/* 39-32 */
+			"????????"	/* 31-24	0x00000000-ff000000 */
+			"????????"	/* 23-16	0x00000000-00ff0000 */
+			"???me???"	/* 15- 8	0x00000000-0000ff00 */
+			"?dai?c??"	/*  7- 0	0x00000000-000000ff */
+			;
+
+		printf("Attributes: %016llx (", stx->stx_attributes);
+		for (byte = 64 - 8; byte >= 0; byte -= 8) {
+			bits = stx->stx_attributes >> byte;
+			for (loop = 7; loop >= 0; loop--) {
+				int bit = byte + loop;
+
+				if (bits & 0x80)
+					putchar(attr_representation[63 - bit]);
+				else
+					putchar('-');
+				bits <<= 1;
+			}
+			if (byte)
+				putchar(' ');
+		}
+		printf(")\n");
+	}
+}
+
+static void dump_hex(unsigned long long *data, int from, int to)
+{
+	unsigned offset, print_offset = 1, col = 0;
+
+	from /= 8;
+	to = (to + 7) / 8;
+
+	for (offset = from; offset < to; offset++) {
+		if (print_offset) {
+			printf("%04x: ", offset * 8);
+			print_offset = 0;
+		}
+		printf("%016llx", data[offset]);
+		col++;
+		if ((col & 3) == 0) {
+			printf("\n");
+			print_offset = 1;
+		} else {
+			printf(" ");
+		}
+	}
+
+	if (!print_offset)
+		printf("\n");
+}
+
+int main(int argc, char **argv)
+{
+	struct statx stx;
+	int ret, raw = 0, atflag = AT_SYMLINK_NOFOLLOW;
+
+	unsigned int mask = STATX_ALL;
+
+	for (argv++; *argv; argv++) {
+		if (strcmp(*argv, "-F") == 0) {
+			atflag &= ~AT_STATX_SYNC_TYPE;
+			atflag |= AT_STATX_FORCE_SYNC;
+			continue;
+		}
+		if (strcmp(*argv, "-D") == 0) {
+			atflag &= ~AT_STATX_SYNC_TYPE;
+			atflag |= AT_STATX_DONT_SYNC;
+			continue;
+		}
+		if (strcmp(*argv, "-L") == 0) {
+			atflag &= ~AT_SYMLINK_NOFOLLOW;
+			continue;
+		}
+		if (strcmp(*argv, "-O") == 0) {
+			mask &= ~STATX_BASIC_STATS;
+			continue;
+		}
+		if (strcmp(*argv, "-A") == 0) {
+			atflag |= AT_NO_AUTOMOUNT;
+			continue;
+		}
+		if (strcmp(*argv, "-R") == 0) {
+			raw = 1;
+			continue;
+		}
+
+		memset(&stx, 0xbf, sizeof(stx));
+		ret = statx(AT_FDCWD, *argv, atflag, mask, &stx);
+		printf("statx(%s) = %d\n", *argv, ret);
+		if (ret < 0) {
+			perror(*argv);
+			exit(1);
+		}
+
+		if (raw)
+			dump_hex((unsigned long long *)&stx, 0, sizeof(stx));
+
+		dump_statx(&stx);
+	}
+	return 0;
+}
-- 
cgit v1.2.3